├── .Rbuildignore
├── LICENSE
├── .gitignore
├── tests
    ├── testthat.R
    └── testthat
    │   ├── utils.R
    │   ├── test_utils.R
    │   └── test_main.R
├── R
    ├── create_entityset.R
    ├── list_primitives.R
    ├── zzz.R
    ├── load_features.R
    ├── extract_features.R
    ├── add_entity.R
    ├── as_entityset.R
    ├── calculate_feature_matrix.R
    ├── install_featuretools.R
    ├── dfs.R
    ├── save_features.R
    ├── tidy_feature_matrix.R
    └── add_relationship.R
├── man
    ├── list_primitives.Rd
    ├── create_entityset.Rd
    ├── install_featuretools.Rd
    ├── as_entityset.Rd
    ├── dfs.Rd
    ├── add_entity.Rd
    ├── extract_features.Rd
    ├── add_relationship.Rd
    ├── load_features.Rd
    ├── save_features.Rd
    ├── tidy_feature_matrix.Rd
    └── calculate_feature_matrix.Rd
├── NAMESPACE
├── DESCRIPTION
└── README.md


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^\.git$
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2020
2 | COPYRIGHT HOLDER: Magnus Furugård
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | *.Rproj
5 | .Ruserdata
6 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(featuretoolsR)
3 | test_check("featuretoolsR")  # entry point: runs the package's testthat suite
4 | 


--------------------------------------------------------------------------------
/R/create_entityset.R:
--------------------------------------------------------------------------------
 1 | #' Create entityset
 2 | #' @description Create a blank entityset. A shortcut for `featuretools'` `EntitySet`.
 3 | #' @export
 4 | #'
 5 | #' @param id The id of this entityset.
 6 | #' @return An entityset.
 7 | #'
 8 | #' @examples
 9 | #' \donttest{
10 | #' create_entityset(id = "my_entityset")
11 | #' }
12 | create_entityset <- function(id) {
13 |   # Return the new entityset produced by featuretools' `EntitySet` constructor.
14 |   .ft$EntitySet(id = id)
15 | }
16 | 


--------------------------------------------------------------------------------
/R/list_primitives.R:
--------------------------------------------------------------------------------
 1 | #' List all available primitives.
 2 | #' @description List all available primitives from `featuretools` which can be passed to \link[featuretoolsR]{dfs}.
 3 | #' @export
 4 | #'
 5 | #' @return A list of all primitives available.
 6 | #'
 7 | #' @examples
 8 | #' \donttest{
 9 | #' featuretoolsR::list_primitives()
10 | #' }
11 | list_primitives <- function() {
12 |   .ft$list_primitives()  # forwards to featuretools' own `list_primitives`
13 | }
14 | 
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/man/list_primitives.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/list_primitives.R
 3 | \name{list_primitives}
 4 | \alias{list_primitives}
 5 | \title{List all available primitives.}
 6 | \usage{
 7 | list_primitives()
 8 | }
 9 | \value{
10 | A list of all primitives available.
11 | }
12 | \description{
13 | List all available primitives from `featuretools` which can be passed to \link[featuretoolsR]{dfs}.
14 | }
15 | \examples{
16 | \donttest{
17 | featuretoolsR::list_primitives()
18 | }
19 | }
20 | 


--------------------------------------------------------------------------------
/man/create_entityset.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/create_entityset.R
 3 | \name{create_entityset}
 4 | \alias{create_entityset}
 5 | \title{Create entityset}
 6 | \usage{
 7 | create_entityset(id)
 8 | }
 9 | \arguments{
10 | \item{id}{The id of this entityset.}
11 | }
12 | \value{
13 | An entityset.
14 | }
15 | \description{
16 | Create a blank entityset. A shortcut for `featuretools'` `EntitySet`.
17 | }
18 | \examples{
19 | \donttest{
20 | create_entityset(id = "my_entityset")
21 | }
22 | }
23 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(add_entity)
 4 | export(add_relationship)
 5 | export(as_entityset)
 6 | export(calculate_feature_matrix)
 7 | export(create_entityset)
 8 | export(dfs)
 9 | export(extract_features)
10 | export(install_featuretools)
11 | export(list_primitives)
12 | export(load_features)
13 | export(save_features)
14 | export(tidy_feature_matrix)
15 | importFrom(caret,nearZeroVar)
16 | importFrom(dplyr,ungroup)
17 | importFrom(purrr,map)
18 | importFrom(stringr,str_sub)
19 | importFrom(tibble,as_tibble)
20 | importFrom(tibble,is_tibble)
21 | importFrom(tibble,tibble)
22 | 


--------------------------------------------------------------------------------
/tests/testthat/utils.R:
--------------------------------------------------------------------------------
 1 | library(testthat)
 2 | 
 3 | featuretools_available <- function() {
 4 |   reticulate::py_module_available(module = "featuretools")
 5 | }
 6 | 
 7 | skip_if_no_featuretools <- function() {
 8 |   if (featuretools_available()) return(invisible(NULL))  # nothing to skip
 9 |   skip("required featuretools module not available for testing")
10 | }
11 | 
12 | # Build a small mock data frame: an integer `key` column (1..size) followed by
13 | # `value_variables` columns named value1, value2, ... each filled with `size`
14 | # random integers drawn with replacement from 1:10.
15 | generate_mock_data <- function(size = 2, value_variables = 2) {
16 |   # NOTE(review): this changes a global option and never restores it. It is
17 |   # kept because tests outside this file may rely on it -- confirm before
18 |   # removing.
19 |   options(stringsAsFactors = TRUE)
20 |   d <- data.frame(key = seq_len(size))
21 |   # seq_len() gives an empty loop for 0 columns; `1:0` would iterate over 1, 0.
22 |   for (i in seq_len(value_variables)) {
23 |     d[[paste0("value", i)]] <- sample(1:10, size, TRUE)
24 |   }
25 |   d
26 | }
27 | 


--------------------------------------------------------------------------------
/tests/testthat/test_utils.R:
--------------------------------------------------------------------------------
 1 | # Test utility functions used for the package.
 2 | 
 3 | # Common functions for all tests.
 4 | source("utils.R")
 5 | 
 6 | # Create mock data for tests
 7 | library(magrittr)
 8 | set_size <- 2
 9 | value_variables <- 2  # forwarded below so the mock data follows this setting
10 | set_1 <- generate_mock_data(set_size, value_variables)
11 | set_2 <- generate_mock_data(set_size, value_variables)
12 | 
13 | # Make sure we can list primitives
14 | test_that("can list primitives", {
15 |   skip_if_no_featuretools()
16 |   # Exact comparisons: `all(names(x) == ...)` would pass when names(x) is NULL.
17 |   primitives <- list_primitives()
18 |   expect_gt(nrow(primitives), 0)
19 |   expect_identical(names(primitives), c("name", "type", "description"))
20 | })
21 | 
22 | # Can create empty entityset
23 | test_that("can create empty entityset", {
24 |   skip_if_no_featuretools()
25 | 
26 |   # A freshly created entityset should expose an empty list of entities.
27 |   entities <- create_entityset(id = "my_entityset")$entities
28 |   expect_true(is.list(entities))
29 |   expect_length(entities, 0)
30 | })
31 | 


--------------------------------------------------------------------------------
/man/install_featuretools.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/install_featuretools.R
 3 | \name{install_featuretools}
 4 | \alias{install_featuretools}
 5 | \title{Install featuretools}
 6 | \usage{
 7 | install_featuretools(custom_virtualenv = FALSE, method = "auto",
 8 |   conda = "auto")
 9 | }
10 | \arguments{
11 | \item{custom_virtualenv}{Defaults to false. Set to true if you wish to use a custom virtualenv for featuretoolsR.}
12 | 
13 | \item{method}{The installation method passed to \link[reticulate]{py_install}. Defaults to "auto".}
14 | 
15 | \item{conda}{Whether to use conda or not. Passed to `reticulate::py_install`. Defaults to "auto".}
16 | }
17 | \description{
18 | Setup for featuretools in it's own virtualenv, or into the default reticulate virtualenv.
19 | }
20 | \examples{
21 | \dontrun{
22 | featuretoolsR::install_featuretools()
23 | }
24 | }
25 | 


--------------------------------------------------------------------------------
/man/as_entityset.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/as_entityset.R
 3 | \name{as_entityset}
 4 | \alias{as_entityset}
 5 | \title{Create entityset and entity from data frame.}
 6 | \usage{
 7 | as_entityset(.data, id = "entityset", index = NA, time_index = NULL,
 8 |   entity_id = "df1", ...)
 9 | }
10 | \arguments{
11 | \item{.data}{The `data.frame` to be added as an entity to entityset.}
12 | 
13 | \item{id}{The id of this entityset.}
14 | 
15 | \item{index}{Name of id column in the dataframe.}
16 | 
17 | \item{time_index}{Name of the time column in the dataframe.}
18 | 
19 | \item{entity_id}{An identifier for this entity.}
20 | 
21 | \item{...}{Additional variables passed to `add_entity`.}
22 | }
23 | \value{
24 | A modified entityset.
25 | }
26 | \description{
27 | Create an entityset with a selected `data.frame` as an entity.
28 | }
29 | \examples{
30 | \donttest{
31 | as_entityset(cars, index = "row_number")
32 | }
33 | }
34 | 


--------------------------------------------------------------------------------
/man/dfs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dfs.R
 3 | \name{dfs}
 4 | \alias{dfs}
 5 | \title{Deep Feature Synthesis}
 6 | \usage{
 7 | dfs(entityset, target_entity, agg_primitives = NULL,
 8 |   trans_primitives = NULL, max_depth = 2L, ...)
 9 | }
10 | \arguments{
11 | \item{entityset}{The entityset on which to perform dfs.}
12 | 
13 | \item{target_entity}{The name of the entity on which to perform dfs.}
14 | 
15 | \item{agg_primitives}{Primitives passed to relational data.}
16 | 
17 | \item{trans_primitives}{Primitives passed to non-relational data.}
18 | 
19 | \item{max_depth}{Controls the maximum depth of features.}
20 | 
21 | \item{...}{Additional parameters passed to `featuretools.dfs`.}
22 | }
23 | \value{
24 | A `featuretools` feature matrix.
25 | }
26 | \description{
27 | The main function from featuretools used to create new features.
28 | }
29 | \examples{
30 | \donttest{
31 | es <- as_entityset(cars, index = "row_number")
32 | dfs(es, target_entity = "df1", trans_primitives = c("and"))
33 | }
34 | }
35 | 


--------------------------------------------------------------------------------
/man/add_entity.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/add_entity.R
 3 | \name{add_entity}
 4 | \alias{add_entity}
 5 | \title{add_entity}
 6 | \usage{
 7 | add_entity(entityset, entity_id, df, index = NULL, time_index = NULL,
 8 |   ...)
 9 | }
10 | \arguments{
11 | \item{entityset}{The entity set to modify.}
12 | 
13 | \item{entity_id}{The name of the entity to add.}
14 | 
15 | \item{df}{The data frame to add as an entity.}
16 | 
17 | \item{index}{The index parameter specifies the column that uniquely identifies rows in the dataframe}
18 | 
19 | \item{time_index}{Name of the time column in the dataframe.}
20 | 
21 | \item{...}{Additional parameters passed to `featuretools.entity_from_dataframe`.}
22 | }
23 | \value{
24 | A modified entityset.
25 | }
26 | \description{
27 | Add an entity to an entityset.
28 | }
29 | \examples{
30 | \donttest{
31 | library(magrittr)
32 | create_entityset("set") \%>\%
33 |   add_entity(df = cars,
34 |              entity_id = "cars",
35 |              index = "row_number")
36 | }
37 | }
38 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: featuretoolsR
 2 | Type: Package
 3 | Title: Interact with the 'Python' Module 'Featuretools'
 4 | Version: 0.4.4
 5 | Authors@R: person("Magnus", "Furugård", email = "magnus.furugard@gmail.com", role = c("aut", "cre"))
 6 | Maintainer: Magnus Furugård <magnus.furugard@gmail.com>
 7 | Description: A 'reticulate'-based interface to the 'Python' module 'Featuretools'.
 8 |   The package grants functionality to interact with the 'Featuretools' module for 'Python', which allows 
 9 |   for automated feature engineering on any data frame. Valid features and new data sets can, after
10 |   feature synthesis, easily be extracted.
11 | License: MIT + file LICENSE
12 | URL: https://github.com/magnusfurugard/featuretoolsR
13 | BugReports: https://github.com/magnusfurugard/featuretoolsR/issues
14 | Depends: 
15 |   R (>= 3.4.2)
16 | Imports:
17 |   reticulate, 
18 |   caret, 
19 |   dplyr,
20 |   purrr, 
21 |   stringr, 
22 |   tibble, 
23 |   magrittr, 
24 |   cli, 
25 |   testthat, 
26 |   rstudioapi
27 | Encoding: UTF-8
28 | LazyData: true
29 | RoxygenNote: 6.1.1
30 | 


--------------------------------------------------------------------------------
/man/extract_features.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/extract_features.R
 3 | \name{extract_features}
 4 | \alias{extract_features}
 5 | \title{Extract features}
 6 | \usage{
 7 | extract_features(.data)
 8 | }
 9 | \arguments{
10 | \item{.data}{The featuretools-object returned from \link[featuretoolsR]{dfs}.}
11 | }
12 | \value{
13 | All features created during \link[featuretoolsR]{dfs}, as a tibble.
14 | }
15 | \description{
16 | This function is used to extract all features created from \link[featuretoolsR]{dfs}.
17 | }
18 | \examples{
19 | \donttest{
20 | library(magrittr)
21 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
22 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
23 | # Common variable: `key`
24 | 
25 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
26 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
27 |   add_relationship(
28 |     parent_set = "set_1",
29 |     child_set = "set_2",
30 |     parent_idx = "key",
31 |     child_idx = "key"
32 |   ) \%>\%
33 |   dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\%
34 |   extract_features()
35 | }
36 | }
37 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onAttach <- function(...) {
 2 |   # Announce the package version and whether pip / featuretools are usable.
 3 |   # The text is assembled with cli::boxx() and cli::symbol rather than the
 4 |   # cli::cat_*() helpers: those print to stdout themselves, so nothing reached
 5 |   # packageStartupMessage() and the banner could not be suppressed.
 6 |   bullet <- function(text, mark, paint) paste(paint(cli::symbol[[mark]]), text)
 7 |   start <- paste("featuretoolsR", utils::packageVersion("featuretoolsR"))
 8 |   banner <- cli::boxx(start, padding = c(0, 3, 0, 3), border_style = "double")
 9 | 
10 |   if (!reticulate::py_module_available("pip")) {
11 |     msg <- bullet("pip is not installed. Please install pip to proceed.", "cross", cli::col_red)
12 |   } else if (!reticulate::py_module_available("featuretools")) {
13 |     # See if featuretools already is installed
14 |     msg <- bullet("Featuretools unavailable. Please run `install_featuretools()`, or install featuretools with pip.", "cross", cli::col_red)
15 |   } else {
16 |     # Display featuretools info
17 |     ft <- paste("Using Featuretools", reticulate::py_get_attr(.ft, "__version__"))
18 |     msg <- bullet(ft, "tick", cli::col_green)
19 |   }
20 | 
21 |   packageStartupMessage(paste(c(banner, msg), collapse = "\n"))
22 | }
23 | 
24 | # Handle to the Python `featuretools` module; bound in `.onLoad()`.
25 | .ft <- NULL
26 | 
27 | # Bind the delay-loaded module and register the package's option defaults.
28 | .onLoad <- function(...) {
29 |   # `<<-` is deliberate: it rebinds the namespace-level `.ft` defined above.
30 |   .ft <<- reticulate::import("featuretools", delay_load = TRUE)
31 | 
32 |   # Only fill in options that are not set yet, so values the user chose
33 |   # before the package was loaded are not overwritten.
34 |   defaults <- list(
35 |     featuretoolsR.force_posixct = TRUE,
36 |     featuretoolsR.posixct_tz = "UTC",
37 |     featuretoolsR.virtualenv_name = "featuretoolsR"
38 |   )
39 |   unset <- setdiff(names(defaults), names(options()))
40 |   if (length(unset) > 0) {
41 |     options(defaults[unset])
42 |   }
43 | 
44 |   invisible(NULL)
45 | }
46 | 


--------------------------------------------------------------------------------
/man/add_relationship.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/add_relationship.R
 3 | \name{add_relationship}
 4 | \alias{add_relationship}
 5 | \title{Add a relationship to an entityset}
 6 | \usage{
 7 | add_relationship(entityset, parent_set, child_set, parent_idx, child_idx)
 8 | }
 9 | \arguments{
10 | \item{entityset}{The entityset to modify.}
11 | 
12 | \item{parent_set}{The name of the parent set.}
13 | 
14 | \item{child_set}{The name of the child set.}
15 | 
16 | \item{parent_idx}{The index variable of the `parent_set`.}
17 | 
18 | \item{child_idx}{The index variable of the `child_set`.}
19 | }
20 | \value{
21 | A modified entityset.
22 | }
23 | \description{
24 | Add a relationship to an entityset.
25 | }
26 | \examples{
27 | \donttest{
28 | library(magrittr)
29 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
30 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
31 | # Common variable: `key`
32 | 
33 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
34 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
35 |   add_relationship(
36 |     parent_set = "set_1",
37 |     child_set = "set_2",
38 |     parent_idx = "key",
39 |     child_idx = "key"
40 |   )
41 | }
42 | }
43 | 


--------------------------------------------------------------------------------
/man/load_features.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/load_features.R
 3 | \name{load_features}
 4 | \alias{load_features}
 5 | \title{Load features}
 6 | \usage{
 7 | load_features(file = NA)
 8 | }
 9 | \arguments{
10 | \item{file}{The file containing the features.}
11 | }
12 | \description{
13 | Used to load previously saved features created during \link[featuretoolsR]{dfs}.
14 | }
15 | \examples{
16 | \donttest{
17 | library(magrittr)
18 | 
19 | # Create mock datasets
20 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
21 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
22 | # Common variable: `key`
23 | 
24 | # Use dfs to create features
25 | dir <- tempdir()
26 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
27 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
28 |   add_relationship(
29 |     parent_set = "set_1",
30 |     child_set = "set_2",
31 |     parent_idx = "key",
32 |     child_idx = "key"
33 |   ) \%>\%
34 |   dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\%
35 |   extract_features() \%>\%
36 |   save_features(filename = "some.features", path = dir)
37 | 
38 | # Load saved features
39 | features <- load_features(file.path(dir, "some.features"))
40 | }
41 | }
42 | 


--------------------------------------------------------------------------------
/man/save_features.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/save_features.R
 3 | \name{save_features}
 4 | \alias{save_features}
 5 | \title{Save features}
 6 | \usage{
 7 | save_features(.data, filename = NA, path = NA)
 8 | }
 9 | \arguments{
10 | \item{.data}{The tibble of features returned from \link[featuretoolsR]{extract_features}.}
11 | 
12 | \item{filename}{(optional) The name of the file to produce.}
13 | 
14 | \item{path}{(optional) The path where the feature file should be placed.}
15 | }
16 | \description{
17 | Used to save all or a subset of features created during \link[featuretoolsR]{dfs}.
18 | }
19 | \examples{
20 | \donttest{
21 | library(magrittr)
22 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
23 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
24 | # Common variable: `key`
25 | 
26 | dir <- tempdir()
27 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
28 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
29 |   add_relationship(
30 |     parent_set = "set_1",
31 |     child_set = "set_2",
32 |     parent_idx = "key",
33 |     child_idx = "key"
34 |   ) \%>\%
35 |   dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\%
36 |   extract_features() \%>\%
37 |   save_features(filename = "some.features", path = dir)
38 | }
39 | }
40 | 


--------------------------------------------------------------------------------
/R/load_features.R:
--------------------------------------------------------------------------------
 1 | #' Load features
 2 | #' @description Used to load previously saved features created during \link[featuretoolsR]{dfs}.
 3 | #' @export
 4 | #'
 5 | #' @param file The file containing the features.
 6 | #'
 7 | #' @examples
 8 | #' \donttest{
 9 | #' library(magrittr)
10 | #'
11 | #' # Create mock datasets
12 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
13 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
14 | #' # Common variable: `key`
15 | #'
16 | #' # Use dfs to create features
17 | #' dir <- tempdir()
18 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
19 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
20 | #'   add_relationship(
21 | #'     parent_set = "set_1",
22 | #'     child_set = "set_2",
23 | #'     parent_idx = "key",
24 | #'     child_idx = "key"
25 | #'   ) %>%
26 | #'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
27 | #'   extract_features() %>%
28 | #'   save_features(filename = "some.features", path = dir)
29 | #'
30 | #' # Load saved features
31 | #' features <- load_features(file.path(dir, "some.features"))
32 | #' }
33 | load_features <- function(file = NA) {
34 | 
35 |   # Sanitize input. `NULL` and zero-length values are rejected explicitly
36 |   # because a bare `is.na(file)` hands `if` a zero-length condition for them.
37 |   if (is.null(file) || length(file) == 0 || all(is.na(file))) {
38 |     stop("No file specified.")
39 |   }
40 |   if (!is.character(file) || length(file) != 1) {
41 |     stop("`file` must be a single file path.")
42 |   }
43 | 
44 |   # Resolve the path before handing it to featuretools; `mustWork = TRUE`
45 |   # raises an R error right away when the file does not exist.
46 |   path <- normalizePath(file, mustWork = TRUE)
47 | 
48 |   # Attempt to load file.
49 |   .ft$load_features(path)
50 | }
51 | 


--------------------------------------------------------------------------------
/man/tidy_feature_matrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/tidy_feature_matrix.R
 3 | \name{tidy_feature_matrix}
 4 | \alias{tidy_feature_matrix}
 5 | \title{Tidy feature matrix}
 6 | \usage{
 7 | tidy_feature_matrix(.data, remove_nzv = FALSE, nan_is_na = FALSE,
 8 |   clean_names = FALSE)
 9 | }
10 | \arguments{
11 | \item{.data}{The featuretools-object returned from \link[featuretoolsR]{dfs}.}
12 | 
13 | \item{remove_nzv}{Remove near zero variance variables created from \link[featuretoolsR]{dfs}.}
14 | 
15 | \item{nan_is_na}{Turn all `NaN` into `NA`.}
16 | 
17 | \item{clean_names}{Make variable names R-friendly (snake case).}
18 | }
19 | \value{
20 | A tidy data.frame.
21 | }
22 | \description{
23 | Used for tidying up ('R-ify') the feature matrix after deep feature synthethis (\link[featuretoolsR]{dfs}).
24 | }
25 | \examples{
26 | \donttest{
27 | library(magrittr)
28 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
29 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
30 | # Common variable: `key`
31 | 
32 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
33 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
34 |   add_relationship(
35 |     parent_set = "set_1",
36 |     child_set = "set_2",
37 |     parent_idx = "key",
38 |     child_idx = "key"
39 |   ) \%>\%
40 |   dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\%
41 |   tidy_feature_matrix(remove_nzv = TRUE, nan_is_na = TRUE)
42 | }
43 | }
44 | 


--------------------------------------------------------------------------------
/R/extract_features.R:
--------------------------------------------------------------------------------
 1 | #' Extract features
 2 | #' @description This function is used to extract all features created from \link[featuretoolsR]{dfs}.
 3 | #' @export
 4 | #'
 5 | #' @param .data The featuretools-object returned from \link[featuretoolsR]{dfs}.
 6 | #' @return All features created during \link[featuretoolsR]{dfs}, as a tibble.
 7 | #'
 8 | #' @importFrom tibble tibble
 9 | #' @importFrom purrr map
10 | #'
11 | #' @examples
12 | #' \donttest{
13 | #' library(magrittr)
14 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
15 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
16 | #' # Common variable: `key`
17 | #'
18 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
19 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
20 | #'   add_relationship(
21 | #'     parent_set = "set_1",
22 | #'     child_set = "set_2",
23 | #'     parent_idx = "key",
24 | #'     child_idx = "key"
25 | #'   ) %>%
26 | #'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
27 | #'   extract_features()
28 | #' }
29 | extract_features <- function(.data) {
30 | 
31 |   # The feature definitions are read from the second element of `.data`;
32 |   # look them up once and reuse them for both columns.
33 |   features <- .data[[2]]
34 | 
35 |   # List features in ft-object. `map_chr()` always returns a character
36 |   # vector, so an empty feature list still yields a zero-row `name` column
37 |   # (`unlist(map(...))` would collapse to `NULL` and drop the column).
38 |   feature_names <- purrr::map_chr(
39 |     features,
40 |     .f = function(feature) {
41 |       feature$get_name()
42 |     }
43 |   )
44 | 
45 |   # Extract features: keep the feature objects themselves in a list column.
46 |   feature_actuals <- purrr::map(
47 |     features,
48 |     .f = function(feature) {
49 |       feature
50 |     }
51 |   )
52 | 
53 |   # Construct informative tibble with features
54 |   tibble::tibble(
55 |     name = feature_names,
56 |     feature = feature_actuals
57 |   )
58 | }
59 | 


--------------------------------------------------------------------------------
/R/add_entity.R:
--------------------------------------------------------------------------------
 1 | #' add_entity
 2 | #' @description Add an entity to an entityset.
 3 | #' @export
 4 | #'
 5 | #' @param entityset The entity set to modify.
 6 | #' @param entity_id The name of the entity to add.
 7 | #' @param df The data frame to add as an entity.
 8 | #' @param index The index parameter specifies the column that uniquely identifies rows in the dataframe
 9 | #' @param time_index Name of the time column in the dataframe.
10 | #' @param ... Additional parameters passed to `featuretools.entity_from_dataframe`.
11 | #' @return A modified entityset.
12 | #'
13 | #' @examples
14 | #' \donttest{
15 | #' library(magrittr)
16 | #' create_entityset("set") %>%
17 | #'   add_entity(df = cars,
18 | #'              entity_id = "cars",
19 | #'              index = "row_number")
20 | #' }
21 | add_entity <- function(
22 |   entityset,
23 |   entity_id,
24 |   df,
25 |   index = NULL,
26 |   time_index = NULL,
27 |   ...
28 | ) {
29 |   # Flag the columns whose class is exactly "factor". Iterating over the
30 |   # columns themselves (rather than `df[, i]`) also works for tibbles, where
31 |   # `df[, i]` is a one-column tibble and never has class "factor". Ordered
32 |   # factors (class c("ordered", "factor")) are not flagged.
33 |   is_plain_factor <- vapply(
34 |     df,
35 |     FUN = function(col) identical(class(col), "factor"),
36 |     FUN.VALUE = logical(1)
37 |   )
38 | 
39 |   # Construct variable_types to handle factors as categorical variables.
40 |   variable_types <- list()
41 |   for (column in names(df)[is_plain_factor]) {
42 |     variable_types[[column]] <- .ft$variable_types$Categorical
43 |   }
44 |   variable_types <- reticulate::r_to_py(variable_types)
45 | 
46 |   # Add df as entity to entityset.
47 |   entityset$entity_from_dataframe(
48 |     entity_id = entity_id,
49 |     dataframe = reticulate::r_to_py(x = df),
50 |     index = index,
51 |     time_index = time_index,
52 |     variable_types = variable_types,
53 |     ...
54 |   )
55 | }
56 | 


--------------------------------------------------------------------------------
/man/calculate_feature_matrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/calculate_feature_matrix.R
 3 | \name{calculate_feature_matrix}
 4 | \alias{calculate_feature_matrix}
 5 | \title{Calculate feature matrix}
 6 | \usage{
 7 | calculate_feature_matrix(entityset, features, ...)
 8 | }
 9 | \arguments{
10 | \item{entityset}{The entityset on which to create features.}
11 | 
12 | \item{features}{The features to create based on previous runs of \link[featuretoolsR]{dfs}.}
13 | 
14 | \item{...}{Additional parameters passed to `featuretools.calculate_feature_matrix`.}
15 | }
16 | \value{
17 | A feature matrix
18 | }
19 | \description{
20 | This function is used to create a feature matrix based on a custom list of features (usually created from \link[featuretoolsR]{save_features}).
21 | }
22 | \examples{
23 | \donttest{
24 | library(magrittr)
25 | 
26 | # Create some mock data
27 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
28 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
29 | # Common variable: `key`
30 | 
31 | # Create features and save them
32 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
33 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
34 |   add_relationship(
35 |     parent_set = "set_1",
36 |     child_set = "set_2",
37 |     parent_idx = "key",
38 |     child_idx = "key"
39 |   ) \%>\%
40 |   dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\%
41 |   extract_features() \%>\%
42 |   save_features(filename = "some.features")
43 | 
44 | # Re-create entityset, but rather than dfs use calcualte_feature_matrix.
45 | es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\%
46 |   add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\%
47 |   add_relationship(
48 |     parent_set = "set_1",
49 |     child_set = "set_2",
50 |     parent_idx = "key",
51 |     child_idx = "key"
52 |   )
53 | calculate_feature_matrix(entityset = es, features = load_features("some.features"))
54 | }
55 | }
56 | 


--------------------------------------------------------------------------------
/R/as_entityset.R:
--------------------------------------------------------------------------------
 1 | #' Create entityset and entity from data frame.
 2 | #' @description Create an entityset with a selected `data.frame` as an entity.
 3 | #' @export
 4 | #'
 5 | #' @importFrom dplyr ungroup
 6 | #'
 7 | #' @param .data The `data.frame` to be added as an entity to entityset.
 8 | #' @param id The id of this entityset.
 9 | #' @param index Name of id column in the dataframe.
10 | #' @param time_index Name of the time column in the dataframe.
11 | #' @param entity_id An identifier for this entity.
12 | #' @param ... Additional variables passed to `add_entity`.
13 | #' @return A modified entityset.
14 | #'
15 | #' @examples
16 | #' \donttest{
17 | #' as_entityset(cars, index = "row_number")
18 | #' }
19 | as_entityset <- function(
20 |   .data,
21 |   id = "entityset",
22 |   index = NA,
23 |   time_index = NULL,
24 |   entity_id = "df1",
25 |   ...
26 | ) {
27 | 
28 |   # Sanitize input.
29 |   if (!is.data.frame(.data)) stop("`.data` is not of type `data.frame`")
30 |   if (is.na(id)) stop("`id` cannot be `NA`. Leave empty for default name.")
31 |   if (nrow(.data) == 0) warning("`.data` contains zero rows.`")
32 | 
33 |   # Create entityset.
34 |   es <- .ft$EntitySet(id = id)
35 | 
36 |   # If index is unset (`NA` or `NULL`), warn user and create a new index
37 |   # variable. `seq_len()` keeps the assignment valid for zero-row input.
38 |   if (is.null(index) || is.na(index)) {
39 |     warning("`index` is `NA`. Using new variable `rownumber` as index.")
40 |     .data <- dplyr::ungroup(.data)
41 |     .data$rownumber <- seq_len(nrow(.data))
42 |     index <- "rownumber"
43 |   }
44 | 
45 |   # Fix reticulate datetime64 support: convert `Date` columns of `.data` to
46 |   # POSIXct. This has to operate on `.data`; no `df` is defined in this
47 |   # function, so referring to `df` here would never touch the input.
48 |   if (isTRUE(getOption("featuretoolsR.force_posixct"))) {
49 |     tz <- getOption("featuretoolsR.posixct_tz", "UTC")
50 |     for (colname in names(.data)) {
51 |       if (inherits(.data[[colname]], "Date")) {
52 |         .data[[colname]] <- as.POSIXct(.data[[colname]], tz = tz)
53 |       }
54 |     }
55 |   }
56 | 
57 |   # Add first entity to entityset.
58 |   add_entity(
59 |     entityset = es,
60 |     entity_id = entity_id,
61 |     df = .data,
62 |     index = index,
63 |     time_index = time_index,
64 |     ...
65 |   )
66 | }
67 | 


--------------------------------------------------------------------------------
/R/calculate_feature_matrix.R:
--------------------------------------------------------------------------------
 1 | #' Calculate feature matrix
 2 | #' @description This function is used to create a feature matrix based on a custom list of features (usually created from \link[featuretoolsR]{save_features}).
 3 | #' @export
 4 | #'
 5 | #' @param entityset The entityset on which to create features.
 6 | #' @param features The features to create based on previous runs of \link[featuretoolsR]{dfs}.
 7 | #' @param ... Additional parameters passed to `featuretools.calculate_feature_matrix`.
 8 | #' @return A feature matrix
 9 | #'
10 | #' @examples
11 | #' \donttest{
12 | #' library(magrittr)
13 | #'
14 | #' # Create some mock data
15 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
16 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
17 | #' # Common variable: `key`
18 | #'
19 | #' # Create features and save them
20 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
21 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
22 | #'   add_relationship(
23 | #'     parent_set = "set_1",
24 | #'     child_set = "set_2",
25 | #'     parent_idx = "key",
26 | #'     child_idx = "key"
27 | #'   ) %>%
28 | #'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
29 | #'   extract_features() %>%
30 | #'   save_features(filename = "some.features")
31 | #'
32 | #' # Re-create entityset, but rather than dfs use calculate_feature_matrix.
33 | #' es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
34 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
35 | #'   add_relationship(
36 | #'     parent_set = "set_1",
37 | #'     child_set = "set_2",
38 | #'     parent_idx = "key",
39 | #'     child_idx = "key"
40 | #'   )
41 | #' calculate_feature_matrix(entityset = es, features = load_features("some.features"))
42 | #' }
43 | calculate_feature_matrix <- function(
44 |   entityset,
45 |   features,
46 |   ...
47 | ) {
48 |   # Delegate to the Python implementation; anything given through `...`
49 |   # is forwarded unchanged.
50 |   feature_matrix <- .ft$calculate_feature_matrix(
51 |     features = features,
52 |     entityset = entityset,
53 |     ...
54 |   )
55 | 
56 |   feature_matrix
57 | }
58 | 


--------------------------------------------------------------------------------
/R/install_featuretools.R:
--------------------------------------------------------------------------------
 1 | #' Install featuretools
 2 | #' @description Setup for featuretools in its own virtualenv, or into the default reticulate virtualenv.
 3 | #'
 4 | #' @param custom_virtualenv Set to true if you wish to use a custom virtualenv for featuretoolsR.
 5 | #' @param method The installation method passed to \link[reticulate]{py_install}. Defaults to "auto".
 6 | #' @param conda Whether to use conda or not. Passed to \link[reticulate]{py_install}. Defaults to "auto".
 7 | #' @export
 8 | #'
 9 | #' @examples
10 | #' \dontrun{
11 | #' featuretoolsR::install_featuretools()
12 | #' }
13 | install_featuretools <- function(custom_virtualenv = FALSE, method = "auto", conda = "auto") {
14 |   # See if conda, pip or pip3 is installed.
15 |   tools <- c("conda", "pip", "pip3")
16 |   status <- vapply(tools, cli_is_installed, logical(1))
17 |   if (!any(status)) {
18 |     stop("Neither `pip`, `pip3` or `conda` was found. At least one is required to install Featuretools.")
19 |   }
20 | 
21 |   # Installation
22 |   if (custom_virtualenv) {
23 |     virtualenv_name <- getOption("featuretoolsR.virtualenv_name")
24 |     path <- file.path(reticulate::virtualenv_root(), virtualenv_name)
25 |     if (!dir.exists(path)) {
26 |       reticulate::virtualenv_create(virtualenv_name)
27 |     } else {
28 |       message("Using existing virtualenv in ", path)
29 |     }
30 | 
31 |     # Install featuretools unless it can already be imported. NOTE(review):
32 |     # presumably this consults the currently bound Python - confirm.
33 |     if (!reticulate::py_module_available("featuretools")) {
34 |       message("Installing featuretools into ", path)
35 |       reticulate::virtualenv_install(virtualenv_name, packages = "featuretools")
36 |     }
37 | 
38 |     # Use new virtualenv
39 |     reticulate::use_virtualenv(virtualenv_name)
40 |   } else {
41 |     reticulate::py_install("featuretools", method = method, conda = conda)
42 |   }
43 | 
44 |   # Reload library. A session restart can only be requested from within
45 |   # RStudio; elsewhere tell the user instead of failing after the install.
46 |   unloadNamespace("featuretoolsR")
47 |   if (rstudioapi::isAvailable()) {
48 |     rstudioapi::restartSession("library(featuretoolsR)")
49 |   } else {
50 |     message("Restart R and run `library(featuretoolsR)` to finish the setup.")
51 |   }
52 | }
53 | 
54 | cli_is_installed <- function(command) {
55 |   # Resolve `command` on the PATH instead of executing it: probing no longer
56 |   # launches the tool itself. Sys.which() yields "" when nothing is found.
57 |   location <- Sys.which(command)
58 |   found <- !is.na(location) & nzchar(location)
59 | 
60 |   isTRUE(all(found))
61 | }
62 | 


--------------------------------------------------------------------------------
/R/dfs.R:
--------------------------------------------------------------------------------
 1 | #' Deep Feature Synthesis
 2 | #' @description The main function from featuretools used to create new features.
 3 | #' @export
 4 | #'
 5 | #' @param entityset The entityset on which to perform dfs.
 6 | #' @param target_entity The name of the entity on which to perform dfs.
 7 | #' @param agg_primitives Primitives passed to relational data.
 8 | #' @param trans_primitives Primitives passed to non-relational data.
 9 | #' @param max_depth Controls the maximum depth of features.
10 | #' @param ... Additional parameters passed to `featuretools.dfs`.
11 | #' @return A `featuretools` feature matrix.
12 | #'
13 | #' @examples
14 | #' \donttest{
15 | #' es <- as_entityset(cars, index = "row_number")
16 | #' dfs(es, target_entity = "df1", trans_primitives = c("and"))
17 | #' }
18 | dfs <- function(
19 |   entityset,
20 |   target_entity,
21 |   agg_primitives = NULL,
22 |   trans_primitives = NULL,
23 |   max_depth = 2L,
24 |   ...
25 | ) {
26 |   # Ensure primitives are in the correct format
27 |   if (!is.list(agg_primitives)) {
28 |     agg_primitives <- as.list(agg_primitives)
29 |   }
30 |   if (!is.list(trans_primitives)) {
31 |     trans_primitives <- as.list(trans_primitives)
32 |   }
33 | 
34 |   # Ensure primitives are valid. The primitive catalogue is looked up once
35 |   # and reused for both checks.
36 |   primitives <- list_primitives()
37 |   aggs <- primitives[primitives$type == "aggregation", "name"]
38 |   .agg_primitives <- unlist(agg_primitives)
39 |   bad_aggs <- .agg_primitives[!(.agg_primitives %in% aggs)]
40 |   if (length(bad_aggs) > 0) {
41 |     invalid <- paste0(bad_aggs, collapse = "`, `")
42 |     stop("Invalid aggregate primitive(s): `", invalid, "`. Use list_primitives() to find valid primitives.")
43 |   }
44 | 
45 |   trans <- primitives[primitives$type == "transform", "name"]
46 |   .trans_primitives <- unlist(trans_primitives)
47 |   bad_trans <- .trans_primitives[!(.trans_primitives %in% trans)]
48 |   if (length(bad_trans) > 0) {
49 |     invalid <- paste0(bad_trans, collapse = "`, `")
50 |     stop("Invalid transform primitive(s): `", invalid, "`. Use list_primitives() to find valid primitives.")
51 |   }
52 | 
53 |   # DFS
54 |   feature_matrix <- .ft$dfs(
55 |     entityset = entityset,
56 |     target_entity = target_entity,
57 |     agg_primitives = reticulate::r_to_py(agg_primitives),
58 |     trans_primitives = reticulate::r_to_py(trans_primitives),
59 |     max_depth = max_depth,
60 |     ...
61 |   )
62 | 
63 |   return(feature_matrix)
64 | }


--------------------------------------------------------------------------------
/R/save_features.R:
--------------------------------------------------------------------------------
 1 | #' Save features
 2 | #' @description Used to save all or a subset of features created during \link[featuretoolsR]{dfs}.
 3 | #' @export
 4 | #'
 5 | #' @importFrom stringr str_sub
 6 | #' @importFrom tibble is_tibble
 7 | #'
 8 | #' @param .data The tibble of features returned from \link[featuretoolsR]{extract_features}.
 9 | #' @param filename (optional) The name of the file to produce.
10 | #' @param path (optional) The path where the feature file should be placed.
11 | #'
12 | #' @examples
13 | #' \donttest{
14 | #' library(magrittr)
15 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
16 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
17 | #' # Common variable: `key`
18 | #'
19 | #' dir <- tempdir()
20 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
21 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
22 | #'   add_relationship(
23 | #'     parent_set = "set_1",
24 | #'     child_set = "set_2",
25 | #'     parent_idx = "key",
26 | #'     child_idx = "key"
27 | #'   ) %>%
28 | #'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
29 | #'   extract_features() %>%
30 | #'   save_features(filename = "some.features", path = dir)
31 | #' }
32 | save_features <- function(
33 |   .data,
34 |   filename = NA,
35 |   path = NA
36 | ) {
37 | 
38 |   # Sanitize input
39 |   ## Input should be a tibble with exactly the variables `name` and `feature`.
40 |   ## identical() compares the whole name vector at once; an element-wise `!=`
41 |   ## would recycle against inputs with a different number of columns.
42 |   if (!tibble::is_tibble(.data) ||
43 |       !identical(colnames(.data), c("name", "feature"))) {
44 |     stop("Bad input. Did you forget to use `extract_features`?")
45 |   }
46 | 
47 |   ## If user didn't set path, use working directory.
48 |   ## (For featuretools' save_features, we need the full path)
49 |   if (is.na(path)) {
50 |     warning("No `path` set, defaulting to working directory\n")
51 |     path <- getwd()
52 |   }
53 |   # Use forward slashes on Windows too and end the directory with a slash.
54 |   path <- normalizePath(path, winslash = "/")
55 |   if (!endsWith(path, "/")) {
56 |     path <- paste0(path, "/")
57 |   }
58 | 
59 |   ## If user didn't specify a file name, generate one.
60 |   if (is.na(filename)) {
61 |     filename <- paste0(paste0(sample(c(letters, LETTERS), 16, FALSE), collapse = ""), ".features")
62 |     warning("No `filename` passed, generated: ", filename)
63 |   }
64 |   location <- paste0(path, filename)
65 | 
66 |   # Save all features passed from `extract_features`.
67 |   .ft$save_features(
68 |     features = .data$feature,
69 |     location = location
70 |   )
71 | 
72 |   return(TRUE)
73 | }
74 | 
75 | 
76 | 
77 | 


--------------------------------------------------------------------------------
/R/tidy_feature_matrix.R:
--------------------------------------------------------------------------------
 1 | #' Tidy feature matrix
 2 | #' @description Used for tidying up ('R-ify') the feature matrix after deep feature synthesis (\link[featuretoolsR]{dfs}).
 3 | #' @export
 4 | #'
 5 | #' @param .data The featuretools-object returned from \link[featuretoolsR]{dfs}.
 6 | #' @param remove_nzv Remove near zero variance variables created from \link[featuretoolsR]{dfs}.
 7 | #' @param nan_is_na Turn all `NaN` into `NA`.
 8 | #' @param clean_names Make variable names R-friendly (snake case).
 9 | #' @return A tidy data.frame.
10 | #'
11 | #' @importFrom caret nearZeroVar
12 | #' @importFrom purrr map
13 | #' @importFrom tibble as_tibble
14 | #'
15 | #' @examples
16 | #' \donttest{
17 | #' library(magrittr)
18 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
19 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
20 | #' # Common variable: `key`
21 | #'
22 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
23 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
24 | #'   add_relationship(
25 | #'     parent_set = "set_1",
26 | #'     child_set = "set_2",
27 | #'     parent_idx = "key",
28 | #'     child_idx = "key"
29 | #'   ) %>%
30 | #'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
31 | #'   tidy_feature_matrix(remove_nzv = TRUE, nan_is_na = TRUE)
32 | #' }
33 | tidy_feature_matrix <- function(
34 |   .data,
35 |   remove_nzv = FALSE,
36 |   nan_is_na = FALSE,
37 |   clean_names = FALSE
38 | ) {
39 |   # Coerce into R-object.
40 |   to_r <- tibble::as_tibble(.data[[1]])
41 | 
42 |   # Variables get duplicated when coercing object from Python to R. Cleanup.
43 |   nondupe <- to_r[, !duplicated(names(to_r))]
44 | 
45 |   # Process `nondupe` according to user defined parameters.
46 |   ## Remove near zero variance
47 |   if (remove_nzv) {
48 |     # One flag per variable; every variable is assessed on its own.
49 |     is_nzv <- vapply(names(nondupe), function(colname) {
50 |       caret::nearZeroVar(nondupe[, colname], saveMetrics = TRUE)$nzv
51 |     }, logical(1))
52 | 
53 |     # Update nondupe-set.
54 |     nondupe <- nondupe[, !unname(is_nzv)]
55 |   }
56 | 
57 |   ## Replace all `NaN` with `NA`
58 |   if (nan_is_na) {
59 |     for (colname in names(nondupe)) {
60 |       column <- nondupe[[colname]]
61 |       # is.nan() is not implemented for lists, so list columns are left alone.
62 |       if (is.atomic(column)) {
63 |         column[is.nan(column)] <- NA
64 |         nondupe[[colname]] <- column
65 |       }
66 |     }
67 |   }
68 | 
69 |   ## Make variable names more R-friendly
70 |   if (clean_names) {
71 |     n <- tolower(names(nondupe))
72 |     # `[^A-z0-9]` also let through the punctuation that sits between `Z` and
73 |     # `a` in ASCII (`[`, `\`, `]`, `^` and the backtick); list the kept set.
74 |     tn <- gsub("[^a-z0-9]", "_", n)
75 |     # NOTE(review): `__+?` removes a pair of underscores outright, so `a__b`
76 |     # becomes `ab`; kept unchanged because existing names depend on it.
77 |     tn <- gsub("(_+?$)|(__+?)", "", tn)
78 |     names(nondupe) <- tn
79 |   }
80 | 
81 |   # Back to data.frame
82 |   as.data.frame(nondupe)
83 | }
84 | 


--------------------------------------------------------------------------------
/R/add_relationship.R:
--------------------------------------------------------------------------------
 1 | #' Add a relationship to an entityset
 2 | #' @description Add a relationship to an entityset.
 3 | #' @export
 4 | #'
 5 | #' @param entityset The entityset to modify.
 6 | #' @param parent_set The name of the parent set.
 7 | #' @param child_set The name of the child set.
 8 | #' @param parent_idx The index variable of the `parent_set`.
 9 | #' @param child_idx The index variable of the `child_set`. Defaults to `parent_idx`.
10 | #' @return A modified entityset.
11 | #'
12 | #' @examples
13 | #' \donttest{
14 | #' library(magrittr)
15 | #' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
16 | #' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
17 | #' # Common variable: `key`
18 | #'
19 | #' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
20 | #'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
21 | #'   add_relationship(
22 | #'     parent_set = "set_1",
23 | #'     child_set = "set_2",
24 | #'     parent_idx = "key",
25 | #'     child_idx = "key"
26 | #'   )
27 | #' }
28 | add_relationship <- function(
29 |   entityset,
30 |   parent_set,
31 |   child_set,
32 |   parent_idx,
33 |   child_idx = NULL
34 | ) {
35 |   if (is.null(child_idx)) {
36 |     child_idx <- parent_idx
37 |   }
38 | 
39 |   # Find indexes for entities and variables inside entitysets
40 |   entity_names <- names(entityset$entity_dict)
41 |   es_names <- do.call(rbind, lapply(
42 |     X = seq_along(entityset$entities),
43 |     FUN = function(set) {
44 |       variables <- unlist(lapply(entityset$entities[[set]]$variables, function(x) x$id))
45 |       data.frame(
46 |         variable_name = variables,
47 |         variable_idx = seq_along(variables),
48 |         entity_name = entity_names[[set]],
49 |         entity_idx = set,
50 |         stringsAsFactors = FALSE
51 |       )
52 |     }
53 |   ))
54 | 
55 |   # Name an unknown entity in the error; `[[1]]` on an empty match below
56 |   # would only report "subscript out of bounds".
57 |   for (set_name in c(parent_set, child_set)) {
58 |     if (!(set_name %in% es_names$entity_name)) {
59 |       stop("Couldn't find entity `", set_name, "` in entityset")
60 |     }
61 |   }
62 |   entity_parent_set_pos <- es_names$entity_idx[es_names$entity_name == parent_set][[1]]
63 |   entity_child_set_pos <- es_names$entity_idx[es_names$entity_name == child_set][[1]]
64 |   index_parent_set_pos <- es_names$variable_idx[es_names$variable_name == parent_idx & es_names$entity_name == parent_set]
65 |   index_child_set_pos <- es_names$variable_idx[es_names$variable_name == child_idx & es_names$entity_name == child_set]
66 | 
67 |   if (length(index_parent_set_pos) == 0) {
68 |     stop("Couldn't find index column `", parent_idx, "` in `", parent_set, "`")
69 |   }
70 |   if (length(index_child_set_pos) == 0) {
71 |     stop("Couldn't find index column `", child_idx, "` in `", child_set, "`")
72 |   }
73 | 
74 |   # Construct new relationship
75 |   rel <- .ft$Relationship(
76 |     entityset$entities[[entity_parent_set_pos]]$variables[[index_parent_set_pos]],
77 |     entityset$entities[[entity_child_set_pos]]$variables[[index_child_set_pos]]
78 |   )
79 | 
80 |   # Add relationship to entityset
81 |   return(entityset$add_relationship(rel))
82 | }
83 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # featuretoolsR
 2 | An R interface to the Python module Featuretools.
 3 | 
 4 | # General
 5 | `featuretoolsR` provides functionality from the Python module `featuretools`, which aims to automate feature engineering. This package is very much a work in progress as Featuretools offers a lot of functionality. Any PRs are much appreciated.
 6 | 
 7 | # Installing
 8 | 
 9 | ## Package
10 | ### CRAN
11 | The latest stable release is found on [CRAN](https://cran.r-project.org/package=featuretoolsR).
12 | 
13 | ### Github
14 | You can get the latest version of `featuretoolsR` by installing it straight from Github:  `devtools::install_github("magnusfurugard/featuretoolsR")`.
15 | 
16 | ## Featuretools
17 | You'll need to have a working Python environment as well as `featuretools` installed. The recommended way is to use the built-in function `install_featuretools()` which automatically sets up a virtual environment for the package and installs `featuretools`.
18 | 
19 | # Usage
20 | All functions in `featuretoolsR` come with documentation, but it's advised to briefly browse through the [Featuretools Python documentation](https://docs.featuretools.com/). It'll cover things like `entities`, `relationships` and `dfs`. 
21 | 
22 | ## Creating an entityset
23 | An entityset is the set which contains all your entities. To create a set and add an entity straight away, you can use `as_entityset`. 
24 | ```
25 | # Libs
26 | library(featuretoolsR)
27 | library(magrittr)
28 | 
29 | # Create some mock data
30 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, T), a = rep(Sys.Date(), 100))
31 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, T), b = rep(Sys.time(), 100))
32 | 
33 | # Create entityset
34 | es <- as_entityset(
35 |   set_1, 
36 |   index = "key", 
37 |   entity_id = "set_1", 
38 |   id = "demo", 
39 |   time_index = "a"
40 | )
41 | ```
42 | 
43 | ## Adding entities
44 | To add entities (i.e if you have relational data across multiple `data.frames`), this can be achieved with `add_entity`. This function is pipe friendly. For this demo-case, we'll use `set_2`.
45 | ```
46 | es <- es %>%
47 |   add_entity(
48 |     df = set_2, 
49 |     entity_id = "set_2", 
50 |     index = "key", 
51 |     time_index = "b"
52 |   )
53 | ```
54 | 
55 | ## Defining relationships
56 | With relational data, it's useful to define a relationship between two or more entities. This can be done with `add_relationship`.
57 | ```
58 | es <- es %>%
59 |   add_relationship(
60 |     parent_set = "set_1", 
61 |     child_set = "set_2", 
62 |     parent_idx = "key", 
63 |     child_idx = "key"
64 |   )
65 | ```
66 | 
67 | ## Deep feature synthesis
68 | The bread and butter of Featuretools is the `dfs`-function (official docs [here](https://docs.featuretools.com/en/stable/automated_feature_engineering/afe.html)). It will attempt to create features based on `*_primitives` you provide (more on primitives below).
69 | ```
70 | ft_matrix <- es %>%
71 |   dfs(
72 |     target_entity = "set_1", 
73 |     trans_primitives = c("and", "cum_sum")
74 |   )
75 | ```
76 | 
77 | ## Tidying up
78 | To use the new data.frame/features created by `dfs`, a function unique for `featuretoolsR`, `tidy_feature_matrix` can be used. A few "nice-to-have" arguments can be passed to clean the new data, like removing near zero variance variables, as well as replacing `NaN` with `NA`.
79 | ```
80 | tidy <- tidy_feature_matrix(ft_matrix, remove_nzv = T, nan_is_na = T, clean_names = T)
81 | ```
82 | 
83 | # Primitives
84 | Featuretools supports a lot of primitives. These are accessible with the function `list_primitives()` which returns a data.frame containing type (aggregation (`agg_primitives`) or transform (`trans_primitives`)), name (in the example above, "and" and "cum_sum") as well as a brief description of the primitive itself.
85 | 
86 | # Credits
87 | [reticulate](https://github.com/rstudio/reticulate) - an R interface to Python.
88 | 
89 | [Featuretools](https://github.com/Featuretools/featuretools)
90 | 


--------------------------------------------------------------------------------
/tests/testthat/test_main.R:
--------------------------------------------------------------------------------
  1 | # These tests ensure that the most used functions of this package work. Other
  2 | # utility functions are tested separately.
  3 | 
  4 | # Common functions for all tests.
  5 | source("utils.R")
  6 | 
  7 | # Create mock data for tests (magrittr supplies the `%>%` pipe used below).
  8 | library(magrittr)
  9 | set_size <- 2
 10 | value_variables <- 2
 11 | set_1 <- generate_mock_data(set_size, value_variables)
 12 | set_2 <- generate_mock_data(set_size, value_variables)
 13 | 
 14 | # A data frame can be turned into an entityset directly.
 15 | test_that("can create entity through as_entityset", {
 16 |   skip_if_no_featuretools()
 17 | 
 18 |   entityset <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 19 |   expect_true(inherits(entityset, "featuretools.entityset.entityset.EntitySet"))
 20 | })
 21 | 
 22 | # An entityset built from one data frame accepts a second entity.
 23 | test_that("can add entity to entityset with add_entity", {
 24 |   skip_if_no_featuretools()
 25 | 
 26 |   first <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 27 |   es <- add_entity(first, entity_id = "set_2", df = set_2, index = "key")
 28 |   expect_true(2 == length(es$entities))
 29 |   expect_true(all(c("set_1", "set_2") == names(es$entity_dict)))
 30 | })
 31 | 
 32 | # Two entities can be linked through an explicitly named pair of indexes.
 33 | test_that("can add relationship between two entities", {
 34 |   skip_if_no_featuretools()
 35 | 
 36 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 37 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
 38 |   es <- add_relationship(
 39 |     es,
 40 |     parent_set = "set_1",
 41 |     child_set = "set_2",
 42 |     parent_idx = "key",
 43 |     child_idx = "key"
 44 |   )
 45 |   expect_true(is.list(es$relationships) && length(es$relationships) > 0)
 46 | })
 47 | 
 48 | test_that("relationship can be inherited from parent_idx", {
 49 |   skip_if_no_featuretools()
 50 | 
 51 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 52 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
 53 |   es <- add_relationship(
 54 |     es,
 55 |     parent_set = "set_1",
 56 |     child_set = "set_2",
 57 |     parent_idx = "key" # `child_idx` is deliberately left out
 58 |   )
 59 |   expect_true(is.list(es$relationships) && length(es$relationships) > 0)
 60 | })
 61 | 
 62 | 
 63 | # Deep feature synthesis
 64 | test_that("can perform dfs", {
 65 |   skip_if_no_featuretools()
 66 | 
 67 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 68 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
 69 |   es <- add_relationship(
 70 |     es,
 71 |     parent_set = "set_1",
 72 |     child_set = "set_2",
 73 |     parent_idx = "key",
 74 |     child_idx = "key"
 75 |   )
 76 |   result <- dfs(es, target_entity = "set_1", trans_primitives = c("and"))
 77 |   expect_true(length(result[[2]]) == 2)
 78 | })
 79 | 
 80 | # Feature extraction
 81 | test_that("can extract features from dfs", {
 82 |   skip_if_no_featuretools()
 83 | 
 84 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
 85 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
 86 |   es <- add_relationship(
 87 |     es,
 88 |     parent_set = "set_1",
 89 |     child_set = "set_2",
 90 |     parent_idx = "key",
 91 |     child_idx = "key"
 92 |   )
 93 |   features <- extract_features(dfs(es, target_entity = "set_1", trans_primitives = c("and")))
 94 | 
 95 |   expect_true(all(c("name", "feature") == names(features)))
 96 |   expect_true(value_variables == nrow(features))
 97 |   expect_true(value_variables == length(features$feature))
 98 |   expect_true(identical(class(features$feature), "list"))
 99 | })
100 | 
101 | # Storing features locally
102 | test_that("can save features", {
103 |   skip_if_no_featuretools()
104 | 
105 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
106 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
107 |   es <- add_relationship(
108 |     es,
109 |     parent_set = "set_1",
110 |     child_set = "set_2",
111 |     parent_idx = "key",
112 |     child_idx = "key"
113 |   )
114 |   features <- extract_features(dfs(es, target_entity = "set_1", trans_primitives = c("and")))
115 |   save_features(features, filename = "some.features", path = ".")
116 | 
117 |   expect_true(file.exists("some.features"))
118 | })
119 | 
120 | # Loading stored features (presumably the file written by the test above).
121 | test_that("can load features", {
122 |   skip_if_no_featuretools()
123 | 
124 |   loaded <- load_features("some.features")
125 |   expect_false(is.null(loaded))
126 | 
127 |   # Remove the feature file again so it does not linger after the run.
128 |   if (file.exists("some.features")) file.remove("some.features")
129 | })
130 | 
131 | # Tidying a feature matrix
132 | test_that("can tidy feature matrix after dfs", {
133 |   skip_if_no_featuretools()
134 | 
135 |   es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
136 |   es <- add_entity(es, entity_id = "set_2", df = set_2, index = "key")
137 |   es <- add_relationship(
138 |     es,
139 |     parent_set = "set_1",
140 |     child_set = "set_2",
141 |     parent_idx = "key",
142 |     child_idx = "key"
143 |   )
144 |   tidy <- tidy_feature_matrix(
145 |     dfs(es, target_entity = "set_1", trans_primitives = c("and")),
146 |     remove_nzv = TRUE,
147 |     nan_is_na = TRUE,
148 |     clean_names = TRUE
149 |   )
150 |   expect_true(set_size == nrow(tidy))
151 |   expect_true(!any(is.nan(tidy$value)))
152 |   expect_true(!any(grepl("[^A-z0-9_]", names(tidy))))
153 | })
154 | 


--------------------------------------------------------------------------------