├── .Rbuildignore ├── LICENSE ├── .gitignore ├── tests ├── testthat.R └── testthat │ ├── utils.R │ ├── test_utils.R │ └── test_main.R ├── R ├── create_entityset.R ├── list_primitives.R ├── zzz.R ├── load_features.R ├── extract_features.R ├── add_entity.R ├── as_entityset.R ├── calculate_feature_matrix.R ├── install_featuretools.R ├── dfs.R ├── save_features.R ├── tidy_feature_matrix.R └── add_relationship.R ├── man ├── list_primitives.Rd ├── create_entityset.Rd ├── install_featuretools.Rd ├── as_entityset.Rd ├── dfs.Rd ├── add_entity.Rd ├── extract_features.Rd ├── add_relationship.Rd ├── load_features.Rd ├── save_features.Rd ├── tidy_feature_matrix.Rd └── calculate_feature_matrix.Rd ├── NAMESPACE ├── DESCRIPTION └── README.md /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.git$ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2020 2 | COPYRIGHT HOLDER: Magnus Furugård 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | *.Rproj 5 | .Ruserdata 6 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(featuretoolsR) 3 | test_check("featuretoolsR") 4 | -------------------------------------------------------------------------------- /R/create_entityset.R: -------------------------------------------------------------------------------- 1 | #' Create entityset 2 | #' @description Create a blank entityset. A shortcut for `featuretools'` `EntitySet`. 3 | #' @export 4 | #' 5 | #' @param id The id of this entityset. 
#' @return An entityset.
#'
#' @examples
#' \donttest{
#' create_entityset(id = "my_entityset")
#' }
create_entityset <- function(id) {
  # Thin wrapper: call featuretools' `EntitySet` constructor with the given id
  # and hand the resulting object straight back to the caller.
  .ft$EntitySet(id = id)
}

#' List all available primitives.
#' @description List all available primitives from `featuretools` which can be passed to \link[featuretoolsR]{dfs}.
#' @export
#'
#' @return A list of all primitives available.
#'
#' @examples
#' \donttest{
#' featuretoolsR::list_primitives()
#' }
list_primitives <- function() {
  # Forward to `featuretools.list_primitives()`; the result is returned as-is.
  primitives <- .ft$list_primitives()
  primitives
}
14 | } 15 | \examples{ 16 | \donttest{ 17 | featuretoolsR::list_primitives() 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /man/create_entityset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create_entityset.R 3 | \name{create_entityset} 4 | \alias{create_entityset} 5 | \title{Create entityset} 6 | \usage{ 7 | create_entityset(id) 8 | } 9 | \arguments{ 10 | \item{id}{The id of this entityset.} 11 | } 12 | \value{ 13 | An entityset. 14 | } 15 | \description{ 16 | Create a blank entityset. A shortcut for `featuretools'` `EntitySet`. 17 | } 18 | \examples{ 19 | \donttest{ 20 | create_entityset(id = "my_entityset") 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(add_entity) 4 | export(add_relationship) 5 | export(as_entityset) 6 | export(calculate_feature_matrix) 7 | export(create_entityset) 8 | export(dfs) 9 | export(extract_features) 10 | export(install_featuretools) 11 | export(list_primitives) 12 | export(load_features) 13 | export(save_features) 14 | export(tidy_feature_matrix) 15 | importFrom(caret,nearZeroVar) 16 | importFrom(dplyr,ungroup) 17 | importFrom(purrr,map) 18 | importFrom(stringr,str_sub) 19 | importFrom(tibble,as_tibble) 20 | importFrom(tibble,is_tibble) 21 | importFrom(tibble,tibble) 22 | -------------------------------------------------------------------------------- /tests/testthat/utils.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | 3 | featuretools_available <- function() { 4 | return(reticulate::py_module_available("featuretools")) 5 | } 6 | 7 | skip_if_no_featuretools <- function() { 8 | if 
(!featuretools_available()) 9 | skip("required featuretools module not available for testing") 10 | } 11 | 12 | generate_mock_data <- function(size = 2, value_variables = 2) { 13 | options(stringsAsFactors = TRUE) 14 | d <- data.frame(key = 1:size) 15 | for (i in 1:value_variables) { 16 | colnames <- c(names(d), paste0("value", i)) 17 | d <- cbind(d, data.frame("new" = sample(1:10, size, TRUE))) 18 | names(d) <- colnames 19 | } 20 | return(d) 21 | } 22 | -------------------------------------------------------------------------------- /tests/testthat/test_utils.R: -------------------------------------------------------------------------------- 1 | # Test utility functions used for the package. 2 | 3 | # Common functions for all tests. 4 | source("utils.R") 5 | 6 | # Create mock data for tests 7 | library(magrittr) 8 | set_size <- 2 9 | value_variables <- 2 10 | set_1 <- generate_mock_data(set_size) 11 | set_2 <- generate_mock_data(set_size) 12 | 13 | # Make sure we can list primitives 14 | test_that("can list primitives", { 15 | skip_if_no_featuretools() 16 | 17 | primitives <- list_primitives() 18 | expect_true(nrow(primitives) > 0) 19 | expect_true(all(names(primitives) == c("name", "type", "description"))) 20 | }) 21 | 22 | # Can create empty entityset 23 | test_that("can create empty entityset", { 24 | skip_if_no_featuretools() 25 | 26 | es <- create_entityset(id = "my_entityset") 27 | 28 | expect_true(is.list(es$entities)) 29 | expect_true(length(es$entities) == 0) 30 | }) 31 | -------------------------------------------------------------------------------- /man/install_featuretools.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/install_featuretools.R 3 | \name{install_featuretools} 4 | \alias{install_featuretools} 5 | \title{Install featuretools} 6 | \usage{ 7 | install_featuretools(custom_virtualenv = FALSE, method = "auto", 8 | conda = 
"auto") 9 | } 10 | \arguments{ 11 | \item{custom_virtualenv}{Defaults to false. Set to true if you wish to use a custom virtualenv for featuretoolsR.} 12 | 13 | \item{method}{The installation method passed to \link[reticulate]{py_install}. Defaults to "auto".} 14 | 15 | \item{conda}{Whether to use conda or not. Passed to `reticulate::py_install`. Defaults to "auto".} 16 | } 17 | \description{ 18 | Setup for featuretools in it's own virtualenv, or into the default reticulate virtualenv. 19 | } 20 | \examples{ 21 | \dontrun{ 22 | featuretoolsR::install_featuretools() 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/as_entityset.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/as_entityset.R 3 | \name{as_entityset} 4 | \alias{as_entityset} 5 | \title{Create entityset and entity from data frame.} 6 | \usage{ 7 | as_entityset(.data, id = "entityset", index = NA, time_index = NULL, 8 | entity_id = "df1", ...) 9 | } 10 | \arguments{ 11 | \item{.data}{The `data.frame` to be added as an entity to entityset.} 12 | 13 | \item{id}{The id of this entityset.} 14 | 15 | \item{index}{Name of id column in the dataframe.} 16 | 17 | \item{time_index}{Name of the time column in the dataframe.} 18 | 19 | \item{entity_id}{An identifier for this entity.} 20 | 21 | \item{...}{Additional variables passed to `add_entity`.} 22 | } 23 | \value{ 24 | A modified entityset. 25 | } 26 | \description{ 27 | Create an entityset with a selected `data.frame` as an entity. 
28 | } 29 | \examples{ 30 | \donttest{ 31 | as_entityset(cars, index = "row_number") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/dfs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dfs.R 3 | \name{dfs} 4 | \alias{dfs} 5 | \title{Deep Feature Synthesis} 6 | \usage{ 7 | dfs(entityset, target_entity, agg_primitives = NULL, 8 | trans_primitives = NULL, max_depth = 2L, ...) 9 | } 10 | \arguments{ 11 | \item{entityset}{The entityset on which to perform dfs.} 12 | 13 | \item{target_entity}{The name of the entity on which to perform dfs.} 14 | 15 | \item{agg_primitives}{Primitives passed to relational data.} 16 | 17 | \item{trans_primitives}{Primitives passed to non-relational data.} 18 | 19 | \item{max_depth}{Controls the maximum depth of features.} 20 | 21 | \item{...}{Additional parameters passed to `featuretools.dfs`.} 22 | } 23 | \value{ 24 | A `featuretools` feature matrix. 25 | } 26 | \description{ 27 | The main function from featuretools used to create new features. 28 | } 29 | \examples{ 30 | \donttest{ 31 | es <- as_entityset(cars, index = "row_number") 32 | dfs(es, target_entity = "df1", trans_primitives = c("and")) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /man/add_entity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/add_entity.R 3 | \name{add_entity} 4 | \alias{add_entity} 5 | \title{add_entity} 6 | \usage{ 7 | add_entity(entityset, entity_id, df, index = NULL, time_index = NULL, 8 | ...) 
9 | } 10 | \arguments{ 11 | \item{entityset}{The entity set to modify.} 12 | 13 | \item{entity_id}{The name of the entity to add.} 14 | 15 | \item{df}{The data frame to add as an entity.} 16 | 17 | \item{index}{The index parameter specifies the column that uniquely identifies rows in the dataframe} 18 | 19 | \item{time_index}{Name of the time column in the dataframe.} 20 | 21 | \item{...}{Additional parameters passed to `featuretools.entity_from_dataframe`.} 22 | } 23 | \value{ 24 | A modified entityset. 25 | } 26 | \description{ 27 | Add an entity to an entityset. 28 | } 29 | \examples{ 30 | \donttest{ 31 | library(magrittr) 32 | create_entityset("set") \%>\% 33 | add_entity(df = cars, 34 | entity_id = "cars", 35 | index = "row_number") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: featuretoolsR 2 | Type: Package 3 | Title: Interact with the 'Python' Module 'Featuretools' 4 | Version: 0.4.4 5 | Authors@R: person("Magnus", "Furugård", email = "magnus.furugard@gmail.com", role = c("aut", "cre")) 6 | Maintainer: Magnus Furugård 7 | Description: A 'reticulate'-based interface to the 'Python' module 'Featuretools'. 8 | The package grants functionality to interact with 'Pythons' 'Featuretools' module, which allows 9 | for automated feature engineering on any data frame. Valid features and new data sets can, after 10 | feature synthesis, easily be extracted. 
11 | License: MIT + file LICENSE 12 | URL: https://github.com/magnusfurugard/featuretoolsR 13 | BugReports: https://github.com/magnusfurugard/featuretoolsR/issues 14 | Depends: 15 | R (>= 3.4.2) 16 | Imports: 17 | reticulate, 18 | caret, 19 | dplyr, 20 | purrr, 21 | stringr, 22 | tibble, 23 | magrittr, 24 | cli, 25 | testthat, 26 | rstudioapi 27 | Encoding: UTF-8 28 | LazyData: true 29 | RoxygenNote: 6.1.1 30 | -------------------------------------------------------------------------------- /man/extract_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/extract_features.R 3 | \name{extract_features} 4 | \alias{extract_features} 5 | \title{Extract features} 6 | \usage{ 7 | extract_features(.data) 8 | } 9 | \arguments{ 10 | \item{.data}{The featuretools-object returned from \link[featuretoolsR]{dfs}.} 11 | } 12 | \value{ 13 | All features created during \link[featuretoolsR]{dfs}, as a tibble. 14 | } 15 | \description{ 16 | This function is used to extract all features created from \link[featuretoolsR]{dfs}. 
# Attach hook: prints a banner with the package version followed by one status
# line describing whether pip / featuretools can be found.
#
# Every line is built as a string and emitted through packageStartupMessage().
# The previous cli::cat_*() helpers wrote directly to stdout, so the banner
# could not be silenced with suppressPackageStartupMessages(). The status
# bullet is now plain text (tick / cross symbol, no colour).
#
# NOTE(review): reticulate::py_module_available() looks like it has to bind to
# a Python installation at attach time -- confirm this is intended given the
# delayed import in .onLoad().
.onAttach <- function(...) {
  # Prefix `text` with one of the glyphs in `cli::symbol` ("tick", "cross").
  bullet <- function(text, glyph) {
    paste(cli::symbol[[glyph]], text)
  }

  banner <- cli::boxx(
    paste("featuretoolsR", utils::packageVersion("featuretoolsR")),
    padding = c(0, 3, 0, 3),
    border_style = "double"
  )

  if (!reticulate::py_module_available("pip")) {
    status <- bullet(
      "pip is not installed. Please install pip to proceed.",
      "cross"
    )
  } else if (!reticulate::py_module_available("featuretools")) {
    # featuretools is missing: point the user at the installer.
    status <- bullet(
      "Featuretools unavailable. Please run `install_featuretools()`, or install featuretools with pip.",
      "cross"
    )
  } else {
    # Display featuretools info.
    ft <- paste("Using Featuretools", reticulate::py_get_attr(.ft, "__version__"))
    status <- bullet(ft, "tick")
  }

  packageStartupMessage(
    paste(c(as.character(banner), status), collapse = "\n")
  )
}

# Handle to the Python `featuretools` module; filled in by .onLoad().
.ft <- NULL

# Load hook: registers the delayed import of `featuretools` and installs the
# package's default options.
.onLoad <- function(...) {
  # `<<-` is deliberate here: it rebinds the namespace-level `.ft` placeholder.
  .ft <<- reticulate::import("featuretools", delay_load = TRUE)

  defaults <- list(
    featuretoolsR.force_posixct = TRUE,
    featuretoolsR.posixct_tz = "UTC",
    featuretoolsR.virtualenv_name = "featuretoolsR"
  )

  # Only fill in options that are not set yet, so values the user configured
  # before loading the package (e.g. in .Rprofile) are respected.
  unset <- setdiff(names(defaults), names(options()))
  if (length(unset) > 0) {
    options(defaults[unset])
  }

  invisible()
}
25 | } 26 | \examples{ 27 | \donttest{ 28 | library(magrittr) 29 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE) 30 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE) 31 | # Common variable: `key` 32 | 33 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\% 34 | add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\% 35 | add_relationship( 36 | parent_set = "set_1", 37 | child_set = "set_2", 38 | parent_idx = "key", 39 | child_idx = "key" 40 | ) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /man/load_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/load_features.R 3 | \name{load_features} 4 | \alias{load_features} 5 | \title{Load features} 6 | \usage{ 7 | load_features(file = NA) 8 | } 9 | \arguments{ 10 | \item{file}{The file containing the features.} 11 | } 12 | \description{ 13 | Used to load previously saved features created during \link[featuretoolsR]{dfs}. 
14 | } 15 | \examples{ 16 | \donttest{ 17 | library(magrittr) 18 | 19 | # Create mock datasets 20 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE) 21 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE) 22 | # Common variable: `key` 23 | 24 | # Use dfs to create features 25 | dir <- tempdir() 26 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\% 27 | add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\% 28 | add_relationship( 29 | parent_set = "set_1", 30 | child_set = "set_2", 31 | parent_idx = "key", 32 | child_idx = "key" 33 | ) \%>\% 34 | dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\% 35 | extract_features() \%>\% 36 | save_features(filename = "some.features", path = dir) 37 | 38 | # Load saves features 39 | features <- load_features(file.path(dir, "some.features")) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/save_features.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/save_features.R 3 | \name{save_features} 4 | \alias{save_features} 5 | \title{Save features} 6 | \usage{ 7 | save_features(.data, filename = NA, path = NA) 8 | } 9 | \arguments{ 10 | \item{.data}{The tibble of features returned from \link[featuretoolsR]{extract_features}.} 11 | 12 | \item{filename}{(optional) The name of the file to produce.} 13 | 14 | \item{path}{(optional) The path where the feature file should be placed.} 15 | } 16 | \description{ 17 | Used to save all or a subset of features created during \link[featuretoolsR]{dfs}. 
18 | } 19 | \examples{ 20 | \donttest{ 21 | library(magrittr) 22 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE) 23 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE) 24 | # Common variable: `key` 25 | 26 | dir <- tempdir() 27 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\% 28 | add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\% 29 | add_relationship( 30 | parent_set = "set_1", 31 | child_set = "set_2", 32 | parent_idx = "key", 33 | child_idx = "key" 34 | ) \%>\% 35 | dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\% 36 | extract_features() \%>\% 37 | save_features(filename = "some.features", path = dir) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /R/load_features.R: -------------------------------------------------------------------------------- 1 | #' Load features 2 | #' @description Used to load previously saved features created during \link[featuretoolsR]{dfs}. 3 | #' @export 4 | #' 5 | #' @param file The file containing the features. 
#'
#' @return The value returned by `featuretools.load_features` for `file`.
#'
#' @examples
#' \donttest{
#' library(magrittr)
#'
#' # Create mock datasets
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' # Use dfs to create features
#' dir <- tempdir()
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   ) %>%
#'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
#'   extract_features() %>%
#'   save_features(filename = "some.features", path = dir)
#'
#' # Load saved features
#' features <- load_features(file.path(dir, "some.features"))
#' }
load_features <- function(file = NA) {
  # Sanitize input. NULL, zero-length and NA all mean "nothing was supplied";
  # checking them explicitly avoids an unrelated error from `if ()` itself
  # ("argument is of length zero").
  nothing_given <- is.null(file) ||
    length(file) == 0L ||
    (length(file) == 1L && is.na(file))
  if (nothing_given) {
    stop("No file specified.")
  }
  if (!is.character(file) || length(file) != 1L) {
    stop("`file` must be a single character string.")
  }

  # Attempt to load the file. The path is normalised first and then handed to
  # featuretools, whose result is returned unchanged.
  .ft$load_features(
    normalizePath(file)
  )
}
24 | } 25 | \examples{ 26 | \donttest{ 27 | library(magrittr) 28 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE) 29 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE) 30 | # Common variable: `key` 31 | 32 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\% 33 | add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\% 34 | add_relationship( 35 | parent_set = "set_1", 36 | child_set = "set_2", 37 | parent_idx = "key", 38 | child_idx = "key" 39 | ) \%>\% 40 | dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\% 41 | tidy_feature_matrix(remove_nzv = TRUE, nan_is_na = TRUE) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /R/extract_features.R: -------------------------------------------------------------------------------- 1 | #' Extract features 2 | #' @description This function is used to extract all features created from \link[featuretoolsR]{dfs}. 3 | #' @export 4 | #' 5 | #' @param .data The featuretools-object returned from \link[featuretoolsR]{dfs}. 6 | #' @return All features created during \link[featuretoolsR]{dfs}, as a tibble. 
#'
#' @importFrom tibble tibble
#' @importFrom purrr map
#'
#' @examples
#' \donttest{
#' library(magrittr)
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   ) %>%
#'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
#'   extract_features()
#' }
extract_features <- function(.data) {
  # The feature objects are read from the second element of `.data`.
  features <- as.list(.data[[2]])

  # Ask every feature object for its name. vapply() always yields a character
  # vector -- character(0) for an empty feature list -- whereas
  # unlist(map(...)) yields NULL there, which made tibble() silently drop the
  # `name` column.
  feature_names <- vapply(
    features,
    function(feature) feature$get_name(),
    character(1),
    USE.NAMES = FALSE
  )

  # Construct informative tibble with features: one row per feature, holding
  # its name and the feature object itself (as a list column).
  tibble::tibble(
    name = feature_names,
    feature = features
  )
}

#' add_entity
#' @description Add an entity to an entityset.
#' @export
#'
#' @param entityset The entity set to modify.
#' @param entity_id The name of the entity to add.
#' @param df The data frame to add as an entity.
#' @param index The index parameter specifies the column that uniquely identifies rows in the dataframe
#' @param time_index Name of the time column in the dataframe.
#' @param ...
#'   Additional parameters passed to `featuretools.entity_from_dataframe`.
#' @return A modified entityset.
#'
#' @examples
#' \donttest{
#' library(magrittr)
#' create_entityset("set") %>%
#'   add_entity(df = cars,
#'              entity_id = "cars",
#'              index = "row_number")
#' }
add_entity <- function(
  entityset,
  entity_id,
  df,
  index = NULL,
  time_index = NULL,
  ...
) {
  # Construct variable_types to handle factors as categorical variables.
  #
  # Columns are inspected one at a time with `[[`, so this works for tibbles
  # as well as plain data frames, and the test is a scalar per column even for
  # columns with several classes (e.g. POSIXct). Only unordered factors are
  # mapped, as before; ordered factors are left to featuretools.
  is_plain_factor <- vapply(
    df,
    function(col) is.factor(col) && !is.ordered(col),
    logical(1)
  )

  variable_types <- list() # stays an empty list when there are no factors
  for (factor_col in names(df)[is_plain_factor]) {
    variable_types[[factor_col]] <- .ft$variable_types$Categorical
  }

  variable_types <- reticulate::r_to_py(variable_types)

  # Add df as entity to entityset.
  es <- entityset$entity_from_dataframe(
    entity_id = entity_id,
    dataframe = reticulate::r_to_py(x = df),
    index = index,
    time_index = time_index,
    variable_types = variable_types,
    ...
  )

  es
}
8 | } 9 | \arguments{ 10 | \item{entityset}{The entityset on which to create features.} 11 | 12 | \item{features}{The features to create based on previous runs of \link[featuretoolsR]{dfs}.} 13 | 14 | \item{...}{Additional parameters passed to `featuretoools.calculate_feature_matrix`.} 15 | } 16 | \value{ 17 | A feature matrix 18 | } 19 | \description{ 20 | This function is used to create a feature matrix based on a custom list of features (usually created from \link[featuretoolsR]{save_features}). 21 | } 22 | \examples{ 23 | \donttest{ 24 | library(magrittr) 25 | 26 | # Create some mock data 27 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE) 28 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE) 29 | # Common variable: `key` 30 | 31 | # Create features and save them 32 | as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") \%>\% 33 | add_entity(entity_id = "set_2", df = set_2, index = "key") \%>\% 34 | add_relationship( 35 | parent_set = "set_1", 36 | child_set = "set_2", 37 | parent_idx = "key", 38 | child_idx = "key" 39 | ) \%>\% 40 | dfs(target_entity = "set_1", trans_primitives = c("and")) \%>\% 41 | extract_features() \%>\% 42 | save_features(filename = "some.features") 43 | 44 | # Re-create entityset, but rather than dfs use calcualte_feature_matrix. 
#' Create entityset and entity from data frame.
#' @description Create an entityset with a selected `data.frame` as an entity.
#' @export
#'
#' @importFrom dplyr ungroup
#'
#' @details
#' When `index` is `NA` (the default) or `NULL`, a warning is raised and a new
#' column `rownumber` holding `1..nrow(.data)` is added and used as the index.
#'
#' When the option `featuretoolsR.force_posixct` is `TRUE`, every column of
#' class `Date` is converted with `as.POSIXct()` using the time zone in the
#' option `featuretoolsR.posixct_tz` before the data is handed to `add_entity`.
#'
#' @param .data The `data.frame` to be added as an entity to entityset.
#' @param id The id of this entityset.
#' @param index Name of id column in the dataframe.
#' @param time_index Name of the time column in the dataframe.
#' @param entity_id An identifier for this entity.
#' @param ... Additional variables passed to `add_entity`.
#' @return A modified entityset.
#'
#' @examples
#' \donttest{
#' as_entityset(cars, index = "row_number")
#' }
as_entityset <- function(
  .data,
  id = "entityset",
  index = NA,
  time_index = NULL,
  entity_id = "df1",
  ...
) {

  # Sanitize input.
  if (!is.data.frame(.data)) stop("`.data` is not of type `data.frame`")
  if (is.na(id)) stop("`id` cannot be `NA`. Leave empty for default name.")
  if (nrow(.data) == 0) warning("`.data` contains zero rows.`")

  # Create entityset.
  es <- .ft$EntitySet(id = id)

  # If index is unset, warn user and create a new index variable.
  # NULL is treated like NA so it no longer trips `if ()`.
  index_unset <- is.null(index) || (length(index) == 1L && is.na(index))
  if (index_unset) {
    warning("`index` is `NA`. Using new variable `rownumber` as index.")
    .data <- dplyr::ungroup(.data)
    # seq_len() keeps this valid for zero-row input (1:0 would be c(1, 0)).
    .data$rownumber <- seq_len(nrow(.data))
    index <- "rownumber"
  }

  # Fix reticulate datetime64 support: turn Date columns into POSIXct.
  # This must operate on `.data`; the earlier code referred to `df`, which is
  # not defined in this function, so no column was ever converted.
  force_posixct <- isTRUE(as.logical(
    getOption("featuretoolsR.force_posixct", TRUE)
  ))
  if (force_posixct) {
    tz <- getOption("featuretoolsR.posixct_tz", "UTC")
    for (colname in names(.data)) {
      if (inherits(.data[[colname]], "Date")) {
        .data[[colname]] <- as.POSIXct(.data[[colname]], tz = tz)
      }
    }
  }

  # Add first entity to entityset.
  es <- add_entity(
    entityset = es,
    entity_id = entity_id,
    df = .data,
    index = index,
    time_index = time_index,
    ...
  )

  es
}

#' Calculate feature matrix
#' @description This function is used to create a feature matrix based on a custom list of features (usually created from \link[featuretoolsR]{save_features}).
#' @export
#'
#' @param entityset The entityset on which to create features.
#' @param features The features to create based on previous runs of \link[featuretoolsR]{dfs}.
#' @param ... Additional parameters passed to `featuretools.calculate_feature_matrix`.
#' @return A feature matrix
#'
#' @examples
#' \donttest{
#' library(magrittr)
#'
#' # Create some mock data
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' # Create features and save them
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   ) %>%
#'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
#'   extract_features() %>%
#'   save_features(filename = "some.features")
#'
#' # Re-create entityset, but rather than dfs use calculate_feature_matrix.
#' es <- as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   )
#' calculate_feature_matrix(entityset = es, features = load_features("some.features"))
#' }
calculate_feature_matrix <- function(
  entityset,
  features,
  ...
) {
  # Hand the request straight to featuretools; anything supplied through
  # `...` is forwarded unchanged.
  feature_matrix <- .ft$calculate_feature_matrix(
    features = features,
    entityset = entityset,
    ...
  )

  feature_matrix
}

--------------------------------------------------------------------------------
/R/install_featuretools.R:
--------------------------------------------------------------------------------
#' Install featuretools
#' @description Setup for featuretools in its own virtualenv, or into the default reticulate virtualenv.
#'
#' @param custom_virtualenv Set to true if you wish to use a custom virtualenv for featuretoolsR.
#' @param method The installation method passed to \link[reticulate]{py_install}. Defaults to "auto".
#' @param conda Whether to use conda or not. Passed to \link[reticulate]{py_install}. Defaults to "auto".
#' @export
#'
#' @examples
#' \dontrun{
#' featuretoolsR::install_featuretools()
#' }
install_featuretools <- function(custom_virtualenv = FALSE, method = "auto", conda = "auto") {

  # See if conda, pip or pip3 is installed.
  installed <- vapply(c("conda", "pip", "pip3"), cli_is_installed, logical(1))

  if (!any(installed)) {
    stop("Neither `pip`, `pip3` or `conda` was found. At least one is required to install Featuretools.")
  }

  # Installation
  if (custom_virtualenv) {
    virtualenv_name <- getOption("featuretoolsR.virtualenv_name")
    # Without a name the path below would collapse to the virtualenv root,
    # which always exists, and the environment would never be created.
    if (is.null(virtualenv_name)) {
      stop("Option `featuretoolsR.virtualenv_name` is not set.")
    }
    path <- file.path(reticulate::virtualenv_root(), virtualenv_name)
    if (!file.exists(path)) {
      reticulate::virtualenv_create(virtualenv_name)
    } else {
      message("Using existing virtualenv in ", path)
    }

    # Check if featuretools is installed
    # NOTE(review): this check runs before `use_virtualenv()`, so it presumably
    # inspects whichever Python reticulate resolves at this point -- confirm
    # that this is the intended environment.
    if (!reticulate::py_module_available("featuretools")) {
      message("Installing featuretools into ", path)
      # Install featuretools
      reticulate::virtualenv_install(virtualenv_name, packages = "featuretools")
    }

    # Use new virtualenv
    reticulate::use_virtualenv(virtualenv_name)
  } else {
    reticulate::py_install("featuretools", method = method, conda = conda)
  }

  # Reload library. Restarting the session is only possible from within
  # RStudio; elsewhere leave the namespace loaded and tell the user what to do.
  if (rstudioapi::isAvailable()) {
    unloadNamespace("featuretoolsR")
    rstudioapi::restartSession("library(featuretoolsR)")
  } else {
    message("Restart R and run `library(featuretoolsR)` to finish the setup.")
  }

  invisible(NULL)
}

# Check whether a command line tool can be located on the search path.
# Returns a single `TRUE`/`FALSE`.
cli_is_installed <- function(command) {
  # `Sys.which()` looks the command up without executing it; it yields "" (or
  # `NA` when no lookup facility exists) for commands it cannot find.
  location <- unname(Sys.which(command))
  !is.na(location) && nzchar(location)
}

--------------------------------------------------------------------------------
/R/dfs.R:
--------------------------------------------------------------------------------
#' Deep Feature Synthesis
#' @description The main function from featuretools used to create new features.
#' @export
#'
#' @param entityset The entityset on which to perform dfs.
#' @param target_entity The name of the entity on which to perform dfs.
#' @param agg_primitives Primitives passed to relational data.
#' @param trans_primitives Primitives passed to non-relational data.
#' @param max_depth Controls the maximum depth of features.
#' @param ... Additional parameters passed to `featuretools.dfs`.
#' @return A `featuretools` feature matrix.
#'
#' @examples
#' \donttest{
#' es <- as_entityset(cars, index = "row_number")
#' dfs(es, target_entity = "df1", trans_primitives = c("and"))
#' }
dfs <- function(
  entityset,
  target_entity,
  agg_primitives = NULL,
  trans_primitives = NULL,
  max_depth = 2L,
  ...
) {
  # Ensure primitives are in the correct format
  if (!is.list(agg_primitives)) {
    agg_primitives <- as.list(agg_primitives)
  }
  if (!is.list(trans_primitives)) {
    trans_primitives <- as.list(trans_primitives)
  }

  # Ensure primitives are valid. The primitive table is fetched once and
  # reused for both checks instead of being requested for every lookup.
  primitives <- list_primitives()

  aggs <- primitives[primitives$type == "aggregation", "name"]
  .agg_primitives <- unlist(agg_primitives)
  bad_aggs <- !(.agg_primitives %in% aggs)
  if (any(bad_aggs)) {
    invalid <- paste0(.agg_primitives[bad_aggs], collapse = "`, `")
    stop("Invalid aggregate primitive(s): `", invalid, "`. Use list_primitives() to find valid primitives.")
  }

  trans <- primitives[primitives$type == "transform", "name"]
  .trans_primitives <- unlist(trans_primitives)
  bad_trans <- !(.trans_primitives %in% trans)
  if (any(bad_trans)) {
    invalid <- paste0(.trans_primitives[bad_trans], collapse = "`, `")
    stop("Invalid transform primitive(s): `", invalid, "`. Use list_primitives() to find valid primitives.")
  }

  # DFS
  feature_matrix <- .ft$dfs(
    entityset = entityset,
    target_entity = target_entity,
    agg_primitives = reticulate::r_to_py(agg_primitives),
    trans_primitives = reticulate::r_to_py(trans_primitives),
    max_depth = max_depth,
    ...
  )

  return(feature_matrix)
}



--------------------------------------------------------------------------------
/R/save_features.R:
--------------------------------------------------------------------------------
#' Save features
#' @description Used to save all or a subset of features created during \link[featuretoolsR]{dfs}.
#' @export
#'
#' @importFrom stringr str_sub
#' @importFrom tibble is_tibble
#'
#' @param .data The tibble of features returned from \link[featuretoolsR]{extract_features}.
#' @param filename (optional) The name of the file to produce.
#' @param path (optional) The path where the feature file should be placed.
#'
#' @examples
#' \donttest{
#' library(magrittr)
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' dir <- tempdir()
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   ) %>%
#'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
#'   extract_features() %>%
#'   save_features(filename = "some.features", path = dir)
#' }
save_features <- function(
  .data,
  filename = NA,
  path = NA
) {

  # Sanitize input
  ## Input should be a tibble with exactly the variables `name` and `feature`.
  ## `identical()` avoids the recycled elementwise comparison that misfires
  ## (and warns) when the number of columns is not two.
  if (!tibble::is_tibble(.data) || !identical(colnames(.data), c("name", "feature"))) {
    stop("Bad input. Did you forget to use `extract_features`?")
  }

  ## If user didn't set path, use working directory.
  ## (For featuretools' save_features, we need the full path)
  if (is.null(path) || is.na(path)) {
    warning("No `path` set, defaulting to working directory\n")
    path <- getwd()
  }

  # Writer in Python requires full path, so expand whatever we have.
  path <- normalizePath(path)

  # Make sure the directory part ends with a separator before the file name
  # is appended.
  if (stringr::str_sub(path, -1, -1) != "/") {
    path <- paste0(path, "/")
  }

  ## If user didn't specify a file name, generate one.
  if (is.null(filename) || is.na(filename)) {
    filename <- paste0(paste0(sample(c(letters, LETTERS), 16, FALSE), collapse = ""), ".features")
    warning("No `filename` passed, generated: ", filename)
  }
  location <- paste0(path, filename)

  # Save all features passed from `extract_features`.
  .ft$save_features(
    features = .data$feature,
    location = location
  )

  return(TRUE)
}



--------------------------------------------------------------------------------
/R/tidy_feature_matrix.R:
--------------------------------------------------------------------------------
#' Tidy feature matrix
#' @description Used for tidying up ('R-ify') the feature matrix after deep feature synthesis (\link[featuretoolsR]{dfs}).
#' @export
#'
#' @param .data The featuretools-object returned from \link[featuretoolsR]{dfs}.
#' @param remove_nzv Remove near zero variance variables created from \link[featuretoolsR]{dfs}.
#' @param nan_is_na Turn all `NaN` into `NA`.
#' @param clean_names Make variable names R-friendly (snake case).
#' @return A tidy data.frame.
#'
#' @importFrom caret nearZeroVar
#' @importFrom purrr map
#' @importFrom tibble as_tibble
#'
#' @examples
#' \donttest{
#' library(magrittr)
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   ) %>%
#'   dfs(target_entity = "set_1", trans_primitives = c("and")) %>%
#'   tidy_feature_matrix(remove_nzv = TRUE, nan_is_na = TRUE)
#' }
tidy_feature_matrix <- function(
  .data,
  remove_nzv = FALSE,
  nan_is_na = FALSE,
  clean_names = FALSE
) {

  # Coerce into R-object. The first element of `.data` is taken as the
  # feature matrix.
  to_r <- tibble::as_tibble(.data[[1]])

  # Variables get duplicated when coercing object from Python to R. Cleanup.
  nondupe <- to_r[, !duplicated(names(to_r))]

  # Process `nondupe` according to user defined parameters.
  ## Remove near zero variance
  if (remove_nzv) {
    # One logical flag per column, evaluated column by column.
    is_nzv <- vapply(
      X = names(nondupe),
      FUN = function(colname) {
        caret::nearZeroVar(nondupe[, colname], saveMetrics = TRUE)$nzv
      },
      FUN.VALUE = logical(1),
      USE.NAMES = FALSE
    )

    # Update nondupe-set.
    nondupe <- nondupe[, !is_nzv]
  }

  ## Replace all `NaN` with `NA`
  if (nan_is_na) {
    for (colname in names(nondupe)) {
      column <- nondupe[[colname]]
      # `NaN` only occurs in numeric/complex vectors; skipping everything else
      # also keeps `is.nan()` from failing on list columns.
      if (is.numeric(column) || is.complex(column)) {
        column[is.nan(column)] <- NA
        nondupe[[colname]] <- column
      }
    }
  }

  ## Make variable names more R-friendly
  if (clean_names) {
    n <- tolower(names(nondupe))
    # Names are already lower case, so `a-z` is sufficient. The former range
    # `A-z` also covered `[`, `\`, `]`, `^` and the backtick, which therefore
    # survived the clean-up.
    tn <- gsub("[^a-z0-9]", "_", n)
    tn <- gsub("(_+?$)|(__+?)", "", tn)
    names(nondupe) <- tn
  }

  # Back to data.frame
  result <- as.data.frame(nondupe)

  return(result)

}

--------------------------------------------------------------------------------
/R/add_relationship.R:
--------------------------------------------------------------------------------
#' Add a relationship to an entityset
#' @description Add a relationship to an entityset.
#' @export
#'
#' @param entityset The entityset to modify.
#' @param parent_set The name of the parent set.
#' @param child_set The name of the child set.
#' @param parent_idx The index variable of the `parent_set`.
#' @param child_idx The index variable of the `child_set`. Defaults to `parent_idx`.
#' @return A modified entityset.
#'
#' @examples
#' \donttest{
#' library(magrittr)
#' set_1 <- data.frame(key = 1:100, value = sample(letters, 100, TRUE), stringsAsFactors = TRUE)
#' set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, TRUE), stringsAsFactors = TRUE)
#' # Common variable: `key`
#'
#' as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo") %>%
#'   add_entity(entity_id = "set_2", df = set_2, index = "key") %>%
#'   add_relationship(
#'     parent_set = "set_1",
#'     child_set = "set_2",
#'     parent_idx = "key",
#'     child_idx = "key"
#'   )
#' }
add_relationship <- function(
  entityset,
  parent_set,
  child_set,
  parent_idx,
  child_idx = NULL
) {

  if (is.null(child_idx)) {
    child_idx <- parent_idx
  }

  # Find the positions of both entities inside the entityset. As before, the
  # names of `entityset$entity_dict` are taken to be in the same order as
  # `entityset$entities`.
  entity_names <- names(entityset$entity_dict)
  entity_parent_set_pos <- match(parent_set, entity_names)
  entity_child_set_pos <- match(child_set, entity_names)

  # Fail with a readable message instead of "subscript out of bounds".
  if (is.na(entity_parent_set_pos)) {
    stop("Couldn't find entity `", parent_set, "` in entityset")
  }
  if (is.na(entity_child_set_pos)) {
    stop("Couldn't find entity `", child_set, "` in entityset")
  }

  # Ids of all variables of the entity at `entity_pos`, in storage order.
  variable_ids <- function(entity_pos) {
    unlist(lapply(
      X = entityset$entities[[entity_pos]]$variables,
      FUN = function(x) x$id
    ))
  }

  index_parent_set_pos <- match(parent_idx, variable_ids(entity_parent_set_pos))
  index_child_set_pos <- match(child_idx, variable_ids(entity_child_set_pos))

  if (is.na(index_parent_set_pos)) {
    stop("Couldn't find index column `", parent_idx, "` in `", parent_set, "`")
  }

  if (is.na(index_child_set_pos)) {
    stop("Couldn't find index column `", child_idx, "` in `", child_set, "`")
  }

  # Construct new relationship
  rel <- .ft$Relationship(
    entityset$entities[[entity_parent_set_pos]]$variables[[index_parent_set_pos]],
    entityset$entities[[entity_child_set_pos]]$variables[[index_child_set_pos]]
  )

  # Add relationship to entityset
  entityset <- entityset$add_relationship(rel)

  return(entityset)
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# featuretoolsR
An R interface to the Python module Featuretools.

# General
`featuretoolsR` provides functionality from the Python module `featuretools`, which aims to automate feature engineering. This package is very much a work in progress as Featuretools offers a lot of functionality. Any PRs are much appreciated.

# Installing

## Package
### CRAN
The latest stable release is found on [CRAN](https://cran.r-project.org/package=featuretoolsR).

### Github
You can get the latest version of `featuretoolsR` by installing it straight from Github: `devtools::install_github("magnusfurugard/featuretoolsR")`.

## Featuretools
You'll need to have a working Python environment as well as `featuretools` installed. The recommended way is to use the built-in function `install_featuretools()` which automatically sets up a virtual environment for the package and installs `featuretools`.

# Usage
All functions in `featuretoolsR` come with documentation, but it's advised to briefly browse through the [Featuretools Python documentation](https://docs.featuretools.com/). It'll cover things like `entities`, `relationships` and `dfs`.

## Creating an entityset
An entityset is the set which contains all your entities.
To create a set and add an entity straight away, you can use `as_entityset`. 24 | ``` 25 | # Libs 26 | library(featuretoolsR) 27 | library(magrittr) 28 | 29 | # Create some mock data 30 | set_1 <- data.frame(key = 1:100, value = sample(letters, 100, T), a = rep(Sys.Date(), 100)) 31 | set_2 <- data.frame(key = 1:100, value = sample(LETTERS, 100, T), b = rep(Sys.time(), 100)) 32 | 33 | # Create entityset 34 | es <- as_entityset( 35 | set_1, 36 | index = "key", 37 | entity_id = "set_1", 38 | id = "demo", 39 | time_index = "a" 40 | ) 41 | ``` 42 | 43 | ## Adding entities 44 | To add entities (i.e if you have relational data across multiple `data.frames`), this can be achieved with `add_entity`. This function is pipe friendly. For this demo-case, we'll use `set_2`. 45 | ``` 46 | es <- es %>% 47 | add_entity( 48 | df = set_2, 49 | entity_id = "set_2", 50 | index = "key", 51 | time_index = "b" 52 | ) 53 | ``` 54 | 55 | ## Defining relationships 56 | With relational data, it's useful to define a relationship between two or more entities. This can be done with `add_relationship`. 57 | ``` 58 | es <- es %>% 59 | add_relationship( 60 | parent_set = "set_1", 61 | child_set = "set_2", 62 | parent_idx = "key", 63 | child_idx = "key" 64 | ) 65 | ``` 66 | 67 | ## Deep feature synthesis 68 | The bread and butter of Featuretools is the `dfs`-function (official docs [here](https://docs.featuretools.com/en/stable/automated_feature_engineering/afe.html)). It will attempt to create features based on `*_primitives` you provide (more on primitives below). 69 | ``` 70 | ft_matrix <- es %>% 71 | dfs( 72 | target_entity = "set_1", 73 | trans_primitives = c("and", "cum_sum") 74 | ) 75 | ``` 76 | 77 | ## Tidying up 78 | To use the new data.frame/features created by `dfs`, a function unique for `featuretoolsR`, `tidy_feature_matrix` can be used. A few "nice-to-have" arguments can be passed to clean the new data, like removing near zero variance variables, as well as replacing `NaN` with `NA`. 
79 | ``` 80 | tidy <- tidy_feature_matrix(ft_matrix, remove_nzv = T, nan_is_na = T, clean_names = T) 81 | ``` 82 | 83 | # Primitives 84 | Featuretools supports a lot of primitives. These are accessible with the function `list_primitives()` which returns a data.frame containing type (aggregation (`agg_primitives`) or transform (`trans_primitives`)), name (in the example above, "and" and "cum_sum") as well as a brief description of the primitive itself. 85 | 86 | # Credits 87 | [reticulate](https://github.com/rstudio/reticulate) - an R interface to Python. 88 | 89 | [Featuretools](https://github.com/Featuretools/featuretools) 90 | -------------------------------------------------------------------------------- /tests/testthat/test_main.R: -------------------------------------------------------------------------------- 1 | # These tests ensure that the most used functions of this package work. Other 2 | # utility functions are tested separately. 3 | 4 | # Common functions for all tests. 
source("utils.R")

# Create mock data for tests
library(magrittr)
set_size <- 2
value_variables <- 2
set_1 <- generate_mock_data(set_size, value_variables)
set_2 <- generate_mock_data(set_size, value_variables)

# Fixture builders shared by the tests below. Every call builds a fresh
# object, so no state is carried over from one test to the next.

# Entityset containing only `set_1`.
base_entityset <- function() {
  as_entityset(set_1, index = "key", entity_id = "set_1", id = "demo")
}

# Entityset containing `set_1` and `set_2`, without a relationship.
two_entity_entityset <- function() {
  base_entityset() %>%
    add_entity(entity_id = "set_2", df = set_2, index = "key")
}

# Entityset with `set_1` as parent of `set_2`; `...` is forwarded to
# `add_relationship()` (used to pass, or leave out, `child_idx`).
related_entityset <- function(...) {
  two_entity_entityset() %>%
    add_relationship(
      parent_set = "set_1",
      child_set = "set_2",
      parent_idx = "key",
      ...
    )
}

# Result of running dfs on the related entityset.
demo_dfs <- function() {
  related_entityset(child_idx = "key") %>%
    dfs(target_entity = "set_1", trans_primitives = c("and"))
}

# Basic entity creation
test_that("can create entity through as_entityset", {
  skip_if_no_featuretools()

  entityset <- base_entityset()
  expect_true("featuretools.entityset.entityset.EntitySet" %in% class(entityset))
})

# Add multiple entities to set
test_that("can add entity to entityset with add_entity", {
  skip_if_no_featuretools()

  entityset <- two_entity_entityset()
  expect_true(length(entityset$entities) == 2)
  expect_true(all(names(entityset$entity_dict) == c("set_1", "set_2")))
})

# Add relationship between entities
test_that("can add relationship between two entities", {
  skip_if_no_featuretools()

  entityset <- related_entityset(child_idx = "key")

  expect_true(is.list(entityset$relationships) && length(entityset$relationships) > 0)
})

test_that("relationship can be inherited from parent_idx", {
  skip_if_no_featuretools()

  entityset <- related_entityset()

  expect_true(is.list(entityset$relationships) && length(entityset$relationships) > 0)
})


# Deep feature synthesis
test_that("can perform dfs", {
  skip_if_no_featuretools()

  dfs_result <- demo_dfs()

  expect_true(length(dfs_result[[2]]) == 2)
})

# Feature extraction
test_that("can extract features from dfs", {
  skip_if_no_featuretools()

  features <- demo_dfs() %>%
    extract_features()

  expect_true(all(names(features) == c("name", "feature")))
  expect_true(nrow(features) == value_variables)
  expect_true(length(features$feature) == value_variables)
  expect_true(class(features$feature) == "list")
})

# Storing features locally
test_that("can save features", {
  skip_if_no_featuretools()

  demo_dfs() %>%
    extract_features() %>%
    save_features(filename = "some.features", path = ".")

  expect_true(file.exists("some.features"))
})

# Loading stored features (reads the file written by the previous test)
test_that("can load features", {
  skip_if_no_featuretools()

  features <- load_features("some.features")
  expect_true(!is.null(features))

  # Cleanup
  if (file.exists("some.features")) file.remove("some.features")
})

# Tidying a feature matrix
test_that("can tidy feature matrix after dfs", {
  skip_if_no_featuretools()

  tidy <- demo_dfs() %>%
    tidy_feature_matrix(
      remove_nzv = TRUE,
      nan_is_na = TRUE,
      clean_names = TRUE
    )

  expect_true(nrow(tidy) == set_size)
  expect_true(!any(is.nan(tidy$value)))
  expect_true(length(grep("[^A-z0-9_]", names(tidy))) == 0)
})

--------------------------------------------------------------------------------