├── .circleci └── config.yml ├── .coveragerc ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── docs.md │ ├── feature_request.md │ └── jupyter-notebook-examples.md └── workflow │ └── workflow.yml ├── .gitignore ├── .readthedocs.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── _static │ ├── css │ │ └── feature-engine.css │ └── js │ │ └── copybutton.js ├── _templates │ ├── class.rst │ ├── layout.html │ └── numpydoc_docstring.rst ├── about │ ├── about.rst │ ├── authors.rst │ ├── former_authors.rst │ ├── governance.rst │ ├── index.rst │ └── roadmap.rst ├── api_doc │ ├── creation │ │ ├── CyclicalFeatures.rst │ │ ├── DecisionTreeFeatures.rst │ │ ├── MathFeatures.rst │ │ ├── RelativeFeatures.rst │ │ └── index.rst │ ├── datasets │ │ ├── index.rst │ │ └── titanic.rst │ ├── datetime │ │ ├── DatetimeFeatures.rst │ │ ├── DatetimeSubtraction.rst │ │ └── index.rst │ ├── discretisation │ │ ├── ArbitraryDiscretiser.rst │ │ ├── DecisionTreeDiscretiser.rst │ │ ├── EqualFrequencyDiscretiser.rst │ │ ├── EqualWidthDiscretiser.rst │ │ ├── GeometricWidthDiscretiser.rst │ │ └── index.rst │ ├── encoding │ │ ├── CountFrequencyEncoder.rst │ │ ├── DecisionTreeEncoder.rst │ │ ├── MeanEncoder.rst │ │ ├── OneHotEncoder.rst │ │ ├── OrdinalEncoder.rst │ │ ├── RareLabelEncoder.rst │ │ ├── StringSimilarityEncoder.rst │ │ ├── WoEEncoder.rst │ │ └── index.rst │ ├── imputation │ │ ├── AddMissingIndicator.rst │ │ ├── ArbitraryNumberImputer.rst │ │ ├── CategoricalImputer.rst │ │ ├── DropMissingData.rst │ │ ├── EndTailImputer.rst │ │ ├── MeanMedianImputer.rst │ │ ├── RandomSampleImputer.rst │ │ └── index.rst │ ├── index.rst │ ├── outliers │ │ ├── ArbitraryOutlierCapper.rst │ │ ├── OutlierTrimmer.rst │ │ ├── Winsorizer.rst │ │ └── index.rst │ ├── pipeline │ │ ├── Pipeline.rst │ │ ├── index.rst │ │ └── make_pipeline.rst │ ├── preprocessing │ │ ├── MatchCategories.rst │ │ ├── MatchVariables.rst │ │ └── index.rst │ ├── scaling │ │ ├── MeanNormalizationScaler.rst │ │ └── index.rst │ ├── selection │ │ ├── DropConstantFeatures.rst │ │ ├── DropCorrelatedFeatures.rst │ │ ├── DropDuplicateFeatures.rst │ │ ├── DropFeatures.rst │ │ ├── DropHighPSIFeatures.rst │ │ ├── MRMR.rst │ │ ├── ProbeFeatureSelection.rst │ │ ├── RecursiveFeatureAddition.rst │ │ ├── RecursiveFeatureElimination.rst │ │ ├── SelectByInformationValue.rst │ │ ├── SelectByShuffling.rst │ │ ├── SelectBySingleFeaturePerformance.rst │ │ ├── SelectByTargetMeanPerformance.rst │ │ ├── SmartCorrelatedSelection.rst │ │ └── index.rst │ ├── timeseries │ │ ├── forecasting │ │ │ ├── ExpandingWindowFeatures.rst │ │ │ ├── LagFeatures.rst │ │ │ ├── WindowFeatures.rst │ │ │ └── index.rst │ │ └── index.rst │ ├── transformation │ │ ├── ArcsinTransformer.rst │ │ ├── BoxCoxTransformer.rst │ │ ├── LogCpTransformer.rst │ │ ├── LogTransformer.rst │ │ ├── PowerTransformer.rst │ │ ├── ReciprocalTransformer.rst │ │ ├── YeoJohnsonTransformer.rst │ │ └── index.rst │ ├── variable_handling │ │ ├── check_all_variables.rst │ │ ├── check_categorical_variables.rst │ │ ├── check_datetime_variables.rst │ │ ├── check_numerical_variables.rst │ │ ├── find_all_variables.rst │ │ ├── find_categorical_and_numerical_variables.rst │ │ ├── find_categorical_variables.rst │ │ ├── find_datetime_variables.rst │ │ ├── find_numerical_variables.rst │ │ ├── index.rst │ │ └── retain_variables_if_in_df.rst │ └── wrappers │ │ ├── Wrapper.rst │ │ └── index.rst ├── conf.py ├── contribute │ ├── code_of_conduct.rst │ ├── contribute_code.rst │ ├── 
contribute_docs.rst │ ├── contribute_jup.rst │ ├── contribute_other.rst │ └── index.rst ├── donate.rst ├── images │ ├── 1024px-Relationship_between_mean_and_median_under_different_skewness.png │ ├── Discretisation.png │ ├── FeatureEnginePackageStructure.png │ ├── FeatureEnginePackageStructureCrossSectional.png │ ├── FeatureEnginePackageStructureDatetimeText.png │ ├── FeatureEnginePackageStructureTimeseries.png │ ├── PSI_distribution_case1.png │ ├── PSI_distribution_case3.png │ ├── PSI_distribution_case4.png │ ├── PSI_distribution_case5.png │ ├── Variable_Transformation.png │ ├── arbitraryvalueimputation.png │ ├── bmilogcp.png │ ├── bmiraw.png │ ├── boxplot-age-percentiles.png │ ├── boxplot-age.png │ ├── boxplot-fare-mad.png │ ├── boxplot-fare.png │ ├── boxplot-sibsp-fare-iqr.png │ ├── boxplot-sibsp.png │ ├── boxplot-titanic.png │ ├── breast_cancer_arcsin.png │ ├── breast_cancer_raw.png │ ├── cookbook.png │ ├── dmlm.png │ ├── endtailimputer.png │ ├── equalfrequencydiscretisation.png │ ├── equalfrequencydiscretisation_gaussian.png │ ├── equalfrequencydiscretisation_skewed.png │ ├── equalwidthdiscretisation.png │ ├── f_statistic.png │ ├── feml.png │ ├── fetsf.png │ ├── fork.png │ ├── frequentcategoryimputer.png │ ├── fsml.png │ ├── fsmlbook.png │ ├── fwml.png │ ├── hour_sin.png │ ├── hour_sin2.png │ ├── hour_sin3.png │ ├── hour_sin4.png │ ├── increasingwidthdisc.png │ ├── ivml_logo.png │ ├── logcpraw.png │ ├── logcptransform.png │ ├── logo │ │ ├── FeatureEngine.png │ │ ├── Logo.png │ │ ├── Logo_name.png │ │ ├── favicon.png │ │ └── logo.svg │ ├── lotarea_pt.png │ ├── lotarea_pt_custom_exp.png │ ├── lotarea_raw.png │ ├── lotareaboxcox.png │ ├── lotarealog.png │ ├── lotareapower.png │ ├── lotarearaw.png │ ├── lotareareciprocal.png │ ├── lotareayeojohnson.png │ ├── lotshape-price-per-cat-enc.png │ ├── lotshape-price-per-cat.png │ ├── meanmedianimputater_distributions.png │ ├── medianimputation.png │ ├── medinc_disc_arbitrarily.png │ ├── medinc_disc_arbitrarily2.png │ ├── medinc_hist.png │ ├── missingcategoryimputer.png │ ├── missingindicator.png │ ├── mli_logo.png │ ├── monotonic.png │ ├── mzoning-price-per-cat-enc.png │ ├── mzoning-price-per-cat.png │ ├── nonnormalvars2.png │ ├── nonnormalvars2logtransformed.png │ ├── nonnormalvars2transformed.png │ ├── ordinal_encoding_monotonic.png │ ├── pipelineprediction.png │ ├── probe-importance-std.png │ ├── probe_feature_normal.png │ ├── probe_features.png │ ├── quasiconstant.png │ ├── randomsampleimputation.png │ ├── reciprocal_transformer │ │ ├── reciprocal_transfomer_inverse.png │ │ ├── reciprocal_transfomer_new.png │ │ ├── reciprocal_transfomer_original.png │ │ ├── reciprocal_transformer_3plots_new.png │ │ └── reciprocal_transformer_3plots_original.png │ ├── rfa_linreg_imp.png │ ├── rfa_perf_drifts.png │ ├── rfe_perf_drift.png │ ├── rfimportancemrmr.png │ ├── selectionChart.png │ ├── shuffle-features-std.png │ ├── single-feature-perf-std.png │ ├── single_feature_probes_imp.png │ ├── sponsors │ │ ├── call_for_sponsors.png │ │ ├── how-did-you-discover.png │ │ └── trainindata.png │ ├── summary │ │ ├── imputersSummary.png │ │ └── selectionSummary.png │ ├── target-mean-sel-std.png │ ├── toydata_pt_raw.png │ ├── toydata_pt_transformed.png │ ├── toydata_pt_transformed_custom_exp.png │ ├── transformedcoupleYJ.png │ ├── treediscretisation.png │ ├── treemonotonicprediction.png │ ├── treepredictionrounded.png │ ├── untransformedcoupleYJ.png │ ├── woe_encoding.png │ ├── woe_prediction.png │ └── yeojohnsonformula.png ├── index.rst ├── quickstart │ ├── 
datasets.rst │ └── index.rst ├── requirements.txt ├── resources │ ├── blogs.rst │ ├── books.rst │ ├── courses.rst │ ├── index.rst │ └── tutorials.rst ├── sphinxext │ ├── LICENSE.txt │ ├── README.txt │ └── github_link.py ├── user_guide │ ├── creation │ │ ├── CyclicalFeatures.rst │ │ ├── DecisionTreeFeatures.rst │ │ ├── MathFeatures.rst │ │ ├── RelativeFeatures.rst │ │ └── index.rst │ ├── datetime │ │ ├── DatetimeFeatures.rst │ │ ├── DatetimeSubtraction.rst │ │ └── index.rst │ ├── discretisation │ │ ├── ArbitraryDiscretiser.rst │ │ ├── DecisionTreeDiscretiser.rst │ │ ├── EqualFrequencyDiscretiser.rst │ │ ├── EqualWidthDiscretiser.rst │ │ ├── GeometricWidthDiscretiser.rst │ │ └── index.rst │ ├── encoding │ │ ├── CountFrequencyEncoder.rst │ │ ├── DecisionTreeEncoder.rst │ │ ├── MeanEncoder.rst │ │ ├── OneHotEncoder.rst │ │ ├── OrdinalEncoder.rst │ │ ├── RareLabelEncoder.rst │ │ ├── StringSimilarityEncoder.rst │ │ ├── WoEEncoder.rst │ │ └── index.rst │ ├── imputation │ │ ├── AddMissingIndicator.rst │ │ ├── ArbitraryNumberImputer.rst │ │ ├── CategoricalImputer.rst │ │ ├── DropMissingData.rst │ │ ├── EndTailImputer.rst │ │ ├── MeanMedianImputer.rst │ │ ├── RandomSampleImputer.rst │ │ └── index.rst │ ├── index.rst │ ├── outliers │ │ ├── ArbitraryOutlierCapper.rst │ │ ├── OutlierTrimmer.rst │ │ ├── Winsorizer.rst │ │ └── index.rst │ ├── pipeline │ │ ├── Pipeline.rst │ │ ├── index.rst │ │ └── make_pipeline.rst │ ├── preprocessing │ │ ├── MatchCategories.rst │ │ ├── MatchVariables.rst │ │ └── index.rst │ ├── scaling │ │ ├── MeanNormalizationScaler.rst │ │ └── index.rst │ ├── selection │ │ ├── DropConstantFeatures.rst │ │ ├── DropCorrelatedFeatures.rst │ │ ├── DropDuplicateFeatures.rst │ │ ├── DropFeatures.rst │ │ ├── DropHighPSIFeatures.rst │ │ ├── MRMR.rst │ │ ├── ProbeFeatureSelection.rst │ │ ├── RecursiveFeatureAddition.rst │ │ ├── RecursiveFeatureElimination.rst │ │ ├── SelectByInformationValue.rst │ │ ├── SelectByShuffling.rst │ │ ├── SelectBySingleFeaturePerformance.rst │ │ ├── SelectByTargetMeanPerformance.rst │ │ ├── SmartCorrelatedSelection.rst │ │ └── index.rst │ ├── timeseries │ │ ├── forecasting │ │ │ ├── ExpandingWindowFeatures.rst │ │ │ ├── LagFeatures.rst │ │ │ ├── WindowFeatures.rst │ │ │ └── index.rst │ │ └── index.rst │ ├── transformation │ │ ├── ArcsinTransformer.rst │ │ ├── BoxCoxTransformer.rst │ │ ├── LogCpTransformer.rst │ │ ├── LogTransformer.rst │ │ ├── PowerTransformer.rst │ │ ├── ReciprocalTransformer.rst │ │ ├── YeoJohnsonTransformer.rst │ │ └── index.rst │ ├── variable_handling │ │ ├── check_all_variables.rst │ │ ├── check_categorical_variables.rst │ │ ├── check_datetime_variables.rst │ │ ├── check_numerical_variables.rst │ │ ├── find_all_variables.rst │ │ ├── find_categorical_and_numerical_variables.rst │ │ ├── find_categorical_variables.rst │ │ ├── find_datetime_variables.rst │ │ ├── find_numerical_variables.rst │ │ ├── index.rst │ │ └── retain_variables_if_in_df.rst │ └── wrappers │ │ ├── Wrapper.rst │ │ └── index.rst ├── versions │ └── index.rst └── whats_new │ ├── index.rst │ ├── v_06.rst │ ├── v_1.rst │ ├── v_120.rst │ ├── v_130.rst │ ├── v_140.rst │ ├── v_150.rst │ ├── v_160.rst │ ├── v_170.rst │ └── v_180.rst ├── feature_engine ├── VERSION ├── __init__.py ├── _base_transformers │ ├── __init__.py │ ├── base_numerical.py │ └── mixins.py ├── _check_init_parameters │ ├── __init__.py │ ├── check_init_input_params.py │ ├── check_input_dictionary.py │ └── check_variables.py ├── _docstrings │ ├── __init__.py │ ├── fit_attributes.py │ ├── init_parameters │ │ ├── 
__init__.py │ │ ├── all_trasnformers.py │ │ ├── creation.py │ │ ├── discretisers.py │ │ ├── encoders.py │ │ ├── outliers.py │ │ └── selection.py │ ├── methods.py │ ├── selection │ │ ├── __init__.py │ │ └── _docstring.py │ └── substitute.py ├── _prediction │ ├── __init__.py │ ├── base_predictor.py │ ├── target_mean_classifier.py │ └── target_mean_regressor.py ├── creation │ ├── __init__.py │ ├── base_creation.py │ ├── cyclical_features.py │ ├── decision_tree_features.py │ ├── math_features.py │ └── relative_features.py ├── dataframe_checks.py ├── datasets │ ├── __init__.py │ └── titanic.py ├── datetime │ ├── __init__.py │ ├── _datetime_constants.py │ ├── datetime.py │ └── datetime_subtraction.py ├── discretisation │ ├── __init__.py │ ├── arbitrary.py │ ├── base_discretiser.py │ ├── decision_tree.py │ ├── equal_frequency.py │ ├── equal_width.py │ └── geometric_width.py ├── encoding │ ├── __init__.py │ ├── _helper_functions.py │ ├── base_encoder.py │ ├── count_frequency.py │ ├── decision_tree.py │ ├── mean_encoding.py │ ├── one_hot.py │ ├── ordinal.py │ ├── rare_label.py │ ├── similarity_encoder.py │ └── woe.py ├── imputation │ ├── __init__.py │ ├── arbitrary_number.py │ ├── base_imputer.py │ ├── categorical.py │ ├── drop_missing_data.py │ ├── end_tail.py │ ├── mean_median.py │ ├── missing_indicator.py │ └── random_sample.py ├── outliers │ ├── __init__.py │ ├── artbitrary.py │ ├── base_outlier.py │ ├── trimmer.py │ └── winsorizer.py ├── pipeline │ ├── __init__.py │ └── pipeline.py ├── preprocessing │ ├── __init__.py │ ├── match_categories.py │ └── match_columns.py ├── py.typed ├── scaling │ ├── __init__.py │ └── mean_normalization.py ├── selection │ ├── __init__.py │ ├── _selection_constants.py │ ├── base_recursive_selector.py │ ├── base_selection_functions.py │ ├── base_selector.py │ ├── drop_constant_features.py │ ├── drop_correlated_features.py │ ├── drop_duplicate_features.py │ ├── drop_features.py │ ├── drop_psi_features.py │ ├── information_value.py │ ├── mrmr.py │ ├── probe_feature_selection.py │ ├── recursive_feature_addition.py │ ├── recursive_feature_elimination.py │ ├── shuffle_features.py │ ├── single_feature_performance.py │ ├── smart_correlation_selection.py │ └── target_mean_selection.py ├── tags.py ├── timeseries │ ├── __init__.py │ └── forecasting │ │ ├── __init__.py │ │ ├── base_forecast_transformers.py │ │ ├── expanding_window_features.py │ │ ├── lag_features.py │ │ └── window_features.py ├── transformation │ ├── __init__.py │ ├── arcsin.py │ ├── boxcox.py │ ├── log.py │ ├── power.py │ ├── reciprocal.py │ └── yeojohnson.py ├── variable_handling │ ├── __init__.py │ ├── _variable_type_checks.py │ ├── check_variables.py │ ├── dtypes.py │ ├── find_variables.py │ └── retain_variables.py └── wrappers │ ├── __init__.py │ └── wrappers.py ├── mypy.ini ├── paper ├── paper.bib └── paper.md ├── pytest.ini ├── requirements.txt ├── setup.py ├── test_requirements.txt ├── tests ├── __init__.py ├── check_estimators_with_parametrize_tests.py ├── conftest.py ├── estimator_checks │ ├── __init__.py │ ├── dataframe_for_checks.py │ ├── estimator_checks.py │ ├── fit_functionality_checks.py │ ├── get_feature_names_out_checks.py │ ├── init_params_allowed_values_checks.py │ ├── init_params_triggered_functionality_checks.py │ ├── non_fitted_error_checks.py │ └── variable_selection_checks.py ├── parametrize_with_checks_creation_v16.py ├── parametrize_with_checks_discretization_v16.py ├── parametrize_with_checks_encoders_v16.py ├── parametrize_with_checks_outliers_v16.py ├── 
parametrize_with_checks_prediction_v16.py ├── parametrize_with_checks_selection_v16.py ├── test_base_transformers │ ├── test_base_numerical_transformer.py │ ├── test_get_feature_names_out_mixin.py │ └── test_transform_xy_mixin.py ├── test_check_init_parameters │ ├── __init__.py │ ├── test_check_init_input_params.py │ ├── test_check_input_dictionary.py │ └── test_check_variables.py ├── test_creation │ ├── __init__.py │ ├── test_check_estimator_creation.py │ ├── test_cyclical_features.py │ ├── test_decision_tree_features.py │ ├── test_math_features.py │ └── test_relative_features.py ├── test_dataframe_checks.py ├── test_datasets │ ├── __init__().py │ └── datasets.py ├── test_datetime │ ├── __init__.py │ ├── conftest.py │ ├── test_check_estimator_datetime.py │ ├── test_datetime_features.py │ └── test_datetime_subtraction.py ├── test_discretisation │ ├── __init__.py │ ├── test_arbitrary_discretiser.py │ ├── test_base_discretizer.py │ ├── test_check_estimator_discretisers.py │ ├── test_decision_tree_discretiser.py │ ├── test_equal_frequency_discretiser.py │ ├── test_equal_width_discretiser.py │ └── test_geometric_width_discretiser.py ├── test_encoding │ ├── __init__.py │ ├── test_base_encoders │ │ ├── __init__.py │ │ ├── test_categorical_init_mixin.py │ │ ├── test_categorical_init_mixin_na.py │ │ └── test_categorical_method_mixin.py │ ├── test_check_estimator_encoders.py │ ├── test_count_frequency_encoder.py │ ├── test_decision_tree_encoder.py │ ├── test_helper_functions.py │ ├── test_mean_encoder.py │ ├── test_onehot_encoder.py │ ├── test_ordinal_encoder.py │ ├── test_rare_label_encoder.py │ ├── test_similarity_encoder.py │ └── test_woe │ │ ├── __init__.py │ │ ├── test_woe_class.py │ │ └── test_woe_encoder.py ├── test_imputation │ ├── __init__.py │ ├── test_arbitrary_number_imputer.py │ ├── test_categorical_imputer.py │ ├── test_check_estimator_imputers.py │ ├── test_drop_missing_data.py │ ├── test_end_tail_imputer.py │ ├── test_mean_mdian_imputer.py │ ├── test_missing_indicator.py │ └── test_random_sample_imputer.py ├── test_outliers │ ├── __init__.py │ ├── test_arbitrary_capper.py │ ├── test_check_estimator_outliers.py │ ├── test_outlier_trimmer.py │ └── test_winsorizer.py ├── test_pipeline │ ├── test_pipeline.py │ └── test_pipeline_sklearn.py ├── test_prediction │ ├── __init__.py │ ├── conftest.py │ ├── test_check_estimator_prediction.py │ ├── test_target_mean_classifier.py │ └── test_target_mean_regressor.py ├── test_preprocessing │ ├── __init__.py │ ├── test_check_estimator_preprocessing.py │ ├── test_match_categories.py │ └── test_match_columns.py ├── test_scaling │ ├── __init__.py │ └── test_mean_normalization.py ├── test_selection │ ├── __init__.py │ ├── conftest.py │ ├── test_base_selection_functions.py │ ├── test_base_selector.py │ ├── test_check_estimator_selectors.py │ ├── test_drop_constant_features.py │ ├── test_drop_correlated_features.py │ ├── test_drop_duplicate_features.py │ ├── test_drop_features.py │ ├── test_drop_high_psi_features.py │ ├── test_information_value.py │ ├── test_mrmr.py │ ├── test_probe_feature_selection.py │ ├── test_recursive_feature_addition.py │ ├── test_recursive_feature_elimination.py │ ├── test_recursive_feature_selectors.py │ ├── test_shuffle_features.py │ ├── test_single_feature_performance.py │ ├── test_smart_correlation_selection.py │ └── test_target_mean_selection.py ├── test_sklearn_compatible │ └── test_set_output.py ├── test_time_series │ ├── __init__.py │ └── test_forecasting │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── 
test_check_estimator_forecasting.py │ │ ├── test_expanding_window_features.py │ │ ├── test_lag_features.py │ │ └── test_window_features.py ├── test_transformation │ ├── __init__.py │ ├── test_arcsin_transformer.py │ ├── test_boxcox_transformer.py │ ├── test_check_estimator_transformers.py │ ├── test_log_transformer.py │ ├── test_logcp_transformer.py │ ├── test_power_transformer.py │ ├── test_reciprocal_transformer.py │ └── test_yeojohnson_transformer.py ├── test_variable_handling │ ├── __init__.py │ ├── conftest.py │ ├── test_check_variables.py │ ├── test_find_variables.py │ └── test_remove_variables.py └── test_wrappers │ ├── __init__.py │ ├── test_check_estimator_wrappers.py │ └── test_sklearn_wrapper.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | # configuration for coverage.py 2 | 3 | [run] 4 | branch = True 5 | source = feature_engine 6 | include = */feature_engine/* 7 | omit = 8 | */setup.py 9 | 10 | 11 | [report] 12 | exclude_lines = 13 | pragma: no cover 14 | 15 | show_missing = True -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [solegalli] 4 | buy_me_a_coffee: solegalliy 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/docs.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Docs 3 | about: What documentation is missing? 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please let us know if you think there is information missing, or how else we can improve the documentation from Feature-engine. 11 | 12 | If you are referring to an existing page, please paste the url. 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/jupyter-notebook-examples.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Jupyter notebook examples 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | Please let us know what is missing from existing Jupyter notebook demos, or suggest a new demo that you think would be useful for the community. 11 | -------------------------------------------------------------------------------- /.github/workflow/workflow.yml: -------------------------------------------------------------------------------- 1 | name: CodeCov 2 | on: [push, pull_request] 3 | jobs: 4 | run: 5 | runs-on: ubuntu-latest 6 | env: 7 | OS: ubuntu-latest 8 | PYTHON: '3.9' 9 | steps: 10 | - uses: actions/checkout@v3 11 | with: 12 | fetch-depth: '2' 13 | 14 | - name: Setup Python 15 | uses: actions/setup-python@master 16 | with: 17 | python-version: 3.9 18 | - name: Generate Report 19 | run: | 20 | pip install coverage 21 | coverage run -m pytest 22 | - name: Upload Coverage to Codecov 23 | uses: codecov/codecov-action@v3.1.1 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/build/ 69 | build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # Miscelaneous 109 | .idea 110 | .vscode 111 | *.csv 112 | *.DS_Store 113 | *.db 114 | *.pptx -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Build documentation with MkDocs 19 | #mkdocs: 20 | # configuration: mkdocs.yml 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | formats: all 24 | 25 | # Optionally set the version of Python and requirements required to build your docs 26 | python: 27 | install: 28 | - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to Feature-engine 2 | ============================== 3 | 4 | Feature-engine is a community-driven open-source project that relies on contributions from 5 | people like you. Every contribution, no matter how big or small, can make a significant 6 | impact on the project. If you've never contributed to an open-source project before, don't 7 | worry! Feature-engine is a great place to start. Your help will be appreciated and welcomed 8 | with gratitude. 9 | 10 | The latest contributing guide is available online at: 11 | 12 | https://feature-engine.trainindata.com/en/latest/contribute/index.html 13 | 14 | There are many ways to contribute to Feature-engine, with the most common ones 15 | being contribution of code or documentation to the project. Improving the 16 | documentation is no less important than improving the library itself. If you 17 | find a typo in the documentation, or have made improvements, do not hesitate to 18 | submit a GitHub pull request. 19 | 20 | Documentation can be found under the 21 | [doc/](https://github.com/feature-engine/feature_engine/tree/main/docs) directory. 
22 | 23 | You can check out requested enhancements and current bugs on the 24 | [issue tracker](https://github.com/feature-engine/feature_engine/issues), 25 | and suggest a PR with the fix. Every contribution is valuable and decreases the burden 26 | on the project maintainer. 27 | 28 | Another way to contribute is to report issues you're facing, and give a "thumbs 29 | up" on issues that others reported and that are relevant to you. It also helps 30 | us if you spread the word: reference the project from your blog and articles, 31 | link to it from your website, or simply star it in GitHub to say "I use it". 32 | 33 | Quick links 34 | ----------- 35 | 36 | * [Submitting a bug report or feature request](https://github.com/feature-engine/feature_engine/issues) 37 | * [Contributing code](https://feature-engine.trainindata.com/en/latest/contribute/contribute_code.html) 38 | * [Contributing docs](https://feature-engine.trainindata.com/en/latest/contribute/contribute_docs.html) 39 | * [Other ways to contribute](https://feature-engine.trainindata.com/en/latest/contribute/contribute_other.html) 40 | 41 | Code of Conduct 42 | --------------- 43 | 44 | We abide by the principles of openness, respect, and consideration of others 45 | of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2024 The Feature-engine developers. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | include *.pkl 4 | recursive-include ./feature_engine/* 5 | 6 | include feature_engine/VERSION 7 | 8 | include ./requirements.txt 9 | include ./LICENSE 10 | exclude *.log 11 | exclude *.cfg 12 | 13 | recursive-exclude * __pycache__ 14 | recursive-exclude * *.py[co] -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = feature_engine 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/css/feature-engine.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | 4 | /* Css template from sklearn: 5 | https://github.com/scikit-learn/scikit-learn/blob/f71c0313142c4e5f2f35a0021c36075cf8dba611/doc/themes/scikit-learn-modern/static/css/theme.css 6 | */ 7 | 8 | /* authors */ 9 | .authors-container { 10 | display: flex; 11 | flex-wrap: wrap; 12 | justify-content: center; 13 | } 14 | 15 | 16 | /* sponsors and testimonials */ 17 | 18 | div.sk-sponsor-div, div.sk-testimonial-div { 19 | display: flex; 20 | flex-wrap: wrap; 21 | -webkit-flex-align: center; 22 | -ms-flex-align: center; 23 | -webkit-align-items: center; 24 | align-items: center; 25 | } 26 | 27 | div.sk-sponsor-div-box, div.sk-testimonial-div-box { 28 | width: 100%; 29 | } 30 | 31 | @media screen and (min-width: 500px) { 32 | div.sk-sponsor-div-box, div.sk-testimonial-div-box { 33 | width: 50%; 34 | } 35 | } 36 | 37 | .caption { 38 | text-align: center 39 | } -------------------------------------------------------------------------------- /docs/_templates/class.rst: -------------------------------------------------------------------------------- 1 | {{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | 8 | {% block methods %} 9 | 10 | {% if methods %} 11 | .. rubric:: Methods 12 | 13 | .. autosummary:: 14 | {% for item in methods %} 15 | {% if '__init__' not in item %} 16 | ~{{ name }}.{{ item }} 17 | {% endif %} 18 | {%- endfor %} 19 | {% endif %} 20 | {% endblock %} 21 | 22 | .. include:: {{module}}.{{objname}}.examples 23 | 24 | .. raw:: html 25 | 26 |
27 | -------------------------------------------------------------------------------- /docs/_templates/numpydoc_docstring.rst: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} -------------------------------------------------------------------------------- /docs/about/authors.rst: -------------------------------------------------------------------------------- 1 | .. raw :: html 2 | 3 | 4 |
5 | [raw HTML author cards; markup and images were stripped during extraction. Recoverable names: Soledad Galli, Morgan Sell]
-------------------------------------------------------------------------------- /docs/about/former_authors.rst: -------------------------------------------------------------------------------- 1 | .. raw :: html 2 | 3 | [raw HTML author cards; markup and images were stripped during extraction. Recoverable names: Chris Samiullah, Nicolas Galli]
16 |
17 |
-------------------------------------------------------------------------------- /docs/about/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. _about: 3 | 4 | About 5 | ===== 6 | 7 | In this section you will find information about the Feature-engine's origin, main 8 | developers, roadmap and overall vision for the package. You will also find information 9 | about how to cite Feature-engine and our main sponsors. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | about 15 | governance 16 | roadmap -------------------------------------------------------------------------------- /docs/api_doc/creation/CyclicalFeatures.rst: -------------------------------------------------------------------------------- 1 | CyclicalFeatures 2 | ================ 3 | 4 | .. autoclass:: feature_engine.creation.CyclicalFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/creation/DecisionTreeFeatures.rst: -------------------------------------------------------------------------------- 1 | DecisionTreeFeatures 2 | ==================== 3 | 4 | .. autoclass:: feature_engine.creation.DecisionTreeFeatures 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/creation/MathFeatures.rst: -------------------------------------------------------------------------------- 1 | MathFeatures 2 | ============ 3 | 4 | .. autoclass:: feature_engine.creation.MathFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/creation/RelativeFeatures.rst: -------------------------------------------------------------------------------- 1 | RelativeFeatures 2 | ================ 3 | 4 | .. autoclass:: feature_engine.creation.RelativeFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/creation/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Feature Creation 4 | ================ 5 | 6 | Feature-engine's creation transformers create and add new features to the dataframe 7 | by either combining or transforming existing features. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | MathFeatures 13 | RelativeFeatures 14 | CyclicalFeatures 15 | DecisionTreeFeatures 16 | 17 | 18 | Transformers in other Libraries 19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 20 | 21 | Check also the following transformer from Scikit-learn: 22 | 23 | * `PolynomialFeatures `_ 24 | * `SplineTransformer `_ 25 | -------------------------------------------------------------------------------- /docs/api_doc/datasets/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Datasets 4 | ======== 5 | 6 | We are starting to build a library of functions that allow you and us to quickly load 7 | datasets to demonstrate and test the functionality of Feature-engine (and, why not, 8 | other Python libraries). 9 | 10 | At the moment, we support the following functions: 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | titanic 16 | -------------------------------------------------------------------------------- /docs/api_doc/datasets/titanic.rst: -------------------------------------------------------------------------------- 1 | load__titanic 2 | ============= 3 | 4 | .. 
currentmodule:: feature_engine.datasets 5 | 6 | .. autofunction:: load_titanic -------------------------------------------------------------------------------- /docs/api_doc/datetime/DatetimeFeatures.rst: -------------------------------------------------------------------------------- 1 | DatetimeFeatures 2 | ================ 3 | 4 | .. autoclass:: feature_engine.datetime.DatetimeFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/datetime/DatetimeSubtraction.rst: -------------------------------------------------------------------------------- 1 | DatetimeSubtraction 2 | =================== 3 | 4 | .. autoclass:: feature_engine.datetime.DatetimeSubtraction 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/datetime/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Datetime Features 4 | ================= 5 | 6 | Feature-engine's datetime transformers are able to extract a wide variety of datetime 7 | features from existing datetime or object-like data. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | DatetimeFeatures 13 | DatetimeSubtraction 14 | 15 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/ArbitraryDiscretiser.rst: -------------------------------------------------------------------------------- 1 | ArbitraryDiscretiser 2 | ==================== 3 | 4 | .. autoclass:: feature_engine.discretisation.ArbitraryDiscretiser 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/DecisionTreeDiscretiser.rst: -------------------------------------------------------------------------------- 1 | DecisionTreeDiscretiser 2 | ======================= 3 | 4 | .. autoclass:: feature_engine.discretisation.DecisionTreeDiscretiser 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/EqualFrequencyDiscretiser.rst: -------------------------------------------------------------------------------- 1 | EqualFrequencyDiscretiser 2 | ========================= 3 | 4 | .. autoclass:: feature_engine.discretisation.EqualFrequencyDiscretiser 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/EqualWidthDiscretiser.rst: -------------------------------------------------------------------------------- 1 | EqualWidthDiscretiser 2 | ===================== 3 | 4 | .. autoclass:: feature_engine.discretisation.EqualWidthDiscretiser 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/GeometricWidthDiscretiser.rst: -------------------------------------------------------------------------------- 1 | GeometricWidthDiscretiser 2 | ========================= 3 | 4 | .. autoclass:: feature_engine.discretisation.GeometricWidthDiscretiser 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/discretisation/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. 
currentmodule:: feature_engine.discretisation 3 | 4 | Discretisation 5 | ============== 6 | 7 | Feature-engine's discretisation transformers transform continuous variables into 8 | discrete features. This is accomplished, in general, by sorting the variable values 9 | into continuous intervals. 10 | 11 | **Summary** 12 | 13 | ===================================== ======================================================================== 14 | Transformer Functionality 15 | ===================================== ======================================================================== 16 | :class:`EqualFrequencyDiscretiser()` Sorts values into intervals with similar number of observations. 17 | :class:`EqualWidthDiscretiser()` Sorts values into intervals of equal size. 18 | :class:`ArbitraryDiscretiser()` Sorts values into intervals predefined by the user. 19 | :class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete. 20 | :class:`GeometricWidthDiscretiser()` Sorts variable into geometrical intervals. 21 | ===================================== ======================================================================== 22 | 23 | 24 | .. toctree:: 25 | :maxdepth: 1 26 | :hidden: 27 | 28 | EqualFrequencyDiscretiser 29 | EqualWidthDiscretiser 30 | ArbitraryDiscretiser 31 | DecisionTreeDiscretiser 32 | GeometricWidthDiscretiser 33 | 34 | Additional transformers for discretisation 35 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 36 | 37 | For discretisation using K-means, check Scikit-learn's 38 | `KBinsDiscretizer `_. 39 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/CountFrequencyEncoder.rst: -------------------------------------------------------------------------------- 1 | CountFrequencyEncoder 2 | ===================== 3 | 4 | .. autoclass:: feature_engine.encoding.CountFrequencyEncoder 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/DecisionTreeEncoder.rst: -------------------------------------------------------------------------------- 1 | DecisionTreeEncoder 2 | =================== 3 | 4 | .. autoclass:: feature_engine.encoding.DecisionTreeEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/MeanEncoder.rst: -------------------------------------------------------------------------------- 1 | MeanEncoder 2 | =========== 3 | 4 | .. autoclass:: feature_engine.encoding.MeanEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/OneHotEncoder.rst: -------------------------------------------------------------------------------- 1 | OneHotEncoder 2 | ============= 3 | 4 | .. autoclass:: feature_engine.encoding.OneHotEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/OrdinalEncoder.rst: -------------------------------------------------------------------------------- 1 | OrdinalEncoder 2 | ============== 3 | 4 | .. autoclass:: feature_engine.encoding.OrdinalEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/RareLabelEncoder.rst: -------------------------------------------------------------------------------- 1 | RareLabelEncoder 2 | ================ 3 | 4 | 5 | .. 
autoclass:: feature_engine.encoding.RareLabelEncoder 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/StringSimilarityEncoder.rst: -------------------------------------------------------------------------------- 1 | StringSimilarityEncoder 2 | ======================= 3 | 4 | .. autoclass:: feature_engine.encoding.StringSimilarityEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/WoEEncoder.rst: -------------------------------------------------------------------------------- 1 | WoEEncoder 2 | ========== 3 | 4 | .. autoclass:: feature_engine.encoding.WoEEncoder 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/encoding/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Categorical Encoding 4 | ==================== 5 | 6 | Feature-engine's categorical encoders replace the categories of the variable with 7 | estimated or arbitrary numbers. 8 | 9 | **Summary of Feature-engine's encoders characteristics** 10 | 11 | ================================= ============ ================= ============== =============================================================== 12 | Transformer Regression Classification Multi-class Description 13 | ================================= ============ ================= ============== =============================================================== 14 | :class:`OneHotEncoder()` √ √ √ Adds dummy variables to represent each category 15 | :class:`OrdinalEncoder()` √ √ √ Replaces categories with an integer 16 | :class:`CountFreuencyEncoder()` √ √ √ Replaces categories with their count or frequency 17 | :class:`MeanEncoder()` √ √ x Replaces categories with the targe mean value 18 | :class:`WoEEncoder()` x √ x Replaces categories with the weight of the evidence 19 | :class:`DecisionTreeEncoder()` √ √ √ Replaces categories with the predictions of a decision tree 20 | :class:`RareLabelEncoder()` √ √ √ Groups infrequent categories into a single one 21 | ================================= ============ ================= ============== =============================================================== 22 | 23 | Feature-engine's categorical encoders encode only variables of type categorical or 24 | object by default. From version 1.1.0, you have the option to set the parameter 25 | `ignore_format` to True to make the transformers also accept numerical variables as 26 | input. 27 | 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | 32 | OneHotEncoder 33 | CountFrequencyEncoder 34 | OrdinalEncoder 35 | MeanEncoder 36 | WoEEncoder 37 | DecisionTreeEncoder 38 | RareLabelEncoder 39 | StringSimilarityEncoder 40 | 41 | Other categorical encoding libraries 42 | ------------------------------------ 43 | 44 | For additional categorical encoding transformations, visit the open-source package 45 | `Category encoders `_. 46 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/AddMissingIndicator.rst: -------------------------------------------------------------------------------- 1 | AddMissingIndicator 2 | =================== 3 | 4 | .. 
autoclass:: feature_engine.imputation.AddMissingIndicator 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/ArbitraryNumberImputer.rst: -------------------------------------------------------------------------------- 1 | ArbitraryNumberImputer 2 | ====================== 3 | 4 | .. autoclass:: feature_engine.imputation.ArbitraryNumberImputer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/CategoricalImputer.rst: -------------------------------------------------------------------------------- 1 | CategoricalImputer 2 | ================== 3 | 4 | .. autoclass:: feature_engine.imputation.CategoricalImputer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/DropMissingData.rst: -------------------------------------------------------------------------------- 1 | DropMissingData 2 | =============== 3 | 4 | .. autoclass:: feature_engine.imputation.DropMissingData 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/EndTailImputer.rst: -------------------------------------------------------------------------------- 1 | EndTailImputer 2 | ============== 3 | 4 | .. autoclass:: feature_engine.imputation.EndTailImputer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/MeanMedianImputer.rst: -------------------------------------------------------------------------------- 1 | MeanMedianImputer 2 | ================= 3 | 4 | .. autoclass:: feature_engine.imputation.MeanMedianImputer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/imputation/RandomSampleImputer.rst: -------------------------------------------------------------------------------- 1 | RandomSampleImputer 2 | =================== 3 | 4 | .. autoclass:: feature_engine.imputation.RandomSampleImputer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/index.rst: -------------------------------------------------------------------------------- 1 | .. _api: 2 | 3 | API 4 | === 5 | 6 | Full API documentation for Feature-engine transformers. 7 | 8 | Transformation 9 | -------------- 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | imputation/index 15 | encoding/index 16 | discretisation/index 17 | outliers/index 18 | transformation/index 19 | 20 | Creation 21 | -------- 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | creation/index 27 | datetime/index 28 | 29 | 30 | Selection 31 | --------- 32 | .. toctree:: 33 | :maxdepth: 1 34 | 35 | selection/index 36 | 37 | Time series 38 | ----------- 39 | 40 | .. toctree:: 41 | :maxdepth: 1 42 | 43 | timeseries/index 44 | 45 | Other 46 | ----- 47 | .. toctree:: 48 | :maxdepth: 1 49 | 50 | preprocessing/index 51 | scaling/index 52 | wrappers/index 53 | 54 | Pipeline 55 | -------- 56 | .. toctree:: 57 | :maxdepth: 1 58 | 59 | pipeline/index 60 | 61 | Datasets 62 | -------- 63 | .. toctree:: 64 | :maxdepth: 1 65 | 66 | datasets/index 67 | 68 | Tools 69 | ----- 70 | .. 
toctree:: 71 | :maxdepth: 1 72 | 73 | variable_handling/index -------------------------------------------------------------------------------- /docs/api_doc/outliers/ArbitraryOutlierCapper.rst: -------------------------------------------------------------------------------- 1 | ArbitraryOutlierCapper 2 | ====================== 3 | 4 | .. autoclass:: feature_engine.outliers.ArbitraryOutlierCapper 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/outliers/OutlierTrimmer.rst: -------------------------------------------------------------------------------- 1 | OutlierTrimmer 2 | ============== 3 | 4 | .. autoclass:: feature_engine.outliers.OutlierTrimmer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/outliers/Winsorizer.rst: -------------------------------------------------------------------------------- 1 | Winsorizer 2 | ========== 3 | 4 | .. autoclass:: feature_engine.outliers.Winsorizer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/outliers/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | .. currentmodule:: feature_engine.outliers 4 | 5 | Outlier Handling 6 | ================ 7 | 8 | Feature-engine's outlier transformers cap maximum or minimum values of a variable at an 9 | arbitrary or derived value. The OutlierTrimmer removes outliers from the dataset. 10 | 11 | =================================== ============================================================== 12 | Transformer Description 13 | =================================== ============================================================== 14 | :class:`Winsorizer()` Caps variables at automatically determined extreme values 15 | :class:`ArbitraryOutlierCapper()` Caps variables at values determined by the user 16 | :class:`OutlierTrimmer()` Removes outliers from the dataframe 17 | =================================== ============================================================== 18 | 19 | .. toctree:: 20 | :maxdepth: 1 21 | :hidden: 22 | 23 | Winsorizer 24 | ArbitraryOutlierCapper 25 | OutlierTrimmer -------------------------------------------------------------------------------- /docs/api_doc/pipeline/Pipeline.rst: -------------------------------------------------------------------------------- 1 | Pipeline 2 | ======== 3 | 4 | .. autoclass:: feature_engine.pipeline.Pipeline 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/pipeline/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | .. currentmodule:: feature_engine.pipeline 4 | 5 | Pipeline 6 | ======== 7 | 8 | Feature-engine's Pipeline is equivalent to Scikit-learn's pipeline, and in addition, 9 | it accepts the method `transform_x_y`, to adjust both X and y, in those cases where 10 | rows are removed from X. 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | Pipeline 16 | make_pipeline 17 | -------------------------------------------------------------------------------- /docs/api_doc/pipeline/make_pipeline.rst: -------------------------------------------------------------------------------- 1 | make_pipeline 2 | ============= 3 | 4 | .. currentmodule:: feature_engine.pipeline 5 | 6 | .. 
autofunction:: make_pipeline -------------------------------------------------------------------------------- /docs/api_doc/preprocessing/MatchCategories.rst: -------------------------------------------------------------------------------- 1 | MatchCategories 2 | =============== 3 | 4 | .. autoclass:: feature_engine.preprocessing.MatchCategories 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/preprocessing/MatchVariables.rst: -------------------------------------------------------------------------------- 1 | MatchVariables 2 | ============== 3 | 4 | .. autoclass:: feature_engine.preprocessing.MatchVariables 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/preprocessing/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Preprocessing 4 | ============= 5 | 6 | Feature-engine's preprocessing transformers apply general data pre-processing 7 | and transformation procedures. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | MatchCategories 13 | MatchVariables 14 | -------------------------------------------------------------------------------- /docs/api_doc/scaling/MeanNormalizationScaler.rst: -------------------------------------------------------------------------------- 1 | MeanNormalizationScaler 2 | ======================= 3 | 4 | .. autoclass:: feature_engine.scaling.MeanNormalizationScaler 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/scaling/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Scaling 4 | ======= 5 | 6 | Feature-engine's scaling transformers apply various scaling techniques to 7 | given columns 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | MeanNormalizationScaler 13 | -------------------------------------------------------------------------------- /docs/api_doc/selection/DropConstantFeatures.rst: -------------------------------------------------------------------------------- 1 | DropConstantFeatures 2 | ==================== 3 | 4 | .. autoclass:: feature_engine.selection.DropConstantFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/selection/DropCorrelatedFeatures.rst: -------------------------------------------------------------------------------- 1 | DropCorrelatedFeatures 2 | ====================== 3 | 4 | .. autoclass:: feature_engine.selection.DropCorrelatedFeatures 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/selection/DropDuplicateFeatures.rst: -------------------------------------------------------------------------------- 1 | DropDuplicateFeatures 2 | ===================== 3 | 4 | 5 | .. autoclass:: feature_engine.selection.DropDuplicateFeatures 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/selection/DropFeatures.rst: -------------------------------------------------------------------------------- 1 | DropFeatures 2 | ============= 3 | 4 | .. 
autoclass:: feature_engine.selection.DropFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/selection/DropHighPSIFeatures.rst: -------------------------------------------------------------------------------- 1 | DropHighPSIFeatures 2 | =================== 3 | 4 | 5 | .. autoclass:: feature_engine.selection.DropHighPSIFeatures 6 | :members: -------------------------------------------------------------------------------- /docs/api_doc/selection/MRMR.rst: -------------------------------------------------------------------------------- 1 | MRMR 2 | ==== 3 | 4 | 5 | .. autoclass:: feature_engine.selection.MRMR 6 | :members: -------------------------------------------------------------------------------- /docs/api_doc/selection/ProbeFeatureSelection.rst: -------------------------------------------------------------------------------- 1 | ProbeFeatureSelection 2 | ===================== 3 | 4 | .. autoclass:: feature_engine.selection.ProbeFeatureSelection 5 | :members: -------------------------------------------------------------------------------- /docs/api_doc/selection/RecursiveFeatureAddition.rst: -------------------------------------------------------------------------------- 1 | RecursiveFeatureAddition 2 | ======================== 3 | 4 | 5 | .. autoclass:: feature_engine.selection.RecursiveFeatureAddition 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/selection/RecursiveFeatureElimination.rst: -------------------------------------------------------------------------------- 1 | RecursiveFeatureElimination 2 | ============================ 3 | 4 | 5 | .. autoclass:: feature_engine.selection.RecursiveFeatureElimination 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/selection/SelectByInformationValue.rst: -------------------------------------------------------------------------------- 1 | SelectByInformationValue 2 | ======================== 3 | 4 | .. autoclass:: feature_engine.selection.SelectByInformationValue 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/selection/SelectByShuffling.rst: -------------------------------------------------------------------------------- 1 | SelectByShuffling 2 | ================= 3 | 4 | .. autoclass:: feature_engine.selection.SelectByShuffling 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/selection/SelectBySingleFeaturePerformance.rst: -------------------------------------------------------------------------------- 1 | SelectBySingleFeaturePerformance 2 | ================================ 3 | 4 | 5 | .. autoclass:: feature_engine.selection.SelectBySingleFeaturePerformance 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/api_doc/selection/SelectByTargetMeanPerformance.rst: -------------------------------------------------------------------------------- 1 | SelectByTargetMeanPerformance 2 | ============================= 3 | 4 | 5 | .. 
autoclass:: feature_engine.selection.SelectByTargetMeanPerformance 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/selection/SmartCorrelatedSelection.rst: -------------------------------------------------------------------------------- 1 | SmartCorrelatedSelection 2 | ======================== 3 | 4 | 5 | .. autoclass:: feature_engine.selection.SmartCorrelatedSelection 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/timeseries/forecasting/ExpandingWindowFeatures.rst: -------------------------------------------------------------------------------- 1 | ExpandingWindowFeatures 2 | ======================= 3 | 4 | .. autoclass:: feature_engine.timeseries.forecasting.ExpandingWindowFeatures 5 | :members: -------------------------------------------------------------------------------- /docs/api_doc/timeseries/forecasting/LagFeatures.rst: -------------------------------------------------------------------------------- 1 | LagFeatures 2 | =========== 3 | 4 | .. autoclass:: feature_engine.timeseries.forecasting.LagFeatures 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/timeseries/forecasting/WindowFeatures.rst: -------------------------------------------------------------------------------- 1 | WindowFeatures 2 | ============== 3 | 4 | .. autoclass:: feature_engine.timeseries.forecasting.WindowFeatures 5 | :members: -------------------------------------------------------------------------------- /docs/api_doc/timeseries/forecasting/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Forecasting Features 4 | ==================== 5 | 6 | Feature-engine's time series forecasting transformers create and add new features to the 7 | dataframe by lagging features or calculating statistics over windows of time in the 8 | past. 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | LagFeatures 14 | WindowFeatures 15 | ExpandingWindowFeatures 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/api_doc/timeseries/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Time Series Features 4 | ==================== 5 | 6 | Feature-engine's time series transformers derive features from time series data. 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | 11 | forecasting/index 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/ArcsinTransformer.rst: -------------------------------------------------------------------------------- 1 | ArcsinTransformer 2 | ================= 3 | 4 | 5 | .. autoclass:: feature_engine.transformation.ArcsinTransformer 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/BoxCoxTransformer.rst: -------------------------------------------------------------------------------- 1 | BoxCoxTransformer 2 | ================= 3 | 4 | .. 
autoclass:: feature_engine.transformation.BoxCoxTransformer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/LogCpTransformer.rst: -------------------------------------------------------------------------------- 1 | LogCpTransformer 2 | ================ 3 | 4 | .. autoclass:: feature_engine.transformation.LogCpTransformer 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/LogTransformer.rst: -------------------------------------------------------------------------------- 1 | LogTransformer 2 | ============== 3 | 4 | 5 | .. autoclass:: feature_engine.transformation.LogTransformer 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/PowerTransformer.rst: -------------------------------------------------------------------------------- 1 | PowerTransformer 2 | ================ 3 | 4 | 5 | .. autoclass:: feature_engine.transformation.PowerTransformer 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/ReciprocalTransformer.rst: -------------------------------------------------------------------------------- 1 | ReciprocalTransformer 2 | ===================== 3 | 4 | 5 | .. autoclass:: feature_engine.transformation.ReciprocalTransformer 6 | :members: 7 | 8 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/YeoJohnsonTransformer.rst: -------------------------------------------------------------------------------- 1 | YeoJohnsonTransformer 2 | ===================== 3 | 4 | .. autoclass:: feature_engine.transformation.YeoJohnsonTransformer 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/transformation/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Variance Stabilizing Transformations 4 | ==================================== 5 | 6 | Feature-engine's variable transformers apply various mathematical transformations to 7 | numerical variables. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | LogTransformer 13 | LogCpTransformer 14 | ReciprocalTransformer 15 | ArcsinTransformer 16 | PowerTransformer 17 | BoxCoxTransformer 18 | YeoJohnsonTransformer 19 | 20 |
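As a quick illustration, these transformers follow the usual fit/transform API; below is a minimal sketch with the Yeo-Johnson transformation (the toy data is made up, and the `lambda_dict_` attribute shown is an assumption about the fitted transformer rather than a documented guarantee here):

.. code:: python

    import numpy as np
    import pandas as pd

    from feature_engine.transformation import YeoJohnsonTransformer

    # toy right-skewed variable (illustrative data only)
    X = pd.DataFrame({"lot_area": np.random.exponential(scale=1000, size=100)})

    yjt = YeoJohnsonTransformer(variables=["lot_area"])
    X_t = yjt.fit_transform(X)

    # learned transformation parameter per variable (assumed attribute name)
    print(yjt.lambda_dict_)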
21 | Transformers in other Libraries 22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | 24 | These and additional transformations can be obtained with the following Scikit-learn 25 | classes: 26 | 27 | * `FunctionTransformer `_ 28 | * `PowerTransformer `_ 29 | 30 | Note that Scikit-learn classes return Numpy arrays and are applied to the entire dataset. 31 | -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/check_all_variables.rst: -------------------------------------------------------------------------------- 1 | check_all_variables 2 | =================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: check_all_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/check_categorical_variables.rst: -------------------------------------------------------------------------------- 1 | check_categorical_variables 2 | =========================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: check_categorical_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/check_datetime_variables.rst: -------------------------------------------------------------------------------- 1 | check_datetime_variables 2 | ======================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: check_datetime_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/check_numerical_variables.rst: -------------------------------------------------------------------------------- 1 | check_numerical_variables 2 | ========================= 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: check_numerical_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/find_all_variables.rst: -------------------------------------------------------------------------------- 1 | find_all_variables 2 | ================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: find_all_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/find_categorical_and_numerical_variables.rst: -------------------------------------------------------------------------------- 1 | find_categorical_and_numerical_variables 2 | ======================================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: find_categorical_and_numerical_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/find_categorical_variables.rst: -------------------------------------------------------------------------------- 1 | find_categorical_variables 2 | ========================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: find_categorical_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/find_datetime_variables.rst: -------------------------------------------------------------------------------- 1 | find_datetime_variables 2 | ======================= 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: find_datetime_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/find_numerical_variables.rst: -------------------------------------------------------------------------------- 1 | find_numerical_variables 2 | ======================== 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: find_numerical_variables -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Variable handling functions 4 | =========================== 5 | 6 | This set of functions finds variables of a specific type in a dataframe, or checks that a 7 | list of variables is of a specified data type. 8 | 9 | The `find` functions take a dataframe as an argument and return a list with the names 10 | of the variables of the desired type. 
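For example, a minimal sketch of how the `find` functions are typically used (the toy dataframe and the expected outputs shown in the comments are only for illustration):

.. code:: python

    import pandas as pd

    from feature_engine.variable_handling import (
        find_categorical_variables,
        find_numerical_variables,
    )

    df = pd.DataFrame({
        "age": [20, 31, 42],
        "fare": [7.25, 71.28, 8.05],
        "embarked": ["S", "C", "S"],
    })

    find_numerical_variables(df)    # ['age', 'fare']
    find_categorical_variables(df)  # ['embarked']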
11 | 12 | The `check` functions check that the variables in a list are all of the desired data type. 13 | 14 | The `retain` functions select the variables in a list if they fulfill a condition. 15 | 16 | These functions are used under the hood by all Feature-engine transformers to select the 17 | variables that they will modify. 18 | 19 | .. toctree:: 20 | :maxdepth: 1 21 | 22 | find_all_variables 23 | find_categorical_variables 24 | find_datetime_variables 25 | find_numerical_variables 26 | find_categorical_and_numerical_variables 27 | check_all_variables 28 | check_categorical_variables 29 | check_datetime_variables 30 | check_numerical_variables 31 | retain_variables_if_in_df 32 | -------------------------------------------------------------------------------- /docs/api_doc/variable_handling/retain_variables_if_in_df.rst: -------------------------------------------------------------------------------- 1 | retain_variables_if_in_df 2 | ========================= 3 | 4 | .. currentmodule:: feature_engine.variable_handling 5 | 6 | .. autofunction:: retain_variables_if_in_df -------------------------------------------------------------------------------- /docs/api_doc/wrappers/Wrapper.rst: -------------------------------------------------------------------------------- 1 | SklearnTransformerWrapper 2 | ========================= 3 | 4 | .. autoclass:: feature_engine.wrappers.SklearnTransformerWrapper 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/api_doc/wrappers/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | .. currentmodule:: feature_engine.wrappers 4 | 5 | Scikit-learn Wrapper 6 | ==================== 7 | 8 | Feature-engine's Scikit-learn wrappers wrap Scikit-learn transformers, allowing you to 9 | apply them only to a selected subset of features. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | Wrapper 15 | 16 | Other wrappers 17 | ~~~~~~~~~~~~~~ 18 | 19 | The :class:`SklearnTransformerWrapper()` offers similar functionality to the 20 | `ColumnTransformer `_ 21 | class available in Scikit-learn. They differ in how the variables to transform are 22 | selected. -------------------------------------------------------------------------------- /docs/contribute/code_of_conduct.rst: -------------------------------------------------------------------------------- 1 | Code of Conduct 2 | =============== 3 | 4 | Feature-engine is an open source Python project. We follow the 5 | `Python Software Foundation Code of Conduct `_. 6 | All interactions among members of the Feature-engine community must meet those 7 | guidelines. This includes (but is not limited to) interactions through the mailing 8 | list, GitHub and StackOverflow. 9 | 10 | Everyone is expected to be open, considerate, and respectful of others no matter what 11 | their position is within the project. We show gratitude for any contribution, big or 12 | small. We welcome feedback and participation. We want to make Feature-engine a nice, 13 | welcoming and safe place for you to make your first contribution to open source, and why 14 | not the second, the third and so on :). 15 | -------------------------------------------------------------------------------- /docs/contribute/contribute_jup.rst: -------------------------------------------------------------------------------- 1 | .. 
-*- mode: rst -*- 2 | 3 | Contribute Jupyter notebooks 4 | ============================ 5 | 6 | We created a collection of Jupyter notebooks that showcase the main functionality of 7 | Feature-engine's transformers. We link these notebooks throughout the main documentation 8 | to offer users more examples and details about transformers and how to use them. 9 | 10 | **Note** that the Jupyter notebooks are hosted in a separate 11 | `Github repository `_. 12 | 13 | Here are some guidelines on how to add a new notebook or update an existing one. The 14 | contribution workflow is the same one we use for the main source code base. 15 | 16 | Jupyter contribution workflow 17 | ----------------------------- 18 | 19 | 1. Fork the `Github repository `_. 20 | 2. Clone your fork to your local computer: `git clone https://github.com/<your-github-username>/feature-engine-examples.git`. 21 | 3. Navigate into the project directory: `cd feature-engine-examples`. 22 | 4. If you haven't done so yet, install feature-engine: `pip install feature_engine`. 23 | 5. Create a feature branch with a meaningful name: `git checkout -b mynotebookbranch`. 24 | 6. Develop your notebook. 25 | 7. Add and commit the changes to your copy of the fork, then push the branch: `git add .`, `git commit -m "a meaningful commit message"`, `git push origin mynotebookbranch`. 26 | 8. Go to your fork on Github and make a PR to this repo. 27 | 9. Done! 28 | 29 | The review process for notebooks is usually much faster than for the main source code base. 30 | 31 | Jupyter creation guidelines 32 | --------------------------- 33 | 34 | If you want to add a new Jupyter notebook, there are a few things to note: 35 | 36 | - Make sure that the dataset you use is publicly available and has a clear license stating that it is free to use 37 | - Do not upload datasets to the repository 38 | - Add instructions on how to obtain and prepare the data for the demo 39 | - Throughout the notebook, add explanations of what you are going to do next and what conclusions you can draw from the output 40 | 41 | That's it! Fairly straightforward. 42 | 43 | We look forward to your contribution :) -------------------------------------------------------------------------------- /docs/donate.rst: -------------------------------------------------------------------------------- 1 | Sponsor us 2 | ---------- 3 | 4 | | 5 | 6 | .. image:: images/sponsors/call_for_sponsors.png 7 | :align: center 8 | :target: https://github.com/sponsors/feature-engine 9 | 10 | | 11 | 12 | Support Feature-engine financially through 13 | `Github Sponsors `_ and help further our 14 | mission to democratize machine learning and programming tools through open-source. 15 | 16 | You can find more details about how we use donations on the 17 | `sponsors page `_. 18 | 19 | | 20 | 21 | Sponsors 22 | -------- 23 | 24 | Feature-engine is a community-driven project; however, institutional, private and 25 | individual support helps to ensure its sustainability. The project would like to thank 26 | the following sponsors: 27 | 28 | | 29 | 30 | .. 
image:: images/sponsors/trainindata.png 31 | :width: 200pt 32 | :align: center 33 | :target: https://www.trainindata.com/ 34 | 35 | -------------------------------------------------------------------------------- /docs/images/1024px-Relationship_between_mean_and_median_under_different_skewness.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/1024px-Relationship_between_mean_and_median_under_different_skewness.png -------------------------------------------------------------------------------- /docs/images/Discretisation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/Discretisation.png -------------------------------------------------------------------------------- /docs/images/FeatureEnginePackageStructure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructure.png -------------------------------------------------------------------------------- /docs/images/FeatureEnginePackageStructureCrossSectional.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureCrossSectional.png -------------------------------------------------------------------------------- /docs/images/FeatureEnginePackageStructureDatetimeText.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureDatetimeText.png -------------------------------------------------------------------------------- /docs/images/FeatureEnginePackageStructureTimeseries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureTimeseries.png -------------------------------------------------------------------------------- /docs/images/PSI_distribution_case1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case1.png -------------------------------------------------------------------------------- /docs/images/PSI_distribution_case3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case3.png -------------------------------------------------------------------------------- /docs/images/PSI_distribution_case4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case4.png -------------------------------------------------------------------------------- 
/docs/images/PSI_distribution_case5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case5.png -------------------------------------------------------------------------------- /docs/images/Variable_Transformation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/Variable_Transformation.png -------------------------------------------------------------------------------- /docs/images/arbitraryvalueimputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/arbitraryvalueimputation.png -------------------------------------------------------------------------------- /docs/images/bmilogcp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/bmilogcp.png -------------------------------------------------------------------------------- /docs/images/bmiraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/bmiraw.png -------------------------------------------------------------------------------- /docs/images/boxplot-age-percentiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-age-percentiles.png -------------------------------------------------------------------------------- /docs/images/boxplot-age.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-age.png -------------------------------------------------------------------------------- /docs/images/boxplot-fare-mad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-fare-mad.png -------------------------------------------------------------------------------- /docs/images/boxplot-fare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-fare.png -------------------------------------------------------------------------------- /docs/images/boxplot-sibsp-fare-iqr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-sibsp-fare-iqr.png -------------------------------------------------------------------------------- /docs/images/boxplot-sibsp.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-sibsp.png -------------------------------------------------------------------------------- /docs/images/boxplot-titanic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-titanic.png -------------------------------------------------------------------------------- /docs/images/breast_cancer_arcsin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/breast_cancer_arcsin.png -------------------------------------------------------------------------------- /docs/images/breast_cancer_raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/breast_cancer_raw.png -------------------------------------------------------------------------------- /docs/images/cookbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/cookbook.png -------------------------------------------------------------------------------- /docs/images/dmlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/dmlm.png -------------------------------------------------------------------------------- /docs/images/endtailimputer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/endtailimputer.png -------------------------------------------------------------------------------- /docs/images/equalfrequencydiscretisation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation.png -------------------------------------------------------------------------------- /docs/images/equalfrequencydiscretisation_gaussian.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation_gaussian.png -------------------------------------------------------------------------------- /docs/images/equalfrequencydiscretisation_skewed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation_skewed.png -------------------------------------------------------------------------------- /docs/images/equalwidthdiscretisation.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalwidthdiscretisation.png -------------------------------------------------------------------------------- /docs/images/f_statistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/f_statistic.png -------------------------------------------------------------------------------- /docs/images/feml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/feml.png -------------------------------------------------------------------------------- /docs/images/fetsf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fetsf.png -------------------------------------------------------------------------------- /docs/images/fork.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fork.png -------------------------------------------------------------------------------- /docs/images/frequentcategoryimputer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/frequentcategoryimputer.png -------------------------------------------------------------------------------- /docs/images/fsml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fsml.png -------------------------------------------------------------------------------- /docs/images/fsmlbook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fsmlbook.png -------------------------------------------------------------------------------- /docs/images/fwml.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fwml.png -------------------------------------------------------------------------------- /docs/images/hour_sin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin.png -------------------------------------------------------------------------------- /docs/images/hour_sin2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin2.png -------------------------------------------------------------------------------- /docs/images/hour_sin3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin3.png -------------------------------------------------------------------------------- /docs/images/hour_sin4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin4.png -------------------------------------------------------------------------------- /docs/images/increasingwidthdisc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/increasingwidthdisc.png -------------------------------------------------------------------------------- /docs/images/ivml_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/ivml_logo.png -------------------------------------------------------------------------------- /docs/images/logcpraw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logcpraw.png -------------------------------------------------------------------------------- /docs/images/logcptransform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logcptransform.png -------------------------------------------------------------------------------- /docs/images/logo/FeatureEngine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/FeatureEngine.png -------------------------------------------------------------------------------- /docs/images/logo/Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/Logo.png -------------------------------------------------------------------------------- /docs/images/logo/Logo_name.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/Logo_name.png -------------------------------------------------------------------------------- /docs/images/logo/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/favicon.png -------------------------------------------------------------------------------- /docs/images/lotarea_pt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_pt.png -------------------------------------------------------------------------------- /docs/images/lotarea_pt_custom_exp.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_pt_custom_exp.png -------------------------------------------------------------------------------- /docs/images/lotarea_raw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_raw.png -------------------------------------------------------------------------------- /docs/images/lotareaboxcox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareaboxcox.png -------------------------------------------------------------------------------- /docs/images/lotarealog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarealog.png -------------------------------------------------------------------------------- /docs/images/lotareapower.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareapower.png -------------------------------------------------------------------------------- /docs/images/lotarearaw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarearaw.png -------------------------------------------------------------------------------- /docs/images/lotareareciprocal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareareciprocal.png -------------------------------------------------------------------------------- /docs/images/lotareayeojohnson.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareayeojohnson.png -------------------------------------------------------------------------------- /docs/images/lotshape-price-per-cat-enc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotshape-price-per-cat-enc.png -------------------------------------------------------------------------------- /docs/images/lotshape-price-per-cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotshape-price-per-cat.png -------------------------------------------------------------------------------- /docs/images/meanmedianimputater_distributions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/meanmedianimputater_distributions.png -------------------------------------------------------------------------------- /docs/images/medianimputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medianimputation.png -------------------------------------------------------------------------------- /docs/images/medinc_disc_arbitrarily.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_disc_arbitrarily.png -------------------------------------------------------------------------------- /docs/images/medinc_disc_arbitrarily2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_disc_arbitrarily2.png -------------------------------------------------------------------------------- /docs/images/medinc_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_hist.png -------------------------------------------------------------------------------- /docs/images/missingcategoryimputer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/missingcategoryimputer.png -------------------------------------------------------------------------------- /docs/images/missingindicator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/missingindicator.png -------------------------------------------------------------------------------- /docs/images/mli_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mli_logo.png -------------------------------------------------------------------------------- /docs/images/monotonic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/monotonic.png -------------------------------------------------------------------------------- /docs/images/mzoning-price-per-cat-enc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mzoning-price-per-cat-enc.png -------------------------------------------------------------------------------- /docs/images/mzoning-price-per-cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mzoning-price-per-cat.png 
-------------------------------------------------------------------------------- /docs/images/nonnormalvars2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2.png -------------------------------------------------------------------------------- /docs/images/nonnormalvars2logtransformed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2logtransformed.png -------------------------------------------------------------------------------- /docs/images/nonnormalvars2transformed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2transformed.png -------------------------------------------------------------------------------- /docs/images/ordinal_encoding_monotonic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/ordinal_encoding_monotonic.png -------------------------------------------------------------------------------- /docs/images/pipelineprediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/pipelineprediction.png -------------------------------------------------------------------------------- /docs/images/probe-importance-std.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe-importance-std.png -------------------------------------------------------------------------------- /docs/images/probe_feature_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe_feature_normal.png -------------------------------------------------------------------------------- /docs/images/probe_features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe_features.png -------------------------------------------------------------------------------- /docs/images/quasiconstant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/quasiconstant.png -------------------------------------------------------------------------------- /docs/images/randomsampleimputation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/randomsampleimputation.png -------------------------------------------------------------------------------- 
/docs/images/reciprocal_transformer/reciprocal_transfomer_inverse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_inverse.png -------------------------------------------------------------------------------- /docs/images/reciprocal_transformer/reciprocal_transfomer_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_new.png -------------------------------------------------------------------------------- /docs/images/reciprocal_transformer/reciprocal_transfomer_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_original.png -------------------------------------------------------------------------------- /docs/images/reciprocal_transformer/reciprocal_transformer_3plots_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_new.png -------------------------------------------------------------------------------- /docs/images/reciprocal_transformer/reciprocal_transformer_3plots_original.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_original.png -------------------------------------------------------------------------------- /docs/images/rfa_linreg_imp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfa_linreg_imp.png -------------------------------------------------------------------------------- /docs/images/rfa_perf_drifts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfa_perf_drifts.png -------------------------------------------------------------------------------- /docs/images/rfe_perf_drift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfe_perf_drift.png -------------------------------------------------------------------------------- /docs/images/rfimportancemrmr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfimportancemrmr.png -------------------------------------------------------------------------------- /docs/images/selectionChart.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/selectionChart.png -------------------------------------------------------------------------------- /docs/images/shuffle-features-std.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/shuffle-features-std.png -------------------------------------------------------------------------------- /docs/images/single-feature-perf-std.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/single-feature-perf-std.png -------------------------------------------------------------------------------- /docs/images/single_feature_probes_imp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/single_feature_probes_imp.png -------------------------------------------------------------------------------- /docs/images/sponsors/call_for_sponsors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/call_for_sponsors.png -------------------------------------------------------------------------------- /docs/images/sponsors/how-did-you-discover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/how-did-you-discover.png -------------------------------------------------------------------------------- /docs/images/sponsors/trainindata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/trainindata.png -------------------------------------------------------------------------------- /docs/images/summary/imputersSummary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/summary/imputersSummary.png -------------------------------------------------------------------------------- /docs/images/summary/selectionSummary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/summary/selectionSummary.png -------------------------------------------------------------------------------- /docs/images/target-mean-sel-std.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/target-mean-sel-std.png -------------------------------------------------------------------------------- /docs/images/toydata_pt_raw.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_raw.png -------------------------------------------------------------------------------- /docs/images/toydata_pt_transformed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_transformed.png -------------------------------------------------------------------------------- /docs/images/toydata_pt_transformed_custom_exp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_transformed_custom_exp.png -------------------------------------------------------------------------------- /docs/images/transformedcoupleYJ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/transformedcoupleYJ.png -------------------------------------------------------------------------------- /docs/images/treediscretisation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treediscretisation.png -------------------------------------------------------------------------------- /docs/images/treemonotonicprediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treemonotonicprediction.png -------------------------------------------------------------------------------- /docs/images/treepredictionrounded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treepredictionrounded.png -------------------------------------------------------------------------------- /docs/images/untransformedcoupleYJ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/untransformedcoupleYJ.png -------------------------------------------------------------------------------- /docs/images/woe_encoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/woe_encoding.png -------------------------------------------------------------------------------- /docs/images/woe_prediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/woe_prediction.png -------------------------------------------------------------------------------- /docs/images/yeojohnsonformula.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/yeojohnsonformula.png -------------------------------------------------------------------------------- /docs/quickstart/datasets.rst: -------------------------------------------------------------------------------- 1 | .. _datasets: 2 | 3 | Datasets 4 | ======== 5 | 6 | The user guide and examples included in Feature-engine's documentation are based on 7 | these 3 datasets: 8 | 9 | Titanic dataset 10 | ~~~~~~~~~~~~~~~ 11 | 12 | We use the dataset available in `openML `_ which can be 13 | downloaded from `here `_. 14 | 15 | Ames House Prices dataset 16 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 17 | 18 | We use the data set created by Professor Dean De Cock: 19 | * Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing 20 | * Data as an End of Semester Regression Project, Journal of Statistics Education, Vol.19, No. 3. 21 | 22 | The examples are based on a copy of the dataset available on 23 | `Kaggle `_. 24 | 25 | The original data and documentation can be found here: 26 | 27 | * `Documentation `_ 28 | 29 | * `Data `_ 30 | 31 | Credit Approval dataset 32 | ~~~~~~~~~~~~~~~~~~~~~~~ 33 | 34 | We use the Credit Approval dataset from the UCI Machine Learning Repository: 35 | 36 | Dua, D. and Graff, C. (2019). `UCI Machine Learning Repository `_. 37 | Irvine, CA: University of California, School of Information and Computer Science. 38 | 39 | To download the dataset visit this 40 | `website `_ 41 | and click on "crx.data" to download the data set. 42 | 43 | To prepare the data for the examples: 44 | 45 | .. code:: python 46 | 47 | import random 48 | import pandas as pd 49 | import numpy as np 50 | 51 | # load data 52 | data = pd.read_csv('crx.data', header=None) 53 | 54 | # create variable names according to UCI Machine Learning information 55 | varnames = ['A'+str(s) for s in range(1,17)] 56 | data.columns = varnames 57 | 58 | # replace ? by np.nan 59 | data = data.replace('?', np.nan) 60 | 61 | # re-cast some variables to the correct types 62 | data['A2'] = data['A2'].astype('float') 63 | data['A14'] = data['A14'].astype('float') 64 | 65 | # encode target to binary 66 | data['A16'] = data['A16'].map({'+':1, '-':0}) 67 | 68 | # save the data 69 | data.to_csv('creditApprovalUCI.csv', index=False) -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Library Dependencies 2 | numpy>=1.18.2 3 | pandas>=1.0.3 4 | scikit-learn>=1.0.0 5 | scipy>=1.4.1 6 | statsmodels>=0.11.1 7 | 8 | # Documentation Dependencies 9 | docutils==0.16 10 | Sphinx>=4.3.2 11 | pydata_sphinx_theme>=0.7.2 12 | sphinx_autodoc_typehints>=1.11.1,<=1.21.3 13 | numpydoc>=0.9.2 14 | -------------------------------------------------------------------------------- /docs/resources/books.rst: -------------------------------------------------------------------------------- 1 | Books 2 | ===== 3 | 4 | You can learn more about how to use Feature-engine and feature engineering in general 5 | in the following books: 6 | 7 | .. figure:: ../images/cookbook.png 8 | :width: 200 9 | :figclass: align-center 10 | :align: left 11 | :target: https://www.packtpub.com/en-us/product/python-feature-engineering-cookbook-9781835883587 12 | 13 | Python Feature Engineering Cookbook 14 | 15 | 16 | .. 
figure:: ../images/fsmlbook.png 17 | :width: 200 18 | :figclass: align-center 19 | :align: left 20 | :target: https://www.trainindata.com/p/feature-selection-in-machine-learning-book 21 | 22 | Feature Selection in Machine Learning -------------------------------------------------------------------------------- /docs/resources/courses.rst: -------------------------------------------------------------------------------- 1 | Courses 2 | ======= 3 | 4 | You can learn more about how to use Feature-engine, and about feature engineering and feature 5 | selection in general, in the following online courses: 6 | 7 | .. figure:: ../images/feml.png 8 | :width: 300 9 | :figclass: align-center 10 | :align: left 11 | :target: https://www.trainindata.com/p/feature-engineering-for-machine-learning 12 | 13 | Feature Engineering for Machine Learning 14 | 15 | .. figure:: ../images/fsml.png 16 | :width: 300 17 | :figclass: align-center 18 | :align: right 19 | :target: https://www.trainindata.com/p/feature-selection-for-machine-learning 20 | 21 | Feature Selection for Machine Learning 22 | 23 | .. figure:: ../images/fwml.png 24 | :width: 300 25 | :figclass: align-center 26 | :align: left 27 | :target: https://www.courses.trainindata.com/p/forecasting-with-machine-learning 28 | 29 | Forecasting with Machine Learning 30 | 31 | .. figure:: ../images/fetsf.png 32 | :width: 300 33 | :figclass: align-center 34 | :align: right 35 | :target: https://www.trainindata.com/p/feature-engineering-for-forecasting 36 | 37 | Feature Engineering for Time Series Forecasting 38 | 39 | .. figure:: ../images/mli_logo.png 40 | :width: 300 41 | :figclass: align-center 42 | :align: left 43 | :target: https://www.courses.trainindata.com/p/machine-learning-interpretability 44 | 45 | Interpreting Machine Learning Models 46 | 47 | 48 | | 49 | | 50 | -------------------------------------------------------------------------------- /docs/resources/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. _learning_resources: 3 | 4 | Resources 5 | ========= 6 | 7 | Here you will find resources to learn more about Feature-engine, and about feature 8 | engineering and selection in general. 9 | 10 | We have gathered online courses, books, blogs, videos, podcasts, Jupyter notebooks and 11 | Kaggle kernels, so you can choose the resources that best suit the way you like 12 | to learn. 13 | 14 | .. toctree:: 15 | :maxdepth: 1 16 | 17 | courses 18 | books 19 | blogs 20 | tutorials -------------------------------------------------------------------------------- /docs/resources/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | How To 5 | ------ 6 | 7 | Check our `Jupyter notebooks `_ 8 | showcasing the functionality of each Feature-engine transformer. 9 | 10 | Kaggle Kernels 11 | -------------- 12 | 13 | We also prepared Kaggle kernels with demos mixing data exploration, feature engineering, 14 | feature creation, feature selection and hyperparameter optimization of entire pipelines.
15 | 16 | - `Feature selection for bank customer satisfaction prediction `_ 17 | - `Feature engineering and selection for house price prediction `_ 18 | - `Feature creation for wine quality prediction `_ 19 | - `Feature engineering and model stacking for house price modelling `_ 20 | - `Feature engineering with Feature-engine and Randomized search `_ 21 | - `Feature engineering with Feature-engine and Grid search `_ 22 | 23 | 24 | 25 | Video tutorials 26 | --------------- 27 | 28 | You can find some videos on how to use Feature-engine in the 29 | `Feature-engine playlist `_ 30 | on Train in Data's YouTube channel. The list is a bit short at the moment, apologies. -------------------------------------------------------------------------------- /docs/sphinxext/README.txt: -------------------------------------------------------------------------------- 1 | ===================================== 2 | numpydoc -- Numpy's Sphinx extensions 3 | ===================================== 4 | 5 | Numpy's documentation uses several custom extensions to Sphinx. These 6 | are shipped in this ``numpydoc`` package, in case you want to make use 7 | of them in third-party projects. 8 | 9 | The following extensions are available: 10 | 11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add 12 | the code description directives ``np-function``, ``np-cfunction``, etc. 13 | that support the Numpy docstring syntax. 14 | 15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. 16 | 17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` 18 | directive. Note that this implementation may still undergo severe 19 | changes or eventually be deprecated. 20 | 21 | - ``numpydoc.only_directives``: (DEPRECATED) 22 | 23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. 24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``; 25 | the Sphinx 1.0 version is recommended over the one included in 26 | Numpydoc. 27 | 28 | 29 | numpydoc 30 | ======== 31 | 32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings 33 | following the Numpy/Scipy format to a form palatable to Sphinx. 34 | 35 | Options 36 | ------- 37 | 38 | The following options can be set in conf.py: 39 | 40 | - numpydoc_use_plots: bool 41 | 42 | Whether to produce ``plot::`` directives for Examples sections that 43 | contain ``import matplotlib``. 44 | 45 | - numpydoc_show_class_members: bool 46 | 47 | Whether to show all members of a class in the Methods and Attributes 48 | sections automatically. 49 | 50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) 51 | 52 | Whether to insert an edit link after docstrings. 53 | -------------------------------------------------------------------------------- /docs/user_guide/datetime/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Datetime Features 4 | ================= 5 | 6 | Feature-engine's datetime transformers are able to extract a wide variety of datetime 7 | features from existing datetime or object-like data. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | DatetimeFeatures 13 | DatetimeSubtraction -------------------------------------------------------------------------------- /docs/user_guide/discretisation/index.rst: -------------------------------------------------------------------------------- 1 | .. _discretization_transformers: 2 | 3 | ..
-*- mode: rst -*- 4 | 5 | Discretisation 6 | ============== 7 | 8 | Feature-engine's variable discretisation transformers transform continuous numerical 9 | variables into discrete variables. The discrete variables will contain contiguous 10 | intervals in the case of the equal frequency and equal width transformers. The 11 | Decision Tree discretiser will return a discrete variable, in the sense that the 12 | new feature takes a finite number of values. 13 | 14 | The following illustration shows the process of discretisation: 15 | 16 | .. figure:: ../../images/Discretisation.png 17 | :align: center 18 | :width: 500 19 | 20 | 21 | With discretisation, sometimes we can obtain a more homogeneous value spread from an 22 | originally skewed variable. But this is not always possible. 23 | 24 | **Discretisation plus encoding** 25 | 26 | Very often, after we discretise continuous numerical variables into discrete intervals, 27 | we want to continue engineering them as if they were categorical. This is common practice. 28 | Throughout the user guide, we point to Jupyter notebooks that showcase this functionality. 29 | 30 | **Discretisers** 31 | 32 | .. toctree:: 33 | :maxdepth: 1 34 | 35 | EqualFrequencyDiscretiser 36 | EqualWidthDiscretiser 37 | ArbitraryDiscretiser 38 | DecisionTreeDiscretiser 39 | GeometricWidthDiscretiser 40 | -------------------------------------------------------------------------------- /docs/user_guide/imputation/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Missing Data Imputation 4 | ======================= 5 | 6 | Feature-engine's missing data imputers replace missing data with parameters estimated 7 | from the data, or with arbitrary values pre-defined by the user. The following image summarizes 8 | the imputers' main functionality. 9 | 10 | .. figure:: ../../images/summary/imputersSummary.png 11 | :align: center 12 | 13 | | 14 | 15 | In this guide, you will find code snippets to quickly apply the imputers 16 | to your datasets, as well as general knowledge and guidance on the imputation 17 | techniques. 18 | 19 | 20 | Imputers 21 | ~~~~~~~~ 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | MeanMedianImputer 27 | ArbitraryNumberImputer 28 | EndTailImputer 29 | CategoricalImputer 30 | RandomSampleImputer 31 | AddMissingIndicator 32 | DropMissingData -------------------------------------------------------------------------------- /docs/user_guide/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. _user_guide: 3 | 4 | User Guide 5 | ========== 6 | 7 | In this section you will find additional information about Feature-engine's transformers 8 | and feature engineering transformations in general, as well as additional examples. 9 | 10 | Transformation 11 | -------------- 12 | 13 | .. toctree:: 14 | :maxdepth: 1 15 | 16 | imputation/index 17 | encoding/index 18 | discretisation/index 19 | outliers/index 20 | transformation/index 21 | scaling/index 22 | 23 | Creation 24 | -------- 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | creation/index 30 | datetime/index 31 | 32 | 33 | Selection 34 | --------- 35 | .. toctree:: 36 | :maxdepth: 1 37 | 38 | selection/index 39 | 40 | 41 | Time series 42 | ----------- 43 | 44 | .. toctree:: 45 | :maxdepth: 1 46 | 47 | timeseries/index 48 | 49 | 50 | Other 51 | ----- 52 | ..
toctree:: 53 | :maxdepth: 1 54 | 55 | preprocessing/index 56 | wrappers/index 57 | 58 | Pipeline 59 | -------- 60 | .. toctree:: 61 | :maxdepth: 1 62 | 63 | pipeline/index 64 | 65 | Tools 66 | ----- 67 | .. toctree:: 68 | :maxdepth: 1 69 | 70 | variable_handling/index -------------------------------------------------------------------------------- /docs/user_guide/outliers/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Outlier Handling 4 | ================ 5 | 6 | Feature-engine's outlier cappers cap maximum or minimum values of a variable at an 7 | arbitrary or derived value. The OutlierTrimmer removes outliers from the dataset. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | Winsorizer 13 | ArbitraryOutlierCapper 14 | OutlierTrimmer -------------------------------------------------------------------------------- /docs/user_guide/pipeline/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Pipeline 4 | ======== 5 | 6 | Feature-engine's Pipeline is equivalent to Scikit-learn's pipeline, and in addition, 7 | it accepts the method `transform_x_y`, to adjust both X and y, in those cases where 8 | rows are removed from X. 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | Pipeline 14 | make_pipeline 15 | -------------------------------------------------------------------------------- /docs/user_guide/preprocessing/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Preprocessing 4 | ============= 5 | 6 | Feature-engine's preprocessing transformers apply general data pre-processing 7 | and transformation procedures. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | MatchCategories 13 | MatchVariables 14 | -------------------------------------------------------------------------------- /docs/user_guide/scaling/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. _scaling_user_guide: 3 | 4 | .. currentmodule:: feature_engine.scaling 5 | 6 | Scaling 7 | ======= 8 | 9 | `Feature scaling `_ 10 | is the process of transforming the range of numerical features so that they fit within a 11 | specific scale, usually to improve the performance and training stability of machine learning 12 | models. 13 | 14 | Scaling helps to normalize the input data, ensuring that each feature contributes proportionately 15 | to the final result, particularly in algorithms that are sensitive to the range of the data, 16 | such as gradient descent-based models (e.g., linear regression, logistic regression, neural networks) 17 | and distance-based models (e.g., K-nearest neighbors, clustering). 18 | 19 | Feature-engine's scalers replace the variables' values by the scaled ones. In this page, we 20 | discuss the importance of scaling numerical features, and then introduce the various 21 | scaling techniques supported by Feature-engine. 22 | 23 | Importance of scaling 24 | --------------------- 25 | 26 | Scaling is crucial in machine learning as it ensures that features contribute equally to model 27 | training, preventing bias toward variables with larger ranges. Properly scaled data enhances the 28 | performance of algorithms sensitive to the magnitude of input values, such as gradient descent 29 | and distance-based methods. 
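For example, the short sketch below applies mean normalization to two variables with very
different ranges. It assumes the standard Feature-engine fit/transform API and the
`variables` parameter of :class:`MeanNormalizationScaler` (listed under Scalers further down
this page); the toy data is made up for illustration:

.. code:: python

    import pandas as pd

    from feature_engine.scaling import MeanNormalizationScaler

    df = pd.DataFrame({
        "Age": [20, 21, 19, 18],
        "Income": [25000, 32000, 28000, 27000],
    })

    # scale the selected numerical variables with mean normalization
    scaler = MeanNormalizationScaler(variables=["Age", "Income"])
    df_scaled = scaler.fit_transform(df)

    print(df_scaled)

After mean normalization, each variable is centred around zero and bounded between -1 and 1,
so no single variable dominates distance or gradient-based computations.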
Additionally, scaling can improve convergence speed and overall model 30 | accuracy, leading to more reliable predictions. 31 | 32 | 33 | When to apply scaling 34 | --------------------- 35 | 36 | - **Training:** Most machine learning algorithms require data to be scaled before training, 37 | especially linear models, neural networks, and distance-based models. 38 | 39 | - **Feature Engineering:** Scaling can be essential for certain feature engineering techniques, 40 | like polynomial features. 41 | 42 | - **Resampling:** Some oversampling methods like SMOTE and many of the undersampling methods 43 | clean data based on KNN algorithms, which are distance-based models. 44 | 45 | 46 | When Scaling Is Not Necessary 47 | ----------------------------- 48 | 49 | Not all algorithms require scaling. For example, tree-based algorithms (like Decision Trees, 50 | Random Forests, Gradient Boosting) are generally invariant to scaling because they split data 51 | based on the order of values, not the magnitude. 52 | 53 | Scalers 54 | ------- 55 | 56 | .. toctree:: 57 | :maxdepth: 1 58 | 59 | MeanNormalizationScaler 60 | -------------------------------------------------------------------------------- /docs/user_guide/timeseries/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | .. _timeseries: 3 | 4 | .. currentmodule:: feature_engine.timeseries 5 | 6 | 7 | Time Series Features 8 | ==================== 9 | 10 | Feature-engine's time series transformers create features from time series data. 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | 15 | forecasting/index 16 | 17 | | 18 | | 19 | | 20 | -------------------------------------------------------------------------------- /docs/user_guide/transformation/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Variance Stabilizing Transformations 4 | ==================================== 5 | 6 | Feature-engine's variable transformers transform numerical variables with various 7 | mathematical transformations. 8 | 9 | Variable transformations are commonly used to spread the values of the original variables 10 | over a wider value range. See the following illustration: 11 | 12 | .. figure:: ../../images/Variable_Transformation.png 13 | :align: center 14 | 15 | 16 | Article 17 | ------- 18 | 19 | We added a lot of information about **variance stabilizing transformations** in this 20 | `article `_. 21 | 22 | **Note** 23 | 24 | Note, however, that improving the value spread is not always possible; it depends 25 | on the nature of the variable. 26 | 27 | **Transformers** 28 | 29 | .. toctree:: 30 | :maxdepth: 1 31 | 32 | LogTransformer 33 | LogCpTransformer 34 | ReciprocalTransformer 35 | ArcsinTransformer 36 | PowerTransformer 37 | BoxCoxTransformer 38 | YeoJohnsonTransformer 39 | -------------------------------------------------------------------------------- /docs/user_guide/variable_handling/check_numerical_variables.rst: -------------------------------------------------------------------------------- 1 | .. _check_num_vars: 2 | 3 | .. currentmodule:: feature_engine.variable_handling 4 | 5 | check_numerical_variables 6 | ========================= 7 | 8 | :class:`check_numerical_variables()` checks that the variables in the list are of 9 | type numerical. 10 | 11 | Let's create a toy dataset with numerical, categorical and datetime variables: 12 | 13 | ..
code:: python 14 | 15 | import pandas as pd 16 | df = pd.DataFrame({ 17 | "Name": ["tom", "nick", "krish", "jack"], 18 | "City": ["London", "Manchester", "Liverpool", "Bristol"], 19 | "Age": [20, 21, 19, 18], 20 | "Marks": [0.9, 0.8, 0.7, 0.6], 21 | "dob": pd.date_range("2020-02-24", periods=4, freq="T"), 22 | }) 23 | 24 | print(df.head()) 25 | 26 | We see the resulting dataframe below: 27 | 28 | .. code:: python 29 | 30 | Name City Age Marks dob 31 | 0 tom London 20 0.9 2020-02-24 00:00:00 32 | 1 nick Manchester 21 0.8 2020-02-24 00:01:00 33 | 2 krish Liverpool 19 0.7 2020-02-24 00:02:00 34 | 3 jack Bristol 18 0.6 2020-02-24 00:03:00 35 | 36 | Let's now check that 2 of the variables are of type numerical: 37 | 38 | .. code:: python 39 | 40 | from feature_engine.variable_handling import check_numerical_variables 41 | 42 | var_num = check_numerical_variables(df, ['Age', 'Marks']) 43 | 44 | var_num 45 | 46 | If the variables are numerical, the function returns their names in a list: 47 | 48 | .. code:: python 49 | 50 | ['Age', 'Marks'] 51 | 52 | If we pass a variable that is not of type numerical, 53 | :class:`check_numerical_variables()` will return an error: 54 | 55 | .. code:: python 56 | 57 | check_numerical_variables(df, ['Age', 'Name']) 58 | 59 | Below we see the error message: 60 | 61 | .. code:: python 62 | 63 | TypeError: Some of the variables are not numerical. Please cast them as numerical 64 | before using this transformer. 65 | -------------------------------------------------------------------------------- /docs/user_guide/variable_handling/find_numerical_variables.rst: -------------------------------------------------------------------------------- 1 | .. _find_num_vars: 2 | 3 | .. currentmodule:: feature_engine.variable_handling 4 | 5 | find_numerical_variables 6 | ======================== 7 | 8 | :class:`find_numerical_variables()` returns a list with the names of the numerical 9 | variables in the dataset. 10 | 11 | Let's create a toy dataset with numerical, categorical and datetime variables: 12 | 13 | .. code:: python 14 | 15 | import pandas as pd 16 | df = pd.DataFrame({ 17 | "Name": ["tom", "nick", "krish", "jack"], 18 | "City": ["London", "Manchester", "Liverpool", "Bristol"], 19 | "Age": [20, 21, 19, 18], 20 | "Marks": [0.9, 0.8, 0.7, 0.6], 21 | "dob": pd.date_range("2020-02-24", periods=4, freq="T"), 22 | }) 23 | 24 | print(df.head()) 25 | 26 | We see the resulting dataframe below: 27 | 28 | .. code:: python 29 | 30 | Name City Age Marks dob 31 | 0 tom London 20 0.9 2020-02-24 00:00:00 32 | 1 nick Manchester 21 0.8 2020-02-24 00:01:00 33 | 2 krish Liverpool 19 0.7 2020-02-24 00:02:00 34 | 3 jack Bristol 18 0.6 2020-02-24 00:03:00 35 | 36 | With :class:`find_numerical_variables()` we capture the names of all numerical 37 | variables in a list. So let's do that and then display the list: 38 | 39 | .. code:: python 40 | 41 | from feature_engine.variable_handling import find_numerical_variables 42 | 43 | var_num = find_numerical_variables(df) 44 | 45 | var_num 46 | 47 | We see the names of the numerical variables in the list below: 48 | 49 | .. code:: python 50 | 51 | ['Age', 'Marks'] 52 | 53 | If there are no numerical variables in the dataset, :class:`find_numerical_variables()` 54 | will raise an error. 55 | -------------------------------------------------------------------------------- /docs/user_guide/variable_handling/index.rst: -------------------------------------------------------------------------------- 1 | .. 
-*- mode: rst -*- 2 | 3 | Variable handling functions 4 | =========================== 5 | 6 | These functions find variables of a specific type in a dataframe, or check that a 7 | list of variables is of a specified data type. 8 | 9 | The `find` functions take a dataframe as an argument and return a list with the names 10 | of the variables of the desired type. 11 | 12 | The `check` functions check that all the variables in the list are of the desired data type. 13 | 14 | The `retain` functions select the variables in a list if they fulfill a condition. 15 | 16 | You can use these functions to identify different sets of variables based on their 17 | data type, to streamline your feature engineering pipelines, or to create your own 18 | Feature-engine or Scikit-learn compatible transformers. 19 | 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | 24 | find_all_variables 25 | find_categorical_variables 26 | find_datetime_variables 27 | find_numerical_variables 28 | find_categorical_and_numerical_variables 29 | check_all_variables 30 | check_categorical_variables 31 | check_datetime_variables 32 | check_numerical_variables 33 | retain_variables_if_in_df 34 | -------------------------------------------------------------------------------- /docs/user_guide/wrappers/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | Scikit-learn Wrapper 4 | ==================== 5 | 6 | Feature-engine's Scikit-learn wrappers wrap Scikit-learn transformers, allowing their 7 | application to only a selected subset of features. 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | Wrapper -------------------------------------------------------------------------------- /docs/versions/index.rst: -------------------------------------------------------------------------------- 1 | Other versions 2 | ============== 3 | 4 | Web-based documentation is available for the versions listed below: 5 | 6 | - `Feature-engine 1.6 `_ 7 | 8 | -------------------------------------------------------------------------------- /docs/whats_new/index.rst: -------------------------------------------------------------------------------- 1 | .. -*- mode: rst -*- 2 | 3 | What's new 4 | ========== 5 | 6 | Find out what's new in each version release. 7 | 8 | ..
toctree:: 9 | :maxdepth: 2 10 | 11 | v_180 12 | v_170 13 | v_160 14 | v_150 15 | v_140 16 | v_130 17 | v_120 18 | v_1 19 | v_06 -------------------------------------------------------------------------------- /feature_engine/VERSION: -------------------------------------------------------------------------------- 1 | 1.8.3 2 | -------------------------------------------------------------------------------- /feature_engine/__init__.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import feature_engine 4 | 5 | PACKAGE_ROOT = pathlib.Path(feature_engine.__file__).resolve().parent 6 | VERSION_PATH = PACKAGE_ROOT / "VERSION" 7 | 8 | name = "feature_engine" 9 | 10 | with open(VERSION_PATH, "r") as version_file: 11 | __version__ = version_file.read().strip() 12 | -------------------------------------------------------------------------------- /feature_engine/_base_transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_base_transformers/__init__.py -------------------------------------------------------------------------------- /feature_engine/_check_init_parameters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_check_init_parameters/__init__.py -------------------------------------------------------------------------------- /feature_engine/_check_init_parameters/check_init_input_params.py: -------------------------------------------------------------------------------- 1 | def _check_param_missing_values(missing_values): 2 | if missing_values not in ["raise", "ignore"]: 3 | raise ValueError( 4 | "missing_values takes only values 'raise' or 'ignore'. " 5 | f"Got {missing_values} instead." 6 | ) 7 | 8 | 9 | def _check_param_drop_original(drop_original): 10 | if not isinstance(drop_original, bool): 11 | raise ValueError( 12 | "drop_original takes only boolean values True and False. " 13 | f"Got {drop_original} instead." 14 | ) 15 | -------------------------------------------------------------------------------- /feature_engine/_check_init_parameters/check_input_dictionary.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | 4 | def _check_numerical_dict(dict_: Optional[dict]) -> Optional[dict]: 5 | """ 6 | Checks that all values in the dictionary are integers or floats. It can also 7 | take None as a value. 8 | 9 | Parameters 10 | ---------- 11 | dict_ : dict 12 | The dictionary that will be checked. 13 | 14 | Raises 15 | ------ 16 | ValueError 17 | If any of the values in the dictionary are not int or float. 18 | TypeError 19 | When input type is not a dictionary. 20 | """ 21 | 22 | if isinstance(dict_, dict): 23 | if not all([isinstance(x, (float, int)) for x in dict_.values()]): 24 | raise ValueError( 25 | "All values in the dictionary must be integer or float. " 26 | f"Got {dict_} instead." 27 | ) 28 | 29 | elif dict_ is not None: 30 | raise TypeError( 31 | f"The parameter can only take a dictionary or None. Got {dict_} instead."
32 | ) 33 | return None 34 | -------------------------------------------------------------------------------- /feature_engine/_check_init_parameters/check_variables.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Union 2 | 3 | Variables = Union[None, int, str, List[Union[str, int]]] 4 | 5 | 6 | def _check_variables_input_value(variables: Variables) -> Any: 7 | """ 8 | Checks that the input value for the `variables` parameter located in the init of 9 | all Feature-engine transformers is of the correct type. 10 | Allowed values are None, int, str or list of strings and integers. 11 | 12 | Parameters 13 | ---------- 14 | variables : string, int, list of strings, list of integers. Default=None 15 | 16 | Returns 17 | ------- 18 | variables: same as input 19 | """ 20 | 21 | msg = ( 22 | "`variables` should contain a string, an integer or a list of strings or " 23 | f"integers. Got {variables} instead." 24 | ) 25 | msg_dupes = "The list entered in `variables` contains duplicated variable names." 26 | msg_empty = "The list of `variables` is empty." 27 | 28 | if variables is not None: 29 | if isinstance(variables, list): 30 | if not all(isinstance(i, (str, int)) for i in variables): 31 | raise ValueError(msg) 32 | if len(variables) == 0: 33 | raise ValueError(msg_empty) 34 | if len(variables) != len(set(variables)): 35 | raise ValueError(msg_dupes) 36 | else: 37 | if not isinstance(variables, (str, int)): 38 | raise ValueError(msg) 39 | return variables 40 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/__init__.py -------------------------------------------------------------------------------- /feature_engine/_docstrings/fit_attributes.py: -------------------------------------------------------------------------------- 1 | """Docstrings for the attributes that are generated during fit.""" 2 | 3 | _variables_attribute_docstring = """variables_: 4 | The group of variables that will be transformed. 5 | """.rstrip() 6 | 7 | _feature_names_in_docstring = """feature_names_in_: 8 | List with the names of features seen during `fit`. 9 | """.rstrip() 10 | 11 | _n_features_in_docstring = """n_features_in_: 12 | The number of features in the train set used in fit. 13 | """.rstrip() 14 | 15 | # used by discretisers 16 | _binner_dict_docstring = """binner_dict_: 17 | Dictionary with the interval limits per variable. 18 | """.rstrip() 19 | 20 | # used by imputers 21 | _imputer_dict_docstring = """imputer_dict_: 22 | Dictionary with the values to replace missing data in each variable. 23 | """.rstrip() 24 | 25 | # used by outlier module 26 | _right_tail_caps_docstring = """right_tail_caps_: 27 | Dictionary with the maximum values beyond which a value will be considered an 28 | outlier. 29 | """.rstrip() 30 | 31 | _left_tail_caps_docstring = """left_tail_caps_: 32 | Dictionary with the minimum values beyond which a value will be considered an 33 | outlier. 
34 | """.rstrip() 35 | 36 | # used by selection module 37 | _feature_importances_docstring = """feature_importances_: 38 | Pandas Series with the feature importance (comes from step 2) 39 | """.rstrip() 40 | 41 | _feature_importances_std_docstring = """feature_importances_std_: 42 | Pandas Series with the standard deviation of the feature importance. 43 | """.rstrip() 44 | 45 | _performance_drifts_docstring = """performance_drifts_: 46 | Dictionary with the performance drift per examined feature (comes from step 5). 47 | """.rstrip() 48 | 49 | _performance_drifts_std_docstring = """performance_drifts_std_: 50 | Dictionary with the performance drift's standard deviation of the 51 | examined feature (comes from step 5). 52 | """.rstrip() 53 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/init_parameters/__init__.py -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/all_trasnformers.py: -------------------------------------------------------------------------------- 1 | """Docstrings for the parameters corresponding to the __init__""" 2 | 3 | _variables_numerical_docstring = """variables: list, default=None 4 | The list of numerical variables to transform. If None, the transformer will 5 | automatically find and select all numerical variables. 6 | """.rstrip() 7 | 8 | _variables_categorical_docstring = """variables: list, default=None 9 | The list of categorical variables that will be encoded. If None, the 10 | encoder will find and transform all variables of type object or categorical by 11 | default. You can also make the transformer accept numerical variables, see the 12 | parameter `ignore_format`. 13 | """.rstrip() 14 | 15 | _drop_original_docstring = """drop_original: bool, default=False 16 | If True, the original variables to transform will be dropped from the dataframe. 17 | """.rstrip() 18 | 19 | _missing_values_docstring = """missing_values: string, default='raise' 20 | Indicates if missing values should be ignored or raised. If `'raise'`, the 21 | transformer will return an error if the datasets to `fit` or `transform` 22 | contain missing values. If `'ignore'`, missing data will be ignored when 23 | learning parameters or performing the transformation. 24 | """.rstrip() 25 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/discretisers.py: -------------------------------------------------------------------------------- 1 | _return_object_docstring = """return_object: bool, default=False 2 | Whether the discrete variable should be returned as type numeric or type 3 | object. If you would like to encode the discrete variables with Feature-engine's 4 | categorical encoders, use True. Alternatively, keep the default, False. 5 | """.rstrip() 6 | 7 | _return_boundaries_docstring = """return_boundaries: bool, default=False 8 | Whether the output should be the interval boundaries. If True, it returns 9 | the interval boundaries. If False, it returns integers. 10 | """.rstrip() 11 | 12 | _precision_docstring = """precision: int, default=3 13 | The precision at which to store and display the bin labels.
14 | """.rstrip() 15 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/encoders.py: -------------------------------------------------------------------------------- 1 | _ignore_format_docstring = """ignore_format: bool, default=False 2 | This transformer operates only on variables of type object or categorical. To 3 | override this behaviour and allow the transformer to transform numerical 4 | variables as well, set to `True`.\n 5 | If `ignore_format` is `False`, the encoder will automatically select variables 6 | of type object or categorical, or check that the variables entered by the user 7 | are of type object or categorical. If `True`, the encoder will select all 8 | variables or accept all variables entered by the user, including those cast as 9 | numeric.\n 10 | In short, set to `True` when you want to encode numerical variables. 11 | """.rstrip() 12 | 13 | _unseen_docstring = """unseen: string, default='ignore' 14 | Indicates what to do when categories not present in the train set are 15 | encountered during transform. If `'raise'`, then unseen categories will raise 16 | an error. If `'ignore'`, then unseen categories will be encoded as NaN and a 17 | warning will be raised instead. 18 | """.rstrip() 19 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/outliers.py: -------------------------------------------------------------------------------- 1 | _capping_method_docstring = """capping_method: str, default='gaussian' 2 | Desired outlier detection method. Can be 'gaussian', 'iqr', 'mad', 3 | 'quantiles'. \n 4 | The transformer will find the maximum and / or minimum values beyond which a 5 | data point will be considered an outlier using: 6 | **'gaussian'**: the Gaussian approximation. 7 | **'iqr'**: the IQR proximity rule. 8 | **'quantiles'**: the percentiles. 9 | **'mad'**: the Gaussian approximation but using robust statistics. 10 | """.rstrip() 11 | 12 | _tail_docstring = """tail: str, default='right' 13 | Whether to look for outliers on the right, left or both tails of the 14 | distribution. Can take 'left', 'right' or 'both'. 15 | """.rstrip() 16 | 17 | _fold_docstring = """fold: int, float or 'auto', default='auto'. 18 | The factor used to multiply the std, MAD or IQR to calculate 19 | the maximum or minimum allowed values. 20 | When 'auto', `fold` is set based on the `capping_method`: \n 21 | - If `capping_method='quantile'` then `'fold'` = 0.05; \n 22 | - If `capping_method='gaussian'` then `'fold'` = 3.0; \n 23 | - If `capping_method='mad'` then `'fold'` = 3.29; \n 24 | - If `capping_method='iqr'` then `'fold'` = 1.5. \n 25 | Recommended values are 2, 2.5 or 3 for the gaussian approximation, 26 | 1.5 or 3 for the IQR proximity rule and 3 or 3.5 for MAD rule. \n 27 | If `capping_method='quantile'`, then `'fold'` indicates the percentile. So if 28 | `fold=0.05`, the limits will be the 95th and 5th percentiles. \n 29 | **Note**: When `capping_method='quantile'`, the maximum `fold` allowed is 0.2, 30 | which will find boundaries at the 20th and 80th percentile. 
31 | """.rstrip() 32 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/init_parameters/selection.py: -------------------------------------------------------------------------------- 1 | _confirm_variables_docstring = """confirm_variables: bool, default=False 2 | If set to True, variables that are not present in the input dataframe will 3 | be removed from the list of variables. Only used when passing a variable 4 | list to the parameter `variables`. See parameter variables for more details. 5 | """.rstrip() 6 | 7 | _estimator_docstring = """estimator: object 8 | A Scikit-learn estimator for regression or classification. 9 | The estimator must have either a `feature_importances` or a `coef_` 10 | attribute after fitting. 11 | """.rstrip() 12 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/methods.py: -------------------------------------------------------------------------------- 1 | """Docstrings for the methods. They are meant to be used in the init docstrings of 2 | the transformers.""" 3 | 4 | _fit_not_learn_docstring = """fit: 5 | This transformer does not learn parameters. 6 | """.rstrip() 7 | 8 | _fit_transform_docstring = """fit_transform: 9 | Fit to data, then transform it. 10 | 11 | get_feature_names_out: 12 | Get output feature names for transformation. 13 | 14 | get_params: 15 | Get parameters for this estimator. 16 | 17 | set_params: 18 | Set the parameters of this estimator. 19 | """.rstrip() 20 | 21 | _inverse_transform_docstring = """inverse_transform: 22 | Convert the data back to the original representation. 23 | """.rstrip() 24 | 25 | # used in categorical encoders 26 | _transform_encoders_docstring = """transform: 27 | Encode the categories to numbers. 28 | """.rstrip() 29 | 30 | # used in creation module 31 | _transform_creation_docstring = """transform: 32 | Create new features. 33 | """.rstrip() 34 | 35 | # used in discretisers module 36 | _fit_discretiser_docstring = """fit: 37 | Find the interval limits. 38 | """.rstrip() 39 | 40 | _transform_discretiser_docstring = """transform: 41 | Sort continuous variable values into the intervals. 42 | """.rstrip() 43 | 44 | # used in imputation module 45 | _transform_imputers_docstring = """transform: 46 | Impute missing data. 47 | """.rstrip() 48 | -------------------------------------------------------------------------------- /feature_engine/_docstrings/selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/selection/__init__.py -------------------------------------------------------------------------------- /feature_engine/_docstrings/substitute.py: -------------------------------------------------------------------------------- 1 | """Utilities for docstring in Feature-engine. 2 | 3 | Taken from the project imbalanced-learn: 4 | 5 | https://github.com/scikit-learn-contrib/imbalanced-learn/blob/ 6 | imblearn/utils/_docstring.py#L7 7 | """ 8 | 9 | 10 | class Substitution: 11 | """Decorate a function's or a class' docstring to perform string 12 | substitution on it. 13 | This decorator should be robust even if obj.__doc__ is None 14 | (for example, if -OO was passed to the interpreter). 
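    A sketch of the intended use (the placeholder name below is hypothetical)::

        @Substitution(variables="The variables to transform.")
        def func():
            '''A docstring with a {variables} placeholder.'''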
15 | """ 16 | 17 | def __init__(self, *args, **kwargs): 18 | if args and kwargs: 19 | raise AssertionError("Only positional or keyword args are allowed") 20 | 21 | self.params = args or kwargs 22 | 23 | def __call__(self, obj): 24 | obj.__doc__ = obj.__doc__.format(**self.params) 25 | return obj 26 | -------------------------------------------------------------------------------- /feature_engine/_prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_prediction/__init__.py -------------------------------------------------------------------------------- /feature_engine/creation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module creation includes classes to create new variables by combination of existing 3 | variables in the dataframe. 4 | """ 5 | from .cyclical_features import CyclicalFeatures 6 | from .decision_tree_features import DecisionTreeFeatures 7 | from .math_features import MathFeatures 8 | from .relative_features import RelativeFeatures 9 | 10 | __all__ = [ 11 | "DecisionTreeFeatures", 12 | "MathFeatures", 13 | "RelativeFeatures", 14 | "CyclicalFeatures", 15 | ] 16 | -------------------------------------------------------------------------------- /feature_engine/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .titanic import load_titanic 2 | 3 | __all__ = ["load_titanic"] 4 | -------------------------------------------------------------------------------- /feature_engine/datetime/__init__.py: -------------------------------------------------------------------------------- 1 | "The module datetime computes features from dates and times." 
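# A minimal usage sketch (a hedged example; it assumes the `features_to_extract`
# parameter of DatetimeFeatures, imported below, and a dataframe X with datetime
# or object-like columns):
#
#     from feature_engine.datetime import DatetimeFeatures
#
#     dtf = DatetimeFeatures(features_to_extract=["month", "year", "day_of_week"])
#     X_t = dtf.fit_transform(X)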
2 | 3 | from .datetime import DatetimeFeatures 4 | from .datetime_subtraction import DatetimeSubtraction 5 | 6 | __all__ = ["DatetimeFeatures", "DatetimeSubtraction"] 7 | -------------------------------------------------------------------------------- /feature_engine/datetime/_datetime_constants.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | FEATURES_SUPPORTED = [ 4 | "month", 5 | "quarter", 6 | "semester", 7 | "year", 8 | "week", 9 | "day_of_week", 10 | "day_of_month", 11 | "day_of_year", 12 | "weekend", 13 | "month_start", 14 | "month_end", 15 | "quarter_start", 16 | "quarter_end", 17 | "year_start", 18 | "year_end", 19 | "leap_year", 20 | "days_in_month", 21 | "hour", 22 | "minute", 23 | "second", 24 | ] 25 | 26 | FEATURES_DEFAULT = [ 27 | "month", 28 | "year", 29 | "day_of_week", 30 | "day_of_month", 31 | "hour", 32 | "minute", 33 | "second", 34 | ] 35 | 36 | FEATURES_SUFFIXES = { 37 | "month": "_month", 38 | "quarter": "_quarter", 39 | "semester": "_semester", 40 | "year": "_year", 41 | "week": "_week", 42 | "day_of_week": "_day_of_week", 43 | "day_of_month": "_day_of_month", 44 | "day_of_year": "_day_of_year", 45 | "weekend": "_weekend", 46 | "month_start": "_month_start", 47 | "month_end": "_month_end", 48 | "quarter_start": "_quarter_start", 49 | "quarter_end": "_quarter_end", 50 | "year_start": "_year_start", 51 | "year_end": "_year_end", 52 | "leap_year": "_leap_year", 53 | "days_in_month": "_days_in_month", 54 | "hour": "_hour", 55 | "minute": "_minute", 56 | "second": "_second", 57 | } 58 | 59 | FEATURES_FUNCTIONS = { 60 | "month": lambda x: x.dt.month, 61 | "quarter": lambda x: x.dt.quarter, 62 | "semester": lambda x: np.where(x.dt.month <= 6, 1, 2).astype(np.int64), 63 | "year": lambda x: x.dt.year, 64 | "week": lambda x: x.dt.isocalendar().week.astype(np.int64), 65 | "day_of_week": lambda x: x.dt.dayofweek, 66 | "day_of_month": lambda x: x.dt.day, 67 | "day_of_year": lambda x: x.dt.dayofyear, 68 | "weekend": lambda x: np.where(x.dt.dayofweek <= 4, 0, 1).astype(np.int64), 69 | "month_start": lambda x: x.dt.is_month_start.astype(np.int64), 70 | "month_end": lambda x: x.dt.is_month_end.astype(np.int64), 71 | "quarter_start": lambda x: x.dt.is_quarter_start.astype(np.int64), 72 | "quarter_end": lambda x: x.dt.is_quarter_end.astype(np.int64), 73 | "year_start": lambda x: x.dt.is_year_start.astype(np.int64), 74 | "year_end": lambda x: x.dt.is_year_end.astype(np.int64), 75 | "leap_year": lambda x: x.dt.is_leap_year.astype(np.int64), 76 | "days_in_month": lambda x: x.dt.days_in_month.astype(np.int64), 77 | "hour": lambda x: x.dt.hour, 78 | "minute": lambda x: x.dt.minute, 79 | "second": lambda x: x.dt.second, 80 | } 81 | -------------------------------------------------------------------------------- /feature_engine/discretisation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module discretisation includes classes to sort continuous variables into bins or 3 | intervals. 
4 | """ 5 | 6 | from .arbitrary import ArbitraryDiscretiser 7 | from .decision_tree import DecisionTreeDiscretiser 8 | from .equal_frequency import EqualFrequencyDiscretiser 9 | from .equal_width import EqualWidthDiscretiser 10 | from .geometric_width import GeometricWidthDiscretiser 11 | 12 | __all__ = [ 13 | "DecisionTreeDiscretiser", 14 | "EqualFrequencyDiscretiser", 15 | "EqualWidthDiscretiser", 16 | "ArbitraryDiscretiser", 17 | "GeometricWidthDiscretiser", 18 | ] 19 | -------------------------------------------------------------------------------- /feature_engine/encoding/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module encoding includes classes to transform categorical variables into numerical. 3 | """ 4 | 5 | from .count_frequency import CountFrequencyEncoder 6 | from .decision_tree import DecisionTreeEncoder 7 | from .mean_encoding import MeanEncoder 8 | from .one_hot import OneHotEncoder 9 | from .ordinal import OrdinalEncoder 10 | from .rare_label import RareLabelEncoder 11 | from .similarity_encoder import StringSimilarityEncoder 12 | from .woe import WoEEncoder 13 | 14 | __all__ = [ 15 | "CountFrequencyEncoder", 16 | "DecisionTreeEncoder", 17 | "MeanEncoder", 18 | "OneHotEncoder", 19 | "OrdinalEncoder", 20 | "RareLabelEncoder", 21 | "StringSimilarityEncoder", 22 | "WoEEncoder", 23 | ] 24 | -------------------------------------------------------------------------------- /feature_engine/encoding/_helper_functions.py: -------------------------------------------------------------------------------- 1 | def check_parameter_unseen(unseen, accepted_values): 2 | if not isinstance(accepted_values, list) or not all( 3 | isinstance(item, str) for item in accepted_values 4 | ): 5 | raise ValueError( 6 | "accepted_values should be a list of strings. " 7 | f" Got {accepted_values} instead." 8 | ) 9 | if unseen not in accepted_values: 10 | raise ValueError( 11 | f"Parameter `unseen` takes only values {', '.join(accepted_values)}." 12 | f" Got {unseen} instead." 13 | ) 14 | -------------------------------------------------------------------------------- /feature_engine/imputation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module imputation includes classes to perform missing data imputation 3 | """ 4 | 5 | from .arbitrary_number import ArbitraryNumberImputer 6 | from .categorical import CategoricalImputer 7 | from .drop_missing_data import DropMissingData 8 | from .end_tail import EndTailImputer 9 | from .mean_median import MeanMedianImputer 10 | from .missing_indicator import AddMissingIndicator 11 | from .random_sample import RandomSampleImputer 12 | 13 | __all__ = [ 14 | "MeanMedianImputer", 15 | "ArbitraryNumberImputer", 16 | "CategoricalImputer", 17 | "EndTailImputer", 18 | "AddMissingIndicator", 19 | "RandomSampleImputer", 20 | "DropMissingData", 21 | ] 22 | -------------------------------------------------------------------------------- /feature_engine/outliers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module outliers includes classes to remove or cap outliers. 
3 | """ 4 | 5 | from .artbitrary import ArbitraryOutlierCapper 6 | from .trimmer import OutlierTrimmer 7 | from .winsorizer import Winsorizer 8 | 9 | __all__ = ["Winsorizer", "ArbitraryOutlierCapper", "OutlierTrimmer"] 10 | -------------------------------------------------------------------------------- /feature_engine/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import Pipeline, make_pipeline 2 | 3 | __all__ = ["Pipeline", "make_pipeline"] 4 | -------------------------------------------------------------------------------- /feature_engine/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module preprocessing includes classes and functions for general data pre-processing 3 | and transformation. 4 | """ 5 | 6 | from .match_categories import MatchCategories 7 | from .match_columns import MatchVariables 8 | 9 | __all__ = [ 10 | "MatchCategories", 11 | "MatchVariables", 12 | ] 13 | -------------------------------------------------------------------------------- /feature_engine/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/py.typed -------------------------------------------------------------------------------- /feature_engine/scaling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module scaling includes classes to transform variables using various 3 | scaling methods. 4 | """ 5 | 6 | from .mean_normalization import MeanNormalizationScaler 7 | 8 | __all__ = [ 9 | "MeanNormalizationScaler", 10 | ] 11 | -------------------------------------------------------------------------------- /feature_engine/selection/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module selection includes classes to select features or remove unwanted features. 
3 | """ 4 | from .drop_constant_features import DropConstantFeatures 5 | from .drop_correlated_features import DropCorrelatedFeatures 6 | from .drop_duplicate_features import DropDuplicateFeatures 7 | from .drop_features import DropFeatures 8 | from .drop_psi_features import DropHighPSIFeatures 9 | from .information_value import SelectByInformationValue 10 | from .probe_feature_selection import ProbeFeatureSelection 11 | from .recursive_feature_addition import RecursiveFeatureAddition 12 | from .recursive_feature_elimination import RecursiveFeatureElimination 13 | from .shuffle_features import SelectByShuffling 14 | from .single_feature_performance import SelectBySingleFeaturePerformance 15 | from .smart_correlation_selection import SmartCorrelatedSelection 16 | from .target_mean_selection import SelectByTargetMeanPerformance 17 | from .mrmr import MRMR 18 | 19 | __all__ = [ 20 | "DropFeatures", 21 | "DropConstantFeatures", 22 | "DropDuplicateFeatures", 23 | "DropCorrelatedFeatures", 24 | "DropHighPSIFeatures", 25 | "SmartCorrelatedSelection", 26 | "SelectByShuffling", 27 | "SelectBySingleFeaturePerformance", 28 | "RecursiveFeatureAddition", 29 | "RecursiveFeatureElimination", 30 | "SelectByTargetMeanPerformance", 31 | "SelectByInformationValue", 32 | "ProbeFeatureSelection", 33 | "MRMR", 34 | ] 35 | -------------------------------------------------------------------------------- /feature_engine/selection/_selection_constants.py: -------------------------------------------------------------------------------- 1 | _CLASSIFICATION_METRICS = [ 2 | "accuracy", 3 | "balanced_accuracy", 4 | "top_k_accuracy", 5 | "average_precision", 6 | "neg_brier_score", 7 | "f1", 8 | "f1_micro", 9 | "f1_macro", 10 | "f1_weighted", 11 | "f1_samples", 12 | "neg_log_loss", 13 | "precision", 14 | "precision_micro", 15 | "precision_macro", 16 | "precision_weighted", 17 | "precision_samples", 18 | "recall", 19 | "recall_micro", 20 | "recall_macro", 21 | "recall_weighted", 22 | "recall_samples", 23 | "jaccard", 24 | "jaccard_micro", 25 | "jaccard_macro", 26 | "jaccard_weighted", 27 | "jaccard_samples", 28 | "roc_auc", 29 | "roc_auc_ovr", 30 | "roc_auc_ovo", 31 | "roc_auc_ovr_weighted", 32 | "roc_auc_ovo_weighted", 33 | ] 34 | 35 | _REGRESSION_METRICS = [ 36 | "explained_variance", 37 | "r2", 38 | "max_error", 39 | "neg_median_absolute_error", 40 | "neg_mean_absolute_error", 41 | "neg_mean_absolute_percentage_error", 42 | "neg_mean_squared_error", 43 | "neg_mean_squared_log_error", 44 | "neg_root_mean_squared_error", 45 | "neg_mean_poisson_deviance", 46 | "neg_mean_gamma_deviance", 47 | ] 48 | -------------------------------------------------------------------------------- /feature_engine/tags.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | from sklearn.utils.fixes import parse_version 3 | 4 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) 5 | 6 | 7 | def _return_tags(): 8 | tags = { 9 | "preserves_dtype": [], 10 | "_xfail_checks": { 11 | # Complex data in math terms, are values like 4i (imaginary numbers 12 | # so to speak). I've never seen such a thing in the dfs I've 13 | # worked with, so I don't think we need this test. 14 | "check_complex_data": "Test not needed.", 15 | # check that estimators treat dtype object as numeric if possible 16 | "check_dtype_object": "Feature-engine transformers use dtypes to select " 17 | "between numerical and categorical variables. 
Feature-engine trusts the " 18 | "user casts the variables appropriately", 19 | # Test fails because FE does not like the sklearn class _NotAnArray 20 | # The test aims to check that the check_X_y function from sklearn is 21 | # working, but we do not use that check, because we work with dfs. 22 | "check_transformer_data_not_an_array": "Ok to fail", 23 | "check_sample_weights_not_an_array": "Ok to fail", 24 | # TODO: we probably need the test below!! 25 | "check_methods_sample_order_invariance": "Test does not work on dataframes", 26 | # TODO: we probably need the test below!! 27 | # the test below tests that a second fit overrides a first fit. 28 | # the problem is that the test does not work with pandas df. 29 | "check_fit_idempotent": "Test does not work on dataframes.", 30 | "check_fit2d_predict1d": "Test not relevant, Feature-engine transformers " 31 | "only work with dataframes.", 32 | }, 33 | } 34 | 35 | if sklearn_version > parse_version("1.6"): 36 | msg1 = "against Feature-engines design." 37 | msg2 = "Our transformers do not preserve dtype." 38 | all_fail = { 39 | "check_do_not_raise_errors_in_init_or_set_params": msg1, 40 | "check_transformer_preserve_dtypes": msg2, 41 | # TODO: investigate this test further. 42 | "check_n_features_in_after_fitting": "not sure why it fails, we do check.", 43 | } 44 | tags["_xfail_checks"].update(all_fail) # type: ignore 45 | return tags 46 | -------------------------------------------------------------------------------- /feature_engine/timeseries/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/timeseries/__init__.py -------------------------------------------------------------------------------- /feature_engine/timeseries/forecasting/__init__.py: -------------------------------------------------------------------------------- 1 | """ Transformers that create features for time-series forecasting.""" 2 | 3 | from .expanding_window_features import ExpandingWindowFeatures 4 | from .lag_features import LagFeatures 5 | from .window_features import WindowFeatures 6 | 7 | __all__ = ["LagFeatures", "WindowFeatures", "ExpandingWindowFeatures"] 8 | -------------------------------------------------------------------------------- /feature_engine/transformation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module transformation includes classes to transform variables using mathematical 3 | functions. 4 | """ 5 | 6 | from .arcsin import ArcsinTransformer 7 | from .boxcox import BoxCoxTransformer 8 | from .log import LogCpTransformer, LogTransformer 9 | from .power import PowerTransformer 10 | from .reciprocal import ReciprocalTransformer 11 | from .yeojohnson import YeoJohnsonTransformer 12 | 13 | __all__ = [ 14 | "BoxCoxTransformer", 15 | "LogTransformer", 16 | "LogCpTransformer", 17 | "PowerTransformer", 18 | "ReciprocalTransformer", 19 | "YeoJohnsonTransformer", 20 | "ArcsinTransformer", 21 | ] 22 | -------------------------------------------------------------------------------- /feature_engine/variable_handling/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module variable handling includes functions to select variables of a certain type 3 | or check that a list of variables is in certain type. 
4 | """ 5 | 6 | from .check_variables import ( 7 | check_all_variables, 8 | check_categorical_variables, 9 | check_datetime_variables, 10 | check_numerical_variables, 11 | ) 12 | from .find_variables import ( 13 | find_all_variables, 14 | find_categorical_and_numerical_variables, 15 | find_categorical_variables, 16 | find_datetime_variables, 17 | find_numerical_variables, 18 | ) 19 | from .retain_variables import retain_variables_if_in_df 20 | 21 | __all__ = [ 22 | "check_all_variables", 23 | "check_numerical_variables", 24 | "check_categorical_variables", 25 | "check_datetime_variables", 26 | "find_all_variables", 27 | "find_numerical_variables", 28 | "find_categorical_variables", 29 | "find_datetime_variables", 30 | "find_categorical_and_numerical_variables", 31 | "retain_variables_if_in_df", 32 | ] 33 | -------------------------------------------------------------------------------- /feature_engine/variable_handling/_variable_type_checks.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import pandas as pd 4 | from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime 5 | from pandas.core.dtypes.common import is_numeric_dtype as is_numeric 6 | from pandas.core.dtypes.common import is_object_dtype as is_object 7 | 8 | 9 | def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: 10 | # check for datetime only if object cannot be cast as numeric because 11 | # if it could pd.to_datetime would convert it to datetime regardless 12 | if is_object(column): 13 | is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column) 14 | 15 | # check for datetime only if the type of the categories is not numeric 16 | # because pd.to_datetime throws an error when it is an integer 17 | elif isinstance(column.dtype, pd.CategoricalDtype): 18 | is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column) 19 | 20 | return is_cat 21 | 22 | 23 | def _is_categories_num(column: pd.Series) -> bool: 24 | return is_numeric(column.dtype.categories) 25 | 26 | 27 | def _is_convertible_to_dt(column: pd.Series) -> bool: 28 | with warnings.catch_warnings(): 29 | warnings.simplefilter("ignore") 30 | return is_datetime(pd.to_datetime(column, errors="ignore", utc=True)) 31 | 32 | 33 | def _is_convertible_to_num(column: pd.Series) -> bool: 34 | try: 35 | ser = pd.to_numeric(column) 36 | except (ValueError, TypeError): 37 | ser = column 38 | return is_numeric(ser) 39 | 40 | 41 | def _is_categorical_and_is_datetime(column: pd.Series) -> bool: 42 | # check for datetime only if object cannot be cast as numeric because 43 | # if it could pd.to_datetime would convert it to datetime regardless 44 | if is_object(column): 45 | is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column) 46 | 47 | # check for datetime only if the type of the categories is not numeric 48 | # because pd.to_datetime throws an error when it is an integer 49 | elif isinstance(column.dtype, pd.CategoricalDtype): 50 | is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column) 51 | 52 | return is_dt 53 | -------------------------------------------------------------------------------- /feature_engine/variable_handling/dtypes.py: -------------------------------------------------------------------------------- 1 | DATETIME_TYPES = ("datetimetz", "datetime") 2 | -------------------------------------------------------------------------------- /feature_engine/variable_handling/retain_variables.py: 
-------------------------------------------------------------------------------- 1 | """Functions to remove variables from a list.""" 2 | 3 | from typing import List, Union 4 | 5 | Variables = Union[int, str, List[Union[str, int]]] 6 | 7 | 8 | def retain_variables_if_in_df(X, variables): 9 | """Returns the subset of variables in the list that are present in the dataframe. 10 | 11 | More details in the :ref:`User Guide `. 12 | 13 | Parameters 14 | ---------- 15 | X: pandas dataframe of shape = [n_samples, n_features] 16 | The dataset. 17 | 18 | variables: string, int or list of strings or int. 19 | The names of the variables to check. 20 | 21 | Returns 22 | ------- 23 | variables_in_df: List. 24 | The subset of `variables` that is present `X`. 25 | 26 | Examples 27 | -------- 28 | >>> import pandas as pd 29 | >>> from feature_engine.variable_handling import retain_variables_if_in_df 30 | >>> X = pd.DataFrame({ 31 | >>> "var_num": [1, 2, 3], 32 | >>> "var_cat": ["A", "B", "C"], 33 | >>> "var_date": pd.date_range("2020-02-24", periods=3, freq="T") 34 | >>> }) 35 | >>> vars_in_df = retain_variables_if_in_df(X, ['var_num', 'var_cat', 'var_other']) 36 | >>> vars_in_df 37 | ['var_num', 'var_cat'] 38 | """ 39 | if isinstance(variables, (str, int)): 40 | variables = [variables] 41 | 42 | variables_in_df = [var for var in variables if var in X.columns] 43 | 44 | # Raise an error if no column is left to work with. 45 | if len(variables_in_df) == 0: 46 | raise ValueError( 47 | "None of the variables in the list are present in the dataframe." 48 | ) 49 | 50 | return variables_in_df 51 | -------------------------------------------------------------------------------- /feature_engine/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The module wrappers includes classes to wrap Scikit-learn transformers so that they 3 | can be applied to a selected subset of features and return a dataframe. 4 | """ 5 | 6 | from .wrappers import SklearnTransformerWrapper 7 | 8 | __all__ = ["SklearnTransformerWrapper"] 9 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | warn_unused_ignores = True 3 | follow_imports = skip 4 | show_error_context = True 5 | warn_incomplete_stub = True 6 | ignore_missing_imports = True 7 | check_untyped_defs = True 8 | cache_dir = /dev/null 9 | warn_redundant_casts = True 10 | warn_unused_configs = True 11 | strict_optional = True 12 | 13 | exclude = (?x)( 14 | mixins\.py$ # or files ending with "two.pyi" 15 | ) -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | 3 | [pytest] 4 | filterwarnings = 5 | ignore::sklearn.exceptions.SkipTestWarning 6 | ignore::UserWarning 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.18.2 2 | pandas>=2.2.0 3 | scikit-learn>=1.4.0 4 | scipy>=1.4.1 5 | statsmodels>=0.11.1 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from setuptools import find_packages, setup 4 | 5 | # Package meta-data. 
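# These metadata constants are passed to the setup() call at the bottom of this file.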
6 | NAME = "feature_engine" 7 | DESCRIPTION = "Feature engineering and selection package with Scikit-learn's fit transform functionality" 8 | URL = "http://github.com/feature-engine/feature_engine" 9 | EMAIL = "solegalli@protonmail.com" 10 | AUTHOR = "Soledad Galli" 11 | REQUIRES_PYTHON = ">=3.9.0" 12 | 13 | # description 14 | with open("README.md", "r") as fh: 15 | long_description = fh.read() 16 | 17 | 18 | # Packages required for this module to be executed 19 | def list_reqs(fname='requirements.txt'): 20 | with open(fname) as fd: 21 | return fd.read().splitlines() 22 | 23 | 24 | # Load the package's VERSION file as a dictionary. 25 | about = {} 26 | ROOT_DIR = Path(__file__).resolve().parent 27 | PACKAGE_DIR = ROOT_DIR / 'feature_engine' 28 | with open(PACKAGE_DIR / "VERSION") as f: 29 | _version = f.read().strip() 30 | about["__version__"] = _version 31 | 32 | setup(name=NAME, 33 | version=about["__version__"], 34 | description=DESCRIPTION, 35 | long_description=long_description, 36 | long_description_content_type="text/markdown", 37 | url=URL, 38 | author=AUTHOR, 39 | author_email=EMAIL, 40 | python_requires=REQUIRES_PYTHON, 41 | packages=find_packages(exclude=("tests",)), 42 | package_data={"feature_engine": ["VERSION", "py.typed"]}, 43 | license='BSD 3 clause', 44 | install_requires=list_reqs(), 45 | include_package_data=True, 46 | classifiers=[ 47 | # Trove classifiers 48 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 49 | "License :: OSI Approved :: BSD License", 50 | "Programming Language :: Python :: 3.9", 51 | "Programming Language :: Python :: 3.10", 52 | "Programming Language :: Python :: 3.11", 53 | "Programming Language :: Python :: 3.12", 54 | ], 55 | zip_safe=False) 56 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=5.4.1 3 | 4 | # repo maintenance tooling 5 | black>=21.5b1 6 | coverage>=6.4.4 7 | flake8>=3.9.2 8 | isort>=5.8.0 9 | mypy>=0.740 10 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/__init__.py -------------------------------------------------------------------------------- /tests/estimator_checks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/estimator_checks/__init__.py -------------------------------------------------------------------------------- /tests/estimator_checks/dataframe_for_checks.py: -------------------------------------------------------------------------------- 1 | """Dataframe used as input by many estimator checks.""" 2 | 3 | from typing import Tuple 4 | 5 | import pandas as pd 6 | from sklearn.datasets import make_classification 7 | 8 | 9 | def test_df( 10 | categorical: bool = False, datetime: bool = False 11 | ) -> Tuple[pd.DataFrame, pd.Series]: 12 | """ 13 | Creates a dataframe that contains only numerical features, or additionally, 14 | categorical and datetime features. 15 | 16 | Parameters 17 | ---------- 18 | categorical: bool, default=False 19 | Whether to add 2 additional categorical features. 
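        The categorical features are two constant string columns ("A" and "B").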
20 | 21 | datetime: bool, default=False 22 | Whether to add one additional datetime feature. 23 | 24 | Returns 25 | ------- 26 | X: pd.DataFrame 27 | A pandas dataframe. 28 | """ 29 | X, y = make_classification( 30 | n_samples=1000, 31 | n_features=12, 32 | n_redundant=4, 33 | n_clusters_per_class=1, 34 | weights=[0.50], 35 | class_sep=2, 36 | random_state=1, 37 | ) 38 | 39 | # transform arrays into pandas df and series 40 | colnames = [f"var_{i}" for i in range(12)] 41 | X = pd.DataFrame(X, columns=colnames) 42 | y = pd.Series(y) 43 | 44 | if categorical is True: 45 | X["cat_var1"] = ["A"] * 1000 46 | X["cat_var2"] = ["B"] * 1000 47 | 48 | if datetime is True: 49 | X["date1"] = pd.date_range("2020-02-24", periods=1000, freq="min") 50 | X["date2"] = pd.date_range("2021-09-29", periods=1000, freq="h") 51 | 52 | return X, y 53 | -------------------------------------------------------------------------------- /tests/estimator_checks/fit_functionality_checks.py: -------------------------------------------------------------------------------- 1 | """Checks functionality in the fit method shared by all transformers.""" 2 | 3 | import pytest 4 | from sklearn import clone 5 | 6 | from tests.estimator_checks.dataframe_for_checks import test_df 7 | 8 | 9 | def check_feature_names_in(estimator): 10 | """Checks that transformers learn the variable names of the train set used 11 | during fit, as well as the number of variables. 12 | 13 | Should be applied to all transformers. 14 | """ 15 | # the estimator learns the parameters from the train set 16 | X, y = test_df(categorical=True, datetime=True) 17 | varnames = list(X.columns) 18 | estimator = clone(estimator) 19 | estimator.fit(X, y) 20 | assert estimator.feature_names_in_ == varnames 21 | assert estimator.n_features_in_ == len(varnames) 22 | 23 | 24 | def check_error_if_y_not_passed(estimator): 25 | """ 26 | Checks that transformer raises error when y is not passed during fit. Functionality 27 | is provided by Python, when making a parameter mandatory. 28 | 29 | For this test to run, we need to add the tag 'requires_y' to the transformer. 30 | """ 31 | X, y = test_df() 32 | estimator = clone(estimator) 33 | with pytest.raises(TypeError): 34 | estimator.fit(X) 35 | -------------------------------------------------------------------------------- /tests/estimator_checks/init_params_allowed_values_checks.py: -------------------------------------------------------------------------------- 1 | """Many transformers have similar init parameters which take the same input values. 2 | In this script, we add tests for the allowed values for those parameters. 3 | """ 4 | import pytest 5 | from sklearn import clone 6 | 7 | 8 | def check_error_param_missing_values(estimator): 9 | """ 10 | Only for transformers with a parameter `missing_values`in init. 11 | 12 | Checks transformer raises error when user enters non-permitted value to the 13 | parameter. 
14 | """ 15 | # param takes values "raise" or "ignore" 16 | estimator = clone(estimator) 17 | for value in [2, "hola", False]: 18 | if estimator.__class__.__name__ == "MathFeatures": 19 | with pytest.raises(ValueError): 20 | estimator.__class__( 21 | variables=["var_1", "var_2", "var_3"], 22 | func="mean", 23 | missing_values=value, 24 | ) 25 | 26 | elif estimator.__class__.__name__ == "RelativeFeatures": 27 | with pytest.raises(ValueError): 28 | estimator.__class__( 29 | variables=["var_1", "var_2", "var_3"], 30 | reference=["var_4"], 31 | func="mean", 32 | missing_values=value, 33 | ) 34 | else: 35 | with pytest.raises(ValueError): 36 | estimator.__class__(missing_values=value) 37 | 38 | 39 | def check_error_param_confirm_variables(estimator): 40 | """ 41 | Only for transformers with a parameter `confirm_variables`in init. 42 | 43 | Checks transformer raises error when user enters non-permitted value to the 44 | parameter. 45 | """ 46 | # param takes values True or False 47 | estimator = clone(estimator) 48 | for value in [2, "hola", [True]]: 49 | msg = ( 50 | f"confirm_variables takes only values True and False. Got {value} instead." 51 | ) 52 | with pytest.raises(ValueError) as record: 53 | estimator.__class__(confirm_variables=value) 54 | assert record.value.args[0] == msg 55 | -------------------------------------------------------------------------------- /tests/estimator_checks/non_fitted_error_checks.py: -------------------------------------------------------------------------------- 1 | """Checks functionality in the transform method shared by all transformers.""" 2 | 3 | import pytest 4 | from sklearn import clone 5 | from sklearn.exceptions import NotFittedError 6 | 7 | from tests.estimator_checks.dataframe_for_checks import test_df 8 | 9 | 10 | def check_raises_non_fitted_error(estimator): 11 | """ 12 | Check if transformer raises error when transform() method is called before 13 | calling fit() method. 14 | 15 | The functionality is provided by sklearn's `check_is_fitted` function. 16 | """ 17 | X, y = test_df() 18 | transformer = clone(estimator) 19 | # Test when fit is not called prior to transform. 20 | with pytest.raises(NotFittedError): 21 | transformer.transform(X) 22 | -------------------------------------------------------------------------------- /tests/parametrize_with_checks_creation_v16.py: -------------------------------------------------------------------------------- 1 | """ 2 | File intended to help understand check_estimator tests for the module creation of 3 | Feature-engine. It is not run as part of the battery of acceptance tests. Works from 4 | sklearn > 1.6. 
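It can be run on its own with pytest, for example: pytest tests/parametrize_with_checks_creation_v16.py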
5 | """ 6 | 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from feature_engine.creation import ( 10 | CyclicalFeatures, 11 | DecisionTreeFeatures, 12 | MathFeatures, 13 | RelativeFeatures, 14 | ) 15 | 16 | dtf = DecisionTreeFeatures(regression=False) 17 | cf = CyclicalFeatures() 18 | mf = MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore") 19 | rf = RelativeFeatures( 20 | variables=["x0", "x1"], 21 | reference=["x0"], 22 | func=["add"], 23 | missing_values="ignore", 24 | ) 25 | 26 | EXPECTED_FAILED_CHECKS = { 27 | "DecisionTreeFeatures": dtf._more_tags()["_xfail_checks"], 28 | "CyclicalFeatures": cf._more_tags()["_xfail_checks"], 29 | "MathFeatures": mf._more_tags()["_xfail_checks"], 30 | "RelativeFeatures": rf._more_tags()["_xfail_checks"], 31 | } 32 | 33 | 34 | # creation 35 | @parametrize_with_checks( 36 | estimators=[dtf, cf, mf, rf], 37 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get( 38 | est.__class__.__name__, {} 39 | ), 40 | ) 41 | def test_sklearn_compatible_creator(estimator, check): 42 | check(estimator) 43 | -------------------------------------------------------------------------------- /tests/parametrize_with_checks_discretization_v16.py: -------------------------------------------------------------------------------- 1 | """ 2 | File intended to help understand check_estimator tests for Feature-engine's 3 | discretization module. It is not run as part of the battery of acceptance tests. 4 | Works from sklearn > 1.6. 5 | """ 6 | 7 | import numpy as np 8 | from sklearn.utils.estimator_checks import parametrize_with_checks 9 | 10 | from feature_engine.discretisation import ( 11 | ArbitraryDiscretiser, 12 | DecisionTreeDiscretiser, 13 | EqualFrequencyDiscretiser, 14 | EqualWidthDiscretiser, 15 | GeometricWidthDiscretiser, 16 | ) 17 | 18 | dtd = DecisionTreeDiscretiser(regression=False) 19 | efd = EqualFrequencyDiscretiser() 20 | ewd = EqualWidthDiscretiser() 21 | ad = ArbitraryDiscretiser(binning_dict={"x0": [-np.inf, 0, np.inf]}) 22 | gd = GeometricWidthDiscretiser() 23 | 24 | EXPECTED_FAILED_CHECKS = { 25 | "DecisionTreeDiscretiser": dtd._more_tags()["_xfail_checks"], 26 | "EqualFrequencyDiscretiser": efd._more_tags()["_xfail_checks"], 27 | "EqualWidthDiscretiser": ewd._more_tags()["_xfail_checks"], 28 | "ArbitraryDiscretiser": ad._more_tags()["_xfail_checks"], 29 | "GeometricWidthDiscretiser": gd._more_tags()["_xfail_checks"], 30 | } 31 | 32 | 33 | # discretization 34 | @parametrize_with_checks( 35 | estimators=[dtd, efd, ewd, ad, gd], 36 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get( 37 | est.__class__.__name__, {} 38 | ), 39 | ) 40 | def test_sklearn_compatible_creator(estimator, check): 41 | check(estimator) 42 | -------------------------------------------------------------------------------- /tests/parametrize_with_checks_encoders_v16.py: -------------------------------------------------------------------------------- 1 | """ 2 | File intended to help understand check_estimator tests for Feature-engine's 3 | encoding module. It is not run as part of the battery of acceptance tests. 4 | Works from sklearn > 1.6. 
5 | """ 6 | 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from feature_engine.encoding import ( 10 | CountFrequencyEncoder, 11 | MeanEncoder, 12 | OneHotEncoder, 13 | OrdinalEncoder, 14 | RareLabelEncoder, 15 | StringSimilarityEncoder, 16 | WoEEncoder, 17 | ) 18 | from feature_engine.tags import _return_tags 19 | 20 | ce = CountFrequencyEncoder(ignore_format=True) 21 | me = MeanEncoder(ignore_format=True) 22 | ohe = OneHotEncoder(ignore_format=True) 23 | oe = OrdinalEncoder(ignore_format=True) 24 | re = RareLabelEncoder( 25 | tol=0.00000000001, 26 | n_categories=100000000000, 27 | replace_with=10, 28 | ignore_format=True, 29 | ) 30 | woe = WoEEncoder(ignore_format=True) 31 | sse = StringSimilarityEncoder(ignore_format=True) 32 | 33 | FAILED_CHECKS = _return_tags()["_xfail_checks"] 34 | FAILED_CHECKS.update({"check_estimators_nan_inf": "transformer allows NA"}) 35 | 36 | EXPECTED_FAILED_CHECKS = { 37 | "CountFrequencyEncoder": FAILED_CHECKS, 38 | "MeanEncoder": FAILED_CHECKS, 39 | "OneHotEncoder": FAILED_CHECKS, 40 | "OrdinalEncoder": FAILED_CHECKS, 41 | "RareLabelEncoder": FAILED_CHECKS, 42 | "StringSimilarityEncoder": FAILED_CHECKS, 43 | } 44 | 45 | 46 | # encoding 47 | @parametrize_with_checks( 48 | estimators=[ce, me, ohe, oe, re, woe, sse], 49 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get( 50 | est.__class__.__name__, {} 51 | ), 52 | ) 53 | def test_sklearn_compatible_creator(estimator, check): 54 | check(estimator) 55 | -------------------------------------------------------------------------------- /tests/parametrize_with_checks_outliers_v16.py: -------------------------------------------------------------------------------- 1 | """ 2 | File intended to help understand check_estimator tests for Feature-engine's 3 | outliers module. It is not run as part of the battery of acceptance tests. 4 | Works from sklearn > 1.6. 5 | """ 6 | 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from feature_engine.outliers import ArbitraryOutlierCapper, OutlierTrimmer, Winsorizer 10 | from feature_engine.tags import _return_tags 11 | 12 | aoc = ArbitraryOutlierCapper(max_capping_dict={"x0": 10}) 13 | ot = OutlierTrimmer() 14 | wz = Winsorizer() 15 | 16 | FAILED_CHECKS = _return_tags()["_xfail_checks"] 17 | FAILED_CHECKS_AOC = _return_tags()["_xfail_checks"] 18 | 19 | msg1 = "transformers raise errors when data variation is low, " "thus this check fails" 20 | 21 | msg2 = "transformer has 1 mandatory parameter" 22 | 23 | FAILED_CHECKS.update({"check_fit2d_1sample": msg1}) 24 | FAILED_CHECKS_AOC.update( 25 | { 26 | "check_fit2d_1sample": msg1, 27 | "check_parameters_default_constructible": msg2, 28 | } 29 | ) 30 | 31 | EXPECTED_FAILED_CHECKS = { 32 | "ArbitraryOutlierCapper": FAILED_CHECKS_AOC, 33 | "OutlierTrimmer": FAILED_CHECKS, 34 | "Winsorizer": FAILED_CHECKS, 35 | } 36 | 37 | 38 | # encoding 39 | @parametrize_with_checks( 40 | estimators=[aoc, ot, wz], 41 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get( 42 | est.__class__.__name__, {} 43 | ), 44 | ) 45 | def test_sklearn_compatible_creator(estimator, check): 46 | check(estimator) 47 | -------------------------------------------------------------------------------- /tests/parametrize_with_checks_prediction_v16.py: -------------------------------------------------------------------------------- 1 | """ 2 | File intended to help understand check_estimator tests for Feature-engine's 3 | prediction module. 
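The estimators tested here live in the private feature_engine._prediction module.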
It is not run as part of the battery of acceptance tests. 4 | Works from sklearn > 1.6. 5 | """ 6 | 7 | from sklearn.utils.estimator_checks import parametrize_with_checks 8 | 9 | from feature_engine._prediction.base_predictor import BaseTargetMeanEstimator 10 | from feature_engine._prediction.target_mean_classifier import TargetMeanClassifier 11 | from feature_engine._prediction.target_mean_regressor import TargetMeanRegressor 12 | from feature_engine.tags import _return_tags 13 | 14 | _estimators = [BaseTargetMeanEstimator(), TargetMeanClassifier(), TargetMeanRegressor()] 15 | 16 | FAILED_CHECKS = _return_tags()["_xfail_checks"] 17 | 18 | EXPECTED_FAILED_CHECKS = { 19 | "BaseTargetMeanEstimator": FAILED_CHECKS, 20 | "TargetMeanClassifier": FAILED_CHECKS, 21 | "TargetMeanRegressor": FAILED_CHECKS, 22 | } 23 | 24 | 25 | @parametrize_with_checks( 26 | estimators=_estimators, 27 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get( 28 | est.__class__.__name__, {} 29 | ), 30 | ) 31 | def test_sklearn_compatible_creator(estimator, check): 32 | check(estimator) 33 | -------------------------------------------------------------------------------- /tests/test_base_transformers/test_base_numerical_transformer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from numpy import inf 3 | from pandas.testing import assert_frame_equal 4 | 5 | from feature_engine._base_transformers.base_numerical import BaseNumericalTransformer 6 | from tests.estimator_checks.non_fitted_error_checks import check_raises_non_fitted_error 7 | 8 | 9 | class MockClass(BaseNumericalTransformer): 10 | def __init__(self): 11 | self.variables = None 12 | 13 | def transform(self, X): 14 | return self._check_transform_input_and_state(X) 15 | 16 | 17 | def test_fit_method(df_vartypes, df_na): 18 | transformer = MockClass() 19 | res = transformer.fit(df_vartypes) 20 | assert transformer.feature_names_in_ == list(df_vartypes.columns) 21 | assert transformer.n_features_in_ == len(df_vartypes.columns) 22 | assert_frame_equal(res, df_vartypes) 23 | 24 | with pytest.raises(ValueError): 25 | transformer.fit(df_na) 26 | 27 | df_na = df_na.fillna(inf) 28 | with pytest.raises(ValueError): 29 | assert transformer.fit(df_na) 30 | 31 | 32 | def test_transform_method(df_vartypes, df_na): 33 | transformer = MockClass() 34 | transformer.fit(df_vartypes) 35 | assert_frame_equal( 36 | transformer._check_transform_input_and_state(df_vartypes), df_vartypes 37 | ) 38 | assert_frame_equal( 39 | transformer._check_transform_input_and_state( 40 | df_vartypes[["City", "Age", "Name", "Marks", "dob"]] 41 | ), 42 | df_vartypes, 43 | ) 44 | 45 | with pytest.raises(ValueError): 46 | transformer.fit(df_na) 47 | 48 | df_na = df_na.fillna(inf) 49 | with pytest.raises(ValueError): 50 | assert transformer.fit(df_na) 51 | 52 | with pytest.raises(ValueError): 53 | assert transformer._check_transform_input_and_state( 54 | df_vartypes[["Age", "Marks"]] 55 | ) 56 | 57 | 58 | def test_raises_non_fitted_error(): 59 | check_raises_non_fitted_error(MockClass()) 60 | -------------------------------------------------------------------------------- /tests/test_base_transformers/test_transform_xy_mixin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from feature_engine._base_transformers.mixins import TransformXyMixin 5 | 6 | 7 | class MockTransformer(TransformXyMixin): 8 | def transform(self, X): 9 | return 
X.iloc[1:-1].copy() 10 | 11 | 12 | def test_transform_x_y_method(df_vartypes): 13 | # single target 14 | y = pd.Series(0, index=np.arange(len(df_vartypes))) 15 | transformer = MockTransformer() 16 | Xt, yt = transformer.transform_x_y(df_vartypes, y) 17 | 18 | assert len(Xt) == len(yt) 19 | assert len(Xt) != len(df_vartypes) 20 | assert len(yt) != len(y) 21 | assert (Xt.index == yt.index).all() 22 | assert (Xt.index == [1, 2]).all() 23 | 24 | # multioutput target 25 | y = ( 26 | pd.DataFrame(columns=["vara", "varb"], index=df_vartypes.index) 27 | .astype(float) 28 | .fillna(0) 29 | ) 30 | Xt, yt = transformer.transform_x_y(df_vartypes, y) 31 | 32 | assert len(Xt) == len(yt) 33 | assert len(Xt) != len(df_vartypes) 34 | assert len(yt) != len(y) 35 | assert (Xt.index == yt.index).all() 36 | assert (Xt.index == [1, 2]).all() 37 | -------------------------------------------------------------------------------- /tests/test_check_init_parameters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_check_init_parameters/__init__.py -------------------------------------------------------------------------------- /tests/test_check_init_parameters/test_check_init_input_params.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine._check_init_parameters.check_init_input_params import ( 4 | _check_param_drop_original, 5 | _check_param_missing_values, 6 | ) 7 | 8 | 9 | @pytest.mark.parametrize("missing_vals", [None, ["Hola"], True, "Hola"]) 10 | def test_check_param_missing_values(missing_vals): 11 | with pytest.raises(ValueError): 12 | _check_param_missing_values(missing_vals) 13 | 14 | 15 | @pytest.mark.parametrize("drop_orig", [None, ["Hola"], 10, "Hola"]) 16 | def test_check_param_drop_original(drop_orig): 17 | with pytest.raises(ValueError): 18 | _check_param_drop_original(drop_orig) 19 | -------------------------------------------------------------------------------- /tests/test_check_init_parameters/test_check_input_dictionary.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine._check_init_parameters.check_input_dictionary import ( 4 | _check_numerical_dict, 5 | ) 6 | 7 | 8 | @pytest.mark.parametrize("input_dict", [{"a": 1, "b": "c"}, {1: 1, 2: "c"}]) 9 | def test_raises_error_when_item_in_dict_not_numerical(input_dict): 10 | with pytest.raises(ValueError): 11 | _check_numerical_dict(input_dict) 12 | 13 | 14 | @pytest.mark.parametrize("input_dict", [[1, 2, 3], (1, 2, 3), "hola", 5]) 15 | def test_raises_error_when_input_not_dictionary_or_none(input_dict): 16 | with pytest.raises(TypeError): 17 | _check_numerical_dict(input_dict) 18 | -------------------------------------------------------------------------------- /tests/test_check_init_parameters/test_check_variables.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine._check_init_parameters.check_variables import ( 4 | _check_variables_input_value, 5 | ) 6 | 7 | 8 | @pytest.mark.parametrize("_input_vars", [("var1", "var2"), {"var1": 1, "var2": 2}]) 9 | def test_raises_errors_when_not_list_str_or_int(_input_vars): 10 | with pytest.raises(ValueError) as record: 11 | assert _check_variables_input_value(_input_vars) 12 | msg = ( 13 | "`variables` should contain a 
string, an integer or a list of strings or " 14 | f"integers. Got {_input_vars} instead." 15 | ) 16 | assert str(record.value) == msg 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "_input_vars", [["var1", "var2", "var2", "var3"], [0, 1, 1, 2]] 21 | ) 22 | def test_raises_error_when_duplicated_var_names(_input_vars): 23 | with pytest.raises(ValueError) as record: 24 | assert _check_variables_input_value(_input_vars) 25 | msg = "The list entered in `variables` contains duplicated variable names." 26 | assert str(record.value) == msg 27 | 28 | 29 | def test_raises_error_when_empty_list(): 30 | with pytest.raises(ValueError) as record: 31 | assert _check_variables_input_value([]) 32 | msg = "The list of `variables` is empty." 33 | assert str(record.value) == msg 34 | 35 | 36 | @pytest.mark.parametrize( 37 | "_input_vars", 38 | [["var1", "var2", "var3"], [0, 1, 2, 3], "var1", ["var1"], 0, [0]], 39 | ) 40 | def test_return_variables(_input_vars): 41 | assert _check_variables_input_value(_input_vars) == _input_vars 42 | 43 | 44 | def test_return_when_variables_is_none(): 45 | assert _check_variables_input_value(None) is None 46 | -------------------------------------------------------------------------------- /tests/test_creation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_creation/__init__.py -------------------------------------------------------------------------------- /tests/test_creation/test_check_estimator_creation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | import sklearn 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.utils.estimator_checks import check_estimator 6 | from sklearn.utils.fixes import parse_version 7 | 8 | from feature_engine.creation import ( 9 | CyclicalFeatures, 10 | DecisionTreeFeatures, 11 | MathFeatures, 12 | RelativeFeatures, 13 | ) 14 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator 15 | 16 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) 17 | 18 | _estimators = [ 19 | MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"), 20 | RelativeFeatures( 21 | variables=["x0", "x1"], reference=["x0"], func=["add"], missing_values="ignore" 22 | ), 23 | CyclicalFeatures(), 24 | DecisionTreeFeatures(regression=False), 25 | ] 26 | 27 | if sklearn_version > parse_version("1.6"): 28 | 29 | @pytest.mark.parametrize("estimator", _estimators) 30 | def test_check_estimator_from_sklearn(estimator): 31 | return check_estimator( 32 | estimator=estimator, 33 | expected_failed_checks=estimator._more_tags()["_xfail_checks"], 34 | ) 35 | 36 | else: 37 | 38 | @pytest.mark.parametrize("estimator", _estimators) 39 | def test_check_estimator_from_sklearn(estimator): 40 | return check_estimator(estimator) 41 | 42 | 43 | _estimators = [ 44 | MathFeatures(variables=["var_1", "var_2", "var_3"], func="mean"), 45 | RelativeFeatures(variables=["var_1", "var_2"], reference=["var_3"], func=["add"]), 46 | CyclicalFeatures(), 47 | ] 48 | 49 | 50 | @pytest.mark.parametrize("estimator", _estimators) 51 | def test_check_estimator_from_feature_engine(estimator): 52 | return check_feature_engine_estimator(estimator) 53 | 54 | 55 | _estimators = [ 56 | CyclicalFeatures(), 57 | MathFeatures(variables=["feature_1", "feature_2"], func=["sum", "mean"]), 58 | 
RelativeFeatures(variables=["feature_1"], reference=["feature_2"], func=["div"]), 59 | ] 60 | 61 | 62 | @pytest.mark.parametrize("transformer", _estimators) 63 | def test_transformers_in_pipeline_with_set_output_pandas(transformer): 64 | X = pd.DataFrame({"feature_1": [1, 2, 3, 4, 5], "feature_2": [6, 7, 8, 9, 10]}) 65 | y = pd.Series([0, 1, 0, 1, 0]) 66 | 67 | pipe = Pipeline([("trs", transformer)]).set_output(transform="pandas") 68 | 69 | Xtt = transformer.fit_transform(X) 70 | Xtp = pipe.fit_transform(X, y) 71 | 72 | pd.testing.assert_frame_equal(Xtt, Xtp) 73 | -------------------------------------------------------------------------------- /tests/test_datasets/__init__().py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_datasets/__init__().py -------------------------------------------------------------------------------- /tests/test_datetime/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_datetime/__init__.py -------------------------------------------------------------------------------- /tests/test_datetime/test_check_estimator_datetime.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from sklearn.pipeline import Pipeline 4 | 5 | from feature_engine.datetime import DatetimeFeatures, DatetimeSubtraction 6 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator 7 | 8 | _estimators = [DatetimeFeatures()] 9 | 10 | 11 | @pytest.mark.parametrize("estimator", _estimators) 12 | def test_check_estimator_from_feature_engine(estimator): 13 | return check_feature_engine_estimator(estimator) 14 | 15 | 16 | transformers = [ 17 | DatetimeFeatures(), 18 | DatetimeSubtraction(variables="feature_1", reference="feature_2"), 19 | ] 20 | 21 | 22 | @pytest.mark.parametrize("transformer", transformers) 23 | def test_datetime_transformers(transformer): 24 | X = pd.DataFrame( 25 | { 26 | "feature_1": [ 27 | "2014-05-05", 28 | "2014-05-05", 29 | "2014-05-05", 30 | "2014-05-05", 31 | "2014-05-05", 32 | ], 33 | "feature_2": [ 34 | "2014-05-05", 35 | "2014-05-05", 36 | "2014-05-05", 37 | "2014-05-05", 38 | "2014-05-05", 39 | ], 40 | }, 41 | ) 42 | y = pd.Series([0, 1, 0, 1, 0]) 43 | 44 | pipe = Pipeline( 45 | [ 46 | ("trs", transformer), 47 | ] 48 | ).set_output(transform="pandas") 49 | 50 | Xtt = transformer.fit_transform(X) 51 | Xtp = pipe.fit_transform(X, y) 52 | 53 | pd.testing.assert_frame_equal(Xtt, Xtp) 54 | -------------------------------------------------------------------------------- /tests/test_discretisation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_discretisation/__init__.py -------------------------------------------------------------------------------- /tests/test_discretisation/test_check_estimator_discretisers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | import sklearn 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.utils.estimator_checks import check_estimator 7 | from sklearn.utils.fixes 
import parse_version 8 | 9 | from feature_engine.discretisation import ( 10 | ArbitraryDiscretiser, 11 | DecisionTreeDiscretiser, 12 | EqualFrequencyDiscretiser, 13 | EqualWidthDiscretiser, 14 | GeometricWidthDiscretiser, 15 | ) 16 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator 17 | 18 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) 19 | 20 | 21 | _estimators = [ 22 | DecisionTreeDiscretiser(regression=False), 23 | EqualFrequencyDiscretiser(), 24 | EqualWidthDiscretiser(), 25 | ArbitraryDiscretiser(binning_dict={"x0": [-np.inf, 0, np.inf]}), 26 | GeometricWidthDiscretiser(), 27 | ] 28 | 29 | if sklearn_version < parse_version("1.6"): 30 | 31 | @pytest.mark.parametrize("estimator", _estimators) 32 | def test_check_estimator_from_sklearn(estimator): 33 | return check_estimator(estimator) 34 | 35 | else: 36 | 37 | @pytest.mark.parametrize("estimator", _estimators) 38 | def test_check_estimator_from_sklearn(estimator): 39 | return check_estimator( 40 | estimator=estimator, 41 | expected_failed_checks=estimator._more_tags()["_xfail_checks"], 42 | ) 43 | 44 | 45 | @pytest.mark.parametrize("estimator", _estimators) 46 | def test_check_estimator_from_feature_engine(estimator): 47 | if estimator.__class__.__name__ == "ArbitraryDiscretiser": 48 | estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]}) 49 | return check_feature_engine_estimator(estimator) 50 | 51 | 52 | @pytest.mark.parametrize("transformer", _estimators) 53 | def test_transformers_within_pipeline(transformer): 54 | if transformer.__class__.__name__ == "ArbitraryDiscretiser": 55 | transformer.set_params(binning_dict={"feature_1": [-np.inf, 0, np.inf]}) 56 | 57 | X = pd.DataFrame({"feature_1": [1, 2, 3, 4, 5], "feature_2": [6, 7, 8, 9, 10]}) 58 | y = pd.Series([0, 1, 0, 1, 0]) 59 | 60 | pipe = Pipeline([("trs", transformer)]).set_output(transform="pandas") 61 | 62 | Xtt = transformer.fit_transform(X, y) 63 | Xtp = pipe.fit_transform(X, y) 64 | 65 | pd.testing.assert_frame_equal(Xtt, Xtp) 66 | -------------------------------------------------------------------------------- /tests/test_encoding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/__init__.py -------------------------------------------------------------------------------- /tests/test_encoding/test_base_encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/test_base_encoders/__init__.py -------------------------------------------------------------------------------- /tests/test_encoding/test_base_encoders/test_categorical_init_mixin.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine.encoding.base_encoder import CategoricalInitMixin 4 | 5 | 6 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)]) 7 | def test_raises_error_when_ignore_format_not_permitted(param): 8 | with pytest.raises(ValueError) as record: 9 | CategoricalInitMixin(ignore_format=param) 10 | msg = f"ignore_format takes only booleans True and False. Got {param} instead." 
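    # the error raised by CategoricalInitMixin must match this message verbatim, including the rejected value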
11 | assert str(record.value) == msg 12 | 13 | 14 | @pytest.mark.parametrize("param", [True, False]) 15 | def test_ignore_format_value_assignment(param): 16 | enc = CategoricalInitMixin(ignore_format=param) 17 | assert enc.ignore_format == param 18 | -------------------------------------------------------------------------------- /tests/test_encoding/test_base_encoders/test_categorical_init_mixin_na.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine.encoding.base_encoder import CategoricalInitMixinNA 4 | 5 | 6 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)]) 7 | def test_raises_error_when_ignore_format_not_permitted(param): 8 | with pytest.raises(ValueError) as record: 9 | CategoricalInitMixinNA(ignore_format=param) 10 | msg = f"ignore_format takes only booleans True and False. Got {param} instead." 11 | assert str(record.value) == msg 12 | 13 | 14 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)]) 15 | def test_raises_error_when_missing_values_not_permitted(param): 16 | with pytest.raises(ValueError) as record: 17 | CategoricalInitMixinNA(missing_values=param) 18 | msg = f"missing_values takes only values 'raise' or 'ignore'. Got {param} instead." 19 | assert str(record.value) == msg 20 | 21 | 22 | @pytest.mark.parametrize("param", [(True, "ignore"), (False, "raise")]) 23 | def test_correct_param_value_assignment(param): 24 | format_, na_ = param 25 | enc = CategoricalInitMixinNA(ignore_format=format_, missing_values=na_) 26 | assert enc.ignore_format == format_ 27 | assert enc.missing_values == na_ 28 | -------------------------------------------------------------------------------- /tests/test_encoding/test_helper_functions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from feature_engine.encoding._helper_functions import check_parameter_unseen 4 | 5 | 6 | @pytest.mark.parametrize("accepted", ["one", False, [1, 2], ("one", "two"), 1]) 7 | def test_raises_error_when_accepted_values_not_permitted(accepted): 8 | with pytest.raises(ValueError) as record: 9 | check_parameter_unseen("zero", accepted) 10 | msg = "accepted_values should be a list of strings. " f" Got {accepted} instead." 11 | assert str(record.value) == msg 12 | 13 | 14 | @pytest.mark.parametrize("accepted", [["one", "two"], ["three", "four"]]) 15 | def test_raises_error_when_error_not_in_accepted_values(accepted): 16 | with pytest.raises(ValueError) as record: 17 | check_parameter_unseen("zero", accepted) 18 | msg = ( 19 | f"Parameter `unseen` takes only values {', '.join(accepted)}." 20 | " Got zero instead." 
21 | ) 22 | assert str(record.value) == msg 23 | -------------------------------------------------------------------------------- /tests/test_encoding/test_woe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/test_woe/__init__.py -------------------------------------------------------------------------------- /tests/test_encoding/test_woe/test_woe_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from feature_engine.encoding.woe import WoE 6 | 7 | 8 | def test_woe_calculation(df_enc): 9 | pos_exp = pd.Series({"A": 0.333333, "B": 0.333333, "C": 0.333333}) 10 | neg_exp = pd.Series({"A": 0.285714, "B": 0.571429, "C": 0.142857}) 11 | 12 | woe_class = WoE() 13 | pos, neg, woe = woe_class._calculate_woe(df_enc, df_enc["target"], "var_A") 14 | 15 | pd.testing.assert_series_equal(pos, pos_exp, check_names=False) 16 | pd.testing.assert_series_equal(neg, neg_exp, check_names=False) 17 | pd.testing.assert_series_equal(np.log(pos_exp / neg_exp), woe, check_names=False) 18 | 19 | 20 | def test_woe_error(): 21 | df = { 22 | "var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 3 + ["D"] * 2, 23 | "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4, 24 | "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0], 25 | } 26 | df = pd.DataFrame(df) 27 | woe_class = WoE() 28 | 29 | with pytest.raises(ValueError): 30 | woe_class._calculate_woe(df, df["target"], "var_A") 31 | 32 | 33 | @pytest.mark.parametrize("fill_value", [1, 10, 0.1]) 34 | def test_fill_value(fill_value): 35 | df = { 36 | "var_A": ["A"] * 9 + ["B"] * 6 + ["C"] * 3 + ["D"] * 2, 37 | "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4, 38 | "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0], 39 | } 40 | df = pd.DataFrame(df) 41 | 42 | pos_exp = pd.Series( 43 | { 44 | "A": 0.2857142857142857, 45 | "B": 0.2857142857142857, 46 | "C": 0.42857142857142855, 47 | "D": fill_value, 48 | } 49 | ) 50 | neg_exp = pd.Series( 51 | { 52 | "A": 0.5384615384615384, 53 | "B": 0.3076923076923077, 54 | "C": fill_value, 55 | "D": 0.15384615384615385, 56 | } 57 | ) 58 | 59 | woe_class = WoE() 60 | pos, neg, woe = woe_class._calculate_woe( 61 | df, df["target"], "var_A", fill_value=fill_value 62 | ) 63 | 64 | pd.testing.assert_series_equal(pos, pos_exp, check_names=False) 65 | pd.testing.assert_series_equal(neg, neg_exp, check_names=False) 66 | pd.testing.assert_series_equal(np.log(pos_exp / neg_exp), woe, check_names=False) 67 | -------------------------------------------------------------------------------- /tests/test_imputation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_imputation/__init__.py -------------------------------------------------------------------------------- /tests/test_imputation/test_mean_mdian_imputer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from feature_engine.imputation import MeanMedianImputer 5 | 6 | 7 | def test_mean_imputation_and_automatically_select_variables(df_na): 8 | # set up transformer 9 | imputer = MeanMedianImputer(imputation_method="mean", variables=None) 10 | X_transformed = 
imputer.fit_transform(df_na) 11 | 12 | # set up reference result 13 | X_reference = df_na.copy() 14 | X_reference["Age"] = X_reference["Age"].fillna(28.714285714285715) 15 | X_reference["Marks"] = X_reference["Marks"].fillna(0.6833333333333332) 16 | 17 | # test init params 18 | assert imputer.imputation_method == "mean" 19 | assert imputer.variables is None 20 | 21 | # test fit attributes 22 | assert imputer.variables_ == ["Age", "Marks"] 23 | imputer.imputer_dict_ = { 24 | key: round(value, 3) for (key, value) in imputer.imputer_dict_.items() 25 | } 26 | assert imputer.imputer_dict_ == { 27 | "Age": 28.714, 28 | "Marks": 0.683, 29 | } 30 | assert imputer.n_features_in_ == 6 31 | 32 | # test transform output: 33 | # selected variables should have no NA 34 | # not selected variables should still have NA 35 | assert X_transformed[["Age", "Marks"]].isnull().sum().sum() == 0 36 | assert X_transformed[["Name", "City"]].isnull().sum().sum() > 0 37 | pd.testing.assert_frame_equal(X_transformed, X_reference) 38 | 39 | 40 | def test_median_imputation_when_user_enters_single_variables(df_na): 41 | # set up trasnformer 42 | imputer = MeanMedianImputer(imputation_method="median", variables=["Age"]) 43 | X_transformed = imputer.fit_transform(df_na) 44 | 45 | # set up reference output 46 | X_reference = df_na.copy() 47 | X_reference["Age"] = X_reference["Age"].fillna(23.0) 48 | 49 | # test init params 50 | assert imputer.imputation_method == "median" 51 | assert imputer.variables == ["Age"] 52 | 53 | # test fit attributes 54 | assert imputer.n_features_in_ == 6 55 | assert imputer.imputer_dict_ == {"Age": 23.0} 56 | 57 | # test transform output 58 | assert X_transformed["Age"].isnull().sum() == 0 59 | pd.testing.assert_frame_equal(X_transformed, X_reference) 60 | 61 | 62 | def test_error_with_wrong_imputation_method(): 63 | with pytest.raises(ValueError): 64 | MeanMedianImputer(imputation_method="arbitrary") 65 | -------------------------------------------------------------------------------- /tests/test_outliers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_outliers/__init__.py -------------------------------------------------------------------------------- /tests/test_prediction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_prediction/__init__.py -------------------------------------------------------------------------------- /tests/test_prediction/conftest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.fixture(scope="module") 6 | def df_classification(): 7 | df = { 8 | "cat_var_A": ["A"] * 5 + ["B"] * 5 + ["C"] * 5 + ["D"] * 5, 9 | "cat_var_B": ["A"] * 6 10 | + ["B"] * 2 11 | + ["C"] * 2 12 | + ["B"] * 2 13 | + ["C"] * 2 14 | + ["D"] * 6, 15 | "num_var_A": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4], 16 | "num_var_B": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4], 17 | } 18 | 19 | df = pd.DataFrame(df) 20 | y = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) 21 | return df, y 22 | 23 | 24 | @pytest.fixture(scope="module") 25 | def df_regression(): 26 | df = { 27 | "cat_var_A": ["A"] * 5 + ["B"] * 5 + ["C"] * 5 + ["D"] * 5, 28 
| "cat_var_B": ["A"] * 6 29 | + ["B"] * 2 30 | + ["C"] * 2 31 | + ["B"] * 2 32 | + ["C"] * 2 33 | + ["D"] * 6, 34 | "num_var_A": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4], 35 | "num_var_B": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4], 36 | } 37 | 38 | df = pd.DataFrame(df) 39 | y = pd.Series([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) 40 | return df, y 41 | -------------------------------------------------------------------------------- /tests/test_preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_preprocessing/__init__.py -------------------------------------------------------------------------------- /tests/test_scaling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_scaling/__init__.py -------------------------------------------------------------------------------- /tests/test_selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_selection/__init__.py -------------------------------------------------------------------------------- /tests/test_selection/test_base_selector.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pandas.testing import assert_frame_equal 3 | 4 | from feature_engine.selection.base_selector import BaseSelector 5 | 6 | 7 | @pytest.mark.parametrize("val", [None, "hola", [True]]) 8 | def test_confirm_variables_in_init(val): 9 | with pytest.raises(ValueError): 10 | BaseSelector(confirm_variables=val) 11 | 12 | 13 | class MockClass(BaseSelector): 14 | def __init__(self, variables=None, confirm_variables=False): 15 | self.variables = variables 16 | self.confirm_variables = confirm_variables 17 | 18 | def fit(self, X, y=None): 19 | self.features_to_drop_ = ["Name", "Marks"] 20 | self._get_feature_names_in(X) 21 | return self 22 | 23 | 24 | def test_transform_method(df_vartypes): 25 | transformer = MockClass() 26 | transformer.fit(df_vartypes) 27 | Xt = transformer.transform(df_vartypes) 28 | 29 | # tests output of transform 30 | assert_frame_equal(Xt, df_vartypes.drop(["Name", "Marks"], axis=1)) 31 | 32 | # tests this line: X = X[self.feature_names_in_] 33 | assert_frame_equal( 34 | transformer.transform(df_vartypes[["City", "Age", "Name", "Marks", "dob"]]), 35 | Xt, 36 | ) 37 | # test error when there is a df shape missmatch 38 | with pytest.raises(ValueError): 39 | assert transformer.transform(df_vartypes[["Age", "Marks"]]) 40 | 41 | 42 | def test_get_feature_names_in(df_vartypes): 43 | tr = MockClass() 44 | tr._get_feature_names_in(df_vartypes) 45 | assert tr.n_features_in_ == df_vartypes.shape[1] 46 | assert tr.feature_names_in_ == list(df_vartypes.columns) 47 | 48 | 49 | def test_get_support(df_vartypes): 50 | tr = MockClass() 51 | tr.fit(df_vartypes) 52 | v_bool = [False, True, True, False, True] 53 | v_ind = [1, 2, 4] 54 | assert tr.get_support() == v_bool 55 | assert list(tr.get_support(indices=True)) == v_ind 56 | -------------------------------------------------------------------------------- /tests/test_sklearn_compatible/test_set_output.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.datasets import load_iris 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import make_pipeline 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | from feature_engine.transformation import YeoJohnsonTransformer 9 | 10 | 11 | def test_pipeline_with_set_output_sklearn_last(): 12 | 13 | X, y = load_iris(return_X_y=True, as_frame=True) 14 | 15 | pipeline = make_pipeline( 16 | YeoJohnsonTransformer(), StandardScaler(), LogisticRegression() 17 | ).set_output(transform="default") 18 | 19 | pipeline.fit(X, y) 20 | 21 | X_t = pipeline[:-1].transform(X) 22 | assert isinstance(X_t, np.ndarray) 23 | 24 | pipeline.set_output(transform="pandas") 25 | X_t = pipeline[:-1].transform(X) 26 | 27 | assert isinstance(X_t, pd.DataFrame) 28 | 29 | 30 | def test_pipeline_with_set_output_featureengine_last(): 31 | 32 | X, y = load_iris(return_X_y=True, as_frame=True) 33 | 34 | pipeline = make_pipeline( 35 | StandardScaler(), YeoJohnsonTransformer(), LogisticRegression() 36 | ).set_output(transform="default") 37 | 38 | pipeline.fit(X, y) 39 | 40 | X_t = pipeline[:-1].transform(X) 41 | pipeline.fit(X, y) 42 | assert isinstance(X_t, pd.DataFrame) 43 | 44 | pipeline.set_output(transform="pandas") 45 | pipeline.fit(X, y) 46 | 47 | X_t = pipeline[:-1].transform(X) 48 | 49 | assert isinstance(X_t, pd.DataFrame) 50 | 51 | 52 | def test_individual_transformer(): 53 | 54 | X, y = load_iris(return_X_y=True, as_frame=True) 55 | 56 | transformer = YeoJohnsonTransformer() 57 | transformer.set_output(transform="default") 58 | transformer.fit(X) 59 | 60 | X_t = transformer.transform(X) 61 | assert isinstance(X_t, pd.DataFrame) 62 | 63 | transformer.set_output(transform="pandas") 64 | X_t = transformer.transform(X) 65 | 66 | assert isinstance(X_t, pd.DataFrame) 67 | -------------------------------------------------------------------------------- /tests/test_time_series/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_time_series/__init__.py -------------------------------------------------------------------------------- /tests/test_time_series/test_forecasting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_time_series/test_forecasting/__init__.py -------------------------------------------------------------------------------- /tests/test_time_series/test_forecasting/conftest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.fixture(scope="module") 6 | def df_time(): 7 | date_time = [ 8 | "2020-05-15 12:00:00", 9 | "2020-05-15 12:15:00", 10 | "2020-05-15 12:30:00", 11 | "2020-05-15 12:45:00", 12 | "2020-05-15 13:00:00", 13 | "2020-05-15 13:15:00", 14 | "2020-05-15 13:30:00", 15 | "2020-05-15 13:45:00", 16 | "2020-05-15 14:00:00", 17 | "2020-05-15 14:15:00", 18 | "2020-05-15 14:30:00", 19 | "2020-05-15 14:45:00", 20 | "2020-05-15 15:00:00", 21 | "2020-05-15 15:15:00", 22 | "2020-05-15 15:30:00", 23 | ] 24 | 25 | data = { 26 | "ambient_temp": [ 27 | 31.31, 28 | 31.51, 29 | 32.15, 30 | 32.39, 31 | 32.62, 32 | 32.5, 33 | 32.52, 34 | 
            32.68, 33.76, 34.13, 34.08, 33.7, 33.89, 34.04, 34.4,
        ],
        "module_temp": [
            49.18, 49.84, 52.35, 50.63, 49.61, 47.01, 46.67, 47.52,
            49.8, 55.03, 54.52, 47.62, 46.03, 44.29, 46.74,
        ],
        "irradiation": [
            0.51, 0.79, 0.65, 0.76, 0.42, 0.49, 0.57, 0.56,
            0.74, 0.89, 0.47, 0.54, 0.4, 0.45, 0.57,
        ],
        "color": ["blue"] * 10 + ["green"] * 5,
    }

    df = pd.DataFrame(data, index=date_time)
    df.index = pd.to_datetime(df.index)
    return df
--------------------------------------------------------------------------------
/tests/test_transformation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_transformation/__init__.py
--------------------------------------------------------------------------------
/tests/test_transformation/test_arcsin_transformer.py:
--------------------------------------------------------------------------------
import pandas as pd
import pytest
from sklearn.exceptions import NotFittedError

from feature_engine.transformation import ArcsinTransformer


def test_transform_and_inverse_transform(df_vartypes):
    transformer = ArcsinTransformer(variables=["Marks"])
    X = transformer.fit_transform(df_vartypes)

    # expected output
    transf_df = df_vartypes.copy()
    transf_df["Marks"] = [1.24905, 1.10715, 0.99116, 0.88607]

    # test transform output
    pd.testing.assert_frame_equal(X, transf_df)

    # test inverse_transform
    Xit = transformer.inverse_transform(X)

    # convert numbers to original format.
23 | Xit["Marks"] = Xit["Marks"].round(1) 24 | 25 | # test 26 | pd.testing.assert_frame_equal(Xit, df_vartypes) 27 | 28 | 29 | def test_fit_raises_error_if_na_in_df(df_na): 30 | # test case 2: when dataset contains na, fit method 31 | transformer = ArcsinTransformer(variables=["Marks"]) 32 | with pytest.raises(ValueError): 33 | transformer.fit(df_na) 34 | 35 | 36 | def test_transform_raises_error_if_na_in_df(df_vartypes, df_na): 37 | # test case 3: when dataset contains na, transform method 38 | transformer = ArcsinTransformer(variables=["Marks"]) 39 | transformer.fit(df_vartypes) 40 | with pytest.raises(ValueError): 41 | transformer.transform(df_na[df_vartypes.columns]) 42 | 43 | 44 | def test_error_if_df_contains_outside_range_values(df_vartypes): 45 | # test error when data contains value outside range [0, +1] 46 | df_out_range = df_vartypes.copy() 47 | df_out_range.loc[1, "Marks"] = 2 48 | 49 | transformer = ArcsinTransformer(variables=["Marks"]) 50 | # test case 4: when variable contains value outside range, fit 51 | with pytest.raises(ValueError): 52 | transformer.fit(df_out_range) 53 | 54 | # test case 5: when variable contains value outside range, transform 55 | transformer.fit(df_vartypes) 56 | with pytest.raises(ValueError): 57 | transformer.transform(df_out_range) 58 | 59 | # when selecting variables automatically and some are outside range 60 | transformer = ArcsinTransformer() 61 | with pytest.raises(ValueError): 62 | transformer.fit(df_vartypes) 63 | 64 | 65 | def test_non_fitted_error(df_vartypes): 66 | transformer = ArcsinTransformer(variables="Marks") 67 | with pytest.raises(NotFittedError): 68 | transformer.transform(df_vartypes) 69 | -------------------------------------------------------------------------------- /tests/test_transformation/test_boxcox_transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from sklearn.exceptions import NotFittedError 4 | 5 | from feature_engine.transformation import BoxCoxTransformer 6 | 7 | 8 | def test_automatically_finds_variables(df_vartypes): 9 | # test case 1: automatically select variables 10 | transformer = BoxCoxTransformer(variables=None) 11 | X = transformer.fit_transform(df_vartypes) 12 | 13 | # expected output 14 | transf_df = df_vartypes.copy() 15 | transf_df["Age"] = [9.78731, 10.1666, 9.40189, 9.0099] 16 | transf_df["Marks"] = [-0.101687, -0.207092, -0.316843, -0.431788] 17 | 18 | # test init params 19 | assert transformer.variables is None 20 | # test fit attr 21 | assert transformer.variables_ == ["Age", "Marks"] 22 | assert transformer.n_features_in_ == 5 23 | # test transform output 24 | pd.testing.assert_frame_equal(X, transf_df) 25 | 26 | # test inverse_transform 27 | Xit = transformer.inverse_transform(X) 28 | 29 | # convert numbers to original format. 
30 | Xit["Age"] = Xit["Age"].round().astype("int64") 31 | Xit["Marks"] = Xit["Marks"].round(1) 32 | 33 | # test 34 | pd.testing.assert_frame_equal(Xit, df_vartypes) 35 | 36 | 37 | def test_fit_raises_error_if_df_contains_na(df_na): 38 | # test case 2: when dataset contains na, fit method 39 | transformer = BoxCoxTransformer() 40 | with pytest.raises(ValueError): 41 | transformer.fit(df_na) 42 | 43 | 44 | def test_transform_raises_error_if_df_contains_na(df_vartypes, df_na): 45 | # test case 3: when dataset contains na, transform method 46 | transformer = BoxCoxTransformer() 47 | transformer.fit(df_vartypes) 48 | with pytest.raises(ValueError): 49 | transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]]) 50 | 51 | 52 | def test_error_if_df_contains_negative_values(df_vartypes): 53 | # test error when data contains negative values 54 | df_neg = df_vartypes.copy() 55 | df_neg.loc[1, "Age"] = -1 56 | 57 | # test case 4: when variable contains negative value, fit 58 | transformer = BoxCoxTransformer() 59 | with pytest.raises(ValueError): 60 | transformer.fit(df_neg) 61 | 62 | # test case 5: when variable contains negative value, transform 63 | transformer = BoxCoxTransformer() 64 | transformer.fit(df_vartypes) 65 | with pytest.raises(ValueError): 66 | transformer.transform(df_neg) 67 | 68 | 69 | def test_non_fitted_error(df_vartypes): 70 | transformer = BoxCoxTransformer() 71 | with pytest.raises(NotFittedError): 72 | transformer.transform(df_vartypes) 73 | -------------------------------------------------------------------------------- /tests/test_transformation/test_reciprocal_transformer.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from sklearn.exceptions import NotFittedError 4 | 5 | from feature_engine.transformation import ReciprocalTransformer 6 | 7 | 8 | def test_automatically_find_variables(df_vartypes): 9 | # test case 1: automatically select variables 10 | transformer = ReciprocalTransformer(variables=None) 11 | X = transformer.fit_transform(df_vartypes) 12 | 13 | # expected output 14 | transf_df = df_vartypes.copy() 15 | transf_df["Age"] = [0.05, 0.047619, 0.0526316, 0.0555556] 16 | transf_df["Marks"] = [1.11111, 1.25, 1.42857, 1.66667] 17 | 18 | # test init params 19 | assert transformer.variables is None 20 | # test fit attr 21 | assert transformer.variables_ == ["Age", "Marks"] 22 | assert transformer.n_features_in_ == 5 23 | # test transform output 24 | pd.testing.assert_frame_equal(X, transf_df) 25 | 26 | # test inverse_transform 27 | Xit = transformer.inverse_transform(X) 28 | 29 | # convert numbers to original format. 
30 | Xit["Age"] = Xit["Age"].round().astype("int64") 31 | Xit["Marks"] = Xit["Marks"].round(1) 32 | 33 | # test 34 | pd.testing.assert_frame_equal(Xit, df_vartypes) 35 | 36 | 37 | def test_fit_raises_error_if_na_in_df(df_na): 38 | # test case 2: when dataset contains na, fit method 39 | with pytest.raises(ValueError): 40 | transformer = ReciprocalTransformer() 41 | transformer.fit(df_na) 42 | 43 | 44 | def test_transform_raises_error_if_na_in_df(df_vartypes, df_na): 45 | # test case 3: when dataset contains na, transform method 46 | with pytest.raises(ValueError): 47 | transformer = ReciprocalTransformer() 48 | transformer.fit(df_vartypes) 49 | transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]]) 50 | 51 | 52 | def test_error_if_df_contains_0_as_value(df_vartypes): 53 | # test error when data contains value zero 54 | df_neg = df_vartypes.copy() 55 | df_neg.loc[1, "Age"] = 0 56 | 57 | # test case 4: when variable contains zero, fit 58 | with pytest.raises(ValueError): 59 | transformer = ReciprocalTransformer() 60 | transformer.fit(df_neg) 61 | 62 | # test case 5: when variable contains zero, transform 63 | with pytest.raises(ValueError): 64 | transformer = ReciprocalTransformer() 65 | transformer.fit(df_vartypes) 66 | transformer.transform(df_neg) 67 | 68 | 69 | def test_non_fitted_error(df_vartypes): 70 | with pytest.raises(NotFittedError): 71 | transformer = ReciprocalTransformer() 72 | transformer.transform(df_vartypes) 73 | -------------------------------------------------------------------------------- /tests/test_variable_handling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_variable_handling/__init__.py -------------------------------------------------------------------------------- /tests/test_variable_handling/conftest.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | 5 | @pytest.fixture 6 | def df(): 7 | df = pd.DataFrame( 8 | { 9 | "Name": ["tom", "nick", "krish", "jack"], 10 | "City": ["London", "Manchester", "Liverpool", "Bristol"], 11 | "Age": [20, 21, 19, 18], 12 | "Marks": [0.9, 0.8, 0.7, 0.6], 13 | "date_range": pd.date_range("2020-02-24", periods=4, freq="min"), 14 | "date_obj0": ["2020-02-24", "2020-02-25", "2020-02-26", "2020-02-27"], 15 | "date_range_tz": pd.date_range( 16 | "2020-02-24", periods=4, freq="min" 17 | ).tz_localize("UTC"), 18 | } 19 | ) 20 | df["Name"] = df["Name"].astype("category") 21 | return df 22 | 23 | 24 | @pytest.fixture 25 | def df_int(df): 26 | df = df.copy() 27 | df.columns = range(1, len(df.columns) + 1) 28 | return df 29 | 30 | 31 | @pytest.fixture 32 | def df_datetime(df): 33 | df = df.copy() 34 | 35 | df["date_obj1"] = ["01-Jan-2010", "24-Feb-1945", "14-Jun-2100", "17-May-1999"] 36 | df["date_obj2"] = ["10/11/12", "12/31/09", "06/30/95", "03/17/04"] 37 | df["time_obj"] = ["21:45:23", "09:15:33", "12:34:59", "03:27:02"] 38 | 39 | df["time_objTZ"] = df["time_obj"].add(["+5", "+11", "-3", "-8"]) 40 | df["date_obj1"] = df["date_obj1"].astype("category") 41 | df["Age"] = df["Age"].astype("O") 42 | return df 43 | -------------------------------------------------------------------------------- /tests/test_variable_handling/test_remove_variables.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | 4 | from 
from feature_engine.variable_handling.retain_variables import retain_variables_if_in_df

test_dict = [
    (
        pd.DataFrame(columns=["A", "B", "C", "D", "E"]),
        ["A", "C", "B", "G", "H"],
        ["A", "C", "B"],
        ["X", "Y"],
    ),
    (pd.DataFrame(columns=[1, 2, 3, 4, 5]), [1, 2, 4, 6], [1, 2, 4], [6, 7]),
    (pd.DataFrame(columns=[1, 2, 3, 4, 5]), 1, [1], 7),
    (pd.DataFrame(columns=["A", "B", "C", "D", "E"]), "C", ["C"], "G"),
]


@pytest.mark.parametrize("df, variables, overlap, col_not_in_df", test_dict)
def test_retain_variables_if_in_df(df, variables, overlap, col_not_in_df):

    msg = "None of the variables in the list are present in the dataframe."

    assert retain_variables_if_in_df(df, variables) == overlap

    with pytest.raises(ValueError) as record:
        retain_variables_if_in_df(df, col_not_in_df)
    assert str(record.value) == msg
--------------------------------------------------------------------------------
/tests/test_wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_wrappers/__init__.py
--------------------------------------------------------------------------------
/tests/test_wrappers/test_check_estimator_wrappers.py:
--------------------------------------------------------------------------------
import pytest
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.utils.estimator_checks import check_estimator
from sklearn.utils.fixes import parse_version

from feature_engine.wrappers import SklearnTransformerWrapper
from tests.estimator_checks.estimator_checks import (
    check_raises_error_when_input_not_a_df,
)
from tests.estimator_checks.fit_functionality_checks import check_feature_names_in
from tests.estimator_checks.non_fitted_error_checks import check_raises_non_fitted_error
from tests.estimator_checks.variable_selection_checks import (
    check_all_types_variables_assignment,
    check_numerical_variables_assignment,
)

sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)

if sklearn_version < parse_version("1.6"):

    def test_sklearn_transformer_wrapper():
        check_estimator(SklearnTransformerWrapper(transformer=SimpleImputer()))

else:

    def test_sklearn_transformer_wrapper():
        check_estimator(
            estimator=SklearnTransformerWrapper(transformer=SimpleImputer()),
            expected_failed_checks=SklearnTransformerWrapper(
                transformer=SimpleImputer()
            )._more_tags()["_xfail_checks"],
        )


@pytest.mark.parametrize(
    "estimator", [SklearnTransformerWrapper(transformer=OrdinalEncoder())]
)
def test_check_estimator_from_feature_engine(estimator):
    check_raises_non_fitted_error(estimator)
    check_raises_error_when_input_not_a_df(estimator)
    check_feature_names_in(estimator)


def test_check_variables_assignment():
    check_numerical_variables_assignment(
        SklearnTransformerWrapper(transformer=StandardScaler())
    )
    check_all_types_variables_assignment(
        SklearnTransformerWrapper(transformer=OrdinalEncoder())
    )


def test_raises_error_when_no_transformer_passed():
    # this transformer needs an estimator as an input param.
    with pytest.raises(TypeError):
        SklearnTransformerWrapper()
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py39, py310, py311-sklearn150, py311-sklearn160, py312, codecov, docs, stylechecks, typechecks
skipsdist = true

[testenv]
install_command = pip install {opts} {packages}
envdir = {toxworkdir}/unit_tests
setenv =
    PYTHONPATH=.
    COVERAGE_RCFILE = {envtmpdir}/coveragerc
commands =
    pytest tests

[testenv:py39]
deps =
    -rtest_requirements.txt

[testenv:py310]
deps =
    -rtest_requirements.txt

[testenv:py311-sklearn150]
deps =
    -rtest_requirements.txt
    scikit-learn==1.5.1

[testenv:py311-sklearn160]
deps =
    -rtest_requirements.txt
    scikit-learn==1.6.1

[testenv:py312]
deps =
    -rtest_requirements.txt

[testenv:codecov]
deps =
    -rtest_requirements.txt
commands_pre =
    {envpython} -c 'from pathlib import Path; Path(r"{env:COVERAGE_RCFILE}").write_text(Path(".coveragerc").read_text())'
commands =
    coverage run -m pytest -v
    coverage report

[testenv:docs]
deps =
    -r docs/requirements.txt
commands =
    sphinx-build -W -b html -d {envtmpdir}/doctrees docs {envtmpdir}/html

[testenv:stylechecks]
deps =
    flake8
commands = {posargs:flake8 feature_engine tests}

[testenv:typechecks]
deps =
    mypy
commands = {posargs:mypy feature_engine}

[flake8]
exclude = .git, env
# match black code formatter
max-line-length = 88

[isort]
profile = black
line_length = 88
lines_between_sections = 1
known_first_party = "sentry"
--------------------------------------------------------------------------------