├── .circleci
└── config.yml
├── .coveragerc
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── docs.md
│ ├── feature_request.md
│ └── jupyter-notebook-examples.md
└── workflow
│ └── workflow.yml
├── .gitignore
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.md
├── MANIFEST.in
├── README.md
├── docs
├── Makefile
├── _static
│ ├── css
│ │ └── feature-engine.css
│ └── js
│ │ └── copybutton.js
├── _templates
│ ├── class.rst
│ ├── layout.html
│ └── numpydoc_docstring.rst
├── about
│ ├── about.rst
│ ├── authors.rst
│ ├── former_authors.rst
│ ├── governance.rst
│ ├── index.rst
│ └── roadmap.rst
├── api_doc
│ ├── creation
│ │ ├── CyclicalFeatures.rst
│ │ ├── DecisionTreeFeatures.rst
│ │ ├── MathFeatures.rst
│ │ ├── RelativeFeatures.rst
│ │ └── index.rst
│ ├── datasets
│ │ ├── index.rst
│ │ └── titanic.rst
│ ├── datetime
│ │ ├── DatetimeFeatures.rst
│ │ ├── DatetimeSubtraction.rst
│ │ └── index.rst
│ ├── discretisation
│ │ ├── ArbitraryDiscretiser.rst
│ │ ├── DecisionTreeDiscretiser.rst
│ │ ├── EqualFrequencyDiscretiser.rst
│ │ ├── EqualWidthDiscretiser.rst
│ │ ├── GeometricWidthDiscretiser.rst
│ │ └── index.rst
│ ├── encoding
│ │ ├── CountFrequencyEncoder.rst
│ │ ├── DecisionTreeEncoder.rst
│ │ ├── MeanEncoder.rst
│ │ ├── OneHotEncoder.rst
│ │ ├── OrdinalEncoder.rst
│ │ ├── RareLabelEncoder.rst
│ │ ├── StringSimilarityEncoder.rst
│ │ ├── WoEEncoder.rst
│ │ └── index.rst
│ ├── imputation
│ │ ├── AddMissingIndicator.rst
│ │ ├── ArbitraryNumberImputer.rst
│ │ ├── CategoricalImputer.rst
│ │ ├── DropMissingData.rst
│ │ ├── EndTailImputer.rst
│ │ ├── MeanMedianImputer.rst
│ │ ├── RandomSampleImputer.rst
│ │ └── index.rst
│ ├── index.rst
│ ├── outliers
│ │ ├── ArbitraryOutlierCapper.rst
│ │ ├── OutlierTrimmer.rst
│ │ ├── Winsorizer.rst
│ │ └── index.rst
│ ├── pipeline
│ │ ├── Pipeline.rst
│ │ ├── index.rst
│ │ └── make_pipeline.rst
│ ├── preprocessing
│ │ ├── MatchCategories.rst
│ │ ├── MatchVariables.rst
│ │ └── index.rst
│ ├── scaling
│ │ ├── MeanNormalizationScaler.rst
│ │ └── index.rst
│ ├── selection
│ │ ├── DropConstantFeatures.rst
│ │ ├── DropCorrelatedFeatures.rst
│ │ ├── DropDuplicateFeatures.rst
│ │ ├── DropFeatures.rst
│ │ ├── DropHighPSIFeatures.rst
│ │ ├── MRMR.rst
│ │ ├── ProbeFeatureSelection.rst
│ │ ├── RecursiveFeatureAddition.rst
│ │ ├── RecursiveFeatureElimination.rst
│ │ ├── SelectByInformationValue.rst
│ │ ├── SelectByShuffling.rst
│ │ ├── SelectBySingleFeaturePerformance.rst
│ │ ├── SelectByTargetMeanPerformance.rst
│ │ ├── SmartCorrelatedSelection.rst
│ │ └── index.rst
│ ├── timeseries
│ │ ├── forecasting
│ │ │ ├── ExpandingWindowFeatures.rst
│ │ │ ├── LagFeatures.rst
│ │ │ ├── WindowFeatures.rst
│ │ │ └── index.rst
│ │ └── index.rst
│ ├── transformation
│ │ ├── ArcsinTransformer.rst
│ │ ├── BoxCoxTransformer.rst
│ │ ├── LogCpTransformer.rst
│ │ ├── LogTransformer.rst
│ │ ├── PowerTransformer.rst
│ │ ├── ReciprocalTransformer.rst
│ │ ├── YeoJohnsonTransformer.rst
│ │ └── index.rst
│ ├── variable_handling
│ │ ├── check_all_variables.rst
│ │ ├── check_categorical_variables.rst
│ │ ├── check_datetime_variables.rst
│ │ ├── check_numerical_variables.rst
│ │ ├── find_all_variables.rst
│ │ ├── find_categorical_and_numerical_variables.rst
│ │ ├── find_categorical_variables.rst
│ │ ├── find_datetime_variables.rst
│ │ ├── find_numerical_variables.rst
│ │ ├── index.rst
│ │ └── retain_variables_if_in_df.rst
│ └── wrappers
│ │ ├── Wrapper.rst
│ │ └── index.rst
├── conf.py
├── contribute
│ ├── code_of_conduct.rst
│ ├── contribute_code.rst
│ ├── contribute_docs.rst
│ ├── contribute_jup.rst
│ ├── contribute_other.rst
│ └── index.rst
├── donate.rst
├── images
│ ├── 1024px-Relationship_between_mean_and_median_under_different_skewness.png
│ ├── Discretisation.png
│ ├── FeatureEnginePackageStructure.png
│ ├── FeatureEnginePackageStructureCrossSectional.png
│ ├── FeatureEnginePackageStructureDatetimeText.png
│ ├── FeatureEnginePackageStructureTimeseries.png
│ ├── PSI_distribution_case1.png
│ ├── PSI_distribution_case3.png
│ ├── PSI_distribution_case4.png
│ ├── PSI_distribution_case5.png
│ ├── Variable_Transformation.png
│ ├── arbitraryvalueimputation.png
│ ├── bmilogcp.png
│ ├── bmiraw.png
│ ├── boxplot-age-percentiles.png
│ ├── boxplot-age.png
│ ├── boxplot-fare-mad.png
│ ├── boxplot-fare.png
│ ├── boxplot-sibsp-fare-iqr.png
│ ├── boxplot-sibsp.png
│ ├── boxplot-titanic.png
│ ├── breast_cancer_arcsin.png
│ ├── breast_cancer_raw.png
│ ├── cookbook.png
│ ├── dmlm.png
│ ├── endtailimputer.png
│ ├── equalfrequencydiscretisation.png
│ ├── equalfrequencydiscretisation_gaussian.png
│ ├── equalfrequencydiscretisation_skewed.png
│ ├── equalwidthdiscretisation.png
│ ├── f_statistic.png
│ ├── feml.png
│ ├── fetsf.png
│ ├── fork.png
│ ├── frequentcategoryimputer.png
│ ├── fsml.png
│ ├── fsmlbook.png
│ ├── fwml.png
│ ├── hour_sin.png
│ ├── hour_sin2.png
│ ├── hour_sin3.png
│ ├── hour_sin4.png
│ ├── increasingwidthdisc.png
│ ├── ivml_logo.png
│ ├── logcpraw.png
│ ├── logcptransform.png
│ ├── logo
│ │ ├── FeatureEngine.png
│ │ ├── Logo.png
│ │ ├── Logo_name.png
│ │ ├── favicon.png
│ │ └── logo.svg
│ ├── lotarea_pt.png
│ ├── lotarea_pt_custom_exp.png
│ ├── lotarea_raw.png
│ ├── lotareaboxcox.png
│ ├── lotarealog.png
│ ├── lotareapower.png
│ ├── lotarearaw.png
│ ├── lotareareciprocal.png
│ ├── lotareayeojohnson.png
│ ├── lotshape-price-per-cat-enc.png
│ ├── lotshape-price-per-cat.png
│ ├── meanmedianimputater_distributions.png
│ ├── medianimputation.png
│ ├── medinc_disc_arbitrarily.png
│ ├── medinc_disc_arbitrarily2.png
│ ├── medinc_hist.png
│ ├── missingcategoryimputer.png
│ ├── missingindicator.png
│ ├── mli_logo.png
│ ├── monotonic.png
│ ├── mzoning-price-per-cat-enc.png
│ ├── mzoning-price-per-cat.png
│ ├── nonnormalvars2.png
│ ├── nonnormalvars2logtransformed.png
│ ├── nonnormalvars2transformed.png
│ ├── ordinal_encoding_monotonic.png
│ ├── pipelineprediction.png
│ ├── probe-importance-std.png
│ ├── probe_feature_normal.png
│ ├── probe_features.png
│ ├── quasiconstant.png
│ ├── randomsampleimputation.png
│ ├── reciprocal_transformer
│ │ ├── reciprocal_transfomer_inverse.png
│ │ ├── reciprocal_transfomer_new.png
│ │ ├── reciprocal_transfomer_original.png
│ │ ├── reciprocal_transformer_3plots_new.png
│ │ └── reciprocal_transformer_3plots_original.png
│ ├── rfa_linreg_imp.png
│ ├── rfa_perf_drifts.png
│ ├── rfe_perf_drift.png
│ ├── rfimportancemrmr.png
│ ├── selectionChart.png
│ ├── shuffle-features-std.png
│ ├── single-feature-perf-std.png
│ ├── single_feature_probes_imp.png
│ ├── sponsors
│ │ ├── call_for_sponsors.png
│ │ ├── how-did-you-discover.png
│ │ └── trainindata.png
│ ├── summary
│ │ ├── imputersSummary.png
│ │ └── selectionSummary.png
│ ├── target-mean-sel-std.png
│ ├── toydata_pt_raw.png
│ ├── toydata_pt_transformed.png
│ ├── toydata_pt_transformed_custom_exp.png
│ ├── transformedcoupleYJ.png
│ ├── treediscretisation.png
│ ├── treemonotonicprediction.png
│ ├── treepredictionrounded.png
│ ├── untransformedcoupleYJ.png
│ ├── woe_encoding.png
│ ├── woe_prediction.png
│ └── yeojohnsonformula.png
├── index.rst
├── quickstart
│ ├── datasets.rst
│ └── index.rst
├── requirements.txt
├── resources
│ ├── blogs.rst
│ ├── books.rst
│ ├── courses.rst
│ ├── index.rst
│ └── tutorials.rst
├── sphinxext
│ ├── LICENSE.txt
│ ├── README.txt
│ └── github_link.py
├── user_guide
│ ├── creation
│ │ ├── CyclicalFeatures.rst
│ │ ├── DecisionTreeFeatures.rst
│ │ ├── MathFeatures.rst
│ │ ├── RelativeFeatures.rst
│ │ └── index.rst
│ ├── datetime
│ │ ├── DatetimeFeatures.rst
│ │ ├── DatetimeSubtraction.rst
│ │ └── index.rst
│ ├── discretisation
│ │ ├── ArbitraryDiscretiser.rst
│ │ ├── DecisionTreeDiscretiser.rst
│ │ ├── EqualFrequencyDiscretiser.rst
│ │ ├── EqualWidthDiscretiser.rst
│ │ ├── GeometricWidthDiscretiser.rst
│ │ └── index.rst
│ ├── encoding
│ │ ├── CountFrequencyEncoder.rst
│ │ ├── DecisionTreeEncoder.rst
│ │ ├── MeanEncoder.rst
│ │ ├── OneHotEncoder.rst
│ │ ├── OrdinalEncoder.rst
│ │ ├── RareLabelEncoder.rst
│ │ ├── StringSimilarityEncoder.rst
│ │ ├── WoEEncoder.rst
│ │ └── index.rst
│ ├── imputation
│ │ ├── AddMissingIndicator.rst
│ │ ├── ArbitraryNumberImputer.rst
│ │ ├── CategoricalImputer.rst
│ │ ├── DropMissingData.rst
│ │ ├── EndTailImputer.rst
│ │ ├── MeanMedianImputer.rst
│ │ ├── RandomSampleImputer.rst
│ │ └── index.rst
│ ├── index.rst
│ ├── outliers
│ │ ├── ArbitraryOutlierCapper.rst
│ │ ├── OutlierTrimmer.rst
│ │ ├── Winsorizer.rst
│ │ └── index.rst
│ ├── pipeline
│ │ ├── Pipeline.rst
│ │ ├── index.rst
│ │ └── make_pipeline.rst
│ ├── preprocessing
│ │ ├── MatchCategories.rst
│ │ ├── MatchVariables.rst
│ │ └── index.rst
│ ├── scaling
│ │ ├── MeanNormalizationScaler.rst
│ │ └── index.rst
│ ├── selection
│ │ ├── DropConstantFeatures.rst
│ │ ├── DropCorrelatedFeatures.rst
│ │ ├── DropDuplicateFeatures.rst
│ │ ├── DropFeatures.rst
│ │ ├── DropHighPSIFeatures.rst
│ │ ├── MRMR.rst
│ │ ├── ProbeFeatureSelection.rst
│ │ ├── RecursiveFeatureAddition.rst
│ │ ├── RecursiveFeatureElimination.rst
│ │ ├── SelectByInformationValue.rst
│ │ ├── SelectByShuffling.rst
│ │ ├── SelectBySingleFeaturePerformance.rst
│ │ ├── SelectByTargetMeanPerformance.rst
│ │ ├── SmartCorrelatedSelection.rst
│ │ └── index.rst
│ ├── timeseries
│ │ ├── forecasting
│ │ │ ├── ExpandingWindowFeatures.rst
│ │ │ ├── LagFeatures.rst
│ │ │ ├── WindowFeatures.rst
│ │ │ └── index.rst
│ │ └── index.rst
│ ├── transformation
│ │ ├── ArcsinTransformer.rst
│ │ ├── BoxCoxTransformer.rst
│ │ ├── LogCpTransformer.rst
│ │ ├── LogTransformer.rst
│ │ ├── PowerTransformer.rst
│ │ ├── ReciprocalTransformer.rst
│ │ ├── YeoJohnsonTransformer.rst
│ │ └── index.rst
│ ├── variable_handling
│ │ ├── check_all_variables.rst
│ │ ├── check_categorical_variables.rst
│ │ ├── check_datetime_variables.rst
│ │ ├── check_numerical_variables.rst
│ │ ├── find_all_variables.rst
│ │ ├── find_categorical_and_numerical_variables.rst
│ │ ├── find_categorical_variables.rst
│ │ ├── find_datetime_variables.rst
│ │ ├── find_numerical_variables.rst
│ │ ├── index.rst
│ │ └── retain_variables_if_in_df.rst
│ └── wrappers
│ │ ├── Wrapper.rst
│ │ └── index.rst
├── versions
│ └── index.rst
└── whats_new
│ ├── index.rst
│ ├── v_06.rst
│ ├── v_1.rst
│ ├── v_120.rst
│ ├── v_130.rst
│ ├── v_140.rst
│ ├── v_150.rst
│ ├── v_160.rst
│ ├── v_170.rst
│ └── v_180.rst
├── feature_engine
├── VERSION
├── __init__.py
├── _base_transformers
│ ├── __init__.py
│ ├── base_numerical.py
│ └── mixins.py
├── _check_init_parameters
│ ├── __init__.py
│ ├── check_init_input_params.py
│ ├── check_input_dictionary.py
│ └── check_variables.py
├── _docstrings
│ ├── __init__.py
│ ├── fit_attributes.py
│ ├── init_parameters
│ │ ├── __init__.py
│ │ ├── all_trasnformers.py
│ │ ├── creation.py
│ │ ├── discretisers.py
│ │ ├── encoders.py
│ │ ├── outliers.py
│ │ └── selection.py
│ ├── methods.py
│ ├── selection
│ │ ├── __init__.py
│ │ └── _docstring.py
│ └── substitute.py
├── _prediction
│ ├── __init__.py
│ ├── base_predictor.py
│ ├── target_mean_classifier.py
│ └── target_mean_regressor.py
├── creation
│ ├── __init__.py
│ ├── base_creation.py
│ ├── cyclical_features.py
│ ├── decision_tree_features.py
│ ├── math_features.py
│ └── relative_features.py
├── dataframe_checks.py
├── datasets
│ ├── __init__.py
│ └── titanic.py
├── datetime
│ ├── __init__.py
│ ├── _datetime_constants.py
│ ├── datetime.py
│ └── datetime_subtraction.py
├── discretisation
│ ├── __init__.py
│ ├── arbitrary.py
│ ├── base_discretiser.py
│ ├── decision_tree.py
│ ├── equal_frequency.py
│ ├── equal_width.py
│ └── geometric_width.py
├── encoding
│ ├── __init__.py
│ ├── _helper_functions.py
│ ├── base_encoder.py
│ ├── count_frequency.py
│ ├── decision_tree.py
│ ├── mean_encoding.py
│ ├── one_hot.py
│ ├── ordinal.py
│ ├── rare_label.py
│ ├── similarity_encoder.py
│ └── woe.py
├── imputation
│ ├── __init__.py
│ ├── arbitrary_number.py
│ ├── base_imputer.py
│ ├── categorical.py
│ ├── drop_missing_data.py
│ ├── end_tail.py
│ ├── mean_median.py
│ ├── missing_indicator.py
│ └── random_sample.py
├── outliers
│ ├── __init__.py
│ ├── artbitrary.py
│ ├── base_outlier.py
│ ├── trimmer.py
│ └── winsorizer.py
├── pipeline
│ ├── __init__.py
│ └── pipeline.py
├── preprocessing
│ ├── __init__.py
│ ├── match_categories.py
│ └── match_columns.py
├── py.typed
├── scaling
│ ├── __init__.py
│ └── mean_normalization.py
├── selection
│ ├── __init__.py
│ ├── _selection_constants.py
│ ├── base_recursive_selector.py
│ ├── base_selection_functions.py
│ ├── base_selector.py
│ ├── drop_constant_features.py
│ ├── drop_correlated_features.py
│ ├── drop_duplicate_features.py
│ ├── drop_features.py
│ ├── drop_psi_features.py
│ ├── information_value.py
│ ├── mrmr.py
│ ├── probe_feature_selection.py
│ ├── recursive_feature_addition.py
│ ├── recursive_feature_elimination.py
│ ├── shuffle_features.py
│ ├── single_feature_performance.py
│ ├── smart_correlation_selection.py
│ └── target_mean_selection.py
├── tags.py
├── timeseries
│ ├── __init__.py
│ └── forecasting
│ │ ├── __init__.py
│ │ ├── base_forecast_transformers.py
│ │ ├── expanding_window_features.py
│ │ ├── lag_features.py
│ │ └── window_features.py
├── transformation
│ ├── __init__.py
│ ├── arcsin.py
│ ├── boxcox.py
│ ├── log.py
│ ├── power.py
│ ├── reciprocal.py
│ └── yeojohnson.py
├── variable_handling
│ ├── __init__.py
│ ├── _variable_type_checks.py
│ ├── check_variables.py
│ ├── dtypes.py
│ ├── find_variables.py
│ └── retain_variables.py
└── wrappers
│ ├── __init__.py
│ └── wrappers.py
├── mypy.ini
├── paper
├── paper.bib
└── paper.md
├── pytest.ini
├── requirements.txt
├── setup.py
├── test_requirements.txt
├── tests
├── __init__.py
├── check_estimators_with_parametrize_tests.py
├── conftest.py
├── estimator_checks
│ ├── __init__.py
│ ├── dataframe_for_checks.py
│ ├── estimator_checks.py
│ ├── fit_functionality_checks.py
│ ├── get_feature_names_out_checks.py
│ ├── init_params_allowed_values_checks.py
│ ├── init_params_triggered_functionality_checks.py
│ ├── non_fitted_error_checks.py
│ └── variable_selection_checks.py
├── parametrize_with_checks_creation_v16.py
├── parametrize_with_checks_discretization_v16.py
├── parametrize_with_checks_encoders_v16.py
├── parametrize_with_checks_outliers_v16.py
├── parametrize_with_checks_prediction_v16.py
├── parametrize_with_checks_selection_v16.py
├── test_base_transformers
│ ├── test_base_numerical_transformer.py
│ ├── test_get_feature_names_out_mixin.py
│ └── test_transform_xy_mixin.py
├── test_check_init_parameters
│ ├── __init__.py
│ ├── test_check_init_input_params.py
│ ├── test_check_input_dictionary.py
│ └── test_check_variables.py
├── test_creation
│ ├── __init__.py
│ ├── test_check_estimator_creation.py
│ ├── test_cyclical_features.py
│ ├── test_decision_tree_features.py
│ ├── test_math_features.py
│ └── test_relative_features.py
├── test_dataframe_checks.py
├── test_datasets
│ ├── __init__().py
│ └── datasets.py
├── test_datetime
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_check_estimator_datetime.py
│ ├── test_datetime_features.py
│ └── test_datetime_subtraction.py
├── test_discretisation
│ ├── __init__.py
│ ├── test_arbitrary_discretiser.py
│ ├── test_base_discretizer.py
│ ├── test_check_estimator_discretisers.py
│ ├── test_decision_tree_discretiser.py
│ ├── test_equal_frequency_discretiser.py
│ ├── test_equal_width_discretiser.py
│ └── test_geometric_width_discretiser.py
├── test_encoding
│ ├── __init__.py
│ ├── test_base_encoders
│ │ ├── __init__.py
│ │ ├── test_categorical_init_mixin.py
│ │ ├── test_categorical_init_mixin_na.py
│ │ └── test_categorical_method_mixin.py
│ ├── test_check_estimator_encoders.py
│ ├── test_count_frequency_encoder.py
│ ├── test_decision_tree_encoder.py
│ ├── test_helper_functions.py
│ ├── test_mean_encoder.py
│ ├── test_onehot_encoder.py
│ ├── test_ordinal_encoder.py
│ ├── test_rare_label_encoder.py
│ ├── test_similarity_encoder.py
│ └── test_woe
│ │ ├── __init__.py
│ │ ├── test_woe_class.py
│ │ └── test_woe_encoder.py
├── test_imputation
│ ├── __init__.py
│ ├── test_arbitrary_number_imputer.py
│ ├── test_categorical_imputer.py
│ ├── test_check_estimator_imputers.py
│ ├── test_drop_missing_data.py
│ ├── test_end_tail_imputer.py
│ ├── test_mean_mdian_imputer.py
│ ├── test_missing_indicator.py
│ └── test_random_sample_imputer.py
├── test_outliers
│ ├── __init__.py
│ ├── test_arbitrary_capper.py
│ ├── test_check_estimator_outliers.py
│ ├── test_outlier_trimmer.py
│ └── test_winsorizer.py
├── test_pipeline
│ ├── test_pipeline.py
│ └── test_pipeline_sklearn.py
├── test_prediction
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_check_estimator_prediction.py
│ ├── test_target_mean_classifier.py
│ └── test_target_mean_regressor.py
├── test_preprocessing
│ ├── __init__.py
│ ├── test_check_estimator_preprocessing.py
│ ├── test_match_categories.py
│ └── test_match_columns.py
├── test_scaling
│ ├── __init__.py
│ └── test_mean_normalization.py
├── test_selection
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_base_selection_functions.py
│ ├── test_base_selector.py
│ ├── test_check_estimator_selectors.py
│ ├── test_drop_constant_features.py
│ ├── test_drop_correlated_features.py
│ ├── test_drop_duplicate_features.py
│ ├── test_drop_features.py
│ ├── test_drop_high_psi_features.py
│ ├── test_information_value.py
│ ├── test_mrmr.py
│ ├── test_probe_feature_selection.py
│ ├── test_recursive_feature_addition.py
│ ├── test_recursive_feature_elimination.py
│ ├── test_recursive_feature_selectors.py
│ ├── test_shuffle_features.py
│ ├── test_single_feature_performance.py
│ ├── test_smart_correlation_selection.py
│ └── test_target_mean_selection.py
├── test_sklearn_compatible
│ └── test_set_output.py
├── test_time_series
│ ├── __init__.py
│ └── test_forecasting
│ │ ├── __init__.py
│ │ ├── conftest.py
│ │ ├── test_check_estimator_forecasting.py
│ │ ├── test_expanding_window_features.py
│ │ ├── test_lag_features.py
│ │ └── test_window_features.py
├── test_transformation
│ ├── __init__.py
│ ├── test_arcsin_transformer.py
│ ├── test_boxcox_transformer.py
│ ├── test_check_estimator_transformers.py
│ ├── test_log_transformer.py
│ ├── test_logcp_transformer.py
│ ├── test_power_transformer.py
│ ├── test_reciprocal_transformer.py
│ └── test_yeojohnson_transformer.py
├── test_variable_handling
│ ├── __init__.py
│ ├── conftest.py
│ ├── test_check_variables.py
│ ├── test_find_variables.py
│ └── test_remove_variables.py
└── test_wrappers
│ ├── __init__.py
│ ├── test_check_estimator_wrappers.py
│ └── test_sklearn_wrapper.py
└── tox.ini
/.coveragerc:
--------------------------------------------------------------------------------
1 | # configuration for coverage.py
2 |
3 | [run]
4 | branch = True
5 | source = feature_engine
6 | include = */feature_engine/*
7 | omit =
8 | */setup.py
9 |
10 |
11 | [report]
12 | exclude_lines =
13 | pragma: no cover
14 |
15 | show_missing = True
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [solegalli]
4 | buy_me_a_coffee: solegalliy
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/docs.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Docs
3 | about: What documentation is missing?
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please let us know if you think there is information missing, or how else we can improve Feature-engine's documentation.
11 |
12 | If you are referring to an existing page, please paste the url.
13 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/jupyter-notebook-examples.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Jupyter notebook examples
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | Please let us know what is missing from the existing Jupyter notebook demos, or suggest a new demo that you think would be useful for the community.
11 |
--------------------------------------------------------------------------------
/.github/workflow/workflow.yml:
--------------------------------------------------------------------------------
1 | name: CodeCov
2 | on: [push, pull_request]
3 | jobs:
4 | run:
5 | runs-on: ubuntu-latest
6 | env:
7 | OS: ubuntu-latest
8 | PYTHON: '3.9'
9 | steps:
10 | - uses: actions/checkout@v3
11 | with:
12 | fetch-depth: '2'
13 |
14 | - name: Setup Python
15 | uses: actions/setup-python@master
16 | with:
17 | python-version: 3.9
18 | - name: Generate Report
19 | run: |
20 | pip install coverage
21 | coverage run -m pytest
22 | - name: Upload Coverage to Codecov
23 | uses: codecov/codecov-action@v3.1.1
24 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 | docs/build/
69 | build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | # Miscelaneous
109 | .idea
110 | .vscode
111 | *.csv
112 | *.DS_Store
113 | *.db
114 | *.pptx
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 | os: ubuntu-22.04
11 | tools:
12 | python: "3.11"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 | configuration: docs/conf.py
17 |
18 | # Build documentation with MkDocs
19 | #mkdocs:
20 | # configuration: mkdocs.yml
21 |
22 | # Optionally build your docs in additional formats such as PDF and ePub
23 | formats: all
24 |
25 | # Optionally set the version of Python and requirements required to build your docs
26 | python:
27 | install:
28 | - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributing to Feature-engine
2 | ==============================
3 |
4 | Feature-engine is a community-driven open-source project that relies on contributions from
5 | people like you. Every contribution, no matter how big or small, can make a significant
6 | impact on the project. If you've never contributed to an open-source project before, don't
7 | worry! Feature-engine is a great place to start. Your help will be appreciated and welcomed
8 | with gratitude.
9 |
10 | The latest contributing guide is available online at:
11 |
12 | https://feature-engine.trainindata.com/en/latest/contribute/index.html
13 |
14 | There are many ways to contribute to Feature-engine, with the most common ones
15 | being contribution of code or documentation to the project. Improving the
16 | documentation is no less important than improving the library itself. If you
17 | find a typo in the documentation, or have made improvements, do not hesitate to
18 | submit a GitHub pull request.
19 |
20 | Documentation can be found under the
21 | [docs/](https://github.com/feature-engine/feature_engine/tree/main/docs) directory.
22 |
23 | You can check out requested enhancements and current bugs on the
24 | [issue tracker](https://github.com/feature-engine/feature_engine/issues),
25 | and suggest a PR with the fix. Every contribution is valuable and decreases the burden
26 | on the project maintainer.
27 |
28 | Another way to contribute is to report issues you're facing, and give a "thumbs
29 | up" on issues that others reported and that are relevant to you. It also helps
30 | us if you spread the word: reference the project from your blog and articles,
31 | link to it from your website, or simply star it in GitHub to say "I use it".
32 |
33 | Quick links
34 | -----------
35 |
36 | * [Submitting a bug report or feature request](https://github.com/feature-engine/feature_engine/issues)
37 | * [Contributing code](https://feature-engine.trainindata.com/en/latest/contribute/contribute_code.html)
38 | * [Contributing docs](https://feature-engine.trainindata.com/en/latest/contribute/contribute_docs.html)
39 | * [Other ways to contribute](https://feature-engine.trainindata.com/en/latest/contribute/contribute_other.html)
40 |
41 | Code of Conduct
42 | ---------------
43 |
44 | We abide by the principles of openness, respect, and consideration of others
45 | of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018-2024 The Feature-engine developers.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include *.md
3 | include *.pkl
4 | recursive-include ./feature_engine/*
5 |
6 | include feature_engine/VERSION
7 |
8 | include ./requirements.txt
9 | include ./LICENSE
10 | exclude *.log
11 | exclude *.cfg
12 |
13 | recursive-exclude * __pycache__
14 | recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = feature_engine
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_static/css/feature-engine.css:
--------------------------------------------------------------------------------
1 | @import url("theme.css");
2 |
3 |
4 | /* Css template from sklearn:
5 | https://github.com/scikit-learn/scikit-learn/blob/f71c0313142c4e5f2f35a0021c36075cf8dba611/doc/themes/scikit-learn-modern/static/css/theme.css
6 | */
7 |
8 | /* authors */
9 | .authors-container {
10 | display: flex;
11 | flex-wrap: wrap;
12 | justify-content: center;
13 | }
14 |
15 |
16 | /* sponsors and testimonials */
17 |
18 | div.sk-sponsor-div, div.sk-testimonial-div {
19 | display: flex;
20 | flex-wrap: wrap;
21 | -webkit-flex-align: center;
22 | -ms-flex-align: center;
23 | -webkit-align-items: center;
24 | align-items: center;
25 | }
26 |
27 | div.sk-sponsor-div-box, div.sk-testimonial-div-box {
28 | width: 100%;
29 | }
30 |
31 | @media screen and (min-width: 500px) {
32 | div.sk-sponsor-div-box, div.sk-testimonial-div-box {
33 | width: 50%;
34 | }
35 | }
36 |
37 | .caption {
38 | text-align: center
39 | }
--------------------------------------------------------------------------------
/docs/_templates/class.rst:
--------------------------------------------------------------------------------
1 | {{objname}}
2 | {{ underline }}==============
3 |
4 | .. currentmodule:: {{ module }}
5 |
6 | .. autoclass:: {{ objname }}
7 |
8 | {% block methods %}
9 |
10 | {% if methods %}
11 | .. rubric:: Methods
12 |
13 | .. autosummary::
14 | {% for item in methods %}
15 | {% if '__init__' not in item %}
16 | ~{{ name }}.{{ item }}
17 | {% endif %}
18 | {%- endfor %}
19 | {% endif %}
20 | {% endblock %}
21 |
22 | .. include:: {{module}}.{{objname}}.examples
23 |
24 | .. raw:: html
25 |
26 |
27 |
--------------------------------------------------------------------------------
/docs/_templates/numpydoc_docstring.rst:
--------------------------------------------------------------------------------
1 | {{index}}
2 | {{summary}}
3 | {{extended_summary}}
4 | {{parameters}}
5 | {{returns}}
6 | {{yields}}
7 | {{other_parameters}}
8 | {{attributes}}
9 | {{raises}}
10 | {{warns}}
11 | {{warnings}}
12 | {{see_also}}
13 | {{notes}}
14 | {{references}}
15 | {{examples}}
16 | {{methods}}
--------------------------------------------------------------------------------
/docs/about/authors.rst:
--------------------------------------------------------------------------------
1 | .. raw :: html
2 |
[raw HTML block with author cards omitted from this dump; authors listed: Soledad Galli, Morgan Sell]
--------------------------------------------------------------------------------
/docs/about/former_authors.rst:
--------------------------------------------------------------------------------
1 | .. raw :: html
2 |
[raw HTML block with author cards omitted from this dump; former authors listed: Chris Samiullah, Nicolas Galli]
--------------------------------------------------------------------------------
/docs/about/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. _about:
3 |
4 | About
5 | =====
6 |
7 | In this section you will find information about Feature-engine's origin, main
8 | developers, roadmap and overall vision for the package. You will also find information
9 | about how to cite Feature-engine and our main sponsors.
10 |
11 | .. toctree::
12 | :maxdepth: 1
13 |
14 | about
15 | governance
16 | roadmap
--------------------------------------------------------------------------------
/docs/api_doc/creation/CyclicalFeatures.rst:
--------------------------------------------------------------------------------
1 | CyclicalFeatures
2 | ================
3 |
4 | .. autoclass:: feature_engine.creation.CyclicalFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/creation/DecisionTreeFeatures.rst:
--------------------------------------------------------------------------------
1 | DecisionTreeFeatures
2 | ====================
3 |
4 | .. autoclass:: feature_engine.creation.DecisionTreeFeatures
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/creation/MathFeatures.rst:
--------------------------------------------------------------------------------
1 | MathFeatures
2 | ============
3 |
4 | .. autoclass:: feature_engine.creation.MathFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/creation/RelativeFeatures.rst:
--------------------------------------------------------------------------------
1 | RelativeFeatures
2 | ================
3 |
4 | .. autoclass:: feature_engine.creation.RelativeFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/creation/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Feature Creation
4 | ================
5 |
6 | Feature-engine's creation transformers create and add new features to the dataframe
7 | by either combining or transforming existing features.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | MathFeatures
13 | RelativeFeatures
14 | CyclicalFeatures
15 | DecisionTreeFeatures
16 |
17 |
18 | Transformers in other Libraries
19 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
20 |
21 | Check also the following transformer from Scikit-learn:
22 |
23 | * `PolynomialFeatures <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html>`_
24 | * `SplineTransformer <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.SplineTransformer.html>`_
25 |
--------------------------------------------------------------------------------
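A minimal usage sketch for the creation transformers documented above (illustrative only: the DataFrame and column names are made up, and the parameters follow the MathFeatures docstring):

    # Illustrative sketch: combine two numerical columns into new features.
    import pandas as pd
    from feature_engine.creation import MathFeatures

    df = pd.DataFrame({
        "income": [2000, 3500, 4200],
        "expenses": [1500, 1800, 3900],
    })

    # Append the sum and the mean of the two variables as new columns.
    creator = MathFeatures(variables=["income", "expenses"], func=["sum", "mean"])
    df_t = creator.fit_transform(df)
    print(df_t.columns.tolist())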
/docs/api_doc/datasets/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Datasets
4 | ========
5 |
6 | We are starting to build a library of functions that allow you and us to quickly load
7 | datasets to demonstrate and test the functionality of Feature-engine (and, why not,
8 | other Python libraries).
9 |
10 | At the moment, we support the following functions:
11 |
12 | .. toctree::
13 | :maxdepth: 1
14 |
15 | titanic
16 |
--------------------------------------------------------------------------------
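A quick, illustrative sketch of the dataset loader referenced above (the call may download the data on first use; see the load_titanic docstring for the full signature):

    from feature_engine.datasets import load_titanic

    data = load_titanic()  # returns a pandas DataFrame with the titanic data
    print(data.shape)
    print(data.head())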
/docs/api_doc/datasets/titanic.rst:
--------------------------------------------------------------------------------
1 | load_titanic
2 | =============
3 |
4 | .. currentmodule:: feature_engine.datasets
5 |
6 | .. autofunction:: load_titanic
--------------------------------------------------------------------------------
/docs/api_doc/datetime/DatetimeFeatures.rst:
--------------------------------------------------------------------------------
1 | DatetimeFeatures
2 | ================
3 |
4 | .. autoclass:: feature_engine.datetime.DatetimeFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/datetime/DatetimeSubtraction.rst:
--------------------------------------------------------------------------------
1 | DatetimeSubtraction
2 | ===================
3 |
4 | .. autoclass:: feature_engine.datetime.DatetimeSubtraction
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/datetime/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Datetime Features
4 | =================
5 |
6 | Feature-engine's datetime transformers are able to extract a wide variety of datetime
7 | features from existing datetime or object-like data.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | DatetimeFeatures
13 | DatetimeSubtraction
14 |
15 |
--------------------------------------------------------------------------------
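An illustrative sketch of the datetime transformers described above (the small DataFrame is made up; parameter names follow the DatetimeFeatures docstring):

    # Extract calendar features from a datetime column.
    import pandas as pd
    from feature_engine.datetime import DatetimeFeatures

    df = pd.DataFrame(
        {"dob": pd.to_datetime(["2020-01-15", "2021-06-30", "2022-12-01"])}
    )

    dtf = DatetimeFeatures(
        variables=["dob"],
        features_to_extract=["year", "month", "day_of_week"],
    )
    df_t = dtf.fit_transform(df)
    print(df_t.head())  # new columns such as dob_year, dob_month, dob_day_of_week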
/docs/api_doc/discretisation/ArbitraryDiscretiser.rst:
--------------------------------------------------------------------------------
1 | ArbitraryDiscretiser
2 | ====================
3 |
4 | .. autoclass:: feature_engine.discretisation.ArbitraryDiscretiser
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/discretisation/DecisionTreeDiscretiser.rst:
--------------------------------------------------------------------------------
1 | DecisionTreeDiscretiser
2 | =======================
3 |
4 | .. autoclass:: feature_engine.discretisation.DecisionTreeDiscretiser
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/discretisation/EqualFrequencyDiscretiser.rst:
--------------------------------------------------------------------------------
1 | EqualFrequencyDiscretiser
2 | =========================
3 |
4 | .. autoclass:: feature_engine.discretisation.EqualFrequencyDiscretiser
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/discretisation/EqualWidthDiscretiser.rst:
--------------------------------------------------------------------------------
1 | EqualWidthDiscretiser
2 | =====================
3 |
4 | .. autoclass:: feature_engine.discretisation.EqualWidthDiscretiser
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/discretisation/GeometricWidthDiscretiser.rst:
--------------------------------------------------------------------------------
1 | GeometricWidthDiscretiser
2 | =========================
3 |
4 | .. autoclass:: feature_engine.discretisation.GeometricWidthDiscretiser
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/discretisation/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. currentmodule:: feature_engine.discretisation
3 |
4 | Discretisation
5 | ==============
6 |
7 | Feature-engine's discretisation transformers transform continuous variables into
8 | discrete features. This is accomplished, in general, by sorting the variable values
9 | into continuous intervals.
10 |
11 | **Summary**
12 |
13 | ===================================== ========================================================================
14 | Transformer Functionality
15 | ===================================== ========================================================================
16 | :class:`EqualFrequencyDiscretiser()` Sorts values into intervals with similar number of observations.
17 | :class:`EqualWidthDiscretiser()` Sorts values into intervals of equal size.
18 | :class:`ArbitraryDiscretiser()` Sorts values into intervals predefined by the user.
19 | :class:`DecisionTreeDiscretiser()` Replaces values by predictions of a decision tree, which are discrete.
20 | :class:`GeometricWidthDiscretiser()` Sorts variable into geometrical intervals.
21 | ===================================== ========================================================================
22 |
23 |
24 | .. toctree::
25 | :maxdepth: 1
26 | :hidden:
27 |
28 | EqualFrequencyDiscretiser
29 | EqualWidthDiscretiser
30 | ArbitraryDiscretiser
31 | DecisionTreeDiscretiser
32 | GeometricWidthDiscretiser
33 |
34 | Additional transformers for discretisation
35 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36 |
37 | For discretisation using K-means, check Scikit-learn's
38 | `KBinsDiscretizer <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html>`_.
39 |
--------------------------------------------------------------------------------
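A short, illustrative sketch of the discretisers summarised above (random toy data; parameters follow the EqualFrequencyDiscretiser docstring):

    # Sort a continuous variable into 4 equal-frequency bins.
    import numpy as np
    import pandas as pd
    from feature_engine.discretisation import EqualFrequencyDiscretiser

    rng = np.random.default_rng(0)
    df = pd.DataFrame({"age": rng.uniform(18, 90, size=100)})

    disc = EqualFrequencyDiscretiser(variables=["age"], q=4)
    df_t = disc.fit_transform(df)
    print(df_t["age"].value_counts())  # roughly 25 observations per bin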
/docs/api_doc/encoding/CountFrequencyEncoder.rst:
--------------------------------------------------------------------------------
1 | CountFrequencyEncoder
2 | =====================
3 |
4 | .. autoclass:: feature_engine.encoding.CountFrequencyEncoder
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/DecisionTreeEncoder.rst:
--------------------------------------------------------------------------------
1 | DecisionTreeEncoder
2 | ===================
3 |
4 | .. autoclass:: feature_engine.encoding.DecisionTreeEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/MeanEncoder.rst:
--------------------------------------------------------------------------------
1 | MeanEncoder
2 | ===========
3 |
4 | .. autoclass:: feature_engine.encoding.MeanEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/OneHotEncoder.rst:
--------------------------------------------------------------------------------
1 | OneHotEncoder
2 | =============
3 |
4 | .. autoclass:: feature_engine.encoding.OneHotEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/OrdinalEncoder.rst:
--------------------------------------------------------------------------------
1 | OrdinalEncoder
2 | ==============
3 |
4 | .. autoclass:: feature_engine.encoding.OrdinalEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/RareLabelEncoder.rst:
--------------------------------------------------------------------------------
1 | RareLabelEncoder
2 | ================
3 |
4 |
5 | .. autoclass:: feature_engine.encoding.RareLabelEncoder
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/StringSimilarityEncoder.rst:
--------------------------------------------------------------------------------
1 | StringSimilarityEncoder
2 | =======================
3 |
4 | .. autoclass:: feature_engine.encoding.StringSimilarityEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/WoEEncoder.rst:
--------------------------------------------------------------------------------
1 | WoEEncoder
2 | ==========
3 |
4 | .. autoclass:: feature_engine.encoding.WoEEncoder
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/encoding/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Categorical Encoding
4 | ====================
5 |
6 | Feature-engine's categorical encoders replace the categories of the variable with
7 | estimated or arbitrary numbers.
8 |
9 | **Summary of Feature-engine's encoders characteristics**
10 |
11 | ================================= ============ ================= ============== ===============================================================
12 | Transformer Regression Classification Multi-class Description
13 | ================================= ============ ================= ============== ===============================================================
14 | :class:`OneHotEncoder()` √ √ √ Adds dummy variables to represent each category
15 | :class:`OrdinalEncoder()` √ √ √ Replaces categories with an integer
16 | :class:`CountFrequencyEncoder()` √ √ √ Replaces categories with their count or frequency
17 | :class:`MeanEncoder()` √ √ x Replaces categories with the target mean value
18 | :class:`WoEEncoder()` x √ x Replaces categories with the weight of the evidence
19 | :class:`DecisionTreeEncoder()` √ √ √ Replaces categories with the predictions of a decision tree
20 | :class:`RareLabelEncoder()` √ √ √ Groups infrequent categories into a single one
21 | ================================= ============ ================= ============== ===============================================================
22 |
23 | Feature-engine's categorical encoders encode only variables of type categorical or
24 | object by default. From version 1.1.0, you have the option to set the parameter
25 | `ignore_format` to True to make the transformers also accept numerical variables as
26 | input.
27 |
28 |
29 | .. toctree::
30 | :maxdepth: 1
31 |
32 | OneHotEncoder
33 | CountFrequencyEncoder
34 | OrdinalEncoder
35 | MeanEncoder
36 | WoEEncoder
37 | DecisionTreeEncoder
38 | RareLabelEncoder
39 | StringSimilarityEncoder
40 |
41 | Other categorical encoding libraries
42 | ------------------------------------
43 |
44 | For additional categorical encoding transformations, visit the open-source package
45 | `Category encoders <https://contrib.scikit-learn.org/category_encoders/>`_.
46 |
--------------------------------------------------------------------------------
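An illustrative sketch of the encoders and the `ignore_format` behaviour described above (toy data; parameters follow the CountFrequencyEncoder docstring):

    # Replace categories with their frequency. By default only object or
    # categorical columns are accepted; ignore_format=True also allows
    # numerically-coded categorical variables such as 'doors'.
    import pandas as pd
    from feature_engine.encoding import CountFrequencyEncoder

    df = pd.DataFrame({
        "colour": ["blue", "blue", "red", "green", "red", "blue"],
        "doors": [3, 5, 5, 3, 5, 5],
    })

    enc = CountFrequencyEncoder(
        encoding_method="frequency",
        variables=["colour", "doors"],
        ignore_format=True,
    )
    df_t = enc.fit_transform(df)
    print(df_t)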
/docs/api_doc/imputation/AddMissingIndicator.rst:
--------------------------------------------------------------------------------
1 | AddMissingIndicator
2 | ===================
3 |
4 | .. autoclass:: feature_engine.imputation.AddMissingIndicator
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/ArbitraryNumberImputer.rst:
--------------------------------------------------------------------------------
1 | ArbitraryNumberImputer
2 | ======================
3 |
4 | .. autoclass:: feature_engine.imputation.ArbitraryNumberImputer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/CategoricalImputer.rst:
--------------------------------------------------------------------------------
1 | CategoricalImputer
2 | ==================
3 |
4 | .. autoclass:: feature_engine.imputation.CategoricalImputer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/DropMissingData.rst:
--------------------------------------------------------------------------------
1 | DropMissingData
2 | ===============
3 |
4 | .. autoclass:: feature_engine.imputation.DropMissingData
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/EndTailImputer.rst:
--------------------------------------------------------------------------------
1 | EndTailImputer
2 | ==============
3 |
4 | .. autoclass:: feature_engine.imputation.EndTailImputer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/MeanMedianImputer.rst:
--------------------------------------------------------------------------------
1 | MeanMedianImputer
2 | =================
3 |
4 | .. autoclass:: feature_engine.imputation.MeanMedianImputer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/imputation/RandomSampleImputer.rst:
--------------------------------------------------------------------------------
1 | RandomSampleImputer
2 | ===================
3 |
4 | .. autoclass:: feature_engine.imputation.RandomSampleImputer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/index.rst:
--------------------------------------------------------------------------------
1 | .. _api:
2 |
3 | API
4 | ===
5 |
6 | Full API documentation for Feature-engine transformers.
7 |
8 | Transformation
9 | --------------
10 |
11 | .. toctree::
12 | :maxdepth: 1
13 |
14 | imputation/index
15 | encoding/index
16 | discretisation/index
17 | outliers/index
18 | transformation/index
19 |
20 | Creation
21 | --------
22 |
23 | .. toctree::
24 | :maxdepth: 1
25 |
26 | creation/index
27 | datetime/index
28 |
29 |
30 | Selection
31 | ---------
32 | .. toctree::
33 | :maxdepth: 1
34 |
35 | selection/index
36 |
37 | Time series
38 | -----------
39 |
40 | .. toctree::
41 | :maxdepth: 1
42 |
43 | timeseries/index
44 |
45 | Other
46 | -----
47 | .. toctree::
48 | :maxdepth: 1
49 |
50 | preprocessing/index
51 | scaling/index
52 | wrappers/index
53 |
54 | Pipeline
55 | --------
56 | .. toctree::
57 | :maxdepth: 1
58 |
59 | pipeline/index
60 |
61 | Datasets
62 | --------
63 | .. toctree::
64 | :maxdepth: 1
65 |
66 | datasets/index
67 |
68 | Tools
69 | -----
70 | .. toctree::
71 | :maxdepth: 1
72 |
73 | variable_handling/index
--------------------------------------------------------------------------------
/docs/api_doc/outliers/ArbitraryOutlierCapper.rst:
--------------------------------------------------------------------------------
1 | ArbitraryOutlierCapper
2 | ======================
3 |
4 | .. autoclass:: feature_engine.outliers.ArbitraryOutlierCapper
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/outliers/OutlierTrimmer.rst:
--------------------------------------------------------------------------------
1 | OutlierTrimmer
2 | ==============
3 |
4 | .. autoclass:: feature_engine.outliers.OutlierTrimmer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/outliers/Winsorizer.rst:
--------------------------------------------------------------------------------
1 | Winsorizer
2 | ==========
3 |
4 | .. autoclass:: feature_engine.outliers.Winsorizer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/outliers/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | .. currentmodule:: feature_engine.outliers
4 |
5 | Outlier Handling
6 | ================
7 |
8 | Feature-engine's outlier transformers cap maximum or minimum values of a variable at an
9 | arbitrary or derived value. The OutlierTrimmer removes outliers from the dataset.
10 |
11 | =================================== ==============================================================
12 | Transformer Description
13 | =================================== ==============================================================
14 | :class:`Winsorizer()` Caps variables at automatically determined extreme values
15 | :class:`ArbitraryOutlierCapper()` Caps variables at values determined by the user
16 | :class:`OutlierTrimmer()` Removes outliers from the dataframe
17 | =================================== ==============================================================
18 |
19 | .. toctree::
20 | :maxdepth: 1
21 | :hidden:
22 |
23 | Winsorizer
24 | ArbitraryOutlierCapper
25 | OutlierTrimmer
--------------------------------------------------------------------------------
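An illustrative sketch of the capping behaviour summarised above (random toy data; parameters follow the Winsorizer docstring):

    # Cap right-tail outliers at the IQR proximity rule limit.
    import numpy as np
    import pandas as pd
    from feature_engine.outliers import Winsorizer

    rng = np.random.default_rng(42)
    df = pd.DataFrame({"fare": np.append(rng.normal(30, 5, 98), [300.0, 450.0])})

    capper = Winsorizer(capping_method="iqr", tail="right", fold=1.5, variables=["fare"])
    df_t = capper.fit_transform(df)
    print(df["fare"].max(), "->", df_t["fare"].max())  # extreme values are capped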
/docs/api_doc/pipeline/Pipeline.rst:
--------------------------------------------------------------------------------
1 | Pipeline
2 | ========
3 |
4 | .. autoclass:: feature_engine.pipeline.Pipeline
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/pipeline/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | .. currentmodule:: feature_engine.pipeline
4 |
5 | Pipeline
6 | ========
7 |
8 | Feature-engine's Pipeline is equivalent to Scikit-learn's pipeline and, in addition,
9 | it exposes the method `transform_x_y`, which adjusts both X and y in those cases where
10 | rows are removed from X.
11 |
12 | .. toctree::
13 | :maxdepth: 1
14 |
15 | Pipeline
16 | make_pipeline
17 |
--------------------------------------------------------------------------------
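A sketch of the `transform_x_y` behaviour described above, assuming a Feature-engine version that ships `feature_engine.pipeline.Pipeline` (toy data, made-up column names):

    # Drop incomplete rows while keeping X and y aligned.
    import numpy as np
    import pandas as pd
    from feature_engine.imputation import DropMissingData
    from feature_engine.pipeline import Pipeline

    X = pd.DataFrame({"x1": [1.0, np.nan, 3.0, 4.0], "x2": [10.0, 20.0, np.nan, 40.0]})
    y = pd.Series([0, 1, 0, 1])

    pipe = Pipeline([("drop_na", DropMissingData())])
    pipe.fit(X, y)
    X_t, y_t = pipe.transform_x_y(X, y)
    print(X_t.shape, y_t.shape)  # rows with NaN removed from both X and y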
/docs/api_doc/pipeline/make_pipeline.rst:
--------------------------------------------------------------------------------
1 | make_pipeline
2 | =============
3 |
4 | .. currentmodule:: feature_engine.pipeline
5 |
6 | .. autofunction:: make_pipeline
--------------------------------------------------------------------------------
/docs/api_doc/preprocessing/MatchCategories.rst:
--------------------------------------------------------------------------------
1 | MatchCategories
2 | ===============
3 |
4 | .. autoclass:: feature_engine.preprocessing.MatchCategories
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/preprocessing/MatchVariables.rst:
--------------------------------------------------------------------------------
1 | MatchVariables
2 | ==============
3 |
4 | .. autoclass:: feature_engine.preprocessing.MatchVariables
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/preprocessing/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Preprocessing
4 | =============
5 |
6 | Feature-engine's preprocessing transformers apply general data pre-processing
7 | and transformation procedures.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | MatchCategories
13 | MatchVariables
14 |
--------------------------------------------------------------------------------
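An illustrative sketch of the preprocessing transformers listed above (toy data; parameters follow the MatchVariables docstring):

    # MatchVariables reproduces in the test set the variables seen during fit,
    # dropping extra columns and adding missing ones filled with NaN.
    import pandas as pd
    from feature_engine.preprocessing import MatchVariables

    train = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    test = pd.DataFrame({"a": [7, 8], "c": [9, 10]})  # 'b' missing, 'c' is extra

    mv = MatchVariables(missing_values="ignore")
    mv.fit(train)
    print(mv.transform(test).columns.tolist())  # ['a', 'b']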
/docs/api_doc/scaling/MeanNormalizationScaler.rst:
--------------------------------------------------------------------------------
1 | MeanNormalizationScaler
2 | =======================
3 |
4 | .. autoclass:: feature_engine.scaling.MeanNormalizationScaler
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/scaling/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Scaling
4 | =======
5 |
6 | Feature-engine's scaling transformers apply various scaling techniques to
7 | given columns.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | MeanNormalizationScaler
13 |
--------------------------------------------------------------------------------
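A sketch of the scaler listed above (constructor arguments assumed from the class docstring; toy data):

    # Scale a column to (x - mean) / (max - min).
    import pandas as pd
    from feature_engine.scaling import MeanNormalizationScaler

    df = pd.DataFrame({"salary": [25000.0, 32000.0, 47000.0, 60000.0]})

    scaler = MeanNormalizationScaler(variables=["salary"])
    df_t = scaler.fit_transform(df)
    print(df_t["salary"].round(3).tolist())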
/docs/api_doc/selection/DropConstantFeatures.rst:
--------------------------------------------------------------------------------
1 | DropConstantFeatures
2 | ====================
3 |
4 | .. autoclass:: feature_engine.selection.DropConstantFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/DropCorrelatedFeatures.rst:
--------------------------------------------------------------------------------
1 | DropCorrelatedFeatures
2 | ======================
3 |
4 | .. autoclass:: feature_engine.selection.DropCorrelatedFeatures
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/DropDuplicateFeatures.rst:
--------------------------------------------------------------------------------
1 | DropDuplicateFeatures
2 | =====================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.DropDuplicateFeatures
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/DropFeatures.rst:
--------------------------------------------------------------------------------
1 | DropFeatures
2 | =============
3 |
4 | .. autoclass:: feature_engine.selection.DropFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/DropHighPSIFeatures.rst:
--------------------------------------------------------------------------------
1 | DropHighPSIFeatures
2 | ===================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.DropHighPSIFeatures
6 | :members:
--------------------------------------------------------------------------------
/docs/api_doc/selection/MRMR.rst:
--------------------------------------------------------------------------------
1 | MRMR
2 | ====
3 |
4 |
5 | .. autoclass:: feature_engine.selection.MRMR
6 | :members:
--------------------------------------------------------------------------------
/docs/api_doc/selection/ProbeFeatureSelection.rst:
--------------------------------------------------------------------------------
1 | ProbeFeatureSelection
2 | =====================
3 |
4 | .. autoclass:: feature_engine.selection.ProbeFeatureSelection
5 | :members:
--------------------------------------------------------------------------------
/docs/api_doc/selection/RecursiveFeatureAddition.rst:
--------------------------------------------------------------------------------
1 | RecursiveFeatureAddition
2 | ========================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.RecursiveFeatureAddition
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/RecursiveFeatureElimination.rst:
--------------------------------------------------------------------------------
1 | RecursiveFeatureElimination
2 | ============================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.RecursiveFeatureElimination
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/SelectByInformationValue.rst:
--------------------------------------------------------------------------------
1 | SelectByInformationValue
2 | ========================
3 |
4 | .. autoclass:: feature_engine.selection.SelectByInformationValue
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/SelectByShuffling.rst:
--------------------------------------------------------------------------------
1 | SelectByShuffling
2 | =================
3 |
4 | .. autoclass:: feature_engine.selection.SelectByShuffling
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/SelectBySingleFeaturePerformance.rst:
--------------------------------------------------------------------------------
1 | SelectBySingleFeaturePerformance
2 | ================================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.SelectBySingleFeaturePerformance
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/SelectByTargetMeanPerformance.rst:
--------------------------------------------------------------------------------
1 | SelectByTargetMeanPerformance
2 | =============================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.SelectByTargetMeanPerformance
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/selection/SmartCorrelatedSelection.rst:
--------------------------------------------------------------------------------
1 | SmartCorrelatedSelection
2 | ========================
3 |
4 |
5 | .. autoclass:: feature_engine.selection.SmartCorrelatedSelection
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/timeseries/forecasting/ExpandingWindowFeatures.rst:
--------------------------------------------------------------------------------
1 | ExpandingWindowFeatures
2 | =======================
3 |
4 | .. autoclass:: feature_engine.timeseries.forecasting.ExpandingWindowFeatures
5 | :members:
--------------------------------------------------------------------------------
/docs/api_doc/timeseries/forecasting/LagFeatures.rst:
--------------------------------------------------------------------------------
1 | LagFeatures
2 | ===========
3 |
4 | .. autoclass:: feature_engine.timeseries.forecasting.LagFeatures
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/timeseries/forecasting/WindowFeatures.rst:
--------------------------------------------------------------------------------
1 | WindowFeatures
2 | ==============
3 |
4 | .. autoclass:: feature_engine.timeseries.forecasting.WindowFeatures
5 | :members:
--------------------------------------------------------------------------------
/docs/api_doc/timeseries/forecasting/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Forecasting Features
4 | ====================
5 |
6 | Feature-engine's time series forecasting transformers add new features to the
7 | dataframe by lagging existing features or by computing statistics over windows of
8 | past values.
9 |
10 | .. toctree::
11 | :maxdepth: 1
12 |
13 | LagFeatures
14 | WindowFeatures
15 | ExpandingWindowFeatures
16 |
17 |
18 |
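19 | As a short sketch of lag features (the toy series and the parameters are
20 | illustrative assumptions):
21 |
22 | .. code-block:: python
23 |
24 |     import pandas as pd
25 |
26 |     from feature_engine.timeseries.forecasting import LagFeatures
27 |
28 |     X = pd.DataFrame(
29 |         {"sales": [10, 12, 11, 13]},
30 |         index=pd.date_range("2024-01-01", periods=4, freq="D"),
31 |     )
32 |
33 |     # Adds a new column with the value of "sales" one time step in the past.
34 |     Xt = LagFeatures(periods=1).fit_transform(X)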
--------------------------------------------------------------------------------
/docs/api_doc/timeseries/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Time Series Features
4 | ====================
5 |
6 | Feature-engine's time series transformers derive features from time series data.
7 |
8 | .. toctree::
9 | :maxdepth: 1
10 |
11 | forecasting/index
12 |
13 |
14 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/ArcsinTransformer.rst:
--------------------------------------------------------------------------------
1 | ArcsinTransformer
2 | =================
3 |
4 |
5 | .. autoclass:: feature_engine.transformation.ArcsinTransformer
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/BoxCoxTransformer.rst:
--------------------------------------------------------------------------------
1 | BoxCoxTransformer
2 | =================
3 |
4 | .. autoclass:: feature_engine.transformation.BoxCoxTransformer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/LogCpTransformer.rst:
--------------------------------------------------------------------------------
1 | LogCpTransformer
2 | ================
3 |
4 | .. autoclass:: feature_engine.transformation.LogCpTransformer
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/LogTransformer.rst:
--------------------------------------------------------------------------------
1 | LogTransformer
2 | ==============
3 |
4 |
5 | .. autoclass:: feature_engine.transformation.LogTransformer
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/PowerTransformer.rst:
--------------------------------------------------------------------------------
1 | PowerTransformer
2 | ================
3 |
4 |
5 | .. autoclass:: feature_engine.transformation.PowerTransformer
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/ReciprocalTransformer.rst:
--------------------------------------------------------------------------------
1 | ReciprocalTransformer
2 | =====================
3 |
4 |
5 | .. autoclass:: feature_engine.transformation.ReciprocalTransformer
6 | :members:
7 |
8 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/YeoJohnsonTransformer.rst:
--------------------------------------------------------------------------------
1 | YeoJohnsonTransformer
2 | =====================
3 |
4 | .. autoclass:: feature_engine.transformation.YeoJohnsonTransformer
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/transformation/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Variance Stabilizing Transformations
4 | ====================================
5 |
6 | Feature-engine's variable transformers apply various mathematical transformations
7 | to numerical variables.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | LogTransformer
13 | LogCpTransformer
14 | ReciprocalTransformer
15 | ArcsinTransformer
16 | PowerTransformer
17 | BoxCoxTransformer
18 | YeoJohnsonTransformer
19 |
20 |
21 | Transformers in other Libraries
22 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
23 |
24 | These and additional transformations can be obtained with the following Scikit-learn
25 | classes:
26 |
27 | * `FunctionTransformer `_
28 | * `PowerTransformer `_
29 |
30 | Note that the Scikit-learn classes return NumPy arrays and are applied to the entire dataset.
31 |
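32 | As a brief sketch with one of the transformers above (the data is an illustrative
33 | assumption):
34 |
35 | .. code-block:: python
36 |
37 |     import pandas as pd
38 |
39 |     from feature_engine.transformation import LogTransformer
40 |
41 |     X = pd.DataFrame({"income": [1000.0, 2500.0, 40000.0]})
42 |
43 |     # Replaces the variable with its natural logarithm; values must be positive.
44 |     Xt = LogTransformer(variables=["income"]).fit_transform(X)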
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/check_all_variables.rst:
--------------------------------------------------------------------------------
1 | check_all_variables
2 | ===================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: check_all_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/check_categorical_variables.rst:
--------------------------------------------------------------------------------
1 | check_categorical_variables
2 | ===========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: check_categorical_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/check_datetime_variables.rst:
--------------------------------------------------------------------------------
1 | check_datetime_variables
2 | ========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: check_datetime_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/check_numerical_variables.rst:
--------------------------------------------------------------------------------
1 | check_numerical_variables
2 | =========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: check_numerical_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/find_all_variables.rst:
--------------------------------------------------------------------------------
1 | find_all_variables
2 | ==================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: find_all_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/find_categorical_and_numerical_variables.rst:
--------------------------------------------------------------------------------
1 | find_categorical_and_numerical_variables
2 | ========================================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: find_categorical_and_numerical_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/find_categorical_variables.rst:
--------------------------------------------------------------------------------
1 | find_categorical_variables
2 | ==========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: find_categorical_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/find_datetime_variables.rst:
--------------------------------------------------------------------------------
1 | find_datetime_variables
2 | =======================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: find_datetime_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/find_numerical_variables.rst:
--------------------------------------------------------------------------------
1 | find_numerical_variables
2 | ========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: find_numerical_variables
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Variable handling functions
4 | ===========================
5 |
6 | This set of functions finds variables of a specific type in a dataframe, or checks that
7 | a list of variables is of a specified data type.
8 |
9 | The `find` functions take a dataframe as an argument and return a list with the names
10 | of the variables of the desired type.
11 |
12 | The `check` functions check that the variables in a list are all of the desired data type.
13 |
14 | The `retain` functions select the variables in a list if they fulfill a condition.
15 |
16 | These functions are used under the hood by all Feature-engine transformers to select the
17 | variables that they will modify.
18 |
19 | .. toctree::
20 | :maxdepth: 1
21 |
22 | find_all_variables
23 | find_categorical_variables
24 | find_datetime_variables
25 | find_numerical_variables
26 | find_categorical_and_numerical_variables
27 | check_all_variables
28 | check_categorical_variables
29 | check_datetime_variables
30 | check_numerical_variables
31 | retain_variables_if_in_df
32 |
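33 | As a quick illustration (the dataframe is an assumption):
34 |
35 | .. code-block:: python
36 |
37 |     import pandas as pd
38 |
39 |     from feature_engine.variable_handling import find_numerical_variables
40 |
41 |     df = pd.DataFrame({"age": [20, 30], "city": ["London", "Paris"]})
42 |
43 |     # Returns the names of the numerical variables in the dataframe.
44 |     find_numerical_variables(df)  # ['age']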
--------------------------------------------------------------------------------
/docs/api_doc/variable_handling/retain_variables_if_in_df.rst:
--------------------------------------------------------------------------------
1 | retain_variables_if_in_df
2 | =========================
3 |
4 | .. currentmodule:: feature_engine.variable_handling
5 |
6 | .. autofunction:: retain_variables_if_in_df
--------------------------------------------------------------------------------
/docs/api_doc/wrappers/Wrapper.rst:
--------------------------------------------------------------------------------
1 | SklearnTransformerWrapper
2 | =========================
3 |
4 | .. autoclass:: feature_engine.wrappers.SklearnTransformerWrapper
5 | :members:
6 |
7 |
--------------------------------------------------------------------------------
/docs/api_doc/wrappers/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | .. currentmodule:: feature_engine.wrappers
4 |
5 | Scikit-learn Wrapper
6 | ====================
7 |
8 | Feature-engine's Scikit-learn wrappers wrap Scikit-learn transformers, allowing you to
9 | apply them only to a selected subset of features.
10 |
11 | .. toctree::
12 | :maxdepth: 1
13 |
14 | Wrapper
15 |
16 | Other wrappers
17 | ~~~~~~~~~~~~~~
18 |
19 | The :class:`SklearnTransformerWrapper()` offers functionality similar to the
20 | `ColumnTransformer `_
21 | class available in Scikit-learn. They differ in how the variables to transform are
22 | selected.
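23 |
24 | As a minimal sketch of the wrapper (the data and the choice of Scikit-learn
25 | transformer are illustrative assumptions):
26 |
27 | .. code-block:: python
28 |
29 |     import pandas as pd
30 |     from sklearn.preprocessing import StandardScaler
31 |
32 |     from feature_engine.wrappers import SklearnTransformerWrapper
33 |
34 |     X = pd.DataFrame({"age": [20, 30, 40], "fare": [7.0, 30.0, 70.0]})
35 |
36 |     # Applies the StandardScaler to the variable "age" only; "fare" is returned
37 |     # unchanged, and the output remains a dataframe.
38 |     wrapper = SklearnTransformerWrapper(StandardScaler(), variables=["age"])
39 |     Xt = wrapper.fit_transform(X)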
--------------------------------------------------------------------------------
/docs/contribute/code_of_conduct.rst:
--------------------------------------------------------------------------------
1 | Code of Conduct
2 | ===============
3 |
4 | Feature-engine is an open source Python project. We follow the
5 | `Python Software Foundation Code of Conduct `_.
6 | All interactions among members of the Feature-engine community must meet those
7 | guidelines. This includes (but is not limited to) interactions through the mailing
8 | list, GitHub and StackOverflow.
9 |
10 | Everyone is expected to be open, considerate, and respectful of others no matter what
11 | their position is within the project. We show gratitude for any contribution, big or
12 | small. We welcome feedback and participation. We want to make Feature-engine a nice,
13 | welcoming and safe place for you to make your first contribution to open source, and
14 | hopefully the second, the third, and so on :).
15 |
--------------------------------------------------------------------------------
/docs/contribute/contribute_jup.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Contribute Jupyter notebooks
4 | ============================
5 |
6 | We created a collection of Jupyter notebooks that showcase the main functionality of
7 | Feature-engine's transformers. We link these notebooks throughout the main documentation
8 | to offer users more examples and details about transformers and how to use them.
9 |
10 | **Note** that the Jupyter notebooks are hosted in a separate
11 | `Github repository `_.
12 |
13 | Here are some guidelines on how to add a new notebook or update an existing one. The
14 | contribution workflow is the same as the one we use for the main code base.
15 |
16 | Jupyter contribution workflow
17 | -----------------------------
18 |
19 | 1. Fork the `Github repository `_.
20 | 2. Clone your fork into your local computer: `git clone https://github.com/<your-username>/feature-engine-examples.git`.
21 | 3. Navigate into the project directory: `cd feature-engine-examples`.
22 | 4. If you haven't done so yet, install feature-engine: `pip install feature_engine`.
23 | 5. Create a feature branch with a meaningful name: `git checkout -b mynotebookbranch`.
24 | 6. Develop your notebook.
25 | 7. Add the changes to your copy of the fork: `git add .`, `git commit -m "a meaningful commit message"`, `git push origin mynotebookbranch`.
26 | 8. Go to your fork on Github and make a PR to this repo.
27 | 9. Done.
28 |
29 | The review process for notebooks is usually much faster than for the main source code base.
30 |
31 | Jupyter creation guidelines
32 | ---------------------------
33 |
34 | If you want to add a new Jupyter notebook, there are a few things to note:
35 |
36 | - Make sure that the dataset you use is publicly available and has a clear license stating that it is free to use
37 | - Do not upload datasets to the repository
38 | - Add instructions on how to obtain and prepare the data for the demo
39 | - Throughout the notebook, explain what you are going to do next and what conclusions you draw from each output
40 |
41 | That's it! Fairly straightforward.
42 |
43 | We look forward to your contribution :)
--------------------------------------------------------------------------------
/docs/donate.rst:
--------------------------------------------------------------------------------
1 | Sponsor us
2 | ----------
3 |
4 | |
5 |
6 | .. image:: images/sponsors/call_for_sponsors.png
7 | :align: center
8 | :target: https://github.com/sponsors/feature-engine
9 |
10 | |
11 |
12 | Support Feature-engine financially through
13 | `Github Sponsors `_ and help further our
14 | mission to democratize machine learning and programming tools through open-source.
15 |
16 | Find more details about how we use donations on the
17 | `sponsors page `_.
18 |
19 | |
20 |
21 | Sponsors
22 | --------
23 |
24 | Feature-engine is a community-driven project; however, institutional, private and
25 | individual support helps to ensure its sustainability. The project would like to thank
26 | the following sponsors:
27 |
28 | |
29 |
30 | .. image:: images/sponsors/trainindata.png
31 | :width: 200pt
32 | :align: center
33 | :target: https://www.trainindata.com/
34 |
35 |
--------------------------------------------------------------------------------
/docs/images/1024px-Relationship_between_mean_and_median_under_different_skewness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/1024px-Relationship_between_mean_and_median_under_different_skewness.png
--------------------------------------------------------------------------------
/docs/images/Discretisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/Discretisation.png
--------------------------------------------------------------------------------
/docs/images/FeatureEnginePackageStructure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructure.png
--------------------------------------------------------------------------------
/docs/images/FeatureEnginePackageStructureCrossSectional.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureCrossSectional.png
--------------------------------------------------------------------------------
/docs/images/FeatureEnginePackageStructureDatetimeText.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureDatetimeText.png
--------------------------------------------------------------------------------
/docs/images/FeatureEnginePackageStructureTimeseries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/FeatureEnginePackageStructureTimeseries.png
--------------------------------------------------------------------------------
/docs/images/PSI_distribution_case1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case1.png
--------------------------------------------------------------------------------
/docs/images/PSI_distribution_case3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case3.png
--------------------------------------------------------------------------------
/docs/images/PSI_distribution_case4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case4.png
--------------------------------------------------------------------------------
/docs/images/PSI_distribution_case5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/PSI_distribution_case5.png
--------------------------------------------------------------------------------
/docs/images/Variable_Transformation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/Variable_Transformation.png
--------------------------------------------------------------------------------
/docs/images/arbitraryvalueimputation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/arbitraryvalueimputation.png
--------------------------------------------------------------------------------
/docs/images/bmilogcp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/bmilogcp.png
--------------------------------------------------------------------------------
/docs/images/bmiraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/bmiraw.png
--------------------------------------------------------------------------------
/docs/images/boxplot-age-percentiles.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-age-percentiles.png
--------------------------------------------------------------------------------
/docs/images/boxplot-age.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-age.png
--------------------------------------------------------------------------------
/docs/images/boxplot-fare-mad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-fare-mad.png
--------------------------------------------------------------------------------
/docs/images/boxplot-fare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-fare.png
--------------------------------------------------------------------------------
/docs/images/boxplot-sibsp-fare-iqr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-sibsp-fare-iqr.png
--------------------------------------------------------------------------------
/docs/images/boxplot-sibsp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-sibsp.png
--------------------------------------------------------------------------------
/docs/images/boxplot-titanic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/boxplot-titanic.png
--------------------------------------------------------------------------------
/docs/images/breast_cancer_arcsin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/breast_cancer_arcsin.png
--------------------------------------------------------------------------------
/docs/images/breast_cancer_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/breast_cancer_raw.png
--------------------------------------------------------------------------------
/docs/images/cookbook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/cookbook.png
--------------------------------------------------------------------------------
/docs/images/dmlm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/dmlm.png
--------------------------------------------------------------------------------
/docs/images/endtailimputer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/endtailimputer.png
--------------------------------------------------------------------------------
/docs/images/equalfrequencydiscretisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation.png
--------------------------------------------------------------------------------
/docs/images/equalfrequencydiscretisation_gaussian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation_gaussian.png
--------------------------------------------------------------------------------
/docs/images/equalfrequencydiscretisation_skewed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalfrequencydiscretisation_skewed.png
--------------------------------------------------------------------------------
/docs/images/equalwidthdiscretisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/equalwidthdiscretisation.png
--------------------------------------------------------------------------------
/docs/images/f_statistic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/f_statistic.png
--------------------------------------------------------------------------------
/docs/images/feml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/feml.png
--------------------------------------------------------------------------------
/docs/images/fetsf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fetsf.png
--------------------------------------------------------------------------------
/docs/images/fork.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fork.png
--------------------------------------------------------------------------------
/docs/images/frequentcategoryimputer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/frequentcategoryimputer.png
--------------------------------------------------------------------------------
/docs/images/fsml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fsml.png
--------------------------------------------------------------------------------
/docs/images/fsmlbook.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fsmlbook.png
--------------------------------------------------------------------------------
/docs/images/fwml.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/fwml.png
--------------------------------------------------------------------------------
/docs/images/hour_sin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin.png
--------------------------------------------------------------------------------
/docs/images/hour_sin2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin2.png
--------------------------------------------------------------------------------
/docs/images/hour_sin3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin3.png
--------------------------------------------------------------------------------
/docs/images/hour_sin4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/hour_sin4.png
--------------------------------------------------------------------------------
/docs/images/increasingwidthdisc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/increasingwidthdisc.png
--------------------------------------------------------------------------------
/docs/images/ivml_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/ivml_logo.png
--------------------------------------------------------------------------------
/docs/images/logcpraw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logcpraw.png
--------------------------------------------------------------------------------
/docs/images/logcptransform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logcptransform.png
--------------------------------------------------------------------------------
/docs/images/logo/FeatureEngine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/FeatureEngine.png
--------------------------------------------------------------------------------
/docs/images/logo/Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/Logo.png
--------------------------------------------------------------------------------
/docs/images/logo/Logo_name.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/Logo_name.png
--------------------------------------------------------------------------------
/docs/images/logo/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/logo/favicon.png
--------------------------------------------------------------------------------
/docs/images/lotarea_pt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_pt.png
--------------------------------------------------------------------------------
/docs/images/lotarea_pt_custom_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_pt_custom_exp.png
--------------------------------------------------------------------------------
/docs/images/lotarea_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarea_raw.png
--------------------------------------------------------------------------------
/docs/images/lotareaboxcox.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareaboxcox.png
--------------------------------------------------------------------------------
/docs/images/lotarealog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarealog.png
--------------------------------------------------------------------------------
/docs/images/lotareapower.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareapower.png
--------------------------------------------------------------------------------
/docs/images/lotarearaw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotarearaw.png
--------------------------------------------------------------------------------
/docs/images/lotareareciprocal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareareciprocal.png
--------------------------------------------------------------------------------
/docs/images/lotareayeojohnson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotareayeojohnson.png
--------------------------------------------------------------------------------
/docs/images/lotshape-price-per-cat-enc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotshape-price-per-cat-enc.png
--------------------------------------------------------------------------------
/docs/images/lotshape-price-per-cat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/lotshape-price-per-cat.png
--------------------------------------------------------------------------------
/docs/images/meanmedianimputater_distributions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/meanmedianimputater_distributions.png
--------------------------------------------------------------------------------
/docs/images/medianimputation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medianimputation.png
--------------------------------------------------------------------------------
/docs/images/medinc_disc_arbitrarily.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_disc_arbitrarily.png
--------------------------------------------------------------------------------
/docs/images/medinc_disc_arbitrarily2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_disc_arbitrarily2.png
--------------------------------------------------------------------------------
/docs/images/medinc_hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/medinc_hist.png
--------------------------------------------------------------------------------
/docs/images/missingcategoryimputer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/missingcategoryimputer.png
--------------------------------------------------------------------------------
/docs/images/missingindicator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/missingindicator.png
--------------------------------------------------------------------------------
/docs/images/mli_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mli_logo.png
--------------------------------------------------------------------------------
/docs/images/monotonic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/monotonic.png
--------------------------------------------------------------------------------
/docs/images/mzoning-price-per-cat-enc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mzoning-price-per-cat-enc.png
--------------------------------------------------------------------------------
/docs/images/mzoning-price-per-cat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/mzoning-price-per-cat.png
--------------------------------------------------------------------------------
/docs/images/nonnormalvars2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2.png
--------------------------------------------------------------------------------
/docs/images/nonnormalvars2logtransformed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2logtransformed.png
--------------------------------------------------------------------------------
/docs/images/nonnormalvars2transformed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/nonnormalvars2transformed.png
--------------------------------------------------------------------------------
/docs/images/ordinal_encoding_monotonic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/ordinal_encoding_monotonic.png
--------------------------------------------------------------------------------
/docs/images/pipelineprediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/pipelineprediction.png
--------------------------------------------------------------------------------
/docs/images/probe-importance-std.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe-importance-std.png
--------------------------------------------------------------------------------
/docs/images/probe_feature_normal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe_feature_normal.png
--------------------------------------------------------------------------------
/docs/images/probe_features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/probe_features.png
--------------------------------------------------------------------------------
/docs/images/quasiconstant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/quasiconstant.png
--------------------------------------------------------------------------------
/docs/images/randomsampleimputation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/randomsampleimputation.png
--------------------------------------------------------------------------------
/docs/images/reciprocal_transformer/reciprocal_transfomer_inverse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_inverse.png
--------------------------------------------------------------------------------
/docs/images/reciprocal_transformer/reciprocal_transfomer_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_new.png
--------------------------------------------------------------------------------
/docs/images/reciprocal_transformer/reciprocal_transfomer_original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transfomer_original.png
--------------------------------------------------------------------------------
/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_new.png
--------------------------------------------------------------------------------
/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/reciprocal_transformer/reciprocal_transformer_3plots_original.png
--------------------------------------------------------------------------------
/docs/images/rfa_linreg_imp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfa_linreg_imp.png
--------------------------------------------------------------------------------
/docs/images/rfa_perf_drifts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfa_perf_drifts.png
--------------------------------------------------------------------------------
/docs/images/rfe_perf_drift.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfe_perf_drift.png
--------------------------------------------------------------------------------
/docs/images/rfimportancemrmr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/rfimportancemrmr.png
--------------------------------------------------------------------------------
/docs/images/selectionChart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/selectionChart.png
--------------------------------------------------------------------------------
/docs/images/shuffle-features-std.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/shuffle-features-std.png
--------------------------------------------------------------------------------
/docs/images/single-feature-perf-std.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/single-feature-perf-std.png
--------------------------------------------------------------------------------
/docs/images/single_feature_probes_imp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/single_feature_probes_imp.png
--------------------------------------------------------------------------------
/docs/images/sponsors/call_for_sponsors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/call_for_sponsors.png
--------------------------------------------------------------------------------
/docs/images/sponsors/how-did-you-discover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/how-did-you-discover.png
--------------------------------------------------------------------------------
/docs/images/sponsors/trainindata.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/sponsors/trainindata.png
--------------------------------------------------------------------------------
/docs/images/summary/imputersSummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/summary/imputersSummary.png
--------------------------------------------------------------------------------
/docs/images/summary/selectionSummary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/summary/selectionSummary.png
--------------------------------------------------------------------------------
/docs/images/target-mean-sel-std.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/target-mean-sel-std.png
--------------------------------------------------------------------------------
/docs/images/toydata_pt_raw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_raw.png
--------------------------------------------------------------------------------
/docs/images/toydata_pt_transformed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_transformed.png
--------------------------------------------------------------------------------
/docs/images/toydata_pt_transformed_custom_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/toydata_pt_transformed_custom_exp.png
--------------------------------------------------------------------------------
/docs/images/transformedcoupleYJ.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/transformedcoupleYJ.png
--------------------------------------------------------------------------------
/docs/images/treediscretisation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treediscretisation.png
--------------------------------------------------------------------------------
/docs/images/treemonotonicprediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treemonotonicprediction.png
--------------------------------------------------------------------------------
/docs/images/treepredictionrounded.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/treepredictionrounded.png
--------------------------------------------------------------------------------
/docs/images/untransformedcoupleYJ.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/untransformedcoupleYJ.png
--------------------------------------------------------------------------------
/docs/images/woe_encoding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/woe_encoding.png
--------------------------------------------------------------------------------
/docs/images/woe_prediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/woe_prediction.png
--------------------------------------------------------------------------------
/docs/images/yeojohnsonformula.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/docs/images/yeojohnsonformula.png
--------------------------------------------------------------------------------
/docs/quickstart/datasets.rst:
--------------------------------------------------------------------------------
1 | .. _datasets:
2 |
3 | Datasets
4 | ========
5 |
6 | The user guide and examples included in Feature-engine's documentation are based on
7 | these 3 datasets:
8 |
9 | Titanic dataset
10 | ~~~~~~~~~~~~~~~
11 |
12 | We use the dataset available in `openML `_ which can be
13 | downloaded from `here `_.
14 |
15 | Ames House Prices dataset
16 | ~~~~~~~~~~~~~~~~~~~~~~~~~
17 |
18 | We use the data set created by Professor Dean De Cock:
19 |
20 | * Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project, Journal of Statistics Education, Vol. 19, No. 3.
21 |
22 | The examples are based on a copy of the dataset available on
23 | `Kaggle `_.
24 |
25 | The original data and documentation can be found here:
26 |
27 | * `Documentation `_
28 |
29 | * `Data `_
30 |
31 | Credit Approval dataset
32 | ~~~~~~~~~~~~~~~~~~~~~~~
33 |
34 | We use the Credit Approval dataset from the UCI Machine Learning Repository:
35 |
36 | Dua, D. and Graff, C. (2019). `UCI Machine Learning Repository `_.
37 | Irvine, CA: University of California, School of Information and Computer Science.
38 |
39 | To download the dataset visit this
40 | `website `_
41 | and click on "crx.data" to download the data set.
42 |
43 | To prepare the data for the examples:
44 |
45 | .. code:: python
46 |
47 | import random
48 | import pandas as pd
49 | import numpy as np
50 |
51 | # load data
52 | data = pd.read_csv('crx.data', header=None)
53 |
54 | # create variable names according to UCI Machine Learning information
55 | varnames = ['A'+str(s) for s in range(1,17)]
56 | data.columns = varnames
57 |
58 | # replace ? by np.nan
59 | data = data.replace('?', np.nan)
60 |
61 | # re-cast some variables to the correct types
62 | data['A2'] = data['A2'].astype('float')
63 | data['A14'] = data['A14'].astype('float')
64 |
65 | # encode target to binary
66 | data['A16'] = data['A16'].map({'+':1, '-':0})
67 |
68 | # save the data
69 | data.to_csv('creditApprovalUCI.csv', index=False)
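70 |
71 | Alternatively, the Titanic dataset used throughout the documentation can be loaded
72 | directly with Feature-engine's built-in loader. A minimal sketch, assuming the default
73 | call returns a pandas dataframe (see the datasets section of the API documentation for
74 | the available arguments):
75 |
76 | .. code:: python
77 |
78 | from feature_engine.datasets import load_titanic
79 |
80 | # load a copy of the Titanic dataset shipped with Feature-engine
81 | data = load_titanic()
82 |
83 | print(data.head())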
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | # Library Dependencies
2 | numpy>=1.18.2
3 | pandas>=1.0.3
4 | scikit-learn>=1.0.0
5 | scipy>=1.4.1
6 | statsmodels>=0.11.1
7 |
8 | # Documentation Dependencies
9 | docutils==0.16
10 | Sphinx>=4.3.2
11 | pydata_sphinx_theme>=0.7.2
12 | sphinx_autodoc_typehints>=1.11.1,<=1.21.3
13 | numpydoc>=0.9.2
14 |
--------------------------------------------------------------------------------
/docs/resources/books.rst:
--------------------------------------------------------------------------------
1 | Books
2 | =====
3 |
4 | You can learn more about how to use Feature-engine and feature engineering in general
5 | in the following books:
6 |
7 | .. figure:: ../images/cookbook.png
8 | :width: 200
9 | :figclass: align-center
10 | :align: left
11 | :target: https://www.packtpub.com/en-us/product/python-feature-engineering-cookbook-9781835883587
12 |
13 | Python Feature Engineering Cookbook
14 |
15 |
16 | .. figure:: ../images/fsmlbook.png
17 | :width: 200
18 | :figclass: align-center
19 | :align: left
20 | :target: https://www.trainindata.com/p/feature-selection-in-machine-learning-book
21 |
22 | Feature Selection in Machine Learning
--------------------------------------------------------------------------------
/docs/resources/courses.rst:
--------------------------------------------------------------------------------
1 | Courses
2 | =======
3 |
4 | You can learn more about how to use Feature-engine, and about feature engineering and
5 | feature selection in general, in the following online courses:
6 |
7 | .. figure:: ../images/feml.png
8 | :width: 300
9 | :figclass: align-center
10 | :align: left
11 | :target: https://www.trainindata.com/p/feature-engineering-for-machine-learning
12 |
13 | Feature Engineering for Machine Learning
14 |
15 | .. figure:: ../images/fsml.png
16 | :width: 300
17 | :figclass: align-center
18 | :align: right
19 | :target: https://www.trainindata.com/p/feature-selection-for-machine-learning
20 |
21 | Feature Selection for Machine Learning
22 |
23 | .. figure:: ../images/fwml.png
24 | :width: 300
25 | :figclass: align-center
26 | :align: left
27 | :target: https://www.courses.trainindata.com/p/forecasting-with-machine-learning
28 |
29 | Forecasting with Machine Learning
30 |
31 | .. figure:: ../images/fetsf.png
32 | :width: 300
33 | :figclass: align-center
34 | :align: right
35 | :target: https://www.trainindata.com/p/feature-engineering-for-forecasting
36 |
37 | Feature Engineering for Time Series Forecasting
38 |
39 | .. figure:: ../images/mli_logo.png
40 | :width: 300
41 | :figclass: align-center
42 | :align: left
43 | :target: https://www.courses.trainindata.com/p/machine-learning-interpretability
44 |
45 | Interpreting Machine Learning Models
46 |
47 |
48 | |
49 | |
50 |
--------------------------------------------------------------------------------
/docs/resources/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. _learning_resources:
3 |
4 | Resources
5 | =========
6 |
7 | Here you will find resources to learn more about Feature-engine, and about
8 | feature engineering and feature selection in general.
9 |
10 | We have gathered online courses, books, blogs, videos, podcasts, Jupyter
11 | notebooks and Kaggle kernels, so you can choose the way of learning that
12 | suits you best.
13 |
14 | .. toctree::
15 | :maxdepth: 1
16 |
17 | courses
18 | books
19 | blogs
20 | tutorials
--------------------------------------------------------------------------------
/docs/resources/tutorials.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
4 | How To
5 | ------
6 |
7 | Check our `jupyter notebooks `_
8 | showcasing the functionality of each Feature-engine transformer.
9 |
10 | Kaggle Kernels
11 | --------------
12 |
13 | We also prepared Kaggle kernels with demos mixing data exploration, feature engineering,
14 | feature creation, feature selection and hyperparameter optimization of entire pipelines.
15 |
16 | - `Feature selection for bank customer satisfaction prediction `_
17 | - `Feature engineering and selection for house price prediction `_
18 | - `Feature creation for wine quality prediction `_
19 | - `Feature engineering and model stacking for house price modelling `_
20 | - `Feature engineering with Feature-engine and Randomized search `_
21 | - `Feature engineering with Feature-engine and Grid search `_
22 |
23 |
24 |
25 | Video tutorials
26 | ---------------
27 |
28 | You can find some videos on how to use Feature-engine in the
29 | `Feature-engine playlist `_
30 | in Train in Data's YouTube channel. The list is a bit short at the moment, apologies.
--------------------------------------------------------------------------------
/docs/sphinxext/README.txt:
--------------------------------------------------------------------------------
1 | =====================================
2 | numpydoc -- Numpy's Sphinx extensions
3 | =====================================
4 |
5 | Numpy's documentation uses several custom extensions to Sphinx. These
6 | are shipped in this ``numpydoc`` package, in case you want to make use
7 | of them in third-party projects.
8 |
9 | The following extensions are available:
10 |
11 | - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add
12 | the code description directives ``np-function``, ``np-cfunction``, etc.
13 | that support the Numpy docstring syntax.
14 |
15 | - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes.
16 |
17 | - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::``
18 | directive. Note that this implementation may still undergo severe
19 | changes or eventually be deprecated.
20 |
21 | - ``numpydoc.only_directives``: (DEPRECATED)
22 |
23 | - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive.
24 | Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``,
25 | and the Sphinx 1.0 version is recommended over the one included in
26 | Numpydoc.
27 |
28 |
29 | numpydoc
30 | ========
31 |
32 | Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings
33 | following the Numpy/Scipy format to a form palatable to Sphinx.
34 |
35 | Options
36 | -------
37 |
38 | The following options can be set in conf.py:
39 |
40 | - numpydoc_use_plots: bool
41 |
42 | Whether to produce ``plot::`` directives for Examples sections that
43 | contain ``import matplotlib``.
44 |
45 | - numpydoc_show_class_members: bool
46 |
47 | Whether to show all members of a class in the Methods and Attributes
48 | sections automatically.
49 |
50 | - numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead)
51 |
52 | Whether to insert an edit link after docstrings.
53 |
--------------------------------------------------------------------------------
/docs/user_guide/datetime/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Datetime Features
4 | =================
5 |
6 | Feature-engine’s datetime transformers are able to extract a wide variety of datetime
7 | features from existing datetime or object-like data.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | DatetimeFeatures
13 | DatetimeSubtraction
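14 |
15 | As a quick illustration, below is a minimal sketch with ``DatetimeFeatures``, assuming
16 | default parameters (see the DatetimeFeatures page for the full list of options):
17 |
18 | .. code:: python
19 |
20 | import pandas as pd
21 | from feature_engine.datetime import DatetimeFeatures
22 |
23 | df = pd.DataFrame({"date": pd.date_range("2020-01-01", periods=3, freq="D")})
24 |
25 | # with default settings, a group of common date and time features is extracted
26 | dtf = DatetimeFeatures()
27 | df_t = dtf.fit_transform(df)
28 |
29 | print(df_t.head())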
--------------------------------------------------------------------------------
/docs/user_guide/discretisation/index.rst:
--------------------------------------------------------------------------------
1 | .. _discretization_transformers:
2 |
3 | .. -*- mode: rst -*-
4 |
5 | Discretisation
6 | ==============
7 |
8 | Feature-engine's variable discretisation transformers transform continuous numerical
9 | variables into discrete variables. The discrete variables will contain contiguous
10 | intervals in the case of the equal frequency and equal width transformers. The
11 | Decision Tree discretiser will return a discrete variable, in the sense that the
12 | new feature takes a finite number of values.
13 |
14 | The following illustration shows the process of discretisation:
15 |
16 | .. figure:: ../../images/Discretisation.png
17 | :align: center
18 | :width: 500
19 |
20 |
21 | With discretisation, sometimes we can obtain a more homogeneous value spread from an
22 | originally skewed variable. But this is not always possible.
23 |
24 | **Discretisation plus encoding**
25 |
26 | Very often, after discretising continuous numerical variables into discrete intervals,
27 | we want to continue engineering them as if they were categorical. This is common practice.
28 | Throughout the user guide, we point to Jupyter notebooks that showcase this functionality.
29 |
30 | **Discretisers**
31 |
32 | .. toctree::
33 | :maxdepth: 1
34 |
35 | EqualFrequencyDiscretiser
36 | EqualWidthDiscretiser
37 | ArbitraryDiscretiser
38 | DecisionTreeDiscretiser
39 | GeometricWidthDiscretiser
40 |
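41 | As a quick illustration, below is a minimal sketch with the ``EqualFrequencyDiscretiser``,
42 | assuming that the parameter ``q`` sets the number of intervals (see the transformer's page
43 | for the full documentation):
44 |
45 | .. code:: python
46 |
47 | import pandas as pd
48 | from feature_engine.discretisation import EqualFrequencyDiscretiser
49 |
50 | df = pd.DataFrame({"var": range(100)})
51 |
52 | # sort the variable's values into 5 intervals of equal frequency
53 | disc = EqualFrequencyDiscretiser(q=5, variables=["var"])
54 | df_t = disc.fit_transform(df)
55 |
56 | print(df_t["var"].value_counts())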
--------------------------------------------------------------------------------
/docs/user_guide/imputation/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Missing Data Imputation
4 | =======================
5 |
6 | Feature-engine's missing data imputers replace missing data either with parameters
7 | estimated from the data or with arbitrary values pre-defined by the user. The following
8 | image summarizes the functionality of the main imputers.
9 |
10 | .. figure:: ../../images/summary/imputersSummary.png
11 | :align: center
12 |
13 | |
14 |
15 | In this guide, you will find code snippets so that you can quickly apply the imputers
16 | to your datasets, as well as general knowledge and guidance on the imputation
17 | techniques.
18 |
19 |
20 | Imputers
21 | ~~~~~~~~
22 |
23 | .. toctree::
24 | :maxdepth: 1
25 |
26 | MeanMedianImputer
27 | ArbitraryNumberImputer
28 | EndTailImputer
29 | CategoricalImputer
30 | RandomSampleImputer
31 | AddMissingIndicator
32 | DropMissingData
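33 |
34 | As a quick illustration, below is a minimal sketch with the ``MeanMedianImputer``,
35 | assuming that ``imputation_method`` takes the values "mean" or "median" (see the
36 | transformer's page for details):
37 |
38 | .. code:: python
39 |
40 | import numpy as np
41 | import pandas as pd
42 | from feature_engine.imputation import MeanMedianImputer
43 |
44 | df = pd.DataFrame({"age": [20, 21, np.nan, 18]})
45 |
46 | # replace missing values with the median learned from the data
47 | imputer = MeanMedianImputer(imputation_method="median", variables=["age"])
48 | df_t = imputer.fit_transform(df)
49 |
50 | print(df_t)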
--------------------------------------------------------------------------------
/docs/user_guide/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. _user_guide:
3 |
4 | User Guide
5 | ==========
6 |
7 | In this section you will find additional information about Feature-engine's transformers
8 | and feature engineering transformations in general, as well as additional examples.
9 |
10 | Transformation
11 | --------------
12 |
13 | .. toctree::
14 | :maxdepth: 1
15 |
16 | imputation/index
17 | encoding/index
18 | discretisation/index
19 | outliers/index
20 | transformation/index
21 | scaling/index
22 |
23 | Creation
24 | --------
25 |
26 | .. toctree::
27 | :maxdepth: 1
28 |
29 | creation/index
30 | datetime/index
31 |
32 |
33 | Selection
34 | ---------
35 | .. toctree::
36 | :maxdepth: 1
37 |
38 | selection/index
39 |
40 |
41 | Time series
42 | -----------
43 |
44 | .. toctree::
45 | :maxdepth: 1
46 |
47 | timeseries/index
48 |
49 |
50 | Other
51 | -----
52 | .. toctree::
53 | :maxdepth: 1
54 |
55 | preprocessing/index
56 | wrappers/index
57 |
58 | Pipeline
59 | --------
60 | .. toctree::
61 | :maxdepth: 1
62 |
63 | pipeline/index
64 |
65 | Tools
66 | -----
67 | .. toctree::
68 | :maxdepth: 1
69 |
70 | variable_handling/index
--------------------------------------------------------------------------------
/docs/user_guide/outliers/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Outlier Handling
4 | ================
5 |
6 | Feature-engine's outlier cappers cap maximum or minimum values of a variable at an
7 | arbitrary or derived value. The OutlierTrimmer removes outliers from the dataset.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | Winsorizer
13 | ArbitraryOutlierCapper
14 | OutlierTrimmer
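15 |
16 | As a quick illustration, below is a minimal sketch with the ``Winsorizer``, using the
17 | ``capping_method``, ``tail`` and ``fold`` parameters described in the API documentation:
18 |
19 | .. code:: python
20 |
21 | import numpy as np
22 | import pandas as pd
23 | from feature_engine.outliers import Winsorizer
24 |
25 | # a variable with a clear outlier on the right tail
26 | df = pd.DataFrame({"var": np.append(np.random.normal(size=100), 50)})
27 |
28 | # cap values beyond the mean plus 3 standard deviations
29 | capper = Winsorizer(capping_method="gaussian", tail="right", fold=3)
30 | df_t = capper.fit_transform(df)
31 |
32 | print(df_t["var"].max())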
--------------------------------------------------------------------------------
/docs/user_guide/pipeline/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Pipeline
4 | ========
5 |
6 | Feature-engine's Pipeline is equivalent to Scikit-learn's Pipeline and, in addition,
7 | it offers the method `transform_x_y`, which adjusts both X and y in those cases where
8 | rows are removed from X.
9 |
10 | .. toctree::
11 | :maxdepth: 1
12 |
13 | Pipeline
14 | make_pipeline
15 |
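16 | Below is a minimal sketch of ``transform_x_y`` with a transformer that removes rows,
17 | ``DropMissingData``; we assume the method returns the adjusted X and y:
18 |
19 | .. code:: python
20 |
21 | import numpy as np
22 | import pandas as pd
23 | from feature_engine.imputation import DropMissingData
24 | from feature_engine.pipeline import Pipeline
25 |
26 | X = pd.DataFrame({"var": [1.0, np.nan, 3.0]})
27 | y = pd.Series([0, 1, 0])
28 |
29 | pipe = Pipeline([("drop_na", DropMissingData())])
30 | pipe.fit(X, y)
31 |
32 | # rows with missing data are removed from both X and y
33 | X_t, y_t = pipe.transform_x_y(X, y)
34 |
35 | print(X_t.shape, y_t.shape)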
--------------------------------------------------------------------------------
/docs/user_guide/preprocessing/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Preprocessing
4 | =============
5 |
6 | Feature-engine's preprocessing transformers apply general data pre-processing
7 | and transformation procedures.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | MatchCategories
13 | MatchVariables
14 |
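15 | As a quick illustration, below is a minimal sketch with ``MatchVariables``. We assume the
16 | default behaviour: variables seen during `fit` but absent at `transform` are added back,
17 | and additional variables are dropped (see the transformer's page for the exact API):
18 |
19 | .. code:: python
20 |
21 | import pandas as pd
22 | from feature_engine.preprocessing import MatchVariables
23 |
24 | train = pd.DataFrame({"var_a": [1, 2], "var_b": [3, 4]})
25 | test = pd.DataFrame({"var_a": [5, 6], "var_c": [7, 8]})
26 |
27 | # reproduce the train set variables in the test set
28 | mv = MatchVariables()
29 | mv.fit(train)
30 |
31 | print(mv.transform(test))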
--------------------------------------------------------------------------------
/docs/user_guide/scaling/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. _scaling_user_guide:
3 |
4 | .. currentmodule:: feature_engine.scaling
5 |
6 | Scaling
7 | =======
8 |
9 | `Feature scaling `_
10 | is the process of transforming the range of numerical features so that they fit within a
11 | specific scale, usually to improve the performance and training stability of machine learning
12 | models.
13 |
14 | Scaling helps to normalize the input data, ensuring that each feature contributes proportionately
15 | to the final result, particularly in algorithms that are sensitive to the range of the data,
16 | such as gradient descent-based models (e.g., linear regression, logistic regression, neural networks)
17 | and distance-based models (e.g., K-nearest neighbors, clustering).
18 |
19 | Feature-engine's scalers replace the variables' values with the scaled ones. On this page, we
20 | discuss the importance of scaling numerical features, and then introduce the various
21 | scaling techniques supported by Feature-engine.
22 |
23 | Importance of scaling
24 | ---------------------
25 |
26 | Scaling is crucial in machine learning as it ensures that features contribute equally to model
27 | training, preventing bias toward variables with larger ranges. Properly scaled data enhances the
28 | performance of algorithms sensitive to the magnitude of input values, such as gradient descent
29 | and distance-based methods. Additionally, scaling can improve convergence speed and overall model
30 | accuracy, leading to more reliable predictions.
31 |
32 |
33 | When to apply scaling
34 | ---------------------
35 |
36 | - **Training:** Most machine learning algorithms require data to be scaled before training,
37 | especially linear models, neural networks, and distance-based models.
38 |
39 | - **Feature Engineering:** Scaling can be essential for certain feature engineering techniques,
40 | like polynomial features.
41 |
42 | - **Resampling:** Some oversampling methods, like SMOTE, and many of the undersampling
43 | methods clean data based on KNN, which is a distance-based algorithm.
44 |
45 |
46 | When Scaling Is Not Necessary
47 | -----------------------------
48 |
49 | Not all algorithms require scaling. For example, tree-based algorithms (like Decision Trees,
50 | Random Forests, Gradient Boosting) are generally invariant to scaling because they split data
51 | based on the order of values, not the magnitude.
52 |
53 | Scalers
54 | -------
55 |
56 | .. toctree::
57 | :maxdepth: 1
58 |
59 | MeanNormalizationScaler
60 |
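61 | As a quick illustration, below is a minimal sketch with the ``MeanNormalizationScaler``,
62 | assuming default parameters (see the transformer's page for details):
63 |
64 | .. code:: python
65 |
66 | import pandas as pd
67 | from feature_engine.scaling import MeanNormalizationScaler
68 |
69 | df = pd.DataFrame({"var": [1.0, 2.0, 3.0, 4.0, 5.0]})
70 |
71 | # centre the variable around its mean and scale it by its value range
72 | scaler = MeanNormalizationScaler()
73 | df_t = scaler.fit_transform(df)
74 |
75 | print(df_t)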
--------------------------------------------------------------------------------
/docs/user_guide/timeseries/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 | .. _timeseries:
3 |
4 | .. currentmodule:: feature_engine.timeseries
5 |
6 |
7 | Time Series Features
8 | ====================
9 |
10 | Feature-engine's time series transformers create features from time series data.
11 |
12 | .. toctree::
13 | :maxdepth: 1
14 |
15 | forecasting/index
16 |
17 | |
18 | |
19 | |
20 |
--------------------------------------------------------------------------------
/docs/user_guide/transformation/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Variance Stabilizing Transformations
4 | ====================================
5 |
6 | Feature-engine's variable transformers transform numerical variables with various
7 | mathematical transformations.
8 |
9 | Variable transformations are commonly used to spread the values of the original variables
10 | over a wider value range. See the following illustration:
11 |
12 | .. figure:: ../../images/Variable_Transformation.png
13 | :align: center
14 |
15 |
16 | Article
17 | -------
18 |
19 | We added a lot of information about **variance stabilizing transformations** in this
20 | `article `_.
21 |
22 | **Note**
23 |
24 | Note, however, that improving the value spread is not always possible; it depends
25 | on the nature of the variable.
26 |
27 | **Transformers**
28 |
29 | .. toctree::
30 | :maxdepth: 1
31 |
32 | LogTransformer
33 | LogCpTransformer
34 | ReciprocalTransformer
35 | ArcsinTransformer
36 | PowerTransformer
37 | BoxCoxTransformer
38 | YeoJohnsonTransformer
39 |
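40 | As a quick illustration, below is a minimal sketch with the ``LogTransformer``, assuming
41 | default parameters; note that the variable must be strictly positive:
42 |
43 | .. code:: python
44 |
45 | import pandas as pd
46 | from feature_engine.transformation import LogTransformer
47 |
48 | df = pd.DataFrame({"var": [1.0, 10.0, 100.0, 1000.0]})
49 |
50 | # replace the variable with its natural logarithm
51 | tf = LogTransformer()
52 | df_t = tf.fit_transform(df)
53 |
54 | print(df_t)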
--------------------------------------------------------------------------------
/docs/user_guide/variable_handling/check_numerical_variables.rst:
--------------------------------------------------------------------------------
1 | .. _check_num_vars:
2 |
3 | .. currentmodule:: feature_engine.variable_handling
4 |
5 | check_numerical_variables
6 | =========================
7 |
8 | :class:`check_numerical_variables()` checks that the variables in the list are of
9 | type numerical.
10 |
11 | Let's create a toy dataset with numerical, categorical and datetime variables:
12 |
13 | .. code:: python
14 |
15 | import pandas as pd
16 | df = pd.DataFrame({
17 | "Name": ["tom", "nick", "krish", "jack"],
18 | "City": ["London", "Manchester", "Liverpool", "Bristol"],
19 | "Age": [20, 21, 19, 18],
20 | "Marks": [0.9, 0.8, 0.7, 0.6],
21 | "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
22 | })
23 |
24 | print(df.head())
25 |
26 | We see the resulting dataframe below:
27 |
28 | .. code:: python
29 |
30 | Name City Age Marks dob
31 | 0 tom London 20 0.9 2020-02-24 00:00:00
32 | 1 nick Manchester 21 0.8 2020-02-24 00:01:00
33 | 2 krish Liverpool 19 0.7 2020-02-24 00:02:00
34 | 3 jack Bristol 18 0.6 2020-02-24 00:03:00
35 |
36 | Let's now check that 2 of the variables are of type numerical:
37 |
38 | .. code:: python
39 |
40 | from feature_engine.variable_handling import check_numerical_variables
41 |
42 | var_num = check_numerical_variables(df, ['Age', 'Marks'])
43 |
44 | var_num
45 |
46 | If the variables are numerical, the function returns their names in a list:
47 |
48 | .. code:: python
49 |
50 | ['Age', 'Marks']
51 |
52 | If we pass a variable that is not of type numerical,
53 | :class:`check_numerical_variables()` will raise an error:
54 |
55 | .. code:: python
56 |
57 | check_numerical_variables(df, ['Age', 'Name'])
58 |
59 | Below we see the error message:
60 |
61 | .. code:: python
62 |
63 | TypeError: Some of the variables are not numerical. Please cast them as numerical
64 | before using this transformer.
65 |
--------------------------------------------------------------------------------
/docs/user_guide/variable_handling/find_numerical_variables.rst:
--------------------------------------------------------------------------------
1 | .. _find_num_vars:
2 |
3 | .. currentmodule:: feature_engine.variable_handling
4 |
5 | find_numerical_variables
6 | ========================
7 |
8 | :class:`find_numerical_variables()` returns a list with the names of the numerical
9 | variables in the dataset.
10 |
11 | Let's create a toy dataset with numerical, categorical and datetime variables:
12 |
13 | .. code:: python
14 |
15 | import pandas as pd
16 | df = pd.DataFrame({
17 | "Name": ["tom", "nick", "krish", "jack"],
18 | "City": ["London", "Manchester", "Liverpool", "Bristol"],
19 | "Age": [20, 21, 19, 18],
20 | "Marks": [0.9, 0.8, 0.7, 0.6],
21 | "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
22 | })
23 |
24 | print(df.head())
25 |
26 | We see the resulting dataframe below:
27 |
28 | .. code:: python
29 |
30 | Name City Age Marks dob
31 | 0 tom London 20 0.9 2020-02-24 00:00:00
32 | 1 nick Manchester 21 0.8 2020-02-24 00:01:00
33 | 2 krish Liverpool 19 0.7 2020-02-24 00:02:00
34 | 3 jack Bristol 18 0.6 2020-02-24 00:03:00
35 |
36 | With :class:`find_numerical_variables()` we capture the names of all numerical
37 | variables in a list. So let's do that and then display the list:
38 |
39 | .. code:: python
40 |
41 | from feature_engine.variable_handling import find_numerical_variables
42 |
43 | var_num = find_numerical_variables(df)
44 |
45 | var_num
46 |
47 | We see the names of the numerical variables in the list below:
48 |
49 | .. code:: python
50 |
51 | ['Age', 'Marks']
52 |
53 | If there are no numerical variables in the dataset, :class:`find_numerical_variables()`
54 | will raise an error.
55 |
--------------------------------------------------------------------------------
/docs/user_guide/variable_handling/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Variable handling functions
4 | ===========================
5 |
6 | This set of functions finds variables of a specific type in a dataframe, or checks that a
7 | list of variables is of a specified data type.
8 |
9 | The `find` functions take a dataframe as an argument and return a list with the names
10 | of the variables of the desired type.
11 |
12 | The `check` functions check that the variables in the list are all of the desired data type.
13 |
14 | The `retain` functions select the variables in a list if they fulfill a condition.
15 |
16 | You can use these functions to identify different sets of variables based on their
17 | data type to streamline your feature engineering pipelines or create your own
18 | Feature-engine or Scikit-learn compatible transformers.
19 |
20 |
21 | .. toctree::
22 | :maxdepth: 1
23 |
24 | find_all_variables
25 | find_categorical_variables
26 | find_datetime_variables
27 | find_numerical_variables
28 | find_categorical_and_numerical_variables
29 | check_all_variables
30 | check_categorical_variables
31 | check_datetime_variables
32 | check_numerical_variables
33 | retain_variables_if_in_df
34 |
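35 | As a quick illustration, below is a minimal sketch combining a `find` and a `check`
36 | function (the individual pages contain more detailed examples):
37 |
38 | .. code:: python
39 |
40 | import pandas as pd
41 | from feature_engine.variable_handling import check_numerical_variables
42 | from feature_engine.variable_handling import find_numerical_variables
43 |
44 | df = pd.DataFrame({"Name": ["tom", "nick"], "Age": [20, 21], "Marks": [0.9, 0.8]})
45 |
46 | # find all numerical variables in the dataframe
47 | num_vars = find_numerical_variables(df)
48 |
49 | # check that a given list of variables is numerical
50 | num_vars = check_numerical_variables(df, ["Age", "Marks"])
51 |
52 | print(num_vars)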
--------------------------------------------------------------------------------
/docs/user_guide/wrappers/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | Scikit-learn Wrapper
4 | ====================
5 |
6 | Feature-engine's Scikit-learn wrappers wrap Scikit-learn transformers, allowing you to
7 | apply them to a selected subset of features only.
8 |
9 | .. toctree::
10 | :maxdepth: 1
11 |
12 | Wrapper
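13 |
14 | As a quick illustration, below is a minimal sketch applying a Scikit-learn scaler to a
15 | subset of columns. We assume the wrapper class is ``SklearnTransformerWrapper`` and that
16 | it takes ``transformer`` and ``variables`` arguments (see the Wrapper page for the exact
17 | API):
18 |
19 | .. code:: python
20 |
21 | import pandas as pd
22 | from sklearn.preprocessing import StandardScaler
23 | from feature_engine.wrappers import SklearnTransformerWrapper
24 |
25 | df = pd.DataFrame({"var_a": [1.0, 2.0, 3.0], "var_b": [10.0, 20.0, 30.0]})
26 |
27 | # scale only var_a; var_b is returned unchanged
28 | wrapper = SklearnTransformerWrapper(transformer=StandardScaler(), variables=["var_a"])
29 | df_t = wrapper.fit_transform(df)
30 |
31 | print(df_t)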
--------------------------------------------------------------------------------
/docs/versions/index.rst:
--------------------------------------------------------------------------------
1 | Other versions
2 | ==============
3 |
4 | Web-based documentation is available for versions listed below:
5 |
6 | - `Feature-engine 1.6 `_
7 |
8 |
--------------------------------------------------------------------------------
/docs/whats_new/index.rst:
--------------------------------------------------------------------------------
1 | .. -*- mode: rst -*-
2 |
3 | What's new
4 | ==========
5 |
6 | Find out what's new in each new version release.
7 |
8 | .. toctree::
9 | :maxdepth: 2
10 |
11 | v_180
12 | v_170
13 | v_160
14 | v_150
15 | v_140
16 | v_130
17 | v_120
18 | v_1
19 | v_06
--------------------------------------------------------------------------------
/feature_engine/VERSION:
--------------------------------------------------------------------------------
1 | 1.8.3
2 |
--------------------------------------------------------------------------------
/feature_engine/__init__.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 |
3 | import feature_engine
4 |
5 | PACKAGE_ROOT = pathlib.Path(feature_engine.__file__).resolve().parent
6 | VERSION_PATH = PACKAGE_ROOT / "VERSION"
7 |
8 | name = "feature_engine"
9 |
10 | with open(VERSION_PATH, "r") as version_file:
11 | __version__ = version_file.read().strip()
12 |
--------------------------------------------------------------------------------
/feature_engine/_base_transformers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_base_transformers/__init__.py
--------------------------------------------------------------------------------
/feature_engine/_check_init_parameters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_check_init_parameters/__init__.py
--------------------------------------------------------------------------------
/feature_engine/_check_init_parameters/check_init_input_params.py:
--------------------------------------------------------------------------------
1 | def _check_param_missing_values(missing_values):
2 | if missing_values not in ["raise", "ignore"]:
3 | raise ValueError(
4 | "missing_values takes only values 'raise' or 'ignore'. "
5 | f"Got {missing_values} instead."
6 | )
7 |
8 |
9 | def _check_param_drop_original(drop_original):
10 | if not isinstance(drop_original, bool):
11 | raise ValueError(
12 | "drop_original takes only boolean values True and False. "
13 | f"Got {drop_original} instead."
14 | )
15 |
--------------------------------------------------------------------------------
/feature_engine/_check_init_parameters/check_input_dictionary.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 |
4 | def _check_numerical_dict(dict_: Optional[dict]) -> Optional[dict]:
5 | """
6 | Checks that all values in the dictionary are integers or floats. It can also
7 | take None as a value.
8 |
9 | Parameters
10 | ----------
11 | dict_ : dict
12 | The dictionary that will be checked.
13 |
14 | Raises
15 | ------
16 | ValueError
17 | If any of the values in the dictionary are not int or float.
18 | TypeError
19 | When input type is not a dictionary.
20 | """
21 |
22 | if isinstance(dict_, dict):
23 | if not all([isinstance(x, (float, int)) for x in dict_.values()]):
24 | raise ValueError(
25 | "All values in the dictionary must be integer or float. "
26 | f"Got {dict_} instead."
27 | )
28 |
29 | elif dict_ is not None:
30 | raise TypeError(
31 | f"The parameter can only take a dictionary or None. Got {dict_} instead."
32 | )
33 | return None
34 |
--------------------------------------------------------------------------------
/feature_engine/_check_init_parameters/check_variables.py:
--------------------------------------------------------------------------------
1 | from typing import Any, List, Union
2 |
3 | Variables = Union[None, int, str, List[Union[str, int]]]
4 |
5 |
6 | def _check_variables_input_value(variables: Variables) -> Any:
7 | """
8 | Checks that the input value for the `variables` parameter located in the init of
9 | all Feature-engine transformers is of the correct type.
10 | Allowed values are None, int, str or list of strings and integers.
11 |
12 | Parameters
13 | ----------
14 | variables : string, int, list of strings, list of integers. Default=None
15 |
16 | Returns
17 | -------
18 | variables: same as input
19 | """
20 |
21 | msg = (
22 | "`variables` should contain a string, an integer or a list of strings or "
23 | f"integers. Got {variables} instead."
24 | )
25 | msg_dupes = "The list entered in `variables` contains duplicated variable names."
26 | msg_empty = "The list of `variables` is empty."
27 |
28 | if variables is not None:
29 | if isinstance(variables, list):
30 | if not all(isinstance(i, (str, int)) for i in variables):
31 | raise ValueError(msg)
32 | if len(variables) == 0:
33 | raise ValueError(msg_empty)
34 | if len(variables) != len(set(variables)):
35 | raise ValueError(msg_dupes)
36 | else:
37 | if not isinstance(variables, (str, int)):
38 | raise ValueError(msg)
39 | return variables
40 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/__init__.py
--------------------------------------------------------------------------------
/feature_engine/_docstrings/fit_attributes.py:
--------------------------------------------------------------------------------
1 | """Docstrings for the attributes that are generated during fit."""
2 |
3 | _variables_attribute_docstring = """variables_:
4 | The group of variables that will be transformed.
5 | """.rstrip()
6 |
7 | _feature_names_in_docstring = """feature_names_in_:
8 | List with the names of features seen during `fit`.
9 | """.rstrip()
10 |
11 | _n_features_in_docstring = """n_features_in_:
12 | The number of features in the train set used in fit.
13 | """.rstrip()
14 |
15 | # used by discretisers
16 | _binner_dict_docstring = """binner_dict_:
17 | Dictionary with the interval limits per variable.
18 | """.rstrip()
19 |
20 | # used by imputers
21 | _imputer_dict_docstring = """imputer_dict_:
22 | Dictionary with the values to replace missing data in each variable.
23 | """.rstrip()
24 |
25 | # used by outlier module
26 | _right_tail_caps_docstring = """right_tail_caps_:
27 | Dictionary with the maximum values beyond which a value will be considered an
28 | outlier.
29 | """.rstrip()
30 |
31 | _left_tail_caps_docstring = """left_tail_caps_:
32 | Dictionary with the minimum values beyond which a value will be considered an
33 | outlier.
34 | """.rstrip()
35 |
36 | # used by selection module
37 | _feature_importances_docstring = """feature_importances_:
38 | Pandas Series with the feature importance (comes from step 2)
39 | """.rstrip()
40 |
41 | _feature_importances_std_docstring = """feature_importances_std_:
42 | Pandas Series with the standard deviation of the feature importance.
43 | """.rstrip()
44 |
45 | _performance_drifts_docstring = """performance_drifts_:
46 | Dictionary with the performance drift per examined feature (comes from step 5).
47 | """.rstrip()
48 |
49 | _performance_drifts_std_docstring = """performance_drifts_std_:
50 | Dictionary with the performance drift's standard deviation of the
51 | examined feature (comes from step 5).
52 | """.rstrip()
53 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/init_parameters/__init__.py
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/all_trasnformers.py:
--------------------------------------------------------------------------------
1 | """Docstrings for the parameters corresponding to the __init__"""
2 |
3 | _variables_numerical_docstring = """variables: list, default=None
4 | The list of numerical variables to transform. If None, the transformer will
5 | automatically find and select all numerical variables.
6 | """.rstrip()
7 |
8 | _variables_categorical_docstring = """variables: list, default=None
9 | The list of categorical variables that will be encoded. If None, the
10 | encoder will find and transform all variables of type object or categorical by
11 | default. You can also make the transformer accept numerical variables, see the
12 | parameter `ignore_format`.
13 | """.rstrip()
14 |
15 | _drop_original_docstring = """drop_original: bool, default=False
16 | If True, the original variables to transform will be dropped from the dataframe.
17 | """.rstrip()
18 |
19 | _missing_values_docstring = """missing_values: string, default='raise'
20 | Indicates if missing values should be ignored or raised. If `'raise'` the
21 | transformer will return an error if the datasets to `fit` or `transform`
22 | contain missing values. If `'ignore'`, missing data will be ignored when
23 | learning parameters or performing the transformation.
24 | """.rstrip()
25 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/discretisers.py:
--------------------------------------------------------------------------------
1 | _return_object_docstring = """return_object: bool, default=False
2 | Whether the discrete variable should be returned as type numeric or type
3 | object. If you would like to encode the discrete variables with Feature-engine's
4 | categorical encoders, use True. Alternatively, keep the default to False.
5 | """.rstrip()
6 |
7 | _return_boundaries_docstring = """return_boundaries: bool, default=False
8 | Whether the output should be the interval boundaries. If True, it returns
9 | the interval boundaries. If False, it returns integers.
10 | """.rstrip()
11 |
12 | _precision_docstring = """precision: int, default=3
13 | The precision at which to store and display the bin labels.
14 | """.rstrip()
15 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/encoders.py:
--------------------------------------------------------------------------------
1 | _ignore_format_docstring = """ignore_format: bool, default=False
2 | This transformer operates only on variables of type object or categorical. To
3 | override this behaviour and allow the transformer to transform numerical
4 | variables as well, set to `True`.\n
5 | If `ignore_format` is `False`, the encoder will automatically select variables
6 | of type object or categorical, or check that the variables entered by the user
7 | are of type object or categorical. If `True`, the encoder will select all
8 | variables or accept all variables entered by the user, including those cast as
9 | numeric.\n
10 | In short, set to `True` when you want to encode numerical variables.
11 | """.rstrip()
12 |
13 | _unseen_docstring = """unseen: string, default='ignore'
14 | Indicates what to do when categories not present in the train set are
15 | encountered during transform. If `'raise'`, then unseen categories will raise
16 | an error. If `'ignore'`, then unseen categories will be encoded as NaN and a
17 | warning will be raised instead.
18 | """.rstrip()
19 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/outliers.py:
--------------------------------------------------------------------------------
1 | _capping_method_docstring = """capping_method: str, default='gaussian'
2 | Desired outlier detection method. Can be 'gaussian', 'iqr', 'mad',
3 | 'quantiles'. \n
4 | The transformer will find the maximum and / or minimum values beyond which a
5 | data point will be considered an outlier using:
6 | **'gaussian'**: the Gaussian approximation.
7 | **'iqr'**: the IQR proximity rule.
8 | **'quantiles'**: the percentiles.
9 | **'mad'**: the Gaussian approximation but using robust statistics.
10 | """.rstrip()
11 |
12 | _tail_docstring = """tail: str, default='right'
13 | Whether to look for outliers on the right, left or both tails of the
14 | distribution. Can take 'left', 'right' or 'both'.
15 | """.rstrip()
16 |
17 | _fold_docstring = """fold: int, float or 'auto', default='auto'.
18 | The factor used to multiply the std, MAD or IQR to calculate
19 | the maximum or minimum allowed values.
20 | When 'auto', `fold` is set based on the `capping_method`: \n
21 | - If `capping_method='quantiles'` then `'fold'` = 0.05; \n
22 | - If `capping_method='gaussian'` then `'fold'` = 3.0; \n
23 | - If `capping_method='mad'` then `'fold'` = 3.29; \n
24 | - If `capping_method='iqr'` then `'fold'` = 1.5. \n
25 | Recommended values are 2, 2.5 or 3 for the gaussian approximation,
26 | 1.5 or 3 for the IQR proximity rule and 3 or 3.5 for MAD rule. \n
27 | If `capping_method='quantiles'`, then `'fold'` indicates the percentile. So if
28 | `fold=0.05`, the limits will be the 95th and 5th percentiles. \n
29 | **Note**: When `capping_method='quantiles'`, the maximum `fold` allowed is 0.2,
30 | which will find boundaries at the 20th and 80th percentile.
31 | """.rstrip()
32 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/init_parameters/selection.py:
--------------------------------------------------------------------------------
1 | _confirm_variables_docstring = """confirm_variables: bool, default=False
2 | If set to True, variables that are not present in the input dataframe will
3 | be removed from the list of variables. Only used when passing a variable
4 | list to the parameter `variables`. See parameter variables for more details.
5 | """.rstrip()
6 |
7 | _estimator_docstring = """estimator: object
8 | A Scikit-learn estimator for regression or classification.
9 | The estimator must have either a `feature_importances_` or a `coef_`
10 | attribute after fitting.
11 | """.rstrip()
12 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/methods.py:
--------------------------------------------------------------------------------
1 | """Docstrings for the methods. They are meant to be used in the init docstrings of
2 | the transformers."""
3 |
4 | _fit_not_learn_docstring = """fit:
5 | This transformer does not learn parameters.
6 | """.rstrip()
7 |
8 | _fit_transform_docstring = """fit_transform:
9 | Fit to data, then transform it.
10 |
11 | get_feature_names_out:
12 | Get output feature names for transformation.
13 |
14 | get_params:
15 | Get parameters for this estimator.
16 |
17 | set_params:
18 | Set the parameters of this estimator.
19 | """.rstrip()
20 |
21 | _inverse_transform_docstring = """inverse_transform:
22 | Convert the data back to the original representation.
23 | """.rstrip()
24 |
25 | # used in categorical encoders
26 | _transform_encoders_docstring = """transform:
27 | Encode the categories to numbers.
28 | """.rstrip()
29 |
30 | # used in creation module
31 | _transform_creation_docstring = """transform:
32 | Create new features.
33 | """.rstrip()
34 |
35 | # used in discretisers module
36 | _fit_discretiser_docstring = """fit:
37 | Find the interval limits.
38 | """.rstrip()
39 |
40 | _transform_discretiser_docstring = """transform:
41 | Sort continuous variable values into the intervals.
42 | """.rstrip()
43 |
44 | # used in imputation module
45 | _transform_imputers_docstring = """transform:
46 | Impute missing data.
47 | """.rstrip()
48 |
--------------------------------------------------------------------------------
/feature_engine/_docstrings/selection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_docstrings/selection/__init__.py
--------------------------------------------------------------------------------
/feature_engine/_docstrings/substitute.py:
--------------------------------------------------------------------------------
1 | """Utilities for docstring in Feature-engine.
2 |
3 | Taken from the project imbalanced-learn:
4 |
5 | https://github.com/scikit-learn-contrib/imbalanced-learn/blob/
6 | imblearn/utils/_docstring.py#L7
7 | """
8 |
9 |
10 | class Substitution:
11 | """Decorate a function's or a class' docstring to perform string
12 | substitution on it.
13 | This decorator should be robust even if obj.__doc__ is None
14 | (for example, if -OO was passed to the interpreter).
15 | """
16 |
17 | def __init__(self, *args, **kwargs):
18 | if args and kwargs:
19 | raise AssertionError("Only positional or keyword args are allowed")
20 |
21 | self.params = args or kwargs
22 |
23 | def __call__(self, obj):
24 | obj.__doc__ = obj.__doc__.format(**self.params)
25 | return obj
26 |
--------------------------------------------------------------------------------
/feature_engine/_prediction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/_prediction/__init__.py
--------------------------------------------------------------------------------
/feature_engine/creation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module creation includes classes to create new variables by combination of existing
3 | variables in the dataframe.
4 | """
5 | from .cyclical_features import CyclicalFeatures
6 | from .decision_tree_features import DecisionTreeFeatures
7 | from .math_features import MathFeatures
8 | from .relative_features import RelativeFeatures
9 |
10 | __all__ = [
11 | "DecisionTreeFeatures",
12 | "MathFeatures",
13 | "RelativeFeatures",
14 | "CyclicalFeatures",
15 | ]
16 |
--------------------------------------------------------------------------------
/feature_engine/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .titanic import load_titanic
2 |
3 | __all__ = ["load_titanic"]
4 |
--------------------------------------------------------------------------------
/feature_engine/datetime/__init__.py:
--------------------------------------------------------------------------------
1 | "The module datetime computes features from dates and times."
2 |
3 | from .datetime import DatetimeFeatures
4 | from .datetime_subtraction import DatetimeSubtraction
5 |
6 | __all__ = ["DatetimeFeatures", "DatetimeSubtraction"]
7 |
--------------------------------------------------------------------------------
/feature_engine/datetime/_datetime_constants.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | FEATURES_SUPPORTED = [
4 | "month",
5 | "quarter",
6 | "semester",
7 | "year",
8 | "week",
9 | "day_of_week",
10 | "day_of_month",
11 | "day_of_year",
12 | "weekend",
13 | "month_start",
14 | "month_end",
15 | "quarter_start",
16 | "quarter_end",
17 | "year_start",
18 | "year_end",
19 | "leap_year",
20 | "days_in_month",
21 | "hour",
22 | "minute",
23 | "second",
24 | ]
25 |
26 | FEATURES_DEFAULT = [
27 | "month",
28 | "year",
29 | "day_of_week",
30 | "day_of_month",
31 | "hour",
32 | "minute",
33 | "second",
34 | ]
35 |
36 | FEATURES_SUFFIXES = {
37 | "month": "_month",
38 | "quarter": "_quarter",
39 | "semester": "_semester",
40 | "year": "_year",
41 | "week": "_week",
42 | "day_of_week": "_day_of_week",
43 | "day_of_month": "_day_of_month",
44 | "day_of_year": "_day_of_year",
45 | "weekend": "_weekend",
46 | "month_start": "_month_start",
47 | "month_end": "_month_end",
48 | "quarter_start": "_quarter_start",
49 | "quarter_end": "_quarter_end",
50 | "year_start": "_year_start",
51 | "year_end": "_year_end",
52 | "leap_year": "_leap_year",
53 | "days_in_month": "_days_in_month",
54 | "hour": "_hour",
55 | "minute": "_minute",
56 | "second": "_second",
57 | }
58 |
59 | FEATURES_FUNCTIONS = {
60 | "month": lambda x: x.dt.month,
61 | "quarter": lambda x: x.dt.quarter,
62 | "semester": lambda x: np.where(x.dt.month <= 6, 1, 2).astype(np.int64),
63 | "year": lambda x: x.dt.year,
64 | "week": lambda x: x.dt.isocalendar().week.astype(np.int64),
65 | "day_of_week": lambda x: x.dt.dayofweek,
66 | "day_of_month": lambda x: x.dt.day,
67 | "day_of_year": lambda x: x.dt.dayofyear,
68 | "weekend": lambda x: np.where(x.dt.dayofweek <= 4, 0, 1).astype(np.int64),
69 | "month_start": lambda x: x.dt.is_month_start.astype(np.int64),
70 | "month_end": lambda x: x.dt.is_month_end.astype(np.int64),
71 | "quarter_start": lambda x: x.dt.is_quarter_start.astype(np.int64),
72 | "quarter_end": lambda x: x.dt.is_quarter_end.astype(np.int64),
73 | "year_start": lambda x: x.dt.is_year_start.astype(np.int64),
74 | "year_end": lambda x: x.dt.is_year_end.astype(np.int64),
75 | "leap_year": lambda x: x.dt.is_leap_year.astype(np.int64),
76 | "days_in_month": lambda x: x.dt.days_in_month.astype(np.int64),
77 | "hour": lambda x: x.dt.hour,
78 | "minute": lambda x: x.dt.minute,
79 | "second": lambda x: x.dt.second,
80 | }
81 |
--------------------------------------------------------------------------------
/feature_engine/discretisation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module discretisation includes classes to sort continuous variables into bins or
3 | intervals.
4 | """
5 |
6 | from .arbitrary import ArbitraryDiscretiser
7 | from .decision_tree import DecisionTreeDiscretiser
8 | from .equal_frequency import EqualFrequencyDiscretiser
9 | from .equal_width import EqualWidthDiscretiser
10 | from .geometric_width import GeometricWidthDiscretiser
11 |
12 | __all__ = [
13 | "DecisionTreeDiscretiser",
14 | "EqualFrequencyDiscretiser",
15 | "EqualWidthDiscretiser",
16 | "ArbitraryDiscretiser",
17 | "GeometricWidthDiscretiser",
18 | ]
19 |
--------------------------------------------------------------------------------
/feature_engine/encoding/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module encoding includes classes to transform categorical variables into numerical.
3 | """
4 |
5 | from .count_frequency import CountFrequencyEncoder
6 | from .decision_tree import DecisionTreeEncoder
7 | from .mean_encoding import MeanEncoder
8 | from .one_hot import OneHotEncoder
9 | from .ordinal import OrdinalEncoder
10 | from .rare_label import RareLabelEncoder
11 | from .similarity_encoder import StringSimilarityEncoder
12 | from .woe import WoEEncoder
13 |
14 | __all__ = [
15 | "CountFrequencyEncoder",
16 | "DecisionTreeEncoder",
17 | "MeanEncoder",
18 | "OneHotEncoder",
19 | "OrdinalEncoder",
20 | "RareLabelEncoder",
21 | "StringSimilarityEncoder",
22 | "WoEEncoder",
23 | ]
24 |
--------------------------------------------------------------------------------
/feature_engine/encoding/_helper_functions.py:
--------------------------------------------------------------------------------
1 | def check_parameter_unseen(unseen, accepted_values):
2 | if not isinstance(accepted_values, list) or not all(
3 | isinstance(item, str) for item in accepted_values
4 | ):
5 | raise ValueError(
6 | "accepted_values should be a list of strings. "
7 | f" Got {accepted_values} instead."
8 | )
9 | if unseen not in accepted_values:
10 | raise ValueError(
11 | f"Parameter `unseen` takes only values {', '.join(accepted_values)}."
12 | f" Got {unseen} instead."
13 | )
14 |
--------------------------------------------------------------------------------
/feature_engine/imputation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module imputation includes classes to perform missing data imputation
3 | """
4 |
5 | from .arbitrary_number import ArbitraryNumberImputer
6 | from .categorical import CategoricalImputer
7 | from .drop_missing_data import DropMissingData
8 | from .end_tail import EndTailImputer
9 | from .mean_median import MeanMedianImputer
10 | from .missing_indicator import AddMissingIndicator
11 | from .random_sample import RandomSampleImputer
12 |
13 | __all__ = [
14 | "MeanMedianImputer",
15 | "ArbitraryNumberImputer",
16 | "CategoricalImputer",
17 | "EndTailImputer",
18 | "AddMissingIndicator",
19 | "RandomSampleImputer",
20 | "DropMissingData",
21 | ]
22 |
--------------------------------------------------------------------------------
/feature_engine/outliers/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module outliers includes classes to remove or cap outliers.
3 | """
4 |
5 | from .artbitrary import ArbitraryOutlierCapper
6 | from .trimmer import OutlierTrimmer
7 | from .winsorizer import Winsorizer
8 |
9 | __all__ = ["Winsorizer", "ArbitraryOutlierCapper", "OutlierTrimmer"]
10 |
--------------------------------------------------------------------------------
/feature_engine/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline import Pipeline, make_pipeline
2 |
3 | __all__ = ["Pipeline", "make_pipeline"]
4 |
--------------------------------------------------------------------------------
/feature_engine/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module preprocessing includes classes and functions for general data pre-processing
3 | and transformation.
4 | """
5 |
6 | from .match_categories import MatchCategories
7 | from .match_columns import MatchVariables
8 |
9 | __all__ = [
10 | "MatchCategories",
11 | "MatchVariables",
12 | ]
13 |
--------------------------------------------------------------------------------
/feature_engine/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/py.typed
--------------------------------------------------------------------------------
/feature_engine/scaling/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module scaling includes classes to transform variables using various
3 | scaling methods.
4 | """
5 |
6 | from .mean_normalization import MeanNormalizationScaler
7 |
8 | __all__ = [
9 | "MeanNormalizationScaler",
10 | ]
11 |
--------------------------------------------------------------------------------
/feature_engine/selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module selection includes classes to select features or remove unwanted features.
3 | """
4 | from .drop_constant_features import DropConstantFeatures
5 | from .drop_correlated_features import DropCorrelatedFeatures
6 | from .drop_duplicate_features import DropDuplicateFeatures
7 | from .drop_features import DropFeatures
8 | from .drop_psi_features import DropHighPSIFeatures
9 | from .information_value import SelectByInformationValue
10 | from .probe_feature_selection import ProbeFeatureSelection
11 | from .recursive_feature_addition import RecursiveFeatureAddition
12 | from .recursive_feature_elimination import RecursiveFeatureElimination
13 | from .shuffle_features import SelectByShuffling
14 | from .single_feature_performance import SelectBySingleFeaturePerformance
15 | from .smart_correlation_selection import SmartCorrelatedSelection
16 | from .target_mean_selection import SelectByTargetMeanPerformance
17 | from .mrmr import MRMR
18 |
19 | __all__ = [
20 | "DropFeatures",
21 | "DropConstantFeatures",
22 | "DropDuplicateFeatures",
23 | "DropCorrelatedFeatures",
24 | "DropHighPSIFeatures",
25 | "SmartCorrelatedSelection",
26 | "SelectByShuffling",
27 | "SelectBySingleFeaturePerformance",
28 | "RecursiveFeatureAddition",
29 | "RecursiveFeatureElimination",
30 | "SelectByTargetMeanPerformance",
31 | "SelectByInformationValue",
32 | "ProbeFeatureSelection",
33 | "MRMR",
34 | ]
35 |
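A minimal sketch of how a couple of these selectors compose; the toy dataframe is made up and both transformers are used with what are assumed to be their default parameters:

    import pandas as pd
    from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures

    X = pd.DataFrame({
        "const": [1, 1, 1, 1],   # no variation, carries no information
        "a": [1, 2, 3, 4],
        "a_copy": [1, 2, 3, 4],  # exact duplicate of "a"
    })

    X_t = DropConstantFeatures().fit_transform(X)   # drops "const"
    X_t = DropDuplicateFeatures().fit_transform(X_t)  # drops "a_copy", only "a" remains
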
--------------------------------------------------------------------------------
/feature_engine/selection/_selection_constants.py:
--------------------------------------------------------------------------------
1 | _CLASSIFICATION_METRICS = [
2 | "accuracy",
3 | "balanced_accuracy",
4 | "top_k_accuracy",
5 | "average_precision",
6 | "neg_brier_score",
7 | "f1",
8 | "f1_micro",
9 | "f1_macro",
10 | "f1_weighted",
11 | "f1_samples",
12 | "neg_log_loss",
13 | "precision",
14 | "precision_micro",
15 | "precision_macro",
16 | "precision_weighted",
17 | "precision_samples",
18 | "recall",
19 | "recall_micro",
20 | "recall_macro",
21 | "recall_weighted",
22 | "recall_samples",
23 | "jaccard",
24 | "jaccard_micro",
25 | "jaccard_macro",
26 | "jaccard_weighted",
27 | "jaccard_samples",
28 | "roc_auc",
29 | "roc_auc_ovr",
30 | "roc_auc_ovo",
31 | "roc_auc_ovr_weighted",
32 | "roc_auc_ovo_weighted",
33 | ]
34 |
35 | _REGRESSION_METRICS = [
36 | "explained_variance",
37 | "r2",
38 | "max_error",
39 | "neg_median_absolute_error",
40 | "neg_mean_absolute_error",
41 | "neg_mean_absolute_percentage_error",
42 | "neg_mean_squared_error",
43 | "neg_mean_squared_log_error",
44 | "neg_root_mean_squared_error",
45 | "neg_mean_poisson_deviance",
46 | "neg_mean_gamma_deviance",
47 | ]
48 |
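These constants appear to mirror scikit-learn's scoring-string names, so each entry should resolve through sklearn's scorer registry. A small sketch of that assumption:

    from sklearn.metrics import get_scorer, get_scorer_names

    # every name listed above is expected to be a registered sklearn scorer
    assert "roc_auc" in get_scorer_names()
    assert "neg_mean_squared_error" in get_scorer_names()

    scorer = get_scorer("roc_auc")  # ready to pass to cross_validate and friends
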
--------------------------------------------------------------------------------
/feature_engine/tags.py:
--------------------------------------------------------------------------------
1 | import sklearn
2 | from sklearn.utils.fixes import parse_version
3 |
4 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)
5 |
6 |
7 | def _return_tags():
8 | tags = {
9 | "preserves_dtype": [],
10 | "_xfail_checks": {
11 |             # Complex data, in mathematical terms, are values like 4i (imaginary
12 |             # numbers). We have not come across such values in the dataframes we
13 |             # work with, so this test is not needed.
14 | "check_complex_data": "Test not needed.",
15 | # check that estimators treat dtype object as numeric if possible
16 | "check_dtype_object": "Feature-engine transformers use dtypes to select "
17 |             "between numerical and categorical variables. Feature-engine trusts "
18 |             "that the user casts the variables appropriately.",
19 | # Test fails because FE does not like the sklearn class _NotAnArray
20 | # The test aims to check that the check_X_y function from sklearn is
21 | # working, but we do not use that check, because we work with dfs.
22 | "check_transformer_data_not_an_array": "Ok to fail",
23 | "check_sample_weights_not_an_array": "Ok to fail",
24 | # TODO: we probably need the test below!!
25 | "check_methods_sample_order_invariance": "Test does not work on dataframes",
26 | # TODO: we probably need the test below!!
27 | # the test below tests that a second fit overrides a first fit.
28 | # the problem is that the test does not work with pandas df.
29 | "check_fit_idempotent": "Test does not work on dataframes.",
30 | "check_fit2d_predict1d": "Test not relevant, Feature-engine transformers "
31 | "only work with dataframes.",
32 | },
33 | }
34 |
35 | if sklearn_version > parse_version("1.6"):
36 |         msg1 = "against Feature-engine's design."
37 | msg2 = "Our transformers do not preserve dtype."
38 | all_fail = {
39 | "check_do_not_raise_errors_in_init_or_set_params": msg1,
40 | "check_transformer_preserve_dtypes": msg2,
41 | # TODO: investigate this test further.
42 | "check_n_features_in_after_fitting": "not sure why it fails, we do check.",
43 | }
44 | tags["_xfail_checks"].update(all_fail) # type: ignore
45 | return tags
46 |
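The dictionary built here is consumed by the transformers' _more_tags methods; the test scripts later in this listing read estimator._more_tags()["_xfail_checks"]. A hypothetical, minimal consumer to illustrate the pattern (IdentityTransformer is made up for illustration):

    from sklearn.base import BaseEstimator, TransformerMixin

    from feature_engine.tags import _return_tags


    class IdentityTransformer(TransformerMixin, BaseEstimator):
        """Illustrative only: returns X unchanged but exposes the shared tags."""

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X

        def _more_tags(self):
            # expose the library-wide xfail checks as this estimator's extra tags
            return _return_tags()
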
--------------------------------------------------------------------------------
/feature_engine/timeseries/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/feature_engine/timeseries/__init__.py
--------------------------------------------------------------------------------
/feature_engine/timeseries/forecasting/__init__.py:
--------------------------------------------------------------------------------
1 | """ Transformers that create features for time-series forecasting."""
2 |
3 | from .expanding_window_features import ExpandingWindowFeatures
4 | from .lag_features import LagFeatures
5 | from .window_features import WindowFeatures
6 |
7 | __all__ = ["LagFeatures", "WindowFeatures", "ExpandingWindowFeatures"]
8 |
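A minimal sketch of the three forecasting transformers on a toy series. The index, column name and values are made up, and the output column names are indicative only:

    import pandas as pd
    from feature_engine.timeseries.forecasting import (
        ExpandingWindowFeatures,
        LagFeatures,
        WindowFeatures,
    )

    ts = pd.DataFrame(
        {"sales": [10, 12, 11, 13, 15]},
        index=pd.date_range("2024-01-01", periods=5, freq="D"),
    )

    lags = LagFeatures(periods=[1, 2]).fit_transform(ts)                 # e.g. sales_lag_1
    wins = WindowFeatures(window=3, functions="mean").fit_transform(ts)  # rolling mean
    expd = ExpandingWindowFeatures(functions="mean").fit_transform(ts)   # expanding mean
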
--------------------------------------------------------------------------------
/feature_engine/transformation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module transformation includes classes to transform variables using mathematical
3 | functions.
4 | """
5 |
6 | from .arcsin import ArcsinTransformer
7 | from .boxcox import BoxCoxTransformer
8 | from .log import LogCpTransformer, LogTransformer
9 | from .power import PowerTransformer
10 | from .reciprocal import ReciprocalTransformer
11 | from .yeojohnson import YeoJohnsonTransformer
12 |
13 | __all__ = [
14 | "BoxCoxTransformer",
15 | "LogTransformer",
16 | "LogCpTransformer",
17 | "PowerTransformer",
18 | "ReciprocalTransformer",
19 | "YeoJohnsonTransformer",
20 | "ArcsinTransformer",
21 | ]
22 |
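A minimal sketch contrasting two of these variance-stabilising transformers; the toy data is made up. LogTransformer needs strictly positive values, while LogCpTransformer shifts by a constant first:

    import pandas as pd
    from feature_engine.transformation import LogCpTransformer, LogTransformer

    X = pd.DataFrame({"income": [10.0, 100.0, 1000.0], "balance": [0.0, 5.0, 50.0]})

    # works because "income" is strictly positive
    X_log = LogTransformer(variables=["income"]).fit_transform(X)

    # "balance" contains zeros, so add a constant C before taking the log
    X_logcp = LogCpTransformer(variables=["balance"], C=1).fit_transform(X)
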
--------------------------------------------------------------------------------
/feature_engine/variable_handling/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module variable handling includes functions to select variables of a certain type
3 | or check that a list of variables is of a certain type.
4 | """
5 |
6 | from .check_variables import (
7 | check_all_variables,
8 | check_categorical_variables,
9 | check_datetime_variables,
10 | check_numerical_variables,
11 | )
12 | from .find_variables import (
13 | find_all_variables,
14 | find_categorical_and_numerical_variables,
15 | find_categorical_variables,
16 | find_datetime_variables,
17 | find_numerical_variables,
18 | )
19 | from .retain_variables import retain_variables_if_in_df
20 |
21 | __all__ = [
22 | "check_all_variables",
23 | "check_numerical_variables",
24 | "check_categorical_variables",
25 | "check_datetime_variables",
26 | "find_all_variables",
27 | "find_numerical_variables",
28 | "find_categorical_variables",
29 | "find_datetime_variables",
30 | "find_categorical_and_numerical_variables",
31 | "retain_variables_if_in_df",
32 | ]
33 |
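A minimal sketch of the find_*/check_* split described in the docstring: find_* discovers variables by dtype, check_* validates a user-provided list. The toy dataframe is made up:

    import pandas as pd
    from feature_engine.variable_handling import (
        check_numerical_variables,
        find_categorical_variables,
        find_numerical_variables,
    )

    X = pd.DataFrame({
        "num": [1.5, 2.5, 3.5],
        "cat": ["a", "b", "c"],
        "date": pd.date_range("2020-01-01", periods=3),
    })

    find_numerical_variables(X)            # ['num']
    find_categorical_variables(X)          # ['cat']
    check_numerical_variables(X, ["num"])  # ['num'], raises if any is not numerical
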
--------------------------------------------------------------------------------
/feature_engine/variable_handling/_variable_type_checks.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import pandas as pd
4 | from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
5 | from pandas.core.dtypes.common import is_numeric_dtype as is_numeric
6 | from pandas.core.dtypes.common import is_object_dtype as is_object
7 |
8 |
9 | def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
10 | # check for datetime only if object cannot be cast as numeric because
11 | # if it could pd.to_datetime would convert it to datetime regardless
12 | if is_object(column):
13 | is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)
14 |
15 | # check for datetime only if the type of the categories is not numeric
16 | # because pd.to_datetime throws an error when it is an integer
17 | elif isinstance(column.dtype, pd.CategoricalDtype):
18 | is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column)
19 |
20 | return is_cat
21 |
22 |
23 | def _is_categories_num(column: pd.Series) -> bool:
24 | return is_numeric(column.dtype.categories)
25 |
26 |
27 | def _is_convertible_to_dt(column: pd.Series) -> bool:
28 | with warnings.catch_warnings():
29 | warnings.simplefilter("ignore")
30 | return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
31 |
32 |
33 | def _is_convertible_to_num(column: pd.Series) -> bool:
34 | try:
35 | ser = pd.to_numeric(column)
36 | except (ValueError, TypeError):
37 | ser = column
38 | return is_numeric(ser)
39 |
40 |
41 | def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
42 | # check for datetime only if object cannot be cast as numeric because
43 | # if it could pd.to_datetime would convert it to datetime regardless
44 | if is_object(column):
45 | is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column)
46 |
47 | # check for datetime only if the type of the categories is not numeric
48 | # because pd.to_datetime throws an error when it is an integer
49 | elif isinstance(column.dtype, pd.CategoricalDtype):
50 | is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column)
51 |
52 | return is_dt
53 |
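To make the comments above concrete, a small sketch of how the two helpers split object columns into plain categories versus date-like strings. Illustration only: these are private helpers and the toy series are made up:

    import pandas as pd

    from feature_engine.variable_handling._variable_type_checks import (
        _is_categorical_and_is_datetime,
        _is_categorical_and_is_not_datetime,
    )

    strings = pd.Series(["a", "b", "c"])                            # plain categories
    dates = pd.Series(["2020-01-01", "2020-02-01", "2020-03-01"])   # date-like strings

    _is_categorical_and_is_not_datetime(strings)  # True
    _is_categorical_and_is_datetime(dates)        # True
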
--------------------------------------------------------------------------------
/feature_engine/variable_handling/dtypes.py:
--------------------------------------------------------------------------------
1 | DATETIME_TYPES = ("datetimetz", "datetime")
2 |
--------------------------------------------------------------------------------
/feature_engine/variable_handling/retain_variables.py:
--------------------------------------------------------------------------------
1 | """Functions to remove variables from a list."""
2 |
3 | from typing import List, Union
4 |
5 | Variables = Union[int, str, List[Union[str, int]]]
6 |
7 |
8 | def retain_variables_if_in_df(X, variables):
9 | """Returns the subset of variables in the list that are present in the dataframe.
10 |
11 | More details in the :ref:`User Guide `.
12 |
13 | Parameters
14 | ----------
15 | X: pandas dataframe of shape = [n_samples, n_features]
16 | The dataset.
17 |
18 | variables: string, int or list of strings or int.
19 | The names of the variables to check.
20 |
21 | Returns
22 | -------
23 | variables_in_df: List.
24 |         The subset of `variables` that is present in `X`.
25 |
26 | Examples
27 | --------
28 | >>> import pandas as pd
29 | >>> from feature_engine.variable_handling import retain_variables_if_in_df
30 | >>> X = pd.DataFrame({
31 | >>> "var_num": [1, 2, 3],
32 | >>> "var_cat": ["A", "B", "C"],
33 | >>> "var_date": pd.date_range("2020-02-24", periods=3, freq="T")
34 | >>> })
35 | >>> vars_in_df = retain_variables_if_in_df(X, ['var_num', 'var_cat', 'var_other'])
36 | >>> vars_in_df
37 | ['var_num', 'var_cat']
38 | """
39 | if isinstance(variables, (str, int)):
40 | variables = [variables]
41 |
42 | variables_in_df = [var for var in variables if var in X.columns]
43 |
44 | # Raise an error if no column is left to work with.
45 | if len(variables_in_df) == 0:
46 | raise ValueError(
47 | "None of the variables in the list are present in the dataframe."
48 | )
49 |
50 | return variables_in_df
51 |
--------------------------------------------------------------------------------
/feature_engine/wrappers/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The module wrappers includes classes to wrap Scikit-learn transformers so that they
3 | can be applied to a selected subset of features and return a dataframe.
4 | """
5 |
6 | from .wrappers import SklearnTransformerWrapper
7 |
8 | __all__ = ["SklearnTransformerWrapper"]
9 |
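A minimal sketch of the wrapper described in the docstring, applying a scikit-learn transformer to a subset of columns while keeping the dataframe format; the toy data is made up and the parameter names are assumed from the public docs:

    import pandas as pd
    from sklearn.preprocessing import StandardScaler

    from feature_engine.wrappers import SklearnTransformerWrapper

    X = pd.DataFrame({
        "a": [1.0, 2.0, 3.0],
        "b": [10.0, 20.0, 30.0],
        "c": ["x", "y", "z"],   # untouched by the scaler
    })

    wrapper = SklearnTransformerWrapper(transformer=StandardScaler(), variables=["a", "b"])
    X_t = wrapper.fit_transform(X)  # still a pandas DataFrame, "c" passes through
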
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | warn_unused_ignores = True
3 | follow_imports = skip
4 | show_error_context = True
5 | warn_incomplete_stub = True
6 | ignore_missing_imports = True
7 | check_untyped_defs = True
8 | cache_dir = /dev/null
9 | warn_redundant_casts = True
10 | warn_unused_configs = True
11 | strict_optional = True
12 |
13 | exclude = (?x)(
14 |     mixins\.py$  # i.e. files ending with "mixins.py"
15 | )
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | # pytest.ini
2 |
3 | [pytest]
4 | filterwarnings =
5 | ignore::sklearn.exceptions.SkipTestWarning
6 | ignore::UserWarning
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.18.2
2 | pandas>=2.2.0
3 | scikit-learn>=1.4.0
4 | scipy>=1.4.1
5 | statsmodels>=0.11.1
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from setuptools import find_packages, setup
4 |
5 | # Package meta-data.
6 | NAME = "feature_engine"
7 | DESCRIPTION = "Feature engineering and selection package with Scikit-learn's fit transform functionality"
8 | URL = "http://github.com/feature-engine/feature_engine"
9 | EMAIL = "solegalli@protonmail.com"
10 | AUTHOR = "Soledad Galli"
11 | REQUIRES_PYTHON = ">=3.9.0"
12 |
13 | # description
14 | with open("README.md", "r") as fh:
15 | long_description = fh.read()
16 |
17 |
18 | # Packages required for this module to be executed
19 | def list_reqs(fname='requirements.txt'):
20 | with open(fname) as fd:
21 | return fd.read().splitlines()
22 |
23 |
24 | # Load the package's VERSION file as a dictionary.
25 | about = {}
26 | ROOT_DIR = Path(__file__).resolve().parent
27 | PACKAGE_DIR = ROOT_DIR / 'feature_engine'
28 | with open(PACKAGE_DIR / "VERSION") as f:
29 | _version = f.read().strip()
30 | about["__version__"] = _version
31 |
32 | setup(name=NAME,
33 | version=about["__version__"],
34 | description=DESCRIPTION,
35 | long_description=long_description,
36 | long_description_content_type="text/markdown",
37 | url=URL,
38 | author=AUTHOR,
39 | author_email=EMAIL,
40 | python_requires=REQUIRES_PYTHON,
41 | packages=find_packages(exclude=("tests",)),
42 | package_data={"feature_engine": ["VERSION", "py.typed"]},
43 | license='BSD 3 clause',
44 | install_requires=list_reqs(),
45 | include_package_data=True,
46 | classifiers=[
47 | # Trove classifiers
48 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
49 | "License :: OSI Approved :: BSD License",
50 | "Programming Language :: Python :: 3.9",
51 | "Programming Language :: Python :: 3.10",
52 | "Programming Language :: Python :: 3.11",
53 | "Programming Language :: Python :: 3.12",
54 | ],
55 | zip_safe=False)
56 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | pytest>=5.4.1
3 |
4 | # repo maintenance tooling
5 | black>=21.5b1
6 | coverage>=6.4.4
7 | flake8>=3.9.2
8 | isort>=5.8.0
9 | mypy>=0.740
10 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/estimator_checks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/estimator_checks/__init__.py
--------------------------------------------------------------------------------
/tests/estimator_checks/dataframe_for_checks.py:
--------------------------------------------------------------------------------
1 | """Dataframe used as input by many estimator checks."""
2 |
3 | from typing import Tuple
4 |
5 | import pandas as pd
6 | from sklearn.datasets import make_classification
7 |
8 |
9 | def test_df(
10 | categorical: bool = False, datetime: bool = False
11 | ) -> Tuple[pd.DataFrame, pd.Series]:
12 | """
13 |     Creates a dataframe with numerical features and, optionally, additional
14 |     categorical and datetime features.
15 |
16 | Parameters
17 | ----------
18 | categorical: bool, default=False
19 | Whether to add 2 additional categorical features.
20 |
21 | datetime: bool, default=False
22 | Whether to add one additional datetime feature.
23 |
24 | Returns
25 | -------
26 | X: pd.DataFrame
27 | A pandas dataframe.
28 | """
29 | X, y = make_classification(
30 | n_samples=1000,
31 | n_features=12,
32 | n_redundant=4,
33 | n_clusters_per_class=1,
34 | weights=[0.50],
35 | class_sep=2,
36 | random_state=1,
37 | )
38 |
39 | # transform arrays into pandas df and series
40 | colnames = [f"var_{i}" for i in range(12)]
41 | X = pd.DataFrame(X, columns=colnames)
42 | y = pd.Series(y)
43 |
44 | if categorical is True:
45 | X["cat_var1"] = ["A"] * 1000
46 | X["cat_var2"] = ["B"] * 1000
47 |
48 | if datetime is True:
49 | X["date1"] = pd.date_range("2020-02-24", periods=1000, freq="min")
50 | X["date2"] = pd.date_range("2021-09-29", periods=1000, freq="h")
51 |
52 | return X, y
53 |
--------------------------------------------------------------------------------
/tests/estimator_checks/fit_functionality_checks.py:
--------------------------------------------------------------------------------
1 | """Checks functionality in the fit method shared by all transformers."""
2 |
3 | import pytest
4 | from sklearn import clone
5 |
6 | from tests.estimator_checks.dataframe_for_checks import test_df
7 |
8 |
9 | def check_feature_names_in(estimator):
10 | """Checks that transformers learn the variable names of the train set used
11 | during fit, as well as the number of variables.
12 |
13 | Should be applied to all transformers.
14 | """
15 | # the estimator learns the parameters from the train set
16 | X, y = test_df(categorical=True, datetime=True)
17 | varnames = list(X.columns)
18 | estimator = clone(estimator)
19 | estimator.fit(X, y)
20 | assert estimator.feature_names_in_ == varnames
21 | assert estimator.n_features_in_ == len(varnames)
22 |
23 |
24 | def check_error_if_y_not_passed(estimator):
25 | """
26 |     Checks that the transformer raises an error when y is not passed during fit.
27 |     The functionality is provided by Python when the parameter is made mandatory.
28 |
29 | For this test to run, we need to add the tag 'requires_y' to the transformer.
30 | """
31 | X, y = test_df()
32 | estimator = clone(estimator)
33 | with pytest.raises(TypeError):
34 | estimator.fit(X)
35 |
--------------------------------------------------------------------------------
/tests/estimator_checks/init_params_allowed_values_checks.py:
--------------------------------------------------------------------------------
1 | """Many transformers have similar init parameters which take the same input values.
2 | In this script, we add tests for the allowed values for those parameters.
3 | """
4 | import pytest
5 | from sklearn import clone
6 |
7 |
8 | def check_error_param_missing_values(estimator):
9 | """
10 |     Only for transformers with a parameter `missing_values` in init.
11 |
12 |     Checks that the transformer raises an error when the user passes a
13 |     non-permitted value to the parameter.
14 | """
15 | # param takes values "raise" or "ignore"
16 | estimator = clone(estimator)
17 | for value in [2, "hola", False]:
18 | if estimator.__class__.__name__ == "MathFeatures":
19 | with pytest.raises(ValueError):
20 | estimator.__class__(
21 | variables=["var_1", "var_2", "var_3"],
22 | func="mean",
23 | missing_values=value,
24 | )
25 |
26 | elif estimator.__class__.__name__ == "RelativeFeatures":
27 | with pytest.raises(ValueError):
28 | estimator.__class__(
29 | variables=["var_1", "var_2", "var_3"],
30 | reference=["var_4"],
31 | func="mean",
32 | missing_values=value,
33 | )
34 | else:
35 | with pytest.raises(ValueError):
36 | estimator.__class__(missing_values=value)
37 |
38 |
39 | def check_error_param_confirm_variables(estimator):
40 | """
41 |     Only for transformers with a parameter `confirm_variables` in init.
42 |
43 |     Checks that the transformer raises an error when the user passes a
44 |     non-permitted value to the parameter.
45 | """
46 | # param takes values True or False
47 | estimator = clone(estimator)
48 | for value in [2, "hola", [True]]:
49 | msg = (
50 | f"confirm_variables takes only values True and False. Got {value} instead."
51 | )
52 | with pytest.raises(ValueError) as record:
53 | estimator.__class__(confirm_variables=value)
54 | assert record.value.args[0] == msg
55 |
--------------------------------------------------------------------------------
/tests/estimator_checks/non_fitted_error_checks.py:
--------------------------------------------------------------------------------
1 | """Checks functionality in the transform method shared by all transformers."""
2 |
3 | import pytest
4 | from sklearn import clone
5 | from sklearn.exceptions import NotFittedError
6 |
7 | from tests.estimator_checks.dataframe_for_checks import test_df
8 |
9 |
10 | def check_raises_non_fitted_error(estimator):
11 | """
12 | Check if transformer raises error when transform() method is called before
13 | calling fit() method.
14 |
15 | The functionality is provided by sklearn's `check_is_fitted` function.
16 | """
17 | X, y = test_df()
18 | transformer = clone(estimator)
19 | # Test when fit is not called prior to transform.
20 | with pytest.raises(NotFittedError):
21 | transformer.transform(X)
22 |
--------------------------------------------------------------------------------
/tests/parametrize_with_checks_creation_v16.py:
--------------------------------------------------------------------------------
1 | """
2 | File intended to help understand check_estimator tests for the module creation of
3 | Feature-engine. It is not run as part of the battery of acceptance tests. Works from
4 | sklearn > 1.6.
5 | """
6 |
7 | from sklearn.utils.estimator_checks import parametrize_with_checks
8 |
9 | from feature_engine.creation import (
10 | CyclicalFeatures,
11 | DecisionTreeFeatures,
12 | MathFeatures,
13 | RelativeFeatures,
14 | )
15 |
16 | dtf = DecisionTreeFeatures(regression=False)
17 | cf = CyclicalFeatures()
18 | mf = MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore")
19 | rf = RelativeFeatures(
20 | variables=["x0", "x1"],
21 | reference=["x0"],
22 | func=["add"],
23 | missing_values="ignore",
24 | )
25 |
26 | EXPECTED_FAILED_CHECKS = {
27 | "DecisionTreeFeatures": dtf._more_tags()["_xfail_checks"],
28 | "CyclicalFeatures": cf._more_tags()["_xfail_checks"],
29 | "MathFeatures": mf._more_tags()["_xfail_checks"],
30 | "RelativeFeatures": rf._more_tags()["_xfail_checks"],
31 | }
32 |
33 |
34 | # creation
35 | @parametrize_with_checks(
36 | estimators=[dtf, cf, mf, rf],
37 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get(
38 | est.__class__.__name__, {}
39 | ),
40 | )
41 | def test_sklearn_compatible_creator(estimator, check):
42 | check(estimator)
43 |
--------------------------------------------------------------------------------
/tests/parametrize_with_checks_discretization_v16.py:
--------------------------------------------------------------------------------
1 | """
2 | File intended to help understand check_estimator tests for Feature-engine's
3 | discretization module. It is not run as part of the battery of acceptance tests.
4 | Works from sklearn > 1.6.
5 | """
6 |
7 | import numpy as np
8 | from sklearn.utils.estimator_checks import parametrize_with_checks
9 |
10 | from feature_engine.discretisation import (
11 | ArbitraryDiscretiser,
12 | DecisionTreeDiscretiser,
13 | EqualFrequencyDiscretiser,
14 | EqualWidthDiscretiser,
15 | GeometricWidthDiscretiser,
16 | )
17 |
18 | dtd = DecisionTreeDiscretiser(regression=False)
19 | efd = EqualFrequencyDiscretiser()
20 | ewd = EqualWidthDiscretiser()
21 | ad = ArbitraryDiscretiser(binning_dict={"x0": [-np.inf, 0, np.inf]})
22 | gd = GeometricWidthDiscretiser()
23 |
24 | EXPECTED_FAILED_CHECKS = {
25 | "DecisionTreeDiscretiser": dtd._more_tags()["_xfail_checks"],
26 | "EqualFrequencyDiscretiser": efd._more_tags()["_xfail_checks"],
27 | "EqualWidthDiscretiser": ewd._more_tags()["_xfail_checks"],
28 | "ArbitraryDiscretiser": ad._more_tags()["_xfail_checks"],
29 | "GeometricWidthDiscretiser": gd._more_tags()["_xfail_checks"],
30 | }
31 |
32 |
33 | # discretization
34 | @parametrize_with_checks(
35 | estimators=[dtd, efd, ewd, ad, gd],
36 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get(
37 | est.__class__.__name__, {}
38 | ),
39 | )
40 | def test_sklearn_compatible_creator(estimator, check):
41 | check(estimator)
42 |
--------------------------------------------------------------------------------
/tests/parametrize_with_checks_encoders_v16.py:
--------------------------------------------------------------------------------
1 | """
2 | File intended to help understand check_estimator tests for Feature-engine's
3 | encoding module. It is not run as part of the battery of acceptance tests.
4 | Works from sklearn > 1.6.
5 | """
6 |
7 | from sklearn.utils.estimator_checks import parametrize_with_checks
8 |
9 | from feature_engine.encoding import (
10 | CountFrequencyEncoder,
11 | MeanEncoder,
12 | OneHotEncoder,
13 | OrdinalEncoder,
14 | RareLabelEncoder,
15 | StringSimilarityEncoder,
16 | WoEEncoder,
17 | )
18 | from feature_engine.tags import _return_tags
19 |
20 | ce = CountFrequencyEncoder(ignore_format=True)
21 | me = MeanEncoder(ignore_format=True)
22 | ohe = OneHotEncoder(ignore_format=True)
23 | oe = OrdinalEncoder(ignore_format=True)
24 | re = RareLabelEncoder(
25 | tol=0.00000000001,
26 | n_categories=100000000000,
27 | replace_with=10,
28 | ignore_format=True,
29 | )
30 | woe = WoEEncoder(ignore_format=True)
31 | sse = StringSimilarityEncoder(ignore_format=True)
32 |
33 | FAILED_CHECKS = _return_tags()["_xfail_checks"]
34 | FAILED_CHECKS.update({"check_estimators_nan_inf": "transformer allows NA"})
35 |
36 | EXPECTED_FAILED_CHECKS = {
37 | "CountFrequencyEncoder": FAILED_CHECKS,
38 | "MeanEncoder": FAILED_CHECKS,
39 | "OneHotEncoder": FAILED_CHECKS,
40 | "OrdinalEncoder": FAILED_CHECKS,
41 | "RareLabelEncoder": FAILED_CHECKS,
42 | "StringSimilarityEncoder": FAILED_CHECKS,
43 | }
44 |
45 |
46 | # encoding
47 | @parametrize_with_checks(
48 | estimators=[ce, me, ohe, oe, re, woe, sse],
49 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get(
50 | est.__class__.__name__, {}
51 | ),
52 | )
53 | def test_sklearn_compatible_creator(estimator, check):
54 | check(estimator)
55 |
--------------------------------------------------------------------------------
/tests/parametrize_with_checks_outliers_v16.py:
--------------------------------------------------------------------------------
1 | """
2 | File intended to help understand check_estimator tests for Feature-engine's
3 | outliers module. It is not run as part of the battery of acceptance tests.
4 | Works from sklearn > 1.6.
5 | """
6 |
7 | from sklearn.utils.estimator_checks import parametrize_with_checks
8 |
9 | from feature_engine.outliers import ArbitraryOutlierCapper, OutlierTrimmer, Winsorizer
10 | from feature_engine.tags import _return_tags
11 |
12 | aoc = ArbitraryOutlierCapper(max_capping_dict={"x0": 10})
13 | ot = OutlierTrimmer()
14 | wz = Winsorizer()
15 |
16 | FAILED_CHECKS = _return_tags()["_xfail_checks"]
17 | FAILED_CHECKS_AOC = _return_tags()["_xfail_checks"]
18 |
19 | msg1 = "transformers raise errors when data variation is low, " "thus this check fails"
20 |
21 | msg2 = "transformer has 1 mandatory parameter"
22 |
23 | FAILED_CHECKS.update({"check_fit2d_1sample": msg1})
24 | FAILED_CHECKS_AOC.update(
25 | {
26 | "check_fit2d_1sample": msg1,
27 | "check_parameters_default_constructible": msg2,
28 | }
29 | )
30 |
31 | EXPECTED_FAILED_CHECKS = {
32 | "ArbitraryOutlierCapper": FAILED_CHECKS_AOC,
33 | "OutlierTrimmer": FAILED_CHECKS,
34 | "Winsorizer": FAILED_CHECKS,
35 | }
36 |
37 |
38 | # outliers
39 | @parametrize_with_checks(
40 | estimators=[aoc, ot, wz],
41 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get(
42 | est.__class__.__name__, {}
43 | ),
44 | )
45 | def test_sklearn_compatible_creator(estimator, check):
46 | check(estimator)
47 |
--------------------------------------------------------------------------------
/tests/parametrize_with_checks_prediction_v16.py:
--------------------------------------------------------------------------------
1 | """
2 | File intended to help understand check_estimator tests for Feature-engine's
3 | prediction module. It is not run as part of the battery of acceptance tests.
4 | Works from sklearn > 1.6.
5 | """
6 |
7 | from sklearn.utils.estimator_checks import parametrize_with_checks
8 |
9 | from feature_engine._prediction.base_predictor import BaseTargetMeanEstimator
10 | from feature_engine._prediction.target_mean_classifier import TargetMeanClassifier
11 | from feature_engine._prediction.target_mean_regressor import TargetMeanRegressor
12 | from feature_engine.tags import _return_tags
13 |
14 | _estimators = [BaseTargetMeanEstimator(), TargetMeanClassifier(), TargetMeanRegressor()]
15 |
16 | FAILED_CHECKS = _return_tags()["_xfail_checks"]
17 |
18 | EXPECTED_FAILED_CHECKS = {
19 | "BaseTargetMeanEstimator": FAILED_CHECKS,
20 | "TargetMeanClassifier": FAILED_CHECKS,
21 | "TargetMeanRegressor": FAILED_CHECKS,
22 | }
23 |
24 |
25 | @parametrize_with_checks(
26 | estimators=_estimators,
27 | expected_failed_checks=lambda est: EXPECTED_FAILED_CHECKS.get(
28 | est.__class__.__name__, {}
29 | ),
30 | )
31 | def test_sklearn_compatible_creator(estimator, check):
32 | check(estimator)
33 |
--------------------------------------------------------------------------------
/tests/test_base_transformers/test_base_numerical_transformer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from numpy import inf
3 | from pandas.testing import assert_frame_equal
4 |
5 | from feature_engine._base_transformers.base_numerical import BaseNumericalTransformer
6 | from tests.estimator_checks.non_fitted_error_checks import check_raises_non_fitted_error
7 |
8 |
9 | class MockClass(BaseNumericalTransformer):
10 | def __init__(self):
11 | self.variables = None
12 |
13 | def transform(self, X):
14 | return self._check_transform_input_and_state(X)
15 |
16 |
17 | def test_fit_method(df_vartypes, df_na):
18 | transformer = MockClass()
19 | res = transformer.fit(df_vartypes)
20 | assert transformer.feature_names_in_ == list(df_vartypes.columns)
21 | assert transformer.n_features_in_ == len(df_vartypes.columns)
22 | assert_frame_equal(res, df_vartypes)
23 |
24 | with pytest.raises(ValueError):
25 | transformer.fit(df_na)
26 |
27 | df_na = df_na.fillna(inf)
28 | with pytest.raises(ValueError):
29 | assert transformer.fit(df_na)
30 |
31 |
32 | def test_transform_method(df_vartypes, df_na):
33 | transformer = MockClass()
34 | transformer.fit(df_vartypes)
35 | assert_frame_equal(
36 | transformer._check_transform_input_and_state(df_vartypes), df_vartypes
37 | )
38 | assert_frame_equal(
39 | transformer._check_transform_input_and_state(
40 | df_vartypes[["City", "Age", "Name", "Marks", "dob"]]
41 | ),
42 | df_vartypes,
43 | )
44 |
45 | with pytest.raises(ValueError):
46 | transformer.fit(df_na)
47 |
48 | df_na = df_na.fillna(inf)
49 | with pytest.raises(ValueError):
50 | assert transformer.fit(df_na)
51 |
52 | with pytest.raises(ValueError):
53 | assert transformer._check_transform_input_and_state(
54 | df_vartypes[["Age", "Marks"]]
55 | )
56 |
57 |
58 | def test_raises_non_fitted_error():
59 | check_raises_non_fitted_error(MockClass())
60 |
--------------------------------------------------------------------------------
/tests/test_base_transformers/test_transform_xy_mixin.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from feature_engine._base_transformers.mixins import TransformXyMixin
5 |
6 |
7 | class MockTransformer(TransformXyMixin):
8 | def transform(self, X):
9 | return X.iloc[1:-1].copy()
10 |
11 |
12 | def test_transform_x_y_method(df_vartypes):
13 | # single target
14 | y = pd.Series(0, index=np.arange(len(df_vartypes)))
15 | transformer = MockTransformer()
16 | Xt, yt = transformer.transform_x_y(df_vartypes, y)
17 |
18 | assert len(Xt) == len(yt)
19 | assert len(Xt) != len(df_vartypes)
20 | assert len(yt) != len(y)
21 | assert (Xt.index == yt.index).all()
22 | assert (Xt.index == [1, 2]).all()
23 |
24 | # multioutput target
25 | y = (
26 | pd.DataFrame(columns=["vara", "varb"], index=df_vartypes.index)
27 | .astype(float)
28 | .fillna(0)
29 | )
30 | Xt, yt = transformer.transform_x_y(df_vartypes, y)
31 |
32 | assert len(Xt) == len(yt)
33 | assert len(Xt) != len(df_vartypes)
34 | assert len(yt) != len(y)
35 | assert (Xt.index == yt.index).all()
36 | assert (Xt.index == [1, 2]).all()
37 |
--------------------------------------------------------------------------------
/tests/test_check_init_parameters/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_check_init_parameters/__init__.py
--------------------------------------------------------------------------------
/tests/test_check_init_parameters/test_check_init_input_params.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine._check_init_parameters.check_init_input_params import (
4 | _check_param_drop_original,
5 | _check_param_missing_values,
6 | )
7 |
8 |
9 | @pytest.mark.parametrize("missing_vals", [None, ["Hola"], True, "Hola"])
10 | def test_check_param_missing_values(missing_vals):
11 | with pytest.raises(ValueError):
12 | _check_param_missing_values(missing_vals)
13 |
14 |
15 | @pytest.mark.parametrize("drop_orig", [None, ["Hola"], 10, "Hola"])
16 | def test_check_param_drop_original(drop_orig):
17 | with pytest.raises(ValueError):
18 | _check_param_drop_original(drop_orig)
19 |
--------------------------------------------------------------------------------
/tests/test_check_init_parameters/test_check_input_dictionary.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine._check_init_parameters.check_input_dictionary import (
4 | _check_numerical_dict,
5 | )
6 |
7 |
8 | @pytest.mark.parametrize("input_dict", [{"a": 1, "b": "c"}, {1: 1, 2: "c"}])
9 | def test_raises_error_when_item_in_dict_not_numerical(input_dict):
10 | with pytest.raises(ValueError):
11 | _check_numerical_dict(input_dict)
12 |
13 |
14 | @pytest.mark.parametrize("input_dict", [[1, 2, 3], (1, 2, 3), "hola", 5])
15 | def test_raises_error_when_input_not_dictionary_or_none(input_dict):
16 | with pytest.raises(TypeError):
17 | _check_numerical_dict(input_dict)
18 |
--------------------------------------------------------------------------------
/tests/test_check_init_parameters/test_check_variables.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine._check_init_parameters.check_variables import (
4 | _check_variables_input_value,
5 | )
6 |
7 |
8 | @pytest.mark.parametrize("_input_vars", [("var1", "var2"), {"var1": 1, "var2": 2}])
9 | def test_raises_errors_when_not_list_str_or_int(_input_vars):
10 | with pytest.raises(ValueError) as record:
11 | assert _check_variables_input_value(_input_vars)
12 | msg = (
13 | "`variables` should contain a string, an integer or a list of strings or "
14 | f"integers. Got {_input_vars} instead."
15 | )
16 | assert str(record.value) == msg
17 |
18 |
19 | @pytest.mark.parametrize(
20 | "_input_vars", [["var1", "var2", "var2", "var3"], [0, 1, 1, 2]]
21 | )
22 | def test_raises_error_when_duplicated_var_names(_input_vars):
23 | with pytest.raises(ValueError) as record:
24 | assert _check_variables_input_value(_input_vars)
25 | msg = "The list entered in `variables` contains duplicated variable names."
26 | assert str(record.value) == msg
27 |
28 |
29 | def test_raises_error_when_empty_list():
30 | with pytest.raises(ValueError) as record:
31 | assert _check_variables_input_value([])
32 | msg = "The list of `variables` is empty."
33 | assert str(record.value) == msg
34 |
35 |
36 | @pytest.mark.parametrize(
37 | "_input_vars",
38 | [["var1", "var2", "var3"], [0, 1, 2, 3], "var1", ["var1"], 0, [0]],
39 | )
40 | def test_return_variables(_input_vars):
41 | assert _check_variables_input_value(_input_vars) == _input_vars
42 |
43 |
44 | def test_return_when_variables_is_none():
45 | assert _check_variables_input_value(None) is None
46 |
--------------------------------------------------------------------------------
/tests/test_creation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_creation/__init__.py
--------------------------------------------------------------------------------
/tests/test_creation/test_check_estimator_creation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | import sklearn
4 | from sklearn.pipeline import Pipeline
5 | from sklearn.utils.estimator_checks import check_estimator
6 | from sklearn.utils.fixes import parse_version
7 |
8 | from feature_engine.creation import (
9 | CyclicalFeatures,
10 | DecisionTreeFeatures,
11 | MathFeatures,
12 | RelativeFeatures,
13 | )
14 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator
15 |
16 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)
17 |
18 | _estimators = [
19 | MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"),
20 | RelativeFeatures(
21 | variables=["x0", "x1"], reference=["x0"], func=["add"], missing_values="ignore"
22 | ),
23 | CyclicalFeatures(),
24 | DecisionTreeFeatures(regression=False),
25 | ]
26 |
27 | if sklearn_version > parse_version("1.6"):
28 |
29 | @pytest.mark.parametrize("estimator", _estimators)
30 | def test_check_estimator_from_sklearn(estimator):
31 | return check_estimator(
32 | estimator=estimator,
33 | expected_failed_checks=estimator._more_tags()["_xfail_checks"],
34 | )
35 |
36 | else:
37 |
38 | @pytest.mark.parametrize("estimator", _estimators)
39 | def test_check_estimator_from_sklearn(estimator):
40 | return check_estimator(estimator)
41 |
42 |
43 | _estimators = [
44 | MathFeatures(variables=["var_1", "var_2", "var_3"], func="mean"),
45 | RelativeFeatures(variables=["var_1", "var_2"], reference=["var_3"], func=["add"]),
46 | CyclicalFeatures(),
47 | ]
48 |
49 |
50 | @pytest.mark.parametrize("estimator", _estimators)
51 | def test_check_estimator_from_feature_engine(estimator):
52 | return check_feature_engine_estimator(estimator)
53 |
54 |
55 | _estimators = [
56 | CyclicalFeatures(),
57 | MathFeatures(variables=["feature_1", "feature_2"], func=["sum", "mean"]),
58 | RelativeFeatures(variables=["feature_1"], reference=["feature_2"], func=["div"]),
59 | ]
60 |
61 |
62 | @pytest.mark.parametrize("transformer", _estimators)
63 | def test_transformers_in_pipeline_with_set_output_pandas(transformer):
64 | X = pd.DataFrame({"feature_1": [1, 2, 3, 4, 5], "feature_2": [6, 7, 8, 9, 10]})
65 | y = pd.Series([0, 1, 0, 1, 0])
66 |
67 | pipe = Pipeline([("trs", transformer)]).set_output(transform="pandas")
68 |
69 | Xtt = transformer.fit_transform(X)
70 | Xtp = pipe.fit_transform(X, y)
71 |
72 | pd.testing.assert_frame_equal(Xtt, Xtp)
73 |
--------------------------------------------------------------------------------
/tests/test_datasets/__init__().py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_datasets/__init__().py
--------------------------------------------------------------------------------
/tests/test_datetime/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_datetime/__init__.py
--------------------------------------------------------------------------------
/tests/test_datetime/test_check_estimator_datetime.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from sklearn.pipeline import Pipeline
4 |
5 | from feature_engine.datetime import DatetimeFeatures, DatetimeSubtraction
6 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator
7 |
8 | _estimators = [DatetimeFeatures()]
9 |
10 |
11 | @pytest.mark.parametrize("estimator", _estimators)
12 | def test_check_estimator_from_feature_engine(estimator):
13 | return check_feature_engine_estimator(estimator)
14 |
15 |
16 | transformers = [
17 | DatetimeFeatures(),
18 | DatetimeSubtraction(variables="feature_1", reference="feature_2"),
19 | ]
20 |
21 |
22 | @pytest.mark.parametrize("transformer", transformers)
23 | def test_datetime_transformers(transformer):
24 | X = pd.DataFrame(
25 | {
26 | "feature_1": [
27 | "2014-05-05",
28 | "2014-05-05",
29 | "2014-05-05",
30 | "2014-05-05",
31 | "2014-05-05",
32 | ],
33 | "feature_2": [
34 | "2014-05-05",
35 | "2014-05-05",
36 | "2014-05-05",
37 | "2014-05-05",
38 | "2014-05-05",
39 | ],
40 | },
41 | )
42 | y = pd.Series([0, 1, 0, 1, 0])
43 |
44 | pipe = Pipeline(
45 | [
46 | ("trs", transformer),
47 | ]
48 | ).set_output(transform="pandas")
49 |
50 | Xtt = transformer.fit_transform(X)
51 | Xtp = pipe.fit_transform(X, y)
52 |
53 | pd.testing.assert_frame_equal(Xtt, Xtp)
54 |
--------------------------------------------------------------------------------
/tests/test_discretisation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_discretisation/__init__.py
--------------------------------------------------------------------------------
/tests/test_discretisation/test_check_estimator_discretisers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | import sklearn
5 | from sklearn.pipeline import Pipeline
6 | from sklearn.utils.estimator_checks import check_estimator
7 | from sklearn.utils.fixes import parse_version
8 |
9 | from feature_engine.discretisation import (
10 | ArbitraryDiscretiser,
11 | DecisionTreeDiscretiser,
12 | EqualFrequencyDiscretiser,
13 | EqualWidthDiscretiser,
14 | GeometricWidthDiscretiser,
15 | )
16 | from tests.estimator_checks.estimator_checks import check_feature_engine_estimator
17 |
18 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)
19 |
20 |
21 | _estimators = [
22 | DecisionTreeDiscretiser(regression=False),
23 | EqualFrequencyDiscretiser(),
24 | EqualWidthDiscretiser(),
25 | ArbitraryDiscretiser(binning_dict={"x0": [-np.inf, 0, np.inf]}),
26 | GeometricWidthDiscretiser(),
27 | ]
28 |
29 | if sklearn_version < parse_version("1.6"):
30 |
31 | @pytest.mark.parametrize("estimator", _estimators)
32 | def test_check_estimator_from_sklearn(estimator):
33 | return check_estimator(estimator)
34 |
35 | else:
36 |
37 | @pytest.mark.parametrize("estimator", _estimators)
38 | def test_check_estimator_from_sklearn(estimator):
39 | return check_estimator(
40 | estimator=estimator,
41 | expected_failed_checks=estimator._more_tags()["_xfail_checks"],
42 | )
43 |
44 |
45 | @pytest.mark.parametrize("estimator", _estimators)
46 | def test_check_estimator_from_feature_engine(estimator):
47 | if estimator.__class__.__name__ == "ArbitraryDiscretiser":
48 | estimator.set_params(binning_dict={"var_1": [-np.inf, 0, np.inf]})
49 | return check_feature_engine_estimator(estimator)
50 |
51 |
52 | @pytest.mark.parametrize("transformer", _estimators)
53 | def test_transformers_within_pipeline(transformer):
54 | if transformer.__class__.__name__ == "ArbitraryDiscretiser":
55 | transformer.set_params(binning_dict={"feature_1": [-np.inf, 0, np.inf]})
56 |
57 | X = pd.DataFrame({"feature_1": [1, 2, 3, 4, 5], "feature_2": [6, 7, 8, 9, 10]})
58 | y = pd.Series([0, 1, 0, 1, 0])
59 |
60 | pipe = Pipeline([("trs", transformer)]).set_output(transform="pandas")
61 |
62 | Xtt = transformer.fit_transform(X, y)
63 | Xtp = pipe.fit_transform(X, y)
64 |
65 | pd.testing.assert_frame_equal(Xtt, Xtp)
66 |
--------------------------------------------------------------------------------
/tests/test_encoding/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/__init__.py
--------------------------------------------------------------------------------
/tests/test_encoding/test_base_encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/test_base_encoders/__init__.py
--------------------------------------------------------------------------------
/tests/test_encoding/test_base_encoders/test_categorical_init_mixin.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine.encoding.base_encoder import CategoricalInitMixin
4 |
5 |
6 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)])
7 | def test_raises_error_when_ignore_format_not_permitted(param):
8 | with pytest.raises(ValueError) as record:
9 | CategoricalInitMixin(ignore_format=param)
10 | msg = f"ignore_format takes only booleans True and False. Got {param} instead."
11 | assert str(record.value) == msg
12 |
13 |
14 | @pytest.mark.parametrize("param", [True, False])
15 | def test_ignore_format_value_assignment(param):
16 | enc = CategoricalInitMixin(ignore_format=param)
17 | assert enc.ignore_format == param
18 |
--------------------------------------------------------------------------------
/tests/test_encoding/test_base_encoders/test_categorical_init_mixin_na.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine.encoding.base_encoder import CategoricalInitMixinNA
4 |
5 |
6 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)])
7 | def test_raises_error_when_ignore_format_not_permitted(param):
8 | with pytest.raises(ValueError) as record:
9 | CategoricalInitMixinNA(ignore_format=param)
10 | msg = f"ignore_format takes only booleans True and False. Got {param} instead."
11 | assert str(record.value) == msg
12 |
13 |
14 | @pytest.mark.parametrize("param", [1, "hola", [1, 2, 0], (True, False)])
15 | def test_raises_error_when_missing_values_not_permitted(param):
16 | with pytest.raises(ValueError) as record:
17 | CategoricalInitMixinNA(missing_values=param)
18 | msg = f"missing_values takes only values 'raise' or 'ignore'. Got {param} instead."
19 | assert str(record.value) == msg
20 |
21 |
22 | @pytest.mark.parametrize("param", [(True, "ignore"), (False, "raise")])
23 | def test_correct_param_value_assignment(param):
24 | format_, na_ = param
25 | enc = CategoricalInitMixinNA(ignore_format=format_, missing_values=na_)
26 | assert enc.ignore_format == format_
27 | assert enc.missing_values == na_
28 |
--------------------------------------------------------------------------------
/tests/test_encoding/test_helper_functions.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from feature_engine.encoding._helper_functions import check_parameter_unseen
4 |
5 |
6 | @pytest.mark.parametrize("accepted", ["one", False, [1, 2], ("one", "two"), 1])
7 | def test_raises_error_when_accepted_values_not_permitted(accepted):
8 | with pytest.raises(ValueError) as record:
9 | check_parameter_unseen("zero", accepted)
10 | msg = "accepted_values should be a list of strings. " f" Got {accepted} instead."
11 | assert str(record.value) == msg
12 |
13 |
14 | @pytest.mark.parametrize("accepted", [["one", "two"], ["three", "four"]])
15 | def test_raises_error_when_error_not_in_accepted_values(accepted):
16 | with pytest.raises(ValueError) as record:
17 | check_parameter_unseen("zero", accepted)
18 | msg = (
19 | f"Parameter `unseen` takes only values {', '.join(accepted)}."
20 | " Got zero instead."
21 | )
22 | assert str(record.value) == msg
23 |
--------------------------------------------------------------------------------
/tests/test_encoding/test_woe/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_encoding/test_woe/__init__.py
--------------------------------------------------------------------------------
/tests/test_encoding/test_woe/test_woe_class.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 |
5 | from feature_engine.encoding.woe import WoE
6 |
7 |
8 | def test_woe_calculation(df_enc):
9 | pos_exp = pd.Series({"A": 0.333333, "B": 0.333333, "C": 0.333333})
10 | neg_exp = pd.Series({"A": 0.285714, "B": 0.571429, "C": 0.142857})
11 |
12 | woe_class = WoE()
13 | pos, neg, woe = woe_class._calculate_woe(df_enc, df_enc["target"], "var_A")
14 |
15 | pd.testing.assert_series_equal(pos, pos_exp, check_names=False)
16 | pd.testing.assert_series_equal(neg, neg_exp, check_names=False)
17 | pd.testing.assert_series_equal(np.log(pos_exp / neg_exp), woe, check_names=False)
18 |
19 |
20 | def test_woe_error():
21 | df = {
22 | "var_A": ["B"] * 9 + ["A"] * 6 + ["C"] * 3 + ["D"] * 2,
23 | "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
24 | "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
25 | }
26 | df = pd.DataFrame(df)
27 | woe_class = WoE()
28 |
29 | with pytest.raises(ValueError):
30 | woe_class._calculate_woe(df, df["target"], "var_A")
31 |
32 |
33 | @pytest.mark.parametrize("fill_value", [1, 10, 0.1])
34 | def test_fill_value(fill_value):
35 | df = {
36 | "var_A": ["A"] * 9 + ["B"] * 6 + ["C"] * 3 + ["D"] * 2,
37 | "var_B": ["A"] * 10 + ["B"] * 6 + ["C"] * 4,
38 | "target": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
39 | }
40 | df = pd.DataFrame(df)
41 |
42 | pos_exp = pd.Series(
43 | {
44 | "A": 0.2857142857142857,
45 | "B": 0.2857142857142857,
46 | "C": 0.42857142857142855,
47 | "D": fill_value,
48 | }
49 | )
50 | neg_exp = pd.Series(
51 | {
52 | "A": 0.5384615384615384,
53 | "B": 0.3076923076923077,
54 | "C": fill_value,
55 | "D": 0.15384615384615385,
56 | }
57 | )
58 |
59 | woe_class = WoE()
60 | pos, neg, woe = woe_class._calculate_woe(
61 | df, df["target"], "var_A", fill_value=fill_value
62 | )
63 |
64 | pd.testing.assert_series_equal(pos, pos_exp, check_names=False)
65 | pd.testing.assert_series_equal(neg, neg_exp, check_names=False)
66 | pd.testing.assert_series_equal(np.log(pos_exp / neg_exp), woe, check_names=False)
67 |
--------------------------------------------------------------------------------
/tests/test_imputation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_imputation/__init__.py
--------------------------------------------------------------------------------
/tests/test_imputation/test_mean_mdian_imputer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from feature_engine.imputation import MeanMedianImputer
5 |
6 |
7 | def test_mean_imputation_and_automatically_select_variables(df_na):
8 | # set up transformer
9 | imputer = MeanMedianImputer(imputation_method="mean", variables=None)
10 | X_transformed = imputer.fit_transform(df_na)
11 |
12 | # set up reference result
13 | X_reference = df_na.copy()
14 | X_reference["Age"] = X_reference["Age"].fillna(28.714285714285715)
15 | X_reference["Marks"] = X_reference["Marks"].fillna(0.6833333333333332)
16 |
17 | # test init params
18 | assert imputer.imputation_method == "mean"
19 | assert imputer.variables is None
20 |
21 | # test fit attributes
22 | assert imputer.variables_ == ["Age", "Marks"]
23 | imputer.imputer_dict_ = {
24 | key: round(value, 3) for (key, value) in imputer.imputer_dict_.items()
25 | }
26 | assert imputer.imputer_dict_ == {
27 | "Age": 28.714,
28 | "Marks": 0.683,
29 | }
30 | assert imputer.n_features_in_ == 6
31 |
32 | # test transform output:
33 | # selected variables should have no NA
34 | # not selected variables should still have NA
35 | assert X_transformed[["Age", "Marks"]].isnull().sum().sum() == 0
36 | assert X_transformed[["Name", "City"]].isnull().sum().sum() > 0
37 | pd.testing.assert_frame_equal(X_transformed, X_reference)
38 |
39 |
40 | def test_median_imputation_when_user_enters_single_variables(df_na):
41 |     # set up transformer
42 | imputer = MeanMedianImputer(imputation_method="median", variables=["Age"])
43 | X_transformed = imputer.fit_transform(df_na)
44 |
45 | # set up reference output
46 | X_reference = df_na.copy()
47 | X_reference["Age"] = X_reference["Age"].fillna(23.0)
48 |
49 | # test init params
50 | assert imputer.imputation_method == "median"
51 | assert imputer.variables == ["Age"]
52 |
53 | # test fit attributes
54 | assert imputer.n_features_in_ == 6
55 | assert imputer.imputer_dict_ == {"Age": 23.0}
56 |
57 | # test transform output
58 | assert X_transformed["Age"].isnull().sum() == 0
59 | pd.testing.assert_frame_equal(X_transformed, X_reference)
60 |
61 |
62 | def test_error_with_wrong_imputation_method():
63 | with pytest.raises(ValueError):
64 | MeanMedianImputer(imputation_method="arbitrary")
65 |
--------------------------------------------------------------------------------
/tests/test_outliers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_outliers/__init__.py
--------------------------------------------------------------------------------
/tests/test_prediction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_prediction/__init__.py
--------------------------------------------------------------------------------
/tests/test_prediction/conftest.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 |
5 | @pytest.fixture(scope="module")
6 | def df_classification():
7 | df = {
8 | "cat_var_A": ["A"] * 5 + ["B"] * 5 + ["C"] * 5 + ["D"] * 5,
9 | "cat_var_B": ["A"] * 6
10 | + ["B"] * 2
11 | + ["C"] * 2
12 | + ["B"] * 2
13 | + ["C"] * 2
14 | + ["D"] * 6,
15 | "num_var_A": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4],
16 | "num_var_B": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4],
17 | }
18 |
19 | df = pd.DataFrame(df)
20 | y = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
21 | return df, y
22 |
23 |
24 | @pytest.fixture(scope="module")
25 | def df_regression():
26 | df = {
27 | "cat_var_A": ["A"] * 5 + ["B"] * 5 + ["C"] * 5 + ["D"] * 5,
28 | "cat_var_B": ["A"] * 6
29 | + ["B"] * 2
30 | + ["C"] * 2
31 | + ["B"] * 2
32 | + ["C"] * 2
33 | + ["D"] * 6,
34 | "num_var_A": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4],
35 | "num_var_B": [1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4],
36 | }
37 |
38 | df = pd.DataFrame(df)
39 | y = pd.Series([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])
40 | return df, y
41 |
--------------------------------------------------------------------------------
/tests/test_preprocessing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_preprocessing/__init__.py
--------------------------------------------------------------------------------
/tests/test_scaling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_scaling/__init__.py
--------------------------------------------------------------------------------
/tests/test_selection/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_selection/__init__.py
--------------------------------------------------------------------------------
/tests/test_selection/test_base_selector.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from pandas.testing import assert_frame_equal
3 |
4 | from feature_engine.selection.base_selector import BaseSelector
5 |
6 |
7 | @pytest.mark.parametrize("val", [None, "hola", [True]])
8 | def test_confirm_variables_in_init(val):
9 | with pytest.raises(ValueError):
10 | BaseSelector(confirm_variables=val)
11 |
12 |
13 | class MockClass(BaseSelector):
14 | def __init__(self, variables=None, confirm_variables=False):
15 | self.variables = variables
16 | self.confirm_variables = confirm_variables
17 |
18 | def fit(self, X, y=None):
19 | self.features_to_drop_ = ["Name", "Marks"]
20 | self._get_feature_names_in(X)
21 | return self
22 |
23 |
24 | def test_transform_method(df_vartypes):
25 | transformer = MockClass()
26 | transformer.fit(df_vartypes)
27 | Xt = transformer.transform(df_vartypes)
28 |
29 | # tests output of transform
30 | assert_frame_equal(Xt, df_vartypes.drop(["Name", "Marks"], axis=1))
31 |
32 | # tests this line: X = X[self.feature_names_in_]
33 | assert_frame_equal(
34 | transformer.transform(df_vartypes[["City", "Age", "Name", "Marks", "dob"]]),
35 | Xt,
36 | )
37 | # test error when there is a df shape mismatch
38 | with pytest.raises(ValueError):
39 | assert transformer.transform(df_vartypes[["Age", "Marks"]])
40 |
41 |
42 | def test_get_feature_names_in(df_vartypes):
43 | tr = MockClass()
44 | tr._get_feature_names_in(df_vartypes)
45 | assert tr.n_features_in_ == df_vartypes.shape[1]
46 | assert tr.feature_names_in_ == list(df_vartypes.columns)
47 |
48 |
49 | def test_get_support(df_vartypes):
50 | tr = MockClass()
51 | tr.fit(df_vartypes)
52 | v_bool = [False, True, True, False, True]
53 | v_ind = [1, 2, 4]
54 | assert tr.get_support() == v_bool
55 | assert list(tr.get_support(indices=True)) == v_ind
56 |
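
A sketch of the same get_support() contract on a public selector, assuming DropFeatures shares the BaseSelector implementation exercised above; the toy frame is illustrative:

    import pandas as pd

    from feature_engine.selection import DropFeatures

    X = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})

    # True marks columns that are retained after the selector drops its features.
    selector = DropFeatures(features_to_drop=["b"]).fit(X)
    print(selector.get_support())                    # [True, False, True]
    print(list(selector.get_support(indices=True)))  # [0, 2]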
--------------------------------------------------------------------------------
/tests/test_sklearn_compatible/test_set_output.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.datasets import load_iris
4 | from sklearn.linear_model import LogisticRegression
5 | from sklearn.pipeline import make_pipeline
6 | from sklearn.preprocessing import StandardScaler
7 |
8 | from feature_engine.transformation import YeoJohnsonTransformer
9 |
10 |
11 | def test_pipeline_with_set_output_sklearn_last():
12 |
13 | X, y = load_iris(return_X_y=True, as_frame=True)
14 |
15 | pipeline = make_pipeline(
16 | YeoJohnsonTransformer(), StandardScaler(), LogisticRegression()
17 | ).set_output(transform="default")
18 |
19 | pipeline.fit(X, y)
20 |
21 | X_t = pipeline[:-1].transform(X)
22 | assert isinstance(X_t, np.ndarray)
23 |
24 | pipeline.set_output(transform="pandas")
25 | X_t = pipeline[:-1].transform(X)
26 |
27 | assert isinstance(X_t, pd.DataFrame)
28 |
29 |
30 | def test_pipeline_with_set_output_featureengine_last():
31 |
32 | X, y = load_iris(return_X_y=True, as_frame=True)
33 |
34 | pipeline = make_pipeline(
35 | StandardScaler(), YeoJohnsonTransformer(), LogisticRegression()
36 | ).set_output(transform="default")
37 |
38 | pipeline.fit(X, y)
39 |
40 | X_t = pipeline[:-1].transform(X)
41 | pipeline.fit(X, y)
42 | assert isinstance(X_t, pd.DataFrame)
43 |
44 | pipeline.set_output(transform="pandas")
45 | pipeline.fit(X, y)
46 |
47 | X_t = pipeline[:-1].transform(X)
48 |
49 | assert isinstance(X_t, pd.DataFrame)
50 |
51 |
52 | def test_individual_transformer():
53 |
54 | X, y = load_iris(return_X_y=True, as_frame=True)
55 |
56 | transformer = YeoJohnsonTransformer()
57 | transformer.set_output(transform="default")
58 | transformer.fit(X)
59 |
60 | X_t = transformer.transform(X)
61 | assert isinstance(X_t, pd.DataFrame)
62 |
63 | transformer.set_output(transform="pandas")
64 | X_t = transformer.transform(X)
65 |
66 | assert isinstance(X_t, pd.DataFrame)
67 |
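
The tests above pin down how feature-engine transformers interact with scikit-learn's set_output API; a minimal sketch of the behaviour being asserted, using the same iris data:

    from sklearn.datasets import load_iris
    from sklearn.preprocessing import StandardScaler

    from feature_engine.transformation import YeoJohnsonTransformer

    X, _ = load_iris(return_X_y=True, as_frame=True)

    # scikit-learn transformers fall back to numpy arrays under the default output...
    scaler = StandardScaler().set_output(transform="default")
    print(type(scaler.fit_transform(X)))  # <class 'numpy.ndarray'>

    # ...while feature-engine transformers return DataFrames regardless of the setting.
    yjt = YeoJohnsonTransformer().set_output(transform="default")
    print(type(yjt.fit_transform(X)))     # <class 'pandas.core.frame.DataFrame'>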
--------------------------------------------------------------------------------
/tests/test_time_series/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_time_series/__init__.py
--------------------------------------------------------------------------------
/tests/test_time_series/test_forecasting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_time_series/test_forecasting/__init__.py
--------------------------------------------------------------------------------
/tests/test_time_series/test_forecasting/conftest.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 |
5 | @pytest.fixture(scope="module")
6 | def df_time():
7 | date_time = [
8 | "2020-05-15 12:00:00",
9 | "2020-05-15 12:15:00",
10 | "2020-05-15 12:30:00",
11 | "2020-05-15 12:45:00",
12 | "2020-05-15 13:00:00",
13 | "2020-05-15 13:15:00",
14 | "2020-05-15 13:30:00",
15 | "2020-05-15 13:45:00",
16 | "2020-05-15 14:00:00",
17 | "2020-05-15 14:15:00",
18 | "2020-05-15 14:30:00",
19 | "2020-05-15 14:45:00",
20 | "2020-05-15 15:00:00",
21 | "2020-05-15 15:15:00",
22 | "2020-05-15 15:30:00",
23 | ]
24 |
25 | data = {
26 | "ambient_temp": [
27 | 31.31,
28 | 31.51,
29 | 32.15,
30 | 32.39,
31 | 32.62,
32 | 32.5,
33 | 32.52,
34 | 32.68,
35 | 33.76,
36 | 34.13,
37 | 34.08,
38 | 33.7,
39 | 33.89,
40 | 34.04,
41 | 34.4,
42 | ],
43 | "module_temp": [
44 | 49.18,
45 | 49.84,
46 | 52.35,
47 | 50.63,
48 | 49.61,
49 | 47.01,
50 | 46.67,
51 | 47.52,
52 | 49.8,
53 | 55.03,
54 | 54.52,
55 | 47.62,
56 | 46.03,
57 | 44.29,
58 | 46.74,
59 | ],
60 | "irradiation": [
61 | 0.51,
62 | 0.79,
63 | 0.65,
64 | 0.76,
65 | 0.42,
66 | 0.49,
67 | 0.57,
68 | 0.56,
69 | 0.74,
70 | 0.89,
71 | 0.47,
72 | 0.54,
73 | 0.4,
74 | 0.45,
75 | 0.57,
76 | ],
77 | "color": ["blue"] * 10 + ["green"] * 5,
78 | }
79 |
80 | df = pd.DataFrame(data, index=date_time)
81 | df.index = pd.to_datetime(df.index)
82 | return df
83 |
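
This fixture feeds the forecasting transformers; a sketch of how a transformer such as LagFeatures would consume a frame like it (the periods argument and the lag-suffixed column name are assumptions based on the library's documentation, not on this conftest):

    import pandas as pd

    from feature_engine.timeseries.forecasting import LagFeatures

    # Small datetime-indexed frame in the spirit of the df_time fixture above.
    idx = pd.date_range("2020-05-15 12:00:00", periods=4, freq="15min")
    X = pd.DataFrame({"ambient_temp": [31.31, 31.51, 32.15, 32.39]}, index=idx)

    # Lag the series by one step; the new column name follows the documented
    # naming convention (an assumption here, not taken from this conftest).
    lagger = LagFeatures(variables=["ambient_temp"], periods=1)
    Xt = lagger.fit_transform(X)
    print(Xt.columns.tolist())  # expected: ['ambient_temp', 'ambient_temp_lag_1']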
--------------------------------------------------------------------------------
/tests/test_transformation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_transformation/__init__.py
--------------------------------------------------------------------------------
/tests/test_transformation/test_arcsin_transformer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from sklearn.exceptions import NotFittedError
4 |
5 | from feature_engine.transformation import ArcsinTransformer
6 |
7 |
8 | def test_transform_and_inverse_transform(df_vartypes):
9 | transformer = ArcsinTransformer(variables=["Marks"])
10 | X = transformer.fit_transform(df_vartypes)
11 |
12 | # expected output
13 | transf_df = df_vartypes.copy()
14 | transf_df["Marks"] = [1.24905, 1.10715, 0.99116, 0.88607]
15 |
16 | # test transform output
17 | pd.testing.assert_frame_equal(X, transf_df)
18 |
19 | # test inverse_transform
20 | Xit = transformer.inverse_transform(X)
21 |
22 | # convert numbers to original format.
23 | Xit["Marks"] = Xit["Marks"].round(1)
24 |
25 | # test
26 | pd.testing.assert_frame_equal(Xit, df_vartypes)
27 |
28 |
29 | def test_fit_raises_error_if_na_in_df(df_na):
30 | # test case 2: when dataset contains na, fit method
31 | transformer = ArcsinTransformer(variables=["Marks"])
32 | with pytest.raises(ValueError):
33 | transformer.fit(df_na)
34 |
35 |
36 | def test_transform_raises_error_if_na_in_df(df_vartypes, df_na):
37 | # test case 3: when dataset contains na, transform method
38 | transformer = ArcsinTransformer(variables=["Marks"])
39 | transformer.fit(df_vartypes)
40 | with pytest.raises(ValueError):
41 | transformer.transform(df_na[df_vartypes.columns])
42 |
43 |
44 | def test_error_if_df_contains_outside_range_values(df_vartypes):
45 | # test error when data contains value outside range [0, +1]
46 | df_out_range = df_vartypes.copy()
47 | df_out_range.loc[1, "Marks"] = 2
48 |
49 | transformer = ArcsinTransformer(variables=["Marks"])
50 | # test case 4: when variable contains value outside range, fit
51 | with pytest.raises(ValueError):
52 | transformer.fit(df_out_range)
53 |
54 | # test case 5: when variable contains value outside range, transform
55 | transformer.fit(df_vartypes)
56 | with pytest.raises(ValueError):
57 | transformer.transform(df_out_range)
58 |
59 | # when selecting variables automatically and some are outside range
60 | transformer = ArcsinTransformer()
61 | with pytest.raises(ValueError):
62 | transformer.fit(df_vartypes)
63 |
64 |
65 | def test_non_fitted_error(df_vartypes):
66 | transformer = ArcsinTransformer(variables="Marks")
67 | with pytest.raises(NotFittedError):
68 | transformer.transform(df_vartypes)
69 |
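
The hard-coded expectations above correspond to the arcsine square-root transformation; a quick numpy check of that relationship:

    import numpy as np

    # ArcsinTransformer applies arcsin(sqrt(x)), defined only for x in [0, 1],
    # which is why the out-of-range tests above expect a ValueError.
    marks = np.array([0.9, 0.8, 0.7, 0.6])
    print(np.round(np.arcsin(np.sqrt(marks)), 5))
    # -> [1.24905 1.10715 0.99116 0.88607], the expected "Marks" output in the test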
--------------------------------------------------------------------------------
/tests/test_transformation/test_boxcox_transformer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from sklearn.exceptions import NotFittedError
4 |
5 | from feature_engine.transformation import BoxCoxTransformer
6 |
7 |
8 | def test_automatically_finds_variables(df_vartypes):
9 | # test case 1: automatically select variables
10 | transformer = BoxCoxTransformer(variables=None)
11 | X = transformer.fit_transform(df_vartypes)
12 |
13 | # expected output
14 | transf_df = df_vartypes.copy()
15 | transf_df["Age"] = [9.78731, 10.1666, 9.40189, 9.0099]
16 | transf_df["Marks"] = [-0.101687, -0.207092, -0.316843, -0.431788]
17 |
18 | # test init params
19 | assert transformer.variables is None
20 | # test fit attr
21 | assert transformer.variables_ == ["Age", "Marks"]
22 | assert transformer.n_features_in_ == 5
23 | # test transform output
24 | pd.testing.assert_frame_equal(X, transf_df)
25 |
26 | # test inverse_transform
27 | Xit = transformer.inverse_transform(X)
28 |
29 | # convert numbers to original format.
30 | Xit["Age"] = Xit["Age"].round().astype("int64")
31 | Xit["Marks"] = Xit["Marks"].round(1)
32 |
33 | # test
34 | pd.testing.assert_frame_equal(Xit, df_vartypes)
35 |
36 |
37 | def test_fit_raises_error_if_df_contains_na(df_na):
38 | # test case 2: when dataset contains na, fit method
39 | transformer = BoxCoxTransformer()
40 | with pytest.raises(ValueError):
41 | transformer.fit(df_na)
42 |
43 |
44 | def test_transform_raises_error_if_df_contains_na(df_vartypes, df_na):
45 | # test case 3: when dataset contains na, transform method
46 | transformer = BoxCoxTransformer()
47 | transformer.fit(df_vartypes)
48 | with pytest.raises(ValueError):
49 | transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
50 |
51 |
52 | def test_error_if_df_contains_negative_values(df_vartypes):
53 | # test error when data contains negative values
54 | df_neg = df_vartypes.copy()
55 | df_neg.loc[1, "Age"] = -1
56 |
57 | # test case 4: when variable contains negative value, fit
58 | transformer = BoxCoxTransformer()
59 | with pytest.raises(ValueError):
60 | transformer.fit(df_neg)
61 |
62 | # test case 5: when variable contains negative value, transform
63 | transformer = BoxCoxTransformer()
64 | transformer.fit(df_vartypes)
65 | with pytest.raises(ValueError):
66 | transformer.transform(df_neg)
67 |
68 |
69 | def test_non_fitted_error(df_vartypes):
70 | transformer = BoxCoxTransformer()
71 | with pytest.raises(NotFittedError):
72 | transformer.transform(df_vartypes)
73 |
--------------------------------------------------------------------------------
/tests/test_transformation/test_reciprocal_transformer.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from sklearn.exceptions import NotFittedError
4 |
5 | from feature_engine.transformation import ReciprocalTransformer
6 |
7 |
8 | def test_automatically_find_variables(df_vartypes):
9 | # test case 1: automatically select variables
10 | transformer = ReciprocalTransformer(variables=None)
11 | X = transformer.fit_transform(df_vartypes)
12 |
13 | # expected output
14 | transf_df = df_vartypes.copy()
15 | transf_df["Age"] = [0.05, 0.047619, 0.0526316, 0.0555556]
16 | transf_df["Marks"] = [1.11111, 1.25, 1.42857, 1.66667]
17 |
18 | # test init params
19 | assert transformer.variables is None
20 | # test fit attr
21 | assert transformer.variables_ == ["Age", "Marks"]
22 | assert transformer.n_features_in_ == 5
23 | # test transform output
24 | pd.testing.assert_frame_equal(X, transf_df)
25 |
26 | # test inverse_transform
27 | Xit = transformer.inverse_transform(X)
28 |
29 | # convert numbers to original format.
30 | Xit["Age"] = Xit["Age"].round().astype("int64")
31 | Xit["Marks"] = Xit["Marks"].round(1)
32 |
33 | # test
34 | pd.testing.assert_frame_equal(Xit, df_vartypes)
35 |
36 |
37 | def test_fit_raises_error_if_na_in_df(df_na):
38 | # test case 2: when dataset contains na, fit method
39 | with pytest.raises(ValueError):
40 | transformer = ReciprocalTransformer()
41 | transformer.fit(df_na)
42 |
43 |
44 | def test_transform_raises_error_if_na_in_df(df_vartypes, df_na):
45 | # test case 3: when dataset contains na, transform method
46 | with pytest.raises(ValueError):
47 | transformer = ReciprocalTransformer()
48 | transformer.fit(df_vartypes)
49 | transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
50 |
51 |
52 | def test_error_if_df_contains_0_as_value(df_vartypes):
53 | # test error when data contains value zero
54 | df_neg = df_vartypes.copy()
55 | df_neg.loc[1, "Age"] = 0
56 |
57 | # test case 4: when variable contains zero, fit
58 | with pytest.raises(ValueError):
59 | transformer = ReciprocalTransformer()
60 | transformer.fit(df_neg)
61 |
62 | # test case 5: when variable contains zero, transform
63 | with pytest.raises(ValueError):
64 | transformer = ReciprocalTransformer()
65 | transformer.fit(df_vartypes)
66 | transformer.transform(df_neg)
67 |
68 |
69 | def test_non_fitted_error(df_vartypes):
70 | with pytest.raises(NotFittedError):
71 | transformer = ReciprocalTransformer()
72 | transformer.transform(df_vartypes)
73 |
--------------------------------------------------------------------------------
/tests/test_variable_handling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_variable_handling/__init__.py
--------------------------------------------------------------------------------
/tests/test_variable_handling/conftest.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 |
5 | @pytest.fixture
6 | def df():
7 | df = pd.DataFrame(
8 | {
9 | "Name": ["tom", "nick", "krish", "jack"],
10 | "City": ["London", "Manchester", "Liverpool", "Bristol"],
11 | "Age": [20, 21, 19, 18],
12 | "Marks": [0.9, 0.8, 0.7, 0.6],
13 | "date_range": pd.date_range("2020-02-24", periods=4, freq="min"),
14 | "date_obj0": ["2020-02-24", "2020-02-25", "2020-02-26", "2020-02-27"],
15 | "date_range_tz": pd.date_range(
16 | "2020-02-24", periods=4, freq="min"
17 | ).tz_localize("UTC"),
18 | }
19 | )
20 | df["Name"] = df["Name"].astype("category")
21 | return df
22 |
23 |
24 | @pytest.fixture
25 | def df_int(df):
26 | df = df.copy()
27 | df.columns = range(1, len(df.columns) + 1)
28 | return df
29 |
30 |
31 | @pytest.fixture
32 | def df_datetime(df):
33 | df = df.copy()
34 |
35 | df["date_obj1"] = ["01-Jan-2010", "24-Feb-1945", "14-Jun-2100", "17-May-1999"]
36 | df["date_obj2"] = ["10/11/12", "12/31/09", "06/30/95", "03/17/04"]
37 | df["time_obj"] = ["21:45:23", "09:15:33", "12:34:59", "03:27:02"]
38 |
39 | df["time_objTZ"] = df["time_obj"].add(["+5", "+11", "-3", "-8"])
40 | df["date_obj1"] = df["date_obj1"].astype("category")
41 | df["Age"] = df["Age"].astype("O")
42 | return df
43 |
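
This fixture deliberately mixes numerical, categorical and datetime-like columns; a sketch of the variable-handling helpers it is built for (the import path and toy frame are assumptions, not taken from this conftest):

    import pandas as pd

    from feature_engine.variable_handling import (
        find_categorical_variables,
        find_numerical_variables,
    )

    X = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
        }
    )

    print(find_numerical_variables(X))    # ['Age', 'Marks']
    print(find_categorical_variables(X))  # ['Name']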
--------------------------------------------------------------------------------
/tests/test_variable_handling/test_remove_variables.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 |
4 | from feature_engine.variable_handling.retain_variables import retain_variables_if_in_df
5 |
6 | test_dict = [
7 | (
8 | pd.DataFrame(columns=["A", "B", "C", "D", "E"]),
9 | ["A", "C", "B", "G", "H"],
10 | ["A", "C", "B"],
11 | ["X", "Y"],
12 | ),
13 | (pd.DataFrame(columns=[1, 2, 3, 4, 5]), [1, 2, 4, 6], [1, 2, 4], [6, 7]),
14 | (pd.DataFrame(columns=[1, 2, 3, 4, 5]), 1, [1], 7),
15 | (pd.DataFrame(columns=["A", "B", "C", "D", "E"]), "C", ["C"], "G"),
16 | ]
17 |
18 |
19 | @pytest.mark.parametrize("df, variables, overlap, col_not_in_df", test_dict)
20 | def test_retain_variables_if_in_df(df, variables, overlap, col_not_in_df):
21 |
22 | msg = "None of the variables in the list are present in the dataframe."
23 |
24 | assert retain_variables_if_in_df(df, variables) == overlap
25 |
26 | with pytest.raises(ValueError) as record:
27 | retain_variables_if_in_df(df, col_not_in_df)
28 | assert str(record.value) == msg
29 |
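
A stand-alone sketch of the contract pinned down by the parametrised cases above; the frame is a toy, and the error message is the one asserted in the test:

    import pandas as pd

    from feature_engine.variable_handling.retain_variables import retain_variables_if_in_df

    X = pd.DataFrame(columns=["A", "B", "C", "D", "E"])

    # Only the requested variables present in the frame are kept, in the requested order.
    print(retain_variables_if_in_df(X, ["A", "C", "B", "G", "H"]))  # ['A', 'C', 'B']

    # When none of the requested variables exist, a ValueError is raised.
    try:
        retain_variables_if_in_df(X, ["X", "Y"])
    except ValueError as err:
        print(err)  # None of the variables in the list are present in the dataframe.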
--------------------------------------------------------------------------------
/tests/test_wrappers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/feature-engine/feature_engine/ead24576946db3d7e9eac9d2946ad4a27a46030e/tests/test_wrappers/__init__.py
--------------------------------------------------------------------------------
/tests/test_wrappers/test_check_estimator_wrappers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import sklearn
3 | from sklearn.impute import SimpleImputer
4 | from sklearn.preprocessing import OrdinalEncoder, StandardScaler
5 | from sklearn.utils.estimator_checks import check_estimator
6 | from sklearn.utils.fixes import parse_version
7 |
8 | from feature_engine.wrappers import SklearnTransformerWrapper
9 | from tests.estimator_checks.estimator_checks import (
10 | check_raises_error_when_input_not_a_df,
11 | )
12 | from tests.estimator_checks.fit_functionality_checks import check_feature_names_in
13 | from tests.estimator_checks.non_fitted_error_checks import check_raises_non_fitted_error
14 | from tests.estimator_checks.variable_selection_checks import (
15 | check_all_types_variables_assignment,
16 | check_numerical_variables_assignment,
17 | )
18 |
19 | sklearn_version = parse_version(parse_version(sklearn.__version__).base_version)
20 |
21 | if sklearn_version < parse_version("1.6"):
22 |
23 | def test_sklearn_transformer_wrapper():
24 | check_estimator(SklearnTransformerWrapper(transformer=SimpleImputer()))
25 |
26 | else:
27 |
28 | def test_sklearn_transformer_wrapper():
29 | check_estimator(
30 | estimator=SklearnTransformerWrapper(transformer=SimpleImputer()),
31 | expected_failed_checks=SklearnTransformerWrapper(
32 | transformer=SimpleImputer()
33 | )._more_tags()["_xfail_checks"],
34 | )
35 |
36 |
37 | @pytest.mark.parametrize(
38 | "estimator", [SklearnTransformerWrapper(transformer=OrdinalEncoder())]
39 | )
40 | def test_check_estimator_from_feature_engine(estimator):
41 | check_raises_non_fitted_error(estimator)
42 | check_raises_error_when_input_not_a_df(estimator)
43 | check_feature_names_in(estimator)
44 |
45 |
46 | def test_check_variables_assignment():
47 | check_numerical_variables_assignment(
48 | SklearnTransformerWrapper(transformer=StandardScaler())
49 | )
50 | check_all_types_variables_assignment(
51 | SklearnTransformerWrapper(transformer=OrdinalEncoder())
52 | )
53 |
54 |
55 | def test_raises_error_when_no_transformer_passed():
56 | # this transformer needs an estimator as an input param.
57 | with pytest.raises(TypeError):
58 | SklearnTransformerWrapper()
59 |
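
A short usage sketch of the wrapper under test; only the transformer argument is mandatory (which the TypeError test asserts), while the variables argument and the toy frame are assumptions for illustration:

    import numpy as np
    import pandas as pd
    from sklearn.impute import SimpleImputer

    from feature_engine.wrappers import SklearnTransformerWrapper

    X = pd.DataFrame(
        {"Age": [20.0, np.nan, 19.0], "City": ["London", "Bristol", "Leeds"]}
    )

    # Apply a scikit-learn transformer to a subset of columns while keeping a DataFrame.
    wrapper = SklearnTransformerWrapper(
        transformer=SimpleImputer(strategy="mean"), variables=["Age"]
    )
    Xt = wrapper.fit_transform(X)
    print(Xt["Age"].tolist())  # [20.0, 19.5, 19.0]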
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py39, py310, py311-sklearn150, py311-sklearn160, py312, codecov, docs, stylechecks, typechecks
3 | skipsdist = true
4 |
5 | [testenv]
6 | install_command = pip install {opts} {packages}
7 | envdir = {toxworkdir}/unit_tests
8 | setenv =
9 | PYTHONPATH=.
10 | COVERAGE_RCFILE = {envtmpdir}/coveragerc
11 | commands =
12 | pytest tests
13 |
14 | [testenv:py39]
15 | deps =
16 | -rtest_requirements.txt
17 |
18 | [testenv:py310]
19 | deps =
20 | -rtest_requirements.txt
21 |
22 | [testenv:py311-sklearn150]
23 | deps =
24 | -rtest_requirements.txt
25 | scikit-learn==1.5.1
26 |
27 | [testenv:py311-sklearn160]
28 | deps =
29 | -rtest_requirements.txt
30 | scikit-learn==1.6.1
31 |
32 | [testenv:py312]
33 | deps =
34 | -rtest_requirements.txt
35 |
36 | [testenv:codecov]
37 | deps =
38 | -rtest_requirements.txt
39 | commands_pre =
40 | {envpython} -c 'from pathlib import Path; Path(r"{env:COVERAGE_RCFILE}").write_text(Path(".coveragerc").read_text())'
41 | commands =
42 | coverage run -m pytest -v
43 | coverage report
44 |
45 | [testenv:docs]
46 | deps =
47 | -r docs/requirements.txt
48 | commands =
49 | sphinx-build -W -b html -d {envtmpdir}/doctrees docs {envtmpdir}/html
50 |
51 | [testenv:stylechecks]
52 | deps =
53 | flake8
54 | commands = {posargs:flake8 feature_engine tests}
55 |
56 | [testenv:typechecks]
57 | deps =
58 | mypy
59 | commands = {posargs:mypy feature_engine}
60 |
61 | [flake8]
62 | exclude = .git, env
63 | # match black code formatter
64 | max-line-length = 88
65 |
66 | profile = black
67 | line_length = 88
68 | lines_between_sections = 1
69 | known_first_party = "sentry"
--------------------------------------------------------------------------------