├── __init__.py ├── examples ├── __init__.py ├── manual_multiclass.py └── multiclass_example.ipynb ├── experiments ├── __init__.py ├── replication_experiments │ ├── __init__.py │ ├── replication_data │ │ ├── vary_D_Credit 1.pdf │ │ ├── vary_e_Credit 1.pdf │ │ ├── vary_t_Credit 1.pdf │ │ ├── low_eps_bb_Credit 1.pdf │ │ ├── dp_ebm_vary_T_Credit 1.pdf │ │ ├── comparisons_zoom_Credit 1.pdf │ │ ├── split_candidates_vary_Q_Credit 1.pdf │ │ ├── split_candidates_vary_T_Credit 1.pdf │ │ ├── split_candidates_vary_s_Credit 1.pdf │ │ ├── feature_interactions_vary_k_Credit 1.pdf │ │ └── split_candidates_vary_eps_Credit 1.pdf │ └── experiment_replicator.py └── paper_experiments │ └── paper_plots │ ├── vary_D_Bank.pdf │ ├── vary_D_adult.pdf │ ├── vary_D_nomao.pdf │ ├── vary_clients.pdf │ ├── vary_e_Bank.pdf │ ├── vary_e_adult.pdf │ ├── vary_e_nomao.pdf │ ├── vary_t_Bank.pdf │ ├── vary_t_adult.pdf │ ├── vary_t_nomao.pdf │ ├── bubble_plot_full.pdf │ ├── low_eps_bb_Bank.pdf │ ├── low_eps_bb_adult.pdf │ ├── low_eps_bb_nomao.pdf │ ├── vary_D_Credit 1.pdf │ ├── vary_D_Credit 2.pdf │ ├── vary_e_Credit 1.pdf │ ├── vary_e_Credit 2.pdf │ ├── vary_t_Credit 1.pdf │ ├── vary_t_Credit 2.pdf │ ├── dp_ebm_vary_T_Bank.pdf │ ├── dp_ebm_vary_T_adult.pdf │ ├── dp_ebm_vary_T_nomao.pdf │ ├── low_eps_bb_Credit 1.pdf │ ├── low_eps_bb_Credit 2.pdf │ ├── bubble_plot_tree_mean.pdf │ ├── comparisons_zoom_Bank.pdf │ ├── comparisons_zoom_adult.pdf │ ├── comparisons_zoom_nomao.pdf │ ├── dp_ebm_vary_T_Credit 1.pdf │ ├── dp_ebm_vary_T_Credit 2.pdf │ ├── non_dp_ebm_vary_T_Bank.pdf │ ├── low_eps_bb_Credit 1_zoom.pdf │ ├── non_dp_ebm_vary_T_adult.pdf │ ├── non_dp_ebm_vary_T_nomao.pdf │ ├── total_client_computation.pdf │ ├── total_server_computation.pdf │ ├── comparisons_zoom_Credit 1.pdf │ ├── comparisons_zoom_Credit 2.pdf │ ├── non_dp_ebm_vary_T_Credit 1.pdf │ ├── non_dp_ebm_vary_T_Credit 2.pdf │ ├── comparisons_zoom_higgs-sample.pdf │ ├── split_candidates_vary_Q_Bank.pdf │ ├── split_candidates_vary_Q_adult.pdf │ ├── split_candidates_vary_Q_nomao.pdf │ ├── split_candidates_vary_T_Bank.pdf │ ├── split_candidates_vary_T_adult.pdf │ ├── split_candidates_vary_T_nomao.pdf │ ├── split_candidates_vary_s_Bank.pdf │ ├── split_candidates_vary_s_adult.pdf │ ├── split_candidates_vary_s_nomao.pdf │ ├── comparisons_zoom_Credit 1_right.pdf │ ├── feature_interactions_vary_k_Bank.pdf │ ├── split_candidates_vary_Q_Credit 1.pdf │ ├── split_candidates_vary_Q_Credit 2.pdf │ ├── split_candidates_vary_T_Credit 1.pdf │ ├── split_candidates_vary_T_Credit 2.pdf │ ├── split_candidates_vary_eps_Bank.pdf │ ├── split_candidates_vary_eps_adult.pdf │ ├── split_candidates_vary_eps_nomao.pdf │ ├── split_candidates_vary_s_Credit 1.pdf │ ├── split_candidates_vary_s_Credit 2.pdf │ ├── feature_interactions_vary_k_adult.pdf │ ├── feature_interactions_vary_k_nomao.pdf │ ├── split_candidates_vary_eps_Credit 1.pdf │ ├── split_candidates_vary_eps_Credit 2.pdf │ ├── feature_interactions_vary_k_Credit 1.pdf │ ├── feature_interactions_vary_k_Credit 2.pdf │ ├── split_candidates_vary_Q_higgs-sample.pdf │ ├── split_candidates_vary_T_Credit 1_zoom.pdf │ ├── split_candidates_vary_T_higgs-sample.pdf │ ├── split_candidates_vary_s_higgs-sample.pdf │ ├── split_candidates_vary_eps_higgs-sample.pdf │ ├── split_candidates_vary_T_Credit 1_lineplot.pdf │ ├── split_candidates_vary_s_Credit 1_lineplot.pdf │ └── split_candidates_vary_eps_Credit 1_lineplot.pdf ├── federated_gbdt ├── __init__.py ├── core │ ├── __init__.py │ ├── binning │ │ ├── __init__.py │ │ ├── sparse_vector.py │ │ ├── 
bin_inner_param.py │ │ ├── feature_binning_param.py │ │ ├── bin_result.py │ │ ├── quantile_summaries.py │ │ └── quantile_binning.py │ ├── pure_ldp │ │ ├── __init__.py │ │ ├── frequency_oracles │ │ │ ├── hybrid_mechanism │ │ │ │ ├── __init__.py │ │ │ │ ├── hybrid_mech_server.py │ │ │ │ └── hybrid_mech_client.py │ │ │ ├── square_wave │ │ │ │ ├── __init__.py │ │ │ │ ├── sw_client.py │ │ │ │ └── sw_server.py │ │ │ └── local_hashing │ │ │ │ ├── __init__.py │ │ │ │ ├── fast_lh_client.py │ │ │ │ ├── lh_server.py │ │ │ │ ├── lh_client.py │ │ │ │ └── fast_lh_server.py │ │ └── core │ │ │ ├── prob_simplex.py │ │ │ ├── _freq_oracle_client.py │ │ │ ├── __init__.py │ │ │ ├── fo_creator.py │ │ │ └── _freq_oracle_server.py │ ├── moments_accountant │ │ ├── __init__.py │ │ ├── compute_noise_from_budget_lib.py │ │ └── dp_params.py │ ├── dp_multiq │ │ ├── requirements.txt │ │ ├── AUTHORS │ │ ├── README │ │ ├── run.sh │ │ ├── run_experiment.py │ │ ├── smooth.py │ │ ├── base.py │ │ ├── ind_exp.py │ │ ├── smooth_utils.py │ │ ├── joint_exp.py │ │ └── csmooth.py │ ├── README.MD │ ├── baseline_constants.py │ ├── plotting.py │ └── loss_functions.py ├── models │ ├── base │ │ ├── __init__.py │ │ ├── README.MD │ │ ├── tree_node.py │ │ ├── jit_functions.py │ │ └── tree_base.py │ ├── gbdt │ │ ├── __init__.py │ │ └── components │ │ │ ├── __init__.py │ │ │ ├── index_sampler.py │ │ │ └── train_monitor.py │ ├── __init__.py │ └── README.md └── README.md ├── .gitignore ├── requirements.txt └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/replication_experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .gbdt.private_gbdt import PrivateGBDT 2 | 3 | __all__ = ["PrivateGBDT"] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .idea 3 | __pycache__ 4 | experiments/experiment_data 5 | data 6 | dev 7 | **/pre_paper/** -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/__init__.py: -------------------------------------------------------------------------------- 1 | from .sw_server import SWServer 2 | from .sw_client import SWClient -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py >= 0.1.6 2 | matplotlib >= 3.0.3 3 | pandas >= 1.1.5 4 | numpy >= 1.16.4 5 | scipy >= 1.2.1 6 | -------------------------------------------------------------------------------- /federated_gbdt/models/README.md: -------------------------------------------------------------------------------- 1 | ### Structure: 2 | * ```base``` - Contains base classes for clients, trees and decision nodes 3 | * ``private_gbdt`` - The main DP GBDT framework -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_clients.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_clients.pdf 
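A minimal usage sketch for `PrivateGBDT`, the class exported from `federated_gbdt/models/__init__.py` above. It assumes an sklearn-style `fit`/`predict_proba` interface (consistent with how probabilities are scored in `examples/manual_multiclass.py` later in this dump); the constructor arguments `num_trees` and `epsilon` are illustrative assumptions, since `private_gbdt.py` itself is not reproduced here — check its signature before copying this.

```python
# Sketch only: constructor arguments and return shapes are assumptions (see note above).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from federated_gbdt.models import PrivateGBDT

# Toy binary-classification data standing in for the benchmark datasets
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = PrivateGBDT(num_trees=50, epsilon=1)   # hypothetical argument names
model = model.fit(X_train, y_train)
probs = model.predict_proba(X_test)            # assumed shape: (n_samples, n_classes)
print("Test AUC:", roc_auc_score(y_test, probs[:, 1]))
```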
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/bubble_plot_full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/bubble_plot_full.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_nomao.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_nomao.pdf 
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/bubble_plot_tree_mean.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/bubble_plot_tree_mean.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Bank.pdf -------------------------------------------------------------------------------- 
/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1_zoom.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1_zoom.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/total_client_computation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/total_client_computation.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/total_server_computation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/total_server_computation.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- 
/experiments/replication_experiments/replication_data/vary_D_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_D_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/vary_e_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_e_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/vary_t_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_t_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_adult.pdf -------------------------------------------------------------------------------- 
/experiments/paper_experiments/paper_plots/split_candidates_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_nomao.pdf -------------------------------------------------------------------------------- /federated_gbdt/README.md: -------------------------------------------------------------------------------- 1 | ### Structure: 2 | 3 | * ``core`` - Contains helper functions and structures for XGBoost such as quantile sketches, loss functions and pre-processing methods 4 | * ``models`` - Contains main GBDT models -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1_right.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1_right.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 2.pdf 
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/low_eps_bb_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/low_eps_bb_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_adult.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/__init__.py: -------------------------------------------------------------------------------- 1 | from .lh_server import LHServer 2 | from .lh_client import LHClient 3 | 4 | from .fast_lh_client import FastLHClient 5 | from .fast_lh_server import FastLHServer -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_higgs-sample.pdf -------------------------------------------------------------------------------- 
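`local_hashing/__init__.py` above exposes the `LHClient`/`LHServer` pair (plus the fast heuristic variants). The sketch below shows the usual frequency-oracle round trip: clients privatise their items locally, the server aggregates the reports and returns de-biased frequency estimates. The keyword arguments follow the `FastLHClient` constructor reproduced later in this dump; the server-side `aggregate`/`estimate` calls are assumed to mirror the `FreqOracleServer` pattern seen in `HMServer` below, so verify against `lh_server.py` before relying on it.

```python
# Sketch of a pure-LDP frequency-oracle round trip using (optimised) local hashing.
# The LHServer interface is an assumption based on the other oracles in this package.
import numpy as np

from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHClient, LHServer

epsilon, d = 3, 4  # privacy budget and domain size; items live in {1, ..., d}
client = LHClient(epsilon=epsilon, d=d, use_olh=True)
server = LHServer(epsilon=epsilon, d=d, use_olh=True)

# Each simulated user privatises one item and sends only the report to the server
rng = np.random.default_rng(0)
data = rng.choice(np.arange(1, d + 1), size=5000, p=[0.4, 0.3, 0.2, 0.1])
for item in data:
    server.aggregate(client.privatise(item))

# Frequency estimates for each domain element (roughly 2000, 1500, 1000, 500 here)
print([server.estimate(i) for i in range(1, d + 1)])
```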
/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_zoom.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_zoom.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/comparisons_zoom_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/comparisons_zoom_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_Q_Credit 1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_Q_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_s_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_s_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/feature_interactions_vary_k_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/feature_interactions_vary_k_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_eps_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_eps_Credit 1.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | autodp==0.2 2 | bitarray==2.6.2 3 | fast_histogram==0.11 4 | matplotlib==3.6.2 5 | numba==0.56.4 6 | numpy==1.23.5 7 | pandas==1.5.2 8 | pathos==0.3.0 9 | pmlb==1.0.1.post3 10 | scikit_learn==1.2.0 11 | scipy==1.10.0 12 | seaborn==0.12.2 13 | six==1.16.0 14 | statsmodels==0.13.5 15 | tqdm==4.64.1 16 | xxhash==3.2.0 17 | notebook==6.5.2 -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of authors for copyright purposes. 2 | # 3 | # This does not necessarily list everyone who has contributed code, since in 4 | # some cases, their employer may be the copyright holder. To see the full list 5 | # of contributors, see the revision history in source control. 
6 | Google LLC 7 | Jennifer Gillenwater 8 | Matthew Joseph 9 | Alex Kulesza -------------------------------------------------------------------------------- /federated_gbdt/models/base/README.MD: -------------------------------------------------------------------------------- 1 | ### File Structure: 2 | 3 | * ``client.py`` - Leftover code from FEVERLESS, used in ``federated_xgboost.models.vertical`` 4 | * ``tree_base.py`` - Base class for the XGBoost model, modified from FEVERLESS code 5 | * ``tree_node.py`` - Base class for a node in a decision tree, modified from FEVERLESS code 6 | * ``tree_params.py`` - Leftover code from FEVERLESS, used in ``federated_xgboost.models.vertical`` -------------------------------------------------------------------------------- /federated_gbdt/core/README.MD: -------------------------------------------------------------------------------- 1 | ### Structue of federated_xgboost.core: 2 | 3 | * ``binning`` - Binning/Quantile Sketching implementation from FATE 4 | * ``dp_multiq`` - Google implementation of multi-dimensional DP quantiles (see https://arxiv.org/abs/2102.08244) 5 | * ``moments_accountant`` - RDP moments accountant implementation (of the sampled gaussian mechanism (SGM)) from tensorflow.privacy 6 | * ``pure_ldp`` - Various implementations of LDP protocols 7 | * ``baseline_constants.py`` - Leftover code from FEVERLESS, contains parameters for quantile sketching -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/prob_simplex.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | #simplex projection 4 | def project_probability_simplex(p_estimate): 5 | k = len(p_estimate) # Infer the size of the alphabet. 6 | p_estimate_sorted = np.sort(p_estimate) 7 | p_estimate_sorted[:] = p_estimate_sorted[::-1] 8 | p_sorted_cumsum = np.cumsum(p_estimate_sorted) 9 | i = 1 10 | while i < k: 11 | if p_estimate_sorted[i] + (1.0 / (i + 1)) * (1 - p_sorted_cumsum[i]) < 0: 12 | break 13 | i += 1 14 | lmd = (1.0 / i) * (1 - p_sorted_cumsum[i - 1]) 15 | return np.maximum(p_estimate + lmd, 0) -------------------------------------------------------------------------------- /federated_gbdt/core/binning/sparse_vector.py: -------------------------------------------------------------------------------- 1 | # sparse vector ------------------------------------------------------ 2 | class SparseVector: 3 | """ 4 | sparse vector: dict, record (indices, data) kv tuples 5 | shape: the real feature shape of data 6 | """ 7 | def __init__(self, indices=None, fn=None, data=None, shape=0): 8 | self.sparse_vec = dict(zip(indices, data)) 9 | self.feature_name = fn 10 | self.shape = shape 11 | 12 | def get_all_data(self): 13 | for idx, data in self.sparse_vec.items(): 14 | yield idx, data 15 | 16 | def get_shape(self): 17 | return self.shape 18 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/README: -------------------------------------------------------------------------------- 1 | First download the Goodreads dataset from Kaggle: 2 | https://www.kaggle.com/jealousleopard/goodreadsbooks. 3 | Save it as "books.csv" in this directory. Then use 4 | 5 | > cd .. 6 | > python -m dp_multiq.run_experiment 7 | 8 | to run a small number of trials for each of the four data regimes described in 9 | the paper. This will produce and save plots with names eps_1_error_[data].png 10 | and eps_1_times_[data].png. 
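Stepping back to `prob_simplex.py`, reproduced a little earlier: `project_probability_simplex` is the standard Euclidean projection onto the probability simplex. The sketch below shows the likely intended use — post-processing a noisy frequency-estimate vector (which may have negative entries and need not sum to one) into the closest valid distribution. That framing is a reading of the code rather than something stated in the file.

```python
# Usage sketch for project_probability_simplex (defined in prob_simplex.py above).
import numpy as np

from federated_gbdt.core.pure_ldp.core.prob_simplex import project_probability_simplex

noisy_estimate = np.array([0.52, -0.07, 0.31, 0.18])  # e.g. de-biased LDP estimates
p = project_probability_simplex(noisy_estimate)

print(p)        # all entries are non-negative
print(p.sum())  # sums to 1 (up to floating point)
```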
11 | 12 | To fully replicate the experiments from the paper, change the default parameters 13 | in the experiment() function to 14 | 15 | num_samples=1000 16 | num_quantiles_range=range(1, 30) 17 | est_num_trials=20 18 | ts_num_trials=5 -------------------------------------------------------------------------------- /federated_gbdt/core/baseline_constants.py: -------------------------------------------------------------------------------- 1 | 2 | ACCURACY_KEY = 'accuracy' 3 | BYTES_WRITTEN_KEY = 'bytes_written' 4 | BYTES_READ_KEY = 'bytes_read' 5 | LOCAL_COMPUTATIONS_KEY = 'local_computations' 6 | NUM_ROUND_KEY = 'round_number' 7 | NUM_SAMPLES_KEY = 'num_samples' 8 | CLIENT_ID_KEY = 'client_id' 9 | 10 | FLOAT_ZERO = 1e-8 11 | QUANTILE = 'quantile' 12 | DEFAULT_COMPRESS_THRESHOLD = 10000 13 | DEFAULT_HEAD_SIZE = 10000 14 | DEFAULT_RELATIVE_ERROR = 0.001 15 | G_BIN_NUM = 10 16 | MAX_SPLIT_NODES = 2 ** 16 17 | SECURE_AGG_AMPLIFY_FACTOR = 1000 18 | 19 | MAX_CLASSNUM = 1000 20 | 21 | LABEL_NAME = ['default.payment.next.month', 'SeriousDlqin2yrs', 'y', 'y_yes', 'Appliances'] 22 | 23 | CLASSIFICATION = 'classification' 24 | REGRESSION = 'regression' 25 | CLUSTERING = 'clustering' 26 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Google Research Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | virtualenv -p python3 . 20 | source ./bin/activate 21 | 22 | pip install -r requirements.txt 23 | pushd .. 
24 | python3 -m dp_multiq.run_experiment 25 | popd 26 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/hybrid_mech_server.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 2 | import numpy as np 3 | 4 | 5 | class HMServer(FreqOracleServer): 6 | def __init__(self, epsilon, d, index_mapper=None): 7 | super().__init__(epsilon, d, index_mapper=index_mapper) 8 | self.update_params(epsilon, d, index_mapper=index_mapper) 9 | self.aggregated_data = [] 10 | 11 | def update_params(self, epsilon=None, d=None, index_mapper=None): 12 | super().update_params(epsilon, d, index_mapper) 13 | 14 | def aggregate(self, priv_data): 15 | self.aggregated_data.append(priv_data) 16 | self.n += 1 17 | 18 | def _update_estimates(self): 19 | mean = np.mean(self.aggregated_data) 20 | return mean 21 | 22 | def estimate(self, suppress_warnings=False): 23 | """ 24 | Calculates a frequency estimate of the given data item 25 | Args: 26 | data: data item 27 | suppress_warnings: Optional boolean - Supresses warnings about possible inaccurate estimations 28 | Returns: float - frequency estimate 29 | """ 30 | self.check_warnings(suppress_warnings=suppress_warnings) 31 | return self._update_estimates() 32 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/sw_client.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 2 | import numpy as np 3 | import random 4 | import math 5 | 6 | 7 | class SWClient(FreqOracleClient): 8 | def __init__(self, epsilon, index_mapper=None): 9 | super().__init__(epsilon=epsilon, d=None, index_mapper=index_mapper) 10 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 11 | 12 | def update_params(self, epsilon=None, d=None, index_mapper=None): 13 | super().update_params(epsilon, d, index_mapper) 14 | ee = np.exp(self.epsilon) 15 | if epsilon is not None or d is not None: 16 | self.b = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) 17 | self.p = ee / ((2 * self.b * ee) + 1) 18 | self.q = 1 / ((2 * self.b * ee) + 1) 19 | 20 | def _perturb(self, data): 21 | if random.random() <= 2 * self.b * self.p: 22 | perturbed_val = random.uniform(data - self.b, data + self.b) 23 | else: 24 | if random.random() < 0.5: 25 | perturbed_val = random.uniform(-self.b, data - self.b) 26 | else: 27 | perturbed_val = random.uniform(data + self.b, 1 + self.b) 28 | 29 | return perturbed_val 30 | 31 | def privatise(self, data): 32 | # index = self.index_mapper(data) 33 | return self._perturb(data) 34 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/run_experiment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Script for running experiments and saving the resulting plots.""" 17 | 18 | import warnings 19 | 20 | from dp_multiq import experiment 21 | 22 | # To suppress RuntimeWarnings about log(0). This quantity occurs frequently in 23 | # our code, and doesn't mean that anything is going wrong; numpy.log(0) will 24 | # produce -numpy.inf, which our code handles appropriately. 25 | warnings.simplefilter("ignore", category=RuntimeWarning) 26 | 27 | experiment.experiment(methods=[ 28 | experiment.QuantilesEstimationMethod.JOINT_EXP, experiment 29 | .QuantilesEstimationMethod.IND_EXP, experiment.QuantilesEstimationMethod 30 | .APP_IND_EXP, experiment.QuantilesEstimationMethod.SMOOTH, 31 | experiment.QuantilesEstimationMethod.CSMOOTH, 32 | experiment.QuantilesEstimationMethod.LAP_TREE 33 | ]) 34 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/fast_lh_client.py: -------------------------------------------------------------------------------- 1 | import random 2 | from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHClient 3 | 4 | # Client-side for fast local-hashing 5 | # Heuristic fast variant of OLH 6 | 7 | class FastLHClient(LHClient): 8 | def __init__(self, epsilon, d, k, g=2, use_olh=False, index_mapper=None): 9 | """ 10 | Fast heuristic version of OLH 11 | 12 | Args: 13 | epsilon: float - The privacy budget 14 | g: Optional integer - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 15 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing (OLH) i.e g is set to round(e^epsilon + 1) 16 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 17 | """ 18 | self.k = k 19 | super().__init__(epsilon, d, g, use_olh, index_mapper) 20 | 21 | if k is not None: 22 | self.k = k 23 | 24 | def update_params(self, epsilon=None, d=None, k=None, use_olh=None, g=None, index_mapper=None): 25 | super().update_params(epsilon, d, use_olh, g, index_mapper) 26 | self.k = k if k is not None else self.k 27 | 28 | def privatise(self, data): 29 | """ 30 | Privatises a user's data using fast local hashing (FLH) 31 | 32 | Args: 33 | data: The data to be privatised 34 | 35 | Returns: 36 | privatised data: a single integer 37 | """ 38 | 39 | seed = random.randint(0, self.k-1) 40 | return self._perturb(data, seed), seed 41 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/_freq_oracle_client.py: -------------------------------------------------------------------------------- 1 | 2 | class FreqOracleClient: 3 | def __init__(self, epsilon, d, index_mapper=None): 4 | """ 5 | 6 | Args: 7 | epsilon (float): Privacy budget 8 | d (int): domain size - not all freq oracles need this, so can be None 9 | index_mapper (func): Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 10 | """ 11 | self.epsilon = 
epsilon 12 | self.d = d 13 | 14 | if index_mapper is None: 15 | self.index_mapper = lambda x: x - 1 16 | else: 17 | self.index_mapper = index_mapper 18 | 19 | def update_params(self, epsilon=None, d=None, index_mapper=None): 20 | """ 21 | Method to update params of freq oracle client, should be overridden if more options needed. 22 | Args: 23 | epsilon (optional float): Privacy budget 24 | d (optional int): Domain size 25 | index_mapper (optional func): Index map function 26 | """ 27 | self.epsilon = epsilon if epsilon is not None else self.epsilon 28 | self.d = d if d is not None else self.d 29 | self.index_mapper = index_mapper if index_mapper is not None else self.index_mapper 30 | 31 | def _perturb(self, data): 32 | """ 33 | Used internally to perturb raw data, must be implemented by a FreqOracle 34 | Args: 35 | data: user's data item 36 | """ 37 | raise NotImplementedError("Must implement") 38 | 39 | def privatise(self, data): 40 | """ 41 | Public facing method to privatise user's data 42 | Args: 43 | data: user's data item 44 | """ 45 | raise NotImplementedError("Must implement") -------------------------------------------------------------------------------- /federated_gbdt/models/base/tree_node.py: -------------------------------------------------------------------------------- 1 | from numba.experimental import jitclass 2 | from numba import float32, int32, deferred_type, optional, types 3 | 4 | # Node of decision tree, recursive model 5 | 6 | node_type = deferred_type() # numba 7 | 8 | array_type = types.float32[:] 9 | 10 | spec = [ # numba 11 | ('feature_i', int32), 12 | ('threshold', optional(float32)), 13 | # ('value', optional(float32)), 14 | ('value', optional(array_type)), 15 | ('true_branch', optional(node_type)), 16 | ('false_branch', optional(node_type)), 17 | ('split_gain', optional(float32)), 18 | ('hessian_sum', optional(float32)), 19 | ('gradient_sum', optional(float32)), 20 | ('num_observations', optional(int32)), 21 | ('depth', optional(int32)), 22 | ] 23 | 24 | # @jitclass(spec) # numba 25 | class DecisionNode: 26 | def __init__(self, node_id="empty", feature_i=None, threshold=None, 27 | value=None, true_branch=None, false_branch=None, split_gain=None, hessian_sum=None, gradient_sum=None, num_observations=None, depth=None): 28 | 29 | self.feature_i = feature_i # Index for feature that is split on 30 | self.threshold = threshold # Split candidate value 31 | self.value = value # Value if the node is a leaf in the tree 32 | self.true_branch = true_branch # Left subtree 33 | self.false_branch = false_branch # Right subtree 34 | self.node_id = node_id # Node id for post-training stats 35 | 36 | # Needed for feature importance calculations 37 | self.split_gain = split_gain 38 | self.hessian_sum = hessian_sum 39 | self.gradient_sum = gradient_sum 40 | self.num_observations = num_observations 41 | self.depth = depth 42 | 43 | # node_type.define(DecisionNode.class_type.instance_type) # numba -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/__init__.py: -------------------------------------------------------------------------------- 1 | import xxhash 2 | import hashlib 3 | from bitarray import bitarray 4 | 5 | # Base classes for frequency oracles 6 | from ._freq_oracle_client import FreqOracleClient 7 | from ._freq_oracle_server import FreqOracleServer 8 | 9 | # Helper functions for generating hash funcs 10 | 11 | def generate_hash_funcs(k, m): 12 | """ 13 | Generates k hash functions that map data to the 
range {0, 1,..., m-1} 14 | Args: 15 | k: The number of hash functions 16 | m: The domain {0,1,...,m-1} that hash func maps to 17 | Returns: List of k hash functions 18 | """ 19 | hash_funcs = [] 20 | for i in range(0, k): 21 | hash_funcs.append(generate_hash(m, i)) 22 | return hash_funcs 23 | 24 | 25 | def generate_256_hash(): 26 | """ 27 | 28 | Returns: A hash function that maps data to {0,1,... 255} 29 | 30 | """ 31 | return lambda data: xxhash.xxh64(data, seed=10).intdigest() % 256 32 | 33 | 34 | def generate_hash(m, seed): 35 | """ 36 | Generate a single hash function that maps data to {0, ... ,m-1} 37 | Args: 38 | m: int domain to map to 39 | seed: int the seed for the hash function 40 | 41 | Returns: Hash function 42 | 43 | """ 44 | return lambda data: xxhash.xxh64(str(data), seed=seed).intdigest() % m 45 | 46 | 47 | def get_sha256_hash_arr(hashId, dataString): 48 | """ 49 | Used in priv_count_sketch freq oracle for hashing 50 | Args: 51 | hashId: seed of the hash 52 | dataString: data string to hash 53 | 54 | Returns: hashed data as a bitarray 55 | 56 | """ 57 | message = hashlib.sha256() 58 | 59 | message.update((str(hashId) + dataString).encode("utf8")) 60 | 61 | message_in_bytes = message.digest() 62 | 63 | message_in_bit_array = bitarray(endian='little') 64 | message_in_bit_array.frombytes(message_in_bytes) 65 | 66 | return message_in_bit_array 67 | -------------------------------------------------------------------------------- /examples/manual_multiclass.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../") 4 | 5 | from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT 6 | from federated_gbdt.core.loss_functions import SoftmaxCrossEntropyLoss 7 | from experiments.experiment_helpers.data_loader import DataLoader 8 | from sklearn.metrics import roc_auc_score, accuracy_score 9 | from sklearn.preprocessing import OneHotEncoder 10 | import numpy as np 11 | 12 | from xgboost import XGBClassifier 13 | 14 | # Load connect4 dataset 15 | dataloader = DataLoader() 16 | X_train, X_test, y_train, y_test = dataloader.load_datasets( 17 | ["connect_4"], return_dict=False 18 | )[0] 19 | onehot_y_test = OneHotEncoder(sparse_output=False).fit_transform(y_test.reshape(-1, 1)) 20 | 21 | # XGBoost baseline 22 | xgb = XGBClassifier().fit(X_train, y_train) 23 | xgb_probs = xgb.predict_proba(X_test) 24 | xgb_pred = np.argmax(xgb_probs, axis=1) 25 | print(f"XGBoost AUC - {roc_auc_score(onehot_y_test, xgb_probs)}") 26 | print(f"XGBoost Accuracy - {accuracy_score(y_test, xgb_pred)}") 27 | print("\n") 28 | 29 | # PrivateGBDT (one-vs-all, total privacy budget eps=3) 30 | C = len(np.unique(y_train)) # C=3 classes for connect4 31 | total_eps = 3 32 | # split the total privacy budget evenly across the C one-vs-all models 33 | class_eps = total_eps / C 34 | class_probs = [] 35 | for c in range(0, C): 36 | print(f"Training model... 
class {c} vs all") 37 | dp_method = "" if class_eps == 0 else "gaussian_cdp" 38 | xgb_model = PrivateGBDT(num_trees=100, epsilon=class_eps, dp_method=dp_method) 39 | y_train_c = (y_train == c).astype(int) # one-vs-all for class k 40 | xgb_model = xgb_model.fit(X_train, y_train_c) 41 | class_probs.append(xgb_model.predict_proba(X_test)[:, 1]) 42 | y_probs = SoftmaxCrossEntropyLoss().predict(np.array(list(zip(*class_probs)))) 43 | y_pred = np.argmax(y_probs, axis=1) 44 | print( 45 | f"PrivateGBDT (epsilon={total_eps}) AUC - {roc_auc_score(onehot_y_test, y_probs)}" 46 | ) 47 | print(f"PrivateGBDT (epsilon={total_eps}) Accuracy - {accuracy_score(y_test, y_pred)}") 48 | -------------------------------------------------------------------------------- /federated_gbdt/models/base/jit_functions.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import math 3 | 4 | @numba.jit(nopython=True) 5 | def _L1_clip(total_grads, reg_alpha): 6 | """ 7 | L1 regularisation on the gradients, controlled by self.reg_alpha 8 | 9 | :param total_grads: 10 | :return: 11 | """ 12 | if total_grads > reg_alpha: 13 | return total_grads - reg_alpha 14 | elif total_grads < -1 * reg_alpha: 15 | return total_grads + reg_alpha 16 | else: 17 | return 0 18 | 19 | 20 | @numba.jit(nopython=True) 21 | def _calculate_weight(total_grads, total_hess, reg_alpha, reg_delta, reg_lambda): 22 | """ 23 | Calculates weight for leaf nodes 24 | 25 | :param total_grads: Total sum of gradients 26 | :param total_hess: Total sum of hessians 27 | :return: Weight for leaf node 28 | """ 29 | if total_hess < 0: 30 | total_hess = 0 31 | 32 | weight = -1 * (_L1_clip(total_grads, reg_alpha) / (total_hess + reg_lambda)) 33 | if reg_delta != 0 and abs(weight) > reg_delta: 34 | return math.copysign(reg_delta, weight) # Delta clipping 35 | else: 36 | return weight 37 | 38 | 39 | @numba.jit(nopython=True) 40 | def _calculate_gain(total_grads, total_hess, reg_alpha, reg_delta, reg_lambda): 41 | """ 42 | Calculates gain from sum of gradients and sum of hessians 43 | 44 | :param total_grads: Sum of gradients 45 | :param total_hess: Sum of hessians 46 | :return: Gain score 47 | """ 48 | con = _L1_clip(total_grads, reg_alpha) 49 | weight = -1 * (con / (total_hess + reg_lambda)) 50 | if reg_delta != 0 and abs(weight) > reg_delta: # If delta-clipping is enabled the gain calculation is a little more complicated, following the implementation in the original XGBoost: https://github.com/dmlc/xgboost/blob/d7d1b6e3a6e2aa8fcb1857bf5e3188302a03b399/src/tree/param.h 51 | weight = math.copysign(reg_delta, weight) # Delta clipping 52 | return -(2 * total_grads * weight + (total_hess + reg_lambda) * weight ** 2) + reg_alpha * abs(weight) # This is an L1-regularised clipped gain calculation 53 | else: 54 | return -weight * con # G^2/H + lambda, with possible L1 regularisation and delta clipping on G 55 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/smooth.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Smooth sensitivity method for computing differentially private quantiles. 17 | 18 | Lemmas 2.6 and 2.9 from "Smooth Sensitivity and Sampling in Private Data 19 | Analysis" by Nissim, Radkhodnikova, and Smith 20 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf) describe the 21 | noise scaled to the smooth sensitivity. 22 | """ 23 | 24 | import numpy as np 25 | 26 | from dp_multiq import base 27 | from dp_multiq import smooth_utils 28 | 29 | 30 | def smooth(sorted_data, data_low, data_high, qs, divided_eps, divided_delta): 31 | """Returns (eps, delta)-differentially private quantile estimates for qs. 32 | 33 | Args: 34 | sorted_data: Array of data points sorted in increasing order. 35 | data_low: Lower limit for any differentially private quantile output value. 36 | data_high: Upper limit for any differentially private quantile output value. 37 | qs: Increasing array of quantiles in [0,1]. 38 | divided_eps: Privacy parameter epsilon, assumed to be already divided for 39 | the desired overall eps. 40 | divided_delta: Privacy parameter delta, assumed to be already divided for 41 | the desired overall delta. 42 | """ 43 | sorted_data = np.clip(sorted_data, data_low, data_high) 44 | o = np.empty(len(qs)) 45 | n = len(sorted_data) 46 | alpha = divided_eps / 2.0 47 | beta = divided_eps / (2 * np.log(2 / divided_delta)) 48 | for i in range(len(qs)): 49 | true_quantile_idx = base.quantile_index(n, qs[i]) 50 | true_quantile_value = sorted_data[true_quantile_idx] 51 | log_sensitivity = smooth_utils.compute_log_smooth_sensitivity( 52 | sorted_data, data_low, data_high, true_quantile_idx, beta) 53 | noise = np.exp(log_sensitivity) * np.random.laplace() / alpha 54 | o[i] = true_quantile_value + noise 55 | o = np.clip(o, data_low, data_high) 56 | return np.sort(o) 57 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/fo_creator.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.frequency_oracles import * 2 | import copy 3 | import inspect 4 | 5 | 6 | # Used to create a list of possible frequency oracles in the pure-LDP library 7 | 8 | client_class_list = [] 9 | server_class_list = [] 10 | globs = list(globals().keys()).copy() # Create copy, since globals updates too much to iterate through 11 | 12 | for key in globs: 13 | if "Client" in key: 14 | client_class_list.append(key.replace("Client", "")) 15 | if "Server" in key: 16 | server_class_list.append(key.replace("Server", "")) 17 | 18 | class_list = {"Client": client_class_list, "Server": server_class_list} 19 | 20 | 21 | def _create_pure_fo_instance(obj_type, name, obj_params): 22 | """ 23 | Used internally to create instances of various Client/Server frequency oracles 24 | 25 | Args: 26 | obj_type (str): Either "Client" or "Server" 27 | name: Name of the frequency oracle to create 28 | obj_params: Parameters for the frequency oracle object 29 | 30 | Returns: Instance of name + obj_type frequency oracle 31 | 32 | """ 33 | fo_list = 
class_list[obj_type] 34 | 35 | split = name.split("_") # Get prefix of client name i.e if passed "local_hashing" get "LH" as prefix 36 | 37 | if len(split) > 1: 38 | name = "" 39 | for word in split: 40 | name += word[0] 41 | 42 | name = name.upper() 43 | if name == "HR": name = "HADAMARDRESPONSE" 44 | 45 | upper_fo_list = list(map(lambda x: x.upper(), fo_list)) 46 | 47 | if name not in upper_fo_list: 48 | raise ValueError("Frequency oracle must be one of:", fo_list, 49 | "\n NOTE: Values are case insensitive") 50 | 51 | fo_name = client_class_list[upper_fo_list.index(name)] + obj_type 52 | 53 | constructor = globals().get(fo_name) 54 | expected_params = list(inspect.signature(constructor).parameters) 55 | 56 | params = dict( 57 | (key.split("=")[0], obj_params[key.split("=")[0]]) for key in expected_params if key in obj_params.keys()) 58 | 59 | return constructor(**params) 60 | 61 | 62 | def create_fo_client_instance(name, client_params): 63 | """ 64 | Given a name of a frequency oracle creates a client instance of it 65 | 66 | Args: 67 | name: Name of frequency oracle (i.e LH, HE) 68 | client_params: The parameters for the client frequency oracle object 69 | 70 | Returns: A frequency oracle instance of nameClient 71 | 72 | """ 73 | return _create_pure_fo_instance("Client", name, client_params) 74 | 75 | 76 | def create_fo_server_instance(name, server_params): 77 | """ 78 | Given a name of a frequency oracle creates a server instance of it 79 | 80 | Args: 81 | name: Name of frequency oracle (i.e LH, HE) 82 | server_params: The parameters for the server frequency oracle 83 | 84 | Returns: A frequency oracle instance of nameServer 85 | 86 | """ 87 | return _create_pure_fo_instance("Server", name, server_params) 88 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Basic methods for generating data and computing non-private quantiles.""" 17 | 18 | import math 19 | import numpy as np 20 | 21 | 22 | def quantile_index(n, quantile): 23 | """Returns index of the specified quantile in a sorted dataset of n elements. 24 | 25 | Args: 26 | n: Size of the sorted dataset. 27 | quantile: A value in [0, 1] indicating the desired quantile. 28 | 29 | Returns: 30 | Index of the specified quantile. If the quantile is between points at 31 | indices i and i+1, returns i. 32 | """ 33 | return int(math.floor((n - 1) * quantile)) 34 | 35 | 36 | def quantiles(data, qs): 37 | """Returns quantile estimates for qs. 38 | 39 | Args: 40 | data: A dataset sorted in increasing order. 41 | qs: Increasing array of quantiles in [0,1]. 
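    Example (illustrative note, not part of the original source): for
    data = np.array([1., 2., 3., 4., 5.]) and qs = np.array([0.25, 0.5]), the
    'lower' interpolation picks existing data points and returns array([2., 3.]),
    i.e. the values at indices quantile_index(5, 0.25) = 1 and
    quantile_index(5, 0.5) = 2.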
42 | """ 43 | return np.quantile(data, qs, interpolation='lower') 44 | 45 | 46 | def misclassified_points_error(sorted_data, true_quantiles, est_quantiles): 47 | """Returns the average number of data points between true and est quantiles. 48 | 49 | Args: 50 | sorted_data: A dataset sorted in increasing order. 51 | true_quantiles: Ground truth quantiles. 52 | est_quantiles: Estimated quantiles. 53 | 54 | Returns: 55 | The number of data points strictly between true_quantiles[j] and 56 | est_quantiles[j], averaged over all j. 57 | """ 58 | total_missed = 0 59 | num_quantiles = len(true_quantiles) 60 | for q_idx in range(num_quantiles): 61 | total_missed += np.abs( 62 | np.sum(sorted_data > true_quantiles[q_idx]) - 63 | np.sum(sorted_data > est_quantiles[q_idx])) 64 | return total_missed / num_quantiles 65 | 66 | 67 | def distance_error(true_quantiles, est_quantiles): 68 | """Returns the mean distance between the true and estimated quantiles. 69 | 70 | Args: 71 | true_quantiles: Ground truth quantiles. 72 | est_quantiles: Estimated quantiles. 73 | """ 74 | return np.mean(np.abs(true_quantiles - est_quantiles)) 75 | 76 | 77 | def gen_gaussian(num_samples, mean, stddev): 78 | """Returns num_samples iid Gaussian samples in increasing order. 79 | 80 | Args: 81 | num_samples: Number of samples to return. 82 | mean: Mean of Gaussian distribution to sample. 83 | stddev: Standard deviation of Gaussian distribution to sample. 84 | """ 85 | return np.sort(np.random.normal(loc=mean, scale=stddev, size=num_samples)) 86 | 87 | 88 | def gen_uniform(num_samples, data_low, data_high): 89 | """Returns num_samples iid uniform samples in increasing order. 90 | 91 | Args: 92 | num_samples: Number of samples to return. 93 | data_low: Lower bound of uniform distribution to sample. 94 | data_high: Upper bound of uniform distribution to sample. 
95 | """ 96 | return np.sort( 97 | np.random.uniform(low=data_low, high=data_high, size=num_samples)) 98 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/lh_server.py: -------------------------------------------------------------------------------- 1 | import math 2 | import xxhash 3 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 4 | 5 | # Server-side for local-hashing 6 | 7 | # Loosely based on https://github.com/vvv214/LDP_Protocols/blob/master/olh.py 8 | 9 | class LHServer(FreqOracleServer): 10 | def __init__(self, epsilon, d, g=2, use_olh=False, index_mapper=None): 11 | """ 12 | 13 | Args: 14 | epsilon: float - The privacy budget 15 | d: integer - Size of the data domain 16 | g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 17 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing i.e g is set to round(e^epsilon + 1) 18 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 19 | """ 20 | super().__init__(epsilon, d, index_mapper=index_mapper) 21 | self.set_name("LHServer") 22 | self.g = g 23 | self.use_olh = use_olh 24 | self.update_params(epsilon=epsilon, d=d, g=g, index_mapper=index_mapper) 25 | 26 | def update_params(self, epsilon=None, d=None, use_olh=None, g=None, index_mapper=None): 27 | """ 28 | Updates LHServer parameters, will reset any aggregated/estimated data 29 | Args: 30 | epsilon: optional - privacy budget 31 | d: optional - domain size 32 | g: optional - hash domain 33 | index_mapper: optional - function 34 | """ 35 | super().update_params(epsilon, d, index_mapper) 36 | 37 | # If use_olh is true, then update the g parameter 38 | if use_olh is not None: 39 | self.use_olh = use_olh 40 | 41 | self.g = g if g is not None else self.g 42 | if self.use_olh is True: 43 | self.g = int(round(math.exp(self.epsilon))) + 1 44 | 45 | # Update probs and g 46 | if epsilon is not None: 47 | self.p = math.exp(self.epsilon) / (math.exp(self.epsilon) + self.g - 1) 48 | 49 | def aggregate(self, priv_data): 50 | """ 51 | Aggregates privatised data from LHClient to be used to calculate frequency estimates. 52 | 53 | Args: 54 | priv_data: Privatised data of the form returned from UEClient.privatise 55 | """ 56 | seed = priv_data[1] 57 | priv_data = priv_data[0] 58 | 59 | for i in range(0, self.d): 60 | if priv_data == (xxhash.xxh32(str(i), seed=seed).intdigest() % self.g): 61 | self.aggregated_data[i] += 1 62 | 63 | self.n += 1 64 | 65 | def _update_estimates(self): 66 | a = self.g / (self.p * self.g - 1) 67 | b = self.n / (self.p * self.g - 1) 68 | 69 | self.estimated_data = a * self.aggregated_data - b 70 | return self.estimated_data 71 | 72 | def estimate(self, data, suppress_warnings=False): 73 | """ 74 | Calculates a frequency estimate of the given data item using the aggregated data. 
75 | 76 | Args: 77 | data: data item 78 | suppress_warnings: Optional boolean - Suppresses warnings about possible inaccurate estimations 79 | 80 | Returns: float - frequency estimate of the data item 81 | 82 | """ 83 | self.check_warnings(suppress_warnings=suppress_warnings) 84 | index = self.index_mapper(data) 85 | self.check_and_update_estimates() 86 | return self.estimated_data[index] 87 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/lh_client.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import xxhash 4 | from sys import maxsize 5 | import random 6 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 7 | 8 | # Client-side for local-hashing 9 | 10 | # Very loosely based on code by Wang (https://github.com/vvv214/LDP_Protocols/blob/master/olh.py) 11 | 12 | class LHClient(FreqOracleClient): 13 | def __init__(self, epsilon, d, g=2, use_olh=False, index_mapper=None): 14 | """ 15 | 16 | Args: 17 | epsilon: float - The privacy budget 18 | d: integer - Domain size 19 | g: Optional integer - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 20 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing (OLH) i.e g is set to round(e^epsilon + 1) 21 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 22 | """ 23 | super().__init__(epsilon, d, index_mapper=index_mapper) 24 | self.use_olh = use_olh 25 | self.g =g 26 | self.update_params(epsilon=epsilon, d=d, g=g, index_mapper=index_mapper) 27 | 28 | def update_params(self, epsilon=None, d=None, use_olh=None, g=None, index_mapper=None): 29 | """ 30 | 31 | Args: 32 | epsilon: optional - privacy budget 33 | d: optional - domain size 34 | g: optional - hash domain 35 | index_mapper: optional - function 36 | """ 37 | super().update_params(epsilon, d, index_mapper) # Updates core params 38 | 39 | # If use_olh is true, then update the g parameter 40 | self.use_olh = use_olh if use_olh is not None else self.use_olh 41 | 42 | # Updates g and probs 43 | self.g = g if g is not None else self.g 44 | if self.use_olh is True: 45 | self.g = int(round(math.exp(self.epsilon))) + 1 46 | 47 | if self.epsilon is not None or self.g is not None: 48 | self.p = math.exp(self.epsilon) / (math.exp(self.epsilon) + self.g - 1) 49 | self.q = 1.0 / (math.exp(self.epsilon) + self.g - 1) 50 | 51 | def _perturb(self, data, seed): 52 | """ 53 | Used internally to perturb data using local hashing. 54 | 55 | Will hash the user's data item and then perturb it with probabilities that 56 | satisfy epsilon local differential privacy. 
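        Note (added, informal): the hashed value is kept with probability p - q and is
        otherwise replaced by a uniform draw from {0, ..., g-1}, so overall the true
        hash is reported with probability p and any other value with probability q,
        which is the randomised-response condition needed for epsilon-LDP.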
57 | 58 | Local hashing is explained in more detail here: https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf 59 | 60 | Args: 61 | data: User's data to be privatised 62 | seed: The seed for the user's hash function 63 | 64 | Returns: perturbed data 65 | 66 | """ 67 | index = self.index_mapper(data) 68 | 69 | # Taken directly from https://github.com/vvv214/LDP_Protocols/blob/master/olh.py#L55-L65 70 | x = (xxhash.xxh32(str(index), seed=seed).intdigest() % self.g) 71 | y = x 72 | 73 | p_sample = np.random.random_sample() 74 | # the following two are equivalent 75 | # if p_sample > p: 76 | # while not y == x: 77 | # y = np.random.randint(0, g) 78 | if p_sample > self.p - self.q: 79 | # perturb 80 | y = np.random.randint(0, self.g) 81 | 82 | return y 83 | 84 | def privatise(self, data): 85 | """ 86 | Privatises a user's data using local hashing. 87 | 88 | Args: 89 | data: The data to be privatised 90 | 91 | Returns: 92 | privatised data: a single integer 93 | """ 94 | seed = random.randint(0,maxsize) # This is sys.maxsize 95 | return self._perturb(data, seed), seed 96 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/index_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import itertools 4 | 5 | class IndexSampler(): 6 | def __init__(self, subsample, row_sample_method, colsample_bytree, colsample_bylevel, colsample_bynode): 7 | self.subsample = subsample 8 | self.row_sample_method = row_sample_method 9 | self.colsample_bytree = colsample_bytree # Number of features to uniformly sample per tree 10 | self.colsample_bylevel = colsample_bylevel # Per level of a tree 11 | self.colsample_bynode = colsample_bynode # Per node of a tree 12 | # Sampling is multiplicative i.e cols_sample_bytree * colsample_bylevel * colsample_bynode * d features are considered at each split 13 | 14 | self.feature_interaction_iter = None 15 | 16 | def sample(self, current_round, num_samples, num_features, max_depth, feature_interaction_method="cyclical", feature_interaction_k=""): 17 | """ 18 | Helper method to perform sampling for the XGBoost model 19 | 20 | :param num_samples: The number of rows in the dataset 21 | :param num_features: The number of features 22 | :return: The sampled indexes for rows, features per tree and features per node according to the self.subsample, 23 | self.colsample_bytree and self.colsample_bylevel parameters 24 | """ 25 | col_tree_sample = None 26 | col_level_sample = None 27 | row_sample = np.arange(num_samples) 28 | 29 | if self.subsample < 1: # Sample rows per tree 30 | if self.row_sample_method == "wor": 31 | row_sample = np.random.choice(num_samples, size=round(num_samples * self.subsample), replace=False) 32 | elif self.row_sample_method == "wr": 33 | raise NotImplementedError("With replacement sampling is not implemented") 34 | elif self.row_sample_method == "poisson": 35 | row_sample = np.where(np.random.binomial(1, self.subsample, size=num_samples)==1)[0] 36 | elif self.row_sample_method == "disjoint": 37 | subset_size = math.ceil(num_samples*self.subsample) 38 | start = (((current_round) % math.ceil(num_samples / subset_size))) * subset_size 39 | end = start + subset_size 40 | row_sample = self.disjoint[start:end] 41 | 42 | if self.colsample_bytree < 1: # Sample columns per tree 43 | col_tree_sample = np.random.choice(num_features, size=math.ceil(num_features * self.colsample_bytree), 
replace=False) 44 | if self.colsample_bylevel < 1 and self.colsample_bytree < 1: # Sample columns per level of the tree (taking into account the cols already sampled for the current tree) 45 | col_level_sample = [np.random.choice(range(0, len(col_tree_sample)), size=math.ceil(len(col_tree_sample) * self.colsample_bylevel), replace=False) for i in range(0, max_depth + 2)] 46 | elif self.colsample_bylevel < 1: 47 | col_level_sample = [np.random.choice(num_features, size=math.ceil(num_features* self.colsample_bylevel), replace=False) for i in range(0, max_depth + 2)] 48 | 49 | if "cyclical" in feature_interaction_method: 50 | if feature_interaction_k == 1: 51 | col_tree_sample = [current_round % num_features] 52 | elif feature_interaction_k: 53 | if self.feature_interaction_iter is None: 54 | self.feature_interaction_iter = itertools.cycle(itertools.combinations(list(range(0, num_features)), feature_interaction_k)) # precompute 55 | col_tree_sample = list(next(self.feature_interaction_iter)) 56 | elif "random" in feature_interaction_method: 57 | if feature_interaction_k: 58 | col_tree_sample = np.random.choice(num_features, size=feature_interaction_k, replace=False) # Choose k features at random 59 | 60 | return row_sample, col_tree_sample, col_level_sample 61 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/compute_noise_from_budget_lib.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Library for computing privacy values for DP-SGD.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import sys 23 | 24 | from scipy.optimize import bisect 25 | 26 | from federated_gbdt.core.moments_accountant.rdp_accountant import compute_rdp, get_privacy_spent # pylint: disable=g-import-not-at-top 27 | 28 | # Opting out of loading all sibling packages and their dependencies. 29 | sys.skip_tf_privacy_import = True 30 | 31 | 32 | def apply_dp_sgd_analysis(q, sigma, steps, orders, delta): 33 | """Compute and print results of DP-SGD analysis.""" 34 | 35 | # compute_rdp requires that sigma be the ratio of the standard deviation of 36 | # the Gaussian noise to the l2-sensitivity of the function to which it is 37 | # added.
Hence, sigma here corresponds to the `noise_multiplier` parameter 38 | # in the DP-SGD implementation found in privacy.optimizers.dp_optimizer 39 | rdp = compute_rdp(q, sigma, steps, orders) 40 | 41 | eps, _, opt_order = get_privacy_spent(orders, rdp, target_delta=delta) 42 | return eps, opt_order 43 | 44 | 45 | def compute_noise(n, batch_size, target_epsilon, epochs, delta, noise_lbd, verbose=False): 46 | """Compute noise based on the given hyperparameters.""" 47 | q = batch_size / n # q - the sampling ratio. 48 | if q > 1: 49 | raise Exception('n must be larger than the batch size.') 50 | orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] + 51 | list(range(5, 64)) + [128, 256, 512]) 52 | steps = int(math.ceil(epochs * n / batch_size)) 53 | 54 | init_noise = noise_lbd # minimum possible noise 55 | init_epsilon, _ = apply_dp_sgd_analysis(q, init_noise, steps, orders, delta) 56 | 57 | if init_epsilon < target_epsilon: # noise_lbd was an overestimate 58 | print('min_noise too large for target epsilon.') 59 | return 0 60 | 61 | cur_epsilon = init_epsilon 62 | max_noise, min_noise = init_noise, 0 63 | 64 | # doubling to find the right range 65 | while cur_epsilon > target_epsilon: # until noise is large enough 66 | max_noise, min_noise = max_noise * 2, max_noise 67 | cur_epsilon, _ = apply_dp_sgd_analysis(q, max_noise, steps, orders, delta) 68 | # print(cur_epsilon) 69 | 70 | def epsilon_fn(noise): # should return 0 if guess_epsilon==target_epsilon 71 | guess_epsilon, opt_order = apply_dp_sgd_analysis(q, noise, steps, orders, delta) 72 | if verbose: 73 | print("Optimal Alpha", opt_order) 74 | return guess_epsilon - target_epsilon 75 | 76 | target_noise, res = bisect(epsilon_fn, min_noise, max_noise, full_output=True) 77 | final_eps, opt_order = apply_dp_sgd_analysis(q, target_noise, steps, orders, delta) 78 | 79 | if verbose: 80 | print(res) 81 | print( 82 | 'DP-SGD with sampling rate = {:.3g}% and noise_multiplier = {} iterated' 83 | ' over {} steps satisfies'.format(100 * q, target_noise, steps), 84 | end=' ') 85 | print('differential privacy with eps = {:.3g} and delta = {}.'.format( 86 | target_epsilon, delta)) 87 | 88 | return target_noise, opt_order -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/fast_lh_server.py: -------------------------------------------------------------------------------- 1 | import xxhash 2 | import numpy as np 3 | from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHServer 4 | 5 | # Server-side for fast local-hashing 6 | 7 | class FastLHServer(LHServer): 8 | def __init__(self, epsilon, d, k, g=2, use_olh=True, index_mapper=None, hash_matrix=None): 9 | """ 10 | 11 | Args: 12 | epsilon: float - The privacy budget 13 | d: integer - Size of the data domain 14 | k: integer - The number of hash functions to use. Larger k results in a more accurate oracle at the expense of computation time. 
15 | g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 16 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing i.e g is set to round(e^epsilon + 1) 17 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 18 | hash_matrix: Optional matrix - Allows the use of a pre-computed hash matrix that contains hashed domain elements 19 | """ 20 | self.k = k 21 | super().__init__(epsilon, d, g, use_olh, index_mapper=index_mapper) 22 | self.hash_counts = np.zeros((self.k, self.g)) 23 | 24 | # g = lambda i,j: xxhash.xxh32(str(int(j)), seed=int(i)).intdigest() % self.g 25 | 26 | if hash_matrix is None: 27 | matrix = np.empty((self.k, self.d)) 28 | for i in range(0, self.k): 29 | for j in range(0, self.d): 30 | matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g 31 | 32 | # self.hash_matrix = np.fromfunction(g, (self.k, self.d)) 33 | self.hash_matrix = matrix 34 | else: 35 | self.hash_matrix = hash_matrix 36 | 37 | def update_params(self, epsilon=None, d=None, k=None, use_olh=None, g=None, index_mapper=None, update_hash_matrix=True): 38 | super().update_params(epsilon=epsilon, d=d, use_olh=use_olh, g=g, index_mapper=index_mapper) 39 | self.k = k if k is not None else self.k 40 | 41 | # If any of the main parameters are updated the hash_matrix needs to be updated... this is quite slow 42 | if epsilon is not None or self.g is not None or self.k is not None or self.d is not None and update_hash_matrix is True: 43 | matrix = np.empty((self.k, self.d)) 44 | for i in range(0, self.k): 45 | for j in range(0, self.d): 46 | matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g 47 | self.hash_matrix = matrix 48 | 49 | def aggregate(self, priv_data): 50 | """ 51 | Aggregates privatised data from FastLHClient to be used to calculate frequency estimates. 52 | 53 | Args: 54 | priv_data: Privatised data of the form returned from UEClient.privatise 55 | """ 56 | seed = priv_data[1] 57 | priv_data = priv_data[0] 58 | 59 | self.hash_counts[seed][priv_data] += 1 60 | self.n += 1 61 | 62 | def _compute_aggregates(self): 63 | 64 | def func(x): 65 | sum = 0 66 | for index, val in enumerate(x): 67 | sum += self.hash_counts[index,int(val)] 68 | return sum 69 | 70 | self.aggregated_data = np.apply_along_axis(func, 0, self.hash_matrix) 71 | 72 | def _update_estimates(self): 73 | self._compute_aggregates() 74 | super()._update_estimates() 75 | 76 | def estimate(self, data, suppress_warnings=False): 77 | """ 78 | Calculates a frequency estimate of the given data item using the aggregated data. 
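        Note (added, informal): unlike LHServer, aggregation here only stores a
        (k x g) table of hash counts; check_and_update_estimates expands that table
        against the precomputed hash_matrix to recover per-item counts and then
        reuses the LHServer debiasing step on the result.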
79 | 80 | Args: 81 | data: data item 82 | suppress_warnings: Optional boolean - Suppresses warnings about possible inaccurate estimations 83 | 84 | Returns: float - frequency estimate of the data item 85 | 86 | """ 87 | self.check_and_update_estimates() 88 | return super().estimate(data) -------------------------------------------------------------------------------- /experiments/replication_experiments/experiment_replicator.py: -------------------------------------------------------------------------------- 1 | from experiments.paper_experiments.paper_experiments import * 2 | from experiments.paper_experiments.paper_plotter import * 3 | import os.path 4 | 5 | base_path = "./replication_data/" 6 | 7 | class ExperimentReplicator(): 8 | def __init__(self): 9 | pass 10 | 11 | def replicate(self, figure_num, dataset="Credit 1", overwrite=False): 12 | if figure_num == 1: 13 | filename = "replication_fig1" 14 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 15 | print("Replicated data already exists...") 16 | else: 17 | dp_split_methods_with_update_methods(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 18 | print("Plotting data...") 19 | plot_split_methods_with_update(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 20 | 21 | elif figure_num == 2: 22 | filename = "replication_fig2" 23 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 24 | print("Replicated data already exists...") 25 | else: 26 | dp_split_candidate_methods(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 27 | print("Plotting data...") 28 | plot_split_candidates(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 29 | 30 | elif figure_num == 3: 31 | filename = "replication_fig3" 32 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 33 | print("Replicated data already exists...") 34 | else: 35 | feature_interaction_experiments(filename=filename, save_data=True, replication=True, iters=6, datasets=[dataset], seeds=[1]) 36 | print("Plotting data...") 37 | plot_k_way(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 38 | 39 | elif figure_num == 4: 40 | filename = "replication_fig4" 41 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 42 | print("Replicated data already exists...") 43 | else: 44 | dp_ebm_experiment(filename=filename, save_data=True, replication=True, iters=10, datasets=[dataset], seeds=[1]) 45 | print("Plotting data...") 46 | plot_ebm_comparisons(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 47 | 48 | elif figure_num == 5: 49 | filename = "replication_fig5" 50 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 51 | print("Replicated data already exists...") 52 | else: 53 | batched_boosting(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 54 | print("Plotting data...") 55 | plot_low_eps_bb(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 56 | 57 | elif figure_num == 6: 58 | filename = "replication_fig6" 59 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 60 | print("Replicated data already exists...") 61 | else: 62 | comparisons_experiment(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 63 | print("Plotting data...") 64 | plot_comparisons(in_path=base_path+filename+".csv", out_path=base_path, 
replication=True) 65 | 66 | if __name__ == "__main__": 67 | replicator = ExperimentReplicator() 68 | # parser = argparse.ArgumentParser() 69 | # parser.add_argument('fig_num', type=int, default=1, choices=range(6),nargs='+', help='Figure number to replicate') 70 | # parser.add_argument('overwrite', type=bool, default=False, help='Whether to overwrite the existing data') 71 | # args = parser.parse_args() 72 | # replicator.replicate(args.fig_num, overwrite=args.overwrite) 73 | 74 | replicator.replicate(1, overwrite=False, dataset="Credit 1") -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/hybrid_mech_client.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 2 | import numpy as np 3 | import random 4 | import math 5 | 6 | class HMClient(FreqOracleClient): 7 | def __init__(self, epsilon, max, min, index_mapper=None, perturb_type="hybrid"): 8 | 9 | super().__init__(epsilon=epsilon, d=None, index_mapper=index_mapper) 10 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 11 | self.perturb_type = perturb_type 12 | self.max = max 13 | self.min = min 14 | self.normalised_input = np.array([]) 15 | 16 | def update_params(self, epsilon=None, d=None, index_mapper=None): 17 | super().update_params(epsilon, d, index_mapper) 18 | ee = np.exp(self.epsilon) 19 | 20 | if epsilon is not None or d is not None: 21 | self.p = ee/(ee + 1) 22 | self.q = 1/(ee + 1) 23 | self.ee2 = np.exp(self.epsilon/2) 24 | self.s = (self.ee2 + 1) / (self.ee2 - 1) 25 | self.alpha = 1 - (np.exp(-self.epsilon/2)) 26 | 27 | def _stochastic_rounding(self, norm_data): 28 | # perturb mechanism for Stochastic Rounding 29 | if random.random() <= self.q + (((self.p - self.q)*(1 - norm_data)) / 2): 30 | v_prime = -1 31 | else: 32 | v_prime = +1 33 | 34 | result = v_prime/(self.p-self.q) 35 | return result 36 | 37 | def _piecewise_mechanism(self, norm_data): 38 | # perturb for piecewise mechanism 39 | if random.random() <= self.ee2/(self.ee2 + 1): 40 | v_prime = random.uniform(((self.ee2 * norm_data) - 1) / (self.ee2 - 1), 41 | ((self.ee2 * norm_data) + 1) / (self.ee2 - 1)) 42 | else: 43 | if random.random() <= (((self.ee2 * norm_data) - 1) / (self.ee2 - 1) + self.s) / (self.s+1): # Weight sampling uniformly from the left-region by it's size 44 | v_prime = random.uniform(-self.s, ((self.ee2 * norm_data) - 1) / (self.ee2 - 1)) 45 | else: 46 | v_prime = random.uniform(((self.ee2 * norm_data) + 1) / (self.ee2 - 1), self.s) 47 | return v_prime 48 | 49 | def _pm2(self, v): 50 | """ 51 | Piecewise Mechanism, from paper: Collecting and Analyzing Multidimensional Data with Local Differential Privacy 52 | """ 53 | z = np.e ** (self.epsilon / 2) 54 | P1 = (v + 1) / (2 + 2 * z) 55 | P2 = z / (z + 1) 56 | P3 = (1 - v) / (2 + 2 * z) 57 | 58 | C = (z + 1) / (z - 1) 59 | g1 = (C + 1) * v / 2 - (C - 1) / 2 60 | g2 = (C + 1) * v / 2 + (C - 1) / 2 61 | 62 | rnd = np.random.random() 63 | if rnd < P1: 64 | result = -C + np.random.random() * (g1 - (-C)) 65 | elif rnd < P1 + P2: 66 | result = (g2 - g1) * np.random.random() + g1 67 | else: 68 | result = (C - g2) * np.random.random() + g2 69 | return result 70 | 71 | def _perturb(self, data): 72 | # normalise the input data into the domain [-1,1] 73 | norm_data = ((2*(data - self.min)) / (self.max - self.min)) - 1 74 | result = 0 75 | if self.perturb_type == "hybrid": 76 | # when epsilon > 0.61 use PW with 
prob alpha and SR with 1-alpha 77 | if self.epsilon > 0.61: 78 | if random.random() <= self.alpha: 79 | result = self._piecewise_mechanism(norm_data) 80 | else: 81 | result = self._stochastic_rounding(norm_data) 82 | # when epsilon <= 0.61 use SR only 83 | else: 84 | result = self._stochastic_rounding(norm_data) 85 | elif self.perturb_type == "sr": 86 | result = self._stochastic_rounding(norm_data) 87 | elif self.perturb_type == "pm": 88 | result = self._piecewise_mechanism(norm_data) 89 | 90 | result = ((result + 1) * (self.max - self.min) / 2) + self.min 91 | return result 92 | 93 | def privatise(self, data): 94 | return self._perturb(data) 95 | -------------------------------------------------------------------------------- /federated_gbdt/core/plotting.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from scipy.stats import gaussian_kde 6 | 7 | from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT 8 | 9 | from experiments.experiment_helpers.data_loader import DataLoader 10 | 11 | def plot_feature_importance(model, feature_names, method="gain"): 12 | """ 13 | Plots feature importance 14 | 15 | :param feature_names: List of feature names as strings (for plotting) 16 | :param method: Feature importance method to be used 17 | """ 18 | x, y = zip(*model.feature_importance(method).most_common()) 19 | plt.figure(figsize=(10, 10)) 20 | plt.bar(feature_names[list(x)], y) 21 | plt.xticks(rotation=90) 22 | plt.xlabel("Feature") 23 | plt.ylabel(method) 24 | plt.title("Feature Importance - " + method) 25 | plt.show() 26 | 27 | # Pass data, types of sketches to visualise and the features to visualise - Optional is to pass different bin nums to be printed 28 | def visualise_quantiles(model, X, sketch_types, feature_list, hist_bins=[32]): 29 | """ 30 | Helper method to visualise quantiles calculated via various methods 31 | 32 | :param X: Data 33 | :param sketch_types: List of quantile methods to be computed on features in X 34 | :param feature_list: List of features to visualise quantiles 35 | :param hist_bins: List of # of histogram bins to visualise 36 | """ 37 | quantile_map = {} 38 | for hist_bin in hist_bins: 39 | model.split_candidate_manager.num_candidates = hist_bin 40 | for sketch_type in sketch_types: 41 | model.split_candidate_manager.sketch_type = sketch_type 42 | model.split_candidate_manager.find_split_candidates(X, 0) 43 | quantile_map[sketch_type] = model.split_candidate_manager.feature_split_candidates 44 | 45 | _, axes = plt.subplots(len(feature_list), len(sketch_types), figsize=(20,30)) 46 | axes = np.array(axes).reshape(len(feature_list), len(sketch_types)) 47 | print(axes.shape) 48 | for j, feature_index in enumerate(feature_list): 49 | # Create subplot grid... 
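        # (Added note): axes is reshaped to len(feature_list) rows by
        # len(sketch_types) columns, so axes[j, i] is the panel for the j-th
        # requested feature under the i-th quantile/sketch method.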
50 | print("Feature j", X[:, j]) 51 | for i, sketch_type in enumerate(quantile_map.keys()): 52 | # Plot feature dist 53 | sns.kdeplot(x=X[:, feature_index], ax=axes[j,i]) 54 | # sns.histplot(x=X[:, j], stat="density", kde=True, hist=False) 55 | 56 | # x,y = kde.get_lines()[0].get_data() 57 | kde = gaussian_kde(X[:,feature_index][~np.isnan(X[:,feature_index])]) 58 | 59 | quantiles = quantile_map[sketch_type][feature_index] 60 | # print(sketch_type, "quantiles:", len(quantiles)) 61 | # print(sketch_type, "unique quantiles:", len(set(quantiles))) 62 | # print(sketch_type, quantiles, "\n") 63 | axes[j,i].vlines(quantiles, 0, kde(quantiles), colors="red", linestyles="--", linewidth=0.4) 64 | axes[j,i].set_xlim(left=np.nanmin(X[:,j]), right=np.nanmax(X[:,j])) 65 | axes[j,i].set_yticklabels([]) 66 | axes[j,i].set_xticklabels([]) 67 | y_label = axes[j,i].get_yaxis().get_label() 68 | y_label.set_visible(False) 69 | # axes[j,i].set_title("Density of feature " + str(feature_index) + "\n Quantile Method: " + sketch_type) 70 | 71 | if "uniform" in quantile_map.keys(): 72 | uniform_quantiles = quantile_map["uniform"][feature_index] 73 | 74 | for k in quantile_map.keys(): 75 | ldp_quantiles = quantile_map[k][feature_index] 76 | ldp_quantiles = np.sort(ldp_quantiles) 77 | total_mse = 0 78 | for i, q in enumerate(ldp_quantiles): 79 | total_mse += np.min((uniform_quantiles-q)**2) 80 | # total_mse += (uniform_quantiles[i]-q)**2 81 | 82 | print("Feature", feature_index, "Method:", k, "MSE:", total_mse/len(uniform_quantiles)) 83 | 84 | plt.axis("off") 85 | plt.show() 86 | 87 | 88 | if __name__ == '__main__': 89 | dataloader = DataLoader() 90 | data = list(dataloader.load_datasets(["Credit 1"], remove_missing=True, return_dict=True, verbose=True).items())[0] 91 | X, X_test, y_train, y_test = data[1] 92 | X = X.to_numpy() 93 | model = PrivateGBDT() 94 | visualise_quantiles(model, X, ["uniform", "log"], [2,4,5,7]) 95 | -------------------------------------------------------------------------------- /federated_gbdt/core/loss_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def sigmoid(x): 4 | return 1.0 / (1.0 + np.exp(-x)) 5 | 6 | class Sigmoid(): 7 | def __call__(self, x): 8 | return 1 / (1 + np.exp(-x)) 9 | 10 | def gradient(self, x): 11 | return self.__call__(x) * (1 - self.__call__(x)) 12 | 13 | def softmax(x, axis=-1): 14 | y = np.exp(x - np.max(x, axis, keepdims=True)) 15 | return y / np.sum(y, axis, keepdims=True) 16 | 17 | class LogisticLoss(): 18 | def __init__(self): 19 | sigmoid = Sigmoid() 20 | self.log_func = sigmoid 21 | self.log_grad = sigmoid.gradient 22 | 23 | def loss(self, y, y_pred): 24 | y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15) 25 | p = self.log_func(y_pred) 26 | return y * np.log(p) + (1 - y) * np.log(1 - p) 27 | 28 | # gradient w.r.t y_pred 29 | def gradient(self, y, y_pred): 30 | p = self.log_func(y_pred) 31 | return -(y - p) 32 | 33 | # w.r.t y_pred 34 | def hess(self, y, y_pred): 35 | p = self.log_func(y_pred) 36 | return p * (1 - p) 37 | 38 | # binary cross entropy loss ------------------------------------------------------------------------------------ 39 | class SigmoidBinaryCrossEntropyLoss: 40 | 41 | def __init__(self): 42 | pass 43 | 44 | @staticmethod 45 | def predict(value): 46 | return sigmoid(value) 47 | 48 | def compute_loss(self, y, y_pred): 49 | # negative averaged log loss 50 | log_loss = np.nan_to_num(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) 51 | return -np.sum(log_loss) / len(y_pred) 
52 | 53 | def compute_grad(self, y, y_pred): 54 | return y_pred - y 55 | 56 | def compute_hess(self, y, y_pred): 57 | return y_pred * (1 - y_pred) 58 | 59 | 60 | # softmax cross entropy loss ------------------------------------------------------------------------------- 61 | class SoftmaxCrossEntropyLoss: 62 | 63 | def __init__(self): 64 | pass 65 | 66 | @staticmethod 67 | def predict(values): 68 | """ 69 | :param values: ndarray 70 | :return: ndarray 71 | """ 72 | return softmax(values) 73 | 74 | def compute_loss(self, y, y_pred): 75 | y_prob = self.predict(y_pred) 76 | # do summation over feature dimensions & do averaging over samples 77 | log_loss = np.nan_to_num(y * np.log(y_prob)) 78 | return -np.sum(log_loss) / len(y_prob) 79 | 80 | def compute_grad(self, y, y_pred): 81 | assert len(y_pred.shape) == 2 82 | y_prob = self.predict(y_pred) 83 | return y_prob - y 84 | 85 | def compute_hess(self, y, y_pred): 86 | y_prob = self.predict(y_pred) 87 | return y_prob * (1 - y_prob) 88 | 89 | 90 | class BinaryRFLoss(): 91 | def __init__(self): 92 | pass 93 | 94 | def predict(self, x): 95 | return x 96 | 97 | def compute_grad(self, y, y_pred): 98 | return (np.array(y)==1).astype("int") 99 | 100 | def compute_hess(self, y, y_pred): 101 | return np.ones_like(y) 102 | 103 | class SoftmaxLoss: 104 | def __init__(self): 105 | pass 106 | 107 | def predict(self, x): 108 | out = [] 109 | for i,r in enumerate(x): 110 | e = np.exp(r) 111 | out.append(e / np.sum(e)) 112 | return np.array(out) 113 | 114 | def compute_grad(self, y, y_pred): 115 | grads = [] 116 | p = self.predict(y_pred) 117 | 118 | for i in range(len(y)): 119 | grad = np.zeros(y_pred.shape[1]) 120 | for j in range(0, y_pred.shape[1]): 121 | if j == y[i]: 122 | grad[j] = p[i][j] - 1 123 | else: 124 | grad[j] = p[i][j] 125 | grads.append(grad) 126 | 127 | return np.array(grads) 128 | 129 | def compute_hess(self, y, y_pred): 130 | hess = np.zeros(len(y_pred)) 131 | p = self.predict(y_pred) 132 | return p * (1- p) 133 | 134 | 135 | class LeastSquareLoss: 136 | """ loss = 1/2 (y-y_hat)**2 """ 137 | 138 | def __init__(self): 139 | pass 140 | 141 | @staticmethod 142 | def predict(value): 143 | return value 144 | 145 | @staticmethod 146 | def compute_loss(y, y_pred): 147 | lse_loss = 0.5 * (y - y_pred)**2 148 | return np.sum(lse_loss) / len(y) 149 | 150 | @staticmethod 151 | def compute_grad(y, y_pred): 152 | return y_pred - y 153 | 154 | @staticmethod 155 | def compute_hess(y, y_pred): 156 | # derivative of y_hat-y is 1 157 | if type(y).__name__ == 'ndarray' or type(y_pred).__name__ == 'ndarray': 158 | return np.ones_like(y) 159 | else: 160 | return 1 161 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/dp_params.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.moments_accountant.compute_noise_from_budget_lib import compute_noise 2 | import math 3 | 4 | # Sam Comment: 5 | # Method uses the RDP moments accountant which works as follows 6 | # 1) For a fixed eps,delta compute the (alpha, tau)-RDP guarantee of the Gaussian mechanism 7 | # 2) Perform a binary search over alpha values to find a sigma value that guarantees an epsilon < target_epsilon 8 | # 3) Perform a bisection on the (alpha,tau)-RDP => (eps,delta)-DP conversion bound with the (loose) sigma found 9 | # to find the exact noise needed for the target_epsilon given 10 | 11 | class RDPAccountant(): 12 | def __init__(self, eps, delta, q, clip, total_queries, 
method="rdp", verbose=False): 13 | self.eps = eps 14 | self.delta = delta 15 | self.q = q 16 | self.clip = clip 17 | self.total_leaf_nodes = total_queries 18 | self.method = method 19 | self.sigma, self.opt_alpha = self.compute_sigma(method=method, eps=eps, delta=delta, q=q, total_queries=total_queries, verbose=verbose) 20 | 21 | @staticmethod 22 | def compute_sigma(method, eps, delta, q, total_queries, verbose): 23 | opt_alpha = None 24 | if method == "rdp": 25 | sigma, opt_alpha = compute_noise(1, q, eps, total_queries, delta, 1e-5, verbose) 26 | elif method == "basic": 27 | sigma = total_queries * math.sqrt(2 * math.log(total_queries*1.25 / delta)) / eps # Basic composition - scalar mechanism 28 | elif method == "advanced": 29 | eps_prime = eps / (2 * math.sqrt(2 * total_queries * math.log(2 * total_queries / delta))) 30 | sigma = math.sqrt(2 * math.log(1.25 / delta)) / eps_prime # Advanced composition 31 | elif "rdp_weak": 32 | a = (-2 * (math.log(delta) - eps) + math.sqrt((2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 33 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 34 | sigma = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + (a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 35 | opt_alpha = a 36 | 37 | return sigma, opt_alpha 38 | 39 | def budget_examples(): 40 | eps = 1 41 | delta = 1e-5 42 | total_queries = 6 * 10 * 10 # Suppose 10 features over 10 trees with a maximum depth of 6 43 | # total_queries = 1 44 | 45 | sigma_basic = total_queries * math.sqrt(2 * math.log(1.25 / delta)) / eps 46 | 47 | eps_prime = eps / (2 * math.sqrt(2 * total_queries * math.log(2 * total_queries / delta))) 48 | 49 | sigma_basic = total_queries * math.sqrt(2 * math.log(total_queries * 1.25 / delta)) / eps # Basic composition 50 | sigma_advanced = math.sqrt(2 * math.log(1.25 / delta)) / eps_prime # Advanced composition 51 | sigma_moments = 2 * math.sqrt(total_queries * math.log(1 / delta)) / eps # Moments accountant asymptotic bound 52 | 53 | a = (-2 * (math.log(delta) - eps) + math.sqrt( 54 | (2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 55 | 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 56 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 57 | sigma_rdp_weak = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + ( 58 | a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 59 | 60 | 61 | obj = RDPAccountant(eps, delta, 1, None, total_queries, 62 | verbose=False) # RDP using the tf implementation which uses the stronger conversion bound (also supports tight subsampling analysis) 63 | 64 | eps=eps/2 65 | a = (-2 * (math.log(delta) - eps) + math.sqrt( 66 | (2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 67 | 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 68 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 69 | sigma_rdp_weak_2 = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + ( 70 | a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 71 | 72 | print("Alpha found directly using weak bound:", a) 73 | print("Optimal Alpha:", obj.opt_alpha) 74 | 75 | print("\n") 76 | print("SIGMA VALUES") 77 
| print("Basic Composition:", sigma_basic) 78 | print("Advanced Composition:", sigma_advanced) 79 | print("Sigma Moments:", sigma_moments) 80 | print("RDP Accountant via weak bound:", sigma_rdp_weak) 81 | print("RDP Accountant via weak bound:", sigma_rdp_weak_2) 82 | print("RDP Accountant", obj.sigma) 83 | print("RDP Accountant", RDPAccountant(1.5, delta, 1, None, total_queries, 84 | verbose=False).sigma) 85 | 86 | budget_examples() 87 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/ind_exp.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """IndExp method for computing differentially private quantiles. 17 | 18 | Algorithm 2 from the paper "Privacy-preserving Statistical Estimation with 19 | Optimal Convergence Rates" by Smith (STOC 2011, 20 | http://cs-people.bu.edu/ads22/pubs/2011/stoc194-smith.pdf) describes the 21 | subroutine used to compute a single quantile. Theorem 3 from the paper ``Optimal 22 | Differential Privacy Composition for Exponential Mechanisms and the Cost of 23 | Adaptivity'' by Dong, Durfee, and Rogers (ICML 2020, 24 | https://arxiv.org/pdf/1909.13830.pdf) describes the composition used for the 25 | approximate DP variant of IndExp. 26 | """ 27 | 28 | import numpy as np 29 | import scipy 30 | 31 | 32 | def racing_sample(log_terms): 33 | """Numerically stable method for sampling from an exponential distribution. 34 | 35 | Args: 36 | log_terms: Array of terms of form log(coefficient) - (exponent term). 37 | 38 | Returns: 39 | A sample from the exponential distribution determined by terms. See 40 | Algorithm 1 from the paper "Duff: A Dataset-Distance-Based 41 | Utility Function Family for the Exponential Mechanism" 42 | (https://arxiv.org/pdf/2010.04235.pdf) for details; each element of terms is 43 | analogous to a single log(lambda(A_k)) - (eps * k/2) in their algorithm. 44 | """ 45 | return np.argmin( 46 | np.log(np.log(1.0 / np.random.uniform(size=log_terms.shape))) - log_terms) 47 | 48 | 49 | def opt_comp_p(eps, t): 50 | """Returns p_{eps, t} for opt_comp_calculator. 51 | 52 | Args: 53 | eps: Privacy parameter epsilon. 54 | t: Exponent t. 55 | """ 56 | return (np.exp(-t) - np.exp(-eps)) / (1.0 - np.exp(-eps)) 57 | 58 | 59 | def opt_comp_calculator(overall_eps, overall_delta, num_comps): 60 | """Returns the optimal per-composition eps for overall approx DP guarantee. 61 | 62 | Args: 63 | overall_eps: Desired overall privacy parameter epsilon. 64 | overall_delta: Desired overall privacy parameter delta. 65 | num_comps: Total number of compositions. 66 | 67 | Returns: 68 | eps_0 such that num_compositions eps_0-DP applications of the exponential 69 | mechanism will overall be (overall_eps, overall_delta)-DP, using the 70 | expression given in Theorem 3 of DDR20. 
This assumes that the composition is 71 | non-adaptive. 72 | """ 73 | eps_i_range = np.arange(overall_eps / num_comps - 0.01, overall_eps, 0.01) 74 | num_eps_i = len(eps_i_range) 75 | max_eps = 0 76 | for eps_idx in range(num_eps_i): 77 | eps = eps_i_range[eps_idx] 78 | max_sum = 0 79 | for ell in range(num_comps + 1): 80 | t_ell_star = np.clip((overall_eps + (ell + 1) * eps) / (num_comps + 1), 81 | 0.0, eps) 82 | p_t_ell_star = opt_comp_p(eps, t_ell_star) 83 | term_sum = 0 84 | for i in range(num_comps + 1): 85 | term_sum += scipy.special.binom(num_comps, i) * np.power( 86 | p_t_ell_star, num_comps - i) * np.power(1 - p_t_ell_star, i) * max( 87 | np.exp(num_comps * t_ell_star - 88 | (i * eps)) - np.exp(overall_eps), 0) 89 | if term_sum > max_sum: 90 | max_sum = term_sum 91 | if max_sum > overall_delta: 92 | return max_eps 93 | else: 94 | max_eps = eps 95 | return max_eps 96 | 97 | 98 | def ind_exp(sorted_data, data_low, data_high, qs, divided_eps, swap): 99 | """Returns eps-differentially private collection of quantile estimates for qs. 100 | 101 | Args: 102 | sorted_data: Array of data points sorted in increasing order. 103 | data_low: Lower limit for any differentially private quantile output value. 104 | data_high: Upper limit for any differentially private quantile output value. 105 | qs: Increasing array of quantiles in [0,1]. 106 | divided_eps: Privacy parameter epsilon for each estimated quantile. Assumes 107 | that divided_eps has been computed to ensure the desired overall privacy 108 | guarantee. 109 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 110 | """ 111 | num_quantiles = len(qs) 112 | outputs = np.empty(num_quantiles) 113 | sorted_data = np.clip(sorted_data, data_low, data_high) 114 | data_size = len(sorted_data) 115 | sorted_data = np.concatenate(([data_low], sorted_data, [data_high])) 116 | data_gaps = sorted_data[1:] - sorted_data[:-1] 117 | for q_idx in range(num_quantiles): 118 | quantile = qs[q_idx] 119 | if swap: 120 | sensitivity = 1.0 121 | else: 122 | sensitivity = max(quantile, 1 - quantile) 123 | idx_left = racing_sample( 124 | np.log(data_gaps) + 125 | ((divided_eps / (-2.0 * sensitivity)) * 126 | np.abs(np.arange(0, data_size + 1) - (quantile * data_size)))) 127 | outputs[q_idx] = np.random.uniform(sorted_data[idx_left], 128 | sorted_data[idx_left + 1]) 129 | # Note that the outputs are already clipped to [data_low, data_high], so no 130 | # further clipping of outputs is necessary. 
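# Added note: each quantile is sampled independently of the others, so the raw estimates are not
# guaranteed to be monotone in q; the final sort returns a nondecreasing set of quantile estimates.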
131 | return np.sort(outputs) 132 | -------------------------------------------------------------------------------- /examples/multiclass_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Load Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import sys\n", 17 | "sys.path.append(\"../\")\n", 18 | "\n", 19 | "from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT\n", 20 | "from experiments.experiment_helpers.data_loader import DataLoader\n", 21 | "from sklearn.metrics import roc_auc_score\n", 22 | "from sklearn.preprocessing import label_binarize\n", 23 | "\n", 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dataloader = DataLoader()\n", 34 | "\n", 35 | "# Default is 70/30 split\n", 36 | "X_train, X_test, y_train, y_test = dataloader.load_datasets([\"connect_4\"], return_dict=False)[0]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(47289, 42)" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "X_train.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([2, 2, 1, ..., 2, 2, 2])" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "y_train" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "array([0, 1, 2])" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "np.unique(y_train)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "y_test_onehot = label_binarize(y_test, classes=[0,1,2])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## XGBoost Training (No DP)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "(20268, 3)\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "0.907745991798139" 131 | ] 132 | }, 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "xgb_model = PrivateGBDT(num_trees=100, epsilon=0)\n", 140 | "xgb_model = xgb_model.fit(X_train, y_train)\n", 141 | "y_pred = xgb_model.predict_proba(X_test)\n", 142 | "print(y_pred.shape)\n", 143 | "\n", 144 | "roc_auc_score(y_test_onehot, y_pred)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## DP-XGBoost (FEVERLESS)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "0.6525901702496729" 163 | ] 164 | }, 
165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "dp_xgb_model = PrivateGBDT(num_trees=100, epsilon=3, dp_method=\"gaussian_cdp\")\n", 172 | "dp_xgb_model = dp_xgb_model.fit(X_train, y_train)\n", 173 | "y_pred = dp_xgb_model.predict_proba(X_test)\n", 174 | "\n", 175 | "roc_auc_score(y_test_onehot, y_pred)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## DP-TR XGBoost" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "0.7821243499339423" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "dp_tr_model = PrivateGBDT(num_trees=100, epsilon=3, split_method=\"totally_random\", \n", 203 | " sketch_type=\"uniform\", dp_method=\"gaussian_cdp\")\n", 204 | "\n", 205 | "dp_tr_model = dp_tr_model.fit(X_train, y_train)\n", 206 | "y_pred = dp_tr_model.predict_proba(X_test)\n", 207 | "\n", 208 | "roc_auc_score(y_test_onehot, y_pred)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.9.15" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 1 233 | } 234 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/smooth_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Smooth sensitivity utils used by both Smooth and CSmooth. 17 | 18 | Section 3.1 from "Smooth Sensitivity and Sampling in Private Data Analysis" by 19 | Nissim, Radkhodnikova, and Smith 20 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf) gives 21 | details for compute_log_sensitivity and its helper functions. 22 | """ 23 | 24 | import numpy as np 25 | 26 | 27 | def check_indices(n, lower_idx, upper_idx): 28 | """Raises an error for indices outside of the [-1, n] range. 29 | 30 | Args: 31 | n: Right endpoint for valid range. 32 | lower_idx: Lower bound for idx. 33 | upper_idx: Upper bound for idx. 
34 | """ 35 | if lower_idx < -1: 36 | raise ValueError("Index too small: lower_idx < -1.") 37 | if upper_idx > n: 38 | raise ValueError("Index too large: upper_idx > n.") 39 | 40 | 41 | def update_log_smooth_sensitivity(lower_idx1, upper_idx1, lower_idx2, 42 | upper_idx2, data, data_low, data_high, t, 43 | log_smooth_sensitivity): 44 | """Updates, returns log smooth sensitivity by searching local sensitivities. 45 | 46 | Args: 47 | lower_idx1: Min value for index i. 48 | upper_idx1: Max value for index i. 49 | lower_idx2: Min value for index j. 50 | upper_idx2: Max value for index j. 51 | data: User data, sorted in increasing order and clipped to lie in the 52 | [data_low, data_high] range. 53 | data_low: Lower limit for differentially private quantile output value. 54 | data_high: Upper limit for differentially private quantile output value. 55 | t: Smooth sensitivity parameter. 56 | log_smooth_sensitivity: Current max log smooth sensitivity, as found by 57 | previous searches of other index ranges. 58 | 59 | Returns: 60 | The maximum distance-weighted local sensitivity at any pair of indices 61 | (i, j) where lower_idx1 <= i <= upper_idx1 and 62 | lower_idx2 <= j <= upper_idx2. The special indices -1 and n = len(data) are 63 | allowed and interpreted as indexing values data_low and data_high, 64 | respectively. 65 | """ 66 | n = len(data) 67 | 68 | # Sanity checks. 69 | check_indices(n, lower_idx1, upper_idx1) 70 | check_indices(n, lower_idx2, upper_idx2) 71 | if upper_idx2 < lower_idx2: 72 | raise ValueError("Indices out of order: upper_idx2 < lower_idx2.") 73 | 74 | if upper_idx1 < lower_idx1: 75 | # Nothing to explore, return current log smooth sensitivity value. 76 | return log_smooth_sensitivity 77 | 78 | # Find the middle index and set i to this value. 79 | i = (lower_idx1 + upper_idx1) // 2 80 | 81 | # Scan the eligible indices j in the [lower_idx2, upper_idx2] range. 82 | js = np.arange(lower_idx2, upper_idx2 + 1) 83 | 84 | # Copy values from data at the indices indicated by js. (For js that are n, 85 | # use max_value.) 86 | j_vals = np.empty(upper_idx2 + 1 - lower_idx2) 87 | js_lt_n_bool = js < n 88 | js_lt_n = js[js_lt_n_bool] 89 | j_vals[js_lt_n_bool] = data[js_lt_n] 90 | j_vals[np.logical_not(js_lt_n_bool)] = data_high 91 | 92 | # Compute database distances for all the (i, j) pairs. 93 | database_distances = np.maximum(js - (i + 1), 0) 94 | 95 | # Compute local sensitivities for all the (i, j) pairs. 96 | base_value = data_low if i == -1 else data[i] 97 | local_sensitivities = j_vals - base_value 98 | 99 | # Compute log smooth sensitivities: 100 | # log(exp(-t*database_distances) * local_sensitivities). 101 | log_smooth_sensitivities = -t * database_distances + np.log( 102 | local_sensitivities) 103 | 104 | # Find the largest smooth sensitivity. 105 | max_smooth_sensitivity_index = np.argmax(log_smooth_sensitivities) 106 | current_max_log_smooth_sensitivity = log_smooth_sensitivities[ 107 | max_smooth_sensitivity_index] 108 | max_smooth_sensitivity_index = js[max_smooth_sensitivity_index] 109 | 110 | # Update the input smooth sensitivity if we found a larger one. 111 | log_smooth_sensitivity = max(log_smooth_sensitivity, 112 | current_max_log_smooth_sensitivity) 113 | 114 | # Check the remaining indices. (All indices in the [lower_idx1, upper_idx1] 115 | # range that are not equal to the midpoint i value checked above.) 
116 | log_smooth_sensitivity1 = update_log_smooth_sensitivity( 117 | i + 1, upper_idx1, max_smooth_sensitivity_index, upper_idx2, data, 118 | data_low, data_high, t, log_smooth_sensitivity) 119 | log_smooth_sensitivity2 = update_log_smooth_sensitivity( 120 | lower_idx1, i - 1, lower_idx2, max_smooth_sensitivity_index, data, 121 | data_low, data_high, t, log_smooth_sensitivity) 122 | return max(log_smooth_sensitivity1, log_smooth_sensitivity2) 123 | 124 | 125 | def compute_log_smooth_sensitivity(data, data_low, data_high, true_quantile_idx, 126 | t): 127 | """Returns log(t-smooth sensitivity) for the given dataset and quantile. 128 | 129 | Args: 130 | data: User data, sorted in increasing order and clipped to lie in the 131 | [data_low, data_high] range. 132 | data_low: Lower limit for differentially private quantile output value. 133 | data_high: Upper limit for differentially private quantile output value. 134 | true_quantile_idx: Index into data at the desired quantile location. 135 | t: Smooth sensitivity parameter. 136 | """ 137 | n = len(data) 138 | return update_log_smooth_sensitivity(-1, true_quantile_idx, true_quantile_idx, 139 | n, data, data_low, data_high, t, 140 | -np.inf) 141 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/bin_inner_param.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | class BinInnerParam(object): 5 | """ 6 | Use to store columns related params for binning process 7 | """ 8 | 9 | def __init__(self): 10 | self.bin_indexes = [] 11 | self.bin_names = [] 12 | self.col_name_maps = {} 13 | self.header = [] 14 | self.transform_bin_indexes = [] 15 | self.transform_bin_names = [] 16 | self.category_indexes = [] 17 | self.category_names = [] 18 | 19 | def set_header(self, header): 20 | self.header = copy.deepcopy(header) 21 | for idx, col_name in enumerate(self.header): 22 | self.col_name_maps[col_name] = idx 23 | 24 | def set_bin_all(self): 25 | """ 26 | Called when user set to bin all columns 27 | """ 28 | self.bin_indexes = [i for i in range(len(self.header))] 29 | self.bin_names = copy.deepcopy(self.header) 30 | 31 | def set_transform_all(self): 32 | self.transform_bin_indexes = self.bin_indexes 33 | self.transform_bin_names = self.bin_names 34 | self.transform_bin_indexes.extend(self.category_indexes) 35 | self.transform_bin_names.extend(self.category_names) 36 | 37 | def add_bin_indexes(self, bin_indexes): 38 | if bin_indexes is None: 39 | return 40 | for idx in bin_indexes: 41 | if idx >= len(self.header): 42 | # LOGGER.warning("Adding a index that out of header's bound") 43 | # continue 44 | raise ValueError("Adding a index that out of header's bound") 45 | if idx not in self.bin_indexes: 46 | self.bin_indexes.append(idx) 47 | self.bin_names.append(self.header[idx]) 48 | 49 | def add_bin_names(self, bin_names): 50 | if bin_names is None: 51 | return 52 | 53 | for bin_name in bin_names: 54 | idx = self.col_name_maps.get(bin_name) 55 | if idx is None: 56 | LOGGER.warning("Adding a col_name that is not exist in header") 57 | continue 58 | if idx not in self.bin_indexes: 59 | self.bin_indexes.append(idx) 60 | self.bin_names.append(self.header[idx]) 61 | 62 | def add_transform_bin_indexes(self, transform_indexes): 63 | if transform_indexes is None: 64 | return 65 | 66 | for idx in transform_indexes: 67 | if idx >= len(self.header) or idx < 0: 68 | raise ValueError("Adding a index that out of header's bound") 69 | # LOGGER.warning("Adding a 
index that out of header's bound") 70 | # continue 71 | if idx not in self.transform_bin_indexes: 72 | self.transform_bin_indexes.append(idx) 73 | self.transform_bin_names.append(self.header[idx]) 74 | 75 | def add_transform_bin_names(self, transform_names): 76 | if transform_names is None: 77 | return 78 | for bin_name in transform_names: 79 | idx = self.col_name_maps.get(bin_name) 80 | if idx is None: 81 | raise ValueError("Adding a col_name that is not exist in header") 82 | 83 | if idx not in self.transform_bin_indexes: 84 | self.transform_bin_indexes.append(idx) 85 | self.transform_bin_names.append(self.header[idx]) 86 | 87 | def add_category_indexes(self, category_indexes): 88 | if category_indexes == -1: 89 | category_indexes = [i for i in range(len(self.header))] 90 | elif category_indexes is None: 91 | return 92 | 93 | for idx in category_indexes: 94 | if idx >= len(self.header): 95 | # LOGGER.warning("Adding a index that out of header's bound") 96 | continue 97 | if idx not in self.category_indexes: 98 | self.category_indexes.append(idx) 99 | self.category_names.append(self.header[idx]) 100 | if idx in self.bin_indexes: 101 | self.bin_indexes.remove(idx) 102 | self.bin_names.remove(self.header[idx]) 103 | 104 | def add_category_names(self, category_names): 105 | if category_names is None: 106 | return 107 | 108 | for bin_name in category_names: 109 | idx = self.col_name_maps.get(bin_name) 110 | if idx is None: 111 | # LOGGER.warning("Adding a col_name that is not exist in header") 112 | continue 113 | if idx not in self.category_indexes: 114 | self.category_indexes.append(idx) 115 | self.category_names.append(self.header[idx]) 116 | if idx in self.bin_indexes: 117 | self.bin_indexes.remove(idx) 118 | self.bin_names.remove(self.header[idx]) 119 | 120 | @property 121 | def bin_cols_map(self): 122 | assert len(self.bin_indexes) == len(self.bin_names) 123 | return dict(zip(self.bin_names, self.bin_indexes)) 124 | 125 | def encode_col_name_dict(self, col_name_dict: dict, model): 126 | result = {} 127 | for x, y in col_name_dict.items(): 128 | col_index = self.col_name_maps.get(x) 129 | result[anonymous_generator.generate_anonymous(col_index, model=model)] = y 130 | return result 131 | 132 | def encode_col_name_list(self, col_name_list: list, model): 133 | result = [] 134 | for x in col_name_list: 135 | col_index = self.col_name_maps.get(x) 136 | result.append(anonymous_generator.generate_anonymous(col_index, model=model)) 137 | return result 138 | 139 | # def __encode_col_name(self, col_name): 140 | # col_index = self.col_name_maps.get(col_name) 141 | # if col_index is None: 142 | # LOGGER.warning("Encoding a non-exist column name") 143 | # return None 144 | # return '.'.join(['host', str(col_index)]) 145 | 146 | def decode_col_name(self, encoded_name: str): 147 | col_index = anonymous_generator.reconstruct_fid(encoded_name) 148 | 149 | # try: 150 | # col_index = int(encoded_name.split('.')[1]) 151 | # except IndexError or ValueError: 152 | # raise RuntimeError("Bin inner param is trying to decode an invalid col_name.") 153 | return self.header[col_index] -------------------------------------------------------------------------------- /federated_gbdt/models/base/tree_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import federated_gbdt.core.baseline_constants as consts 3 | from abc import ABC, abstractmethod 4 | import pandas 5 | from sklearn.metrics import roc_auc_score, accuracy_score 6 | import _pickle as pickle 7 
| from federated_gbdt.core.loss_functions import SoftmaxCrossEntropyLoss 8 | 9 | 10 | class TreeBase(ABC): 11 | def __init__( 12 | self, 13 | min_samples_split=2, 14 | max_depth=3, 15 | task_type=consts.CLASSIFICATION, 16 | num_classes=-1, 17 | ): 18 | self.root = None # Root node in dec. tree 19 | self.min_samples_split = min_samples_split 20 | self.max_depth = max_depth 21 | self.task_type = task_type 22 | self.num_classes = num_classes 23 | self.K = 2 24 | self.training_method = "boosting" 25 | self.batched_update_size = 1 26 | self.trees = [] 27 | self.multiclass_trees = {} 28 | self.loss = None 29 | 30 | @abstractmethod 31 | def fit(self, *args, **kwargs): 32 | pass 33 | 34 | @abstractmethod 35 | def _build_tree(self, *args, **kwargs): 36 | """ 37 | Build the tree model according to impurity and leaf value 38 | """ 39 | pass 40 | 41 | def _convert_df(self, X): 42 | """ 43 | 44 | :param X: Data as a Pandas DataFrame 45 | :return: X as a Numpy array 46 | """ 47 | if isinstance(X, pandas.DataFrame): 48 | X = X.to_numpy() 49 | 50 | return X 51 | 52 | @staticmethod 53 | def predict_value(X, tree): 54 | out = np.zeros(X.shape[0]) 55 | 56 | if tree.value is None: 57 | # Choose the feature that we will test 58 | feature_value = X[:, tree.feature_i] 59 | left_split = feature_value <= tree.threshold 60 | right_split = ~left_split 61 | 62 | # Determine if we will follow left or right branch 63 | out[left_split] = TreeBase.predict_value(X[left_split], tree.true_branch) 64 | out[right_split] = TreeBase.predict_value(X[right_split], tree.false_branch) 65 | else: 66 | out = np.repeat(tree.value[0], X.shape[0]) 67 | 68 | return out 69 | 70 | def predict_over_trees(self, X, y): 71 | metrics = [] 72 | for i in range(1, len(self.trees) + 1): 73 | trees = self.trees[:i] 74 | y_pred = self.loss.predict(self.predict_weight(X, trees)) 75 | auc = roc_auc_score(y, y_pred) 76 | acc = accuracy_score(y, (y_pred >= 0.5).astype("int")) 77 | metrics.append((auc, acc)) 78 | print("Tree", i, "AUC :", auc) 79 | print("Tree", i, "Acc :", acc, "\n") 80 | 81 | return metrics 82 | 83 | def predict_weight(self, X, tree=None): 84 | """ 85 | Predicts a weight (i.e for classification a non-sigmoided value) for each observation passed. 
86 | By default this is calculated from the whole ensemble or from a specific tree if passed 87 | 88 | :param X: Data 89 | :param tree: Tree to predict weight from 90 | :return: Model prediction as a weight 91 | """ 92 | X = self._convert_df(X) 93 | pred = ( 94 | np.zeros((X.shape[0], self.num_classes)) 95 | if self.num_classes > 2 96 | else np.zeros(X.shape[0]) 97 | ) 98 | trees = tree if tree is not None else self.trees 99 | 100 | preds = [] 101 | for i, tree in enumerate(trees): 102 | pred += self.predict_value(X, tree) 103 | if self.training_method == "batched_boosting": 104 | if ( 105 | i + 1 106 | ) % self.batched_update_size == 0: # Average current weights and add to preds 107 | pred /= self.batched_update_size 108 | preds.append(pred) 109 | pred = np.zeros(X.shape[0]) 110 | elif (i + 1) == len(trees): 111 | pred /= (i + 1) % self.batched_update_size 112 | preds.append(pred) 113 | elif ( 114 | self.early_stopping == "average_retrain" 115 | and (i + 1) % (len(self.trees) / 2) == 0 116 | ): 117 | preds.append(pred) 118 | 119 | if self.training_method == "batched_boosting": 120 | # print("NUM OF BATCHES", len(preds)) 121 | pred = np.add.reduce(preds) 122 | elif self.training_method == "rf": 123 | pred /= len(trees) 124 | elif self.early_stopping == "average_retrain": 125 | pred = (preds[0] + preds[1]) / 2 126 | 127 | return pred 128 | 129 | def predict(self, X): 130 | """Classify samples one by one and return the set of labels""" 131 | X = self._convert_df(X) 132 | return (np.argmax(self.predict_prob(X), axis=1)).astype("int") 133 | 134 | def predict_prob(self, X): 135 | """ 136 | For binary classification will return probabilities instead of raw weights 137 | 138 | :param X: Rows of observations to classify 139 | :return: A list of probabilities for each observation 140 | """ 141 | X = self._convert_df(X) 142 | probs = [] 143 | for k in range(0, self.K): 144 | probs.append(self.predict_weight(X, self.multiclass_trees[k])) 145 | probs = np.array(list(zip(*probs))) 146 | if self.task_type == consts.CLASSIFICATION: 147 | probs = ( 148 | self.loss.predict(probs) 149 | if self.K <= 2 150 | else SoftmaxCrossEntropyLoss().predict(probs) 151 | ) 152 | if self.K <= 2: 153 | probs = np.array([[1 - p[0], p[0]] for p in probs]) 154 | return probs 155 | 156 | def predict_proba(self, X): 157 | return self.predict_prob(X) 158 | 159 | def _reset_tracking_attributes(self, checkpoint): 160 | return 161 | 162 | def save(self, filename, checkpoint=False): 163 | self._reset_tracking_attributes( 164 | checkpoint 165 | ) # Otherwise saved file will be large... 
166 | f = open(filename + ".pkl", "wb") 167 | pickle.dump(self.__dict__, f, 2) 168 | f.close() 169 | 170 | def load(self, filename): 171 | f = open(filename, "rb") 172 | tmp_dict = pickle.load(f) 173 | f.close() 174 | 175 | self.__dict__.update(tmp_dict) 176 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/_freq_oracle_server.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from federated_gbdt.core.pure_ldp.core.prob_simplex import project_probability_simplex 4 | 5 | class FreqOracleServer: 6 | def __init__(self, epsilon, d, index_mapper=None): 7 | """ 8 | 9 | Args: 10 | epsilon: privacy budget 11 | d: domain size - not all freq oracles need this so can be None 12 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 13 | 14 | """ 15 | self.epsilon = epsilon 16 | self.d = d 17 | 18 | self.aggregated_data = np.zeros(self.d) # Some freq oracle servers keep track of aggregated data to generate estimated_data 19 | self.estimated_data = np.zeros(self.d) # Keep track of estimated data for quick access 20 | self.n = 0 # The number of data items aggregated 21 | 22 | self.name = "FrequencyOracle" # Name of the frequency oracle for warning messages, set using .set_name(name) 23 | self.last_estimated = 0 24 | 25 | if index_mapper is None: 26 | self.index_mapper = lambda x: x - 1 27 | else: 28 | self.index_mapper = index_mapper 29 | 30 | def set_name(self, name): 31 | """ 32 | Sets freq servers name 33 | Args: 34 | name: string - name of frequency oracle 35 | """ 36 | self.name = name 37 | 38 | def reset(self): 39 | """ 40 | This method resets the server's aggregated/estimated data and sets n = 0. 41 | This should be overridden if other parameters need to be reset. 42 | """ 43 | self.aggregated_data = np.zeros(self.d) 44 | self.estimated_data = np.zeros(self.d) 45 | self.last_estimated = 0 46 | self.n = 0 47 | 48 | def update_params(self, epsilon=None, d=None, index_mapper=None): 49 | """ 50 | Method to update params of freq oracle server, should be overridden if more options needed. 51 | This will reset aggregated/estimated data. 
52 | Args: 53 | epsilon: Optional - privacy budget 54 | d: Optional - domain size 55 | index_mapper: Optional - function 56 | """ 57 | self.epsilon = epsilon if epsilon is not None else self.epsilon # Updating epsilon here will not update any internal probabilities 58 | # Any class that implements FreqOracleServer, needs to override update_params to update epsilon properly 59 | 60 | self.d = d if d is not None else self.d 61 | self.index_mapper = index_mapper if index_mapper is not None else self.index_mapper 62 | self.reset() 63 | 64 | def check_warnings(self, suppress_warnings=False): 65 | """ 66 | Used during estimation to check warnings 67 | Args: 68 | suppress_warnings: Optional boolean - If True suppresses warnings from being output 69 | """ 70 | pass 71 | 72 | def aggregate(self, data): 73 | """ 74 | The main method for aggregation, should be implemented by a freq oracle server 75 | Args: 76 | data: item to estimate frequency of 77 | """ 78 | raise NotImplementedError("Must implement") 79 | 80 | def aggregate_all(self, data_list): 81 | """ 82 | Helper method used to aggregate a list of data 83 | Args: 84 | data_list: List of private data to aggregate 85 | """ 86 | for data in data_list: 87 | self.aggregate(data) 88 | 89 | def check_and_update_estimates(self): 90 | """ 91 | Used to check if the "cached" estimated data needs re-estimating, this occurs when new data has been aggregated since last 92 | """ 93 | if self.last_estimated < self.n: # If new data has been aggregated since the last estimation, then estimate all 94 | self.last_estimated = self.n 95 | self._update_estimates() 96 | 97 | def _update_estimates(self): 98 | """ 99 | Used internally to update estimates, should be implemented 100 | """ 101 | raise NotImplementedError("Must implement") 102 | 103 | def estimate(self, data, suppress_warnings=False): 104 | """ 105 | Calculates frequency estimate of given data item, must be implemented 106 | Args: 107 | data: data to estimate the frequency warning of 108 | suppress_warnings: Optional boolean - if true suppresses warnings 109 | """ 110 | raise NotImplementedError("Must implement") 111 | 112 | def estimate_all(self, data_list, suppress_warnings=False, normalization=0): 113 | """ 114 | Helper method, given a list of data items, returns a list of their estimated frequencies 115 | Args: 116 | data_list: list of data items to estimate 117 | suppress_warnings: If True, will suppress estimation warnings 118 | normalization: Normalisation should only be specified when estimating over the entire domain! 
119 | 0 - No Norm 120 | 1 - Additive Norm 121 | 2 - Prob Simplex 122 | 3 (or otherwise) - Threshold cut 123 | 124 | Returns: list of estimates 125 | 126 | """ 127 | self.check_and_update_estimates() 128 | 129 | estimates = np.array([self.estimate(x, suppress_warnings=suppress_warnings) for x in data_list]) 130 | 131 | if normalization == 0: # No normalisation 132 | return estimates 133 | elif normalization == 1: # Additive normalisation 134 | diff = self.n - sum(estimates[estimates > 0]) 135 | non_zero = (estimates>0).sum() 136 | 137 | for i,item in enumerate(estimates): 138 | if item > 0: 139 | estimates[i] = item + diff/non_zero 140 | else: 141 | estimates[i] = 0 142 | 143 | return estimates 144 | elif normalization == 2: # Prob Simplex 145 | proj = project_probability_simplex(estimates/self.n) 146 | return np.array(proj) * self.n 147 | else: 148 | # Threshold cut 149 | sorted_index = np.argsort((-1 * estimates)) 150 | total = 0 151 | i=0 152 | for i,index in enumerate(sorted_index): 153 | total += estimates[index] 154 | if total > self.n: 155 | break 156 | 157 | for j in range(i, len(sorted_index)): 158 | estimates[sorted_index[j]] = 0 159 | 160 | return estimates 161 | 162 | @property 163 | def get_estimates(self): 164 | """ 165 | Returns: Estimated data 166 | """ 167 | return self.estimated_data 168 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/feature_binning_param.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core import baseline_constants 2 | import copy 3 | 4 | 5 | class TransformParam: 6 | """ 7 | Define how to transfer the cols 8 | 9 | Parameters 10 | ---------- 11 | transform_cols : list of column index, default: -1 12 | Specify which columns need to be transform. If column index is None, None of columns will be transformed. 13 | If it is -1, it will use same columns as cols in binning module. 14 | 15 | transform_names: list of string, default: [] 16 | Specify which columns need to calculated. Each element in the list represent for a column name in header. 17 | 18 | 19 | transform_type: str, 'bin_num'or 'woe' or None default: 'bin_num' 20 | Specify which value these columns going to replace. 21 | 1. bin_num: Transfer original feature value to bin index in which this value belongs to. 22 | 2. woe: This is valid for guest party only. It will replace original value to its woe value 23 | 3. None: nothing will be replaced. 24 | """ 25 | 26 | def __init__(self, transform_cols=-1, transform_names=None, transform_type="bin_num"): 27 | super(TransformParam, self).__init__() 28 | self.transform_cols = transform_cols 29 | self.transform_names = transform_names 30 | self.transform_type = transform_type 31 | 32 | 33 | class OptimalBinningParam: 34 | """ 35 | Indicate optimal binning params 36 | 37 | Parameters 38 | ---------- 39 | metric_method: str, default: "iv" 40 | The algorithm metric method. Support iv, gini, ks, chi-square 41 | 42 | 43 | min_bin_pct: float, default: 0.05 44 | The minimum percentage of each bucket 45 | 46 | max_bin_pct: float, default: 1.0 47 | The maximum percentage of each bucket 48 | 49 | init_bin_nums: int, default 100 50 | Number of bins when initialize 51 | 52 | mixture: bool, default: True 53 | Whether each bucket need event and non-event records 54 | 55 | init_bucket_method: str default: quantile 56 | Init bucket methods. Accept quantile and bucket. 
57 | 58 | """ 59 | 60 | def __init__(self, metric_method='iv', min_bin_pct=0.05, max_bin_pct=1.0, 61 | init_bin_nums=1000, mixture=True, init_bucket_method='quantile'): 62 | super().__init__() 63 | self.init_bucket_method = init_bucket_method 64 | self.metric_method = metric_method 65 | self.max_bin = None 66 | self.mixture = mixture 67 | self.max_bin_pct = max_bin_pct 68 | self.min_bin_pct = min_bin_pct 69 | self.init_bin_nums = init_bin_nums 70 | self.adjustment_factor = None 71 | 72 | 73 | class FeatureBinningParam: 74 | """ 75 | Define the feature binning method 76 | 77 | Parameters 78 | ---------- 79 | method : str, 'quantile', 'bucket' or 'optimal', default: 'quantile' 80 | Binning method. 81 | 82 | compress_thres: int, default: 10000 83 | When the number of saved summaries exceed this threshold, it will call its compress function 84 | 85 | head_size: int, default: 10000 86 | The buffer size to store inserted observations. When head list reach this buffer size, the 87 | QuantileSummaries object start to generate summary(or stats) and insert into its sampled list. 88 | 89 | error: float, 0 <= error < 1 default: 0.001 90 | The error of tolerance of binning. The final split point comes from original data, and the rank 91 | of this value is close to the exact rank. More precisely, 92 | floor((p - 2 * error) * N) <= rank(x) <= ceil((p + 2 * error) * N) 93 | where p is the quantile in float, and N is total number of data. 94 | 95 | bin_num: int, bin_num > 0, default: 10 96 | The max bin number for binning 97 | 98 | bin_indexes : list of int or int, default: -1 99 | Specify which columns need to be binned. -1 represent for all columns. If you need to indicate specific 100 | cols, provide a list of header index instead of -1. 101 | 102 | bin_names : list of string, default: [] 103 | Specify which columns need to calculated. Each element in the list represent for a column name in header. 104 | 105 | adjustment_factor : float, default: 0.5 106 | the adjustment factor when calculating WOE. This is useful when there is no event or non-event in 107 | a bin. Please note that this parameter will NOT take effect for setting in host. 108 | 109 | category_indexes : list of int or int, default: [] 110 | Specify which columns are category features. -1 represent for all columns. List of int indicate a set of 111 | such features. For category features, bin_obj will take its original values as split_points and treat them 112 | as have been binned. If this is not what you expect, please do NOT put it into this parameters. 113 | 114 | The number of categories should not exceed bin_num set above. 115 | 116 | category_names : list of string, default: [] 117 | Use column names to specify category features. Each element in the list represent for a column name in header. 118 | 119 | local_only : bool, default: False 120 | Whether just provide binning method to guest party. If true, host party will do nothing. 121 | 122 | transform_param: TransformParam 123 | Define how to transfer the binned data. 124 | 125 | need_run: bool, default True 126 | Indicate if this module needed to be run 127 | 128 | skip_static: bool, default False 129 | If true, binning will not calculate iv, woe etc. In this case, optimal-binning 130 | will not be supported. 
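
    Example
    -------
    An illustrative configuration (hypothetical values, not taken from the original module):

        param = FeatureBinningParam(method='quantile', bin_num=32,
                                    bin_indexes=[0, 2, 5], category_indexes=[1])

    This would bin columns 0, 2 and 5 into at most 32 quantile-based buckets while treating
    column 1 as a categorical feature whose original values are used as split points.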
131 | 132 | """ 133 | 134 | def __init__(self, method=baseline_constants.QUANTILE, 135 | compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 136 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 137 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 138 | bin_num=baseline_constants.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5, 139 | transform_param=TransformParam(), optimal_binning_param=OptimalBinningParam(), 140 | local_only=False, category_indexes=None, category_names=None, 141 | need_run=True, skip_static=False): 142 | self.method = method 143 | self.compress_thres = compress_thres 144 | self.head_size = head_size 145 | self.error = error 146 | self.adjustment_factor = adjustment_factor 147 | self.bin_num = bin_num 148 | self.bin_indexes = bin_indexes 149 | self.bin_names = bin_names 150 | self.category_indexes = category_indexes 151 | self.category_names = category_names 152 | self.local_only = local_only 153 | self.transform_param = copy.deepcopy(transform_param) 154 | self.optimal_binning_param = copy.deepcopy(optimal_binning_param) 155 | self.need_run = need_run 156 | self.skip_static = skip_static 157 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/train_monitor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from collections import defaultdict 5 | 6 | class TrainMonitor(): 7 | def __init__(self, num_clients, num_classes=2): 8 | self.gradient_info = [] # List of tuples containing (gradient, hess) info 9 | self.leaf_gradient_tracker = [[], []] 10 | self.root_gradient_tracker = [[],[]] 11 | 12 | self.gradient_total = [0,0] 13 | self.current_tree_weights = [] 14 | self.previous_tree_weights = [] 15 | self.y_weights = [] 16 | self.batched_weights = [] 17 | 18 | self.last_feature = -1 19 | self.node_count = -1 20 | 21 | self.leaf_count_tracker = [] 22 | self.leaf_count = 0 23 | self.internal_node_count = defaultdict(int) 24 | self.internal_node_count_tracker = [] 25 | self.bin_tracker = defaultdict(int) 26 | self.tree_bin_tracker = [] 27 | 28 | self.current_tree_depth = 0 29 | 30 | self.num_classes = num_classes 31 | 32 | self.client_rounds_sent = [0] 33 | self.client_rounds_received = [0] 34 | self.client_payload_sent = [0] 35 | self.client_payload_received = [0] 36 | 37 | 38 | self.num_clients = num_clients 39 | 40 | self.client_timer = 0 41 | self.server_timer = 0 42 | 43 | self.client_total_time = [0] 44 | self.server_total_time = [0] 45 | 46 | self.client_time_dict = {"histogram building": 0, "computing gradients": 0, 'initialise private histogram': 0, "forming gradient + hess histogram": 0, 47 | "retrieving grads/hess for node": 0,} 48 | 49 | self.server_time_dict = {"initial split candidates": 0, "privacy_accountant initialisation": 0, "initialise model weights": 0, "split_candidates": 0, 50 | "pre-tree ops": 0, "post-tree ops": 0, "initialise private histogram": 0, "adding noise to gradient + hess histogram": 0, 51 | "sampling features for node": 0, "calculating internal split": 0, "updating split constraints": 0, "leaf weight": 0} 52 | 53 | def start_timing_event(self, device_type, tag=""): 54 | if device_type == "client": 55 | self.client_timer = time.time() 56 | else: 57 | self.server_timer = time.time() 58 | 59 | def end_timing_event(self, device_type, tag=""): 60 | if device_type == "client": 61 | time_elapsed = time.time() - self.client_timer 62 | self.client_total_time[-1] += 
time_elapsed 63 | self.client_time_dict[tag] += time_elapsed 64 | self.client_timer = 0 65 | else: 66 | time_elapsed = time.time() - self.server_timer 67 | self.server_total_time[-1] += time_elapsed 68 | self.server_time_dict[tag] += time_elapsed 69 | self.server_timer = 0 70 | 71 | def update_num_clients(self, num_clients): 72 | self.num_clients = num_clients 73 | 74 | self.client_rounds_received = [0] 75 | self.client_rounds_sent = [0] 76 | self.client_payload_sent = [0] 77 | self.client_payload_received = [0] 78 | 79 | def update_received(self, client_ids, payload_size): 80 | self.client_payload_received[-1] += payload_size 81 | self.client_rounds_received[-1] += 1 82 | 83 | def update_sent(self, client_ids, payload_size, increment_round=True): 84 | if len(client_ids) > 0: 85 | self.client_payload_sent[-1] += payload_size 86 | if increment_round: 87 | self.client_rounds_sent[-1] += 1 88 | 89 | def output_summary(self): 90 | print(f"\nNumber of clients {self.num_clients}") 91 | print(f"Max client rounds sent {np.max(self.client_rounds_sent)}") 92 | print(f"Avg client rounds sent {np.mean(self.client_rounds_sent)}") 93 | print(f"Total client sent {np.sum(self.client_rounds_sent)}") 94 | 95 | print(f"Max client rounds received {np.max(self.client_rounds_received)}") 96 | print(f"Avg client rounds received {np.mean(self.client_rounds_received)}") 97 | 98 | print(f"Max client sent {np.max(self.client_payload_sent) / 1024}Kb") 99 | print(f"Average client sent {np.mean(self.client_payload_sent) / 1024}Kb") 100 | print(f"Total client sent {np.sum(self.client_payload_sent) / 1024}Kb") 101 | 102 | print(f"Total leaf count {self.leaf_count_tracker}") 103 | # print(f"Total internal nodes {self.internal_node_count_tracker}") 104 | print("\n") 105 | 106 | for i, t in enumerate(self.client_total_time): 107 | print(f"Tree {i} client total time {self.client_total_time[i]}") 108 | print(f"Tree {i} server total time {self.server_total_time[i]}") 109 | 110 | print(f"Client time dict {self.client_time_dict}") 111 | print(f"Server time dict {self.server_time_dict}") 112 | 113 | def _update_comm_stats(self, split_method, training_method): 114 | # print(f"Stats before updating rounds={self.client_rounds_sent[-1]}, payload={self.client_payload_sent[-1]}") 115 | 116 | # Internal nodes 117 | if split_method != "totally_random": 118 | total = 0 119 | for level in self.bin_tracker: 120 | num_bins = self.bin_tracker[level] 121 | total += 8*2*num_bins 122 | self.client_payload_sent[-1] += total 123 | 124 | for level in self.internal_node_count: 125 | if self.internal_node_count[level] > 0: 126 | self.client_rounds_sent[-1] += 1 127 | 128 | # Leaf nodes 129 | if training_method != "batched_boosting": 130 | self.update_sent(range(0, self.num_clients), payload_size=8*2*self.leaf_count, increment_round=True) 131 | 132 | # print(f"Stats after updating rounds={self.client_rounds_sent[-1]}, payload={self.client_payload_sent[-1]}") 133 | 134 | def reset(self): 135 | # For comm tracking 136 | self.leaf_count_tracker.append(self.leaf_count) 137 | self.leaf_count = 0 138 | self.internal_node_count_tracker.append(self.internal_node_count) 139 | self.internal_node_count = defaultdict(int) 140 | self.tree_bin_tracker.append(self.bin_tracker) 141 | self.bin_tracker = defaultdict(int) 142 | 143 | self.client_rounds_sent.append(0) 144 | self.client_payload_sent.append(0) 145 | self.client_rounds_received.append(0) 146 | self.client_payload_received.append(0) 147 | 148 | self.client_timer, self.server_timer = 0,0 149 | 
self.client_total_time.append(0) 150 | self.server_total_time.append(0) 151 | 152 | self.gradient_total = [0,0] 153 | self.current_tree_depth = 0 154 | self.previous_tree_weights = self.current_tree_weights 155 | self.current_tree_weights = np.zeros(len(self.current_tree_weights)) if self.num_classes == 2 else np.zeros((len(self.current_tree_weights), self.num_classes)) 156 | 157 | def set_num_classes(self, num_classes): 158 | self.num_classes = num_classes -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/sw_server.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 2 | import numpy as np 3 | import math 4 | import scipy 5 | import random 6 | 7 | from numba import jit 8 | 9 | class SWServer(FreqOracleServer): 10 | def __init__(self, epsilon, d=1024, d_prime=1024, smooth=True, smc=False, index_mapper=None): 11 | super().__init__(epsilon, d=None, index_mapper=index_mapper) 12 | self.smc = smc 13 | self.smooth = smooth 14 | self.d = d # Domain bins B_i, n 15 | self.d_prime = d_prime # Randomised Bins \tilde{B}_j, m 16 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 17 | self.aggregated_data = [] 18 | 19 | def update_params(self, epsilon=None, d=None, index_mapper=None): 20 | super().update_params(epsilon, d, index_mapper) 21 | ee = np.exp(self.epsilon) 22 | if epsilon is not None or d is not None: 23 | self.b = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) 24 | self.p = ee / ((2 * self.b * ee) + 1) 25 | self.q = 1 / ((2 * self.b * ee) + 1) 26 | self.w = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) * 2 27 | self.M = self.generate_M(self.d_prime, self.d) 28 | 29 | def aggregate(self, priv_data): 30 | self.aggregated_data.append(priv_data) 31 | self.n += 1 32 | 33 | def generate_M(self, m=1024, n=1024): 34 | # report matrix 35 | m_cell = (1 + self.w) / m 36 | n_cell = 1 / n 37 | 38 | transform = np.ones((m, n)) * self.q * m_cell 39 | for i in range(n): 40 | left_most_v = (i * n_cell) # For bin B_i, this is the left boundary - v_min 41 | right_most_v = ((i + 1) * n_cell) # Right boundary of B_i - v_max 42 | 43 | ll_bound = int(left_most_v / m_cell) 44 | lr_bound = int((left_most_v + self.w) / m_cell) 45 | rl_bound = int(right_most_v / m_cell) 46 | rr_bound = int((right_most_v + self.w) / m_cell) 47 | 48 | ll_v = left_most_v - self.w / 2 49 | rl_v = right_most_v - self.w / 2 50 | l_p = ((ll_bound + 1) * m_cell - self.w / 2 - ll_v) * (self.p - self.q) + self.q * m_cell 51 | r_p = ((rl_bound + 1) * m_cell - self.w / 2 - rl_v) * (self.p - self.q) + self.q * m_cell 52 | if rl_bound > ll_bound: 53 | transform[ll_bound, i] = (l_p - self.q * m_cell) * ( 54 | (ll_bound + 1) * m_cell - self.w / 2 - ll_v) / n_cell * 0.5 + self.q * m_cell 55 | transform[ll_bound + 1, i] = self.p * m_cell - (self.p * m_cell - r_p) * ( 56 | rl_v - ((ll_bound + 1) * m_cell - self.w / 2)) / n_cell * 0.5 57 | else: 58 | transform[ll_bound, i] = (l_p + r_p) / 2 59 | transform[ll_bound + 1, i] = self.p * m_cell 60 | 61 | lr_v = left_most_v + self.w / 2 62 | rr_v = right_most_v + self.w / 2 63 | r_p = (rr_v - (rr_bound * m_cell - self.w / 2)) * (self.p - self.q) + self.q * m_cell 64 | l_p = (lr_v - (lr_bound * m_cell - self.w / 2)) * (self.p - self.q) + self.q * m_cell 65 | if rr_bound > lr_bound: 66 | if rr_bound < m: 67 | transform[rr_bound, i] = (r_p - self.q * m_cell) * ( 68 | 
rr_v - (rr_bound * m_cell - self.w / 2)) / n_cell * 0.5 + self.q * m_cell 69 | 70 | transform[rr_bound - 1, i] = self.p * m_cell - (self.p * m_cell - l_p) * ( 71 | (rr_bound * m_cell - self.w / 2) - lr_v) / n_cell * 0.5 72 | 73 | else: 74 | transform[rr_bound, i] = (l_p + r_p) / 2 75 | transform[rr_bound - 1, i] = self.p * m_cell 76 | 77 | if rr_bound - 1 > ll_bound + 2: 78 | transform[ll_bound + 2: rr_bound - 1, i] = self.p * m_cell 79 | 80 | return transform 81 | 82 | def difference_intervals(self, I1, I2): 83 | a_start, a_end = I1 84 | b_start, b_end = I2 85 | return min(abs(a_start - b_start), abs(a_start - b_end), abs(a_end - b_start), abs(a_end - b_end)), max( 86 | abs(a_start - b_start), abs(a_start - b_end), abs(a_end - b_start), abs(a_end - b_end)) 87 | 88 | def EMS(self, priv_hist, iterations, threshold, smooth=False): 89 | if smooth: 90 | # smoothing matrix 91 | smoothing_factor = 2 92 | binomial_tmp = [scipy.special.binom(smoothing_factor, k) for k in range(smoothing_factor + 1)] 93 | smoothing_matrix = np.zeros((self.d, self.d)) 94 | central_idx = int(len(binomial_tmp) / 2) 95 | for i in range(int(smoothing_factor / 2)): 96 | smoothing_matrix[i, : central_idx + i + 1] = binomial_tmp[central_idx - i:] 97 | for i in range(int(smoothing_factor / 2), self.d - int(smoothing_factor / 2)): 98 | smoothing_matrix[i, i - central_idx: i + central_idx + 1] = binomial_tmp 99 | for i in range(self.d - int(smoothing_factor / 2), self.d): 100 | remain = self.d - i - 1 101 | smoothing_matrix[i, i - central_idx + 1:] = binomial_tmp[: central_idx + remain] 102 | row_sum = np.sum(smoothing_matrix, axis=1) 103 | smoothing_matrix = (smoothing_matrix.T / row_sum).T 104 | 105 | # EMS 106 | theta = np.ones(self.d) / float(self.d) 107 | theta_old = np.zeros(self.d) 108 | r = 0 109 | sample_size = sum(priv_hist) 110 | old_logliklihood = 0 111 | 112 | while np.linalg.norm(theta_old - theta, ord=1) > 1 / sample_size and r < iterations: 113 | theta_old = np.copy(theta) 114 | X_condition = np.matmul(self.M, theta_old) 115 | 116 | TMP = self.M.T / X_condition 117 | 118 | P = np.copy(np.matmul(TMP, priv_hist)) 119 | P = P * theta_old 120 | 121 | theta = np.copy(P / sum(P)) 122 | 123 | # Smoothing step 124 | if smooth: 125 | theta = np.matmul(smoothing_matrix, theta) 126 | theta = theta / sum(theta) 127 | 128 | logliklihood = np.inner(priv_hist, np.log(np.matmul(self.M, theta))) 129 | imporve = logliklihood - old_logliklihood 130 | 131 | if r > 1 and abs(imporve) < threshold: 132 | # print("stop when", imporve / old_logliklihood, loglikelihood_threshold) 133 | break 134 | 135 | old_logliklihood = logliklihood 136 | 137 | r += 1 138 | return theta 139 | 140 | def g_density(self, v_prime, v): 141 | out = np.zeros(shape=v_prime.shape) 142 | out.fill(self.q) 143 | p_indexes = np.abs(v - v_prime) < self.b 144 | out[p_indexes] = self.p 145 | return out 146 | 147 | def _update_estimates(self): 148 | histogram, _ = np.histogram(self.aggregated_data, bins=self.d_prime, range=(-self.b, 1 + self.b)) 149 | self.estimated_density = self.EMS(histogram, 100, 1e-3, self.smooth) 150 | 151 | return self.estimated_density 152 | 153 | def estimate(self, data, suppress_warnings=False): 154 | self.check_and_update_estimates() 155 | return self.estimated_density[data] 156 | 157 | def estimate_all(self, data_list, suppress_warnings=False): 158 | return [self.estimate(item) for item in data_list] 159 | 160 | def estimate_density(self, N=None, suppress_warnings=False): 161 | self.check_and_update_estimates() 162 | return 
self.estimated_density 163 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/joint_exp.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """JointExp method for computing multiple dp quantiles.""" 17 | 18 | import numpy as np 19 | from numpy.fft import irfft 20 | from numpy.fft import rfft 21 | from scipy import special 22 | 23 | import federated_gbdt.core.dp_multiq.ind_exp as ind_exp 24 | 25 | 26 | def compute_intervals(sorted_data, data_low, data_high): 27 | """Returns array of intervals of adjacent points. 28 | 29 | Args: 30 | sorted_data: Nondecreasing array of data points, all in the [data_low, 31 | data_high] range. 32 | data_low: Lower bound for data. 33 | data_high: Upper bound for data. 34 | 35 | Returns: 36 | An array of intervals of adjacent points from [data_low, data_high] in 37 | nondecreasing order. For example, if sorted_data = [0,1,1,2,3], 38 | data_low = 0, and data_high = 4, returns 39 | [[0, 0], [0, 1], [1, 1], [1, 2], [2, 3], [3, 4]]. 40 | """ 41 | return np.block([[data_low, sorted_data], [sorted_data, 42 | data_high]]).transpose() 43 | 44 | 45 | def compute_log_phi(data_intervals, qs, eps, swap): 46 | """Computes two-dimensional array log_phi. 47 | 48 | Args: 49 | data_intervals: Array of intervals of adjacent points from 50 | compute_intervals. 51 | qs: Increasing array of quantiles in [0,1]. 52 | eps: Privacy parameter epsilon. 53 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 54 | 55 | Returns: 56 | Array log_phi where log_phi[i-i',j] = log(phi(i, i', j)). 57 | """ 58 | num_data_intervals = len(data_intervals) 59 | original_data_size = num_data_intervals - 1 60 | if swap: 61 | sensitivity = 2.0 62 | else: 63 | if len(qs) == 1: 64 | sensitivity = 2.0 * (1 - min(qs[0], 1 - qs[0])) 65 | else: 66 | sensitivity = 2.0 * (1 - min(qs[0], np.min(qs[1:] - qs[:-1]), 1 - qs[-1])) 67 | eps_term = -(eps / (2.0 * sensitivity)) 68 | gaps = np.arange(num_data_intervals) 69 | target_ns = (np.block([qs, 1]) - np.block([0, qs])) * original_data_size 70 | return eps_term * np.abs(gaps.reshape(-1, 1) - target_ns) 71 | 72 | 73 | def logdotexp_toeplitz_lt(c, x): 74 | """Multiplies a log-space vector by a lower triangular Toeplitz matrix. 75 | 76 | Args: 77 | c: First column of the Toeplitz matrix (in log space). 78 | x: Vector to be multiplied (in log space). 79 | 80 | Returns: 81 | Let T denote the lower triangular Toeplitz matrix whose first column is 82 | given by exp(c); then the vector returned by this function is log(T * 83 | exp(x)). The multiplication is done using FFTs for efficiency, and care is 84 | taken to avoid overflow during exponentiation. 
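    (Concretely, the maxima of c and x are factored out before exponentiating, so the FFT
    multiplication only sees values at most 1, and the factored-out maxima are added back to the
    log-space result at the end.)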
85 | """ 86 | max_c, max_x = np.max(c), np.max(x) 87 | exp_c, exp_x = c - max_c, x - max_x 88 | np.exp(exp_c, out=exp_c) 89 | np.exp(exp_x, out=exp_x) 90 | n = len(x) 91 | # Choose the next power of two. 92 | p = np.power(2, np.ceil(np.log2(2 * n - 1))).astype(int) 93 | fft_exp_c = rfft(exp_c, n=p) 94 | fft_exp_x = rfft(exp_x, n=p) 95 | y = irfft(fft_exp_c * fft_exp_x)[:n] 96 | np.maximum(0, y, out=y) 97 | np.log(y, out=y) 98 | y += max_c + max_x 99 | return y 100 | 101 | 102 | def compute_log_alpha(data_intervals, log_phi, qs): 103 | """Computes three-dimensional array log_alpha. 104 | 105 | Args: 106 | data_intervals: Array of intervals of adjacent points from 107 | compute_intervals. 108 | log_phi: Array from compute_log_phi. 109 | qs: Increasing array of quantiles in (0,1). 110 | 111 | Returns: 112 | Array log_alpha[a, b, c] where a and c index over quantiles and b represents 113 | interval repeats. 114 | """ 115 | num_intervals = len(data_intervals) 116 | num_quantiles = len(qs) 117 | data_intervals_log_sizes = np.log(data_intervals[:, 1] - data_intervals[:, 0]) 118 | log_alpha = np.log(np.zeros([num_quantiles, num_intervals, num_quantiles])) 119 | log_alpha[0, :, 0] = log_phi[:, 0] + data_intervals_log_sizes 120 | # A handy mask for log_phi. 121 | disallow_repeat = np.zeros(num_intervals) 122 | disallow_repeat[0] = -np.inf 123 | for j in range(1, num_quantiles): 124 | log_hat_alpha = special.logsumexp(log_alpha[j - 1, :, :], axis=1) 125 | log_alpha[j, :, 0] = data_intervals_log_sizes + logdotexp_toeplitz_lt( 126 | log_phi[:, j] + disallow_repeat, log_hat_alpha) 127 | log_alpha[j, 0, 0] = -np.inf # Correct possible numerical error. 128 | log_alpha[j, :, 1:j+1] = \ 129 | (log_phi[0, j] + data_intervals_log_sizes)[:, np.newaxis] \ 130 | + log_alpha[j-1, :, 0:j] - np.log(np.arange(1, j+1) + 1) 131 | return log_alpha 132 | 133 | 134 | def sample_joint_exp(log_alpha, data_intervals, log_phi, qs): 135 | """Given log_alpha and log_phi, samples final quantile estimates. 136 | 137 | Args: 138 | log_alpha: Array from compute_log_alpha. 139 | data_intervals: Array of intervals of adjacent points from 140 | compute_intervals. 141 | log_phi: Array from compute_log_phi. 142 | qs: Increasing array of quantiles in (0,1). 143 | 144 | Returns: 145 | Array outputs where outputs[i] is the quantile estimate corresponding to 146 | quantile q[i]. 147 | """ 148 | num_intervals = len(data_intervals) 149 | num_quantiles = len(qs) 150 | outputs = np.zeros(num_quantiles) 151 | last_i = num_intervals - 1 152 | j = num_quantiles - 1 153 | repeats = 0 154 | while j >= 0: 155 | log_dist = log_alpha[j, :last_i + 1, :] + log_phi[:last_i + 1, 156 | j + 1][::-1, np.newaxis] 157 | # Prevent repeats unless it's the first round. 158 | if j < num_quantiles - 1: 159 | log_dist[last_i, :] = -np.inf 160 | i, k = np.unravel_index( 161 | ind_exp.racing_sample(log_dist), [last_i + 1, num_quantiles]) 162 | repeats += k 163 | k += 1 164 | for j2 in range(j - k + 1, j + 1): 165 | outputs[j2] = np.random.uniform(data_intervals[i, 0], data_intervals[i, 166 | 1]) 167 | j -= k 168 | last_i = i 169 | return np.sort(outputs) 170 | 171 | 172 | def joint_exp(sorted_data, data_low, data_high, qs, eps, swap): 173 | """Computes eps-differentially private quantile estimates for qs. 174 | 175 | Args: 176 | sorted_data: Array of data points sorted in increasing order. 177 | data_low: Lower bound for data. 178 | data_high: Upper bound for data. 179 | qs: Increasing array of quantiles in (0,1). 180 | eps: Privacy parameter epsilon. 
181 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 182 | 183 | Returns: 184 | Array o where o[i] is the quantile estimate corresponding to quantile q[i]. 185 | """ 186 | clipped_data = np.clip(sorted_data, data_low, data_high) 187 | data_intervals = compute_intervals(clipped_data, data_low, data_high) 188 | log_phi = compute_log_phi(data_intervals, qs, eps, swap) 189 | log_alpha = compute_log_alpha(data_intervals, log_phi, qs) 190 | return sample_joint_exp(log_alpha, data_intervals, log_phi, qs) 191 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/bin_result.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BinColResults(object): 5 | def __init__(self, woe_array=(), iv_array=(), event_count_array=(), non_event_count_array=(), 6 | event_rate_array=(), non_event_rate_array=(), iv=None): 7 | self.woe_array = list(woe_array) 8 | self.iv_array = list(iv_array) 9 | self.event_count_array = list(event_count_array) 10 | self.non_event_count_array = list(non_event_count_array) 11 | self.event_rate_array = list(event_rate_array) 12 | self.non_event_rate_array = list(non_event_rate_array) 13 | self.split_points = None 14 | if iv is None: 15 | iv = 0 16 | for idx, woe in enumerate(self.woe_array): 17 | non_event_rate = non_event_count_array[idx] 18 | event_rate = event_rate_array[idx] 19 | iv += (non_event_rate - event_rate) * woe 20 | self.iv = iv 21 | self._bin_anonymous = None 22 | 23 | # @property 24 | # def bin_anonymous(self): 25 | # if self.split_points is None or len(self.split_points) == 0: 26 | # return [] 27 | # if self._bin_anonymous is None: 28 | # return ["bin_" + str(i) for i in range(len(self.split_points))] 29 | # return self._bin_anonymous 30 | # 31 | # @bin_anonymous.setter 32 | # def bin_anonymous(self, x): 33 | # self._bin_anonymous = x 34 | # 35 | def set_split_points(self, split_points): 36 | self.split_points = split_points 37 | 38 | def get_split_points(self): 39 | return np.array(self.split_points) 40 | # 41 | # @property 42 | # def is_woe_monotonic(self): 43 | # """ 44 | # Check the woe is monotonic or not 45 | # """ 46 | # woe_array = self.woe_array 47 | # if len(woe_array) <= 1: 48 | # return True 49 | # 50 | # is_increasing = all(x <= y for x, y in zip(woe_array, woe_array[1:])) 51 | # is_decreasing = all(x >= y for x, y in zip(woe_array, woe_array[1:])) 52 | # return is_increasing or is_decreasing 53 | # 54 | # @property 55 | # def bin_nums(self): 56 | # return len(self.woe_array) 57 | # 58 | # def result_dict(self): 59 | # save_dict = self.__dict__ 60 | # save_dict['is_woe_monotonic'] = self.is_woe_monotonic 61 | # save_dict['bin_nums'] = self.bin_nums 62 | # return save_dict 63 | # 64 | # def reconstruct(self, iv_obj): 65 | # self.woe_array = list(iv_obj.woe_array) 66 | # self.iv_array = list(iv_obj.iv_array) 67 | # self.event_count_array = list(iv_obj.event_count_array) 68 | # self.non_event_count_array = list(iv_obj.non_event_count_array) 69 | # self.event_rate_array = list(iv_obj.event_rate_array) 70 | # self.non_event_rate_array = list(iv_obj.non_event_rate_array) 71 | # self.split_points = list(iv_obj.split_points) 72 | # self.iv = iv_obj.iv 73 | # 74 | # def generate_pb(self): 75 | # result = feature_binning_param_pb2.IVParam(woe_array=self.woe_array, 76 | # iv_array=self.iv_array, 77 | # event_count_array=self.event_count_array, 78 | # non_event_count_array=self.non_event_count_array, 79 | # 
event_rate_array=self.event_rate_array, 80 | # non_event_rate_array=self.non_event_rate_array, 81 | # split_points=self.split_points, 82 | # iv=self.iv, 83 | # is_woe_monotonic=self.is_woe_monotonic, 84 | # bin_nums=self.bin_nums, 85 | # bin_anonymous=self.bin_anonymous) 86 | # return result 87 | 88 | 89 | class BinResults(object): 90 | def __init__(self): 91 | self.all_cols_results = {} # {col_name: BinColResult} 92 | self.role = '' 93 | self.party_id = '' 94 | 95 | # def set_role_party(self, role, party_id): 96 | # self.role = role 97 | # self.party_id = party_id 98 | # 99 | # def put_col_results(self, col_name, col_results: BinColResults): 100 | # ori_col_results = self.all_cols_results.get(col_name) 101 | # if ori_col_results is not None: 102 | # col_results.set_split_points(ori_col_results.get_split_points()) 103 | # self.all_cols_results[col_name] = col_results 104 | # 105 | def put_col_split_points(self, col_name, split_points): 106 | col_results = self.all_cols_results.get(col_name, BinColResults()) 107 | col_results.set_split_points(split_points) 108 | self.all_cols_results[col_name] = col_results 109 | 110 | # def query_split_points(self, col_name): 111 | # col_results = self.all_cols_results.get(col_name) 112 | # if col_results is None: 113 | # LOGGER.warning("Querying non-exist split_points") 114 | # return None 115 | # return col_results.split_points 116 | 117 | @property 118 | def all_split_points(self): 119 | results = {} 120 | for col_name, col_result in self.all_cols_results.items(): 121 | results[col_name] = col_result.get_split_points() 122 | return results 123 | # 124 | # @property 125 | # def all_ivs(self): 126 | # return [(col_name, x.iv) for col_name, x in self.all_cols_results.items()] 127 | # 128 | # @property 129 | # def all_woes(self): 130 | # return {col_name: x.woe_array for col_name, x in self.all_cols_results.items()} 131 | # 132 | # @property 133 | # def all_monotonic(self): 134 | # return {col_name: x.is_woe_monotonic for col_name, x in self.all_cols_results.items()} 135 | # 136 | # def summary(self): 137 | # return {"iv": self.all_ivs, 138 | # "woe": self.all_woes, 139 | # "monotonic": self.all_monotonic} 140 | 141 | def get_split_points_array(self, bin_names): 142 | split_points_result = [] 143 | for bin_name in bin_names: 144 | if bin_name not in self.all_cols_results: 145 | continue 146 | split_points_result.append(self.all_cols_results[bin_name].get_split_points()) 147 | return np.array(split_points_result) 148 | 149 | # def generated_pb(self): 150 | # col_result_dict = {} 151 | # for col_name, col_bin_result in self.all_cols_results.items(): 152 | # col_result_dict[col_name] = col_bin_result.generate_pb() 153 | # LOGGER.debug("In generated_pb, role: {}, party_id: {}".format(self.role, self.party_id)) 154 | # result_pb = feature_binning_param_pb2.FeatureBinningResult(binning_result=col_result_dict, 155 | # role=self.role, 156 | # party_id=str(self.party_id)) 157 | # return result_pb 158 | # 159 | # def reconstruct(self, result_pb): 160 | # self.role = result_pb.role 161 | # self.party_id = result_pb.party_id 162 | # binning_result = dict(result_pb.binning_result) 163 | # for col_name, col_bin_result in binning_result.items(): 164 | # col_bin_obj = BinColResults() 165 | # col_bin_obj.reconstruct(col_bin_result) 166 | # self.all_cols_results[col_name] = col_bin_obj 167 | # return self -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/csmooth.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """CDP smooth sensitivity method for computing differentially private quantiles. 17 | 18 | The smooth sensitivity method is described in 19 | "Smooth Sensitivity and Sampling in Private Data Analysis" by Nissim, 20 | Raskhodnikova, and Smith 21 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf). Details for 22 | the CDP noise distribution appear in Section 3.1 of "Average-Case Averages: 23 | Private Algorithms for Smooth Sensitivity and Mean Estimation" by Bun and 24 | Steinke (NeurIPS 2019). Details for optimizing t, s, and sigma appear in 25 | Section 3.1.1 of the same paper. 26 | """ 27 | 28 | import numpy as np 29 | 30 | from dp_multiq import base 31 | from dp_multiq import smooth_utils 32 | 33 | 34 | def compute_triples(eps, ts): 35 | """Returns triples of form (t, log(s), sigma) for hyperparameter optimization. 36 | 37 | Args: 38 | eps: Privacy parameter epsilon. 39 | ts: Array of possible smooth sensitivity parameters. 40 | """ 41 | triples = np.empty([len(ts), 3]) 42 | for t_idx in range(len(ts)): 43 | t = ts[t_idx] 44 | triples[t_idx, 0] = t 45 | sigma = opt_sigma(eps, t) 46 | triples[t_idx, 2] = sigma 47 | triples[t_idx, 1] = -1.5 * (sigma**2) + np.log(eps - (t / sigma)) 48 | return triples 49 | 50 | 51 | def opt_sigma(eps, t): 52 | """Returns optimal sigma as detailed in Section 3.1.1 of Bun and Steinke. 53 | 54 | Args: 55 | eps: Privacy parameter epsilon. 56 | t: Smooth sensitivity parameter. 57 | """ 58 | return np.real(np.roots([5 * eps / t, -5, 0, -1])[0]) 59 | 60 | 61 | def lln(sigma): 62 | """Returns a sample from the Laplace Log-Normal distribution. 63 | 64 | Args: 65 | sigma: Sigma parameter for the Laplace Log-Normal distribution. 66 | """ 67 | return np.random.laplace() * np.exp(sigma * np.random.normal()) 68 | 69 | 70 | def csmooth(sorted_data, data_low, data_high, qs, divided_eps, ts): 71 | """Returns eps^2/2-CDP quantile estimates for qs. 72 | 73 | Args: 74 | sorted_data: Array of data points sorted in increasing order. 75 | data_low: Lower limit for any differentially private quantile output value. 76 | data_high: Upper limit for any differentially private quantile output value. 77 | qs: Increasing array of quantiles in [0,1]. 78 | divided_eps: Privacy parameter epsilon. Assumes eps has already been divided 79 | so that the overall desired privacy guarantee is achieved. 80 | ts: Array of smooth sensitivity parameters, one for each q in qs. 
81 | """ 82 | sorted_data = np.clip(sorted_data, data_low, data_high) 83 | o = np.empty(len(qs)) 84 | triples = compute_triples(divided_eps, ts) 85 | for i in range(len(qs)): 86 | t, log_s, sigma = triples[i] 87 | true_quantile_idx = base.quantile_index(len(sorted_data), qs[i]) 88 | true_quantile_value = sorted_data[true_quantile_idx] 89 | laplace_log_normal_noise = lln(sigma) 90 | log_sensitivity = smooth_utils.compute_log_smooth_sensitivity( 91 | sorted_data, data_low, data_high, true_quantile_idx, t) 92 | noise = np.sign(laplace_log_normal_noise) * np.exp( 93 | log_sensitivity + np.log(np.abs(laplace_log_normal_noise)) - log_s) 94 | o[i] = true_quantile_value + noise 95 | o = np.clip(o, data_low, data_high) 96 | return np.sort(o) 97 | 98 | 99 | def log_choose_triple_idx(triples, eps, log_sensitivities): 100 | """Returns triple (t, log_s, sigma) that minimizes noisy statistic variance. 101 | 102 | Args: 103 | triples: Array with entries of form (t, log_s, sigma). 104 | eps: Privacy parameter epsilon. 105 | log_sensitivities: Log(t smooth sensitivity) for each t in triples. 106 | """ 107 | variances = np.empty(len(triples)) 108 | for triple_idx in range(len(triples)): 109 | numerator = 2 * (np.exp(2 * log_sensitivities[triple_idx])) 110 | denominator = np.exp(-5 * (triples[triple_idx][2]**2)) * ( 111 | (eps - (triples[triple_idx][0] / triples[triple_idx][2]))**2) 112 | variances[triple_idx] = numerator / denominator 113 | return np.argmin(variances) 114 | 115 | 116 | def csmooth_tune_and_return_ts(sorted_data, data_low, data_high, qs, 117 | divided_eps, log_t_low, log_t_high, num_t): 118 | """Returns ts minimizing variance for data and each q under ~eps^2/2-CDP. 119 | 120 | Args: 121 | sorted_data: Array of data points sorted in increasing order. 122 | data_low: Lower limit for any differentially private quantile output value. 123 | data_high: Upper limit for any differentially private quantile output value. 124 | qs: Increasing array of quantiles in [0,1]. 125 | divided_eps: Privacy parameter epsilon. Assumes eps has already been divided 126 | so that the overall desired privacy guarantee is achieved. 127 | log_t_low: Tuning range for t has lower bound 10^(log_t_low). 128 | log_t_high: Tuning range for t has upper bound 10^(log_t_high). 129 | num_t: Number of logarithmically spaced t used to populate tuning range. 130 | """ 131 | sorted_data = np.clip(sorted_data, data_low, data_high) 132 | triples = compute_triples(divided_eps, 133 | np.logspace(log_t_low, log_t_high, num_t)) 134 | num_qs = len(qs) 135 | ts = np.empty(num_qs) 136 | for i in range(num_qs): 137 | true_quantile_idx = base.quantile_index(len(sorted_data), qs[i]) 138 | log_sensitivities = np.zeros(len(triples)) 139 | for triple_idx in range(len(triples)): 140 | t = triples[triple_idx, 0] 141 | log_sensitivities[ 142 | triple_idx] = smooth_utils.compute_log_smooth_sensitivity( 143 | sorted_data, data_low, data_high, true_quantile_idx, t) 144 | ts[i] = triples[log_choose_triple_idx(triples, divided_eps, 145 | log_sensitivities)][0] 146 | return ts 147 | 148 | 149 | def csmooth_tune_t_experiment(eps, num_samples, num_trials, num_quantiles_range, 150 | data_low, data_high, log_t_low, log_t_high, 151 | num_t): 152 | """Returns 2-D array of ts, tuned for each (num_quantiles, quantile) pair. 153 | 154 | Args: 155 | eps: Privacy parameter epsilon. 156 | num_samples: Number of standard Gaussian samples to draw for each trial. 157 | num_trials: Number of trials to average. 
158 | num_quantiles_range: Array of number of quantiles to estimate. 159 | data_low: Lower bound for data, used by CSmooth. 160 | data_high: Upper bound for data, used by CSmooth. 161 | log_t_low: Tuning range for t has lower bound 10^(log_t_low). 162 | log_t_high: Tuning range for t has upper bound 10^(log_t_high). 163 | num_t: Number of logarithmically spaced t used to populate tuning range. 164 | """ 165 | ts = [np.zeros(num_quantiles) for num_quantiles in num_quantiles_range] 166 | num_quantiles_idx = 0 167 | for num_quantiles_idx in range(len(num_quantiles_range)): 168 | num_quantiles = num_quantiles_range[num_quantiles_idx] 169 | divided_eps = eps / np.sqrt(num_quantiles) 170 | for _ in range(num_trials): 171 | sorted_data = base.gen_gaussian(num_samples, 0, 1) 172 | qs = np.linspace(0, 1, num_quantiles + 2)[1:-1] 173 | ts[num_quantiles_idx] += csmooth_tune_and_return_ts( 174 | sorted_data, data_low, data_high, qs, divided_eps, log_t_low, 175 | log_t_high, num_t) / num_trials 176 | print("Finished num_quantiles: {}".format(num_quantiles)) 177 | return ts 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code for "Federated Boosted Decision Trees with Differential Privacy" 2 | 3 | This repository contains code for the ACM CCS'22 paper ["Federated Boosted Decision Trees with Differential Privacy"](https://arxiv.org/abs/2210.02910) 4 | ## Reference 5 | 6 | If the code and/or paper contained in this repository were useful to you please consider citing this [work](https://arxiv.org/abs/2210.02910): 7 | ```bibtex 8 | @inproceedings{maddock2022federated, 9 | author = {Maddock, Samuel and Cormode, Graham and Wang, Tianhao and Maple, Carsten and Jha, Somesh}, 10 | title = {Federated Boosted Decision Trees with Differential Privacy}, 11 | year = {2022}, 12 | isbn = {9781450394505}, 13 | publisher = {Association for Computing Machinery}, 14 | booktitle = {Proceedings of the 2022 ACM SIGSAC Conference on Computer and Communications Security}, 15 | address = {New York, NY, USA}, url = {https://doi.org/10.1145/3548606.3560687}, doi = {10.1145/3548606.3560687}, 16 | pages = {2249–2263}, 17 | location = {Los Angeles, CA, USA}, series = {CCS '22} 18 | } 19 | ``` 20 | 21 | ## Installation 22 | 23 | The simplest way to install an environment for this repo is to use conda and `pip install -r ./requirements.txt` 24 | ```commandline 25 | conda create -n "fedxgb" python=3.9 26 | conda activate fedxgb 27 | pip install -r ./requirements.txt 28 | ``` 29 | 30 | Alternatively `pip install` the required libraries 31 | 32 | ```commandline 33 | pip install pandas seaborn matplotlib scikit-learn numpy xgboost xxhash bitarray scipy numba statsmodels six progressbar autodp fast_histogram notebook pmlb 34 | ``` 35 | 36 | ### Datasets 37 | 38 | Datasets need to be downloaded and placed in the `data` directory in the root of the repo. 
We use the following datasets in our experiments: 39 | * [Credit 1](https://www.kaggle.com/competitions/GiveMeSomeCredit/data?select=cs-training.csv) - should be placed under `data/Kaggle Credit 1/credit1-training.csv` 40 | * [Credit 2](https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset) - should be placed under `data/UCI Credit 2/UCI_Credit_Card.csv` 41 | * [Adult](https://archive.ics.uci.edu/ml/datasets/adult) - should be placed under `data/UCI Adult/adult.data` 42 | * [Bank](https://archive.ics.uci.edu/ml/datasets/bank+marketing) - should be placed under `data/UCI Bank Marketing/bank-full.csv` 43 | * [Nomao](https://archive.ics.uci.edu/ml/datasets/Nomao) - should be placed under `data/UCI Nomao/Nomao.data` 44 | * [Higgs](https://archive.ics.uci.edu/ml/datasets/HIGGS) - The Higgs dataset should be subsampled to n=200,000 samples and placed under `data/UCI Higgs/higgs-200k.csv` 45 | 46 | 47 | ## Outline 48 | 49 | The code is split into two components 50 | * `experiments` - Main code for running and plotting experiments 51 | * `federated_gbdt` - Main code for the private GBDT model 52 | 53 | In order to generate plots and tables as in the paper see "Paper Experiments, Plots and Tables" 54 | 55 | In order to replicate the main figures in the paper from scratch see "Replication Instructions" 56 | 57 | ## Framework 58 | 59 | The code structure of `federated_gbdt` is as follows 60 | * `core` 61 | * `binning`- Contains quantile sketching code from the FEVERLESS implementation 62 | * `dp_multiq` - Central DP quantiles (not used in the paper) 63 | * `moments_accountant` - TensorFlow Privacy Moments Accountant (RDP) 64 | * `pure_ldp` - LDP protocols (not used in paper) 65 | * `baseline_constants.py` - Contains constants for the FEVERLESS implementation of quantile sketching 66 | * `loss_functions.py` - Contains loss functions used in the GBDT algorithm 67 | * `plotting.py` - Debugging code 68 | * `models` 69 | * `base` 70 | * `jit_functions.py` - Numba functions for computing GBDT quantities needed for training (split scores and weights) 71 | * `tree_node.py` - Contains the `DecisionNode` class reworked from the FEVERLESS implementation 72 | * `tree_base.py` - Base tree implementation 73 | * `gbdt` 74 | * `components` 75 | * `index_sampler.py`- Contains the `IndexSampler` class for managing which features/observations a tree uses during training 76 | * `privacy_accountant.py` - Contains the `PrivacyAccountant` class for managing DP during training of a `PrivateGBDT` 77 | * `split_candidate_manager.py` - Manages the various methods used to propose split candidates 78 | * `train_monitor.py` - Monitors various training statistics of a `PrivateGBDT` model 79 | * `private_gbdt.py` - Contains the main model `PrivateGBDT` class 80 | 81 | ## Paper Experiments, Plots and Table 82 | 83 | All experiments were run with 15 iterations in total (3 iterations over 5 different train-test sets). 
Code for running experiments is in `experiments/paper_experiments/paper_experiments.py` and plotting in `experiments/paper_experiments/paper_plotter.py` 84 | 85 | ### Running paper experiments 86 | The following methods in `paper_experiments.py` correspond to the following figures/tables in the paper: 87 | * `dp_split_methods_with_update_methods` - Corresponds to Figure 1 (a,b,c), Table 2 in main text, Figures 7-10 and Tables 7-10 in the Appendix 88 | * `dp_split_candidate_methods` - Corresponds to Figure 2 (a,b,c) and Table 3 in the main text, Figures 11 and 12 in the Appendix 89 | * `feature_interaction_experiments` - Corresponds to Figure 3 90 | * `dp_ebm_experiment` - Corresponds to Figure 4 91 | * `batched_boosting` - Corresponds to Figure 5, Table 4 in the main text, Figure 13 in the Appendix 92 | * `comparisons_experiment` - Corresponds to Figure 6 in the main text and Figures 14-18 in the Appendix 93 | 94 | 95 | ### Generating paper plots 96 | 97 | Paper figures are already generated and present in `experiments/paper_experiments/paper_plots`. 98 | 99 | To recreate paper plots, download the paper results from [here](https://drive.google.com/file/d/1u7BFhEP7e2sqxfr3vAd92hrOJaV_sZI7/view?usp=sharing) and place them in `experiments/paper_experiments/paper_results/` 100 | 101 | The following methods in `paper_plotter.py` can be used to plot results: 102 | * `plot_split_methods_with_update` - Figure 1(a,b,c) 103 | * `plot_split_candidates` - Figure 2(a,b,c) 104 | * `plot_k_way` - Figure 3 105 | * `plot_ebm_comparisons` - Figure 4 106 | * `plot_low_eps_bb` - Figure 5 107 | * `plot_comparisons` - Figure 6 108 | * `table_split_methods_with_update` - Table 2 109 | * `table_split_candidate` - Table 3 110 | * `table_low_eps_bb` - Table 4 111 | 112 | Plots and tables for the Appendix can be recreated via the following (although they are already present in `paper_plots`): 113 | * `appendix_E1` - Figures 7,8,9,10 114 | * `appendix_E1_table` - Tables 7,8,9,10 115 | * `appendix_E2` - Figure 12 116 | * `appendix_E3` - Not used 117 | * `appendix_E4` - Figure 13 118 | * `appendix_E5` - Figures 14, 15, 16, 17, 18 119 | 120 | ## Replication Instructions 121 | 122 | As all experiments in the paper are repeated over 15 iterations, they are usually too slow to replicate within a reasonable amount of time. 123 | Instead, to approximately replicate an experiment from scratch, additional code is provided in `experiments/replication_experiments`. 124 | 125 | Most replication experiments have been designed to run on the Credit 1 dataset in ~30 minutes, depending on the device. Most run on a single test-train seed over 3 iterations. 126 | 127 | The `experiments/replication_experiments` folder already contains data and replication figures for all 6 figures presented in the main paper.
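As a rough sketch of how a replication run might be driven (the constructor and the exact `replicate` arguments below are assumptions; see `experiments/replication_experiments/experiment_replicator.py` for the real interface):

```python
# Hypothetical driver -- argument names are illustrative only; check
# experiment_replicator.py for the actual signature of ExperimentReplicator.replicate
from experiments.replication_experiments.experiment_replicator import ExperimentReplicator

replicator = ExperimentReplicator()
replicator.replicate(figure="fig1", dataset="Credit 1")  # e.g. replicate Figure 1 on Credit 1
```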
128 | You can also generate appendix figures by changing the dataset that is passed to `ExperimentReplicator.replicate` 129 | 130 | 131 | Benchmark replication times performed on a Macbook Air M1: 132 | * Fig 1(a,b,c): ~30 mins 133 | * Fig 2(a,b,c): ~45 mins 134 | * Fig 3: ~20 mins 135 | * Fig 4: ~10 mins 136 | * Fig 5: ~15 mins 137 | * Fig 6: ~25 mins 138 | 139 | ## Acknowledgements 140 | 141 | * Part of the tree structure implementation is based on the public implementation of the FEVERLESS [paper](https://paperswithcode.com/paper/feverless-fast-and-secure-vertical-federated) with code repo [here](https://github.com/feverless111/vfl/blob/0c0bae50c37c193938e59a95c67fa62b43e43e8e/FEVERLESS/models/vertical/tree/xgboost/centralized_xgboost.py) 142 | * We make extensive use of the [autodp](https://github.com/yuxiangw/autodp) library by Yu-Xiang Wang to verify privacy accounting 143 | * Part of our privacy accountant uses the RDP moments accountant implemented in [TensorFlow Privacy](https://github.com/tensorflow/privacy) 144 | * Although not used in our paper, the code supports using datasets from the [Penn Machine Learning Benchmarks (PMLB)](https://epistasislab.github.io/pmlb/) 145 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/quantile_summaries.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core import baseline_constants 2 | import math 3 | 4 | 5 | class Stats(object): 6 | def __init__(self, value, g: int, delta: int): 7 | self.value = value 8 | self.g = g 9 | self.delta = delta 10 | 11 | 12 | class QuantileSummaries(object): 13 | def __init__(self, compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 14 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 15 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 16 | abnormal_list=None): 17 | self.compress_thres = compress_thres 18 | self.head_size = head_size 19 | self.error = error 20 | self.head_sampled = [] 21 | self.sampled = [] # list of Stats 22 | self.count = 0 # Total observations appeared 23 | if abnormal_list is None: 24 | self.abnormal_list = [] 25 | else: 26 | self.abnormal_list = abnormal_list 27 | self._total_count = 0 28 | 29 | def set_total_count(self, total_count): 30 | self._total_count = total_count 31 | 32 | # insert a number 33 | def insert(self, x): 34 | """ 35 | Insert an observation of data. First store in a array buffer. If the buffer is full, 36 | do a batch insert. If the size of sampled list reach compress_thres, compress this list. 
37 | Parameters 38 | ---------- 39 | x : float 40 | The feature value 41 | 42 | """ 43 | if x in self.abnormal_list: 44 | return 45 | 46 | try: 47 | x = float(x) 48 | except ValueError: 49 | return 50 | 51 | self.head_sampled.append(x) 52 | if len(self.head_sampled) >= self.head_size: 53 | self._insert_head_buffer() # clear self.head_sample -> self.sampled 54 | if len(self.sampled) >= self.compress_thres: 55 | self.compress() 56 | 57 | def _insert_head_buffer(self): 58 | if not len(self.head_sampled): # If empty 59 | return 60 | current_count = self.count 61 | sorted_head = sorted(self.head_sampled) 62 | new_sampled = [] 63 | sample_idx = 0 64 | ops_idx = 0 65 | while ops_idx < len(sorted_head): 66 | current_sample = sorted_head[ops_idx] 67 | while sample_idx < len(self.sampled) and self.sampled[sample_idx].value <= current_sample: 68 | new_sampled.append(self.sampled[sample_idx]) 69 | sample_idx += 1 70 | 71 | current_count += 1 72 | 73 | # If it is the first one to insert or if it is the last one 74 | if not new_sampled or (sample_idx == len(self.sampled) and 75 | ops_idx == len(sorted_head) - 1): 76 | delta = 0 77 | else: 78 | # delta = math.floor(2 * self.error * current_count) - 1 79 | delta = math.floor(2 * self.error * current_count) 80 | 81 | new_stats = Stats(current_sample, 1, delta) 82 | new_sampled.append(new_stats) 83 | ops_idx += 1 84 | self.sampled = new_sampled 85 | self.head_sampled = [] 86 | self.count = current_count 87 | 88 | def compress(self): 89 | self._insert_head_buffer() 90 | # merge_threshold = math.floor(2 * self.error * self.count) - 1 91 | merge_threshold = 2 * self.error * self.count 92 | compressed = self._compress_immut(merge_threshold) 93 | self.sampled = compressed 94 | 95 | def merge(self, other): 96 | """ 97 | merge current summeries with the other one. 98 | Parameters 99 | ---------- 100 | other : QuantileSummaries 101 | The summaries to be merged 102 | """ 103 | if other.head_sampled: 104 | # other._insert_head_buffer() 105 | other.compress() 106 | 107 | if self.head_sampled: 108 | # self._insert_head_buffer() 109 | self.compress() 110 | 111 | if other.count == 0: 112 | return self 113 | 114 | if self.count == 0: 115 | self.count = other.count 116 | self.sampled = other.sampled 117 | return self 118 | 119 | # merge two sorted array 120 | new_sample = [] 121 | i, j = 0, 0 122 | while i < len(self.sampled) and j < len(other.sampled): 123 | if self.sampled[i].value < other.sampled[j].value: 124 | new_sample.append(self.sampled[i]) 125 | i += 1 126 | else: 127 | new_sample.append(other.sampled[j]) 128 | j += 1 129 | new_sample += self.sampled[i:] 130 | new_sample += other.sampled[j:] 131 | 132 | self.sampled = new_sample 133 | self.count += other.count 134 | # merge_threshold = math.floor(2 * self.error * self.count) - 1 135 | merge_threshold = 2 * self.error * self.count 136 | 137 | self.sampled = self._compress_immut(merge_threshold) 138 | return self 139 | 140 | def query(self, quantile): 141 | """ 142 | Given the queried quantile, return the approximation guaranteed result 143 | Parameters 144 | ---------- 145 | quantile : float [0.0, 1.0] 146 | The target quantile 147 | 148 | Returns 149 | ------- 150 | float, the corresponding value result. 
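For example, query(0.5) returns an approximate median: a stored sample whose exact rank is close to 0.5 * count, up to the summary's relative rank error.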
151 | """ 152 | if self.head_sampled: 153 | # self._insert_head_buffer() 154 | self.compress() 155 | 156 | if quantile < 0 or quantile > 1: 157 | raise ValueError("Quantile should be in range [0.0, 1.0]") 158 | 159 | if self.count == 0: 160 | return 0 161 | 162 | if quantile <= self.error: 163 | return self.sampled[0].value 164 | 165 | if quantile >= 1 - self.error: 166 | return self.sampled[-1].value 167 | 168 | rank = math.ceil(quantile * self.count) 169 | target_error = math.ceil(self.error * self.count) 170 | min_rank = 0 171 | i = 1 172 | while i < len(self.sampled) - 1: 173 | cur_sample = self.sampled[i] 174 | min_rank += cur_sample.g 175 | max_rank = min_rank + cur_sample.delta 176 | if max_rank - target_error <= rank <= min_rank + target_error: 177 | return cur_sample.value 178 | i += 1 179 | return self.sampled[-1].value 180 | 181 | def _compress_immut(self, merge_threshold): 182 | if not self.sampled: 183 | return self.sampled 184 | 185 | res = [] 186 | 187 | # Start from the last element 188 | head = self.sampled[-1] 189 | i = len(self.sampled) - 2 # Do not merge the last element 190 | 191 | while i >= 1: 192 | this_sample = self.sampled[i] 193 | if this_sample.g + head.g + head.delta < merge_threshold: 194 | head.g = head.g + this_sample.g 195 | else: 196 | res.append(head) 197 | head = this_sample 198 | i -= 1 199 | res.append(head) 200 | 201 | # If head of current sample is smaller than this new res's head 202 | # Add current head into res 203 | current_head = self.sampled[0] 204 | if current_head.value <= head.value and len(self.sampled) > 1: 205 | res.append(current_head) 206 | 207 | # Python do not support prepend, thus, use reverse instead 208 | res.reverse() 209 | return res 210 | 211 | 212 | class SparseQuantileSummaries(QuantileSummaries): 213 | def __init__(self, compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 214 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 215 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 216 | abnormal_list=None): 217 | super(SparseQuantileSummaries, self).__init__(compress_thres, head_size, error, abnormal_list) 218 | 219 | # Compare with the sparse point, static the number of each part. 
220 | self.smaller_num = 0 221 | self.bigger_num = 0 222 | # self._total_count = 0 223 | 224 | def insert(self, x): 225 | if x in self.abnormal_list: 226 | return 227 | if x < baseline_constants.FLOAT_ZERO: 228 | self.smaller_num += 1 229 | elif x >= baseline_constants.FLOAT_ZERO: 230 | self.bigger_num += 1 231 | super(SparseQuantileSummaries, self).insert(x) 232 | 233 | def query(self, quantile): 234 | if self.zero_lower_bound < quantile < self.zero_upper_bound: 235 | return 0.0 236 | 237 | non_zero_quantile = self._convert_query_percentile(quantile) # can be ignored 238 | result = super(SparseQuantileSummaries, self).query(non_zero_quantile) 239 | return result 240 | 241 | def merge(self, other): 242 | self.smaller_num += other.smaller_num 243 | self.bigger_num += other.bigger_num 244 | super(SparseQuantileSummaries, self).merge(other) 245 | return self 246 | 247 | def _convert_query_percentile(self, quantile): 248 | zeros_count = self._total_count - self.count 249 | if zeros_count == 0: 250 | return quantile 251 | 252 | if quantile <= self.zero_lower_bound: 253 | return (self._total_count / self.count) * quantile 254 | 255 | return (quantile - self.zero_upper_bound + self.zero_lower_bound) / ( 256 | 1 - self.zero_upper_bound + self.zero_lower_bound) 257 | 258 | @property 259 | def zero_lower_bound(self): 260 | if self.smaller_num == 0: 261 | return 0.0 262 | return self.smaller_num / self._total_count 263 | 264 | @property 265 | def zero_upper_bound(self): 266 | if self.bigger_num == 0: 267 | return self._total_count 268 | zeros_num = self._total_count - self.smaller_num - self.bigger_num 269 | return (self.smaller_num + zeros_num) / self._total_count -------------------------------------------------------------------------------- /federated_gbdt/core/binning/quantile_binning.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.binning.feature_binning_param import FeatureBinningParam 2 | from federated_gbdt.core.binning.base_binning import BaseBinning 3 | from federated_gbdt.core.binning.quantile_summaries import SparseQuantileSummaries, QuantileSummaries 4 | from federated_gbdt.core.baseline_constants import DEFAULT_RELATIVE_ERROR 5 | 6 | import pandas as pd 7 | import copy 8 | import functools 9 | 10 | 11 | class NoneType: 12 | def __eq__(self, obj): 13 | return isinstance(obj, NoneType) 14 | 15 | def get_split_points(data_inst, is_sparse=False, bin_num=32, 16 | binning_error=DEFAULT_RELATIVE_ERROR, 17 | handle_missing_value=False): 18 | assert isinstance(data_inst, pd.DataFrame) 19 | param_obj = FeatureBinningParam(bin_num=bin_num, error=binning_error) 20 | if handle_missing_value: 21 | binning_obj = QuantileBinning(params=param_obj, abnormal_list=[NoneType()]) 22 | else: 23 | binning_obj = QuantileBinning(params=param_obj) 24 | binning_obj.fit_split_points(data_inst, is_sparse) 25 | #print('split point results have been defined') 26 | return binning_obj.get_split_points_result_numpy() 27 | 28 | def quantile_summary_factory(is_sparse, param_dict): 29 | if is_sparse: 30 | return SparseQuantileSummaries(**param_dict) 31 | else: 32 | return QuantileSummaries(**param_dict) 33 | 34 | 35 | class QuantileBinning(BaseBinning): 36 | """ 37 | After quantile binning, the numbers of elements in each binning are equal. 
38 | 39 | The result of this algorithm has the following deterministic bound: 40 | If the data_instances has N elements and if we request the quantile at probability `p` up to error 41 | `err`, then the algorithm will return a sample `x` from the data so that the *exact* rank 42 | of `x` is close to (p * N). 43 | More precisely, 44 | 45 | {{{ 46 | floor((p - 2 * err) * N) <= rank(x) <= ceil((p + 2 * err) * N) 47 | }}} 48 | 49 | This method implements a variation of the Greenwald-Khanna algorithm (with some speed 50 | optimizations). 51 | """ 52 | 53 | def __init__(self, params: FeatureBinningParam, abnormal_list=None, allow_duplicate=False): 54 | super(QuantileBinning, self).__init__(params, abnormal_list) 55 | self.summary_dict = None 56 | self.allow_duplicate = allow_duplicate 57 | 58 | def fit_split_points(self, data_inst, is_sparse=False): 59 | """ 60 | Apply the binning method 61 | 62 | Parameters 63 | ---------- 64 | sparse_dataseries : Data series 65 | The input sparse vector 66 | 67 | Returns 68 | ------- 69 | split_points : dict. 70 | Each value represent for the split points for a feature. The element in each row represent for 71 | the corresponding split point. 72 | e.g. 73 | split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...], # The first feature 74 | 'x2': [1, 2, 3, 4, ...], # The second feature 75 | ... # Other features 76 | } 77 | """ 78 | if is_sparse: 79 | assert isinstance(data_inst, pd.Series) 80 | header = data_inst.iloc[0].feature_name 81 | else: 82 | assert isinstance(data_inst, pd.DataFrame) 83 | header = list(data_inst.columns) 84 | # if not isinstance(sparse_dataseries, pd.Series): 85 | # raise TypeError('the input data should be data series') 86 | 87 | # LOGGER.debug("in _fit_split_point, cols_map: {}".format(self.bin_inner_param.bin_cols_map)) 88 | 89 | self._default_setting(header) 90 | # self._init_cols(data_instances) 91 | percent_value = 1.0 / self.bin_num 92 | 93 | # calculate the split points 94 | percentile_rate = [i * percent_value for i in range(1, self.bin_num)] 95 | percentile_rate.append(1.0) 96 | 97 | self._fit_split_point(data_inst, is_sparse, percentile_rate) 98 | 99 | # self.fit_category_features(sparse_dataseries) # can be ignored here 100 | return self.bin_results.all_split_points # {fn: [fv_thresholds], ....} 101 | 102 | def get_split_points_result_numpy(self): 103 | return self.bin_results.get_split_points_array(self.bin_inner_param.transform_bin_names) 104 | 105 | @staticmethod 106 | def copy_merge(s1, s2): 107 | new_s1 = copy.deepcopy(s1) 108 | return new_s1.merge(s2) 109 | 110 | def _fit_split_point(self, data_inst, is_sparse, percentile_rate): 111 | if self.summary_dict is None: 112 | f = functools.partial(self.feature_summary, 113 | params=self.params, # FeatureBinningParam(...) 
114 | abnormal_list=self.abnormal_list, 115 | cols_dict=self.bin_inner_param.bin_cols_map, # {bin_name: bin_idx, ...} 116 | header=self.header, 117 | is_sparse=is_sparse) 118 | summary_dict = f(data_inst=data_inst) 119 | summary_dict = dict(summary_dict) 120 | 121 | # LOGGER.debug(f"new summary_dict: {summary_dict}") 122 | total_count = len(data_inst) 123 | for _, summary_obj in summary_dict.items(): 124 | summary_obj.set_total_count(total_count) 125 | 126 | self.summary_dict = summary_dict 127 | else: 128 | summary_dict = self.summary_dict 129 | 130 | for col_name, summary in summary_dict.items(): 131 | split_point = [] 132 | for percen_rate in percentile_rate: 133 | s_p = summary.query(percen_rate) 134 | if not self.allow_duplicate: 135 | if s_p not in split_point: 136 | split_point.append(s_p) 137 | else: 138 | split_point.append(s_p) 139 | self.bin_results.put_col_split_points(col_name, split_point) 140 | 141 | @staticmethod 142 | def feature_summary(data_inst, params, cols_dict, abnormal_list, header, is_sparse): 143 | summary_dict = {} 144 | 145 | summary_param = {'compress_thres': params.compress_thres, 146 | 'head_size': params.head_size, 147 | 'error': params.error, 148 | 'abnormal_list': abnormal_list} 149 | 150 | for col_name, col_index in cols_dict.items(): 151 | quantile_summaries = quantile_summary_factory(is_sparse=is_sparse, param_dict=summary_param) 152 | # quantile_summaries = SparseQuantileSummaries(**summary_param) 153 | summary_dict[col_name] = quantile_summaries 154 | 155 | if is_sparse: 156 | # pd.Series 157 | for sv in data_inst: 158 | data_generator = sv.get_all_data() 159 | for col_idx, col_value in data_generator: 160 | col_name = header[col_idx] 161 | if col_name not in cols_dict: 162 | continue 163 | summary = summary_dict[col_name] 164 | summary.insert(col_value) 165 | else: 166 | # pd.Dataframe 167 | for _, inst in data_inst.iterrows(): 168 | for col_name, summary in summary_dict.items(): 169 | col_index = cols_dict[col_name] 170 | summary.insert(inst[col_index]) 171 | 172 | result = [] 173 | for features_name, summary_obj in summary_dict.items(): 174 | summary_obj.compress() 175 | # result.append(((_, features_name), summary_obj)) 176 | result.append((features_name, summary_obj)) 177 | 178 | return result 179 | 180 | @staticmethod 181 | def _query_split_points(summary, percent_rates): 182 | split_point = [] 183 | for percent_rate in percent_rates: 184 | s_p = summary.query(percent_rate) 185 | if s_p not in split_point: 186 | split_point.append(s_p) 187 | return split_point 188 | 189 | @staticmethod 190 | def approxi_quantile(data_instances, params, cols_dict, abnormal_list, header, is_sparse): 191 | """ 192 | Calculates each quantile information 193 | 194 | Parameters 195 | ---------- 196 | data_instances : DTable 197 | The input data 198 | 199 | cols_dict: dict 200 | Record key, value pairs where key is cols' name, and value is cols' index. 201 | 202 | params : FeatureBinningParam object, 203 | Parameters that user set. 204 | 205 | abnormal_list: list, default: None 206 | Specify which columns are abnormal so that will not static when traveling. 207 | 208 | header: list, 209 | Storing the header information. 210 | 211 | is_sparse: bool 212 | Specify whether data_instance is in sparse type 213 | 214 | Returns 215 | ------- 216 | summary_dict: dict 217 | {'col_name1': summary1, 218 | 'col_name2': summary2, 219 | ... 
220 | } 221 | 222 | """ 223 | 224 | summary_dict = {} 225 | 226 | summary_param = {'compress_thres': params.compress_thres, 227 | 'head_size': params.head_size, 228 | 'error': params.error, 229 | 'abnormal_list': abnormal_list} 230 | 231 | for col_name, col_index in cols_dict.items(): 232 | quantile_summaries = quantile_summary_factory(is_sparse=is_sparse, param_dict=summary_param) 233 | summary_dict[col_name] = quantile_summaries 234 | 235 | QuantileBinning.insert_datas(data_instances, summary_dict, cols_dict, header, is_sparse) 236 | for _, summary_obj in summary_dict.items(): 237 | summary_obj.compress() 238 | return summary_dict 239 | 240 | @staticmethod 241 | def insert_datas(data_instances, summary_dict, cols_dict, header, is_sparse): 242 | 243 | for iter_key, instant in data_instances: 244 | if not is_sparse: 245 | if type(instant).__name__ == 'Instance': 246 | features = instant.features 247 | else: 248 | features = instant 249 | for col_name, summary in summary_dict.items(): 250 | col_index = cols_dict[col_name] 251 | summary.insert(features[col_index]) 252 | else: 253 | data_generator = instant.features.get_all_data() 254 | for col_idx, col_value in data_generator: 255 | col_name = header[col_idx] 256 | summary = summary_dict[col_name] 257 | summary.insert(col_value) 258 | 259 | @staticmethod 260 | def merge_summary_dict(s_dict1, s_dict2): 261 | if s_dict1 is None and s_dict2 is None: 262 | return None 263 | if s_dict1 is None: 264 | return s_dict2 265 | if s_dict2 is None: 266 | return s_dict1 267 | 268 | s_dict1 = copy.deepcopy(s_dict1) 269 | s_dict2 = copy.deepcopy(s_dict2) 270 | 271 | new_dict = {} 272 | for col_name, summary1 in s_dict1.items(): 273 | summary2 = s_dict2.get(col_name) 274 | summary1.merge(summary2) 275 | new_dict[col_name] = summary1 276 | return new_dict 277 | 278 | def query_quantile_point(self, query_points, col_names=None): 279 | 280 | if self.summary_dict is None: 281 | raise RuntimeError("Bin object should be fit before query quantile points") 282 | 283 | if col_names is None: 284 | col_names = self.bin_inner_param.bin_names 285 | 286 | summary_dict = self.summary_dict 287 | 288 | if isinstance(query_points, (int, float)): 289 | query_dict = {} 290 | for col_name in col_names: 291 | query_dict[col_name] = query_points 292 | elif isinstance(query_points, dict): 293 | query_dict = query_points 294 | else: 295 | raise ValueError("query_points has wrong type, should be a float, int or dict") 296 | 297 | result = {} 298 | for col_name, query_point in query_dict.items(): 299 | summary = summary_dict[col_name] 300 | result[col_name] = summary.query(query_point) 301 | return result 302 | 303 | 304 | # class QuantileBinningTool(QuantileBinning): 305 | # """ 306 | # Use for quantile binning data directly. 307 | # """ 308 | # 309 | # def __init__(self, bin_nums=consts.G_BIN_NUM, param_obj: FeatureBinningParam = None, 310 | # abnormal_list=None, allow_duplicate=False): 311 | # if param_obj is None: 312 | # param_obj = FeatureBinningParam(bin_num=bin_nums) 313 | # super().__init__(params=param_obj, abnormal_list=abnormal_list, allow_duplicate=allow_duplicate) --------------------------------------------------------------------------------
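To make the quantile-binning entry point above concrete, here is a minimal usage sketch of `get_split_points` from `quantile_binning.py`; the DataFrame is synthetic and only illustrates the expected input type.

```python
# Illustrative only: pass a pandas DataFrame to get_split_points to obtain
# per-feature split candidates from the Greenwald-Khanna quantile summaries.
import numpy as np
import pandas as pd

from federated_gbdt.core.binning.quantile_binning import get_split_points

df = pd.DataFrame({"x1": np.random.randn(1000), "x2": np.random.rand(1000)})
split_points = get_split_points(df, bin_num=32)  # array of split-point arrays, one per feature
print(len(split_points), "features binned")
```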