├── __init__.py ├── examples ├── __init__.py ├── manual_multiclass.py └── multiclass_example.ipynb ├── experiments ├── __init__.py ├── replication_experiments │ ├── __init__.py │ ├── replication_data │ │ ├── vary_D_Credit 1.pdf │ │ ├── vary_e_Credit 1.pdf │ │ ├── vary_t_Credit 1.pdf │ │ ├── low_eps_bb_Credit 1.pdf │ │ ├── dp_ebm_vary_T_Credit 1.pdf │ │ ├── comparisons_zoom_Credit 1.pdf │ │ ├── split_candidates_vary_Q_Credit 1.pdf │ │ ├── split_candidates_vary_T_Credit 1.pdf │ │ ├── split_candidates_vary_s_Credit 1.pdf │ │ ├── feature_interactions_vary_k_Credit 1.pdf │ │ └── split_candidates_vary_eps_Credit 1.pdf │ └── experiment_replicator.py └── paper_experiments │ └── paper_plots │ ├── vary_D_Bank.pdf │ ├── vary_D_adult.pdf │ ├── vary_D_nomao.pdf │ ├── vary_clients.pdf │ ├── vary_e_Bank.pdf │ ├── vary_e_adult.pdf │ ├── vary_e_nomao.pdf │ ├── vary_t_Bank.pdf │ ├── vary_t_adult.pdf │ ├── vary_t_nomao.pdf │ ├── bubble_plot_full.pdf │ ├── low_eps_bb_Bank.pdf │ ├── low_eps_bb_adult.pdf │ ├── low_eps_bb_nomao.pdf │ ├── vary_D_Credit 1.pdf │ ├── vary_D_Credit 2.pdf │ ├── vary_e_Credit 1.pdf │ ├── vary_e_Credit 2.pdf │ ├── vary_t_Credit 1.pdf │ ├── vary_t_Credit 2.pdf │ ├── dp_ebm_vary_T_Bank.pdf │ ├── dp_ebm_vary_T_adult.pdf │ ├── dp_ebm_vary_T_nomao.pdf │ ├── low_eps_bb_Credit 1.pdf │ ├── low_eps_bb_Credit 2.pdf │ ├── bubble_plot_tree_mean.pdf │ ├── comparisons_zoom_Bank.pdf │ ├── comparisons_zoom_adult.pdf │ ├── comparisons_zoom_nomao.pdf │ ├── dp_ebm_vary_T_Credit 1.pdf │ ├── dp_ebm_vary_T_Credit 2.pdf │ ├── non_dp_ebm_vary_T_Bank.pdf │ ├── low_eps_bb_Credit 1_zoom.pdf │ ├── non_dp_ebm_vary_T_adult.pdf │ ├── non_dp_ebm_vary_T_nomao.pdf │ ├── total_client_computation.pdf │ ├── total_server_computation.pdf │ ├── comparisons_zoom_Credit 1.pdf │ ├── comparisons_zoom_Credit 2.pdf │ ├── non_dp_ebm_vary_T_Credit 1.pdf │ ├── non_dp_ebm_vary_T_Credit 2.pdf │ ├── comparisons_zoom_higgs-sample.pdf │ ├── split_candidates_vary_Q_Bank.pdf │ ├── split_candidates_vary_Q_adult.pdf │ ├── split_candidates_vary_Q_nomao.pdf │ ├── split_candidates_vary_T_Bank.pdf │ ├── split_candidates_vary_T_adult.pdf │ ├── split_candidates_vary_T_nomao.pdf │ ├── split_candidates_vary_s_Bank.pdf │ ├── split_candidates_vary_s_adult.pdf │ ├── split_candidates_vary_s_nomao.pdf │ ├── comparisons_zoom_Credit 1_right.pdf │ ├── feature_interactions_vary_k_Bank.pdf │ ├── split_candidates_vary_Q_Credit 1.pdf │ ├── split_candidates_vary_Q_Credit 2.pdf │ ├── split_candidates_vary_T_Credit 1.pdf │ ├── split_candidates_vary_T_Credit 2.pdf │ ├── split_candidates_vary_eps_Bank.pdf │ ├── split_candidates_vary_eps_adult.pdf │ ├── split_candidates_vary_eps_nomao.pdf │ ├── split_candidates_vary_s_Credit 1.pdf │ ├── split_candidates_vary_s_Credit 2.pdf │ ├── feature_interactions_vary_k_adult.pdf │ ├── feature_interactions_vary_k_nomao.pdf │ ├── split_candidates_vary_eps_Credit 1.pdf │ ├── split_candidates_vary_eps_Credit 2.pdf │ ├── feature_interactions_vary_k_Credit 1.pdf │ ├── feature_interactions_vary_k_Credit 2.pdf │ ├── split_candidates_vary_Q_higgs-sample.pdf │ ├── split_candidates_vary_T_Credit 1_zoom.pdf │ ├── split_candidates_vary_T_higgs-sample.pdf │ ├── split_candidates_vary_s_higgs-sample.pdf │ ├── split_candidates_vary_eps_higgs-sample.pdf │ ├── split_candidates_vary_T_Credit 1_lineplot.pdf │ ├── split_candidates_vary_s_Credit 1_lineplot.pdf │ └── split_candidates_vary_eps_Credit 1_lineplot.pdf ├── federated_gbdt ├── __init__.py ├── core │ ├── __init__.py │ ├── binning │ │ ├── __init__.py │ │ ├── sparse_vector.py │ │ ├── 
bin_inner_param.py │ │ ├── feature_binning_param.py │ │ ├── bin_result.py │ │ ├── quantile_summaries.py │ │ └── quantile_binning.py │ ├── pure_ldp │ │ ├── __init__.py │ │ ├── frequency_oracles │ │ │ ├── hybrid_mechanism │ │ │ │ ├── __init__.py │ │ │ │ ├── hybrid_mech_server.py │ │ │ │ └── hybrid_mech_client.py │ │ │ ├── square_wave │ │ │ │ ├── __init__.py │ │ │ │ ├── sw_client.py │ │ │ │ └── sw_server.py │ │ │ └── local_hashing │ │ │ │ ├── __init__.py │ │ │ │ ├── fast_lh_client.py │ │ │ │ ├── lh_server.py │ │ │ │ ├── lh_client.py │ │ │ │ └── fast_lh_server.py │ │ └── core │ │ │ ├── prob_simplex.py │ │ │ ├── _freq_oracle_client.py │ │ │ ├── __init__.py │ │ │ ├── fo_creator.py │ │ │ └── _freq_oracle_server.py │ ├── moments_accountant │ │ ├── __init__.py │ │ ├── compute_noise_from_budget_lib.py │ │ └── dp_params.py │ ├── dp_multiq │ │ ├── requirements.txt │ │ ├── AUTHORS │ │ ├── README │ │ ├── run.sh │ │ ├── run_experiment.py │ │ ├── smooth.py │ │ ├── base.py │ │ ├── ind_exp.py │ │ ├── smooth_utils.py │ │ ├── joint_exp.py │ │ └── csmooth.py │ ├── README.MD │ ├── baseline_constants.py │ ├── plotting.py │ └── loss_functions.py ├── models │ ├── base │ │ ├── __init__.py │ │ ├── README.MD │ │ ├── tree_node.py │ │ ├── jit_functions.py │ │ └── tree_base.py │ ├── gbdt │ │ ├── __init__.py │ │ └── components │ │ │ ├── __init__.py │ │ │ ├── index_sampler.py │ │ │ └── train_monitor.py │ ├── __init__.py │ └── README.md └── README.md ├── .gitignore ├── requirements.txt └── README.md /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /experiments/replication_experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /federated_gbdt/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .gbdt.private_gbdt import PrivateGBDT 2 | 3 | __all__ = ["PrivateGBDT"] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .idea 3 | __pycache__ 4 | experiments/experiment_data 5 | data 6 | dev 7 | **/pre_paper/** -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/__init__.py: -------------------------------------------------------------------------------- 1 | from .sw_server import SWServer 2 | from .sw_client import SWClient -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py >= 0.1.6 2 | matplotlib >= 3.0.3 3 | pandas >= 1.1.5 4 | numpy >= 1.16.4 5 | scipy >= 1.2.1 6 | -------------------------------------------------------------------------------- /federated_gbdt/models/README.md: -------------------------------------------------------------------------------- 1 | ### Structure: 2 | * ```base``` - Contains base classes for clients, trees and decision nodes 3 | * ``private_gbdt`` - The main DP GBDT framework -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_clients.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_clients.pdf 
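A minimal usage sketch for `PrivateGBDT`, the class exported from `federated_gbdt/models/__init__.py` above. It assumes an sklearn-style `fit`/`predict_proba` interface (consistent with how probabilities are scored in `examples/manual_multiclass.py` later in this dump); the constructor arguments `num_trees` and `epsilon` are illustrative assumptions, since `private_gbdt.py` itself is not reproduced here — check its signature before copying this.

```python
# Sketch only: constructor arguments and return shapes are assumptions (see note above).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from federated_gbdt.models import PrivateGBDT

# Toy binary-classification data standing in for the benchmark datasets
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = PrivateGBDT(num_trees=50, epsilon=1)   # hypothetical argument names
model = model.fit(X_train, y_train)
probs = model.predict_proba(X_test)            # assumed shape: (n_samples, n_classes)
print("Test AUC:", roc_auc_score(y_test, probs[:, 1]))
```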
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/bubble_plot_full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/bubble_plot_full.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_nomao.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_D_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_D_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_e_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_e_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/vary_t_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/vary_t_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_nomao.pdf 
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/low_eps_bb_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/bubble_plot_tree_mean.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/bubble_plot_tree_mean.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/dp_ebm_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Bank.pdf -------------------------------------------------------------------------------- 
/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1_zoom.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/low_eps_bb_Credit 1_zoom.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/total_client_computation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/total_client_computation.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/total_server_computation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/total_server_computation.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/non_dp_ebm_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- 
/experiments/replication_experiments/replication_data/vary_D_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_D_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/vary_e_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_e_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/vary_t_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/vary_t_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_adult.pdf -------------------------------------------------------------------------------- 
/experiments/paper_experiments/paper_plots/split_candidates_vary_T_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_nomao.pdf -------------------------------------------------------------------------------- /federated_gbdt/README.md: -------------------------------------------------------------------------------- 1 | ### Structure: 2 | 3 | * ``core`` - Contains helper functions and structures for XGBoost such as quantile sketches, loss functions and pre-processing methods 4 | * ``models`` - Contains main GBDT models -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1_right.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/comparisons_zoom_Credit 1_right.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_Credit 2.pdf 
-------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Bank.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Bank.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_adult.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/low_eps_bb_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/low_eps_bb_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_adult.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_adult.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_nomao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_nomao.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/dp_ebm_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/dp_ebm_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/__init__.py: -------------------------------------------------------------------------------- 1 | from .lh_server import LHServer 2 | from .lh_client import LHClient 3 | 4 | from .fast_lh_client import FastLHClient 5 | from .fast_lh_server import FastLHServer -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/feature_interactions_vary_k_Credit 2.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_Q_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_Q_higgs-sample.pdf -------------------------------------------------------------------------------- 
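`local_hashing/__init__.py` above exposes the `LHClient`/`LHServer` pair (plus the fast heuristic variants). The sketch below shows the usual frequency-oracle round trip: clients privatise their items locally, the server aggregates the reports and returns de-biased frequency estimates. The keyword arguments follow the `FastLHClient` constructor reproduced later in this dump; the server-side `aggregate`/`estimate` calls are assumed to mirror the `FreqOracleServer` pattern seen in `HMServer` below, so verify against `lh_server.py` before relying on it.

```python
# Sketch of a pure-LDP frequency-oracle round trip using (optimised) local hashing.
# The LHServer interface is an assumption based on the other oracles in this package.
import numpy as np

from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHClient, LHServer

epsilon, d = 3, 4  # privacy budget and domain size; items live in {1, ..., d}
client = LHClient(epsilon=epsilon, d=d, use_olh=True)
server = LHServer(epsilon=epsilon, d=d, use_olh=True)

# Each simulated user privatises one item and sends only the report to the server
rng = np.random.default_rng(0)
data = rng.choice(np.arange(1, d + 1), size=5000, p=[0.4, 0.3, 0.2, 0.1])
for item in data:
    server.aggregate(client.privatise(item))

# Frequency estimates for each domain element (roughly 2000, 1500, 1000, 500 here)
print([server.estimate(i) for i in range(1, d + 1)])
```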
/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_zoom.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_zoom.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/comparisons_zoom_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/comparisons_zoom_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_higgs-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_higgs-sample.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_T_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_s_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1_lineplot.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/paper_experiments/paper_plots/split_candidates_vary_eps_Credit 1_lineplot.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_Q_Credit 1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_Q_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_T_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_T_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_s_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_s_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/feature_interactions_vary_k_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/feature_interactions_vary_k_Credit 1.pdf -------------------------------------------------------------------------------- /experiments/replication_experiments/replication_data/split_candidates_vary_eps_Credit 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Samuel-Maddock/federated-boosted-dp-trees/HEAD/experiments/replication_experiments/replication_data/split_candidates_vary_eps_Credit 1.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | autodp==0.2 2 | bitarray==2.6.2 3 | fast_histogram==0.11 4 | matplotlib==3.6.2 5 | numba==0.56.4 6 | numpy==1.23.5 7 | pandas==1.5.2 8 | pathos==0.3.0 9 | pmlb==1.0.1.post3 10 | scikit_learn==1.2.0 11 | scipy==1.10.0 12 | seaborn==0.12.2 13 | six==1.16.0 14 | statsmodels==0.13.5 15 | tqdm==4.64.1 16 | xxhash==3.2.0 17 | notebook==6.5.2 -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of authors for copyright purposes. 2 | # 3 | # This does not necessarily list everyone who has contributed code, since in 4 | # some cases, their employer may be the copyright holder. To see the full list 5 | # of contributors, see the revision history in source control. 
6 | Google LLC 7 | Jennifer Gillenwater 8 | Matthew Joseph 9 | Alex Kulesza -------------------------------------------------------------------------------- /federated_gbdt/models/base/README.MD: -------------------------------------------------------------------------------- 1 | ### File Structure: 2 | 3 | * ``client.py`` - Leftover code from FEVERLESS, used in ``federated_xgboost.models.vertical`` 4 | * ``tree_base.py`` - Base class for the XGBoost model, modified from FEVERLESS code 5 | * ``tree_node.py`` - Base class for a node in a decision tree, modified from FEVERLESS code 6 | * ``tree_params.py`` - Leftover code from FEVERLESS, used in ``federated_xgboost.models.vertical`` -------------------------------------------------------------------------------- /federated_gbdt/core/README.MD: -------------------------------------------------------------------------------- 1 | ### Structue of federated_xgboost.core: 2 | 3 | * ``binning`` - Binning/Quantile Sketching implementation from FATE 4 | * ``dp_multiq`` - Google implementation of multi-dimensional DP quantiles (see https://arxiv.org/abs/2102.08244) 5 | * ``moments_accountant`` - RDP moments accountant implementation (of the sampled gaussian mechanism (SGM)) from tensorflow.privacy 6 | * ``pure_ldp`` - Various implementations of LDP protocols 7 | * ``baseline_constants.py`` - Leftover code from FEVERLESS, contains parameters for quantile sketching -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/prob_simplex.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | #simplex projection 4 | def project_probability_simplex(p_estimate): 5 | k = len(p_estimate) # Infer the size of the alphabet. 6 | p_estimate_sorted = np.sort(p_estimate) 7 | p_estimate_sorted[:] = p_estimate_sorted[::-1] 8 | p_sorted_cumsum = np.cumsum(p_estimate_sorted) 9 | i = 1 10 | while i < k: 11 | if p_estimate_sorted[i] + (1.0 / (i + 1)) * (1 - p_sorted_cumsum[i]) < 0: 12 | break 13 | i += 1 14 | lmd = (1.0 / i) * (1 - p_sorted_cumsum[i - 1]) 15 | return np.maximum(p_estimate + lmd, 0) -------------------------------------------------------------------------------- /federated_gbdt/core/binning/sparse_vector.py: -------------------------------------------------------------------------------- 1 | # sparse vector ------------------------------------------------------ 2 | class SparseVector: 3 | """ 4 | sparse vector: dict, record (indices, data) kv tuples 5 | shape: the real feature shape of data 6 | """ 7 | def __init__(self, indices=None, fn=None, data=None, shape=0): 8 | self.sparse_vec = dict(zip(indices, data)) 9 | self.feature_name = fn 10 | self.shape = shape 11 | 12 | def get_all_data(self): 13 | for idx, data in self.sparse_vec.items(): 14 | yield idx, data 15 | 16 | def get_shape(self): 17 | return self.shape 18 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/README: -------------------------------------------------------------------------------- 1 | First download the Goodreads dataset from Kaggle: 2 | https://www.kaggle.com/jealousleopard/goodreadsbooks. 3 | Save it as "books.csv" in this directory. Then use 4 | 5 | > cd .. 6 | > python -m dp_multiq.run_experiment 7 | 8 | to run a small number of trials for each of the four data regimes described in 9 | the paper. This will produce and save plots with names eps_1_error_[data].png 10 | and eps_1_times_[data].png. 
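Stepping back to `prob_simplex.py`, reproduced a little earlier: `project_probability_simplex` is the standard Euclidean projection onto the probability simplex. The sketch below shows the likely intended use — post-processing a noisy frequency-estimate vector (which may have negative entries and need not sum to one) into the closest valid distribution. That framing is a reading of the code rather than something stated in the file.

```python
# Usage sketch for project_probability_simplex (defined in prob_simplex.py above).
import numpy as np

from federated_gbdt.core.pure_ldp.core.prob_simplex import project_probability_simplex

noisy_estimate = np.array([0.52, -0.07, 0.31, 0.18])  # e.g. de-biased LDP estimates
p = project_probability_simplex(noisy_estimate)

print(p)        # all entries are non-negative
print(p.sum())  # sums to 1 (up to floating point)
```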
11 | 12 | To fully replicate the experiments from the paper, change the default parameters 13 | in the experiment() function to 14 | 15 | num_samples=1000 16 | num_quantiles_range=range(1, 30) 17 | est_num_trials=20 18 | ts_num_trials=5 -------------------------------------------------------------------------------- /federated_gbdt/core/baseline_constants.py: -------------------------------------------------------------------------------- 1 | 2 | ACCURACY_KEY = 'accuracy' 3 | BYTES_WRITTEN_KEY = 'bytes_written' 4 | BYTES_READ_KEY = 'bytes_read' 5 | LOCAL_COMPUTATIONS_KEY = 'local_computations' 6 | NUM_ROUND_KEY = 'round_number' 7 | NUM_SAMPLES_KEY = 'num_samples' 8 | CLIENT_ID_KEY = 'client_id' 9 | 10 | FLOAT_ZERO = 1e-8 11 | QUANTILE = 'quantile' 12 | DEFAULT_COMPRESS_THRESHOLD = 10000 13 | DEFAULT_HEAD_SIZE = 10000 14 | DEFAULT_RELATIVE_ERROR = 0.001 15 | G_BIN_NUM = 10 16 | MAX_SPLIT_NODES = 2 ** 16 17 | SECURE_AGG_AMPLIFY_FACTOR = 1000 18 | 19 | MAX_CLASSNUM = 1000 20 | 21 | LABEL_NAME = ['default.payment.next.month', 'SeriousDlqin2yrs', 'y', 'y_yes', 'Appliances'] 22 | 23 | CLASSIFICATION = 'classification' 24 | REGRESSION = 'regression' 25 | CLUSTERING = 'clustering' 26 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The Google Research Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | virtualenv -p python3 . 20 | source ./bin/activate 21 | 22 | pip install -r requirements.txt 23 | pushd .. 
24 | python3 -m dp_multiq.run_experiment 25 | popd 26 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/hybrid_mech_server.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 2 | import numpy as np 3 | 4 | 5 | class HMServer(FreqOracleServer): 6 | def __init__(self, epsilon, d, index_mapper=None): 7 | super().__init__(epsilon, d, index_mapper=index_mapper) 8 | self.update_params(epsilon, d, index_mapper=index_mapper) 9 | self.aggregated_data = [] 10 | 11 | def update_params(self, epsilon=None, d=None, index_mapper=None): 12 | super().update_params(epsilon, d, index_mapper) 13 | 14 | def aggregate(self, priv_data): 15 | self.aggregated_data.append(priv_data) 16 | self.n += 1 17 | 18 | def _update_estimates(self): 19 | mean = np.mean(self.aggregated_data) 20 | return mean 21 | 22 | def estimate(self, suppress_warnings=False): 23 | """ 24 | Calculates a frequency estimate of the given data item 25 | Args: 26 | data: data item 27 | suppress_warnings: Optional boolean - Supresses warnings about possible inaccurate estimations 28 | Returns: float - frequency estimate 29 | """ 30 | self.check_warnings(suppress_warnings=suppress_warnings) 31 | return self._update_estimates() 32 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/sw_client.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 2 | import numpy as np 3 | import random 4 | import math 5 | 6 | 7 | class SWClient(FreqOracleClient): 8 | def __init__(self, epsilon, index_mapper=None): 9 | super().__init__(epsilon=epsilon, d=None, index_mapper=index_mapper) 10 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 11 | 12 | def update_params(self, epsilon=None, d=None, index_mapper=None): 13 | super().update_params(epsilon, d, index_mapper) 14 | ee = np.exp(self.epsilon) 15 | if epsilon is not None or d is not None: 16 | self.b = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) 17 | self.p = ee / ((2 * self.b * ee) + 1) 18 | self.q = 1 / ((2 * self.b * ee) + 1) 19 | 20 | def _perturb(self, data): 21 | if random.random() <= 2 * self.b * self.p: 22 | perturbed_val = random.uniform(data - self.b, data + self.b) 23 | else: 24 | if random.random() < 0.5: 25 | perturbed_val = random.uniform(-self.b, data - self.b) 26 | else: 27 | perturbed_val = random.uniform(data + self.b, 1 + self.b) 28 | 29 | return perturbed_val 30 | 31 | def privatise(self, data): 32 | # index = self.index_mapper(data) 33 | return self._perturb(data) 34 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/run_experiment.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Script for running experiments and saving the resulting plots.""" 17 | 18 | import warnings 19 | 20 | from dp_multiq import experiment 21 | 22 | # To suppress RuntimeWarnings about log(0). This quantity occurs frequently in 23 | # our code, and doesn't mean that anything is going wrong; numpy.log(0) will 24 | # produce -numpy.inf, which our code handles appropriately. 25 | warnings.simplefilter("ignore", category=RuntimeWarning) 26 | 27 | experiment.experiment(methods=[ 28 | experiment.QuantilesEstimationMethod.JOINT_EXP, experiment 29 | .QuantilesEstimationMethod.IND_EXP, experiment.QuantilesEstimationMethod 30 | .APP_IND_EXP, experiment.QuantilesEstimationMethod.SMOOTH, 31 | experiment.QuantilesEstimationMethod.CSMOOTH, 32 | experiment.QuantilesEstimationMethod.LAP_TREE 33 | ]) 34 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/fast_lh_client.py: -------------------------------------------------------------------------------- 1 | import random 2 | from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHClient 3 | 4 | # Client-side for fast local-hashing 5 | # Heuristic fast variant of OLH 6 | 7 | class FastLHClient(LHClient): 8 | def __init__(self, epsilon, d, k, g=2, use_olh=False, index_mapper=None): 9 | """ 10 | Fast heuristic version of OLH 11 | 12 | Args: 13 | epsilon: float - The privacy budget 14 | g: Optional integer - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 15 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing (OLH) i.e g is set to round(e^epsilon + 1) 16 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 17 | """ 18 | self.k = k 19 | super().__init__(epsilon, d, g, use_olh, index_mapper) 20 | 21 | if k is not None: 22 | self.k = k 23 | 24 | def update_params(self, epsilon=None, d=None, k=None, use_olh=None, g=None, index_mapper=None): 25 | super().update_params(epsilon, d, use_olh, g, index_mapper) 26 | self.k = k if k is not None else self.k 27 | 28 | def privatise(self, data): 29 | """ 30 | Privatises a user's data using fast local hashing (FLH) 31 | 32 | Args: 33 | data: The data to be privatised 34 | 35 | Returns: 36 | privatised data: a single integer 37 | """ 38 | 39 | seed = random.randint(0, self.k-1) 40 | return self._perturb(data, seed), seed 41 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/_freq_oracle_client.py: -------------------------------------------------------------------------------- 1 | 2 | class FreqOracleClient: 3 | def __init__(self, epsilon, d, index_mapper=None): 4 | """ 5 | 6 | Args: 7 | epsilon (float): Privacy budget 8 | d (int): domain size - not all freq oracles need this, so can be None 9 | index_mapper (func): Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 10 | """ 11 | self.epsilon = 
epsilon 12 | self.d = d 13 | 14 | if index_mapper is None: 15 | self.index_mapper = lambda x: x - 1 16 | else: 17 | self.index_mapper = index_mapper 18 | 19 | def update_params(self, epsilon=None, d=None, index_mapper=None): 20 | """ 21 | Method to update params of freq oracle client, should be overridden if more options needed. 22 | Args: 23 | epsilon (optional float): Privacy budget 24 | d (optional int): Domain size 25 | index_mapper (optional func): Index map function 26 | """ 27 | self.epsilon = epsilon if epsilon is not None else self.epsilon 28 | self.d = d if d is not None else self.d 29 | self.index_mapper = index_mapper if index_mapper is not None else self.index_mapper 30 | 31 | def _perturb(self, data): 32 | """ 33 | Used internally to perturb raw data, must be implemented by a FreqOracle 34 | Args: 35 | data: user's data item 36 | """ 37 | raise NotImplementedError("Must implement") 38 | 39 | def privatise(self, data): 40 | """ 41 | Public facing method to privatise user's data 42 | Args: 43 | data: user's data item 44 | """ 45 | raise NotImplementedError("Must implement") -------------------------------------------------------------------------------- /federated_gbdt/models/base/tree_node.py: -------------------------------------------------------------------------------- 1 | from numba.experimental import jitclass 2 | from numba import float32, int32, deferred_type, optional, types 3 | 4 | # Node of decision tree, recursive model 5 | 6 | node_type = deferred_type() # numba 7 | 8 | array_type = types.float32[:] 9 | 10 | spec = [ # numba 11 | ('feature_i', int32), 12 | ('threshold', optional(float32)), 13 | # ('value', optional(float32)), 14 | ('value', optional(array_type)), 15 | ('true_branch', optional(node_type)), 16 | ('false_branch', optional(node_type)), 17 | ('split_gain', optional(float32)), 18 | ('hessian_sum', optional(float32)), 19 | ('gradient_sum', optional(float32)), 20 | ('num_observations', optional(int32)), 21 | ('depth', optional(int32)), 22 | ] 23 | 24 | # @jitclass(spec) # numba 25 | class DecisionNode: 26 | def __init__(self, node_id="empty", feature_i=None, threshold=None, 27 | value=None, true_branch=None, false_branch=None, split_gain=None, hessian_sum=None, gradient_sum=None, num_observations=None, depth=None): 28 | 29 | self.feature_i = feature_i # Index for feature that is split on 30 | self.threshold = threshold # Split candidate value 31 | self.value = value # Value if the node is a leaf in the tree 32 | self.true_branch = true_branch # Left subtree 33 | self.false_branch = false_branch # Right subtree 34 | self.node_id = node_id # Node id for post-training stats 35 | 36 | # Needed for feature importance calculations 37 | self.split_gain = split_gain 38 | self.hessian_sum = hessian_sum 39 | self.gradient_sum = gradient_sum 40 | self.num_observations = num_observations 41 | self.depth = depth 42 | 43 | # node_type.define(DecisionNode.class_type.instance_type) # numba -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/__init__.py: -------------------------------------------------------------------------------- 1 | import xxhash 2 | import hashlib 3 | from bitarray import bitarray 4 | 5 | # Base classes for frequency oracles 6 | from ._freq_oracle_client import FreqOracleClient 7 | from ._freq_oracle_server import FreqOracleServer 8 | 9 | # Helper functions for generating hash funcs 10 | 11 | def generate_hash_funcs(k, m): 12 | """ 13 | Generates k hash functions that map data to the 
range {0, 1,..., m-1} 14 | Args: 15 | k: The number of hash functions 16 | m: The domain {0,1,...,m-1} that hash func maps to 17 | Returns: List of k hash functions 18 | """ 19 | hash_funcs = [] 20 | for i in range(0, k): 21 | hash_funcs.append(generate_hash(m, i)) 22 | return hash_funcs 23 | 24 | 25 | def generate_256_hash(): 26 | """ 27 | 28 | Returns: A hash function that maps data to {0,1,... 255} 29 | 30 | """ 31 | return lambda data: xxhash.xxh64(data, seed=10).intdigest() % 256 32 | 33 | 34 | def generate_hash(m, seed): 35 | """ 36 | Generate a single hash function that maps data to {0, ... ,m-1} 37 | Args: 38 | m: int domain to map to 39 | seed: int the seed for the hash function 40 | 41 | Returns: Hash function 42 | 43 | """ 44 | return lambda data: xxhash.xxh64(str(data), seed=seed).intdigest() % m 45 | 46 | 47 | def get_sha256_hash_arr(hashId, dataString): 48 | """ 49 | Used in priv_count_sketch freq oracle for hashing 50 | Args: 51 | hashId: seed of the hash 52 | dataString: data string to hash 53 | 54 | Returns: hashed data as a bitarray 55 | 56 | """ 57 | message = hashlib.sha256() 58 | 59 | message.update((str(hashId) + dataString).encode("utf8")) 60 | 61 | message_in_bytes = message.digest() 62 | 63 | message_in_bit_array = bitarray(endian='little') 64 | message_in_bit_array.frombytes(message_in_bytes) 65 | 66 | return message_in_bit_array 67 | -------------------------------------------------------------------------------- /examples/manual_multiclass.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("../") 4 | 5 | from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT 6 | from federated_gbdt.core.loss_functions import SoftmaxCrossEntropyLoss 7 | from experiments.experiment_helpers.data_loader import DataLoader 8 | from sklearn.metrics import roc_auc_score, accuracy_score 9 | from sklearn.preprocessing import OneHotEncoder 10 | import numpy as np 11 | 12 | from xgboost import XGBClassifier 13 | 14 | # Load connect4 dataset 15 | dataloader = DataLoader() 16 | X_train, X_test, y_train, y_test = dataloader.load_datasets( 17 | ["connect_4"], return_dict=False 18 | )[0] 19 | onehot_y_test = OneHotEncoder(sparse_output=False).fit_transform(y_test.reshape(-1, 1)) 20 | 21 | # XGBoost baseline 22 | xgb = XGBClassifier().fit(X_train, y_train) 23 | xgb_probs = xgb.predict_proba(X_test) 24 | xgb_pred = np.argmax(xgb_probs, axis=1) 25 | print(f"XGBoost AUC - {roc_auc_score(onehot_y_test, xgb_probs)}") 26 | print(f"XGBoost Accuracy - {accuracy_score(y_test, xgb_pred)}") 27 | print("\n") 28 | 29 | # PrivateGBDT (one-vs-all, total privacy budget eps=3) 30 | C = len(np.unique(y_train)) # C=3 classes for connect4 31 | total_eps = 3 32 | # split the total privacy budget evenly across the C one-vs-all models 33 | class_eps = total_eps / C 34 | class_probs = [] 35 | for c in range(0, C): 36 | print(f"Training model... 
class {c} vs all") 37 | dp_method = "" if class_eps == 0 else "gaussian_cdp" 38 | xgb_model = PrivateGBDT(num_trees=100, epsilon=class_eps, dp_method=dp_method) 39 | y_train_c = (y_train == c).astype(int) # one-vs-all for class k 40 | xgb_model = xgb_model.fit(X_train, y_train_c) 41 | class_probs.append(xgb_model.predict_proba(X_test)[:, 1]) 42 | y_probs = SoftmaxCrossEntropyLoss().predict(np.array(list(zip(*class_probs)))) 43 | y_pred = np.argmax(y_probs, axis=1) 44 | print( 45 | f"PrivateGBDT (epsilon={total_eps}) AUC - {roc_auc_score(onehot_y_test, y_probs)}" 46 | ) 47 | print(f"PrivateGBDT (epsilon={total_eps}) Accuracy - {accuracy_score(y_test, y_pred)}") 48 | -------------------------------------------------------------------------------- /federated_gbdt/models/base/jit_functions.py: -------------------------------------------------------------------------------- 1 | import numba 2 | import math 3 | 4 | @numba.jit(nopython=True) 5 | def _L1_clip(total_grads, reg_alpha): 6 | """ 7 | L1 regularisation on the gradients, controlled by self.reg_alpha 8 | 9 | :param total_grads: 10 | :return: 11 | """ 12 | if total_grads > reg_alpha: 13 | return total_grads - reg_alpha 14 | elif total_grads < -1 * reg_alpha: 15 | return total_grads + reg_alpha 16 | else: 17 | return 0 18 | 19 | 20 | @numba.jit(nopython=True) 21 | def _calculate_weight(total_grads, total_hess, reg_alpha, reg_delta, reg_lambda): 22 | """ 23 | Calculates weight for leaf nodes 24 | 25 | :param total_grads: Total sum of gradients 26 | :param total_hess: Total sum of hessians 27 | :return: Weight for leaf node 28 | """ 29 | if total_hess < 0: 30 | total_hess = 0 31 | 32 | weight = -1 * (_L1_clip(total_grads, reg_alpha) / (total_hess + reg_lambda)) 33 | if reg_delta != 0 and abs(weight) > reg_delta: 34 | return math.copysign(reg_delta, weight) # Delta clipping 35 | else: 36 | return weight 37 | 38 | 39 | @numba.jit(nopython=True) 40 | def _calculate_gain(total_grads, total_hess, reg_alpha, reg_delta, reg_lambda): 41 | """ 42 | Calculates gain from sum of gradients and sum of hessians 43 | 44 | :param total_grads: Sum of gradients 45 | :param total_hess: Sum of hessians 46 | :return: Gain score 47 | """ 48 | con = _L1_clip(total_grads, reg_alpha) 49 | weight = -1 * (con / (total_hess + reg_lambda)) 50 | if reg_delta != 0 and abs(weight) > reg_delta: # If delta-clipping is enabled the gain calculation is a little more complicated, following the implementation in the original XGBoost: https://github.com/dmlc/xgboost/blob/d7d1b6e3a6e2aa8fcb1857bf5e3188302a03b399/src/tree/param.h 51 | weight = math.copysign(reg_delta, weight) # Delta clipping 52 | return -(2 * total_grads * weight + (total_hess + reg_lambda) * weight ** 2) + reg_alpha * abs(weight) # This is an L1-regularised clipped gain calculation 53 | else: 54 | return -weight * con # G^2/H + lambda, with possible L1 regularisation and delta clipping on G 55 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/smooth.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Smooth sensitivity method for computing differentially private quantiles. 17 | 18 | Lemmas 2.6 and 2.9 from "Smooth Sensitivity and Sampling in Private Data 19 | Analysis" by Nissim, Radkhodnikova, and Smith 20 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf) describe the 21 | noise scaled to the smooth sensitivity. 22 | """ 23 | 24 | import numpy as np 25 | 26 | from dp_multiq import base 27 | from dp_multiq import smooth_utils 28 | 29 | 30 | def smooth(sorted_data, data_low, data_high, qs, divided_eps, divided_delta): 31 | """Returns (eps, delta)-differentially private quantile estimates for qs. 32 | 33 | Args: 34 | sorted_data: Array of data points sorted in increasing order. 35 | data_low: Lower limit for any differentially private quantile output value. 36 | data_high: Upper limit for any differentially private quantile output value. 37 | qs: Increasing array of quantiles in [0,1]. 38 | divided_eps: Privacy parameter epsilon, assumed to be already divided for 39 | the desired overall eps. 40 | divided_delta: Privacy parameter delta, assumed to be already divided for 41 | the desired overall delta. 42 | """ 43 | sorted_data = np.clip(sorted_data, data_low, data_high) 44 | o = np.empty(len(qs)) 45 | n = len(sorted_data) 46 | alpha = divided_eps / 2.0 47 | beta = divided_eps / (2 * np.log(2 / divided_delta)) 48 | for i in range(len(qs)): 49 | true_quantile_idx = base.quantile_index(n, qs[i]) 50 | true_quantile_value = sorted_data[true_quantile_idx] 51 | log_sensitivity = smooth_utils.compute_log_smooth_sensitivity( 52 | sorted_data, data_low, data_high, true_quantile_idx, beta) 53 | noise = np.exp(log_sensitivity) * np.random.laplace() / alpha 54 | o[i] = true_quantile_value + noise 55 | o = np.clip(o, data_low, data_high) 56 | return np.sort(o) 57 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/fo_creator.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.frequency_oracles import * 2 | import copy 3 | import inspect 4 | 5 | 6 | # Used to create a list of possible frequency oracles in the pure-LDP library 7 | 8 | client_class_list = [] 9 | server_class_list = [] 10 | globs = list(globals().keys()).copy() # Create copy, since globals updates too much to iterate through 11 | 12 | for key in globs: 13 | if "Client" in key: 14 | client_class_list.append(key.replace("Client", "")) 15 | if "Server" in key: 16 | server_class_list.append(key.replace("Server", "")) 17 | 18 | class_list = {"Client": client_class_list, "Server": server_class_list} 19 | 20 | 21 | def _create_pure_fo_instance(obj_type, name, obj_params): 22 | """ 23 | Used internally to create instances of various Client/Server frequency oracles 24 | 25 | Args: 26 | obj_type (str): Either "Client" or "Server" 27 | name: Name of the frequency oracle to create 28 | obj_params: Parameters for the frequency oracle object 29 | 30 | Returns: Instance of name + obj_type frequency oracle 31 | 32 | """ 33 | fo_list = 
class_list[obj_type] 34 | 35 | split = name.split("_") # Get prefix of client name i.e if passed "local_hashing" get "LH" as prefix 36 | 37 | if len(split) > 1: 38 | name = "" 39 | for word in split: 40 | name += word[0] 41 | 42 | name = name.upper() 43 | if name == "HR": name = "HADAMARDRESPONSE" 44 | 45 | upper_fo_list = list(map(lambda x: x.upper(), fo_list)) 46 | 47 | if name not in upper_fo_list: 48 | raise ValueError("Frequency oracle must be one of:", fo_list, 49 | "\n NOTE: Values are case insensitive") 50 | 51 | fo_name = client_class_list[upper_fo_list.index(name)] + obj_type 52 | 53 | constructor = globals().get(fo_name) 54 | expected_params = list(inspect.signature(constructor).parameters) 55 | 56 | params = dict( 57 | (key.split("=")[0], obj_params[key.split("=")[0]]) for key in expected_params if key in obj_params.keys()) 58 | 59 | return constructor(**params) 60 | 61 | 62 | def create_fo_client_instance(name, client_params): 63 | """ 64 | Given a name of a frequency oracle creates a client instance of it 65 | 66 | Args: 67 | name: Name of frequency oracle (i.e LH, HE) 68 | client_params: The parameters for the client frequency oracle object 69 | 70 | Returns: A frequency oracle instance of nameClient 71 | 72 | """ 73 | return _create_pure_fo_instance("Client", name, client_params) 74 | 75 | 76 | def create_fo_server_instance(name, server_params): 77 | """ 78 | Given a name of a frequency oracle creates a server instance of it 79 | 80 | Args: 81 | name: Name of frequency oracle (i.e LH, HE) 82 | server_params: The parameters for the server frequency oracle 83 | 84 | Returns: A frequency oracle instance of nameServer 85 | 86 | """ 87 | return _create_pure_fo_instance("Server", name, server_params) 88 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/base.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Basic methods for generating data and computing non-private quantiles.""" 17 | 18 | import math 19 | import numpy as np 20 | 21 | 22 | def quantile_index(n, quantile): 23 | """Returns index of the specified quantile in a sorted dataset of n elements. 24 | 25 | Args: 26 | n: Size of the sorted dataset. 27 | quantile: A value in [0, 1] indicating the desired quantile. 28 | 29 | Returns: 30 | Index of the specified quantile. If the quantile is between points at 31 | indices i and i+1, returns i. 32 | """ 33 | return int(math.floor((n - 1) * quantile)) 34 | 35 | 36 | def quantiles(data, qs): 37 | """Returns quantile estimates for qs. 38 | 39 | Args: 40 | data: A dataset sorted in increasing order. 41 | qs: Increasing array of quantiles in [0,1]. 
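    Example (illustrative note, not part of the original source): for
    data = np.array([1., 2., 3., 4., 5.]) and qs = np.array([0.25, 0.5]), the
    'lower' interpolation picks existing data points and returns array([2., 3.]),
    i.e. the values at indices quantile_index(5, 0.25) = 1 and
    quantile_index(5, 0.5) = 2.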
42 | """ 43 | return np.quantile(data, qs, interpolation='lower') 44 | 45 | 46 | def misclassified_points_error(sorted_data, true_quantiles, est_quantiles): 47 | """Returns the average number of data points between true and est quantiles. 48 | 49 | Args: 50 | sorted_data: A dataset sorted in increasing order. 51 | true_quantiles: Ground truth quantiles. 52 | est_quantiles: Estimated quantiles. 53 | 54 | Returns: 55 | The number of data points strictly between true_quantiles[j] and 56 | est_quantiles[j], averaged over all j. 57 | """ 58 | total_missed = 0 59 | num_quantiles = len(true_quantiles) 60 | for q_idx in range(num_quantiles): 61 | total_missed += np.abs( 62 | np.sum(sorted_data > true_quantiles[q_idx]) - 63 | np.sum(sorted_data > est_quantiles[q_idx])) 64 | return total_missed / num_quantiles 65 | 66 | 67 | def distance_error(true_quantiles, est_quantiles): 68 | """Returns the mean distance between the true and estimated quantiles. 69 | 70 | Args: 71 | true_quantiles: Ground truth quantiles. 72 | est_quantiles: Estimated quantiles. 73 | """ 74 | return np.mean(np.abs(true_quantiles - est_quantiles)) 75 | 76 | 77 | def gen_gaussian(num_samples, mean, stddev): 78 | """Returns num_samples iid Gaussian samples in increasing order. 79 | 80 | Args: 81 | num_samples: Number of samples to return. 82 | mean: Mean of Gaussian distribution to sample. 83 | stddev: Standard deviation of Gaussian distribution to sample. 84 | """ 85 | return np.sort(np.random.normal(loc=mean, scale=stddev, size=num_samples)) 86 | 87 | 88 | def gen_uniform(num_samples, data_low, data_high): 89 | """Returns num_samples iid uniform samples in increasing order. 90 | 91 | Args: 92 | num_samples: Number of samples to return. 93 | data_low: Lower bound of uniform distribution to sample. 94 | data_high: Upper bound of uniform distribution to sample. 
95 | """ 96 | return np.sort( 97 | np.random.uniform(low=data_low, high=data_high, size=num_samples)) 98 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/lh_server.py: -------------------------------------------------------------------------------- 1 | import math 2 | import xxhash 3 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 4 | 5 | # Server-side for local-hashing 6 | 7 | # Loosely based on https://github.com/vvv214/LDP_Protocols/blob/master/olh.py 8 | 9 | class LHServer(FreqOracleServer): 10 | def __init__(self, epsilon, d, g=2, use_olh=False, index_mapper=None): 11 | """ 12 | 13 | Args: 14 | epsilon: float - The privacy budget 15 | d: integer - Size of the data domain 16 | g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 17 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing i.e g is set to round(e^epsilon + 1) 18 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 19 | """ 20 | super().__init__(epsilon, d, index_mapper=index_mapper) 21 | self.set_name("LHServer") 22 | self.g = g 23 | self.use_olh = use_olh 24 | self.update_params(epsilon=epsilon, d=d, g=g, index_mapper=index_mapper) 25 | 26 | def update_params(self, epsilon=None, d=None, use_olh=None, g=None, index_mapper=None): 27 | """ 28 | Updates LHServer parameters, will reset any aggregated/estimated data 29 | Args: 30 | epsilon: optional - privacy budget 31 | d: optional - domain size 32 | g: optional - hash domain 33 | index_mapper: optional - function 34 | """ 35 | super().update_params(epsilon, d, index_mapper) 36 | 37 | # If use_olh is true, then update the g parameter 38 | if use_olh is not None: 39 | self.use_olh = use_olh 40 | 41 | self.g = g if g is not None else self.g 42 | if self.use_olh is True: 43 | self.g = int(round(math.exp(self.epsilon))) + 1 44 | 45 | # Update probs and g 46 | if epsilon is not None: 47 | self.p = math.exp(self.epsilon) / (math.exp(self.epsilon) + self.g - 1) 48 | 49 | def aggregate(self, priv_data): 50 | """ 51 | Aggregates privatised data from LHClient to be used to calculate frequency estimates. 52 | 53 | Args: 54 | priv_data: Privatised data of the form returned from UEClient.privatise 55 | """ 56 | seed = priv_data[1] 57 | priv_data = priv_data[0] 58 | 59 | for i in range(0, self.d): 60 | if priv_data == (xxhash.xxh32(str(i), seed=seed).intdigest() % self.g): 61 | self.aggregated_data[i] += 1 62 | 63 | self.n += 1 64 | 65 | def _update_estimates(self): 66 | a = self.g / (self.p * self.g - 1) 67 | b = self.n / (self.p * self.g - 1) 68 | 69 | self.estimated_data = a * self.aggregated_data - b 70 | return self.estimated_data 71 | 72 | def estimate(self, data, suppress_warnings=False): 73 | """ 74 | Calculates a frequency estimate of the given data item using the aggregated data. 
75 | 76 | Args: 77 | data: data item 78 | suppress_warnings: Optional boolean - Suppresses warnings about possible inaccurate estimations 79 | 80 | Returns: float - frequency estimate of the data item 81 | 82 | """ 83 | self.check_warnings(suppress_warnings=suppress_warnings) 84 | index = self.index_mapper(data) 85 | self.check_and_update_estimates() 86 | return self.estimated_data[index] 87 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/lh_client.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import xxhash 4 | from sys import maxsize 5 | import random 6 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 7 | 8 | # Client-side for local-hashing 9 | 10 | # Very loosely based on code by Wang (https://github.com/vvv214/LDP_Protocols/blob/master/olh.py) 11 | 12 | class LHClient(FreqOracleClient): 13 | def __init__(self, epsilon, d, g=2, use_olh=False, index_mapper=None): 14 | """ 15 | 16 | Args: 17 | epsilon: float - The privacy budget 18 | d: integer - Domain size 19 | g: Optional integer - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 20 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing (OLH) i.e g is set to round(e^epsilon + 1) 21 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 22 | """ 23 | super().__init__(epsilon, d, index_mapper=index_mapper) 24 | self.use_olh = use_olh 25 | self.g =g 26 | self.update_params(epsilon=epsilon, d=d, g=g, index_mapper=index_mapper) 27 | 28 | def update_params(self, epsilon=None, d=None, use_olh=None, g=None, index_mapper=None): 29 | """ 30 | 31 | Args: 32 | epsilon: optional - privacy budget 33 | d: optional - domain size 34 | g: optional - hash domain 35 | index_mapper: optional - function 36 | """ 37 | super().update_params(epsilon, d, index_mapper) # Updates core params 38 | 39 | # If use_olh is true, then update the g parameter 40 | self.use_olh = use_olh if use_olh is not None else self.use_olh 41 | 42 | # Updates g and probs 43 | self.g = g if g is not None else self.g 44 | if self.use_olh is True: 45 | self.g = int(round(math.exp(self.epsilon))) + 1 46 | 47 | if self.epsilon is not None or self.g is not None: 48 | self.p = math.exp(self.epsilon) / (math.exp(self.epsilon) + self.g - 1) 49 | self.q = 1.0 / (math.exp(self.epsilon) + self.g - 1) 50 | 51 | def _perturb(self, data, seed): 52 | """ 53 | Used internally to perturb data using local hashing. 54 | 55 | Will hash the user's data item and then perturb it with probabilities that 56 | satisfy epsilon local differential privacy. 
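        Note (added, informal): the hashed value is kept with probability p - q and is
        otherwise replaced by a uniform draw from {0, ..., g-1}, so overall the true
        hash is reported with probability p and any other value with probability q,
        which is the randomised-response condition needed for epsilon-LDP.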
57 | 58 | Local hashing is explained in more detail here: https://www.usenix.org/system/files/conference/usenixsecurity17/sec17-wang-tianhao.pdf 59 | 60 | Args: 61 | data: User's data to be privatised 62 | seed: The seed for the user's hash function 63 | 64 | Returns: perturbed data 65 | 66 | """ 67 | index = self.index_mapper(data) 68 | 69 | # Taken directly from https://github.com/vvv214/LDP_Protocols/blob/master/olh.py#L55-L65 70 | x = (xxhash.xxh32(str(index), seed=seed).intdigest() % self.g) 71 | y = x 72 | 73 | p_sample = np.random.random_sample() 74 | # the following two are equivalent 75 | # if p_sample > p: 76 | # while not y == x: 77 | # y = np.random.randint(0, g) 78 | if p_sample > self.p - self.q: 79 | # perturb 80 | y = np.random.randint(0, self.g) 81 | 82 | return y 83 | 84 | def privatise(self, data): 85 | """ 86 | Privatises a user's data using local hashing. 87 | 88 | Args: 89 | data: The data to be privatised 90 | 91 | Returns: 92 | privatised data: a single integer 93 | """ 94 | seed = random.randint(0,maxsize) # This is sys.maxsize 95 | return self._perturb(data, seed), seed 96 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/index_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import math 3 | import itertools 4 | 5 | class IndexSampler(): 6 | def __init__(self, subsample, row_sample_method, colsample_bytree, colsample_bylevel, colsample_bynode): 7 | self.subsample = subsample 8 | self.row_sample_method = row_sample_method 9 | self.colsample_bytree = colsample_bytree # Number of features to uniformly sample per tree 10 | self.colsample_bylevel = colsample_bylevel # Per level of a tree 11 | self.colsample_bynode = colsample_bynode # Per node of a tree 12 | # Sampling is multiplicative i.e cols_sample_bytree * colsample_bylevel * colsample_bynode * d features are considered at each split 13 | 14 | self.feature_interaction_iter = None 15 | 16 | def sample(self, current_round, num_samples, num_features, max_depth, feature_interaction_method="cyclical", feature_interaction_k=""): 17 | """ 18 | Helper method to perform sampling for the XGBoost model 19 | 20 | :param num_samples: The number of rows in the dataset 21 | :param num_features: The number of features 22 | :return: The sampled indexes for rows, features per tree and features per node according to the self.subsample, 23 | self.colsample_bytree and self.colsample_bylevel parameters 24 | """ 25 | col_tree_sample = None 26 | col_level_sample = None 27 | row_sample = np.arange(num_samples) 28 | 29 | if self.subsample < 1: # Sample rows per tree 30 | if self.row_sample_method == "wor": 31 | row_sample = np.random.choice(num_samples, size=round(num_samples * self.subsample), replace=False) 32 | elif self.row_sample_method == "wr": 33 | raise NotImplementedError("With replacement sampling is not implemented") 34 | elif self.row_sample_method == "poisson": 35 | row_sample = np.where(np.random.binomial(1, self.subsample, size=num_samples)==1)[0] 36 | elif self.row_sample_method == "disjoint": 37 | subset_size = math.ceil(num_samples*self.subsample) 38 | start = (((current_round) % math.ceil(num_samples / subset_size))) * subset_size 39 | end = start + subset_size 40 | row_sample = self.disjoint[start:end] 41 | 42 | if self.colsample_bytree < 1: # Sample columns per tree 43 | col_tree_sample = np.random.choice(num_features, size=math.ceil(num_features * self.colsample_bytree), 
replace=False) 44 | if self.colsample_bylevel < 1 and self.colsample_bytree < 1: # Sample columns per level of the tree (taking into account the cols already sampled for the current tree) 45 | col_level_sample = [np.random.choice(range(0, len(col_tree_sample)), size=math.ceil(len(col_tree_sample) * self.colsample_bylevel), replace=False) for i in range(0, max_depth + 2)] 46 | elif self.colsample_bylevel < 1: 47 | col_level_sample = [np.random.choice(num_features, size=math.ceil(num_features* self.colsample_bylevel), replace=False) for i in range(0, max_depth + 2)] 48 | 49 | if "cyclical" in feature_interaction_method: 50 | if feature_interaction_k == 1: 51 | col_tree_sample = [current_round % num_features] 52 | elif feature_interaction_k: 53 | if self.feature_interaction_iter is None: 54 | self.feature_interaction_iter = itertools.cycle(itertools.combinations(list(range(0, num_features)), feature_interaction_k)) # precompute 55 | col_tree_sample = list(next(self.feature_interaction_iter)) 56 | elif "random" in feature_interaction_method: 57 | if feature_interaction_k: 58 | col_tree_sample = np.random.choice(num_features, size=feature_interaction_k, replace=False) # Choose k features at random 59 | 60 | return row_sample, col_tree_sample, col_level_sample 61 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/compute_noise_from_budget_lib.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Library for computing privacy values for DP-SGD.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import math 22 | import sys 23 | 24 | from scipy.optimize import bisect 25 | 26 | from federated_gbdt.core.moments_accountant.rdp_accountant import compute_rdp, get_privacy_spent # pylint: disable=g-import-not-at-top 27 | 28 | # Opting out of loading all sibling packages and their dependencies. 29 | sys.skip_tf_privacy_import = True 30 | 31 | 32 | def apply_dp_sgd_analysis(q, sigma, steps, orders, delta): 33 | """Compute and print results of DP-SGD analysis.""" 34 | 35 | # compute_rdp requires that sigma be the ratio of the standard deviation of 36 | # the Gaussian noise to the l2-sensitivity of the function to which it is 37 | # added.
Hence, sigma here corresponds to the `noise_multiplier` parameter 38 | # in the DP-SGD implementation found in privacy.optimizers.dp_optimizer 39 | rdp = compute_rdp(q, sigma, steps, orders) 40 | 41 | eps, _, opt_order = get_privacy_spent(orders, rdp, target_delta=delta) 42 | return eps, opt_order 43 | 44 | 45 | def compute_noise(n, batch_size, target_epsilon, epochs, delta, noise_lbd, verbose=False): 46 | """Compute noise based on the given hyperparameters.""" 47 | q = batch_size / n # q - the sampling ratio. 48 | if q > 1: 49 | raise Exception('n must be larger than the batch size.') 50 | orders = ([1.25, 1.5, 1.75, 2., 2.25, 2.5, 3., 3.5, 4., 4.5] + 51 | list(range(5, 64)) + [128, 256, 512]) 52 | steps = int(math.ceil(epochs * n / batch_size)) 53 | 54 | init_noise = noise_lbd # minimum possible noise 55 | init_epsilon, _ = apply_dp_sgd_analysis(q, init_noise, steps, orders, delta) 56 | 57 | if init_epsilon < target_epsilon: # noise_lbd was an overestimate 58 | print('min_noise too large for target epsilon.') 59 | return 0 60 | 61 | cur_epsilon = init_epsilon 62 | max_noise, min_noise = init_noise, 0 63 | 64 | # doubling to find the right range 65 | while cur_epsilon > target_epsilon: # until noise is large enough 66 | max_noise, min_noise = max_noise * 2, max_noise 67 | cur_epsilon, _ = apply_dp_sgd_analysis(q, max_noise, steps, orders, delta) 68 | # print(cur_epsilon) 69 | 70 | def epsilon_fn(noise): # should return 0 if guess_epsilon==target_epsilon 71 | guess_epsilon, opt_order = apply_dp_sgd_analysis(q, noise, steps, orders, delta) 72 | if verbose: 73 | print("Optimal Alpha", opt_order) 74 | return guess_epsilon - target_epsilon 75 | 76 | target_noise, res = bisect(epsilon_fn, min_noise, max_noise, full_output=True) 77 | final_eps, opt_order = apply_dp_sgd_analysis(q, target_noise, steps, orders, delta) 78 | 79 | if verbose: 80 | print(res) 81 | print( 82 | 'DP-SGD with sampling rate = {:.3g}% and noise_multiplier = {} iterated' 83 | ' over {} steps satisfies'.format(100 * q, target_noise, steps), 84 | end=' ') 85 | print('differential privacy with eps = {:.3g} and delta = {}.'.format( 86 | target_epsilon, delta)) 87 | 88 | return target_noise, opt_order -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/local_hashing/fast_lh_server.py: -------------------------------------------------------------------------------- 1 | import xxhash 2 | import numpy as np 3 | from federated_gbdt.core.pure_ldp.frequency_oracles.local_hashing import LHServer 4 | 5 | # Server-side for fast local-hashing 6 | 7 | class FastLHServer(LHServer): 8 | def __init__(self, epsilon, d, k, g=2, use_olh=True, index_mapper=None, hash_matrix=None): 9 | """ 10 | 11 | Args: 12 | epsilon: float - The privacy budget 13 | d: integer - Size of the data domain 14 | k: integer - The number of hash functions to use. Larger k results in a more accurate oracle at the expense of computation time. 
15 | g: Optional float - The domain [g] = {1,2,...,g} that data is hashed to, 2 by default (binary local hashing) 16 | use_olh: Optional boolean - if set to true uses Optimised Local Hashing i.e g is set to round(e^epsilon + 1) 17 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 18 | hash_matrix: Optional matrix - Allows the use of a pre-computed hash matrix that contains hashed domain elements 19 | """ 20 | self.k = k 21 | super().__init__(epsilon, d, g, use_olh, index_mapper=index_mapper) 22 | self.hash_counts = np.zeros((self.k, self.g)) 23 | 24 | # g = lambda i,j: xxhash.xxh32(str(int(j)), seed=int(i)).intdigest() % self.g 25 | 26 | if hash_matrix is None: 27 | matrix = np.empty((self.k, self.d)) 28 | for i in range(0, self.k): 29 | for j in range(0, self.d): 30 | matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g 31 | 32 | # self.hash_matrix = np.fromfunction(g, (self.k, self.d)) 33 | self.hash_matrix = matrix 34 | else: 35 | self.hash_matrix = hash_matrix 36 | 37 | def update_params(self, epsilon=None, d=None, k=None, use_olh=None, g=None, index_mapper=None, update_hash_matrix=True): 38 | super().update_params(epsilon=epsilon, d=d, use_olh=use_olh, g=g, index_mapper=index_mapper) 39 | self.k = k if k is not None else self.k 40 | 41 | # If any of the main parameters are updated the hash_matrix needs to be updated... this is quite slow 42 | if epsilon is not None or self.g is not None or self.k is not None or self.d is not None and update_hash_matrix is True: 43 | matrix = np.empty((self.k, self.d)) 44 | for i in range(0, self.k): 45 | for j in range(0, self.d): 46 | matrix[i][j] = xxhash.xxh32(str(j), seed=i).intdigest() % self.g 47 | self.hash_matrix = matrix 48 | 49 | def aggregate(self, priv_data): 50 | """ 51 | Aggregates privatised data from FastLHClient to be used to calculate frequency estimates. 52 | 53 | Args: 54 | priv_data: Privatised data of the form returned from UEClient.privatise 55 | """ 56 | seed = priv_data[1] 57 | priv_data = priv_data[0] 58 | 59 | self.hash_counts[seed][priv_data] += 1 60 | self.n += 1 61 | 62 | def _compute_aggregates(self): 63 | 64 | def func(x): 65 | sum = 0 66 | for index, val in enumerate(x): 67 | sum += self.hash_counts[index,int(val)] 68 | return sum 69 | 70 | self.aggregated_data = np.apply_along_axis(func, 0, self.hash_matrix) 71 | 72 | def _update_estimates(self): 73 | self._compute_aggregates() 74 | super()._update_estimates() 75 | 76 | def estimate(self, data, suppress_warnings=False): 77 | """ 78 | Calculates a frequency estimate of the given data item using the aggregated data. 
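        Note (added, informal): unlike LHServer, aggregation here only stores a
        (k x g) table of hash counts; check_and_update_estimates expands that table
        against the precomputed hash_matrix to recover per-item counts and then
        reuses the LHServer debiasing step on the result.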
79 | 80 | Args: 81 | data: data item 82 | suppress_warnings: Optional boolean - Suppresses warnings about possible inaccurate estimations 83 | 84 | Returns: float - frequency estimate of the data item 85 | 86 | """ 87 | self.check_and_update_estimates() 88 | return super().estimate(data) -------------------------------------------------------------------------------- /experiments/replication_experiments/experiment_replicator.py: -------------------------------------------------------------------------------- 1 | from experiments.paper_experiments.paper_experiments import * 2 | from experiments.paper_experiments.paper_plotter import * 3 | import os.path 4 | 5 | base_path = "./replication_data/" 6 | 7 | class ExperimentReplicator(): 8 | def __init__(self): 9 | pass 10 | 11 | def replicate(self, figure_num, dataset="Credit 1", overwrite=False): 12 | if figure_num == 1: 13 | filename = "replication_fig1" 14 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 15 | print("Replicated data already exists...") 16 | else: 17 | dp_split_methods_with_update_methods(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 18 | print("Plotting data...") 19 | plot_split_methods_with_update(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 20 | 21 | elif figure_num == 2: 22 | filename = "replication_fig2" 23 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 24 | print("Replicated data already exists...") 25 | else: 26 | dp_split_candidate_methods(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 27 | print("Plotting data...") 28 | plot_split_candidates(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 29 | 30 | elif figure_num == 3: 31 | filename = "replication_fig3" 32 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 33 | print("Replicated data already exists...") 34 | else: 35 | feature_interaction_experiments(filename=filename, save_data=True, replication=True, iters=6, datasets=[dataset], seeds=[1]) 36 | print("Plotting data...") 37 | plot_k_way(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 38 | 39 | elif figure_num == 4: 40 | filename = "replication_fig4" 41 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 42 | print("Replicated data already exists...") 43 | else: 44 | dp_ebm_experiment(filename=filename, save_data=True, replication=True, iters=10, datasets=[dataset], seeds=[1]) 45 | print("Plotting data...") 46 | plot_ebm_comparisons(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 47 | 48 | elif figure_num == 5: 49 | filename = "replication_fig5" 50 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 51 | print("Replicated data already exists...") 52 | else: 53 | batched_boosting(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 54 | print("Plotting data...") 55 | plot_low_eps_bb(in_path=base_path+filename+".csv", out_path=base_path, replication=True) 56 | 57 | elif figure_num == 6: 58 | filename = "replication_fig6" 59 | if os.path.isfile(base_path + filename + ".csv") and not overwrite: 60 | print("Replicated data already exists...") 61 | else: 62 | comparisons_experiment(filename=filename, save_data=True, replication=True, iters=3, datasets=[dataset], seeds=[1]) 63 | print("Plotting data...") 64 | plot_comparisons(in_path=base_path+filename+".csv", out_path=base_path, 
replication=True) 65 | 66 | if __name__ == "__main__": 67 | replicator = ExperimentReplicator() 68 | # parser = argparse.ArgumentParser() 69 | # parser.add_argument('fig_num', type=int, default=1, choices=range(6),nargs='+', help='Figure number to replicate') 70 | # parser.add_argument('overwrite', type=bool, default=False, help='Whether to overwrite the existing data') 71 | # args = parser.parse_args() 72 | # replicator.replicate(args.fig_num, overwrite=args.overwrite) 73 | 74 | replicator.replicate(1, overwrite=False, dataset="Credit 1") -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/hybrid_mechanism/hybrid_mech_client.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleClient 2 | import numpy as np 3 | import random 4 | import math 5 | 6 | class HMClient(FreqOracleClient): 7 | def __init__(self, epsilon, max, min, index_mapper=None, perturb_type="hybrid"): 8 | 9 | super().__init__(epsilon=epsilon, d=None, index_mapper=index_mapper) 10 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 11 | self.perturb_type = perturb_type 12 | self.max = max 13 | self.min = min 14 | self.normalised_input = np.array([]) 15 | 16 | def update_params(self, epsilon=None, d=None, index_mapper=None): 17 | super().update_params(epsilon, d, index_mapper) 18 | ee = np.exp(self.epsilon) 19 | 20 | if epsilon is not None or d is not None: 21 | self.p = ee/(ee + 1) 22 | self.q = 1/(ee + 1) 23 | self.ee2 = np.exp(self.epsilon/2) 24 | self.s = (self.ee2 + 1) / (self.ee2 - 1) 25 | self.alpha = 1 - (np.exp(-self.epsilon/2)) 26 | 27 | def _stochastic_rounding(self, norm_data): 28 | # perturb mechanism for Stochastic Rounding 29 | if random.random() <= self.q + (((self.p - self.q)*(1 - norm_data)) / 2): 30 | v_prime = -1 31 | else: 32 | v_prime = +1 33 | 34 | result = v_prime/(self.p-self.q) 35 | return result 36 | 37 | def _piecewise_mechanism(self, norm_data): 38 | # perturb for piecewise mechanism 39 | if random.random() <= self.ee2/(self.ee2 + 1): 40 | v_prime = random.uniform(((self.ee2 * norm_data) - 1) / (self.ee2 - 1), 41 | ((self.ee2 * norm_data) + 1) / (self.ee2 - 1)) 42 | else: 43 | if random.random() <= (((self.ee2 * norm_data) - 1) / (self.ee2 - 1) + self.s) / (self.s+1): # Weight sampling uniformly from the left-region by it's size 44 | v_prime = random.uniform(-self.s, ((self.ee2 * norm_data) - 1) / (self.ee2 - 1)) 45 | else: 46 | v_prime = random.uniform(((self.ee2 * norm_data) + 1) / (self.ee2 - 1), self.s) 47 | return v_prime 48 | 49 | def _pm2(self, v): 50 | """ 51 | Piecewise Mechanism, from paper: Collecting and Analyzing Multidimensional Data with Local Differential Privacy 52 | """ 53 | z = np.e ** (self.epsilon / 2) 54 | P1 = (v + 1) / (2 + 2 * z) 55 | P2 = z / (z + 1) 56 | P3 = (1 - v) / (2 + 2 * z) 57 | 58 | C = (z + 1) / (z - 1) 59 | g1 = (C + 1) * v / 2 - (C - 1) / 2 60 | g2 = (C + 1) * v / 2 + (C - 1) / 2 61 | 62 | rnd = np.random.random() 63 | if rnd < P1: 64 | result = -C + np.random.random() * (g1 - (-C)) 65 | elif rnd < P1 + P2: 66 | result = (g2 - g1) * np.random.random() + g1 67 | else: 68 | result = (C - g2) * np.random.random() + g2 69 | return result 70 | 71 | def _perturb(self, data): 72 | # normalise the input data into the domain [-1,1] 73 | norm_data = ((2*(data - self.min)) / (self.max - self.min)) - 1 74 | result = 0 75 | if self.perturb_type == "hybrid": 76 | # when epsilon > 0.61 use PW with 
prob alpha and SR with 1-alpha 77 | if self.epsilon > 0.61: 78 | if random.random() <= self.alpha: 79 | result = self._piecewise_mechanism(norm_data) 80 | else: 81 | result = self._stochastic_rounding(norm_data) 82 | # when epsilon <= 0.61 use SR only 83 | else: 84 | result = self._stochastic_rounding(norm_data) 85 | elif self.perturb_type == "sr": 86 | result = self._stochastic_rounding(norm_data) 87 | elif self.perturb_type == "pm": 88 | result = self._piecewise_mechanism(norm_data) 89 | 90 | result = ((result + 1) * (self.max - self.min) / 2) + self.min 91 | return result 92 | 93 | def privatise(self, data): 94 | return self._perturb(data) 95 | -------------------------------------------------------------------------------- /federated_gbdt/core/plotting.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from scipy.stats import gaussian_kde 6 | 7 | from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT 8 | 9 | from experiments.experiment_helpers.data_loader import DataLoader 10 | 11 | def plot_feature_importance(model, feature_names, method="gain"): 12 | """ 13 | Plots feature importance 14 | 15 | :param feature_names: List of feature names as strings (for plotting) 16 | :param method: Feature importance method to be used 17 | """ 18 | x, y = zip(*model.feature_importance(method).most_common()) 19 | plt.figure(figsize=(10, 10)) 20 | plt.bar(feature_names[list(x)], y) 21 | plt.xticks(rotation=90) 22 | plt.xlabel("Feature") 23 | plt.ylabel(method) 24 | plt.title("Feature Importance - " + method) 25 | plt.show() 26 | 27 | # Pass data, types of sketches to visualise and the features to visualise - Optional is to pass different bin nums to be printed 28 | def visualise_quantiles(model, X, sketch_types, feature_list, hist_bins=[32]): 29 | """ 30 | Helper method to visualise quantiles calculated via various methods 31 | 32 | :param X: Data 33 | :param sketch_types: List of quantile methods to be computed on features in X 34 | :param feature_list: List of features to visualise quantiles 35 | :param hist_bins: List of # of histogram bins to visualise 36 | """ 37 | quantile_map = {} 38 | for hist_bin in hist_bins: 39 | model.split_candidate_manager.num_candidates = hist_bin 40 | for sketch_type in sketch_types: 41 | model.split_candidate_manager.sketch_type = sketch_type 42 | model.split_candidate_manager.find_split_candidates(X, 0) 43 | quantile_map[sketch_type] = model.split_candidate_manager.feature_split_candidates 44 | 45 | _, axes = plt.subplots(len(feature_list), len(sketch_types), figsize=(20,30)) 46 | axes = np.array(axes).reshape(len(feature_list), len(sketch_types)) 47 | print(axes.shape) 48 | for j, feature_index in enumerate(feature_list): 49 | # Create subplot grid... 
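        # (Added note): axes is reshaped to len(feature_list) rows by
        # len(sketch_types) columns, so axes[j, i] is the panel for the j-th
        # requested feature under the i-th quantile/sketch method.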
50 | print("Feature j", X[:, j]) 51 | for i, sketch_type in enumerate(quantile_map.keys()): 52 | # Plot feature dist 53 | sns.kdeplot(x=X[:, feature_index], ax=axes[j,i]) 54 | # sns.histplot(x=X[:, j], stat="density", kde=True, hist=False) 55 | 56 | # x,y = kde.get_lines()[0].get_data() 57 | kde = gaussian_kde(X[:,feature_index][~np.isnan(X[:,feature_index])]) 58 | 59 | quantiles = quantile_map[sketch_type][feature_index] 60 | # print(sketch_type, "quantiles:", len(quantiles)) 61 | # print(sketch_type, "unique quantiles:", len(set(quantiles))) 62 | # print(sketch_type, quantiles, "\n") 63 | axes[j,i].vlines(quantiles, 0, kde(quantiles), colors="red", linestyles="--", linewidth=0.4) 64 | axes[j,i].set_xlim(left=np.nanmin(X[:,j]), right=np.nanmax(X[:,j])) 65 | axes[j,i].set_yticklabels([]) 66 | axes[j,i].set_xticklabels([]) 67 | y_label = axes[j,i].get_yaxis().get_label() 68 | y_label.set_visible(False) 69 | # axes[j,i].set_title("Density of feature " + str(feature_index) + "\n Quantile Method: " + sketch_type) 70 | 71 | if "uniform" in quantile_map.keys(): 72 | uniform_quantiles = quantile_map["uniform"][feature_index] 73 | 74 | for k in quantile_map.keys(): 75 | ldp_quantiles = quantile_map[k][feature_index] 76 | ldp_quantiles = np.sort(ldp_quantiles) 77 | total_mse = 0 78 | for i, q in enumerate(ldp_quantiles): 79 | total_mse += np.min((uniform_quantiles-q)**2) 80 | # total_mse += (uniform_quantiles[i]-q)**2 81 | 82 | print("Feature", feature_index, "Method:", k, "MSE:", total_mse/len(uniform_quantiles)) 83 | 84 | plt.axis("off") 85 | plt.show() 86 | 87 | 88 | if __name__ == '__main__': 89 | dataloader = DataLoader() 90 | data = list(dataloader.load_datasets(["Credit 1"], remove_missing=True, return_dict=True, verbose=True).items())[0] 91 | X, X_test, y_train, y_test = data[1] 92 | X = X.to_numpy() 93 | model = PrivateGBDT() 94 | visualise_quantiles(model, X, ["uniform", "log"], [2,4,5,7]) 95 | -------------------------------------------------------------------------------- /federated_gbdt/core/loss_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def sigmoid(x): 4 | return 1.0 / (1.0 + np.exp(-x)) 5 | 6 | class Sigmoid(): 7 | def __call__(self, x): 8 | return 1 / (1 + np.exp(-x)) 9 | 10 | def gradient(self, x): 11 | return self.__call__(x) * (1 - self.__call__(x)) 12 | 13 | def softmax(x, axis=-1): 14 | y = np.exp(x - np.max(x, axis, keepdims=True)) 15 | return y / np.sum(y, axis, keepdims=True) 16 | 17 | class LogisticLoss(): 18 | def __init__(self): 19 | sigmoid = Sigmoid() 20 | self.log_func = sigmoid 21 | self.log_grad = sigmoid.gradient 22 | 23 | def loss(self, y, y_pred): 24 | y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15) 25 | p = self.log_func(y_pred) 26 | return y * np.log(p) + (1 - y) * np.log(1 - p) 27 | 28 | # gradient w.r.t y_pred 29 | def gradient(self, y, y_pred): 30 | p = self.log_func(y_pred) 31 | return -(y - p) 32 | 33 | # w.r.t y_pred 34 | def hess(self, y, y_pred): 35 | p = self.log_func(y_pred) 36 | return p * (1 - p) 37 | 38 | # binary cross entropy loss ------------------------------------------------------------------------------------ 39 | class SigmoidBinaryCrossEntropyLoss: 40 | 41 | def __init__(self): 42 | pass 43 | 44 | @staticmethod 45 | def predict(value): 46 | return sigmoid(value) 47 | 48 | def compute_loss(self, y, y_pred): 49 | # negative averaged log loss 50 | log_loss = np.nan_to_num(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) 51 | return -np.sum(log_loss) / len(y_pred) 
52 | 53 | def compute_grad(self, y, y_pred): 54 | return y_pred - y 55 | 56 | def compute_hess(self, y, y_pred): 57 | return y_pred * (1 - y_pred) 58 | 59 | 60 | # softmax cross entropy loss ------------------------------------------------------------------------------- 61 | class SoftmaxCrossEntropyLoss: 62 | 63 | def __init__(self): 64 | pass 65 | 66 | @staticmethod 67 | def predict(values): 68 | """ 69 | :param values: ndarray 70 | :return: ndarray 71 | """ 72 | return softmax(values) 73 | 74 | def compute_loss(self, y, y_pred): 75 | y_prob = self.predict(y_pred) 76 | # do summation over feature dimensions & do averaging over samples 77 | log_loss = np.nan_to_num(y * np.log(y_prob)) 78 | return -np.sum(log_loss) / len(y_prob) 79 | 80 | def compute_grad(self, y, y_pred): 81 | assert len(y_pred.shape) == 2 82 | y_prob = self.predict(y_pred) 83 | return y_prob - y 84 | 85 | def compute_hess(self, y, y_pred): 86 | y_prob = self.predict(y_pred) 87 | return y_prob * (1 - y_prob) 88 | 89 | 90 | class BinaryRFLoss(): 91 | def __init__(self): 92 | pass 93 | 94 | def predict(self, x): 95 | return x 96 | 97 | def compute_grad(self, y, y_pred): 98 | return (np.array(y)==1).astype("int") 99 | 100 | def compute_hess(self, y, y_pred): 101 | return np.ones_like(y) 102 | 103 | class SoftmaxLoss: 104 | def __init__(self): 105 | pass 106 | 107 | def predict(self, x): 108 | out = [] 109 | for i,r in enumerate(x): 110 | e = np.exp(r) 111 | out.append(e / np.sum(e)) 112 | return np.array(out) 113 | 114 | def compute_grad(self, y, y_pred): 115 | grads = [] 116 | p = self.predict(y_pred) 117 | 118 | for i in range(len(y)): 119 | grad = np.zeros(y_pred.shape[1]) 120 | for j in range(0, y_pred.shape[1]): 121 | if j == y[i]: 122 | grad[j] = p[i][j] - 1 123 | else: 124 | grad[j] = p[i][j] 125 | grads.append(grad) 126 | 127 | return np.array(grads) 128 | 129 | def compute_hess(self, y, y_pred): 130 | hess = np.zeros(len(y_pred)) 131 | p = self.predict(y_pred) 132 | return p * (1- p) 133 | 134 | 135 | class LeastSquareLoss: 136 | """ loss = 1/2 (y-y_hat)**2 """ 137 | 138 | def __init__(self): 139 | pass 140 | 141 | @staticmethod 142 | def predict(value): 143 | return value 144 | 145 | @staticmethod 146 | def compute_loss(y, y_pred): 147 | lse_loss = 0.5 * (y - y_pred)**2 148 | return np.sum(lse_loss) / len(y) 149 | 150 | @staticmethod 151 | def compute_grad(y, y_pred): 152 | return y_pred - y 153 | 154 | @staticmethod 155 | def compute_hess(y, y_pred): 156 | # derivative of y_hat-y is 1 157 | if type(y).__name__ == 'ndarray' or type(y_pred).__name__ == 'ndarray': 158 | return np.ones_like(y) 159 | else: 160 | return 1 161 | -------------------------------------------------------------------------------- /federated_gbdt/core/moments_accountant/dp_params.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.moments_accountant.compute_noise_from_budget_lib import compute_noise 2 | import math 3 | 4 | # Sam Comment: 5 | # Method uses the RDP moments accountant which works as follows 6 | # 1) For a fixed eps,delta compute the (alpha, tau)-RDP guarantee of the Gaussian mechanism 7 | # 2) Perform a binary search over alpha values to find a sigma value that guarantees an epsilon < target_epsilon 8 | # 3) Perform a bisection on the (alpha,tau)-RDP => (eps,delta)-DP conversion bound with the (loose) sigma found 9 | # to find the exact noise needed for the target_epsilon given 10 | 11 | class RDPAccountant(): 12 | def __init__(self, eps, delta, q, clip, total_queries, 
method="rdp", verbose=False): 13 | self.eps = eps 14 | self.delta = delta 15 | self.q = q 16 | self.clip = clip 17 | self.total_leaf_nodes = total_queries 18 | self.method = method 19 | self.sigma, self.opt_alpha = self.compute_sigma(method=method, eps=eps, delta=delta, q=q, total_queries=total_queries, verbose=verbose) 20 | 21 | @staticmethod 22 | def compute_sigma(method, eps, delta, q, total_queries, verbose): 23 | opt_alpha = None 24 | if method == "rdp": 25 | sigma, opt_alpha = compute_noise(1, q, eps, total_queries, delta, 1e-5, verbose) 26 | elif method == "basic": 27 | sigma = total_queries * math.sqrt(2 * math.log(total_queries*1.25 / delta)) / eps # Basic composition - scalar mechanism 28 | elif method == "advanced": 29 | eps_prime = eps / (2 * math.sqrt(2 * total_queries * math.log(2 * total_queries / delta))) 30 | sigma = math.sqrt(2 * math.log(1.25 / delta)) / eps_prime # Advanced composition 31 | elif "rdp_weak": 32 | a = (-2 * (math.log(delta) - eps) + math.sqrt((2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 33 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 34 | sigma = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + (a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 35 | opt_alpha = a 36 | 37 | return sigma, opt_alpha 38 | 39 | def budget_examples(): 40 | eps = 1 41 | delta = 1e-5 42 | total_queries = 6 * 10 * 10 # Suppose 10 features over 10 trees with a maximum depth of 6 43 | # total_queries = 1 44 | 45 | sigma_basic = total_queries * math.sqrt(2 * math.log(1.25 / delta)) / eps 46 | 47 | eps_prime = eps / (2 * math.sqrt(2 * total_queries * math.log(2 * total_queries / delta))) 48 | 49 | sigma_basic = total_queries * math.sqrt(2 * math.log(total_queries * 1.25 / delta)) / eps # Basic composition 50 | sigma_advanced = math.sqrt(2 * math.log(1.25 / delta)) / eps_prime # Advanced composition 51 | sigma_moments = 2 * math.sqrt(total_queries * math.log(1 / delta)) / eps # Moments accountant asymptotic bound 52 | 53 | a = (-2 * (math.log(delta) - eps) + math.sqrt( 54 | (2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 55 | 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 56 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 57 | sigma_rdp_weak = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + ( 58 | a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 59 | 60 | 61 | obj = RDPAccountant(eps, delta, 1, None, total_queries, 62 | verbose=False) # RDP using the tf implementation which uses the stronger conversion bound (also supports tight subsampling analysis) 63 | 64 | eps=eps/2 65 | a = (-2 * (math.log(delta) - eps) + math.sqrt( 66 | (2 * (math.log(delta) - eps)) ** 2 + 4 * eps * (math.log(delta) + eps))) / ( 67 | 2 * eps) # Optimal alpha value for RDP can be solved exactly in the Gaussian case by applying the weak conversion bound 68 | C = math.log1p(-1 / a) - math.log(delta * a) / (a - 1) 69 | sigma_rdp_weak_2 = math.sqrt(total_queries * a * (a - 1) / (2 * (math.log(delta) + ( 70 | a - 1) * eps))) # RDP using the stronger conversion bound (a,r)-RDP to (eps,delta)-DP 71 | 72 | print("Alpha found directly using weak bound:", a) 73 | print("Optimal Alpha:", obj.opt_alpha) 74 | 75 | print("\n") 76 | print("SIGMA VALUES") 77 
| print("Basic Composition:", sigma_basic) 78 | print("Advanced Composition:", sigma_advanced) 79 | print("Sigma Moments:", sigma_moments) 80 | print("RDP Accountant via weak bound:", sigma_rdp_weak) 81 | print("RDP Accountant via weak bound:", sigma_rdp_weak_2) 82 | print("RDP Accountant", obj.sigma) 83 | print("RDP Accountant", RDPAccountant(1.5, delta, 1, None, total_queries, 84 | verbose=False).sigma) 85 | 86 | budget_examples() 87 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/ind_exp.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """IndExp method for computing differentially private quantiles. 17 | 18 | Algorithm 2 from the paper "Privacy-preserving Statistical Estimation with 19 | Optimal Convergence Rates" by Smith (STOC 2011, 20 | http://cs-people.bu.edu/ads22/pubs/2011/stoc194-smith.pdf) describes the 21 | subroutine used to compute a single quantile. Theorem 3 from the paper ``Optimal 22 | Differential Privacy Composition for Exponential Mechanisms and the Cost of 23 | Adaptivity'' by Dong, Durfee, and Rogers (ICML 2020, 24 | https://arxiv.org/pdf/1909.13830.pdf) describes the composition used for the 25 | approximate DP variant of IndExp. 26 | """ 27 | 28 | import numpy as np 29 | import scipy 30 | 31 | 32 | def racing_sample(log_terms): 33 | """Numerically stable method for sampling from an exponential distribution. 34 | 35 | Args: 36 | log_terms: Array of terms of form log(coefficient) - (exponent term). 37 | 38 | Returns: 39 | A sample from the exponential distribution determined by terms. See 40 | Algorithm 1 from the paper "Duff: A Dataset-Distance-Based 41 | Utility Function Family for the Exponential Mechanism" 42 | (https://arxiv.org/pdf/2010.04235.pdf) for details; each element of terms is 43 | analogous to a single log(lambda(A_k)) - (eps * k/2) in their algorithm. 44 | """ 45 | return np.argmin( 46 | np.log(np.log(1.0 / np.random.uniform(size=log_terms.shape))) - log_terms) 47 | 48 | 49 | def opt_comp_p(eps, t): 50 | """Returns p_{eps, t} for opt_comp_calculator. 51 | 52 | Args: 53 | eps: Privacy parameter epsilon. 54 | t: Exponent t. 55 | """ 56 | return (np.exp(-t) - np.exp(-eps)) / (1.0 - np.exp(-eps)) 57 | 58 | 59 | def opt_comp_calculator(overall_eps, overall_delta, num_comps): 60 | """Returns the optimal per-composition eps for overall approx DP guarantee. 61 | 62 | Args: 63 | overall_eps: Desired overall privacy parameter epsilon. 64 | overall_delta: Desired overall privacy parameter delta. 65 | num_comps: Total number of compositions. 66 | 67 | Returns: 68 | eps_0 such that num_compositions eps_0-DP applications of the exponential 69 | mechanism will overall be (overall_eps, overall_delta)-DP, using the 70 | expression given in Theorem 3 of DDR20. 
This assumes that the composition is 71 | non-adaptive. 72 | """ 73 | eps_i_range = np.arange(overall_eps / num_comps - 0.01, overall_eps, 0.01) 74 | num_eps_i = len(eps_i_range) 75 | max_eps = 0 76 | for eps_idx in range(num_eps_i): 77 | eps = eps_i_range[eps_idx] 78 | max_sum = 0 79 | for ell in range(num_comps + 1): 80 | t_ell_star = np.clip((overall_eps + (ell + 1) * eps) / (num_comps + 1), 81 | 0.0, eps) 82 | p_t_ell_star = opt_comp_p(eps, t_ell_star) 83 | term_sum = 0 84 | for i in range(num_comps + 1): 85 | term_sum += scipy.special.binom(num_comps, i) * np.power( 86 | p_t_ell_star, num_comps - i) * np.power(1 - p_t_ell_star, i) * max( 87 | np.exp(num_comps * t_ell_star - 88 | (i * eps)) - np.exp(overall_eps), 0) 89 | if term_sum > max_sum: 90 | max_sum = term_sum 91 | if max_sum > overall_delta: 92 | return max_eps 93 | else: 94 | max_eps = eps 95 | return max_eps 96 | 97 | 98 | def ind_exp(sorted_data, data_low, data_high, qs, divided_eps, swap): 99 | """Returns eps-differentially private collection of quantile estimates for qs. 100 | 101 | Args: 102 | sorted_data: Array of data points sorted in increasing order. 103 | data_low: Lower limit for any differentially private quantile output value. 104 | data_high: Upper limit for any differentially private quantile output value. 105 | qs: Increasing array of quantiles in [0,1]. 106 | divided_eps: Privacy parameter epsilon for each estimated quantile. Assumes 107 | that divided_eps has been computed to ensure the desired overall privacy 108 | guarantee. 109 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 110 | """ 111 | num_quantiles = len(qs) 112 | outputs = np.empty(num_quantiles) 113 | sorted_data = np.clip(sorted_data, data_low, data_high) 114 | data_size = len(sorted_data) 115 | sorted_data = np.concatenate(([data_low], sorted_data, [data_high])) 116 | data_gaps = sorted_data[1:] - sorted_data[:-1] 117 | for q_idx in range(num_quantiles): 118 | quantile = qs[q_idx] 119 | if swap: 120 | sensitivity = 1.0 121 | else: 122 | sensitivity = max(quantile, 1 - quantile) 123 | idx_left = racing_sample( 124 | np.log(data_gaps) + 125 | ((divided_eps / (-2.0 * sensitivity)) * 126 | np.abs(np.arange(0, data_size + 1) - (quantile * data_size)))) 127 | outputs[q_idx] = np.random.uniform(sorted_data[idx_left], 128 | sorted_data[idx_left + 1]) 129 | # Note that the outputs are already clipped to [data_low, data_high], so no 130 | # further clipping of outputs is necessary. 
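# Added note: each quantile is sampled independently of the others, so the raw estimates are not
# guaranteed to be monotone in q; the final sort returns a nondecreasing set of quantile estimates.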
131 | return np.sort(outputs) 132 | -------------------------------------------------------------------------------- /examples/multiclass_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Load Data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import sys\n", 17 | "sys.path.append(\"../\")\n", 18 | "\n", 19 | "from federated_gbdt.models.gbdt.private_gbdt import PrivateGBDT\n", 20 | "from experiments.experiment_helpers.data_loader import DataLoader\n", 21 | "from sklearn.metrics import roc_auc_score\n", 22 | "from sklearn.preprocessing import label_binarize\n", 23 | "\n", 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dataloader = DataLoader()\n", 34 | "\n", 35 | "# Default is 70/30 split\n", 36 | "X_train, X_test, y_train, y_test = dataloader.load_datasets([\"connect_4\"], return_dict=False)[0]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(47289, 42)" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "X_train.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "array([2, 2, 1, ..., 2, 2, 2])" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "y_train" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "array([0, 1, 2])" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "np.unique(y_train)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "y_test_onehot = label_binarize(y_test, classes=[0,1,2])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## XGBoost Training (No DP)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "(20268, 3)\n" 125 | ] 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "0.907745991798139" 131 | ] 132 | }, 133 | "execution_count": 7, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "xgb_model = PrivateGBDT(num_trees=100, epsilon=0)\n", 140 | "xgb_model = xgb_model.fit(X_train, y_train)\n", 141 | "y_pred = xgb_model.predict_proba(X_test)\n", 142 | "print(y_pred.shape)\n", 143 | "\n", 144 | "roc_auc_score(y_test_onehot, y_pred)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## DP-XGBoost (FEVERLESS)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "0.6525901702496729" 163 | ] 164 | }, 
165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "dp_xgb_model = PrivateGBDT(num_trees=100, epsilon=3, dp_method=\"gaussian_cdp\")\n", 172 | "dp_xgb_model = dp_xgb_model.fit(X_train, y_train)\n", 173 | "y_pred = dp_xgb_model.predict_proba(X_test)\n", 174 | "\n", 175 | "roc_auc_score(y_test_onehot, y_pred)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## DP-TR XGBoost" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "0.7821243499339423" 194 | ] 195 | }, 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "dp_tr_model = PrivateGBDT(num_trees=100, epsilon=3, split_method=\"totally_random\", \n", 203 | " sketch_type=\"uniform\", dp_method=\"gaussian_cdp\")\n", 204 | "\n", 205 | "dp_tr_model = dp_tr_model.fit(X_train, y_train)\n", 206 | "y_pred = dp_tr_model.predict_proba(X_test)\n", 207 | "\n", 208 | "roc_auc_score(y_test_onehot, y_pred)" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3 (ipykernel)", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.9.15" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 1 233 | } 234 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/smooth_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Smooth sensitivity utils used by both Smooth and CSmooth. 17 | 18 | Section 3.1 from "Smooth Sensitivity and Sampling in Private Data Analysis" by 19 | Nissim, Radkhodnikova, and Smith 20 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf) gives 21 | details for compute_log_sensitivity and its helper functions. 22 | """ 23 | 24 | import numpy as np 25 | 26 | 27 | def check_indices(n, lower_idx, upper_idx): 28 | """Raises an error for indices outside of the [-1, n] range. 29 | 30 | Args: 31 | n: Right endpoint for valid range. 32 | lower_idx: Lower bound for idx. 33 | upper_idx: Upper bound for idx. 
34 | """ 35 | if lower_idx < -1: 36 | raise ValueError("Index too small: lower_idx < -1.") 37 | if upper_idx > n: 38 | raise ValueError("Index too large: upper_idx > n.") 39 | 40 | 41 | def update_log_smooth_sensitivity(lower_idx1, upper_idx1, lower_idx2, 42 | upper_idx2, data, data_low, data_high, t, 43 | log_smooth_sensitivity): 44 | """Updates, returns log smooth sensitivity by searching local sensitivities. 45 | 46 | Args: 47 | lower_idx1: Min value for index i. 48 | upper_idx1: Max value for index i. 49 | lower_idx2: Min value for index j. 50 | upper_idx2: Max value for index j. 51 | data: User data, sorted in increasing order and clipped to lie in the 52 | [data_low, data_high] range. 53 | data_low: Lower limit for differentially private quantile output value. 54 | data_high: Upper limit for differentially private quantile output value. 55 | t: Smooth sensitivity parameter. 56 | log_smooth_sensitivity: Current max log smooth sensitivity, as found by 57 | previous searches of other index ranges. 58 | 59 | Returns: 60 | The maximum distance-weighted local sensitivity at any pair of indices 61 | (i, j) where lower_idx1 <= i <= upper_idx1 and 62 | lower_idx2 <= j <= upper_idx2. The special indices -1 and n = len(data) are 63 | allowed and interpreted as indexing values data_low and data_high, 64 | respectively. 65 | """ 66 | n = len(data) 67 | 68 | # Sanity checks. 69 | check_indices(n, lower_idx1, upper_idx1) 70 | check_indices(n, lower_idx2, upper_idx2) 71 | if upper_idx2 < lower_idx2: 72 | raise ValueError("Indices out of order: upper_idx2 < lower_idx2.") 73 | 74 | if upper_idx1 < lower_idx1: 75 | # Nothing to explore, return current log smooth sensitivity value. 76 | return log_smooth_sensitivity 77 | 78 | # Find the middle index and set i to this value. 79 | i = (lower_idx1 + upper_idx1) // 2 80 | 81 | # Scan the eligible indices j in the [lower_idx2, upper_idx2] range. 82 | js = np.arange(lower_idx2, upper_idx2 + 1) 83 | 84 | # Copy values from data at the indices indicated by js. (For js that are n, 85 | # use max_value.) 86 | j_vals = np.empty(upper_idx2 + 1 - lower_idx2) 87 | js_lt_n_bool = js < n 88 | js_lt_n = js[js_lt_n_bool] 89 | j_vals[js_lt_n_bool] = data[js_lt_n] 90 | j_vals[np.logical_not(js_lt_n_bool)] = data_high 91 | 92 | # Compute database distances for all the (i, j) pairs. 93 | database_distances = np.maximum(js - (i + 1), 0) 94 | 95 | # Compute local sensitivities for all the (i, j) pairs. 96 | base_value = data_low if i == -1 else data[i] 97 | local_sensitivities = j_vals - base_value 98 | 99 | # Compute log smooth sensitivities: 100 | # log(exp(-t*database_distances) * local_sensitivities). 101 | log_smooth_sensitivities = -t * database_distances + np.log( 102 | local_sensitivities) 103 | 104 | # Find the largest smooth sensitivity. 105 | max_smooth_sensitivity_index = np.argmax(log_smooth_sensitivities) 106 | current_max_log_smooth_sensitivity = log_smooth_sensitivities[ 107 | max_smooth_sensitivity_index] 108 | max_smooth_sensitivity_index = js[max_smooth_sensitivity_index] 109 | 110 | # Update the input smooth sensitivity if we found a larger one. 111 | log_smooth_sensitivity = max(log_smooth_sensitivity, 112 | current_max_log_smooth_sensitivity) 113 | 114 | # Check the remaining indices. (All indices in the [lower_idx1, upper_idx1] 115 | # range that are not equal to the midpoint i value checked above.) 
116 | log_smooth_sensitivity1 = update_log_smooth_sensitivity( 117 | i + 1, upper_idx1, max_smooth_sensitivity_index, upper_idx2, data, 118 | data_low, data_high, t, log_smooth_sensitivity) 119 | log_smooth_sensitivity2 = update_log_smooth_sensitivity( 120 | lower_idx1, i - 1, lower_idx2, max_smooth_sensitivity_index, data, 121 | data_low, data_high, t, log_smooth_sensitivity) 122 | return max(log_smooth_sensitivity1, log_smooth_sensitivity2) 123 | 124 | 125 | def compute_log_smooth_sensitivity(data, data_low, data_high, true_quantile_idx, 126 | t): 127 | """Returns log(t-smooth sensitivity) for the given dataset and quantile. 128 | 129 | Args: 130 | data: User data, sorted in increasing order and clipped to lie in the 131 | [data_low, data_high] range. 132 | data_low: Lower limit for differentially private quantile output value. 133 | data_high: Upper limit for differentially private quantile output value. 134 | true_quantile_idx: Index into data at the desired quantile location. 135 | t: Smooth sensitivity parameter. 136 | """ 137 | n = len(data) 138 | return update_log_smooth_sensitivity(-1, true_quantile_idx, true_quantile_idx, 139 | n, data, data_low, data_high, t, 140 | -np.inf) 141 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/bin_inner_param.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | class BinInnerParam(object): 5 | """ 6 | Use to store columns related params for binning process 7 | """ 8 | 9 | def __init__(self): 10 | self.bin_indexes = [] 11 | self.bin_names = [] 12 | self.col_name_maps = {} 13 | self.header = [] 14 | self.transform_bin_indexes = [] 15 | self.transform_bin_names = [] 16 | self.category_indexes = [] 17 | self.category_names = [] 18 | 19 | def set_header(self, header): 20 | self.header = copy.deepcopy(header) 21 | for idx, col_name in enumerate(self.header): 22 | self.col_name_maps[col_name] = idx 23 | 24 | def set_bin_all(self): 25 | """ 26 | Called when user set to bin all columns 27 | """ 28 | self.bin_indexes = [i for i in range(len(self.header))] 29 | self.bin_names = copy.deepcopy(self.header) 30 | 31 | def set_transform_all(self): 32 | self.transform_bin_indexes = self.bin_indexes 33 | self.transform_bin_names = self.bin_names 34 | self.transform_bin_indexes.extend(self.category_indexes) 35 | self.transform_bin_names.extend(self.category_names) 36 | 37 | def add_bin_indexes(self, bin_indexes): 38 | if bin_indexes is None: 39 | return 40 | for idx in bin_indexes: 41 | if idx >= len(self.header): 42 | # LOGGER.warning("Adding a index that out of header's bound") 43 | # continue 44 | raise ValueError("Adding a index that out of header's bound") 45 | if idx not in self.bin_indexes: 46 | self.bin_indexes.append(idx) 47 | self.bin_names.append(self.header[idx]) 48 | 49 | def add_bin_names(self, bin_names): 50 | if bin_names is None: 51 | return 52 | 53 | for bin_name in bin_names: 54 | idx = self.col_name_maps.get(bin_name) 55 | if idx is None: 56 | LOGGER.warning("Adding a col_name that is not exist in header") 57 | continue 58 | if idx not in self.bin_indexes: 59 | self.bin_indexes.append(idx) 60 | self.bin_names.append(self.header[idx]) 61 | 62 | def add_transform_bin_indexes(self, transform_indexes): 63 | if transform_indexes is None: 64 | return 65 | 66 | for idx in transform_indexes: 67 | if idx >= len(self.header) or idx < 0: 68 | raise ValueError("Adding a index that out of header's bound") 69 | # LOGGER.warning("Adding a 
index that out of header's bound") 70 | # continue 71 | if idx not in self.transform_bin_indexes: 72 | self.transform_bin_indexes.append(idx) 73 | self.transform_bin_names.append(self.header[idx]) 74 | 75 | def add_transform_bin_names(self, transform_names): 76 | if transform_names is None: 77 | return 78 | for bin_name in transform_names: 79 | idx = self.col_name_maps.get(bin_name) 80 | if idx is None: 81 | raise ValueError("Adding a col_name that is not exist in header") 82 | 83 | if idx not in self.transform_bin_indexes: 84 | self.transform_bin_indexes.append(idx) 85 | self.transform_bin_names.append(self.header[idx]) 86 | 87 | def add_category_indexes(self, category_indexes): 88 | if category_indexes == -1: 89 | category_indexes = [i for i in range(len(self.header))] 90 | elif category_indexes is None: 91 | return 92 | 93 | for idx in category_indexes: 94 | if idx >= len(self.header): 95 | # LOGGER.warning("Adding a index that out of header's bound") 96 | continue 97 | if idx not in self.category_indexes: 98 | self.category_indexes.append(idx) 99 | self.category_names.append(self.header[idx]) 100 | if idx in self.bin_indexes: 101 | self.bin_indexes.remove(idx) 102 | self.bin_names.remove(self.header[idx]) 103 | 104 | def add_category_names(self, category_names): 105 | if category_names is None: 106 | return 107 | 108 | for bin_name in category_names: 109 | idx = self.col_name_maps.get(bin_name) 110 | if idx is None: 111 | # LOGGER.warning("Adding a col_name that is not exist in header") 112 | continue 113 | if idx not in self.category_indexes: 114 | self.category_indexes.append(idx) 115 | self.category_names.append(self.header[idx]) 116 | if idx in self.bin_indexes: 117 | self.bin_indexes.remove(idx) 118 | self.bin_names.remove(self.header[idx]) 119 | 120 | @property 121 | def bin_cols_map(self): 122 | assert len(self.bin_indexes) == len(self.bin_names) 123 | return dict(zip(self.bin_names, self.bin_indexes)) 124 | 125 | def encode_col_name_dict(self, col_name_dict: dict, model): 126 | result = {} 127 | for x, y in col_name_dict.items(): 128 | col_index = self.col_name_maps.get(x) 129 | result[anonymous_generator.generate_anonymous(col_index, model=model)] = y 130 | return result 131 | 132 | def encode_col_name_list(self, col_name_list: list, model): 133 | result = [] 134 | for x in col_name_list: 135 | col_index = self.col_name_maps.get(x) 136 | result.append(anonymous_generator.generate_anonymous(col_index, model=model)) 137 | return result 138 | 139 | # def __encode_col_name(self, col_name): 140 | # col_index = self.col_name_maps.get(col_name) 141 | # if col_index is None: 142 | # LOGGER.warning("Encoding a non-exist column name") 143 | # return None 144 | # return '.'.join(['host', str(col_index)]) 145 | 146 | def decode_col_name(self, encoded_name: str): 147 | col_index = anonymous_generator.reconstruct_fid(encoded_name) 148 | 149 | # try: 150 | # col_index = int(encoded_name.split('.')[1]) 151 | # except IndexError or ValueError: 152 | # raise RuntimeError("Bin inner param is trying to decode an invalid col_name.") 153 | return self.header[col_index] -------------------------------------------------------------------------------- /federated_gbdt/models/base/tree_base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import federated_gbdt.core.baseline_constants as consts 3 | from abc import ABC, abstractmethod 4 | import pandas 5 | from sklearn.metrics import roc_auc_score, accuracy_score 6 | import _pickle as pickle 7 
| from federated_gbdt.core.loss_functions import SoftmaxCrossEntropyLoss 8 | 9 | 10 | class TreeBase(ABC): 11 | def __init__( 12 | self, 13 | min_samples_split=2, 14 | max_depth=3, 15 | task_type=consts.CLASSIFICATION, 16 | num_classes=-1, 17 | ): 18 | self.root = None # Root node in dec. tree 19 | self.min_samples_split = min_samples_split 20 | self.max_depth = max_depth 21 | self.task_type = task_type 22 | self.num_classes = num_classes 23 | self.K = 2 24 | self.training_method = "boosting" 25 | self.batched_update_size = 1 26 | self.trees = [] 27 | self.multiclass_trees = {} 28 | self.loss = None 29 | 30 | @abstractmethod 31 | def fit(self, *args, **kwargs): 32 | pass 33 | 34 | @abstractmethod 35 | def _build_tree(self, *args, **kwargs): 36 | """ 37 | Build the tree model according to impurity and leaf value 38 | """ 39 | pass 40 | 41 | def _convert_df(self, X): 42 | """ 43 | 44 | :param X: Data as a Pandas DataFrame 45 | :return: X as a Numpy array 46 | """ 47 | if isinstance(X, pandas.DataFrame): 48 | X = X.to_numpy() 49 | 50 | return X 51 | 52 | @staticmethod 53 | def predict_value(X, tree): 54 | out = np.zeros(X.shape[0]) 55 | 56 | if tree.value is None: 57 | # Choose the feature that we will test 58 | feature_value = X[:, tree.feature_i] 59 | left_split = feature_value <= tree.threshold 60 | right_split = ~left_split 61 | 62 | # Determine if we will follow left or right branch 63 | out[left_split] = TreeBase.predict_value(X[left_split], tree.true_branch) 64 | out[right_split] = TreeBase.predict_value(X[right_split], tree.false_branch) 65 | else: 66 | out = np.repeat(tree.value[0], X.shape[0]) 67 | 68 | return out 69 | 70 | def predict_over_trees(self, X, y): 71 | metrics = [] 72 | for i in range(1, len(self.trees) + 1): 73 | trees = self.trees[:i] 74 | y_pred = self.loss.predict(self.predict_weight(X, trees)) 75 | auc = roc_auc_score(y, y_pred) 76 | acc = accuracy_score(y, (y_pred >= 0.5).astype("int")) 77 | metrics.append((auc, acc)) 78 | print("Tree", i, "AUC :", auc) 79 | print("Tree", i, "Acc :", acc, "\n") 80 | 81 | return metrics 82 | 83 | def predict_weight(self, X, tree=None): 84 | """ 85 | Predicts a weight (i.e for classification a non-sigmoided value) for each observation passed. 
86 | By default this is calculated from the whole ensemble or from a specific tree if passed 87 | 88 | :param X: Data 89 | :param tree: Tree to predict weight from 90 | :return: Model prediction as a weight 91 | """ 92 | X = self._convert_df(X) 93 | pred = ( 94 | np.zeros((X.shape[0], self.num_classes)) 95 | if self.num_classes > 2 96 | else np.zeros(X.shape[0]) 97 | ) 98 | trees = tree if tree is not None else self.trees 99 | 100 | preds = [] 101 | for i, tree in enumerate(trees): 102 | pred += self.predict_value(X, tree) 103 | if self.training_method == "batched_boosting": 104 | if ( 105 | i + 1 106 | ) % self.batched_update_size == 0: # Average current weights and add to preds 107 | pred /= self.batched_update_size 108 | preds.append(pred) 109 | pred = np.zeros(X.shape[0]) 110 | elif (i + 1) == len(trees): 111 | pred /= (i + 1) % self.batched_update_size 112 | preds.append(pred) 113 | elif ( 114 | self.early_stopping == "average_retrain" 115 | and (i + 1) % (len(self.trees) / 2) == 0 116 | ): 117 | preds.append(pred) 118 | 119 | if self.training_method == "batched_boosting": 120 | # print("NUM OF BATCHES", len(preds)) 121 | pred = np.add.reduce(preds) 122 | elif self.training_method == "rf": 123 | pred /= len(trees) 124 | elif self.early_stopping == "average_retrain": 125 | pred = (preds[0] + preds[1]) / 2 126 | 127 | return pred 128 | 129 | def predict(self, X): 130 | """Classify samples one by one and return the set of labels""" 131 | X = self._convert_df(X) 132 | return (np.argmax(self.predict_prob(X), axis=1)).astype("int") 133 | 134 | def predict_prob(self, X): 135 | """ 136 | For binary classification will return probabilities instead of raw weights 137 | 138 | :param X: Rows of observations to classify 139 | :return: A list of probabilities for each observation 140 | """ 141 | X = self._convert_df(X) 142 | probs = [] 143 | for k in range(0, self.K): 144 | probs.append(self.predict_weight(X, self.multiclass_trees[k])) 145 | probs = np.array(list(zip(*probs))) 146 | if self.task_type == consts.CLASSIFICATION: 147 | probs = ( 148 | self.loss.predict(probs) 149 | if self.K <= 2 150 | else SoftmaxCrossEntropyLoss().predict(probs) 151 | ) 152 | if self.K <= 2: 153 | probs = np.array([[1 - p[0], p[0]] for p in probs]) 154 | return probs 155 | 156 | def predict_proba(self, X): 157 | return self.predict_prob(X) 158 | 159 | def _reset_tracking_attributes(self, checkpoint): 160 | return 161 | 162 | def save(self, filename, checkpoint=False): 163 | self._reset_tracking_attributes( 164 | checkpoint 165 | ) # Otherwise saved file will be large... 
166 | f = open(filename + ".pkl", "wb") 167 | pickle.dump(self.__dict__, f, 2) 168 | f.close() 169 | 170 | def load(self, filename): 171 | f = open(filename, "rb") 172 | tmp_dict = pickle.load(f) 173 | f.close() 174 | 175 | self.__dict__.update(tmp_dict) 176 | -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/core/_freq_oracle_server.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | import numpy as np 3 | from federated_gbdt.core.pure_ldp.core.prob_simplex import project_probability_simplex 4 | 5 | class FreqOracleServer: 6 | def __init__(self, epsilon, d, index_mapper=None): 7 | """ 8 | 9 | Args: 10 | epsilon: privacy budget 11 | d: domain size - not all freq oracles need this so can be None 12 | index_mapper: Optional function - maps data items to indexes in the range {0, 1, ..., d-1} where d is the size of the data domain 13 | 14 | """ 15 | self.epsilon = epsilon 16 | self.d = d 17 | 18 | self.aggregated_data = np.zeros(self.d) # Some freq oracle servers keep track of aggregated data to generate estimated_data 19 | self.estimated_data = np.zeros(self.d) # Keep track of estimated data for quick access 20 | self.n = 0 # The number of data items aggregated 21 | 22 | self.name = "FrequencyOracle" # Name of the frequency oracle for warning messages, set using .set_name(name) 23 | self.last_estimated = 0 24 | 25 | if index_mapper is None: 26 | self.index_mapper = lambda x: x - 1 27 | else: 28 | self.index_mapper = index_mapper 29 | 30 | def set_name(self, name): 31 | """ 32 | Sets freq servers name 33 | Args: 34 | name: string - name of frequency oracle 35 | """ 36 | self.name = name 37 | 38 | def reset(self): 39 | """ 40 | This method resets the server's aggregated/estimated data and sets n = 0. 41 | This should be overridden if other parameters need to be reset. 42 | """ 43 | self.aggregated_data = np.zeros(self.d) 44 | self.estimated_data = np.zeros(self.d) 45 | self.last_estimated = 0 46 | self.n = 0 47 | 48 | def update_params(self, epsilon=None, d=None, index_mapper=None): 49 | """ 50 | Method to update params of freq oracle server, should be overridden if more options needed. 51 | This will reset aggregated/estimated data. 
52 | Args: 53 | epsilon: Optional - privacy budget 54 | d: Optional - domain size 55 | index_mapper: Optional - function 56 | """ 57 | self.epsilon = epsilon if epsilon is not None else self.epsilon # Updating epsilon here will not update any internal probabilities 58 | # Any class that implements FreqOracleServer, needs to override update_params to update epsilon properly 59 | 60 | self.d = d if d is not None else self.d 61 | self.index_mapper = index_mapper if index_mapper is not None else self.index_mapper 62 | self.reset() 63 | 64 | def check_warnings(self, suppress_warnings=False): 65 | """ 66 | Used during estimation to check warnings 67 | Args: 68 | suppress_warnings: Optional boolean - If True suppresses warnings from being output 69 | """ 70 | pass 71 | 72 | def aggregate(self, data): 73 | """ 74 | The main method for aggregation, should be implemented by a freq oracle server 75 | Args: 76 | data: item to estimate frequency of 77 | """ 78 | raise NotImplementedError("Must implement") 79 | 80 | def aggregate_all(self, data_list): 81 | """ 82 | Helper method used to aggregate a list of data 83 | Args: 84 | data_list: List of private data to aggregate 85 | """ 86 | for data in data_list: 87 | self.aggregate(data) 88 | 89 | def check_and_update_estimates(self): 90 | """ 91 | Used to check if the "cached" estimated data needs re-estimating, this occurs when new data has been aggregated since last 92 | """ 93 | if self.last_estimated < self.n: # If new data has been aggregated since the last estimation, then estimate all 94 | self.last_estimated = self.n 95 | self._update_estimates() 96 | 97 | def _update_estimates(self): 98 | """ 99 | Used internally to update estimates, should be implemented 100 | """ 101 | raise NotImplementedError("Must implement") 102 | 103 | def estimate(self, data, suppress_warnings=False): 104 | """ 105 | Calculates frequency estimate of given data item, must be implemented 106 | Args: 107 | data: data to estimate the frequency warning of 108 | suppress_warnings: Optional boolean - if true suppresses warnings 109 | """ 110 | raise NotImplementedError("Must implement") 111 | 112 | def estimate_all(self, data_list, suppress_warnings=False, normalization=0): 113 | """ 114 | Helper method, given a list of data items, returns a list of their estimated frequencies 115 | Args: 116 | data_list: list of data items to estimate 117 | suppress_warnings: If True, will suppress estimation warnings 118 | normalization: Normalisation should only be specified when estimating over the entire domain! 
119 | 0 - No Norm 120 | 1 - Additive Norm 121 | 2 - Prob Simplex 122 | 3 (or otherwise) - Threshold cut 123 | 124 | Returns: list of estimates 125 | 126 | """ 127 | self.check_and_update_estimates() 128 | 129 | estimates = np.array([self.estimate(x, suppress_warnings=suppress_warnings) for x in data_list]) 130 | 131 | if normalization == 0: # No normalisation 132 | return estimates 133 | elif normalization == 1: # Additive normalisation 134 | diff = self.n - sum(estimates[estimates > 0]) 135 | non_zero = (estimates>0).sum() 136 | 137 | for i,item in enumerate(estimates): 138 | if item > 0: 139 | estimates[i] = item + diff/non_zero 140 | else: 141 | estimates[i] = 0 142 | 143 | return estimates 144 | elif normalization == 2: # Prob Simplex 145 | proj = project_probability_simplex(estimates/self.n) 146 | return np.array(proj) * self.n 147 | else: 148 | # Threshold cut 149 | sorted_index = np.argsort((-1 * estimates)) 150 | total = 0 151 | i=0 152 | for i,index in enumerate(sorted_index): 153 | total += estimates[index] 154 | if total > self.n: 155 | break 156 | 157 | for j in range(i, len(sorted_index)): 158 | estimates[sorted_index[j]] = 0 159 | 160 | return estimates 161 | 162 | @property 163 | def get_estimates(self): 164 | """ 165 | Returns: Estimated data 166 | """ 167 | return self.estimated_data 168 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/feature_binning_param.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core import baseline_constants 2 | import copy 3 | 4 | 5 | class TransformParam: 6 | """ 7 | Define how to transfer the cols 8 | 9 | Parameters 10 | ---------- 11 | transform_cols : list of column index, default: -1 12 | Specify which columns need to be transform. If column index is None, None of columns will be transformed. 13 | If it is -1, it will use same columns as cols in binning module. 14 | 15 | transform_names: list of string, default: [] 16 | Specify which columns need to calculated. Each element in the list represent for a column name in header. 17 | 18 | 19 | transform_type: str, 'bin_num'or 'woe' or None default: 'bin_num' 20 | Specify which value these columns going to replace. 21 | 1. bin_num: Transfer original feature value to bin index in which this value belongs to. 22 | 2. woe: This is valid for guest party only. It will replace original value to its woe value 23 | 3. None: nothing will be replaced. 24 | """ 25 | 26 | def __init__(self, transform_cols=-1, transform_names=None, transform_type="bin_num"): 27 | super(TransformParam, self).__init__() 28 | self.transform_cols = transform_cols 29 | self.transform_names = transform_names 30 | self.transform_type = transform_type 31 | 32 | 33 | class OptimalBinningParam: 34 | """ 35 | Indicate optimal binning params 36 | 37 | Parameters 38 | ---------- 39 | metric_method: str, default: "iv" 40 | The algorithm metric method. Support iv, gini, ks, chi-square 41 | 42 | 43 | min_bin_pct: float, default: 0.05 44 | The minimum percentage of each bucket 45 | 46 | max_bin_pct: float, default: 1.0 47 | The maximum percentage of each bucket 48 | 49 | init_bin_nums: int, default 100 50 | Number of bins when initialize 51 | 52 | mixture: bool, default: True 53 | Whether each bucket need event and non-event records 54 | 55 | init_bucket_method: str default: quantile 56 | Init bucket methods. Accept quantile and bucket. 
57 | 58 | """ 59 | 60 | def __init__(self, metric_method='iv', min_bin_pct=0.05, max_bin_pct=1.0, 61 | init_bin_nums=1000, mixture=True, init_bucket_method='quantile'): 62 | super().__init__() 63 | self.init_bucket_method = init_bucket_method 64 | self.metric_method = metric_method 65 | self.max_bin = None 66 | self.mixture = mixture 67 | self.max_bin_pct = max_bin_pct 68 | self.min_bin_pct = min_bin_pct 69 | self.init_bin_nums = init_bin_nums 70 | self.adjustment_factor = None 71 | 72 | 73 | class FeatureBinningParam: 74 | """ 75 | Define the feature binning method 76 | 77 | Parameters 78 | ---------- 79 | method : str, 'quantile', 'bucket' or 'optimal', default: 'quantile' 80 | Binning method. 81 | 82 | compress_thres: int, default: 10000 83 | When the number of saved summaries exceed this threshold, it will call its compress function 84 | 85 | head_size: int, default: 10000 86 | The buffer size to store inserted observations. When head list reach this buffer size, the 87 | QuantileSummaries object start to generate summary(or stats) and insert into its sampled list. 88 | 89 | error: float, 0 <= error < 1 default: 0.001 90 | The error of tolerance of binning. The final split point comes from original data, and the rank 91 | of this value is close to the exact rank. More precisely, 92 | floor((p - 2 * error) * N) <= rank(x) <= ceil((p + 2 * error) * N) 93 | where p is the quantile in float, and N is total number of data. 94 | 95 | bin_num: int, bin_num > 0, default: 10 96 | The max bin number for binning 97 | 98 | bin_indexes : list of int or int, default: -1 99 | Specify which columns need to be binned. -1 represent for all columns. If you need to indicate specific 100 | cols, provide a list of header index instead of -1. 101 | 102 | bin_names : list of string, default: [] 103 | Specify which columns need to calculated. Each element in the list represent for a column name in header. 104 | 105 | adjustment_factor : float, default: 0.5 106 | the adjustment factor when calculating WOE. This is useful when there is no event or non-event in 107 | a bin. Please note that this parameter will NOT take effect for setting in host. 108 | 109 | category_indexes : list of int or int, default: [] 110 | Specify which columns are category features. -1 represent for all columns. List of int indicate a set of 111 | such features. For category features, bin_obj will take its original values as split_points and treat them 112 | as have been binned. If this is not what you expect, please do NOT put it into this parameters. 113 | 114 | The number of categories should not exceed bin_num set above. 115 | 116 | category_names : list of string, default: [] 117 | Use column names to specify category features. Each element in the list represent for a column name in header. 118 | 119 | local_only : bool, default: False 120 | Whether just provide binning method to guest party. If true, host party will do nothing. 121 | 122 | transform_param: TransformParam 123 | Define how to transfer the binned data. 124 | 125 | need_run: bool, default True 126 | Indicate if this module needed to be run 127 | 128 | skip_static: bool, default False 129 | If true, binning will not calculate iv, woe etc. In this case, optimal-binning 130 | will not be supported. 
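
    Example
    -------
    An illustrative configuration (hypothetical values, not taken from the original module):

        param = FeatureBinningParam(method='quantile', bin_num=32,
                                    bin_indexes=[0, 2, 5], category_indexes=[1])

    This would bin columns 0, 2 and 5 into at most 32 quantile-based buckets while treating
    column 1 as a categorical feature whose original values are used as split points.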
131 | 132 | """ 133 | 134 | def __init__(self, method=baseline_constants.QUANTILE, 135 | compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 136 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 137 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 138 | bin_num=baseline_constants.G_BIN_NUM, bin_indexes=-1, bin_names=None, adjustment_factor=0.5, 139 | transform_param=TransformParam(), optimal_binning_param=OptimalBinningParam(), 140 | local_only=False, category_indexes=None, category_names=None, 141 | need_run=True, skip_static=False): 142 | self.method = method 143 | self.compress_thres = compress_thres 144 | self.head_size = head_size 145 | self.error = error 146 | self.adjustment_factor = adjustment_factor 147 | self.bin_num = bin_num 148 | self.bin_indexes = bin_indexes 149 | self.bin_names = bin_names 150 | self.category_indexes = category_indexes 151 | self.category_names = category_names 152 | self.local_only = local_only 153 | self.transform_param = copy.deepcopy(transform_param) 154 | self.optimal_binning_param = copy.deepcopy(optimal_binning_param) 155 | self.need_run = need_run 156 | self.skip_static = skip_static 157 | -------------------------------------------------------------------------------- /federated_gbdt/models/gbdt/components/train_monitor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from collections import defaultdict 5 | 6 | class TrainMonitor(): 7 | def __init__(self, num_clients, num_classes=2): 8 | self.gradient_info = [] # List of tuples containing (gradient, hess) info 9 | self.leaf_gradient_tracker = [[], []] 10 | self.root_gradient_tracker = [[],[]] 11 | 12 | self.gradient_total = [0,0] 13 | self.current_tree_weights = [] 14 | self.previous_tree_weights = [] 15 | self.y_weights = [] 16 | self.batched_weights = [] 17 | 18 | self.last_feature = -1 19 | self.node_count = -1 20 | 21 | self.leaf_count_tracker = [] 22 | self.leaf_count = 0 23 | self.internal_node_count = defaultdict(int) 24 | self.internal_node_count_tracker = [] 25 | self.bin_tracker = defaultdict(int) 26 | self.tree_bin_tracker = [] 27 | 28 | self.current_tree_depth = 0 29 | 30 | self.num_classes = num_classes 31 | 32 | self.client_rounds_sent = [0] 33 | self.client_rounds_received = [0] 34 | self.client_payload_sent = [0] 35 | self.client_payload_received = [0] 36 | 37 | 38 | self.num_clients = num_clients 39 | 40 | self.client_timer = 0 41 | self.server_timer = 0 42 | 43 | self.client_total_time = [0] 44 | self.server_total_time = [0] 45 | 46 | self.client_time_dict = {"histogram building": 0, "computing gradients": 0, 'initialise private histogram': 0, "forming gradient + hess histogram": 0, 47 | "retrieving grads/hess for node": 0,} 48 | 49 | self.server_time_dict = {"initial split candidates": 0, "privacy_accountant initialisation": 0, "initialise model weights": 0, "split_candidates": 0, 50 | "pre-tree ops": 0, "post-tree ops": 0, "initialise private histogram": 0, "adding noise to gradient + hess histogram": 0, 51 | "sampling features for node": 0, "calculating internal split": 0, "updating split constraints": 0, "leaf weight": 0} 52 | 53 | def start_timing_event(self, device_type, tag=""): 54 | if device_type == "client": 55 | self.client_timer = time.time() 56 | else: 57 | self.server_timer = time.time() 58 | 59 | def end_timing_event(self, device_type, tag=""): 60 | if device_type == "client": 61 | time_elapsed = time.time() - self.client_timer 62 | self.client_total_time[-1] += 
time_elapsed 63 | self.client_time_dict[tag] += time_elapsed 64 | self.client_timer = 0 65 | else: 66 | time_elapsed = time.time() - self.server_timer 67 | self.server_total_time[-1] += time_elapsed 68 | self.server_time_dict[tag] += time_elapsed 69 | self.server_timer = 0 70 | 71 | def update_num_clients(self, num_clients): 72 | self.num_clients = num_clients 73 | 74 | self.client_rounds_received = [0] 75 | self.client_rounds_sent = [0] 76 | self.client_payload_sent = [0] 77 | self.client_payload_received = [0] 78 | 79 | def update_received(self, client_ids, payload_size): 80 | self.client_payload_received[-1] += payload_size 81 | self.client_rounds_received[-1] += 1 82 | 83 | def update_sent(self, client_ids, payload_size, increment_round=True): 84 | if len(client_ids) > 0: 85 | self.client_payload_sent[-1] += payload_size 86 | if increment_round: 87 | self.client_rounds_sent[-1] += 1 88 | 89 | def output_summary(self): 90 | print(f"\nNumber of clients {self.num_clients}") 91 | print(f"Max client rounds sent {np.max(self.client_rounds_sent)}") 92 | print(f"Avg client rounds sent {np.mean(self.client_rounds_sent)}") 93 | print(f"Total client sent {np.sum(self.client_rounds_sent)}") 94 | 95 | print(f"Max client rounds received {np.max(self.client_rounds_received)}") 96 | print(f"Avg client rounds received {np.mean(self.client_rounds_received)}") 97 | 98 | print(f"Max client sent {np.max(self.client_payload_sent) / 1024}Kb") 99 | print(f"Average client sent {np.mean(self.client_payload_sent) / 1024}Kb") 100 | print(f"Total client sent {np.sum(self.client_payload_sent) / 1024}Kb") 101 | 102 | print(f"Total leaf count {self.leaf_count_tracker}") 103 | # print(f"Total internal nodes {self.internal_node_count_tracker}") 104 | print("\n") 105 | 106 | for i, t in enumerate(self.client_total_time): 107 | print(f"Tree {i} client total time {self.client_total_time[i]}") 108 | print(f"Tree {i} server total time {self.server_total_time[i]}") 109 | 110 | print(f"Client time dict {self.client_time_dict}") 111 | print(f"Server time dict {self.server_time_dict}") 112 | 113 | def _update_comm_stats(self, split_method, training_method): 114 | # print(f"Stats before updating rounds={self.client_rounds_sent[-1]}, payload={self.client_payload_sent[-1]}") 115 | 116 | # Internal nodes 117 | if split_method != "totally_random": 118 | total = 0 119 | for level in self.bin_tracker: 120 | num_bins = self.bin_tracker[level] 121 | total += 8*2*num_bins 122 | self.client_payload_sent[-1] += total 123 | 124 | for level in self.internal_node_count: 125 | if self.internal_node_count[level] > 0: 126 | self.client_rounds_sent[-1] += 1 127 | 128 | # Leaf nodes 129 | if training_method != "batched_boosting": 130 | self.update_sent(range(0, self.num_clients), payload_size=8*2*self.leaf_count, increment_round=True) 131 | 132 | # print(f"Stats after updating rounds={self.client_rounds_sent[-1]}, payload={self.client_payload_sent[-1]}") 133 | 134 | def reset(self): 135 | # For comm tracking 136 | self.leaf_count_tracker.append(self.leaf_count) 137 | self.leaf_count = 0 138 | self.internal_node_count_tracker.append(self.internal_node_count) 139 | self.internal_node_count = defaultdict(int) 140 | self.tree_bin_tracker.append(self.bin_tracker) 141 | self.bin_tracker = defaultdict(int) 142 | 143 | self.client_rounds_sent.append(0) 144 | self.client_payload_sent.append(0) 145 | self.client_rounds_received.append(0) 146 | self.client_payload_received.append(0) 147 | 148 | self.client_timer, self.server_timer = 0,0 149 | 
self.client_total_time.append(0) 150 | self.server_total_time.append(0) 151 | 152 | self.gradient_total = [0,0] 153 | self.current_tree_depth = 0 154 | self.previous_tree_weights = self.current_tree_weights 155 | self.current_tree_weights = np.zeros(len(self.current_tree_weights)) if self.num_classes == 2 else np.zeros((len(self.current_tree_weights), self.num_classes)) 156 | 157 | def set_num_classes(self, num_classes): 158 | self.num_classes = num_classes -------------------------------------------------------------------------------- /federated_gbdt/core/pure_ldp/frequency_oracles/square_wave/sw_server.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.pure_ldp.core import FreqOracleServer 2 | import numpy as np 3 | import math 4 | import scipy 5 | import random 6 | 7 | from numba import jit 8 | 9 | class SWServer(FreqOracleServer): 10 | def __init__(self, epsilon, d=1024, d_prime=1024, smooth=True, smc=False, index_mapper=None): 11 | super().__init__(epsilon, d=None, index_mapper=index_mapper) 12 | self.smc = smc 13 | self.smooth = smooth 14 | self.d = d # Domain bins B_i, n 15 | self.d_prime = d_prime # Randomised Bins \tilde{B}_j, m 16 | self.update_params(epsilon, d=None, index_mapper=index_mapper) 17 | self.aggregated_data = [] 18 | 19 | def update_params(self, epsilon=None, d=None, index_mapper=None): 20 | super().update_params(epsilon, d, index_mapper) 21 | ee = np.exp(self.epsilon) 22 | if epsilon is not None or d is not None: 23 | self.b = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) 24 | self.p = ee / ((2 * self.b * ee) + 1) 25 | self.q = 1 / ((2 * self.b * ee) + 1) 26 | self.w = ((self.epsilon * ee) - ee + 1) / (2 * ee * (ee - 1 - self.epsilon)) * 2 27 | self.M = self.generate_M(self.d_prime, self.d) 28 | 29 | def aggregate(self, priv_data): 30 | self.aggregated_data.append(priv_data) 31 | self.n += 1 32 | 33 | def generate_M(self, m=1024, n=1024): 34 | # report matrix 35 | m_cell = (1 + self.w) / m 36 | n_cell = 1 / n 37 | 38 | transform = np.ones((m, n)) * self.q * m_cell 39 | for i in range(n): 40 | left_most_v = (i * n_cell) # For bin B_i, this is the left boundary - v_min 41 | right_most_v = ((i + 1) * n_cell) # Right boundary of B_i - v_max 42 | 43 | ll_bound = int(left_most_v / m_cell) 44 | lr_bound = int((left_most_v + self.w) / m_cell) 45 | rl_bound = int(right_most_v / m_cell) 46 | rr_bound = int((right_most_v + self.w) / m_cell) 47 | 48 | ll_v = left_most_v - self.w / 2 49 | rl_v = right_most_v - self.w / 2 50 | l_p = ((ll_bound + 1) * m_cell - self.w / 2 - ll_v) * (self.p - self.q) + self.q * m_cell 51 | r_p = ((rl_bound + 1) * m_cell - self.w / 2 - rl_v) * (self.p - self.q) + self.q * m_cell 52 | if rl_bound > ll_bound: 53 | transform[ll_bound, i] = (l_p - self.q * m_cell) * ( 54 | (ll_bound + 1) * m_cell - self.w / 2 - ll_v) / n_cell * 0.5 + self.q * m_cell 55 | transform[ll_bound + 1, i] = self.p * m_cell - (self.p * m_cell - r_p) * ( 56 | rl_v - ((ll_bound + 1) * m_cell - self.w / 2)) / n_cell * 0.5 57 | else: 58 | transform[ll_bound, i] = (l_p + r_p) / 2 59 | transform[ll_bound + 1, i] = self.p * m_cell 60 | 61 | lr_v = left_most_v + self.w / 2 62 | rr_v = right_most_v + self.w / 2 63 | r_p = (rr_v - (rr_bound * m_cell - self.w / 2)) * (self.p - self.q) + self.q * m_cell 64 | l_p = (lr_v - (lr_bound * m_cell - self.w / 2)) * (self.p - self.q) + self.q * m_cell 65 | if rr_bound > lr_bound: 66 | if rr_bound < m: 67 | transform[rr_bound, i] = (r_p - self.q * m_cell) * ( 68 | 
rr_v - (rr_bound * m_cell - self.w / 2)) / n_cell * 0.5 + self.q * m_cell 69 | 70 | transform[rr_bound - 1, i] = self.p * m_cell - (self.p * m_cell - l_p) * ( 71 | (rr_bound * m_cell - self.w / 2) - lr_v) / n_cell * 0.5 72 | 73 | else: 74 | transform[rr_bound, i] = (l_p + r_p) / 2 75 | transform[rr_bound - 1, i] = self.p * m_cell 76 | 77 | if rr_bound - 1 > ll_bound + 2: 78 | transform[ll_bound + 2: rr_bound - 1, i] = self.p * m_cell 79 | 80 | return transform 81 | 82 | def difference_intervals(self, I1, I2): 83 | a_start, a_end = I1 84 | b_start, b_end = I2 85 | return min(abs(a_start - b_start), abs(a_start - b_end), abs(a_end - b_start), abs(a_end - b_end)), max( 86 | abs(a_start - b_start), abs(a_start - b_end), abs(a_end - b_start), abs(a_end - b_end)) 87 | 88 | def EMS(self, priv_hist, iterations, threshold, smooth=False): 89 | if smooth: 90 | # smoothing matrix 91 | smoothing_factor = 2 92 | binomial_tmp = [scipy.special.binom(smoothing_factor, k) for k in range(smoothing_factor + 1)] 93 | smoothing_matrix = np.zeros((self.d, self.d)) 94 | central_idx = int(len(binomial_tmp) / 2) 95 | for i in range(int(smoothing_factor / 2)): 96 | smoothing_matrix[i, : central_idx + i + 1] = binomial_tmp[central_idx - i:] 97 | for i in range(int(smoothing_factor / 2), self.d - int(smoothing_factor / 2)): 98 | smoothing_matrix[i, i - central_idx: i + central_idx + 1] = binomial_tmp 99 | for i in range(self.d - int(smoothing_factor / 2), self.d): 100 | remain = self.d - i - 1 101 | smoothing_matrix[i, i - central_idx + 1:] = binomial_tmp[: central_idx + remain] 102 | row_sum = np.sum(smoothing_matrix, axis=1) 103 | smoothing_matrix = (smoothing_matrix.T / row_sum).T 104 | 105 | # EMS 106 | theta = np.ones(self.d) / float(self.d) 107 | theta_old = np.zeros(self.d) 108 | r = 0 109 | sample_size = sum(priv_hist) 110 | old_logliklihood = 0 111 | 112 | while np.linalg.norm(theta_old - theta, ord=1) > 1 / sample_size and r < iterations: 113 | theta_old = np.copy(theta) 114 | X_condition = np.matmul(self.M, theta_old) 115 | 116 | TMP = self.M.T / X_condition 117 | 118 | P = np.copy(np.matmul(TMP, priv_hist)) 119 | P = P * theta_old 120 | 121 | theta = np.copy(P / sum(P)) 122 | 123 | # Smoothing step 124 | if smooth: 125 | theta = np.matmul(smoothing_matrix, theta) 126 | theta = theta / sum(theta) 127 | 128 | logliklihood = np.inner(priv_hist, np.log(np.matmul(self.M, theta))) 129 | imporve = logliklihood - old_logliklihood 130 | 131 | if r > 1 and abs(imporve) < threshold: 132 | # print("stop when", imporve / old_logliklihood, loglikelihood_threshold) 133 | break 134 | 135 | old_logliklihood = logliklihood 136 | 137 | r += 1 138 | return theta 139 | 140 | def g_density(self, v_prime, v): 141 | out = np.zeros(shape=v_prime.shape) 142 | out.fill(self.q) 143 | p_indexes = np.abs(v - v_prime) < self.b 144 | out[p_indexes] = self.p 145 | return out 146 | 147 | def _update_estimates(self): 148 | histogram, _ = np.histogram(self.aggregated_data, bins=self.d_prime, range=(-self.b, 1 + self.b)) 149 | self.estimated_density = self.EMS(histogram, 100, 1e-3, self.smooth) 150 | 151 | return self.estimated_density 152 | 153 | def estimate(self, data, suppress_warnings=False): 154 | self.check_and_update_estimates() 155 | return self.estimated_density[data] 156 | 157 | def estimate_all(self, data_list, suppress_warnings=False): 158 | return [self.estimate(item) for item in data_list] 159 | 160 | def estimate_density(self, N=None, suppress_warnings=False): 161 | self.check_and_update_estimates() 162 | return 
self.estimated_density 163 | -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/joint_exp.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """JointExp method for computing multiple dp quantiles.""" 17 | 18 | import numpy as np 19 | from numpy.fft import irfft 20 | from numpy.fft import rfft 21 | from scipy import special 22 | 23 | import federated_gbdt.core.dp_multiq.ind_exp as ind_exp 24 | 25 | 26 | def compute_intervals(sorted_data, data_low, data_high): 27 | """Returns array of intervals of adjacent points. 28 | 29 | Args: 30 | sorted_data: Nondecreasing array of data points, all in the [data_low, 31 | data_high] range. 32 | data_low: Lower bound for data. 33 | data_high: Upper bound for data. 34 | 35 | Returns: 36 | An array of intervals of adjacent points from [data_low, data_high] in 37 | nondecreasing order. For example, if sorted_data = [0,1,1,2,3], 38 | data_low = 0, and data_high = 4, returns 39 | [[0, 0], [0, 1], [1, 1], [1, 2], [2, 3], [3, 4]]. 40 | """ 41 | return np.block([[data_low, sorted_data], [sorted_data, 42 | data_high]]).transpose() 43 | 44 | 45 | def compute_log_phi(data_intervals, qs, eps, swap): 46 | """Computes two-dimensional array log_phi. 47 | 48 | Args: 49 | data_intervals: Array of intervals of adjacent points from 50 | compute_intervals. 51 | qs: Increasing array of quantiles in [0,1]. 52 | eps: Privacy parameter epsilon. 53 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 54 | 55 | Returns: 56 | Array log_phi where log_phi[i-i',j] = log(phi(i, i', j)). 57 | """ 58 | num_data_intervals = len(data_intervals) 59 | original_data_size = num_data_intervals - 1 60 | if swap: 61 | sensitivity = 2.0 62 | else: 63 | if len(qs) == 1: 64 | sensitivity = 2.0 * (1 - min(qs[0], 1 - qs[0])) 65 | else: 66 | sensitivity = 2.0 * (1 - min(qs[0], np.min(qs[1:] - qs[:-1]), 1 - qs[-1])) 67 | eps_term = -(eps / (2.0 * sensitivity)) 68 | gaps = np.arange(num_data_intervals) 69 | target_ns = (np.block([qs, 1]) - np.block([0, qs])) * original_data_size 70 | return eps_term * np.abs(gaps.reshape(-1, 1) - target_ns) 71 | 72 | 73 | def logdotexp_toeplitz_lt(c, x): 74 | """Multiplies a log-space vector by a lower triangular Toeplitz matrix. 75 | 76 | Args: 77 | c: First column of the Toeplitz matrix (in log space). 78 | x: Vector to be multiplied (in log space). 79 | 80 | Returns: 81 | Let T denote the lower triangular Toeplitz matrix whose first column is 82 | given by exp(c); then the vector returned by this function is log(T * 83 | exp(x)). The multiplication is done using FFTs for efficiency, and care is 84 | taken to avoid overflow during exponentiation. 
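    (Concretely, the maxima of c and x are factored out before exponentiating, so the FFT
    multiplication only sees values at most 1, and the factored-out maxima are added back to the
    log-space result at the end.)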
85 | """ 86 | max_c, max_x = np.max(c), np.max(x) 87 | exp_c, exp_x = c - max_c, x - max_x 88 | np.exp(exp_c, out=exp_c) 89 | np.exp(exp_x, out=exp_x) 90 | n = len(x) 91 | # Choose the next power of two. 92 | p = np.power(2, np.ceil(np.log2(2 * n - 1))).astype(int) 93 | fft_exp_c = rfft(exp_c, n=p) 94 | fft_exp_x = rfft(exp_x, n=p) 95 | y = irfft(fft_exp_c * fft_exp_x)[:n] 96 | np.maximum(0, y, out=y) 97 | np.log(y, out=y) 98 | y += max_c + max_x 99 | return y 100 | 101 | 102 | def compute_log_alpha(data_intervals, log_phi, qs): 103 | """Computes three-dimensional array log_alpha. 104 | 105 | Args: 106 | data_intervals: Array of intervals of adjacent points from 107 | compute_intervals. 108 | log_phi: Array from compute_log_phi. 109 | qs: Increasing array of quantiles in (0,1). 110 | 111 | Returns: 112 | Array log_alpha[a, b, c] where a and c index over quantiles and b represents 113 | interval repeats. 114 | """ 115 | num_intervals = len(data_intervals) 116 | num_quantiles = len(qs) 117 | data_intervals_log_sizes = np.log(data_intervals[:, 1] - data_intervals[:, 0]) 118 | log_alpha = np.log(np.zeros([num_quantiles, num_intervals, num_quantiles])) 119 | log_alpha[0, :, 0] = log_phi[:, 0] + data_intervals_log_sizes 120 | # A handy mask for log_phi. 121 | disallow_repeat = np.zeros(num_intervals) 122 | disallow_repeat[0] = -np.inf 123 | for j in range(1, num_quantiles): 124 | log_hat_alpha = special.logsumexp(log_alpha[j - 1, :, :], axis=1) 125 | log_alpha[j, :, 0] = data_intervals_log_sizes + logdotexp_toeplitz_lt( 126 | log_phi[:, j] + disallow_repeat, log_hat_alpha) 127 | log_alpha[j, 0, 0] = -np.inf # Correct possible numerical error. 128 | log_alpha[j, :, 1:j+1] = \ 129 | (log_phi[0, j] + data_intervals_log_sizes)[:, np.newaxis] \ 130 | + log_alpha[j-1, :, 0:j] - np.log(np.arange(1, j+1) + 1) 131 | return log_alpha 132 | 133 | 134 | def sample_joint_exp(log_alpha, data_intervals, log_phi, qs): 135 | """Given log_alpha and log_phi, samples final quantile estimates. 136 | 137 | Args: 138 | log_alpha: Array from compute_log_alpha. 139 | data_intervals: Array of intervals of adjacent points from 140 | compute_intervals. 141 | log_phi: Array from compute_log_phi. 142 | qs: Increasing array of quantiles in (0,1). 143 | 144 | Returns: 145 | Array outputs where outputs[i] is the quantile estimate corresponding to 146 | quantile q[i]. 147 | """ 148 | num_intervals = len(data_intervals) 149 | num_quantiles = len(qs) 150 | outputs = np.zeros(num_quantiles) 151 | last_i = num_intervals - 1 152 | j = num_quantiles - 1 153 | repeats = 0 154 | while j >= 0: 155 | log_dist = log_alpha[j, :last_i + 1, :] + log_phi[:last_i + 1, 156 | j + 1][::-1, np.newaxis] 157 | # Prevent repeats unless it's the first round. 158 | if j < num_quantiles - 1: 159 | log_dist[last_i, :] = -np.inf 160 | i, k = np.unravel_index( 161 | ind_exp.racing_sample(log_dist), [last_i + 1, num_quantiles]) 162 | repeats += k 163 | k += 1 164 | for j2 in range(j - k + 1, j + 1): 165 | outputs[j2] = np.random.uniform(data_intervals[i, 0], data_intervals[i, 166 | 1]) 167 | j -= k 168 | last_i = i 169 | return np.sort(outputs) 170 | 171 | 172 | def joint_exp(sorted_data, data_low, data_high, qs, eps, swap): 173 | """Computes eps-differentially private quantile estimates for qs. 174 | 175 | Args: 176 | sorted_data: Array of data points sorted in increasing order. 177 | data_low: Lower bound for data. 178 | data_high: Upper bound for data. 179 | qs: Increasing array of quantiles in (0,1). 180 | eps: Privacy parameter epsilon. 
181 | swap: If true, uses swap dp sensitivity, otherwise uses add-remove. 182 | 183 | Returns: 184 | Array o where o[i] is the quantile estimate corresponding to quantile q[i]. 185 | """ 186 | clipped_data = np.clip(sorted_data, data_low, data_high) 187 | data_intervals = compute_intervals(clipped_data, data_low, data_high) 188 | log_phi = compute_log_phi(data_intervals, qs, eps, swap) 189 | log_alpha = compute_log_alpha(data_intervals, log_phi, qs) 190 | return sample_joint_exp(log_alpha, data_intervals, log_phi, qs) 191 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/bin_result.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BinColResults(object): 5 | def __init__(self, woe_array=(), iv_array=(), event_count_array=(), non_event_count_array=(), 6 | event_rate_array=(), non_event_rate_array=(), iv=None): 7 | self.woe_array = list(woe_array) 8 | self.iv_array = list(iv_array) 9 | self.event_count_array = list(event_count_array) 10 | self.non_event_count_array = list(non_event_count_array) 11 | self.event_rate_array = list(event_rate_array) 12 | self.non_event_rate_array = list(non_event_rate_array) 13 | self.split_points = None 14 | if iv is None: 15 | iv = 0 16 | for idx, woe in enumerate(self.woe_array): 17 | non_event_rate = non_event_count_array[idx] 18 | event_rate = event_rate_array[idx] 19 | iv += (non_event_rate - event_rate) * woe 20 | self.iv = iv 21 | self._bin_anonymous = None 22 | 23 | # @property 24 | # def bin_anonymous(self): 25 | # if self.split_points is None or len(self.split_points) == 0: 26 | # return [] 27 | # if self._bin_anonymous is None: 28 | # return ["bin_" + str(i) for i in range(len(self.split_points))] 29 | # return self._bin_anonymous 30 | # 31 | # @bin_anonymous.setter 32 | # def bin_anonymous(self, x): 33 | # self._bin_anonymous = x 34 | # 35 | def set_split_points(self, split_points): 36 | self.split_points = split_points 37 | 38 | def get_split_points(self): 39 | return np.array(self.split_points) 40 | # 41 | # @property 42 | # def is_woe_monotonic(self): 43 | # """ 44 | # Check the woe is monotonic or not 45 | # """ 46 | # woe_array = self.woe_array 47 | # if len(woe_array) <= 1: 48 | # return True 49 | # 50 | # is_increasing = all(x <= y for x, y in zip(woe_array, woe_array[1:])) 51 | # is_decreasing = all(x >= y for x, y in zip(woe_array, woe_array[1:])) 52 | # return is_increasing or is_decreasing 53 | # 54 | # @property 55 | # def bin_nums(self): 56 | # return len(self.woe_array) 57 | # 58 | # def result_dict(self): 59 | # save_dict = self.__dict__ 60 | # save_dict['is_woe_monotonic'] = self.is_woe_monotonic 61 | # save_dict['bin_nums'] = self.bin_nums 62 | # return save_dict 63 | # 64 | # def reconstruct(self, iv_obj): 65 | # self.woe_array = list(iv_obj.woe_array) 66 | # self.iv_array = list(iv_obj.iv_array) 67 | # self.event_count_array = list(iv_obj.event_count_array) 68 | # self.non_event_count_array = list(iv_obj.non_event_count_array) 69 | # self.event_rate_array = list(iv_obj.event_rate_array) 70 | # self.non_event_rate_array = list(iv_obj.non_event_rate_array) 71 | # self.split_points = list(iv_obj.split_points) 72 | # self.iv = iv_obj.iv 73 | # 74 | # def generate_pb(self): 75 | # result = feature_binning_param_pb2.IVParam(woe_array=self.woe_array, 76 | # iv_array=self.iv_array, 77 | # event_count_array=self.event_count_array, 78 | # non_event_count_array=self.non_event_count_array, 79 | # 
event_rate_array=self.event_rate_array, 80 | # non_event_rate_array=self.non_event_rate_array, 81 | # split_points=self.split_points, 82 | # iv=self.iv, 83 | # is_woe_monotonic=self.is_woe_monotonic, 84 | # bin_nums=self.bin_nums, 85 | # bin_anonymous=self.bin_anonymous) 86 | # return result 87 | 88 | 89 | class BinResults(object): 90 | def __init__(self): 91 | self.all_cols_results = {} # {col_name: BinColResult} 92 | self.role = '' 93 | self.party_id = '' 94 | 95 | # def set_role_party(self, role, party_id): 96 | # self.role = role 97 | # self.party_id = party_id 98 | # 99 | # def put_col_results(self, col_name, col_results: BinColResults): 100 | # ori_col_results = self.all_cols_results.get(col_name) 101 | # if ori_col_results is not None: 102 | # col_results.set_split_points(ori_col_results.get_split_points()) 103 | # self.all_cols_results[col_name] = col_results 104 | # 105 | def put_col_split_points(self, col_name, split_points): 106 | col_results = self.all_cols_results.get(col_name, BinColResults()) 107 | col_results.set_split_points(split_points) 108 | self.all_cols_results[col_name] = col_results 109 | 110 | # def query_split_points(self, col_name): 111 | # col_results = self.all_cols_results.get(col_name) 112 | # if col_results is None: 113 | # LOGGER.warning("Querying non-exist split_points") 114 | # return None 115 | # return col_results.split_points 116 | 117 | @property 118 | def all_split_points(self): 119 | results = {} 120 | for col_name, col_result in self.all_cols_results.items(): 121 | results[col_name] = col_result.get_split_points() 122 | return results 123 | # 124 | # @property 125 | # def all_ivs(self): 126 | # return [(col_name, x.iv) for col_name, x in self.all_cols_results.items()] 127 | # 128 | # @property 129 | # def all_woes(self): 130 | # return {col_name: x.woe_array for col_name, x in self.all_cols_results.items()} 131 | # 132 | # @property 133 | # def all_monotonic(self): 134 | # return {col_name: x.is_woe_monotonic for col_name, x in self.all_cols_results.items()} 135 | # 136 | # def summary(self): 137 | # return {"iv": self.all_ivs, 138 | # "woe": self.all_woes, 139 | # "monotonic": self.all_monotonic} 140 | 141 | def get_split_points_array(self, bin_names): 142 | split_points_result = [] 143 | for bin_name in bin_names: 144 | if bin_name not in self.all_cols_results: 145 | continue 146 | split_points_result.append(self.all_cols_results[bin_name].get_split_points()) 147 | return np.array(split_points_result) 148 | 149 | # def generated_pb(self): 150 | # col_result_dict = {} 151 | # for col_name, col_bin_result in self.all_cols_results.items(): 152 | # col_result_dict[col_name] = col_bin_result.generate_pb() 153 | # LOGGER.debug("In generated_pb, role: {}, party_id: {}".format(self.role, self.party_id)) 154 | # result_pb = feature_binning_param_pb2.FeatureBinningResult(binning_result=col_result_dict, 155 | # role=self.role, 156 | # party_id=str(self.party_id)) 157 | # return result_pb 158 | # 159 | # def reconstruct(self, result_pb): 160 | # self.role = result_pb.role 161 | # self.party_id = result_pb.party_id 162 | # binning_result = dict(result_pb.binning_result) 163 | # for col_name, col_bin_result in binning_result.items(): 164 | # col_bin_obj = BinColResults() 165 | # col_bin_obj.reconstruct(col_bin_result) 166 | # self.all_cols_results[col_name] = col_bin_obj 167 | # return self -------------------------------------------------------------------------------- /federated_gbdt/core/dp_multiq/csmooth.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """CDP smooth sensitivity method for computing differentially private quantiles. 17 | 18 | The smooth sensitivity method is described in 19 | "Smooth Sensitivity and Sampling in Private Data Analysis" by Nissim, 20 | Raskhodnikova, and Smith 21 | (https://cs-people.bu.edu/ads22/pubs/NRS07/NRS07-full-draft-v1.pdf). Details for 22 | the CDP noise distribution appear in Section 3.1 of "Average-Case Averages: 23 | Private Algorithms for Smooth Sensitivity and Mean Estimation" by Bun and 24 | Steinke (NeurIPS 2019). Details for optimizing t, s, and sigma appear in 25 | Section 3.1.1 of the same paper. 26 | """ 27 | 28 | import numpy as np 29 | 30 | from dp_multiq import base 31 | from dp_multiq import smooth_utils 32 | 33 | 34 | def compute_triples(eps, ts): 35 | """Returns triples of form (t, log(s), sigma) for hyperparameter optimization. 36 | 37 | Args: 38 | eps: Privacy parameter epsilon. 39 | ts: Array of possible smooth sensitivity parameters. 40 | """ 41 | triples = np.empty([len(ts), 3]) 42 | for t_idx in range(len(ts)): 43 | t = ts[t_idx] 44 | triples[t_idx, 0] = t 45 | sigma = opt_sigma(eps, t) 46 | triples[t_idx, 2] = sigma 47 | triples[t_idx, 1] = -1.5 * (sigma**2) + np.log(eps - (t / sigma)) 48 | return triples 49 | 50 | 51 | def opt_sigma(eps, t): 52 | """Returns optimal sigma as detailed in Section 3.1.1 of Bun and Steinke. 53 | 54 | Args: 55 | eps: Privacy parameter epsilon. 56 | t: Smooth sensitivity parameter. 57 | """ 58 | return np.real(np.roots([5 * eps / t, -5, 0, -1])[0]) 59 | 60 | 61 | def lln(sigma): 62 | """Returns a sample from the Laplace Log-Normal distribution. 63 | 64 | Args: 65 | sigma: Sigma parameter for the Laplace Log-Normal distribution. 66 | """ 67 | return np.random.laplace() * np.exp(sigma * np.random.normal()) 68 | 69 | 70 | def csmooth(sorted_data, data_low, data_high, qs, divided_eps, ts): 71 | """Returns eps^2/2-CDP quantile estimates for qs. 72 | 73 | Args: 74 | sorted_data: Array of data points sorted in increasing order. 75 | data_low: Lower limit for any differentially private quantile output value. 76 | data_high: Upper limit for any differentially private quantile output value. 77 | qs: Increasing array of quantiles in [0,1]. 78 | divided_eps: Privacy parameter epsilon. Assumes eps has already been divided 79 | so that the overall desired privacy guarantee is achieved. 80 | ts: Array of smooth sensitivity parameters, one for each q in qs. 
81 | """ 82 | sorted_data = np.clip(sorted_data, data_low, data_high) 83 | o = np.empty(len(qs)) 84 | triples = compute_triples(divided_eps, ts) 85 | for i in range(len(qs)): 86 | t, log_s, sigma = triples[i] 87 | true_quantile_idx = base.quantile_index(len(sorted_data), qs[i]) 88 | true_quantile_value = sorted_data[true_quantile_idx] 89 | laplace_log_normal_noise = lln(sigma) 90 | log_sensitivity = smooth_utils.compute_log_smooth_sensitivity( 91 | sorted_data, data_low, data_high, true_quantile_idx, t) 92 | noise = np.sign(laplace_log_normal_noise) * np.exp( 93 | log_sensitivity + np.log(np.abs(laplace_log_normal_noise)) - log_s) 94 | o[i] = true_quantile_value + noise 95 | o = np.clip(o, data_low, data_high) 96 | return np.sort(o) 97 | 98 | 99 | def log_choose_triple_idx(triples, eps, log_sensitivities): 100 | """Returns triple (t, log_s, sigma) that minimizes noisy statistic variance. 101 | 102 | Args: 103 | triples: Array with entries of form (t, log_s, sigma). 104 | eps: Privacy parameter epsilon. 105 | log_sensitivities: Log(t smooth sensitivity) for each t in triples. 106 | """ 107 | variances = np.empty(len(triples)) 108 | for triple_idx in range(len(triples)): 109 | numerator = 2 * (np.exp(2 * log_sensitivities[triple_idx])) 110 | denominator = np.exp(-5 * (triples[triple_idx][2]**2)) * ( 111 | (eps - (triples[triple_idx][0] / triples[triple_idx][2]))**2) 112 | variances[triple_idx] = numerator / denominator 113 | return np.argmin(variances) 114 | 115 | 116 | def csmooth_tune_and_return_ts(sorted_data, data_low, data_high, qs, 117 | divided_eps, log_t_low, log_t_high, num_t): 118 | """Returns ts minimizing variance for data and each q under ~eps^2/2-CDP. 119 | 120 | Args: 121 | sorted_data: Array of data points sorted in increasing order. 122 | data_low: Lower limit for any differentially private quantile output value. 123 | data_high: Upper limit for any differentially private quantile output value. 124 | qs: Increasing array of quantiles in [0,1]. 125 | divided_eps: Privacy parameter epsilon. Assumes eps has already been divided 126 | so that the overall desired privacy guarantee is achieved. 127 | log_t_low: Tuning range for t has lower bound 10^(log_t_low). 128 | log_t_high: Tuning range for t has upper bound 10^(log_t_high). 129 | num_t: Number of logarithmically spaced t used to populate tuning range. 130 | """ 131 | sorted_data = np.clip(sorted_data, data_low, data_high) 132 | triples = compute_triples(divided_eps, 133 | np.logspace(log_t_low, log_t_high, num_t)) 134 | num_qs = len(qs) 135 | ts = np.empty(num_qs) 136 | for i in range(num_qs): 137 | true_quantile_idx = base.quantile_index(len(sorted_data), qs[i]) 138 | log_sensitivities = np.zeros(len(triples)) 139 | for triple_idx in range(len(triples)): 140 | t = triples[triple_idx, 0] 141 | log_sensitivities[ 142 | triple_idx] = smooth_utils.compute_log_smooth_sensitivity( 143 | sorted_data, data_low, data_high, true_quantile_idx, t) 144 | ts[i] = triples[log_choose_triple_idx(triples, divided_eps, 145 | log_sensitivities)][0] 146 | return ts 147 | 148 | 149 | def csmooth_tune_t_experiment(eps, num_samples, num_trials, num_quantiles_range, 150 | data_low, data_high, log_t_low, log_t_high, 151 | num_t): 152 | """Returns 2-D array of ts, tuned for each (num_quantiles, quantile) pair. 153 | 154 | Args: 155 | eps: Privacy parameter epsilon. 156 | num_samples: Number of standard Gaussian samples to draw for each trial. 157 | num_trials: Number of trials to average. 
158 | num_quantiles_range: Array of number of quantiles to estimate. 159 | data_low: Lower bound for data, used by CSmooth. 160 | data_high: Upper bound for data, used by CSmooth. 161 | log_t_low: Tuning range for t has lower bound 10^(log_t_low). 162 | log_t_high: Tuning range for t has upper bound 10^(log_t_high). 163 | num_t: Number of logarithmically spaced t used to populate tuning range. 164 | """ 165 | ts = [np.zeros(num_quantiles) for num_quantiles in num_quantiles_range] 166 | num_quantiles_idx = 0 167 | for num_quantiles_idx in range(len(num_quantiles_range)): 168 | num_quantiles = num_quantiles_range[num_quantiles_idx] 169 | divided_eps = eps / np.sqrt(num_quantiles) 170 | for _ in range(num_trials): 171 | sorted_data = base.gen_gaussian(num_samples, 0, 1) 172 | qs = np.linspace(0, 1, num_quantiles + 2)[1:-1] 173 | ts[num_quantiles_idx] += csmooth_tune_and_return_ts( 174 | sorted_data, data_low, data_high, qs, divided_eps, log_t_low, 175 | log_t_high, num_t) / num_trials 176 | print("Finished num_quantiles: {}".format(num_quantiles)) 177 | return ts 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code for "Federated Boosted Decision Trees with Differential Privacy" 2 | 3 | This repository contains code for the ACM CCS'22 paper ["Federated Boosted Decision Trees with Differential Privacy"](https://arxiv.org/abs/2210.02910) 4 | ## Reference 5 | 6 | If the code and/or paper contained in this repository were useful to you please consider citing this [work](https://arxiv.org/abs/2210.02910): 7 | ```bibtex 8 | @inproceedings{maddock2022federated, 9 | author = {Maddock, Samuel and Cormode, Graham and Wang, Tianhao and Maple, Carsten and Jha, Somesh}, 10 | title = {Federated Boosted Decision Trees with Differential Privacy}, 11 | year = {2022}, 12 | isbn = {9781450394505}, 13 | publisher = {Association for Computing Machinery}, 14 | booktitle = {Proceedings of the 2022 ACM SIGSAC Conference on Computer and Communications Security}, 15 | address = {New York, NY, USA}, url = {https://doi.org/10.1145/3548606.3560687}, doi = {10.1145/3548606.3560687}, 16 | pages = {2249–2263}, 17 | location = {Los Angeles, CA, USA}, series = {CCS '22} 18 | } 19 | ``` 20 | 21 | ## Installation 22 | 23 | The simplest way to install an environment for this repo is to use conda and `pip install -r ./requirements.txt` 24 | ```commandline 25 | conda create -n "fedxgb" python=3.9 26 | conda activate fedxgb 27 | pip install -r ./requirements.txt 28 | ``` 29 | 30 | Alternatively `pip install` the required libraries 31 | 32 | ```commandline 33 | pip install pandas seaborn matplotlib scikit-learn numpy xgboost xxhash bitarray scipy numba statsmodels six progressbar autodp fast_histogram notebook pmlb 34 | ``` 35 | 36 | ### Datasets 37 | 38 | Datasets need to be downloaded and placed in the `data` directory in the root of the repo. 
We use the following datasets in our experiments: 39 | * [Credit 1](https://www.kaggle.com/competitions/GiveMeSomeCredit/data?select=cs-training.csv) - should be placed under `data/Kaggle Credit 1/credit1-training.csv` 40 | * [Credit 2](https://www.kaggle.com/datasets/uciml/default-of-credit-card-clients-dataset) - should be placed under `data/UCI Credit 2/UCI_Credit_Card.csv` 41 | * [Adult](https://archive.ics.uci.edu/ml/datasets/adult) - should be placed under `data/UCI Adult/adult.data` 42 | * [Bank](https://archive.ics.uci.edu/ml/datasets/bank+marketing) - should be placed under `data/UCI Bank Marketing/bank-full.csv` 43 | * [Nomao](https://archive.ics.uci.edu/ml/datasets/Nomao) - should be placed under `data/UCI Nomao/Nomao.data` 44 | * [Higgs](https://archive.ics.uci.edu/ml/datasets/HIGGS) - The Higgs dataset should be subsampled to n=200,000 samples and placed under `data/UCI Higgs/higgs-200k.csv` 45 | 46 | 47 | ## Outline 48 | 49 | The code is split into two components 50 | * `experiments` - Main code for running and plotting experiments 51 | * `federated_gbdt` - Main code for the private GBDT model 52 | 53 | In order to generate plots and tables as in the paper see "Paper Experiments, Plots and Tables" 54 | 55 | In order to replicate the main figures in the paper from scratch see "Replication Instructions" 56 | 57 | ## Framework 58 | 59 | The code structure of `federated_gbdt` is as follows 60 | * `core` 61 | * `binning`- Contains quantile sketching code from the FEVERLESS implementation 62 | * `dp_multiq` - Central DP quantiles (not used in the paper) 63 | * `moments_accountant` - TensorFlow Privacy Moments Accountant (RDP) 64 | * `pure_ldp` - LDP protocols (not used in paper) 65 | * `baseline_constants.py` - Contains constants for the FEVERLESS implementation of quantile sketching 66 | * `loss_functions.py` - Contains loss functions used in the GBDT algorithm 67 | * `plotting.py` - Debugging code 68 | * `models` 69 | * `base` 70 | * `jit_functions.py` - Numba functions for computing GBDT quantities needed for training (split scores and weights) 71 | * `tree_node.py` - Contains the `DecisionNode` class reworked from the FEVERLESS implementation 72 | * `tree_base.py` - Base tree implementation 73 | * `gbdt` 74 | * `components` 75 | * `index_sampler.py`- Contains the `IndexSampler` class for managing which features/observations a tree uses during training 76 | * `privacy_accountant.py` - Contains the `PrivacyAccountant` class for managing DP during training of a `PrivateGBDT` 77 | * `split_candidate_manager.py` - Manages the various methods used to propose split candidates 78 | * `train_monitor.py` - Monitors various training statistics of a `PrivateGBDT` model 79 | * `private_gbdt.py` - Contains the main model `PrivateGBDT` class 80 | 81 | ## Paper Experiments, Plots and Table 82 | 83 | All experiments were run with 15 iterations in total (3 iterations over 5 different train-test sets). 
Code for running experiments is in `experiments/paper_experiments/paper_experiments.py` and plotting in `experiments/paper_experiments/paper_plotter.py` 84 | 85 | ### Running paper experiments 86 | The following methods in `paper_experiments.py` correspond to the following figures/tables in the paper: 87 | * `dp_split_methods_with_update_methods` - Corresponds to Figure 1 (a,b,c), Table 2 in main text, Figures 7-10 and Tables 7-10 in the Appendix 88 | * `dp_split_candidate_methods` - Corresponds to Figure 2 (a,b,c) and Table 3 in the main text, Figures 11 and 12 in the Appendix 89 | * `feature_interaction_experiments` - Corresponds to Figure 3 90 | * `dp_ebm_experiment` - Corresponds to Figure 4 91 | * `batched_boosting` - Corresponds to Figure 5, Table 4 in the main text, Figure 13 in the Appendix 92 | * `comparisons_experiment` - Corresponds to Figure 6 in the main text and Figures 14-18 in the Appendix 93 | 94 | 95 | ### Generating paper plots 96 | 97 | Paper figures are already generated and present in `experiments/paper_experiments/paper_plots`. 98 | 99 | To recreate paper plots, download the paper results from [here](https://drive.google.com/file/d/1u7BFhEP7e2sqxfr3vAd92hrOJaV_sZI7/view?usp=sharing) and place them in `experiments/paper_experiments/paper_results/` 100 | 101 | The following methods in `paper_plotter.py` can be used to plot results: 102 | * `plot_split_methods_with_update` - Figure 1(a,b,c) 103 | * `plot_split_candidates` - Figure 2(a,b,c) 104 | * `plot_k_way` - Figure 3 105 | * `plot_ebm_comparisons` - Figure 4 106 | * `plot_low_eps_bb` - Figure 5 107 | * `plot_comparisons` - Figure 6 108 | * `table_split_methods_with_update` - Table 2 109 | * `table_split_candidate` - Table 3 110 | * `table_low_eps_bb` - Table 4 111 | 112 | Plots and tables for the Appendix can be recreated via the following (although they are already present in `paper_plots`): 113 | * `appendix_E1` - Figures 7,8,9,10 114 | * `appendix_E1_table` - Tables 7,8,9,10 115 | * `appendix_E2` - Figure 12 116 | * `appendix_E3` - Not used 117 | * `appendix_E4` - Figure 13 118 | * `appendix_E5` - Figures 14, 15, 16, 17, 18 119 | 120 | ## Replication Instructions 121 | 122 | As all experiments in the paper are repeated over 15 iterations, they are usually too slow to replicate within a reasonable amount of time. 123 | Instead, to approximately replicate an experiment from scratch, additional code is provided in `experiments/replication_experiments`. 124 | 125 | Most replication experiments have been designed to run on the Credit 1 dataset in ~30 minutes, depending on the device. Most run on a single test-train seed over 3 iterations. 126 | 127 | The `experiments/replication_experiments` folder already contains data and replication figures for all 6 figures presented in the main paper.
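As a rough sketch of how a replication run might be driven (the constructor and the exact `replicate` arguments below are assumptions; see `experiments/replication_experiments/experiment_replicator.py` for the real interface):

```python
# Hypothetical driver -- argument names are illustrative only; check
# experiment_replicator.py for the actual signature of ExperimentReplicator.replicate
from experiments.replication_experiments.experiment_replicator import ExperimentReplicator

replicator = ExperimentReplicator()
replicator.replicate(figure="fig1", dataset="Credit 1")  # e.g. replicate Figure 1 on Credit 1
```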
128 | You can also generate appendix figures by changing the dataset that is passed to `ExperimentReplicator.replicate` 129 | 130 | 131 | Benchmark replication times performed on a Macbook Air M1: 132 | * Fig 1(a,b,c): ~30 mins 133 | * Fig 2(a,b,c): ~45 mins 134 | * Fig 3: ~20 mins 135 | * Fig 4: ~10 mins 136 | * Fig 5: ~15 mins 137 | * Fig 6: ~25 mins 138 | 139 | ## Acknowledgements 140 | 141 | * Part of the tree structure implementation is based on the public implementation of the FEVERLESS [paper](https://paperswithcode.com/paper/feverless-fast-and-secure-vertical-federated) with code repo [here](https://github.com/feverless111/vfl/blob/0c0bae50c37c193938e59a95c67fa62b43e43e8e/FEVERLESS/models/vertical/tree/xgboost/centralized_xgboost.py) 142 | * We make extensive use of the [autodp](https://github.com/yuxiangw/autodp) library by Yu-Xiang Wang to verify privacy accounting 143 | * Part of our privacy accountant uses the RDP moments accountant implemented in [TensorFlow Privacy](https://github.com/tensorflow/privacy) 144 | * Although not used in our paper, the code supports using datasets from the [Penn Machine Learning Benchmarks (PMLB)](https://epistasislab.github.io/pmlb/) 145 | -------------------------------------------------------------------------------- /federated_gbdt/core/binning/quantile_summaries.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core import baseline_constants 2 | import math 3 | 4 | 5 | class Stats(object): 6 | def __init__(self, value, g: int, delta: int): 7 | self.value = value 8 | self.g = g 9 | self.delta = delta 10 | 11 | 12 | class QuantileSummaries(object): 13 | def __init__(self, compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 14 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 15 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 16 | abnormal_list=None): 17 | self.compress_thres = compress_thres 18 | self.head_size = head_size 19 | self.error = error 20 | self.head_sampled = [] 21 | self.sampled = [] # list of Stats 22 | self.count = 0 # Total observations appeared 23 | if abnormal_list is None: 24 | self.abnormal_list = [] 25 | else: 26 | self.abnormal_list = abnormal_list 27 | self._total_count = 0 28 | 29 | def set_total_count(self, total_count): 30 | self._total_count = total_count 31 | 32 | # insert a number 33 | def insert(self, x): 34 | """ 35 | Insert an observation of data. First store in a array buffer. If the buffer is full, 36 | do a batch insert. If the size of sampled list reach compress_thres, compress this list. 
37 | Parameters 38 | ---------- 39 | x : float 40 | The feature value 41 | 42 | """ 43 | if x in self.abnormal_list: 44 | return 45 | 46 | try: 47 | x = float(x) 48 | except ValueError: 49 | return 50 | 51 | self.head_sampled.append(x) 52 | if len(self.head_sampled) >= self.head_size: 53 | self._insert_head_buffer() # clear self.head_sample -> self.sampled 54 | if len(self.sampled) >= self.compress_thres: 55 | self.compress() 56 | 57 | def _insert_head_buffer(self): 58 | if not len(self.head_sampled): # If empty 59 | return 60 | current_count = self.count 61 | sorted_head = sorted(self.head_sampled) 62 | new_sampled = [] 63 | sample_idx = 0 64 | ops_idx = 0 65 | while ops_idx < len(sorted_head): 66 | current_sample = sorted_head[ops_idx] 67 | while sample_idx < len(self.sampled) and self.sampled[sample_idx].value <= current_sample: 68 | new_sampled.append(self.sampled[sample_idx]) 69 | sample_idx += 1 70 | 71 | current_count += 1 72 | 73 | # If it is the first one to insert or if it is the last one 74 | if not new_sampled or (sample_idx == len(self.sampled) and 75 | ops_idx == len(sorted_head) - 1): 76 | delta = 0 77 | else: 78 | # delta = math.floor(2 * self.error * current_count) - 1 79 | delta = math.floor(2 * self.error * current_count) 80 | 81 | new_stats = Stats(current_sample, 1, delta) 82 | new_sampled.append(new_stats) 83 | ops_idx += 1 84 | self.sampled = new_sampled 85 | self.head_sampled = [] 86 | self.count = current_count 87 | 88 | def compress(self): 89 | self._insert_head_buffer() 90 | # merge_threshold = math.floor(2 * self.error * self.count) - 1 91 | merge_threshold = 2 * self.error * self.count 92 | compressed = self._compress_immut(merge_threshold) 93 | self.sampled = compressed 94 | 95 | def merge(self, other): 96 | """ 97 | merge current summeries with the other one. 98 | Parameters 99 | ---------- 100 | other : QuantileSummaries 101 | The summaries to be merged 102 | """ 103 | if other.head_sampled: 104 | # other._insert_head_buffer() 105 | other.compress() 106 | 107 | if self.head_sampled: 108 | # self._insert_head_buffer() 109 | self.compress() 110 | 111 | if other.count == 0: 112 | return self 113 | 114 | if self.count == 0: 115 | self.count = other.count 116 | self.sampled = other.sampled 117 | return self 118 | 119 | # merge two sorted array 120 | new_sample = [] 121 | i, j = 0, 0 122 | while i < len(self.sampled) and j < len(other.sampled): 123 | if self.sampled[i].value < other.sampled[j].value: 124 | new_sample.append(self.sampled[i]) 125 | i += 1 126 | else: 127 | new_sample.append(other.sampled[j]) 128 | j += 1 129 | new_sample += self.sampled[i:] 130 | new_sample += other.sampled[j:] 131 | 132 | self.sampled = new_sample 133 | self.count += other.count 134 | # merge_threshold = math.floor(2 * self.error * self.count) - 1 135 | merge_threshold = 2 * self.error * self.count 136 | 137 | self.sampled = self._compress_immut(merge_threshold) 138 | return self 139 | 140 | def query(self, quantile): 141 | """ 142 | Given the queried quantile, return the approximation guaranteed result 143 | Parameters 144 | ---------- 145 | quantile : float [0.0, 1.0] 146 | The target quantile 147 | 148 | Returns 149 | ------- 150 | float, the corresponding value result. 
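For example, query(0.5) returns an approximate median: a stored sample whose exact rank is close to 0.5 * count, up to the summary's relative rank error.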
151 | """ 152 | if self.head_sampled: 153 | # self._insert_head_buffer() 154 | self.compress() 155 | 156 | if quantile < 0 or quantile > 1: 157 | raise ValueError("Quantile should be in range [0.0, 1.0]") 158 | 159 | if self.count == 0: 160 | return 0 161 | 162 | if quantile <= self.error: 163 | return self.sampled[0].value 164 | 165 | if quantile >= 1 - self.error: 166 | return self.sampled[-1].value 167 | 168 | rank = math.ceil(quantile * self.count) 169 | target_error = math.ceil(self.error * self.count) 170 | min_rank = 0 171 | i = 1 172 | while i < len(self.sampled) - 1: 173 | cur_sample = self.sampled[i] 174 | min_rank += cur_sample.g 175 | max_rank = min_rank + cur_sample.delta 176 | if max_rank - target_error <= rank <= min_rank + target_error: 177 | return cur_sample.value 178 | i += 1 179 | return self.sampled[-1].value 180 | 181 | def _compress_immut(self, merge_threshold): 182 | if not self.sampled: 183 | return self.sampled 184 | 185 | res = [] 186 | 187 | # Start from the last element 188 | head = self.sampled[-1] 189 | i = len(self.sampled) - 2 # Do not merge the last element 190 | 191 | while i >= 1: 192 | this_sample = self.sampled[i] 193 | if this_sample.g + head.g + head.delta < merge_threshold: 194 | head.g = head.g + this_sample.g 195 | else: 196 | res.append(head) 197 | head = this_sample 198 | i -= 1 199 | res.append(head) 200 | 201 | # If head of current sample is smaller than this new res's head 202 | # Add current head into res 203 | current_head = self.sampled[0] 204 | if current_head.value <= head.value and len(self.sampled) > 1: 205 | res.append(current_head) 206 | 207 | # Python do not support prepend, thus, use reverse instead 208 | res.reverse() 209 | return res 210 | 211 | 212 | class SparseQuantileSummaries(QuantileSummaries): 213 | def __init__(self, compress_thres=baseline_constants.DEFAULT_COMPRESS_THRESHOLD, 214 | head_size=baseline_constants.DEFAULT_HEAD_SIZE, 215 | error=baseline_constants.DEFAULT_RELATIVE_ERROR, 216 | abnormal_list=None): 217 | super(SparseQuantileSummaries, self).__init__(compress_thres, head_size, error, abnormal_list) 218 | 219 | # Compare with the sparse point, static the number of each part. 
220 | self.smaller_num = 0 221 | self.bigger_num = 0 222 | # self._total_count = 0 223 | 224 | def insert(self, x): 225 | if x in self.abnormal_list: 226 | return 227 | if x < baseline_constants.FLOAT_ZERO: 228 | self.smaller_num += 1 229 | elif x >= baseline_constants.FLOAT_ZERO: 230 | self.bigger_num += 1 231 | super(SparseQuantileSummaries, self).insert(x) 232 | 233 | def query(self, quantile): 234 | if self.zero_lower_bound < quantile < self.zero_upper_bound: 235 | return 0.0 236 | 237 | non_zero_quantile = self._convert_query_percentile(quantile) # can be ignored 238 | result = super(SparseQuantileSummaries, self).query(non_zero_quantile) 239 | return result 240 | 241 | def merge(self, other): 242 | self.smaller_num += other.smaller_num 243 | self.bigger_num += other.bigger_num 244 | super(SparseQuantileSummaries, self).merge(other) 245 | return self 246 | 247 | def _convert_query_percentile(self, quantile): 248 | zeros_count = self._total_count - self.count 249 | if zeros_count == 0: 250 | return quantile 251 | 252 | if quantile <= self.zero_lower_bound: 253 | return (self._total_count / self.count) * quantile 254 | 255 | return (quantile - self.zero_upper_bound + self.zero_lower_bound) / ( 256 | 1 - self.zero_upper_bound + self.zero_lower_bound) 257 | 258 | @property 259 | def zero_lower_bound(self): 260 | if self.smaller_num == 0: 261 | return 0.0 262 | return self.smaller_num / self._total_count 263 | 264 | @property 265 | def zero_upper_bound(self): 266 | if self.bigger_num == 0: 267 | return self._total_count 268 | zeros_num = self._total_count - self.smaller_num - self.bigger_num 269 | return (self.smaller_num + zeros_num) / self._total_count -------------------------------------------------------------------------------- /federated_gbdt/core/binning/quantile_binning.py: -------------------------------------------------------------------------------- 1 | from federated_gbdt.core.binning.feature_binning_param import FeatureBinningParam 2 | from federated_gbdt.core.binning.base_binning import BaseBinning 3 | from federated_gbdt.core.binning.quantile_summaries import SparseQuantileSummaries, QuantileSummaries 4 | from federated_gbdt.core.baseline_constants import DEFAULT_RELATIVE_ERROR 5 | 6 | import pandas as pd 7 | import copy 8 | import functools 9 | 10 | 11 | class NoneType: 12 | def __eq__(self, obj): 13 | return isinstance(obj, NoneType) 14 | 15 | def get_split_points(data_inst, is_sparse=False, bin_num=32, 16 | binning_error=DEFAULT_RELATIVE_ERROR, 17 | handle_missing_value=False): 18 | assert isinstance(data_inst, pd.DataFrame) 19 | param_obj = FeatureBinningParam(bin_num=bin_num, error=binning_error) 20 | if handle_missing_value: 21 | binning_obj = QuantileBinning(params=param_obj, abnormal_list=[NoneType()]) 22 | else: 23 | binning_obj = QuantileBinning(params=param_obj) 24 | binning_obj.fit_split_points(data_inst, is_sparse) 25 | #print('split point results have been defined') 26 | return binning_obj.get_split_points_result_numpy() 27 | 28 | def quantile_summary_factory(is_sparse, param_dict): 29 | if is_sparse: 30 | return SparseQuantileSummaries(**param_dict) 31 | else: 32 | return QuantileSummaries(**param_dict) 33 | 34 | 35 | class QuantileBinning(BaseBinning): 36 | """ 37 | After quantile binning, the numbers of elements in each binning are equal. 
38 | 39 | The result of this algorithm has the following deterministic bound: 40 | If the data_instances has N elements and if we request the quantile at probability `p` up to error 41 | `err`, then the algorithm will return a sample `x` from the data so that the *exact* rank 42 | of `x` is close to (p * N). 43 | More precisely, 44 | 45 | {{{ 46 | floor((p - 2 * err) * N) <= rank(x) <= ceil((p + 2 * err) * N) 47 | }}} 48 | 49 | This method implements a variation of the Greenwald-Khanna algorithm (with some speed 50 | optimizations). 51 | """ 52 | 53 | def __init__(self, params: FeatureBinningParam, abnormal_list=None, allow_duplicate=False): 54 | super(QuantileBinning, self).__init__(params, abnormal_list) 55 | self.summary_dict = None 56 | self.allow_duplicate = allow_duplicate 57 | 58 | def fit_split_points(self, data_inst, is_sparse=False): 59 | """ 60 | Apply the binning method 61 | 62 | Parameters 63 | ---------- 64 | sparse_dataseries : Data series 65 | The input sparse vector 66 | 67 | Returns 68 | ------- 69 | split_points : dict. 70 | Each value represent for the split points for a feature. The element in each row represent for 71 | the corresponding split point. 72 | e.g. 73 | split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...], # The first feature 74 | 'x2': [1, 2, 3, 4, ...], # The second feature 75 | ... # Other features 76 | } 77 | """ 78 | if is_sparse: 79 | assert isinstance(data_inst, pd.Series) 80 | header = data_inst.iloc[0].feature_name 81 | else: 82 | assert isinstance(data_inst, pd.DataFrame) 83 | header = list(data_inst.columns) 84 | # if not isinstance(sparse_dataseries, pd.Series): 85 | # raise TypeError('the input data should be data series') 86 | 87 | # LOGGER.debug("in _fit_split_point, cols_map: {}".format(self.bin_inner_param.bin_cols_map)) 88 | 89 | self._default_setting(header) 90 | # self._init_cols(data_instances) 91 | percent_value = 1.0 / self.bin_num 92 | 93 | # calculate the split points 94 | percentile_rate = [i * percent_value for i in range(1, self.bin_num)] 95 | percentile_rate.append(1.0) 96 | 97 | self._fit_split_point(data_inst, is_sparse, percentile_rate) 98 | 99 | # self.fit_category_features(sparse_dataseries) # can be ignored here 100 | return self.bin_results.all_split_points # {fn: [fv_thresholds], ....} 101 | 102 | def get_split_points_result_numpy(self): 103 | return self.bin_results.get_split_points_array(self.bin_inner_param.transform_bin_names) 104 | 105 | @staticmethod 106 | def copy_merge(s1, s2): 107 | new_s1 = copy.deepcopy(s1) 108 | return new_s1.merge(s2) 109 | 110 | def _fit_split_point(self, data_inst, is_sparse, percentile_rate): 111 | if self.summary_dict is None: 112 | f = functools.partial(self.feature_summary, 113 | params=self.params, # FeatureBinningParam(...) 
114 | abnormal_list=self.abnormal_list, 115 | cols_dict=self.bin_inner_param.bin_cols_map, # {bin_name: bin_idx, ...} 116 | header=self.header, 117 | is_sparse=is_sparse) 118 | summary_dict = f(data_inst=data_inst) 119 | summary_dict = dict(summary_dict) 120 | 121 | # LOGGER.debug(f"new summary_dict: {summary_dict}") 122 | total_count = len(data_inst) 123 | for _, summary_obj in summary_dict.items(): 124 | summary_obj.set_total_count(total_count) 125 | 126 | self.summary_dict = summary_dict 127 | else: 128 | summary_dict = self.summary_dict 129 | 130 | for col_name, summary in summary_dict.items(): 131 | split_point = [] 132 | for percen_rate in percentile_rate: 133 | s_p = summary.query(percen_rate) 134 | if not self.allow_duplicate: 135 | if s_p not in split_point: 136 | split_point.append(s_p) 137 | else: 138 | split_point.append(s_p) 139 | self.bin_results.put_col_split_points(col_name, split_point) 140 | 141 | @staticmethod 142 | def feature_summary(data_inst, params, cols_dict, abnormal_list, header, is_sparse): 143 | summary_dict = {} 144 | 145 | summary_param = {'compress_thres': params.compress_thres, 146 | 'head_size': params.head_size, 147 | 'error': params.error, 148 | 'abnormal_list': abnormal_list} 149 | 150 | for col_name, col_index in cols_dict.items(): 151 | quantile_summaries = quantile_summary_factory(is_sparse=is_sparse, param_dict=summary_param) 152 | # quantile_summaries = SparseQuantileSummaries(**summary_param) 153 | summary_dict[col_name] = quantile_summaries 154 | 155 | if is_sparse: 156 | # pd.Series 157 | for sv in data_inst: 158 | data_generator = sv.get_all_data() 159 | for col_idx, col_value in data_generator: 160 | col_name = header[col_idx] 161 | if col_name not in cols_dict: 162 | continue 163 | summary = summary_dict[col_name] 164 | summary.insert(col_value) 165 | else: 166 | # pd.Dataframe 167 | for _, inst in data_inst.iterrows(): 168 | for col_name, summary in summary_dict.items(): 169 | col_index = cols_dict[col_name] 170 | summary.insert(inst[col_index]) 171 | 172 | result = [] 173 | for features_name, summary_obj in summary_dict.items(): 174 | summary_obj.compress() 175 | # result.append(((_, features_name), summary_obj)) 176 | result.append((features_name, summary_obj)) 177 | 178 | return result 179 | 180 | @staticmethod 181 | def _query_split_points(summary, percent_rates): 182 | split_point = [] 183 | for percent_rate in percent_rates: 184 | s_p = summary.query(percent_rate) 185 | if s_p not in split_point: 186 | split_point.append(s_p) 187 | return split_point 188 | 189 | @staticmethod 190 | def approxi_quantile(data_instances, params, cols_dict, abnormal_list, header, is_sparse): 191 | """ 192 | Calculates each quantile information 193 | 194 | Parameters 195 | ---------- 196 | data_instances : DTable 197 | The input data 198 | 199 | cols_dict: dict 200 | Record key, value pairs where key is cols' name, and value is cols' index. 201 | 202 | params : FeatureBinningParam object, 203 | Parameters that user set. 204 | 205 | abnormal_list: list, default: None 206 | Specify which columns are abnormal so that will not static when traveling. 207 | 208 | header: list, 209 | Storing the header information. 210 | 211 | is_sparse: bool 212 | Specify whether data_instance is in sparse type 213 | 214 | Returns 215 | ------- 216 | summary_dict: dict 217 | {'col_name1': summary1, 218 | 'col_name2': summary2, 219 | ... 
220 | } 221 | 222 | """ 223 | 224 | summary_dict = {} 225 | 226 | summary_param = {'compress_thres': params.compress_thres, 227 | 'head_size': params.head_size, 228 | 'error': params.error, 229 | 'abnormal_list': abnormal_list} 230 | 231 | for col_name, col_index in cols_dict.items(): 232 | quantile_summaries = quantile_summary_factory(is_sparse=is_sparse, param_dict=summary_param) 233 | summary_dict[col_name] = quantile_summaries 234 | 235 | QuantileBinning.insert_datas(data_instances, summary_dict, cols_dict, header, is_sparse) 236 | for _, summary_obj in summary_dict.items(): 237 | summary_obj.compress() 238 | return summary_dict 239 | 240 | @staticmethod 241 | def insert_datas(data_instances, summary_dict, cols_dict, header, is_sparse): 242 | 243 | for iter_key, instant in data_instances: 244 | if not is_sparse: 245 | if type(instant).__name__ == 'Instance': 246 | features = instant.features 247 | else: 248 | features = instant 249 | for col_name, summary in summary_dict.items(): 250 | col_index = cols_dict[col_name] 251 | summary.insert(features[col_index]) 252 | else: 253 | data_generator = instant.features.get_all_data() 254 | for col_idx, col_value in data_generator: 255 | col_name = header[col_idx] 256 | summary = summary_dict[col_name] 257 | summary.insert(col_value) 258 | 259 | @staticmethod 260 | def merge_summary_dict(s_dict1, s_dict2): 261 | if s_dict1 is None and s_dict2 is None: 262 | return None 263 | if s_dict1 is None: 264 | return s_dict2 265 | if s_dict2 is None: 266 | return s_dict1 267 | 268 | s_dict1 = copy.deepcopy(s_dict1) 269 | s_dict2 = copy.deepcopy(s_dict2) 270 | 271 | new_dict = {} 272 | for col_name, summary1 in s_dict1.items(): 273 | summary2 = s_dict2.get(col_name) 274 | summary1.merge(summary2) 275 | new_dict[col_name] = summary1 276 | return new_dict 277 | 278 | def query_quantile_point(self, query_points, col_names=None): 279 | 280 | if self.summary_dict is None: 281 | raise RuntimeError("Bin object should be fit before query quantile points") 282 | 283 | if col_names is None: 284 | col_names = self.bin_inner_param.bin_names 285 | 286 | summary_dict = self.summary_dict 287 | 288 | if isinstance(query_points, (int, float)): 289 | query_dict = {} 290 | for col_name in col_names: 291 | query_dict[col_name] = query_points 292 | elif isinstance(query_points, dict): 293 | query_dict = query_points 294 | else: 295 | raise ValueError("query_points has wrong type, should be a float, int or dict") 296 | 297 | result = {} 298 | for col_name, query_point in query_dict.items(): 299 | summary = summary_dict[col_name] 300 | result[col_name] = summary.query(query_point) 301 | return result 302 | 303 | 304 | # class QuantileBinningTool(QuantileBinning): 305 | # """ 306 | # Use for quantile binning data directly. 307 | # """ 308 | # 309 | # def __init__(self, bin_nums=consts.G_BIN_NUM, param_obj: FeatureBinningParam = None, 310 | # abnormal_list=None, allow_duplicate=False): 311 | # if param_obj is None: 312 | # param_obj = FeatureBinningParam(bin_num=bin_nums) 313 | # super().__init__(params=param_obj, abnormal_list=abnormal_list, allow_duplicate=allow_duplicate) --------------------------------------------------------------------------------
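To make the quantile-binning entry point above concrete, here is a minimal usage sketch of `get_split_points` from `quantile_binning.py`; the DataFrame is synthetic and only illustrates the expected input type.

```python
# Illustrative only: pass a pandas DataFrame to get_split_points to obtain
# per-feature split candidates from the Greenwald-Khanna quantile summaries.
import numpy as np
import pandas as pd

from federated_gbdt.core.binning.quantile_binning import get_split_points

df = pd.DataFrame({"x1": np.random.randn(1000), "x2": np.random.rand(1000)})
split_points = get_split_points(df, bin_num=32)  # array of split-point arrays, one per feature
print(len(split_points), "features binned")
```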