├── LICENSE
├── thesis.pdf
├── slides
│   ├── slides.pdf
│   ├── figures
│   │   ├── mdi.pdf
│   │   ├── bench.pdf
│   │   ├── rp-1.pdf
│   │   ├── tree.pdf
│   │   ├── wine.jpg
│   │   ├── blackbox.jpg
│   │   ├── fit-time.pdf
│   │   ├── forest.pdf
│   │   ├── imp-wine.pdf
│   │   ├── led-fig.pdf
│   │   ├── led-fig.png
│   │   ├── led-imp.pdf
│   │   ├── condorcet.png
│   │   ├── imp-wine2.pdf
│   │   ├── motivation.png
│   │   ├── rp-memory.pdf
│   │   ├── tree-wine.pdf
│   │   ├── avatars
│   │   │   ├── bholt.jpg
│   │   │   ├── joel.jpg
│   │   │   ├── lars.png
│   │   │   ├── ndawe.jpg
│   │   │   ├── satra.jpg
│   │   │   ├── arjoly.jpg
│   │   │   ├── glouppe.jpg
│   │   │   ├── ogrisel.jpg
│   │   │   ├── pprett.jpg
│   │   │   └── amueller.jpg
│   │   ├── bias-variance.pdf
│   │   ├── blackbox-open.jpg
│   │   ├── tree-simple.pdf
│   │   ├── scikit-learn-logo.pdf
│   │   ├── tree-partition-a.pdf
│   │   ├── tree-partition-b.pdf
│   │   ├── tree-partition-c.pdf
│   │   ├── tree-partition-d.pdf
│   │   └── bias-variance-darts.jpg
│   └── minted.sty
├── tex
│   ├── figures
│   │   ├── blason.pdf
│   │   ├── ch2_mlp.pdf
│   │   ├── ch3_tree.pdf
│   │   ├── ch5_sort.pdf
│   │   ├── ch5_tree.pdf
│   │   ├── ch6_led.pdf
│   │   ├── ch3_splits.pdf
│   │   ├── ch6_imp_led.pdf
│   │   ├── ch6_order.pdf
│   │   ├── ch7_network.png
│   │   ├── ch7_red_led.pdf
│   │   ├── ch7_red_xor.pdf
│   │   ├── ch7_splits.pdf
│   │   ├── ch3_goodness.pdf
│   │   ├── ch3_partition.pdf
│   │   ├── ch4_variance.pdf
│   │   ├── ch5_mnist_fit.pdf
│   │   ├── ch7_bias_null.pdf
│   │   ├── ch7_trees_ets.pdf
│   │   ├── ch7_trees_id3.pdf
│   │   ├── ch2_hyperplane.pdf
│   │   ├── ch3_toy_x1_error.pdf
│   │   ├── ch3_toy_x1_gini.pdf
│   │   ├── ch3_toy_x2_gini.pdf
│   │   ├── ch3_toy_x3_gini.pdf
│   │   ├── ch4_correlation.pdf
│   │   ├── ch4_overfitting.pdf
│   │   ├── ch5_learningset.pdf
│   │   ├── ch7_bias_depth.pdf
│   │   ├── ch7_bias_trees.pdf
│   │   ├── ch7_trees_ets2.pdf
│   │   ├── ch8
│   │   │   ├── figure4-mem.pdf
│   │   │   ├── figure4-none.pdf
│   │   │   ├── figure5-c-tis.pdf
│   │   │   ├── figure5-a-arcene.pdf
│   │   │   ├── figure5-b-cifar10.pdf
│   │   │   ├── figure5-d-madelon.pdf
│   │   │   ├── figure5-e-isolet.pdf
│   │   │   ├── figure5-f-mnist.pdf
│   │   │   ├── figure3-c-tis-rp-dt.pdf
│   │   │   ├── figure3-c-tis-rp-et.pdf
│   │   │   ├── figure3-a-arcene-rp-dt.pdf
│   │   │   ├── figure3-a-arcene-rp-et.pdf
│   │   │   ├── figure3-e-isolet-rp-dt.pdf
│   │   │   ├── figure3-e-isolet-rp-et.pdf
│   │   │   ├── figure3-f-mnist-rp-dt.pdf
│   │   │   ├── figure3-f-mnist-rp-et.pdf
│   │   │   ├── figure3-b-cifar10-rp-dt.pdf
│   │   │   ├── figure3-b-cifar10-rp-et.pdf
│   │   │   ├── figure3-d-madelon-rp-dt.pdf
│   │   │   └── figure3-d-madelon-rp-et.pdf
│   │   ├── ch8_rank_large.pdf
│   │   ├── ch8_rank_small.pdf
│   │   ├── ch3_split_ordered.pdf
│   │   ├── ch4_bias_variance.pdf
│   │   ├── ch4_proximity_plot.pdf
│   │   ├── ch5_mnist_predict.pdf
│   │   ├── ch7_bias_depth_rf.pdf
│   │   ├── ch2_train_test_error.pdf
│   │   ├── ch3_impurity_comparison.pdf
│   │   ├── ch4_estimate_distribution.pdf
│   │   ├── make_friedman1
│   │   │   ├── n_train_mse.pdf
│   │   │   ├── bootstrap_mse.pdf
│   │   │   ├── max_features_mse.pdf
│   │   │   ├── n_estimators_mse.pdf
│   │   │   ├── n_features_mse.pdf
│   │   │   ├── n_train_time_fit.pdf
│   │   │   ├── bootstrap_time_fit.pdf
│   │   │   ├── max_features_time_fit.pdf
│   │   │   ├── n_estimators_time_fit.pdf
│   │   │   ├── n_features_time_fit.pdf
│   │   │   ├── n_train_average_depth.pdf
│   │   │   ├── bootstrap_average_depth.pdf
│   │   │   ├── max_features_average_depth.pdf
│   │   │   ├── n_estimators_average_depth.pdf
│   │   │   └── n_features_average_depth.pdf
│   │   ├── ch3_split_ordered_invariant.pdf
│   │   ├── generate.sh
│   │   ├── ch7_trees_ets2.tex
│   │   ├── ch5_sort.tex
│   │   ├── ch3_tree.tex
│   │   ├── ch7_trees_id3.tex
│   │   ├── ch3_splits.tex
│   │   ├── ch7_splits.tex
│   │   ├── ch7_trees_ets.tex
│   │   ├── ch5_tree.tex
│   │   ├── ch2_mlp.tex
│   │   ├── ch3_goodness.tex
│   │   ├── ch8_rank_large.tex
│   │   └── ch8_rank_small.tex
│   ├── frontback
│   │   ├── bibliography.tex
│   │   ├── disclaimer.tex
│   │   ├── toc.tex
│   │   ├── jury.tex
│   │   ├── titlepage.tex
│   │   ├── acknowledgments.tex
│   │   ├── abstract.tex
│   │   └── notations.tex
│   ├── Makefile
│   ├── summary.tex
│   ├── thesis.tex
│   ├── chapters
│   │   ├── chapter09.tex
│   │   └── chapter01.tex
│   ├── minted.sty
│   └── classicthesis-config.tex
├── .gitignore
├── scripts
│   ├── ch6_order.py
│   ├── ch5_tree.py
│   ├── ch2_hyperplane.py
│   ├── ch6_decomposition1.py
│   ├── ch4_proximity.py
│   ├── ch4_estimate_distribution.py
│   ├── ch3_partition.py
│   ├── ch6_decomposition2.py
│   ├── ch7_redundant.py
│   ├── ch4_bias_variance.py
│   ├── ch3_split_ordered.py
│   ├── ch7_bias_tree.py
│   ├── ch2_train_test_error.py
│   ├── ch7_bias_depth.py
│   ├── ch3_split_ordered_invariant.py
│   ├── ch7_bias_null.py
│   ├── ch4_correlation_plot.py
│   ├── ch3_impurity.py
│   ├── ch4_correlation.py
│   ├── ch4_overfitting.py
│   ├── ch4_correlation_plot2.py
│   ├── ID3.py
│   └── demo.py
├── benchmarks
│   ├── data.py
│   ├── resources
│   │   └── bench_randomforest.py
│   └── visualize.py
└── README.md

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
[Note: binary assets (*.pdf, *.png, *.jpg) are not inlined in this dump. Each
one can be fetched from
https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/<path>, where <path>
is the file's location in the tree above (for example,
https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/thesis.pdf). Only
text files are reproduced below.]
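[Note: a minimal sketch, not a file from this repository, showing how one of
the binary assets above can be retrieved with the URL scheme just described;
the target path is only an example taken from the tree.]

from urllib.request import urlretrieve

BASE = "https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/"
urlretrieve(BASE + "tex/figures/ch3_tree.pdf", "ch3_tree.pdf")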
--------------------------------------------------------------------------------
/tex/figures/generate.sh:
--------------------------------------------------------------------------------
latex $1.tex
dvips $1.dvi
ps2pdf $1.ps
pdfcrop $1.pdf
rm $1.aux $1.dvi $1.log $1.ps $1.pdf
mv $1-crop.pdf $1.pdf
--------------------------------------------------------------------------------
/tex/frontback/bibliography.tex:
--------------------------------------------------------------------------------
% Bibliography ================================================================

\chapter{References}

\begingroup
\def\chapter*#1{}
\bibliographystyle{abbrvnat}
\renewcommand{\bibname}{}
\label{app:bibliography}
\bibliography{bibliography}
\endgroup
--------------------------------------------------------------------------------
/tex/frontback/disclaimer.tex:
--------------------------------------------------------------------------------
% Disclaimer ==================================================================

\vspace*{\fill}
\begin{center}
{\it This dissertation has been submitted in partial fulfillment of
the requirements for the Degree of Doctor of Philosophy in
Computer Science.

\vskip1cm

This version of the manuscript is pending the approval
of the jury.}
\end{center}
\vspace*{\fill}
\vspace*{\fill}
\vspace*{\fill}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.brf
*.acn
*.pyc
*.acr
*.alg
*.aux
*.bbl
*.blg
*.dvi
*.fdb_latexmk
*.glg
*.glo
*.gls
*.idx
*.ilg
*.ind
*.ist
*.lof
*.lol
*.log
*.lot
*.maf
*.mtc
*.mtc0
*.nav
*.nlo
*.out
*.pdfsync
*.ps
*.snm
*.synctex.gz
*.toc
*.vrb
*.xdy
Thumbs.db
*.tdo
thesis.pdf
classicthesis/*
benchmarks/output/*
benchmarks/ok3/*
benchmarks/figs/*
benchmarks/resources/*
scripts/.ipynb_checkpoints/*
tex/arxiv/*
--------------------------------------------------------------------------------
/tex/Makefile:
--------------------------------------------------------------------------------
summary.pdf: summary.tex classicthesis-config.tex summary/*.tex frontback/*.tex
	pdflatex -shell-escape summary
	bibtex summary
	pdflatex -shell-escape summary
	pdflatex -shell-escape summary

thesis.pdf: bibliography.bib thesis.tex classicthesis-config.tex chapters/*.tex frontback/*.tex
	pdflatex -shell-escape thesis
	bibtex thesis
	pdflatex -shell-escape thesis
	pdflatex -shell-escape thesis

partial:
	bibtex thesis
	pdflatex -shell-escape thesis

clean:
	rm -f *.lot *.lof *.lol *.toc *.log *.out *.aux *.blg *.bbl thesis.pdf chapters/*.aux frontback/*.aux

rebuild: clean thesis.pdf
--------------------------------------------------------------------------------
/tex/frontback/toc.tex:
--------------------------------------------------------------------------------
% Table of contents ===========================================================

\refstepcounter{dummy}
\pdfbookmark[1]{\contentsname}{tableofcontents}
\setcounter{tocdepth}{3} % <-- 3 includes up to subsubsections in the ToC
\setcounter{secnumdepth}{3} % <-- 3 numbers up to subsubsections
\manualmark
\markboth{\spacedlowsmallcaps{\contentsname}}{\spacedlowsmallcaps{\contentsname}}
\tableofcontents
\automark[section]{chapter}
\renewcommand{\chaptermark}[1]{\markboth{\spacedlowsmallcaps{#1}}{\spacedlowsmallcaps{#1}}}
\renewcommand{\sectionmark}[1]{\markright{\thesection\enspace\spacedlowsmallcaps{#1}}}

\cleardoublepage
--------------------------------------------------------------------------------
/tex/frontback/jury.tex:
--------------------------------------------------------------------------------
% Jury ====================================================================

\pdfbookmark[1]{Jury members}{Jury members}
\chapter*{Jury members}

\noindent \textsc{Louis Wehenkel}, Professor at the Universit{\'e} de Li{\`e}ge (President); \\

\noindent \textsc{Pierre Geurts}, Professor at the Universit{\'e} de Li{\`e}ge (Advisor); \\

\noindent \textsc{Bernard Boigelot}, Professor at the Universit{\'e} de Li{\`e}ge; \\

\noindent \textsc{Renaud Detry}, Postdoctoral Researcher at the Universit{\'e} de Li{\`e}ge; \\

\noindent \textsc{Gianluca Bontempi}, Professor at the Universit{\'e} Libre de Bruxelles; \\

\noindent \textsc{G{\'e}rard Biau}, Professor at the Universit{\'e} Pierre et Marie Curie (France).
--------------------------------------------------------------------------------
/scripts/ch6_order.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import brewer2mpl
cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 7).mpl_colors
#cmap = [(0, 0, 1.0), (1.0, 0, 0)]

# Hard-coded importance values of features X1..X7, one row per feature.
all_importances = np.array(
    [[0.414, 0.362, 0.327, 0.309, 0.304, 0.305, 0.306],
     [0.583, 0.663, 0.715, 0.757, 0.787, 0.801, 0.799],
     [0.532, 0.512, 0.496, 0.489, 0.483, 0.475, 0.475],
     [0.543, 0.525, 0.484, 0.445, 0.414, 0.409, 0.412],
     [0.658, 0.731, 0.778, 0.810, 0.827, 0.831, 0.835],
     [0.221, 0.140, 0.126, 0.122, 0.122, 0.121, 0.120],
     [0.368, 0.385, 0.392, 0.387, 0.382, 0.375, 0.372]])

n_features = all_importances.shape[0]
for m in range(n_features):
    plt.plot(range(1, n_features + 1), all_importances[m, :], "o-",
             label="X%d" % (m + 1), color=cmap[m])

plt.legend(loc="best")
plt.show()
--------------------------------------------------------------------------------
/scripts/ch5_tree.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
X = np.random.rand(300, 2)
y = (X[:, 0] > 0.3) & (X[:, 0] < 0.7) & (X[:, 1] > 0.3) & (X[:, 1] < 0.7)

# randomly flip some labels
mask = np.random.permutation(len(X))[:5]
y[mask] = ~y[mask]

X_c1 = X[y == 0]
plt.scatter(X_c1[:, 0], X_c1[:, 1], color=(1.0, 0, 0))

X_c2 = X[y == 1]
plt.scatter(X_c2[:, 0], X_c2[:, 1], color=(0, 0, 1.0))

# decision tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_leaf_nodes=5).fit(X, y)
print("children_left =", clf.tree_.children_left)
print("children_right =", clf.tree_.children_right)
print("feature =", clf.tree_.feature)
print("threshold =", clf.tree_.threshold)
print("impurity =", clf.tree_.impurity)
print("n_samples =", clf.tree_.n_node_samples)
print("value =", clf.tree_.value)

plt.show()
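[Note: a minimal sketch, not a file from this repository. The arrays printed
by ch5_tree.py fully specify the fitted tree; walking them recovers the
decision rules. `clf` is the classifier fitted above; in scikit-learn's
array-based representation, leaves are marked by children_left[node] == -1.]

def print_rules(tree, node=0, depth=0):
    indent = "    " * depth
    if tree.children_left[node] == -1:
        # leaf: value[node] holds the per-class sample counts
        print(indent + "return", tree.value[node])
    else:
        print(indent + "if X[%d] <= %.3f:"
              % (tree.feature[node], tree.threshold[node]))
        print_rules(tree, tree.children_left[node], depth + 1)
        print(indent + "else:")
        print_rules(tree, tree.children_right[node], depth + 1)

print_rules(clf.tree_)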
--------------------------------------------------------------------------------
/tex/frontback/titlepage.tex:
--------------------------------------------------------------------------------
% Front page ==================================================================

\begin{titlepage}
\begin{addmargin}[-1cm]{-3cm}
\begin{center}
\large
{\Large \textsc{University of Li{\`e}ge}}\\[1ex]
Faculty of Applied Sciences\\
Department of Electrical Engineering \& Computer Science\\

\vfill

PhD dissertation\\ \vskip1cm
\rule{14cm}{0.4pt}\\ \bigskip
\begingroup
\Large
\color{Maroon}\spacedallcaps{\myTitle} \\ \bigskip
\endgroup
\spacedlowsmallcaps{\mySubtitle} \\ \bigskip
\rule{14cm}{0.4pt}\\ \vskip1cm
by \textsc{Gilles Louppe}

\vfill
\vfill
\vfill

\hfill Advisor: Prof. \textsc{Pierre Geurts}\\
\hfill July 2014
\end{center}
\vspace{-3.5cm}\includegraphics[width=0.25\textwidth]{figures/blason.pdf}
\end{addmargin}
\end{titlepage}
--------------------------------------------------------------------------------
/scripts/ch2_hyperplane.py:
--------------------------------------------------------------------------------
import numpy as np
import pylab as pl
from sklearn import svm

blue = (0, 0, 1.0)
red = (1.0, 0, 0)
gray = (0.7, 0.7, 0.7)

# we create 40 separable points
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')

pl.scatter(X[:20, 0], X[:20, 1], color=blue)
pl.scatter(X[20:, 0], X[20:, 1], color=red)

pl.axis('tight')
pl.show()
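[Note: a quick check, not a file from this repository, reusing `clf` from
ch2_hyperplane.py above. The distance between the two dashed lines in the
plot is the margin of the linear SVM, 2 / ||w||.]

import numpy as np

w = clf.coef_[0]
print("margin width = %.3f" % (2.0 / np.linalg.norm(w)))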
--------------------------------------------------------------------------------
/tex/figures/ch7_trees_ets2.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(16,16)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
% ETs 3
% Arrows
\psline[linewidth=0.5pt]{->}(9,8)(8,6.3)
\psline[linewidth=0.5pt]{->}(9,8)(10,6.3)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,8){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](7.7,5.7)(8.3,6.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,5.7)(10.3,6.3)
% Text
\rput(7.9,7.3){$X_1 \leq 0$}
\rput(10.1,7.3){$X_1 > 0$}
% ETs 3
% Arrows
\psline[linewidth=0.5pt]{->}(14,8)(13,6.3)
\psline[linewidth=0.5pt]{->}(14,8)(15,6.3)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](14,8){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,5.7)(13.3,6.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](14.7,5.7)(15.3,6.3)
% Text
\rput(12.9,7.3){$X_2 = 0$}
\rput(15.1,7.3){$X_2 = 1$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/scripts/ch6_decomposition1.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import cm

from demo import make_led
from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble

n_trees = 3000

X, y = make_led()
fig, axs = plt.subplots(1, 2)

ax = axs[0]
clf = RandomizedID3Ensemble(n_estimators=n_trees,
                            base_estimator=RandomizedID3Classifier(k=1)).fit(X, y)
imp = clf.feature_importances_
ax.imshow(imp, cmap=cm.gist_heat_r, interpolation="nearest", vmin=0, vmax=0.4)
ax.set_yticklabels(["$X_%d$" % (i) for i in range(X.shape[1] + 1)])
ax.set_title("$K=1$")

ax = axs[1]
clf = RandomizedID3Ensemble(n_estimators=n_trees,
                            base_estimator=RandomizedID3Classifier(k=X.shape[1])).fit(X, y)
imp = clf.feature_importances_
img = ax.imshow(imp, cmap=cm.gist_heat_r, interpolation="nearest", vmin=0, vmax=0.4)
ax.set_yticklabels(["$X_%d$" % (i) for i in range(X.shape[1] + 1)])
ax.set_title("$K=%d$" % X.shape[1])

cax, kw = matplotlib.colorbar.make_axes([ax for ax in axs.flat])
cb = plt.colorbar(img, cax=cax, **kw)
cb.set_ticks([0, 0.2, 0.4])

plt.show()
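[Note: a rough analogue of the K = 1 run above using scikit-learn only, not a
file from this repository; X, y as returned by make_led() from the
repository's demo module. With max_features=1 the split variable is drawn at
random, the same configuration ch7_redundant.py uses below, and
feature_importances_ is the library's normalized MDI score per feature.]

from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=1000, max_features=1,
                           criterion="entropy").fit(X, y)
print(clf.feature_importances_)  # normalized to sum to 1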
--------------------------------------------------------------------------------
/scripts/ch4_proximity.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

from itertools import cycle
from scipy.spatial.distance import pdist, squareform

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import MDS


def rf_proximities(forest, X):
    # Fraction of trees in which two samples fall in the same leaf.
    prox = pdist(forest.apply(X), lambda u, v: (u == v).sum()) / forest.n_estimators
    prox = squareform(prox)
    return prox


data = load_digits()
X, y = data.data, data.target

indices = np.argsort(y)
X = X[indices]
y = y[indices]

# X = X[y < 2]
# y = y[y < 2]

forest = RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=1).fit(X, y)
prox = rf_proximities(forest, X)

plt.matshow(prox, cmap="Reds")
plt.show()

model = MDS(dissimilarity="precomputed", n_jobs=2)
coords = model.fit_transform(1. - prox)

n_classes = forest.n_classes_
cm = plt.get_cmap("hsv")
colors = (cm(1. * i / n_classes) for i in range(n_classes))

for k, c in zip(range(n_classes), colors):
    plt.plot(coords[y == k, 0], coords[y == k, 1], '.', label=k, color=c)

plt.legend(loc="best")
plt.show()
--------------------------------------------------------------------------------
/scripts/ch4_estimate_distribution.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

blue = (0, 0, 1.0)
green = (0, 0.8, 0)
red = (1.0, 0, 0)
red_alpha = (1.0, 0, 0, 0.1)
gray = (0.7, 0.7, 0.7)

x = np.arange(0, 1, 0.0001)
p_y = norm.pdf(x, 0.6, 0.1)

plt.plot(x, p_y, color=red)
plt.plot([0.5, 0.5], [0.0, np.max(p_y)], '-', color=gray)
plt.plot([0.6, 0.6], [0.0, np.max(p_y) + 0.01], ':', color=gray)
plt.text(0.6, np.max(p_y) + 0.2, r"$\mathbb{E}_{\cal L} \{ p_{\cal L}(Y=\varphi_B(x)) \}$", fontsize=15, horizontalalignment='center')
plt.text(0.6, 1.7, r"$Var_{\cal L}\{ p_{\cal L}(Y=\varphi_B(x)) \}$", fontsize=15, horizontalalignment='left')
plt.annotate(
    '', xy=(0.45, 2.0), xycoords='data',
    xytext=(0.75, 2.0), textcoords='data',
    arrowprops={'arrowstyle': '<->'})
plt.annotate(r"$P_{\cal L}(\varphi_{\cal L}(x)\neq \varphi_B(x))$", xy=(0.475, 1.0), xycoords='data', fontsize=15, xytext=(0.2, 1.7), textcoords='data', arrowprops={'arrowstyle': '->'})

plt.fill_between(x, p_y, y2=0, where=x < 0.5, color=red_alpha)

plt.ylabel("$P$")
plt.ylim((0., 4.5))
plt.xlim((0., 1.0))
plt.xticks([0.0, 0.5, 1.0])
plt.yticks([])

plt.show()
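[Note: a numerical check, not a file from this repository. In the figure
produced by ch4_estimate_distribution.py, the shaded area is the mass of the
N(0.6, 0.1) density below the 0.5 threshold, i.e. the probability that
phi_L(x) disagrees with the Bayes prediction phi_B(x).]

from scipy.stats import norm

print(norm.cdf(0.5, loc=0.6, scale=0.1))  # ~0.159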
--------------------------------------------------------------------------------
/tex/figures/ch5_sort.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\definecolor{gray}{rgb}{0.95,0.95,0.95}
\definecolor{darkgray}{rgb}{0.8,0.8,0.8}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(20,15)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=black](3,13)(13,14)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=gray](3.1,13.1)(4.9,13.9)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=gray](11.1,13.1)(12.9,13.9)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=darkgray](8.1,13.1)(9.4,13.9)
\psline[linewidth=0.5pt]{-}(5,13)(5,14.5)
\psline[linewidth=0.5pt]{-}(11,13)(11,14.5)
\psline[linewidth=0.5pt]{-}(7,14)(7,12.5)
\psline[linewidth=0.5pt]{-}(8,14)(8,12.5)
\psline[linewidth=0.5pt]{-}(9.5,14)(9.5,12.5)
\rput(2,13.5){\texttt{samples}}
\rput[l](5.1,14.5){\texttt{start}}
\rput[l](11.1,14.5){\texttt{end}}
\rput[l](7.1,12.5){\texttt{l}}
\rput[l](8.1,12.5){\texttt{i}}
\rput[l](9.6,12.5){\texttt{r}}
\rput(6,13.5){$<$}
\rput(7.5,13.5){$=$}
\rput(10.25,13.5){$>$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/tex/figures/ch3_tree.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\usepackage{color}
\definecolor{myblue}{rgb}{0.0,0.0,1.0}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(20,15)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
% Arrows
\psline[linewidth=0.5pt,linecolor=myblue]{->}(10,12)(8.3,11.3)
\psline[linewidth=0.5pt]{->}(10,12)(11.7,11.3)
\psline[linewidth=0.5pt]{->}(8,11)(7,9.35)
\psline[linewidth=0.5pt,linecolor=myblue]{->}(8,11)(9,9.35)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=myblue](10,12){0.4}
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=myblue](8,11){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](11.7,10.7)(12.3,11.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,8.7)(7.3,9.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=myblue](8.7,8.7)(9.3,9.3)
% Text
\rput(10,12){$t_0$}
\rput(8,11){$t_1$}
\rput(12,11){$t_2$}
\rput(7,9){$t_3$}
\rput(9,9){$t_4$}
\rput(10,11.3){{\small $X_1 \leq 0.7$}}
\rput(8,9.7){{\small $X_2 \leq 0.5$}}
\rput(12,10.5){$c_2$}
\rput(7,8.5){$c_2$}
\rput(9,8.5){$c_1$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/scripts/ch3_partition.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

X = np.random.rand(300, 2)
y = (X[:, 0] < 0.7) & (X[:, 1] > 0.5)

# randomly flip some labels
mask = np.random.permutation(len(X))[:15]
y[mask] = ~y[mask]

X_c1 = X[y == 0]
plt.scatter(X_c1[:, 0], X_c1[:, 1], color=(1.0, 0, 0))

X_c2 = X[y == 1]
plt.scatter(X_c2[:, 0], X_c2[:, 1], color=(0, 0, 1.0))

# draw lines + text
plt.plot([0, 1], [0, 0], color='k', linestyle='-', linewidth=1)
plt.plot([0, 0], [0, 1], color='k', linestyle='-', linewidth=1)
plt.plot([1, 1], [0, 1], color='k', linestyle='-', linewidth=1)
plt.plot([0, 1], [1, 1], color='k', linestyle='-', linewidth=1)

plt.plot([0.7, 0.7], [0, 1.0], color='k', linestyle='-', linewidth=1)
plt.plot([0, 0.7], [0.5, 0.5], color='k', linestyle='-', linewidth=1)

plt.text(0.95, 0.93, r"$t_2$", fontsize=15)
plt.text(0.65, 0.43, r"$t_3$", fontsize=15)
plt.text(0.65, 0.93, r"$t_4$", fontsize=15)

plt.text(0.7, -0.07, "$0.7$", fontsize=15, horizontalalignment='center')
plt.text(-0.07, 0.5, "$0.5$", fontsize=15, verticalalignment='center')

plt.text(1.0, -0.07, "$X_1$", fontsize=15, horizontalalignment='center')
plt.text(-0.07, 1.0, "$X_2$", fontsize=15, verticalalignment='center')
plt.show()
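[Note: a minimal sketch, not a file from this repository, reusing X and y
from ch3_partition.py above. A depth-2 decision tree fitted on this sample
should recover cut points close to the generative thresholds 0.7 (on X1) and
0.5 (on X2) drawn in the figure and used in ch3_tree.tex.]

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
print(clf.tree_.feature)    # split variables (-2 marks a leaf)
print(clf.tree_.threshold)  # learned thresholds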
\rput(2.5,11.3){$X_1 = 0$} 20 | \rput(4,11){$X_1 = 1$} 21 | \rput(5.5,11.3){$X_1 = 2$} 22 | % ID3 2 23 | % Arrows 24 | \psline[linewidth=0.5pt]{->}(10,12)(9,10.3) 25 | \psline[linewidth=0.5pt]{->}(10,12)(11,10.3) 26 | % Nodes 27 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 28 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](8.7,9.7)(9.3,10.3) 29 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](10.7,9.7)(11.3,10.3) 30 | % Text 31 | \rput(8.9,11.3){$X_2 = 0$} 32 | \rput(11.1,11.3){$X_2 = 1$} 33 | \end{pspicture} 34 | \end{document} 35 | -------------------------------------------------------------------------------- /scripts/ch6_decomposition2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib 5 | from matplotlib import cm 6 | 7 | from demo import make_led 8 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 9 | 10 | 11 | def feature_importance_tree(clf): 12 | def _visit(tree, conditioning): 13 | conditioning = conditioning + [tree[0]] 14 | 15 | if len(tree) == 2: 16 | pass 17 | 18 | else: 19 | for X in conditioning: 20 | imp[tree[0], X] += tree[1] 21 | 22 | for c in tree[2]: 23 | _visit(c, conditioning) 24 | 25 | imp = np.zeros((clf.n_features_, clf.n_features_)) 26 | _visit(clf.tree_, []) 27 | 28 | return imp 29 | 30 | def feature_importances_ensemble(clf): 31 | importances = np.zeros((clf.p, clf.p)) 32 | 33 | for i, tree in enumerate(clf.estimators_): 34 | importances += feature_importance_tree(tree) 35 | 36 | importances /= clf.n_estimators 37 | 38 | return importances 39 | 40 | 41 | n_trees = 1000 42 | 43 | X, y = make_led() 44 | n_features = X.shape[1] 45 | 46 | clf = RandomizedID3Ensemble(n_estimators=n_trees, 47 | base_estimator=RandomizedID3Classifier(k=1)).fit(X, y) 48 | 49 | imp = feature_importances_ensemble(clf) 50 | plt.imshow(imp, interpolation="nearest", cmap=cm.gist_heat_r) 51 | plt.show() 52 | 53 | -------------------------------------------------------------------------------- /tex/figures/ch3_splits.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(15,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Binary split 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(5,12)(4.2,10.4) 11 | \psline[linewidth=0.5pt]{->}(5,12)(5.8,10.4) 12 | % Nodes 13 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,12){0.4} 14 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,10){0.4} 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](6,10){0.4} 16 | % Text 17 | \rput(5,12){$t$} 18 | \rput(4,10){$t_L$} 19 | \rput(6,10){$t_R$} 20 | % N-ary split 21 | % Arrows 22 | \psline[linewidth=0.5pt]{->}(10,12)(8.2,10.4) 23 | \psline[linewidth=0.5pt]{->}(10,12)(9.2,10.4) 24 | \psline[linewidth=0.5pt]{->}(10,12)(11.8,10.4) 25 | % Nodes 26 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 27 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,10){0.4} 28 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,10){0.4} 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,10){0.4} 30 | % Text 31 | \rput(10,12){$t$} 32 | \rput(8,10){$t_{i_1}$} 33 | \rput(9,10){$t_{i_2}$} 34 | \rput(10.5,10){$...$} 35 | \rput(12,10){$t_{i_N}$} 36 | \end{pspicture} 37 | \end{document} 38 | 
-------------------------------------------------------------------------------- /scripts/ch7_redundant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import brewer2mpl 4 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 7).mpl_colors 5 | #cmap = [(0, 0, 1.0), (1.0, 0, 0)] 6 | 7 | def feature_importances(X, y, n_trees=500): 8 | from sklearn.ensemble import ExtraTreesClassifier 9 | clf = ExtraTreesClassifier(n_estimators=n_trees, max_features=1, criterion="entropy").fit(X, y) 10 | imp = np.zeros(X.shape[1]) 11 | 12 | for tree in clf.estimators_: 13 | imp += tree.tree_.compute_feature_importances(normalize=False) 14 | imp = imp / n_trees 15 | return imp 16 | 17 | def plot_with_duplicate(X, y, duplicate=0, n_copies=10): 18 | n_features = X.shape[1] 19 | all_importances = [] 20 | X_new = np.hstack([X] + [X[:, duplicate:duplicate+1] for i in range(n_copies)]) 21 | 22 | for i in range(n_copies+1): 23 | all_importances.append(feature_importances(X_new[:, :n_features + i], y)[:n_features]) 24 | 25 | all_importances = np.array(all_importances) 26 | 27 | for m in range(n_features): 28 | plt.plot(range(n_copies+1), all_importances[:, m], "o-", label="X%d" % (m+1), color=cmap[m]) 29 | 30 | plt.title("Adding copies of X%d" % (duplicate+1)) 31 | plt.legend(loc="best") 32 | plt.show() 33 | 34 | from demo import make_led 35 | X, y = make_led() 36 | # X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 37 | # y = np.array([1, 0, 0, 1]) 38 | plot_with_duplicate(X, y, duplicate=4, n_copies=100) 39 | -------------------------------------------------------------------------------- /scripts/ch4_bias_variance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.stats import norm 4 | 5 | blue = (0, 0, 1.0) 6 | red = (1.0, 0, 0) 7 | gray = (0.7, 0.7, 0.7) 8 | 9 | x = np.arange(-10, 10, 0.0001) 10 | p_y = norm.pdf(x, -3.0, 1) 11 | p_y_hat = norm.pdf(x, 3.0, 1.8) 12 | 13 | plt.plot(x, p_y, color=blue) 14 | plt.plot(x, p_y_hat, color=red) 15 | 16 | plt.plot([-3,-3], [0.0, np.max(p_y)+0.01], ':', color=gray) 17 | plt.text(-3, np.max(p_y) + 0.02, r"$\varphi_B(x)$", fontsize=15, horizontalalignment='center') 18 | 19 | plt.plot([3,3], [0.0, np.max(p_y_hat)+0.01], ':', color=gray) 20 | plt.text(3, np.max(p_y_hat) + 0.02, r"$\mathbb{E}_{\cal L} \{ \varphi_{\cal L}(x) \}$", fontsize=15, horizontalalignment='center') 21 | 22 | plt.text(0, 0.11, r"$bias^2(x)$", fontsize=15, horizontalalignment='center') 23 | plt.annotate( 24 | '', xy=(-3, 0.1), xycoords = 'data', 25 | xytext = (3, 0.1), textcoords = 'data', 26 | arrowprops = {'arrowstyle':'<->'}) 27 | 28 | plt.text(-5.1, 0.21, r"$noise(x)$", fontsize=15, horizontalalignment='right') 29 | plt.annotate( 30 | '', xy=(-5, 0.2), xycoords = 'data', 31 | xytext = (-1, 0.2), textcoords = 'data', 32 | arrowprops = {'arrowstyle':'<->'}) 33 | 34 | plt.text(5.1, 0.21, r"$var(x)$", fontsize=15, horizontalalignment='left') 35 | plt.annotate( 36 | '', xy=(6, 0.2), xycoords = 'data', 37 | xytext = (0, 0.2), textcoords = 'data', 38 | arrowprops = {'arrowstyle':'<->'}) 39 | 40 | plt.tick_params(axis="x", which="both", bottom="off", top="off", labelbottom="off") 41 | plt.xlabel("$y$") 42 | plt.ylabel("$P$") 43 | 44 | plt.show() 45 | -------------------------------------------------------------------------------- /tex/figures/ch7_splits.tex:
-------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(15,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Binary split 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(4,12)(3.2,10.4) 11 | \psline[linewidth=0.5pt]{->}(4,12)(4.8,10.4) 12 | \psline[linewidth=0.5pt]{->}(3,10)(3.8,8.4) 13 | \psline[linewidth=0.5pt]{->}(3,10)(2.2,8.4) 14 | % Nodes 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,12){0.4} 16 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,10){0.4} 17 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,10){0.4} 18 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](2,8){0.4} 19 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,8){0.4} 20 | % Text 21 | \rput(3,11.3){$X_j \leq 1$} 22 | \rput(5,11.3){$X_j > 1$} 23 | \rput(2,9.3){$X_j \leq 0$} 24 | \rput(4,9.3){$X_j > 0$} 25 | % N-ary split 26 | % Arrows 27 | \psline[linewidth=0.5pt]{->}(10,12)(8.2,10.4) 28 | \psline[linewidth=0.5pt]{->}(10,12)(10,10.5) 29 | \psline[linewidth=0.5pt]{->}(10,12)(11.8,10.4) 30 | % Nodes 31 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 32 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,10){0.4} 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,10){0.4} 34 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,10){0.4} 35 | % Text 36 | \rput(8.5,11.3){$X_j = 0$} 37 | \rput(10,11){$X_j = 1$} 38 | \rput(11.5,11.3){$X_j = 2$} 39 | \end{pspicture} 40 | \end{document} 41 | -------------------------------------------------------------------------------- /tex/figures/ch7_trees_ets.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(16,16) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % ETs 1 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(4,8)(3,6.4) 11 | \psline[linewidth=0.5pt]{->}(4,8)(5,6.3) 12 | \psline[linewidth=0.5pt]{->}(3,6)(4,4.3) 13 | \psline[linewidth=0.5pt]{->}(3,6)(2,4.3) 14 | % Nodes 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,8){0.4} 16 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,6){0.4} 17 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](4.7,5.7)(5.3,6.3) 18 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](1.7,3.7)(2.3,4.3) 19 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](3.7,3.7)(4.3,4.3) 20 | % Text 21 | \rput(2.9,7.3){$X_1 \leq 1$} 22 | \rput(5.1,7.3){$X_1 > 1$} 23 | \rput(1.9,5.3){$X_1 \leq 0$} 24 | \rput(4.1,5.3){$X_1 > 0$} 25 | % ETs 2 26 | % Arrows 27 | \psline[linewidth=0.5pt]{->}(9,8)(8,6.4) 28 | \psline[linewidth=0.5pt]{->}(9,8)(10,6.3) 29 | \psline[linewidth=0.5pt]{->}(8,6)(9,4.3) 30 | \psline[linewidth=0.5pt]{->}(8,6)(7,4.3) 31 | % Nodes 32 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,8){0.4} 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,6){0.4} 34 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,5.7)(10.3,6.3) 35 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,3.7)(7.3,4.3) 36 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](8.7,3.7)(9.3,4.3) 37 | % Text 38 | \rput(7.9,7.3){$X_1 \leq 1$} 39 | \rput(10.1,7.3){$X_1 > 1$} 40 | \rput(6.9,5.3){$X_2 = 0$} 41 | 
\rput(9.1,5.3){$X_2 = 1$} 42 | \end{pspicture} 43 | \end{document} 44 | -------------------------------------------------------------------------------- /tex/frontback/acknowledgments.tex: -------------------------------------------------------------------------------- 1 | % Acknowledgements ============================================================ 2 | 3 | \pdfbookmark[1]{Acknowledgments}{acknowledgments} 4 | \chapter*{Acknowledgments} 5 | 6 | As the saying goes, good premises do not entail good stories. Yet, this 7 | dissertation would certainly not have come to its successful conclusion 8 | without the help, support and trust of colleagues, friends and family. 9 | 10 | First and foremost, I would like to sincerely thank my advisor Pierre Geurts 11 | for his help, guidance and for the freedom I was granted throughout these 12 | years. 13 | 14 | I am grateful to all members of the jury for their interest in this work 15 | and for taking the time to evaluate this dissertation. 16 | 17 | In alphabetical order, I would also like to thank my colleagues who all 18 | contributed to creating and maintaining a pleasant and stimulating working 19 | environment: Antonio, Arnaud, Benjamin, Damien, Fabien, Julien, Lo\"{i}c, 20 | Louis, Marie, Olivier, Rapha\"{e}l, Van Anh, Vincent. Special thanks go to 21 | Antonio, Arnaud and Vincent, who agreed to proofread parts of this manuscript. 22 | 23 | I want to take this opportunity to thank the Scikit-Learn team and all its 24 | contributors. This experience within the open source world really contributed 25 | to shaping my vision of science and software development towards a model 26 | of rigor, pragmatism and openness. Thanks go to Ga\"{e}l, Olivier, Lars, 27 | Mathieu, Andreas, Alexandre and Peter. 28 | 29 | Special thanks go to the rowing team of the RCAE, for their friendship 30 | and good mood in all circumstances. Guys, I thank you all. 31 | 32 | Even if I never succeeded in fully explaining my research topics, I would finally 33 | like to warmly thank my dear friend J\'er\^{o}me and my family for their help 34 | in moments of doubt. 35 | 36 | Last but not least, Laura, I am forever grateful for your unconditional support 37 | and love.
38 | -------------------------------------------------------------------------------- /scripts/ch3_split_ordered.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | np.random.seed(54) 5 | 6 | blue = (0, 0, 1.0) 7 | red = (1.0, 0, 0) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | n_samples = 10 11 | 12 | X = np.empty(n_samples) 13 | X[:n_samples / 2] = np.sort(np.random.normal(loc=-1.0, size=n_samples / 2)) 14 | X[n_samples / 2:] = np.sort(np.random.normal(loc=1.0, size=n_samples / 2)) 15 | y = np.zeros(n_samples) 16 | y[n_samples / 2:] = 1 17 | 18 | plt.plot([-3,3], [0,0], '-', color='k') 19 | 20 | X_ = np.sort(X) 21 | for i in range(len(X_) - 1): 22 | s = X[i] 23 | plt.plot([s,s], [0.00001/2, 0], '-', color=gray) 24 | 25 | plt.plot([-3,-3], [0.00001, 0], '-', color=gray) 26 | 27 | plt.scatter(X[:n_samples / 2], np.zeros(n_samples / 2), color=blue) 28 | plt.scatter(X[n_samples / 2:], np.zeros(n_samples / 2), color=red) 29 | 30 | s1 = X_[6] 31 | s2 = X_[7] 32 | smid = (s1+s2) / 2.0 33 | 34 | plt.plot([s1,s1], [0.00001/2, 0], '-', color='k') 35 | plt.text(s1, 0.0000055, "$v_k$", fontsize=15, horizontalalignment='center') 36 | plt.text(s2, 0.0000055, "$v_{k+1}$", fontsize=15, horizontalalignment='center') 37 | plt.text(smid, 0.000001, "$v^\prime_k$", fontsize=15, horizontalalignment='center') 38 | plt.plot([smid,smid], [0.00001/2, 0], ':', color=gray) 39 | plt.text((s1+(-3)) / 2.0, -0.000001, "${\cal L}^{v_k}_{t_L}$", fontsize=15, horizontalalignment='center') 40 | plt.text((s1+3) / 2.0, -0.000001, "${\cal L}^{v_k}_{t_R}$", fontsize=15, horizontalalignment='center') 41 | 42 | plt.annotate( 43 | '', xy=(-3, -0.0000015), xycoords = 'data', 44 | xytext = (s1, -0.0000015), textcoords = 'data', 45 | arrowprops = {'arrowstyle':'<->'}) 46 | plt.annotate( 47 | '', xy=(s1, -0.0000015), xycoords = 'data', 48 | xytext = (3, -0.0000015), textcoords = 'data', 49 | arrowprops = {'arrowstyle':'<->'}) 50 | 51 | #plt.plot([s2,s2], [0.00003, -0.00003], '-', color='k') 52 | 53 | 54 | plt.text(3, -0.0000007, "$X_j$", fontsize=15) 55 | 56 | plt.show() 57 | 58 | -------------------------------------------------------------------------------- /tex/figures/ch5_tree.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(20,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | \psline[linewidth=0.5pt]{->}(10,12)(8.3,11.3) 10 | \psline[linewidth=0.5pt]{->}(10,12)(11.7,11.3) 11 | \psline[linewidth=0.5pt]{->}(12,11)(11,9.4) 12 | \psline[linewidth=0.5pt]{->}(12,11)(13,9.3) 13 | \psline[linewidth=0.5pt]{->}(11,9)(10,7.3) 14 | \psline[linewidth=0.5pt]{->}(11,9)(12,7.4) 15 | \psline[linewidth=0.5pt]{->}(12,7)(11,5.3) 16 | \psline[linewidth=0.5pt]{->}(12,7)(13,5.3) 17 | % Nodes 18 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 19 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](7.7,10.7)(8.3,11.3) 20 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,11){0.4} 21 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](11,9){0.4} 22 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,8.7)(13.3,9.3) 23 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,6.7)(10.3,7.3) 24 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,7){0.4} 25 | 
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](10.7,4.7)(11.3,5.3) 26 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,4.7)(13.3,5.3) 27 | % Text 28 | \rput(10,12){$t_0$} 29 | \rput(8,11){$t_1$} 30 | \rput(12,11){$t_2$} 31 | \rput(11,9){$t_3$} 32 | \rput(13,9){$t_4$} 33 | \rput(10,7){$t_5$} 34 | \rput(12,7){$t_6$} 35 | \rput(11,5){$t_7$} 36 | \rput(13,5){$t_8$} 37 | \rput(10,11.3){{\small $X_2 \leq 0.303$}} 38 | \rput(12,9.65){{\small $X_2 \leq 0.696$}} 39 | \rput(11,7.65){{\small $X_1 \leq 0.296$}} 40 | \rput(12,5.65){{\small $X_1 \leq 0.703$}} 41 | \rput(8,10.5){$c_1$} 42 | \rput(13,8.5){$c_1$} 43 | \rput(10,6.5){$c_1$} 44 | \rput(11,4.5){$c_2$} 45 | \rput(13,4.5){$c_1$} 46 | \end{pspicture} 47 | \end{document} 48 | -------------------------------------------------------------------------------- /scripts/ch7_bias_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | def feature_importances(X, y, cls, n_trees=5000): 15 | clf = cls(n_estimators=n_trees).fit(X, y) 16 | 17 | if isinstance(clf, RandomizedID3Ensemble): 18 | imp = np.sum(clf.feature_importances_, axis=1) 19 | 20 | else: 21 | imp = np.zeros(X.shape[1]) 22 | 23 | for tree in clf.estimators_: 24 | imp += tree.tree_.compute_feature_importances(normalize=False) 25 | 26 | imp = imp / n_trees 27 | 28 | return imp 29 | 30 | def generate_copy(n1=20, n2=2): 31 | X = np.array([np.arange(n1), np.arange(n1)]).T 32 | X[:, 1] = X[:, 0] >= n1/2 33 | y = X[:, 1] 34 | return X, y 35 | 36 | import brewer2mpl 37 | cmap = [(1., 0, 0), (0, 0, 1)] 38 | 39 | r = {} 40 | g = generate_copy 41 | 42 | for name, cls in [("ETs", partial(ExtraTreesClassifier, max_features=1, criterion="entropy")), 43 | ("RF", partial(RandomForestClassifier, max_features=1, bootstrap=False, criterion="entropy"))]: 44 | f = [] 45 | for n1 in range(2, 20+1, 2): 46 | X, y = g(n1=n1, n2=2) 47 | f.append(feature_importances(X, y, cls=cls)) 48 | r[name] = np.array(f) 49 | 50 | 51 | models = ["ETs", "RF"] 52 | 53 | plt.subplot(1, 2, 1) 54 | 55 | for i, name in enumerate(models): 56 | f = r[name] 57 | plt.plot(range(2, 20+1, 2), f[:, 0], "o-", label="%s" % name, color=cmap[i]) 58 | plt.ylim([0., 1.0]) 59 | plt.title("$X_1$") 60 | plt.legend(loc="best") 61 | 62 | plt.subplot(1, 2, 2) 63 | 64 | for i, name in enumerate(models): 65 | f = r[name] 66 | plt.plot(range(2, 20+1, 2), f[:, 1], "o-", label="%s" % name, color=cmap[i]) 67 | plt.title("$X_2$") 68 | plt.ylim([0., 1.0]) 69 | 70 | plt.show() 71 | 72 | -------------------------------------------------------------------------------- /tex/figures/ch2_mlp.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(20,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | % Layer 0 to 1 10 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,8.6) 11 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,7.6) 12 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,6.6) 13 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,5.6) 14 | 
\psline[linewidth=0.5pt]{->}(3.2,7)(4.6,8.5) 15 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,7.5) 16 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,6.5) 17 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,5.5) 18 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,8.4) 19 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,7.4) 20 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,6.4) 21 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,5.4) 22 | \psline[linewidth=0.5pt]{->}(7,7)(7.6,7) 23 | % Layer 1 to 2 24 | \psline[linewidth=0.5pt]{->}(5.2,8.5)(6.6,7.075) 25 | \psline[linewidth=0.5pt]{->}(5.2,7.5)(6.6,7.025) 26 | \psline[linewidth=0.5pt]{->}(5.2,6.5)(6.6,6.975) 27 | \psline[linewidth=0.5pt]{->}(5.2,5.5)(6.6,6.925) 28 | % Layer 0 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,8){0.4} 30 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,7){0.4} 31 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,6){0.4} 32 | % Layer 1 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,8.5){0.4} 34 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,7.5){0.4} 35 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,6.5){0.4} 36 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,5.5){0.4} 37 | % Layer 2 38 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](7,7){0.4} 39 | % Text 40 | \rput(4,8.75){$w_{ij}$} 41 | \rput(3,8){$x_1$} 42 | \rput(3,7){$x_2$} 43 | \rput(3,6){$x_3$} 44 | \rput(5,8.5){$h_1$} 45 | \rput(5,7.5){$h_2$} 46 | \rput(5,6.5){$h_3$} 47 | \rput(5,5.5){$h_4$} 48 | \rput(7,7){$h_5$} 49 | \rput(7.8,7){$\hat{y}$} 50 | \end{pspicture} 51 | \end{document} 52 | -------------------------------------------------------------------------------- /tex/figures/ch3_goodness.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(10,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | \psline[linewidth=0.5pt]{->}(5,12)(3.3,11.3) 10 | \psline[linewidth=0.5pt]{->}(5,12)(6.7,11.3) 11 | % Nodes 12 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,12){0.4} 13 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,11){0.4} 14 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,10.7)(7.3,11.3) 15 | % Text 16 | \rput(5,12){$t_0$} 17 | \rput(3,11){$t_1$} 18 | \rput(7,11){$t_2$} 19 | \rput(5,11.3){{\small $X_1$}} 20 | \rput(3,10.25){$p(y=c_1|t_1)=\frac{2}{5}$} 21 | \rput(3,9.75){$p(y=c_2|t_1)=\frac{3}{5}$} 22 | \rput(7,10.25){$p(y=c_1|t_2)=\frac{0}{5}$} 23 | \rput(7,9.75){$p(y=c_2|t_2)=\frac{5}{5}$} 24 | % Arrows 25 | \psline[linewidth=0.5pt]{->}(5,9)(3.3,8.3) 26 | \psline[linewidth=0.5pt]{->}(5,9)(6.7,8.3) 27 | % Nodes 28 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,9){0.4} 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,8){0.4} 30 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,7.7)(7.3,8.3) 31 | % Text 32 | \rput(5,9){$t_0$} 33 | \rput(3,8){$t_1$} 34 | \rput(7,8){$t_2$} 35 | \rput(5,8.3){{\small $X_2$}} 36 | \rput(3,7.25){$p(y=c_1|t_1)=\frac{2}{6}$} 37 | \rput(3,6.75){$p(y=c_2|t_1)=\frac{4}{6}$} 38 | \rput(7,7.25){$p(y=c_1|t_2)=\frac{0}{4}$} 39 | \rput(7,6.75){$p(y=c_2|t_2)=\frac{4}{4}$} 40 | % Arrows 41 | \psline[linewidth=0.5pt]{->}(5,6)(3.3,5.3) 42 | \psline[linewidth=0.5pt]{->}(5,6)(6.7,5.3) 43 | % Nodes 44 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,6){0.4} 45 | 
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,5){0.4} 46 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](7,5){0.4} 47 | % Text 48 | \rput(5,6){$t_0$} 49 | \rput(3,5){$t_1$} 50 | \rput(7,5){$t_2$} 51 | \rput(5,5.3){{\small $X_3$}} 52 | \rput(3,4.25){$p(y=c_1|t_1)=\frac{1}{6}$} 53 | \rput(3,3.75){$p(y=c_2|t_1)=\frac{5}{6}$} 54 | \rput(7,4.25){$p(y=c_1|t_2)=\frac{1}{4}$} 55 | \rput(7,3.75){$p(y=c_2|t_2)=\frac{3}{4}$} 56 | \end{pspicture} 57 | \end{document} 58 | -------------------------------------------------------------------------------- /tex/summary.tex: -------------------------------------------------------------------------------- 1 | \documentclass[oneside,openright,titlepage,numbers=noenddot,headinclude,% 2 | footinclude=true,cleardoublepage=empty,abstractoff,BCOR=5mm,% 3 | paper=a4,fontsize=11pt,ngerman,american]{scrreprt} 4 | 5 | % Custom config =============================================================== 6 | 7 | % Classic thesis 8 | \usepackage{amssymb} 9 | \input{classicthesis-config} 10 | 11 | % Theorems and definitions 12 | \usepackage{amsthm} 13 | \newtheorem{theorem}{Theorem} 14 | \newtheorem{lemma}[theorem]{Lemma} 15 | \newtheorem{proposition}[theorem]{Proposition} 16 | \newtheorem{corollary}[theorem]{Corollary} 17 | \newtheorem{definition}{Definition} 18 | 19 | \newtheorem{algorithm}{Algorithm} 20 | \usepackage{algpseudocode} 21 | 22 | % Counters 23 | \renewcommand{\labelenumi}{{\color{halfgray}(\alph{enumi})}} 24 | \renewcommand{\labelenumii}{\color{halfgray}{\roman{enumii}.}} 25 | \renewcommand{\labelitemi}{{\color{halfgray}-}}%\raisebox{0.3ex}{\tiny$\blacksquare$}}} 26 | 27 | \numberwithin{theorem}{chapter} 28 | \numberwithin{definition}{chapter} 29 | \numberwithin{algorithm}{chapter} 30 | \numberwithin{figure}{chapter} 31 | \numberwithin{table}{chapter} 32 | 33 | % Maths 34 | \DeclareMathOperator*{\argmin}{arg\,min} 35 | \DeclareMathOperator*{\argmax}{arg\,max} 36 | 37 | \numberwithin{equation}{chapter} 38 | \allowdisplaybreaks 39 | 40 | % Shaded boxes 41 | \usepackage{framed} 42 | \newenvironment{remark}[1]{% 43 | \definecolor{shadecolor}{gray}{0.9}% 44 | \begin{shaded}{\color{Maroon}\noindent\textsc{#1}}\\% 45 | }{% 46 | \end{shaded}% 47 | } 48 | 49 | % Code snippets 50 | \usepackage{minted} 51 | \definecolor{rulecolor}{rgb}{0.80,0.80,0.80} 52 | \definecolor{bgcolor}{rgb}{1.0,1.0,1.0} 53 | \newminted{python}{bgcolor=bgcolor} 54 | 55 | % Todo 56 | \newcommand{\todo}[1]{\textcolor{red}{[TODO] #1}} 57 | 58 | % PS pictures 59 | \usepackage{pstricks,auto-pst-pdf} 60 | 61 | % Landscape tables 62 | \usepackage{rotating} 63 | 64 | % Checkmarks 65 | \usepackage{pifont}% http://ctan.org/pkg/pifont 66 | \newcommand{\cmark}{\ding{51}}% 67 | \newcommand{\xmark}{\ding{55}}% 68 | 69 | % Wide tables 70 | \usepackage{ltablex} 71 | 72 | 73 | % ----------------------------------------------------------------------------- 74 | 75 | \begin{document} 76 | \frenchspacing 77 | \raggedbottom 78 | \selectlanguage{american} 79 | \pagenumbering{roman} 80 | \pagestyle{plain} 81 | 82 | 83 | \pagenumbering{arabic} 84 | 85 | \include{summary/summary} 86 | 87 | 88 | \end{document} 89 | -------------------------------------------------------------------------------- /scripts/ch2_train_test_error.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.datasets import 
make_friedman1 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.tree import DecisionTreeRegressor 9 | 10 | 11 | # Compute train/test error curves on Friedman1 12 | def error_curves(estimator, parameter, parameter_values, n_repeat=100): 13 | all_train_errors = [] 14 | all_test_errors = [] 15 | 16 | for i in range(n_repeat): 17 | X, y = make_friedman1(n_samples=200) 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) 19 | 20 | train_errors = [] 21 | test_errors = [] 22 | 23 | for j, p in enumerate(parameter_values): 24 | est = estimator(**{parameter: p}) 25 | est.fit(X_train, y_train) 26 | 27 | train_errors.append(mean_squared_error(y_train, est.predict(X_train))) 28 | test_errors.append(mean_squared_error(y_test, est.predict(X_test))) 29 | 30 | all_train_errors.append(train_errors) 31 | all_test_errors.append(test_errors) 32 | 33 | return all_train_errors, all_test_errors 34 | 35 | parameter_values = np.arange(1, 100, dtype=np.int) 36 | all_train_errors, all_test_errors = error_curves(DecisionTreeRegressor, 37 | "min_samples_split", 38 | parameter_values) 39 | 40 | 41 | # Plot the error curves 42 | all_train_errors = np.array(all_train_errors) 43 | all_test_errors = np.array(all_test_errors) 44 | 45 | for i, train_errors in enumerate(all_train_errors): 46 | plt.plot(parameter_values[::-1], train_errors, color=(0, 0, 1, 0.1)) 47 | plt.plot(parameter_values[::-1], np.mean(all_train_errors, axis=0), 48 | color=(0, 0, 1), label="Training error") 49 | 50 | for i, test_errors in enumerate(all_test_errors): 51 | plt.plot(parameter_values[::-1], test_errors, color=(1, 0, 0, 0.1)) 52 | plt.plot(parameter_values[::-1], np.mean(all_test_errors, axis=0), 53 | color=(1, 0, 0), label="Test error") 54 | 55 | m = np.mean(all_test_errors, axis=0) 56 | i = np.argmin(m) 57 | plt.vlines((parameter_values[::-1])[i], 0, 30, color=(0.7, 0.7, 0.7)) 58 | plt.ylim([0, 30]) 59 | 60 | plt.tick_params(axis="x", which="both", bottom="off", top="off", labelbottom="off") 61 | plt.xlabel("Model complexity") 62 | plt.ylabel("Mean square error") 63 | plt.legend(loc="best") 64 | 65 | plt.show() 66 | -------------------------------------------------------------------------------- /scripts/ch7_bias_depth.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | import brewer2mpl 15 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 5).mpl_colors 16 | 17 | 18 | def feature_importances(X, y, cls, n_trees=500): 19 | clf = cls(n_estimators=n_trees).fit(X, y) 20 | 21 | if isinstance(clf, RandomizedID3Ensemble): 22 | imp = np.sum(clf.feature_importances_, axis=1) 23 | 24 | else: 25 | imp = np.zeros(X.shape[1]) 26 | 27 | for tree in clf.estimators_: 28 | imp += tree.tree_.compute_feature_importances(normalize=False) 29 | 30 | imp = imp / n_trees 31 | 32 | return imp 33 | 34 | def generate_strobl_power(n_samples=120, relevance=0.2): 35 | X = np.array([v for v in product(range(2), range(4), range(10), range(20))]).astype(np.int32) 36 | X = np.hstack((np.random.rand(len(X), 1), X)) 37 | 38 | y = np.zeros(len(X)) 39 | mask = (X[:, 1] == 1) 40 | y[mask] = 
np.random.rand(mask.sum()) < 0.5-relevance 41 | y[~mask] = np.random.rand((~mask).sum()) < 0.5+relevance 42 | 43 | indices = np.random.permutation(X.shape[0])[:n_samples] 44 | return X[indices], y[indices].astype(np.int32) 45 | 46 | return X, y 47 | 48 | # Generate all importances 49 | #cls = partial(ExtraTreesClassifier, max_features=1, criterion="entropy") 50 | cls = partial(RandomForestClassifier, max_features=5, criterion="entropy") 51 | 52 | relevances = [0.0, 0.1, 0.2, 0.3] 53 | depths = range(1, 10) 54 | 55 | 56 | for i, relevance in enumerate(relevances): 57 | imp_all = [] 58 | 59 | for n in range(10): 60 | imp = [] 61 | X, y = generate_strobl_power(relevance=relevance) 62 | 63 | for q in depths: 64 | c = partial(cls, max_depth=q) 65 | imp.append(feature_importances(X, y, cls=c)) 66 | 67 | imp = np.array(imp) 68 | imp_all.append(imp) 69 | 70 | imp = np.mean(imp_all, axis=0) 71 | 72 | for q in range(imp.shape[0]): 73 | imp[q] /= np.sum(imp[q, :]) 74 | 75 | plt.subplot(2, 2, i + 1) 76 | 77 | for j in range(X.shape[1]): 78 | plt.plot(depths, imp[:, j], "o-", label="$X_%d$" % j, color=cmap[j]) 79 | 80 | plt.ylim([0., 1.0]) 81 | plt.title("Relevance = %.1f" % relevance) 82 | 83 | if i == 0: 84 | plt.legend(loc="best") 85 | 86 | plt.show() 87 | -------------------------------------------------------------------------------- /scripts/ch3_split_ordered_invariant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | np.random.seed(54) 5 | 6 | blue = (0, 0, 1.0) 7 | red = (1.0, 0, 0) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | n_samples = 10 11 | 12 | X = np.empty(n_samples) 13 | X[:n_samples / 2] = np.sort(np.random.normal(loc=-1.0, size=n_samples / 2)) 14 | X[n_samples / 2:] = np.sort(np.random.normal(loc=1.0, size=n_samples / 2)) 15 | y = np.zeros(n_samples) 16 | y[n_samples / 2:] = 1 17 | 18 | plt.plot([-3,3], [0,0], '-', color='k') 19 | 20 | indices = np.argsort(X) 21 | X_ = X[indices] 22 | for i in range(len(X_) - 1): 23 | s = (X_[i]+X_[i+1]) / 2.0 24 | plt.plot([s,s], [0.00001, 0], ':', color=gray) 25 | 26 | plt.plot([-3,-3], [0.00001, 0], '-', color='k') 27 | 28 | y = y[indices] 29 | 30 | def gini(p): 31 | p1 = 1.0 * np.sum(p) / len(p) 32 | p0 = 1.0 - p1 33 | return p0 * (1 - p0) + p1 * (1 - p1) 34 | 35 | s = [] 36 | delta = [] 37 | 38 | for i in range(1, 8): 39 | i_t = gini(y) 40 | i_t_L = gini(y[:i]) 41 | i_t_R = gini(y[i:]) 42 | p_L = 1.0 * i / n_samples 43 | p_R = 1.0 - p_L 44 | 45 | s.append((X_[i-1] + X_[i]) / 2.0) 46 | delta.append(i_t - p_L * i_t_L - p_R * i_t_R) 47 | 48 | delta = np.array(delta) 49 | delta /= np.max(delta) 50 | delta *= 0.00001 51 | 52 | plt.plot(s, delta, "o-") 53 | 54 | 55 | plt.scatter(X[:n_samples / 2], np.zeros(n_samples / 2), color=blue) 56 | plt.scatter(X[n_samples / 2:], np.zeros(n_samples / 2), color=red) 57 | 58 | s1 = X_[6] 59 | s2 = X_[7] 60 | smid = (s1+s2) / 2.0 61 | 62 | #plt.plot([s2,s2], [0.00001, -0.00001], '-', color=gray) 63 | plt.text(-3, 0.0000105, "$\Delta i(s_j^v, t)$", fontsize=15, horizontalalignment='center') 64 | plt.text(s1, 0.00000095, "$x_{i-1,j}$", fontsize=15, horizontalalignment='center') 65 | plt.text(s2, 0.00000095, "$x_{i,j}$", fontsize=15, horizontalalignment='center') 66 | plt.text(smid, 0.0000105, "$v^\prime_k$", fontsize=15, horizontalalignment='center') 67 | plt.text((smid+(-3)) / 2.0, -0.000001, "${\cal L}^{v^\prime_k}_{t_L}$", fontsize=15, horizontalalignment='center') 68 | plt.text((smid+3) / 2.0, -0.000001, "${\cal 
L}^{v^\prime_k}_{t_R}$", fontsize=15, horizontalalignment='center') 69 | 70 | plt.annotate( 71 | '', xy=(-3, -0.0000015), xycoords = 'data', 72 | xytext = (smid, -0.0000015), textcoords = 'data', 73 | arrowprops = {'arrowstyle':'<->'}) 74 | plt.annotate( 75 | '', xy=(smid, -0.0000015), xycoords = 'data', 76 | xytext = (3, -0.0000015), textcoords = 'data', 77 | arrowprops = {'arrowstyle':'<->'}) 78 | 79 | plt.annotate("$\Delta$", xy=(s[3], delta[3]), xycoords='data', xytext=(s[3]-0.5, delta[3]-0.000001), textcoords='data', arrowprops={'arrowstyle':'->'}) 80 | 81 | plt.text(3, -0.0000007, "$X_j$", fontsize=15) 82 | 83 | plt.show() 84 | 85 | -------------------------------------------------------------------------------- /tex/figures/ch8_rank_large.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(0,-2.5)(7.0,1.5) 7 | \usefont{T1}{ptm}{m}{n} 8 | \psline[linewidth=0.02cm](0.0,1.0)(2.91,1.0) 9 | \psline[linewidth=0.02cm](0.0,1.1)(0.0,0.9) 10 | \psline[linewidth=0.02cm](2.91,1.1)(2.91,0.9) 11 | \rput(1.4505,1.25){CD} 12 | 13 | \psline[linewidth=0.02cm](0.0,0.0)(7.0,0.0) 14 | \psline[linewidth=0.02cm](0.0,0.25)(0.0,0.0) \rput(0.0,0.5){8} 15 | \psline[linewidth=0.02cm](1.0,0.25)(1.0,0.0) \rput(1.0,0.5){7} 16 | \psline[linewidth=0.02cm](2.0,0.25)(2.0,0.0) \rput(2.0,0.5){6} 17 | \psline[linewidth=0.02cm](3.0,0.25)(3.0,0.0) \rput(3.0,0.5){5} 18 | \psline[linewidth=0.02cm](4.0,0.25)(4.0,0.0) \rput(4.0,0.5){4} 19 | \psline[linewidth=0.02cm](5.0,0.25)(5.0,0.0) \rput(5.0,0.5){3} 20 | \psline[linewidth=0.02cm](6.0,0.25)(6.0,0.0) \rput(6.0,0.5){2} 21 | \psline[linewidth=0.02cm](7.0,0.25)(7.0,0.0) \rput(7.0,0.5){1} 22 | 23 | \psline[linewidth=0.02cm](5.62,0.0)(5.62,-1.0) 24 | \psline[linewidth=0.02cm](5.62,-1.0)(7.0,-1.0) 25 | \rput(7.5,-1.0){RS-ET} 26 | 27 | \psline[linewidth=0.02cm](4.85,0.0)(4.85,-1.5) 28 | \psline[linewidth=0.02cm](4.85,-1.5)(7.0,-1.5) 29 | \rput(7.5,-1.5){ET} 30 | 31 | \psline[linewidth=0.02cm](4.47,0.0)(4.47,-2.0) 32 | \psline[linewidth=0.02cm](4.47,-2.0)(7.0,-2.0) 33 | \rput(7.5,-2.0){RS-DT} 34 | 35 | \psline[linewidth=0.02cm](4.16,0.0)(4.16,-2.5) 36 | \psline[linewidth=0.02cm](4.16,-2.5)(7.0,-2.5) 37 | \rput(7.5,-2.5){RP-ET} 38 | 39 | \psline[linewidth=0.02cm](1.39,0.0)(1.39,-1.0) 40 | \psline[linewidth=0.02cm](1.39,-1.0)(0.0,-1.0) 41 | \rput(-0.5,-1.0){P-DT} 42 | 43 | \psline[linewidth=0.02cm](1.77,0.0)(1.77,-1.5) 44 | \psline[linewidth=0.02cm](1.77,-1.5)(0.0,-1.5) 45 | \rput(-0.5,-1.5){RF} 46 | 47 | \psline[linewidth=0.02cm](2.31,0.0)(2.31,-2.0) 48 | \psline[linewidth=0.02cm](2.31,-2.0)(0.0,-2.0) 49 | \rput(-0.5,-2.0){P-ET} 50 | 51 | \psline[linewidth=0.02cm](3.47,0.0)(3.47,-2.5) 52 | \psline[linewidth=0.02cm](3.47,-2.5)(0.0,-2.5) 53 | \rput(-0.5,-2.5){RP-DT} 54 | 55 | \psline[linewidth=0.05cm](5.72,-0.25)(3.37,-0.25) 56 | \psline[linewidth=0.05cm](1.29,-0.5)(4.26,-0.5) 57 | \psline[linewidth=0.05cm](1.67,-0.75)(4.57,-0.75) 58 | \psline[linewidth=0.05cm](2.21,-1.0)(4.95,-1.0) 59 | \end{pspicture} 60 | \end{document} 61 | -------------------------------------------------------------------------------- /tex/figures/ch8_rank_small.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(0,-2.5)(7.0,1.5) 7 | \usefont{T1}{ptm}{m}{n} 8 | 
\psline[linewidth=0.02cm](0.0,1.0)(2.62,1.0) 9 | \psline[linewidth=0.02cm](0.0,1.1)(0.0,0.9) 10 | \psline[linewidth=0.02cm](2.62,1.1)(2.62,0.9) 11 | \rput(1.31,1.25){CD} 12 | 13 | \psline[linewidth=0.02cm](0.0,0.0)(7.0,0.0) 14 | \psline[linewidth=0.02cm](0.0,0.25)(0.0,0.0) \rput(0.0,0.5){8} 15 | \psline[linewidth=0.02cm](1.0,0.25)(1.0,0.0) \rput(1.0,0.5){7} 16 | \psline[linewidth=0.02cm](2.0,0.25)(2.0,0.0) \rput(2.0,0.5){6} 17 | \psline[linewidth=0.02cm](3.0,0.25)(3.0,0.0) \rput(3.0,0.5){5} 18 | \psline[linewidth=0.02cm](4.0,0.25)(4.0,0.0) \rput(4.0,0.5){4} 19 | \psline[linewidth=0.02cm](5.0,0.25)(5.0,0.0) \rput(5.0,0.5){3} 20 | \psline[linewidth=0.02cm](6.0,0.25)(6.0,0.0) \rput(6.0,0.5){2} 21 | \psline[linewidth=0.02cm](7.0,0.25)(7.0,0.0) \rput(7.0,0.5){1} 22 | 23 | \psline[linewidth=0.02cm](5.88,0.0)(5.88,-1.0) 24 | \psline[linewidth=0.02cm](5.88,-1.0)(7.0,-1.0) 25 | \rput(7.5,-1.0){ET} 26 | 27 | \psline[linewidth=0.02cm](5.19,0.0)(5.19,-1.5) 28 | \psline[linewidth=0.02cm](5.19,-1.5)(7.0,-1.5) 29 | \rput(7.5,-1.5){RS-ET} 30 | 31 | \psline[linewidth=0.02cm](5.07,0.0)(5.07,-2.0) 32 | \psline[linewidth=0.02cm](5.07,-2.0)(7.0,-2.0) 33 | \rput(7.5,-2.0){RP-ET} 34 | 35 | \psline[linewidth=0.02cm](4.25,0.0)(4.25,-2.5) 36 | \psline[linewidth=0.02cm](4.25,-2.5)(7.0,-2.5) 37 | \rput(7.5,-2.5){P-ET} 38 | 39 | \psline[linewidth=0.02cm](0.94,0.0)(0.94,-1.0) 40 | \psline[linewidth=0.02cm](0.94,-1.0)(0.0,-1.0) 41 | \rput(-0.5,-1.0){P-DT} 42 | 43 | \psline[linewidth=0.02cm](1.88,0.0)(1.88,-1.5) 44 | \psline[linewidth=0.02cm](1.88,-1.5)(0.0,-1.5) 45 | \rput(-0.5,-1.5){RS-DT} 46 | 47 | \psline[linewidth=0.02cm](2.13,0.0)(2.13,-2.0) 48 | \psline[linewidth=0.02cm](2.13,-2.0)(0.0,-2.0) 49 | \rput(-0.5,-2.0){RF} 50 | 51 | \psline[linewidth=0.02cm](2.69,0.0)(2.69,-2.5) 52 | \psline[linewidth=0.02cm](2.69,-2.5)(0.0,-2.5) 53 | \rput(-0.5,-2.5){RP-DT} 54 | 55 | \psline[linewidth=0.05cm](5.98,-0.25)(4.15,-0.25) 56 | \psline[linewidth=0.05cm](0.84,-0.25)(2.79,-0.25) 57 | \psline[linewidth=0.05cm](1.78,-0.5)(4.35,-0.5) 58 | \psline[linewidth=0.05cm](2.59,-0.75)(5.29,-0.75) 59 | \end{pspicture} 60 | \end{document} 61 | -------------------------------------------------------------------------------- /tex/frontback/abstract.tex: -------------------------------------------------------------------------------- 1 | % Abstract ==================================================================== 2 | 3 | \pdfbookmark[1]{Abstract}{Abstract} 4 | \chapter*{Abstract} 5 | 6 | Data analysis and machine learning have become an integral part of the 7 | modern scientific methodology, offering automated procedures for the prediction 8 | of a phenomenon based on past observations, unraveling underlying patterns in 9 | data and providing insights about the problem. Yet, care should be taken 10 | not to use machine learning as a black-box tool, but rather to consider it as a 11 | methodology, with a rational thought process that is entirely dependent on the 12 | problem under study. In particular, the use of algorithms 13 | should ideally rely on a reasonable understanding of their 14 | mechanisms, properties and limitations, in order to better apprehend and 15 | interpret their results. 16 | 17 | Accordingly, the goal of this thesis is to provide an in-depth 18 | analysis of random forests, consistently calling into 19 | question each and every part of the algorithm, in order to shed new light on 20 | its learning capabilities, inner workings and interpretability.
The first 21 | part of this work studies the induction of decision trees and the construction of 22 | ensembles of randomized trees, motivating their design and purpose whenever 23 | possible. Our contributions follow with an original complexity 24 | analysis of random forests, showing their good computational performance 25 | and scalability, along with an in-depth discussion of their 26 | implementation details, as contributed within Scikit-Learn. 27 | 28 | In the second part of this work, we analyze and discuss the interpretability of 29 | random forests through the lens of variable importance measures. The core of our 30 | contributions rests in the theoretical characterization of the Mean Decrease of 31 | Impurity variable importance measure, from which we prove and derive some of 32 | its properties in the case of multiway totally randomized trees and in 33 | asymptotic conditions. As a consequence of this work, our analysis demonstrates 34 | that variable importances as computed from non-totally randomized trees (e.g., 35 | standard Random Forest) suffer from a combination of defects, due to masking 36 | effects, to misestimations of node impurity, or to the binary structure of 37 | decision trees. 38 | 39 | Finally, the last part of this dissertation addresses limitations of random 40 | forests in the context of large datasets. Through extensive experiments, we 41 | show that subsampling both samples and features simultaneously provides on-par 42 | performance while at the same time lowering the memory requirements. Overall, 43 | this paradigm highlights an intriguing practical fact: there is often no need 44 | to build single models over immensely large datasets. Good performance can 45 | often be achieved by building models on (very) small random parts of the data 46 | and then combining them all in an ensemble, thereby avoiding all practical 47 | burdens of making large data fit into memory.
48 | -------------------------------------------------------------------------------- /scripts/ch7_bias_null.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | def feature_importances(X, y, cls, n_trees=500): 15 | clf = cls(n_estimators=n_trees).fit(X, y) 16 | 17 | if isinstance(clf, RandomizedID3Ensemble): 18 | imp = np.sum(clf.feature_importances_, axis=1) 19 | 20 | else: 21 | imp = np.zeros(X.shape[1]) 22 | 23 | for tree in clf.estimators_: 24 | imp += tree.tree_.compute_feature_importances(normalize=False) 25 | 26 | imp = imp / n_trees 27 | 28 | return imp 29 | 30 | def generate_strobl_null(n_samples=120): 31 | X = np.array([v for v in product(range(2), 32 | range(4), 33 | range(10), 34 | range(20), 35 | range(2))]).astype(np.int32) 36 | X, y = X[:, :-1], X[:, -1] 37 | 38 | indices = np.random.randint(0, X.shape[0], n_samples) 39 | X, y = X[indices], y[indices].astype(np.int32) 40 | X = np.hstack((np.random.rand(len(X), 1), X)) 41 | 42 | return X, y 43 | 44 | # Generate all importances 45 | models = [("TRT", partial(RandomizedID3Ensemble, base_estimator=RandomizedID3Classifier(k=1))), 46 | ("ETs K=1", partial(ExtraTreesClassifier, max_features=1, criterion="entropy")), 47 | ("ETs K=3", partial(ExtraTreesClassifier, max_features=3, criterion="entropy")), 48 | ("ETs K=5", partial(ExtraTreesClassifier, max_features=5, criterion="entropy")), 49 | ("RF K=1", partial(RandomForestClassifier, max_features=1, bootstrap=True, criterion="entropy")), 50 | ("RF K=3", partial(RandomForestClassifier, max_features=3, bootstrap=True, criterion="entropy")), 51 | ("RF K=5", partial(RandomForestClassifier, max_features=5, bootstrap=True, criterion="entropy")),] 52 | 53 | n_repeat = 5 54 | r = {} 55 | 56 | for i in range(n_repeat): 57 | print "Iteration", i 58 | 59 | X, y = generate_strobl_null(n_samples=120) 60 | print entropy(y) 61 | 62 | for name, cls in models: 63 | f = feature_importances(X, y, cls=cls, n_trees=500) 64 | 65 | if i == 0: 66 | r[name] = np.array(f) 67 | else: 68 | r[name] += np.array(f) 69 | 70 | print name, np.sum(f) 71 | 72 | for name in r: 73 | r[name] /= n_repeat 74 | 75 | # Convert to pandas and plot 76 | df = pd.DataFrame(r, index=["X%d" % (i+1) for i in range(X.shape[1])]) 77 | df = df.reindex_axis([name for name, _ in models], axis=1) 78 | 79 | import brewer2mpl 80 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', len(r)) 81 | df.plot(kind="bar", colormap=cmap.mpl_colormap, legend="best", grid=False) 82 | plt.show() 83 | -------------------------------------------------------------------------------- /scripts/ch4_correlation_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | blue = (0, 0, 1.0) 6 | green = (0, 0.8, 0) 7 | red = (1.0, 0, 0) 8 | red_alpha = (1.0, 0, 0, 0.001) 9 | gray = (0.7, 0.7, 0.7) 10 | 11 | results = [[],[], 12 | ["RandomForestRegressor-K=1",3.527128,2.820386,0.706743,0.063868,0.009973,0.286104,0.420639], 13 | ["RandomForestRegressor-K=2",3.036291,2.333874,0.702417,0.075537,0.011347,0.314841,0.387576], 14 
| ["RandomForestRegressor-K=3",2.823907,2.109897,0.714009,0.087809,0.012335,0.349486,0.364523], 15 | ["RandomForestRegressor-K=4",2.715613,1.979086,0.736527,0.102472,0.014302,0.391750,0.344778], 16 | ["RandomForestRegressor-K=5",2.643232,1.887080,0.756151,0.111790,0.015411,0.421380,0.334772], 17 | ["RandomForestRegressor-K=6",2.642354,1.851498,0.790856,0.125342,0.016268,0.466556,0.324300], 18 | ["RandomForestRegressor-K=7",2.636296,1.822316,0.813980,0.134200,0.017159,0.495746,0.318234], 19 | ["RandomForestRegressor-K=8",2.623646,1.784344,0.839303,0.146081,0.018631,0.531100,0.308202], 20 | ["RandomForestRegressor-K=9",2.645439,1.780447,0.864992,0.152977,0.019492,0.558601,0.306390], 21 | ["RandomForestRegressor-K=10",2.638901,1.753437,0.885464,0.160371,0.020184,0.583494,0.301970], 22 | ["ExtraTreesRegressor-K=1",3.376099,2.723586,0.652514,0.051864,0.009532,0.230752,0.421761], 23 | ["ExtraTreesRegressor-K=2",2.801100,2.146534,0.654566,0.060858,0.011926,0.258086,0.396480], 24 | ["ExtraTreesRegressor-K=3",2.536644,1.886837,0.649807,0.067322,0.012756,0.273424,0.376383], 25 | ["ExtraTreesRegressor-K=4",2.409943,1.745583,0.664360,0.076519,0.016511,0.302962,0.361399], 26 | ["ExtraTreesRegressor-K=5",2.330165,1.651706,0.678459,0.086137,0.017063,0.331515,0.346944], 27 | ["ExtraTreesRegressor-K=6",2.285386,1.597063,0.688323,0.092147,0.019216,0.349667,0.338655], 28 | ["ExtraTreesRegressor-K=7",2.263983,1.553772,0.710211,0.100322,0.020510,0.378116,0.332094], 29 | ["ExtraTreesRegressor-K=8",2.246997,1.528167,0.718831,0.107167,0.021703,0.396323,0.322507], 30 | ["ExtraTreesRegressor-K=9",2.236845,1.495768,0.741077,0.115699,0.023020,0.423894,0.317183], 31 | ["ExtraTreesRegressor-K=10",2.232862,1.469781,0.763081,0.123849,0.024420,0.451778,0.311304]] 32 | 33 | max_features = range(1, 10+1) 34 | 35 | ax = plt.subplot(1, 2, 1) 36 | plt.plot(max_features, [results[1+k][1] for k in max_features], 'o-', color=blue, label='Random Forest') 37 | plt.plot(max_features, [results[1+k][2] for k in max_features], 'o--', color=blue) 38 | plt.plot(max_features, [results[1+k][3] for k in max_features], 'o:', color=blue) 39 | plt.plot(max_features, [results[11+k][1] for k in max_features], 'o-', color=red, label='Extremely Randomized Trees') 40 | plt.plot(max_features, [results[11+k][2] for k in max_features], 'o--', color=red) 41 | plt.plot(max_features, [results[11+k][3] for k in max_features], 'o:', color=red) 42 | plt.legend(loc="best") 43 | plt.xlabel("$K$") 44 | 45 | plt.subplot(1, 2, 2, sharex=ax) 46 | plt.plot(max_features, [results[1+k][4] for k in max_features], 'o-', color=blue) 47 | plt.plot(max_features, [results[11+k][4] for k in max_features], 'o-', color=red) 48 | plt.xlabel("$K$") 49 | plt.ylabel("$\\rho$") 50 | 51 | plt.show() 52 | -------------------------------------------------------------------------------- /scripts/ch3_impurity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | blue = (0, 0, 1.0) 5 | red = (1.0, 0, 0) 6 | gray = (0.7, 0.7, 0.7) 7 | 8 | # Criterion 9 | 10 | def impurity_error(p1, p2): 11 | return min(p1, p2) 12 | 13 | def impurity_entropy(p1, p2): 14 | if p1 == 0.0 or p1 == 1.0 or p2 == 0.0 or p2 == 1.0: 15 | return 0.0 16 | else: 17 | return -(p1 * np.log2(p1) + p2 * np.log2(p2)) 18 | 19 | def impurity_gini(p1, p2): 20 | return p1 * (1 - p1) + p2 * (1 - p2) 21 | 22 | # Split 23 | 24 | def p(y_t): 25 | return 1.0 * y_t / np.sum(y_t) 26 | 27 | impurity = impurity_gini 28 | y_t = 
np.array([2, 8], dtype=np.float) 29 | y_t_L = np.array([1, 5], dtype=np.float) 30 | y_t_R = y_t - y_t_L 31 | p_y_t = p(y_t) 32 | p_y_t_L = p(y_t_L) 33 | p_y_t_R = p(y_t_R) 34 | 35 | p_L = y_t_L.sum() / y_t.sum() 36 | p_R = y_t_R.sum() / y_t.sum() 37 | 38 | i_t = impurity(*p_y_t) 39 | i_t_L = impurity(*p_y_t_L) 40 | i_t_R = impurity(*p_y_t_R) 41 | 42 | print "Delta i(s, t) = i(t) - p_L * i(t_L) - p_R * i (t_R)" 43 | print " = %f - %f * %f - %f * %f" % (i_t, p_L, i_t_L, p_R, i_t_R) 44 | print " = %f" % (i_t - p_L * i_t_L - p_R * i_t_R, ) 45 | 46 | 47 | fig = plt.figure() 48 | ax = fig.add_subplot(111) 49 | 50 | x = np.linspace(0.0, 1.0, num=300) 51 | # ax.plot(x, map(impurity, x, 1-x), label="entropy", color=blue) 52 | ax.plot(x, map(impurity_error, x, 1-x), label="$i_E(t)$", color=gray) 53 | ax.plot(x, map(impurity_entropy, x, 1-x), label="$i_H(t)$", color=blue) 54 | ax.plot(x, map(impurity_gini, x, 1-x), label="$i_G(t)$", color=red) 55 | ax.legend(loc="best") 56 | plt.show() 57 | 58 | ax.plot(p_y_t[0], i_t, marker="o", color=red) 59 | ax.plot(p_y_t_L[0], i_t_L, marker="o", color=red) 60 | ax.plot(p_y_t_R[0], i_t_R, marker="o", color=red) 61 | 62 | ax.plot((p_y_t[0], p_y_t[0]), (0, i_t), ":", color=gray) 63 | ax.plot((0, p_y_t[0]), (i_t, i_t), ":", color=gray) 64 | ax.annotate("$i(t)$", xy=(0, i_t), xytext=(0+0.01, i_t), va="center") 65 | ax.annotate("$p(c_1|t)$", xy=(p_y_t[0], 0), xytext=(p_y_t[0], 0+0.025), ha="center") 66 | 67 | ax.plot((p_y_t_L[0], p_y_t_L[0]), (0, i_t_L), ":", color=gray) 68 | ax.plot((0, p_y_t_L[0]), (i_t_L, i_t_L), ":", color=gray) 69 | ax.annotate("$i(t_L)$", xy=(0, i_t_L), xytext=(0+0.01, i_t_L), va="center") 70 | ax.annotate("$p(c_1|t_L)$", xy=(p_y_t_L[0], 0), xytext=(p_y_t_L[0], 0+0.025), ha="center") 71 | 72 | ax.plot((p_y_t_R[0], p_y_t_R[0]), (0, i_t_R), ":", color=gray) 73 | ax.plot((0, p_y_t_R[0]), (i_t_R, i_t_R), ":", color=gray) 74 | ax.annotate("$i(t_R)$", xy=(0, i_t_R), xytext=(0+0.01, i_t_R), va="center") 75 | ax.annotate("$p(c_1|t_R)$", xy=(p_y_t_R[0], 0), xytext=(p_y_t_R[0], 0+0.025), ha="center") 76 | 77 | ax.plot((p_y_t_L[0], p_y_t_R[0]), (i_t_L, i_t_R), "-", color=gray) 78 | ax.plot((p_y_t[0], p_y_t[0]), (i_t, p_L * i_t_L + p_R * i_t_R), "-", color=red) 79 | ax.plot(p_y_t[0], p_L * i_t_L + p_R * i_t_R, marker="o", color=gray) 80 | ax.annotate("$\Delta i(s, t) = %.3f$" % abs(i_t - p_L * i_t_L - p_R * i_t_R), xy=(p_y_t[0], i_t - 0.5*(i_t - p_L * i_t_L - p_R * i_t_R)), xytext=(p_y_t[0]+0.05, i_t - 0.5*(i_t - p_L * i_t_L - p_R * i_t_R)), arrowprops=dict(arrowstyle="->"), va="center") 81 | 82 | #ax.legend(loc="best") 83 | plt.show() 84 | -------------------------------------------------------------------------------- /benchmarks/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils import check_random_state 4 | 5 | 6 | def make_waveforms(n_samples=300, random_state=None): 7 | """Make the waveforms dataset. (CART)""" 8 | random_state = check_random_state(random_state) 9 | 10 | def h1(x): 11 | if x < 7: 12 | return x 13 | elif x < 13: 14 | return 13.-x 15 | else: 16 | return 0. 17 | 18 | def h2(x): 19 | if x < 9: 20 | return 0. 21 | elif x < 15: 22 | return x-9. 23 | else: 24 | return 21.-x 25 | 26 | def h3(x): 27 | if x < 5: 28 | return 0. 29 | elif x < 11: 30 | return x-5. 31 | elif x < 17: 32 | return 17.-x 33 | else: 34 | return 0. 
35 | 36 | u = random_state.rand(n_samples) 37 | y = random_state.randint(low=0, high=3, size=n_samples) 38 | X = random_state.normal(size=(n_samples, 21)) 39 | 40 | for i in range(n_samples): 41 | if y[i] == 0: 42 | ha = h1 43 | hb = h2 44 | elif y[i] == 1: 45 | ha = h1 46 | hb = h3 47 | else: 48 | ha = h2 49 | hb = h3 50 | 51 | for m in np.arange(1, 21+1): 52 | X[i, m-1] += u[i] * ha(m) + (1 - u[i]) * hb(m) 53 | 54 | return X, y 55 | 56 | 57 | def make_ringnorm(n_samples=300, random_state=None): 58 | """Make the ring-norm dataset. (Breiman, Tech. report 460.)""" 59 | random_state = check_random_state(random_state) 60 | a = 1. / 20.**0.5 61 | 62 | y = random_state.randint(low=0, high=2, size=n_samples) 63 | X = np.zeros((n_samples, 20)) 64 | 65 | negatives = (y == 0) 66 | positives = (y == 1) 67 | 68 | X[negatives] = random_state.multivariate_normal(mean=np.zeros(20), cov=4.*np.eye(20), size=negatives.sum()) 69 | X[positives] = random_state.normal(loc=[a]*20, size=(positives.sum(), 20)) 70 | 71 | return X, y 72 | 73 | 74 | def make_twonorm(n_samples=300, random_state=None): 75 | """Make the two-norm dataset. (Breiman, Tech. report 460.)""" 76 | random_state = check_random_state(random_state) 77 | a = 2. / 20.**0.5 78 | 79 | y = random_state.randint(low=0, high=2, size=n_samples) 80 | X = np.zeros((n_samples, 20)) 81 | 82 | negatives = (y == 0) 83 | positives = (y == 1) 84 | 85 | X[negatives] = random_state.normal(loc=[a]*20, size=(negatives.sum(), 20)) 86 | X[positives] = random_state.normal(loc=[-a]*20, size=(positives.sum(), 20)) 87 | 88 | return X, y 89 | 90 | def make_threenorm(n_samples=300, random_state=None): 91 | """Make the three-norm dataset. (Breiman, Tech. report 460.)""" 92 | random_state = check_random_state(random_state) 93 | a = 2. / 20.**0.5 94 | 95 | y = random_state.randint(low=0, high=4, size=n_samples) 96 | X = np.zeros((n_samples, 20)) 97 | 98 | class0 = (y == 0) 99 | class1 = (y == 1) 100 | class2 = (y >= 2) 101 | 102 | X[class0] = random_state.normal(loc=[a]*20, size=(class0.sum(), 20)) 103 | X[class1] = random_state.normal(loc=[-a]*20, size=(class1.sum(), 20)) 104 | X[class2] = random_state.normal(loc=[a,-a]*10, size=(class2.sum(), 20)) 105 | 106 | y[class0] = 0 107 | y[class1] = 0 108 | y[class2] = 1 109 | 110 | return X, y 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Understanding Random Forests 2 | ============================ 3 | 4 | PhD dissertation, Gilles Louppe, July 2014. Defended on October 9, 2014. 5 | 6 | _arXiv:_ http://arxiv.org/abs/1407.7502 7 | 8 | _Mirrors:_ 9 | - http://hdl.handle.net/2268/170309 10 | - http://www.montefiore.ulg.ac.be/~glouppe/pdf/phd-thesis.pdf 11 | 12 | _License:_ BSD 3 clause 13 | 14 | _Contact:_ Gilles Louppe ([@glouppe](https://twitter.com/glouppe/)) 15 | 16 | Please cite using the following BibTeX entry: 17 | 18 | ``` 19 | @phdthesis{louppe2014understanding, 20 | title={Understanding Random Forests: From Theory to Practice}, 21 | author={Louppe, Gilles}, 22 | school={University of Liege, Belgium}, 23 | year=2014, 24 | month=10, 25 | note={arXiv:1407.7502} 26 | } 27 | ``` 28 | 29 | --- 30 | 31 | Data analysis and machine learning have become an integral part of the 32 | modern scientific methodology, offering automated procedures for the prediction 33 | of a phenomenon based on past observations, unraveling underlying patterns in 34 | data and providing insights about the problem.
Yet, care should be taken 35 | not to use machine learning as a black-box tool, but rather to consider it as a 36 | methodology, with a rational thought process that is entirely dependent on the 37 | problem under study. In particular, the use of algorithms 38 | should ideally require a reasonable understanding of their 39 | mechanisms, properties and limitations, in order to better apprehend and 40 | interpret their results. 41 | 42 | Accordingly, the goal of this thesis is to provide an in-depth 43 | analysis of random forests, consistently calling into 44 | question each and every part of the algorithm, in order to shed new light on 45 | its learning capabilities, inner workings and interpretability. The first 46 | part of this work studies the induction of decision trees and the construction of 47 | ensembles of randomized trees, motivating their design and purpose whenever 48 | possible. Our contributions follow with an original complexity 49 | analysis of random forests, showing their good computational performance 50 | and scalability, along with an in-depth discussion of their 51 | implementation details, as contributed within Scikit-Learn. 52 | 53 | In the second part of this work, we analyze and discuss the interpretability of 54 | random forests through the lens of variable importance measures. The core of our 55 | contributions rests in the theoretical characterization of the Mean Decrease of 56 | Impurity variable importance measure, from which we prove and derive some of 57 | its properties in the case of multiway totally randomized trees and in 58 | asymptotic conditions. As a consequence of this work, our analysis demonstrates 59 | that variable importances as computed from non-totally randomized trees (e.g., 60 | standard Random Forest) suffer from a combination of defects, due to masking 61 | effects, misestimation of node impurity, or the binary structure of 62 | decision trees. 63 | 64 | Finally, the last part of this dissertation addresses limitations of random 65 | forests in the context of large datasets. Through extensive experiments, we 66 | show that subsampling both samples and features simultaneously provides on-par 67 | performance while at the same time lowering the memory requirements. Overall, 68 | this paradigm highlights an intriguing practical fact: there is often no need 69 | to build single models over immensely large datasets. Good performance can 70 | often be achieved by building models on (very) small random parts of the data 71 | and then combining them all in an ensemble, thereby avoiding all practical 72 | burdens of making large data fit into memory.
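To make this last point concrete, here is a minimal sketch of the subsampling paradigm using scikit-learn's `BaggingClassifier` (the dataset, subsampling rates and parameter values below are illustrative only, not those used in the thesis):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

# A synthetic stand-in for a "large" dataset.
X, y = make_classification(n_samples=20000, n_features=50, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Each base tree sees only 5% of the samples and half of the features,
# so no single model is ever fit on the full dataset.
ensemble = BaggingClassifier(
    n_estimators=100,
    max_samples=0.05,   # random subsample of the rows
    max_features=0.5,   # random subsample of the columns
    bootstrap=False,    # sample without replacement
    random_state=0,
).fit(X_train, y_train)

print(ensemble.score(X_test, y_test))
```

Each of the 100 trees is fit on roughly 750 of the 15,000 training rows, yet the aggregated vote typically recovers most of the accuracy of an ensemble trained on everything.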
73 | -------------------------------------------------------------------------------- /tex/thesis.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside,openright,titlepage,numbers=noenddot,headinclude,% 2 | footinclude=true,cleardoublepage=empty,abstractoff,BCOR=5mm,% 3 | paper=a4,fontsize=11pt,ngerman,american]{scrreprt} 4 | 5 | % Custom config =============================================================== 6 | 7 | % Classic thesis 8 | \usepackage{amssymb} 9 | \input{classicthesis-config} 10 | 11 | % Theorems and definitions 12 | \usepackage{amsthm} 13 | \newtheorem{theorem}{Theorem} 14 | \newtheorem{lemma}[theorem]{Lemma} 15 | \newtheorem{proposition}[theorem]{Proposition} 16 | \newtheorem{corollary}[theorem]{Corollary} 17 | \newtheorem{definition}{Definition} 18 | 19 | \newtheorem{algorithm}{Algorithm} 20 | \usepackage{algpseudocode} 21 | 22 | % Counters 23 | \renewcommand{\labelenumi}{{\color{halfgray}(\alph{enumi})}} 24 | \renewcommand{\labelenumii}{\color{halfgray}{\roman{enumii}.}} 25 | \renewcommand{\labelitemi}{{\color{halfgray}-}}%\raisebox{0.3ex}{\tiny$\blacksquare$}}} 26 | 27 | \numberwithin{theorem}{chapter} 28 | \numberwithin{definition}{chapter} 29 | \numberwithin{algorithm}{chapter} 30 | \numberwithin{figure}{chapter} 31 | \numberwithin{table}{chapter} 32 | 33 | % Maths 34 | \DeclareMathOperator*{\argmin}{arg\,min} 35 | \DeclareMathOperator*{\argmax}{arg\,max} 36 | 37 | \numberwithin{equation}{chapter} 38 | \allowdisplaybreaks 39 | 40 | % Shaded boxes 41 | \usepackage{framed} 42 | \newenvironment{remark}[1]{% 43 | \definecolor{shadecolor}{gray}{0.9}% 44 | \begin{shaded}{\color{Maroon}\noindent\textsc{#1}}\\% 45 | }{% 46 | \end{shaded}% 47 | } 48 | 49 | % Code snippets 50 | \usepackage{minted} 51 | \definecolor{rulecolor}{rgb}{0.80,0.80,0.80} 52 | \definecolor{bgcolor}{rgb}{1.0,1.0,1.0} 53 | \newminted{python}{bgcolor=bgcolor} 54 | 55 | % Todo 56 | \newcommand{\todo}[1]{\textcolor{red}{[TODO] #1}} 57 | 58 | % PS pictures 59 | \usepackage{pstricks,auto-pst-pdf} 60 | 61 | % Landscape tables 62 | \usepackage{rotating} 63 | 64 | % Checkmarks 65 | \usepackage{pifont}% http://ctan.org/pkg/pifont 66 | \newcommand{\cmark}{\ding{51}}% 67 | \newcommand{\xmark}{\ding{55}}% 68 | 69 | % Wide tables 70 | \usepackage{ltablex} 71 | 72 | 73 | % ----------------------------------------------------------------------------- 74 | 75 | \begin{document} 76 | \frenchspacing 77 | \raggedbottom 78 | \selectlanguage{american} 79 | \pagenumbering{roman} 80 | \pagestyle{plain} 81 | 82 | 83 | % Front pages ================================================================= 84 | \include{frontback/titlepage} 85 | %\cleardoublepage\include{frontback/disclaimer} 86 | \cleardoublepage\include{frontback/jury} 87 | \cleardoublepage\include{frontback/abstract} 88 | %\cleardoublepage\include{frontback/publications} 89 | \cleardoublepage\include{frontback/acknowledgments} 90 | \pagestyle{scrheadings} 91 | \cleardoublepage\include{frontback/toc} 92 | 93 | 94 | % Content ===================================================================== 95 | \pagenumbering{arabic} 96 | 97 | \cleardoublepage 98 | \include{chapters/chapter01}\cleardoublepage 99 | 100 | %\ctparttex{} 101 | \part{Growing Decision Trees}\label{part:1} 102 | \include{chapters/chapter02}\cleardoublepage 103 | \include{chapters/chapter03}\cleardoublepage 104 | \include{chapters/chapter04}\cleardoublepage 105 | \include{chapters/chapter05}\cleardoublepage 106 | 107 | %\ctparttex{} 108 | 
\part{Interpreting Random Forests}\label{part:2} 109 | \include{chapters/chapter06}\cleardoublepage 110 | \include{chapters/chapter07}\cleardoublepage 111 | 112 | %\ctparttex{} 113 | \part{Subsampling data}\label{part:3} 114 | \include{chapters/chapter08} 115 | 116 | \addtocontents{toc}{\protect\vspace*{\baselineskip}\protect} 117 | \cleardoublepage 118 | \makeatletter 119 | \def\toclevel@chapter{-1} 120 | \makeatother 121 | \include{chapters/chapter09} 122 | 123 | 124 | % Back pages ================================================================== 125 | \appendix 126 | \cleardoublepage 127 | \part{Appendix} 128 | 129 | \cleardoublepage\include{frontback/notations} 130 | \cleardoublepage\include{frontback/bibliography} 131 | 132 | 133 | \end{document} 134 | -------------------------------------------------------------------------------- /scripts/ch4_correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from functools import partial 4 | from sklearn.utils import check_random_state 5 | 6 | 7 | def make(n_samples, n_features=5, noise_features=5, random_state=None): 8 | X = check_random_state(random_state).normal(size=(n_samples, n_features+noise_features)) 9 | y = np.sum(X[:, :n_features], axis=1) 10 | return X, y 11 | 12 | 13 | # from sklearn.datasets import make_friedman1 as make 14 | # # make = partial(make,) 15 | 16 | from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor #, PERTRegressor 17 | from sklearn.tree import DecisionTreeRegressor 18 | from sklearn.metrics import mean_squared_error 19 | 20 | n_train = 50 21 | n_test = 600 22 | n_estimators = 10 # number of trees per forest 23 | n_sets = 100 # number of learning sets 24 | n_trees = 50 # number of trees per learning set, for estimating statistics 25 | 26 | # The block below referenced PERTRegressor, which is not imported above (see
# the commented import) and would raise a NameError if executed; it is
# superseded by the list rebuilt just below, so it is kept commented out for
# reference only.
# estimators = [("PERTRegressor", PERTRegressor), 27 | #               ("Bagging", partial(RandomForestRegressor, max_features=1.0, bootstrap=True))] 28 | # estimators.extend([("RandomForestRegressor-K=%d" % i, partial(RandomForestRegressor, max_features=i)) for i in range(1, 10+1)]) 29 | # estimators.extend([("ExtraTreesRegressor-K=%d" % i, partial(ExtraTreesRegressor, max_features=i)) for i in range(1, 10+1)]) 30 | 31 | estimators = [] 32 | estimators.extend([("RandomForestRegressor-M=%d" % i, partial(RandomForestRegressor, n_estimators=i, max_features=1)) for i in range(1, 50+1)]) 33 | #estimators.extend([("ExtraTreesRegressor-M=%d" % i, partial(ExtraTreesRegressor, n_estimators=i, max_features=1)) for i in range(1, 50+1)]) 34 | 35 | 36 | train = [make(n_samples=n_train, random_state=i) for i in range(n_sets)] 37 | X_test, y_test = make(n_samples=n_test) 38 | 39 | for m in range(1, 50+1): 40 | n_estimators = m 41 | estimator = partial(RandomForestRegressor, n_estimators=m, max_features=1) 42 | method = "RandomForestRegressor-M=%d" % m 43 | 44 | # Compute bias/variance on forest predictions 45 | forests = [] 46 | 47 | for k, (X_train, y_train) in enumerate(train): 48 | #forests.append(estimator(n_estimators=n_estimators, random_state=k).fit(X_train, y_train)) 49 | forests.append(estimator(random_state=k).fit(X_train, y_train)) 50 | 51 | pred_forest = np.zeros((n_test, n_sets)) 52 | 53 | error = 0.0 54 | for k, forest in enumerate(forests): 55 | pred_forest[:, k] = forest.predict(X_test) 56 | error += mean_squared_error(y_test, pred_forest[:, k]) 57 | error /= n_sets 58 | 59 | bias_forest = (y_test - np.mean(pred_forest, axis=1)) ** 2 60 | var_forest = np.var(pred_forest,
axis=1) 61 | 62 | # Estimate bias/variance from tree predictions 63 | trees = [] 64 | 65 | for k, (X_train, y_train) in enumerate(train): 66 | #trees.extend(estimator(n_estimators=n_trees, random_state=n_sets+k).fit(X_train, y_train).estimators_) 67 | trees.extend(RandomForestRegressor(n_estimators=n_trees, max_features=1, random_state=n_sets+k).fit(X_train, y_train).estimators_) 68 | 69 | pred_trees = np.zeros((n_test, n_sets * n_trees)) 70 | 71 | for m, tree in enumerate(trees): 72 | pred_trees[:, m] = tree.predict(X_test) 73 | 74 | mu = np.mean(pred_trees, axis=1) 75 | sigma = np.var(pred_trees, axis=1) 76 | rho = np.zeros(n_test) 77 | 78 | for i in range(n_test): 79 | e_prod = 0.0 80 | for k in range(n_sets): 81 | p = pred_trees[i, k*n_trees:(k+1)*n_trees] 82 | p = p.reshape((n_trees, 1)) 83 | e_prod += np.dot(p, p.T).mean() 84 | e_prod /= n_sets 85 | rho[i] = (e_prod - mu[i]**2) / sigma[i] 86 | 87 | bias = (y_test - mu) ** 2 88 | var = rho * sigma + (1 - rho) / n_estimators * sigma 89 | 90 | print "%s,%f,%f,%f,%f,%f,%f,%f" % (method, bias.mean()+var.mean(), bias.mean(), var.mean(), rho.mean(), rho.std(), (rho*sigma).mean(), ((1 - rho) / n_estimators * sigma).mean()) 91 | 92 | # print "%f (error) = %f (b^2) + %f (var)" % (error, bias_forest.mean(), var_forest.mean()) 93 | # print "%f (error) = %f (b^2) + %f (rho*sigma + (1-rho)/M*sigma)" % (bias.mean()+var.mean(), bias.mean(), var.mean()) 94 | # print "var = %f (rho*sigma) + %f (1-rho)/M*sigma ; rho = %f" % ((rho*sigma).mean(), ((1 - rho) / n_estimators * sigma).mean(), rho.mean()) 95 | # print "---" 96 | -------------------------------------------------------------------------------- /scripts/ch4_overfitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | blue = (0, 0, 1.0) 5 | green = (0, 0.8, 0) 6 | red = (1.0, 0, 0) 7 | red_alpha = (1.0, 0, 0, 0.001) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | # Settings 11 | n_repeat = 100 # Number of iterations for computing expectations 12 | n_train = 30 # Size of the training set 13 | n_test = 1000 # Size of the test set 14 | noise = 0.1**0.5 # Standard deviation of the noise 15 | np.random.seed(0) 16 | 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.preprocessing import PolynomialFeatures 19 | from sklearn.linear_model import LinearRegression 20 | 21 | estimators = [("Degree = 1", Pipeline([("polynomial_features", PolynomialFeatures(degree=1, include_bias=False)), ("linear_regression", LinearRegression())])), 22 | ("Degree = 5", Pipeline([("polynomial_features", PolynomialFeatures(degree=5, include_bias=False)), ("linear_regression", LinearRegression())])), 23 | ("Degree = 15", Pipeline([("polynomial_features", PolynomialFeatures(degree=15, include_bias=False)), ("linear_regression", LinearRegression())])),] 24 | 25 | n_estimators = len(estimators) 26 | 27 | # Generate data 28 | def f(x): 29 | x = x.ravel() 30 | 31 | return np.cos(2.5 * np.pi * x) 32 | 33 | def generate(n_samples, noise, n_repeat=1): 34 | X = np.random.rand(n_samples) 35 | X = np.sort(X) 36 | 37 | if n_repeat == 1: 38 | y = f(X) + np.random.normal(0.0, noise, n_samples) 39 | else: 40 | y = np.zeros((n_samples, n_repeat)) 41 | 42 | for i in range(n_repeat): 43 | y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples) 44 | 45 | X = X.reshape((n_samples, 1)) 46 | 47 | return X, y 48 | 49 | X_train = [] 50 | y_train = [] 51 | 52 | for i in range(n_repeat): 53 | X, y = generate(n_samples=n_train, noise=noise) 54 | 
X_train.append(X) 55 | y_train.append(y) 56 | 57 | X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat) 58 | 59 | plt.figure(figsize=(14, 8)) 60 | 61 | # Loop over estimators to compare 62 | for n, (name, estimator) in enumerate(estimators): 63 | # Compute predictions 64 | y_predict = np.zeros((n_test, n_repeat)) 65 | 66 | for i in xrange(n_repeat): 67 | estimator.fit(X_train[i], y_train[i]) 68 | y_predict[:, i] = estimator.predict(X_test) 69 | 70 | # Bias^2 + Variance + Noise decomposition of the mean squared error 71 | y_error = np.zeros(n_test) 72 | 73 | for i in range(n_repeat): 74 | for j in range(n_repeat): 75 | y_error += (y_test[:, j] - y_predict[:, i]) ** 2 76 | 77 | y_error /= (n_repeat * n_repeat) 78 | 79 | y_noise = np.var(y_test, axis=1) 80 | y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2 81 | y_var = np.var(y_predict, axis=1) 82 | 83 | print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) " 84 | " + {3:.4f} (var) + {4:.4f} (noise)".format(name, 85 | np.mean(y_error), 86 | np.mean(y_bias), 87 | np.mean(y_var), 88 | np.mean(y_noise))) 89 | 90 | # Plot figures 91 | ax = plt.subplot(2, n_estimators, n + 1) 92 | plt.setp(ax, xticks=(), yticks=()) 93 | plt.plot(X_test, f(X_test), color=blue) 94 | plt.plot(X_train[0], y_train[0], ".b") 95 | plt.plot(X_test, y_predict[:, 0], color=gray) 96 | 97 | for i in range(1, n_repeat): 98 | plt.plot(X_test, y_predict[:, i], color=red_alpha, alpha=0.05) 99 | 100 | plt.plot(X_test, np.mean(y_predict, axis=1), color=red, 101 | label="$\mathbb{E}_{LS} \^y(x)$") 102 | 103 | plt.xlabel("x") 104 | plt.ylabel("y") 105 | plt.xlim((0., 1.0)) 106 | plt.ylim((-2, 2)) 107 | plt.title(name) 108 | 109 | ax = plt.subplot(2, n_estimators, n_estimators + n + 1) 110 | plt.setp(ax, xticks=(), yticks=()) 111 | plt.plot(X_test, y_error, color=gray, label="$error(x)$") 112 | plt.plot(X_test, y_bias, color=blue, label="$bias^2(x)$"), 113 | plt.plot(X_test, y_var, color=red, label="$var(x)$"), 114 | plt.plot(X_test, y_noise, color=green, label="$noise(x)$") 115 | plt.xlabel("x") 116 | plt.xlim((0., 1.0)) 117 | plt.ylim((0, 2.0)) 118 | 119 | if n == 0: 120 | plt.legend(loc="upper left", prop={"size": 11}) 121 | 122 | plt.show() 123 | -------------------------------------------------------------------------------- /scripts/ch4_correlation_plot2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | blue = (0, 0, 1.0) 6 | green = (0, 0.8, 0) 7 | red = (1.0, 0, 0) 8 | red_alpha = (1.0, 0, 0, 0.7) 9 | gray = (0.7, 0.7, 0.7) 10 | 11 | results=[[], 12 | ["RandomForestRegressor-M=1",7.134504,2.629471,4.505033,0.063198,0.009473,0.283799,4.221234], 13 | ["RandomForestRegressor-M=2",5.023887,2.629471,2.394416,0.063198,0.009473,0.283799,2.110617], 14 | ["RandomForestRegressor-M=3",4.320348,2.629471,1.690877,0.063198,0.009473,0.283799,1.407078], 15 | ["RandomForestRegressor-M=4",3.968578,2.629471,1.339107,0.063198,0.009473,0.283799,1.055309], 16 | ["RandomForestRegressor-M=5",3.757516,2.629471,1.128046,0.063198,0.009473,0.283799,0.844247], 17 | ["RandomForestRegressor-M=6",3.616809,2.629471,0.987338,0.063198,0.009473,0.283799,0.703539], 18 | ["RandomForestRegressor-M=7",3.516303,2.629471,0.886832,0.063198,0.009473,0.283799,0.603033], 19 | ["RandomForestRegressor-M=8",3.440924,2.629471,0.811453,0.063198,0.009473,0.283799,0.527654], 20 | ["RandomForestRegressor-M=9",3.382296,2.629471,0.752825,0.063198,0.009473,0.283799,0.469026], 21 | 
["RandomForestRegressor-M=10",3.335393,2.629471,0.705922,0.063198,0.009473,0.283799,0.422123], 22 | ["RandomForestRegressor-M=11",3.297018,2.629471,0.667547,0.063198,0.009473,0.283799,0.383749], 23 | ["RandomForestRegressor-M=12",3.265039,2.629471,0.635568,0.063198,0.009473,0.283799,0.351770], 24 | ["RandomForestRegressor-M=13",3.237980,2.629471,0.608509,0.063198,0.009473,0.283799,0.324710], 25 | ["RandomForestRegressor-M=14",3.214786,2.629471,0.585316,0.063198,0.009473,0.283799,0.301517], 26 | ["RandomForestRegressor-M=15",3.194685,2.629471,0.565215,0.063198,0.009473,0.283799,0.281416], 27 | ["RandomForestRegressor-M=16",3.177097,2.629471,0.547626,0.063198,0.009473,0.283799,0.263827], 28 | ["RandomForestRegressor-M=17",3.161577,2.629471,0.532107,0.063198,0.009473,0.283799,0.248308], 29 | ["RandomForestRegressor-M=18",3.147783,2.629471,0.518312,0.063198,0.009473,0.283799,0.234513], 30 | ["RandomForestRegressor-M=19",3.135440,2.629471,0.505969,0.063198,0.009473,0.283799,0.222170], 31 | ["RandomForestRegressor-M=20",3.124331,2.629471,0.494861,0.063198,0.009473,0.283799,0.211062], 32 | ["ExtraTreesRegressor-M=1",6.931454,2.484647,4.446807,0.051816,0.009069,0.230366,4.216441], 33 | ["ExtraTreesRegressor-M=2",4.823234,2.484647,2.338587,0.051816,0.009069,0.230366,2.108220], 34 | ["ExtraTreesRegressor-M=3",4.120494,2.484647,1.635847,0.051816,0.009069,0.230366,1.405480], 35 | ["ExtraTreesRegressor-M=4",3.769124,2.484647,1.284476,0.051816,0.009069,0.230366,1.054110], 36 | ["ExtraTreesRegressor-M=5",3.558302,2.484647,1.073654,0.051816,0.009069,0.230366,0.843288], 37 | ["ExtraTreesRegressor-M=6",3.417754,2.484647,0.933106,0.051816,0.009069,0.230366,0.702740], 38 | ["ExtraTreesRegressor-M=7",3.317362,2.484647,0.832715,0.051816,0.009069,0.230366,0.602349], 39 | ["ExtraTreesRegressor-M=8",3.242068,2.484647,0.757421,0.051816,0.009069,0.230366,0.527055], 40 | ["ExtraTreesRegressor-M=9",3.183507,2.484647,0.698860,0.051816,0.009069,0.230366,0.468493], 41 | ["ExtraTreesRegressor-M=10",3.136657,2.484647,0.652010,0.051816,0.009069,0.230366,0.421644], 42 | ["ExtraTreesRegressor-M=11",3.098326,2.484647,0.613679,0.051816,0.009069,0.230366,0.383313], 43 | ["ExtraTreesRegressor-M=12",3.066383,2.484647,0.581736,0.051816,0.009069,0.230366,0.351370], 44 | ["ExtraTreesRegressor-M=13",3.039355,2.484647,0.554708,0.051816,0.009069,0.230366,0.324342], 45 | ["ExtraTreesRegressor-M=14",3.016188,2.484647,0.531541,0.051816,0.009069,0.230366,0.301174], 46 | ["ExtraTreesRegressor-M=15",2.996109,2.484647,0.511462,0.051816,0.009069,0.230366,0.281096], 47 | ["ExtraTreesRegressor-M=16",2.978541,2.484647,0.493894,0.051816,0.009069,0.230366,0.263528], 48 | ["ExtraTreesRegressor-M=17",2.963039,2.484647,0.478392,0.051816,0.009069,0.230366,0.248026], 49 | ["ExtraTreesRegressor-M=18",2.949260,2.484647,0.464613,0.051816,0.009069,0.230366,0.234247], 50 | ["ExtraTreesRegressor-M=19",2.936931,2.484647,0.452284,0.051816,0.009069,0.230366,0.221918], 51 | ["ExtraTreesRegressor-M=20",2.925835,2.484647,0.441188,0.051816,0.009069,0.230366,0.210822]] 52 | 53 | n_estimators = range(1, 20+1) 54 | 55 | ax = plt.subplot() 56 | 57 | plt.plot(n_estimators, [results[m][3] for m in n_estimators], 'o-', color=red, label=r'$var(x)$') 58 | plt.plot(n_estimators, [results[m][6] for m in n_estimators], ':', color=red_alpha, label=r'$\rho(x) \sigma^2_{{\cal L},\theta}$') 59 | plt.plot(n_estimators, [results[m][7] for m in n_estimators], '--', color=red_alpha, label=r'$\frac{1-\rho(x)}{M} \sigma^2_{{\cal L},\theta}$') 60 | 61 | plt.xlabel("$M$") 62 | 63 | 
ax.set_xlim([1, 20]) 64 | plt.legend() 65 | 66 | #plt.plot(n_estimators, [results[20+m][3] for m in n_estimators], 'o-', color=red, label='Extremely Randomized Trees') 67 | 68 | plt.show() 69 | -------------------------------------------------------------------------------- /scripts/ID3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Understanding variable importances in forests of randomized trees. 3 | Gilles Louppe, Louis Wehenkel, Antonio Sutera and Pierre Geurts 4 | NIPS, Lake Tahoe, United States, 2013 5 | http://orbi.ulg.ac.be/handle/2268/155642 6 | 7 | This module implements a simplistic randomized ID3 tree classifier 8 | (`RandomizedID3Classifier`), along with its ensemble counter-part 9 | (`RandomizedID3Ensemble`). 10 | 11 | Warning: These classes implement `fit` and `feature_importances_`, but do not 12 | provide any `predict` method. They only serve as a proof-of-concept. 13 | 14 | Author: Gilles Louppe 15 | License: BSD 3 clause 16 | """ 17 | import copy 18 | import itertools 19 | import numpy as np 20 | 21 | from sklearn.base import BaseEstimator, ClassifierMixin 22 | from sklearn.ensemble import BaseEnsemble 23 | from sklearn.utils import check_random_state 24 | 25 | from demo import entropy 26 | 27 | MAX_INT = np.iinfo("i").max 28 | 29 | 30 | class RandomizedID3Classifier(BaseEstimator, ClassifierMixin): 31 | """Simplistic implementation of an ID3 randomized tree.""" 32 | 33 | def __init__(self, k=1, max_depth=None, random_state=None): 34 | self.k = k 35 | self.max_depth = max_depth 36 | self.random_state = random_state 37 | self.tree_ = None 38 | 39 | def fit(self, X, y): 40 | self.n_features_ = X.shape[1] 41 | self.classes_ = np.unique(y) 42 | self.n_classes_ = len(self.classes_) 43 | self.random_state_ = check_random_state(self.random_state) 44 | 45 | self.values_ = [] 46 | for i in xrange(self.n_features_): 47 | self.values_.append(np.unique(X[:, i])) 48 | 49 | self.tree_ = self._partition(X, 50 | np.searchsorted(self.classes_, y), 51 | range(self.n_features_), 52 | X.shape[0]) 53 | 54 | return self 55 | 56 | def predict(self, X): 57 | raise NotImplementedError 58 | 59 | def _partition(self, X, y, variables, n_samples, depth=0): 60 | rng = self.random_state_ 61 | 62 | # Leaf 63 | if len(variables) == 0 or (self.max_depth is not None and depth >= self.max_depth): 64 | values = 1. * np.bincount(y, minlength=self.n_classes_) / len(y) 65 | return (values, len(y)) 66 | 67 | # Internal node 68 | else: 69 | variables = copy.copy(variables) 70 | n_variables = len(variables) 71 | n_node = len(X) 72 | 73 | best = None 74 | best_score = -np.inf 75 | best_children = None 76 | 77 | features = (rng.permutation(n_variables))[:min(self.k, 78 | n_variables)] 79 | 80 | for i in features: 81 | X_i = variables[i] 82 | 83 | children = [] 84 | 85 | for xi in self.values_[X_i]: 86 | mask_xi = X[:, X_i] == xi 87 | if sum(mask_xi) > 0: 88 | children.append((X[mask_xi], y[mask_xi], sum(mask_xi))) 89 | 90 | score = ((1. * n_node / n_samples) # P(B=b) 91 | * (entropy(y) - sum([1. 
* entropy(c_y) * c_n / n_node 92 | for _, c_y, c_n in children]))) 93 | 94 | if score > best_score: 95 | best = i 96 | best_score = score 97 | best_children = children 98 | 99 | X_i = variables.pop(best) 100 | 101 | return (X_i, 102 | best_score, 103 | [self._partition(c_X, 104 | c_y, 105 | variables, 106 | n_samples, 107 | depth=depth+1) for c_X, 108 | c_y, 109 | _ in best_children]) 110 | 111 | @property 112 | def feature_importances_(self): 113 | def _visit(tree, depth): 114 | if len(tree) == 2: 115 | pass 116 | 117 | else: 118 | imp[tree[0], depth] += tree[1] 119 | 120 | for c in tree[2]: 121 | _visit(c, depth+1) 122 | 123 | imp = np.zeros((self.n_features_, self.n_features_)) 124 | _visit(self.tree_, 0) 125 | 126 | return imp 127 | 128 | 129 | class RandomizedID3Ensemble(BaseEnsemble, ClassifierMixin): 130 | """Simplistic implementation of an ensemble of ID3 randomized trees.""" 131 | 132 | def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None): 133 | super(RandomizedID3Ensemble, self).__init__( 134 | base_estimator=base_estimator, 135 | n_estimators=n_estimators, 136 | estimator_params=("max_depth",)) 137 | 138 | self.max_depth = max_depth 139 | self.random_state = random_state 140 | 141 | def _validate_estimator(self): 142 | super(RandomizedID3Ensemble, self)._validate_estimator( 143 | default=RandomizedID3Classifier()) 144 | 145 | def fit(self, X, y): 146 | random_state = check_random_state(self.random_state) 147 | self._validate_estimator() 148 | self.p = X.shape[1] 149 | 150 | for i in xrange(self.n_estimators): 151 | tree = self._make_estimator() 152 | tree.set_params(random_state=random_state.randint(MAX_INT)) 153 | tree.fit(X, y) 154 | 155 | return self 156 | 157 | def predict(self, X): 158 | raise NotImplementedError 159 | 160 | @property 161 | def feature_importances_(self): 162 | importances = np.zeros((self.p, self.p)) 163 | 164 | for i, tree in enumerate(self.estimators_): 165 | importances += tree.feature_importances_ 166 | 167 | importances /= self.n_estimators 168 | 169 | return importances 170 | -------------------------------------------------------------------------------- /tex/chapters/chapter09.tex: -------------------------------------------------------------------------------- 1 | \chapter{Conclusions}\label{ch:conclusions} 2 | 3 | By and large, machine learning remains an open field of research for which many 4 | questions are still left unanswered, even regarding well-established methods. 5 | In this dissertation, we have revisited decision trees and random forests, 6 | consistently calling into question each and every part of these algorithms, in 7 | order to shed new light on their learning capabilities, inner workings and 8 | interpretability. 9 | 10 | In Part~\textsc{\ref{part:1}} of this work, we laid out the decision trees and 11 | random forests methodology in the context of classification and regression 12 | tasks. Our treatment first considered the induction of individual decision 13 | trees and put them into a unified and composable framework. In particular, our 14 | analysis reviewed assignment rules, stopping criteria and splitting rules, 15 | theoretically motivating their design and purpose whenever possible. We then 16 | proceeded with a systematic study of randomized ensemble methods within the 17 | bias-variance framework. 
We established that variance depends on the 18 | correlation between individual tree predictions, thereby showing why 19 | randomization acts as a mechanism for reducing the generalization error of an 20 | ensemble. Random Forest and its variants were then presented within the 21 | framework previously introduced, and their properties and features discussed 22 | and reviewed. Our contributions followed with an original time and space 23 | complexity analysis of random forests, hence showing their good computational 24 | performance and scalability to larger problems. Finally, the first part of this 25 | work concluded with an in-depth discussion of implementation details of random 26 | forests, highlighting and discussing considerations that are critical, yet 27 | easily overlooked, for guaranteeing good computational performance. While not 28 | directly apparent within this manuscript, this discussion also underlined our 29 | contributions in terms of software, within the open source Scikit-Learn library. 30 | As open science and reproducibility concerns are gaining momentum, we indeed 31 | believe that good quality software should be an integral part, acknowledged 32 | for its own value and impact, of any modern scientific research activity. 33 | 34 | Part~\textsc{\ref{part:2}} of this dissertation analyzed and discussed the 35 | interpretability of random forests through the lens of variable importance measures. 36 | The core of our contributions rests in the theoretical characterization of the 37 | Mean Decrease of Impurity variable importance measure, from which we have then 38 | proved and derived some of its properties in the case of multiway totally 39 | randomized trees and in asymptotic conditions. In particular, we have shown 40 | that variable importances offer a three-level decomposition of the information 41 | jointly provided by the input variables about the output, accounting for all 42 | possible interaction terms in a fair and exhaustive way. More interestingly, we 43 | have also shown that variable importances only depend on relevant variables and 44 | that the importance of irrelevant variables is strictly equal to zero, thereby 45 | making importances a sound and appropriate criterion for assessing the 46 | usefulness of variables. As a consequence of this work, our analysis then 47 | demonstrated that variable importances as computed from non-totally randomized 48 | trees (e.g., standard Random Forest or Extremely Randomized Trees) suffer from 49 | a combination of defects, due to masking effects, misestimation of node 50 | impurity, or the binary structure of decision trees. Overall, we believe 51 | that our analysis should bring helpful insights in a wide range of 52 | applications, by shedding new light on variable importances. In particular, we 53 | advise complementing their interpretation and analysis with a systematic 54 | decomposition of their terms, in order to better understand why variables are 55 | (or are not) important. 56 | 57 | This preliminary work unveils various directions of future work, both from a 58 | theoretical and practical point of view. In our view, the most interesting 59 | theoretical open question would be the characterization of the distribution of 60 | variable importances in the finite setting. Such a characterization would 61 | indeed allow one to more reliably distinguish irrelevant variables (whose 62 | importances are positive in the finite case) from relevant variables.
Another 63 | interesting direction of future work would be to derive a proper 64 | characterization of variable importances in the case of binary trees -- even if we 65 | believe, as pointed out earlier, that variable importances derived from such 66 | ensembles may in fact not be as appropriate as desired. From a more practical 67 | point of view, this study also calls for a re-analysis of previous empirical 68 | studies. We indeed believe that variable importances along with their 69 | decomposition should yield new insights in many cases, providing a better 70 | understanding of the interactions between the input variables and the output, 71 | but also between the input variables themselves. Again, we recommend multiway 72 | totally randomized trees to mitigate sources of bias as much as possible. 73 | 74 | Finally, Part~\textsc{\ref{part:3}} addressed limitations of random forests in 75 | the context of large datasets. Through extensive experiments, we have shown 76 | that subsampling either samples, features or both simultaneously provides on-par 77 | performance while at the same time lowering the memory requirements. 78 | Overall, this paradigm highlights an intriguing practical fact: there is often 79 | no need to build single models over immensely large datasets. Good performance 80 | can often more simply be achieved by building models on small random parts of the 81 | data and then combining them all in an ensemble, thereby avoiding all practical and 82 | computational burdens of making large data fit into memory. Again, this work 83 | raises interesting questions for further work. From a theoretical point of view, 84 | one would be to identify the statistical properties of the learning problem 85 | that are necessary for guaranteeing subsampling strategies to work. In 86 | particular, in which cases is it better to subsample examples rather than 87 | features? From a more practical point of view, other directions of research 88 | also include the study of smarter sampling strategies or the empirical 89 | verification that conclusions extend to non-tree-based methods. 90 | 91 | Overall, this thesis calls for a permanent re-assessment of machine learning 92 | methods and algorithms. It is only through a better understanding of their 93 | mechanisms that algorithms will advance in a consistent and reliable way. 94 | Always seek the what and the why. In conclusion, machine learning should not be 95 | considered as a black-box tool, but as a methodology, with a rational thought 96 | process that is entirely dependent on the problem we are trying to solve. 97 | -------------------------------------------------------------------------------- /benchmarks/resources/bench_randomforest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmark script comparing scikit-learn's RandomForestClassifier 3 | against R's randomForest. 4 | 5 | It uses rpy2 to call R from Python. Timings for randomForest are 6 | pessimistic due to a constant overhead incurred by wrapping numpy matrices 7 | in R data_frames. The effect of the overhead can be reduced 8 | by increasing the number of trees.
9 | 10 | Note: make sure the LD_LIBRARY_PATH is set for rpy2:: 11 | 12 | $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib 13 | """ 14 | 15 | import numpy as np 16 | 17 | from time import time 18 | from functools import wraps 19 | from collections import defaultdict 20 | 21 | from sklearn import datasets as sk_datasets 22 | from sklearn.utils import shuffle 23 | from sklearn.utils import check_random_state 24 | from sklearn.ensemble import RandomForestClassifier 25 | from sklearn.base import BaseEstimator, ClassifierMixin 26 | 27 | from rpy2.robjects.numpy2ri import numpy2ri 28 | from rpy2.robjects.packages import importr 29 | from rpy2 import robjects as ro 30 | 31 | import pylab as pl 32 | 33 | rf = importr('randomForest') 34 | 35 | data_path = '/home/pprett/corpora' 36 | 37 | 38 | class RRandomForestClassifier(BaseEstimator, ClassifierMixin): 39 | 40 | def __init__(self, **kargs): 41 | self.params = kargs 42 | 43 | def fit(self, X, y): 44 | self.classes_ = np.unique(y) 45 | y = np.searchsorted(self.classes_, y) + 1 46 | X = numpy2ri(X) 47 | y = ro.FactorVector(numpy2ri(y)) 48 | self.model_ = rf.randomForest(X, y, **self.params) 49 | return self 50 | 51 | def predict(self, X): 52 | X = numpy2ri(X) 53 | pred = rf.predict_randomForest(self.model_, X) 54 | # R maps class labels 55 | pred = np.array(pred, dtype=np.int32) - 1 56 | return self.classes_[pred] 57 | 58 | 59 | def repeat(n_repetitions=3): 60 | def wrap(f): 61 | def wrapper(*args, **kargs): 62 | scores = [] 63 | for i in range(n_repetitions): 64 | scores.append(f(*args, random_state=i, **kargs)) 65 | scores = np.array(scores) 66 | return scores.mean(axis=0), scores.std(axis=0) 67 | return wraps(f)(wrapper) 68 | return wrap 69 | 70 | 71 | @repeat() 72 | def bench_hastie_10_2(clf, random_state=None): 73 | X, y = sk_datasets.make_hastie_10_2(random_state=random_state) 74 | X_train, X_test = X[:2000], X[2000:] 75 | y_train, y_test = y[:2000], y[2000:] 76 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 77 | X_test = np.asarray(X_test, dtype=np.float32) 78 | t0 = time() 79 | clf.fit(X_train, y_train) 80 | train_time = time() - t0 81 | t0 = time() 82 | pred = clf.predict(X_test) 83 | test_time = time() - t0 84 | error_rate = np.mean(pred != y_test) 85 | return error_rate, train_time, test_time 86 | 87 | 88 | @repeat() 89 | def bench_random_gaussian(clf, random_state=None): 90 | rs = check_random_state(random_state) 91 | shape = (12000, 10) 92 | X = rs.normal(size=shape).reshape(shape) 93 | y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.int32) 94 | 95 | X_train, X_test = X[:2000], X[2000:] 96 | y_train, y_test = y[:2000], y[2000:] 97 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 98 | X_test = np.asarray(X_test, dtype=np.float32) 99 | 100 | t0 = time() 101 | clf.fit(X_train, y_train) 102 | train_time = time() - t0 103 | t0 = time() 104 | pred = clf.predict(X_test) 105 | test_time = time() - t0 106 | error_rate = np.mean(pred != y_test) 107 | return error_rate, train_time, test_time 108 | 109 | 110 | @repeat() 111 | def bench_spam(clf, random_state=None): 112 | X = np.loadtxt(data_path + "/spam/spambase.data", delimiter=",") 113 | y = X[:, -1].ravel() 114 | X = X[:, :-1] 115 | f = open(data_path + "/spam/spambase.names") 116 | feature_names = np.array([l.split(":")[0] for l in f]) 117 | 118 | X, y = shuffle(X, y, random_state=random_state) 119 | X_test, y_test = X[:1536], y[:1536] 120 | X_train, y_train = X[1536:], y[1536:] 121 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 122 | X_test 
= np.asarray(X_test, dtype=np.float32) 123 | 124 | t0 = time() 125 | clf.fit(X_train, y_train) 126 | train_time = time() - t0 127 | t0 = time() 128 | error_rate = (1.0 - clf.score(X_test, y_test)) 129 | test_time = time() - t0 130 | return error_rate, train_time, test_time 131 | 132 | 133 | @repeat() 134 | def bench_madelon(clf, random_state=None): 135 | X_train = np.loadtxt(data_path + "/madelon/madelon_train.data") 136 | y_train = np.loadtxt(data_path + "/madelon/madelon_train.labels") 137 | X_test = np.loadtxt(data_path + "/madelon/madelon_valid.data") 138 | y_test = np.loadtxt(data_path + "/madelon/madelon_valid.labels") 139 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 140 | X_test = np.asarray(X_test, dtype=np.float32) 141 | t0 = time() 142 | clf.fit(X_train, y_train) 143 | train_time = time() - t0 144 | t0 = time() 145 | error_rate = (1.0 - clf.score(X_test, y_test)) 146 | test_time = time() - t0 147 | return error_rate, train_time, test_time 148 | 149 | 150 | @repeat() 151 | def bench_arcene(clf, random_state=None): 152 | X_train = np.loadtxt(data_path + "/arcene/arcene_train.data") 153 | y_train = np.loadtxt(data_path + "/arcene/arcene_train.labels") 154 | X_test = np.loadtxt(data_path + "/arcene/arcene_valid.data") 155 | y_test = np.loadtxt(data_path + "/arcene/arcene_valid.labels") 156 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 157 | X_test = np.asarray(X_test, dtype=np.float32) 158 | t0 = time() 159 | clf.fit(X_train, y_train) 160 | train_time = time() - t0 161 | t0 = time() 162 | error_rate = (1.0 - clf.score(X_test, y_test)) 163 | test_time = time() - t0 164 | return error_rate, train_time, test_time 165 | 166 | 167 | @repeat() 168 | def bench_landsat(clf, random_state=None): 169 | landsat = sk_datasets.load_landsat() 170 | X = np.asarray(landsat.data, order='f', dtype=np.float32) 171 | y = landsat.target 172 | t0 = time() 173 | clf.fit(X, y) 174 | train_time = time() - t0 175 | t0 = time() 176 | error_rate = (1.0 - clf.score(X, y)) 177 | test_time = time() - t0 178 | return error_rate, train_time, test_time 179 | 180 | 181 | @repeat(1) 182 | def bench_mnist(clf, random_state=None): 183 | rs = check_random_state(random_state) 184 | mnist = sk_datasets.fetch_mldata('MNIST original') 185 | inds = np.arange(len(mnist.data)) 186 | rs.shuffle(inds) 187 | cut_off = int(0.9 * len(inds)) 188 | train_i = inds[:cut_off] 189 | test_i = inds[cut_off:] 190 | 191 | X_train = mnist.data[train_i].astype(np.float32) 192 | y_train = mnist.target[train_i].astype(np.float64) 193 | 194 | X_test = mnist.data[test_i].astype(np.float32) 195 | y_test = mnist.target[test_i].astype(np.float64) 196 | 197 | t0 = time() 198 | clf.fit(X_train, y_train) 199 | train_time = time() - t0 200 | t0 = time() 201 | error_rate = (1.0 - clf.score(X_test, y_test)) 202 | test_time = time() - t0 203 | return error_rate, train_time, test_time 204 | 205 | 206 | if __name__ == '__main__': 207 | res = defaultdict(dict) 208 | 209 | clfs = {'r': RRandomForestClassifier(ntree=100, mtry=3, nodesize=1), 210 | 'py': RandomForestClassifier(n_estimators=100, max_features=3, 211 | min_samples_leaf=1, 212 | n_jobs=1)} 213 | datasets = {'random_gaussian': bench_random_gaussian, 214 | 'spam': bench_spam, 215 | 'madelon': bench_madelon, 216 | 'arcene': bench_arcene, 217 | 'landsat': bench_landsat, 218 | 'hastie_10_2': bench_hastie_10_2} 219 | 220 | for impl, clf in clfs.iteritems(): 221 | for dataset, ds_bench in datasets.iteritems(): 222 | mean, std = ds_bench(clf) 223 | res[dataset][impl] = (mean, std) 
224 | 225 | clfs = {'r': RRandomForestClassifier(ntree=10, mtry=3, nodesize=1), 226 | 'py': RandomForestClassifier(n_estimators=10, max_features=3, 227 | min_samples_leaf=1, 228 | n_jobs=1)} 229 | datasets = {'mnist': bench_mnist} 230 | for impl, clf in clfs.iteritems(): 231 | for dataset, ds_bench in datasets.iteritems(): 232 | mean, std = ds_bench(clf) 233 | res[dataset][impl] = (mean, std) 234 | 235 | for ds in res: 236 | print('_' * 80) 237 | print(ds) 238 | print 239 | print("%s\t%s\t%s" % (' '*4, 'r'.center(13), 'py'.center(13))) 240 | for i, metric in enumerate(['score', 'train', 'test']): 241 | print("%s\t%.4f (%.2f)\t%.4f (%.2f)" % 242 | (metric, res[ds]['r'][0][i], res[ds]['r'][1][i], 243 | res[ds]['py'][0][i], res[ds]['py'][1][i])) 244 | print 245 | 246 | -------------------------------------------------------------------------------- /slides/minted.sty: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `minted.sty', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% minted.dtx (with options: `package') 8 | %% Copyright 2010--2011 Konrad Rudolph 9 | %% 10 | %% This work may be distributed and/or modified under the 11 | %% conditions of the LaTeX Project Public License, either version 1.3 12 | %% of this license or (at your option) any later version. 13 | %% The latest version of this license is in 14 | %% http://www.latex-project.org/lppl.txt 15 | %% and version 1.3 or later is part of all distributions of LaTeX 16 | %% version 2005/12/01 or later. 17 | %% 18 | %% Additionally, the project may be distributed under the terms of the new BSD 19 | %% license. 20 | %% 21 | %% This work has the LPPL maintenance status `maintained'. 22 | %% 23 | %% The Current Maintainer of this work is Konrad Rudolph. 24 | %% 25 | %% This work consists of the files minted.dtx and minted.ins 26 | %% and the derived file minted.sty. 
27 | \NeedsTeXFormat{LaTeX2e} 28 | \ProvidesPackage{minted}[2011/09/17 v1.7 Yet another Pygments shim for LaTeX] 29 | \RequirePackage{keyval} 30 | \RequirePackage{fancyvrb} 31 | \RequirePackage{xcolor} 32 | \RequirePackage{float} 33 | \RequirePackage{ifthen} 34 | \RequirePackage{calc} 35 | \RequirePackage{ifplatform} 36 | \DeclareOption{chapter}{\def\minted@float@within{chapter}} 37 | \DeclareOption{section}{\def\minted@float@within{section}} 38 | \ProcessOptions\relax 39 | \ifwindows 40 | \providecommand\DeleteFile[1]{\immediate\write18{del #1}} 41 | \else 42 | \providecommand\DeleteFile[1]{\immediate\write18{rm #1}} 43 | \fi 44 | \newboolean{AppExists} 45 | \newcommand\TestAppExists[1]{ 46 | \ifwindows 47 | \DeleteFile{\jobname.aex} 48 | \immediate\write18{for \string^\@percentchar i in (#1.exe #1.bat #1.cmd) 49 | do set >\jobname.aex >\jobname.aex} %$ 50 | \newread\@appexistsfile 51 | \immediate\openin\@appexistsfile\jobname.aex 52 | \expandafter\def\expandafter\@tmp@cr\expandafter{\the\endlinechar} 53 | \endlinechar=-1\relax 54 | \readline\@appexistsfile to \@apppathifexists 55 | \endlinechar=\@tmp@cr 56 | \ifthenelse{\equal{\@apppathifexists}{}} 57 | {\AppExistsfalse} 58 | {\AppExiststrue} 59 | \immediate\closein\@appexistsfile 60 | \DeleteFile{\jobname.aex} 61 | \immediate\typeout{file deleted} 62 | \else 63 | \immediate\write18{which #1 && touch \jobname.aex} 64 | \IfFileExists{\jobname.aex} 65 | {\AppExiststrue 66 | \DeleteFile{\jobname.aex}} 67 | {\AppExistsfalse} 68 | \fi} 69 | \newcommand\minted@resetoptions{} 70 | \newcommand\minted@defopt[1]{ 71 | \expandafter\def\expandafter\minted@resetoptions\expandafter{% 72 | \minted@resetoptions 73 | \@namedef{minted@opt@#1}{}}} 74 | \newcommand\minted@opt[1]{ 75 | \expandafter\detokenize% 76 | \expandafter\expandafter\expandafter{\csname minted@opt@#1\endcsname}} 77 | \newcommand\minted@define@opt[3][]{ 78 | \minted@defopt{#2} 79 | \ifthenelse{\equal{#1}{}}{ 80 | \define@key{minted@opt}{#2}{\@namedef{minted@opt@#2}{#3}}} 81 | {\define@key{minted@opt}{#2}[#1]{\@namedef{minted@opt@#2}{#3}}}} 82 | \newcommand\minted@define@switch[3][]{ 83 | \minted@defopt{#2} 84 | \define@booleankey{minted@opt}{#2} 85 | {\@namedef{minted@opt@#2}{#3}} 86 | {\@namedef{minted@opt@#2}{#1}}} 87 | \minted@defopt{extra} 88 | \newcommand\minted@define@extra[1]{ 89 | \define@key{minted@opt}{#1}{ 90 | \expandafter\def\expandafter\minted@opt@extra\expandafter{% 91 | \minted@opt@extra,#1=##1}}} 92 | \newcommand\minted@define@extra@switch[1]{ 93 | \define@booleankey{minted@opt}{#1} 94 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 95 | \minted@opt@extra,#1}} 96 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 97 | \minted@opt@extra,#1=false}}} 98 | \minted@define@switch{texcl}{-P texcomments} 99 | \minted@define@switch{mathescape}{-P mathescape} 100 | \minted@define@switch{linenos}{-P linenos} 101 | \minted@define@switch{startinline}{-P startinline} 102 | \minted@define@switch[-P funcnamehighlighting=False]% 103 | {funcnamehighlighting}{-P funcnamehighlighting} 104 | \minted@define@opt{gobble}{-F gobble:n=#1} 105 | \minted@define@opt{bgcolor}{#1} 106 | \minted@define@extra{frame} 107 | \minted@define@extra{framesep} 108 | \minted@define@extra{framerule} 109 | \minted@define@extra{rulecolor} 110 | \minted@define@extra{numbersep} 111 | \minted@define@extra{firstnumber} 112 | \minted@define@extra{stepnumber} 113 | \minted@define@extra{firstline} 114 | \minted@define@extra{lastline} 115 | \minted@define@extra{baselinestretch} 116 | 
\minted@define@extra{xleftmargin} 117 | \minted@define@extra{xrightmargin} 118 | \minted@define@extra{fillcolor} 119 | \minted@define@extra{tabsize} 120 | \minted@define@extra{fontfamily} 121 | \minted@define@extra{fontsize} 122 | \minted@define@extra{fontshape} 123 | \minted@define@extra{fontseries} 124 | \minted@define@extra{formatcom} 125 | \minted@define@extra{label} 126 | \minted@define@extra@switch{numberblanklines} 127 | \minted@define@extra@switch{showspaces} 128 | \minted@define@extra@switch{resetmargins} 129 | \minted@define@extra@switch{samepage} 130 | \minted@define@extra@switch{showtabs} 131 | \minted@define@extra@switch{obeytabs} 132 | \newsavebox{\minted@bgbox} 133 | \newenvironment{minted@colorbg}[1]{ 134 | \def\minted@bgcol{#1} 135 | \noindent 136 | \begin{lrbox}{\minted@bgbox} 137 | \begin{minipage}{\linewidth-2\fboxsep}} 138 | {\end{minipage} 139 | \end{lrbox}% 140 | \colorbox{\minted@bgcol}{\usebox{\minted@bgbox}}} 141 | \newwrite\minted@code 142 | \newcommand\minted@savecode[1]{ 143 | \immediate\openout\minted@code\jobname.pyg 144 | \immediate\write\minted@code{#1} 145 | \immediate\closeout\minted@code} 146 | \newcommand\minted@pygmentize[2][\jobname.pyg]{ 147 | \def\minted@cmd{pygmentize -l #2 -f latex -F tokenmerge 148 | \minted@opt{gobble} \minted@opt{texcl} \minted@opt{mathescape} 149 | \minted@opt{startinline} \minted@opt{funcnamehighlighting} 150 | \minted@opt{linenos} -P "verboptions=\minted@opt{extra}" 151 | -o \jobname.out.pyg #1} 152 | \immediate\write18{\minted@cmd} 153 | % For debugging, uncomment: 154 | %\immediate\typeout{\minted@cmd} 155 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 156 | {} 157 | {\begin{minted@colorbg}{\minted@opt@bgcolor}} 158 | \input{\jobname.out.pyg} 159 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 160 | {} 161 | {\end{minted@colorbg}} 162 | \DeleteFile{\jobname.out.pyg}} 163 | \newcommand\minted@usedefaultstyle{\usemintedstyle{default}} 164 | \newcommand\usemintedstyle[1]{ 165 | \renewcommand\minted@usedefaultstyle{} 166 | \immediate\write18{pygmentize -S #1 -f latex > \jobname.pyg} 167 | \input{\jobname.pyg}} 168 | \newcommand\mint[3][]{ 169 | \DefineShortVerb{#3} 170 | \minted@resetoptions 171 | \setkeys{minted@opt}{#1} 172 | \SaveVerb[aftersave={ 173 | \UndefineShortVerb{#3} 174 | \minted@savecode{\FV@SV@minted@verb} 175 | \minted@pygmentize{#2} 176 | \DeleteFile{\jobname.pyg}}]{minted@verb}#3} 177 | \newcommand\minted@proglang[1]{} 178 | \newenvironment{minted}[2][] 179 | {\VerbatimEnvironment 180 | \renewcommand{\minted@proglang}[1]{#2} 181 | \minted@resetoptions 182 | \setkeys{minted@opt}{#1} 183 | \begin{VerbatimOut}[codes={\catcode`\^^I=12}]{\jobname.pyg}}% 184 | {\end{VerbatimOut} 185 | \minted@pygmentize{\minted@proglang{}} 186 | \DeleteFile{\jobname.pyg}} 187 | \newcommand\inputminted[3][]{ 188 | \minted@resetoptions 189 | \setkeys{minted@opt}{#1} 190 | \minted@pygmentize[#3]{#2}} 191 | \newcommand\newminted[3][]{ 192 | \ifthenelse{\equal{#1}{}} 193 | {\def\minted@envname{#2code}} 194 | {\def\minted@envname{#1}} 195 | \newenvironment{\minted@envname} 196 | {\VerbatimEnvironment\begin{minted}[#3]{#2}} 197 | {\end{minted}} 198 | \newenvironment{\minted@envname *}[1] 199 | {\VerbatimEnvironment\begin{minted}[#3,##1]{#2}} 200 | {\end{minted}}} 201 | \newcommand\newmint[3][]{ 202 | \ifthenelse{\equal{#1}{}} 203 | {\def\minted@shortname{#2}} 204 | {\def\minted@shortname{#1}} 205 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 206 | \mint[#3,##1]{#2}##2}} 207 | \newcommand\newmintedfile[3][]{ 208 | 
\ifthenelse{\equal{#1}{}} 209 | {\def\minted@shortname{#2file}} 210 | {\def\minted@shortname{#1}} 211 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 212 | \inputminted[#3,##1]{#2}{##2}}} 213 | \@ifundefined{minted@float@within} 214 | {\newfloat{listing}{h}{lol}} 215 | {\newfloat{listing}{h}{lol}[\minted@float@within]} 216 | \newcommand\listingscaption{Listing} 217 | \floatname{listing}{\listingscaption} 218 | \newcommand\listoflistingscaption{List of listings} 219 | \providecommand\listoflistings{\listof{listing}{\listoflistingscaption}} 220 | \AtBeginDocument{ 221 | \minted@usedefaultstyle} 222 | \AtEndOfPackage{ 223 | \ifnum\pdf@shellescape=1\relax\else 224 | \PackageError{minted} 225 | {You must invoke LaTeX with the 226 | -shell-escape flag} 227 | {Pass the -shell-escape flag to LaTeX. Refer to the minted.sty 228 | documentation for more information.}\fi 229 | \TestAppExists{pygmentize} 230 | \ifAppExists\else 231 | \PackageError{minted} 232 | {You must have `pygmentize' installed 233 | to use this package} 234 | {Refer to the installation instructions in the minted 235 | documentation for more information.} 236 | \fi} 237 | \endinput 238 | %% 239 | %% End of file `minted.sty'. 240 | -------------------------------------------------------------------------------- /tex/minted.sty: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `minted.sty', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% minted.dtx (with options: `package') 8 | %% Copyright 2010--2011 Konrad Rudolph 9 | %% 10 | %% This work may be distributed and/or modified under the 11 | %% conditions of the LaTeX Project Public License, either version 1.3 12 | %% of this license or (at your option) any later version. 13 | %% The latest version of this license is in 14 | %% http://www.latex-project.org/lppl.txt 15 | %% and version 1.3 or later is part of all distributions of LaTeX 16 | %% version 2005/12/01 or later. 17 | %% 18 | %% Additionally, the project may be distributed under the terms of the new BSD 19 | %% license. 20 | %% 21 | %% This work has the LPPL maintenance status `maintained'. 22 | %% 23 | %% The Current Maintainer of this work is Konrad Rudolph. 24 | %% 25 | %% This work consists of the files minted.dtx and minted.ins 26 | %% and the derived file minted.sty. 
27 | \NeedsTeXFormat{LaTeX2e} 28 | \ProvidesPackage{minted}[2011/09/17 v1.7 Yet another Pygments shim for LaTeX] 29 | \RequirePackage{keyval} 30 | \RequirePackage{fancyvrb} 31 | \RequirePackage{xcolor} 32 | \RequirePackage{float} 33 | \RequirePackage{ifthen} 34 | \RequirePackage{calc} 35 | \RequirePackage{ifplatform} 36 | \DeclareOption{chapter}{\def\minted@float@within{chapter}} 37 | \DeclareOption{section}{\def\minted@float@within{section}} 38 | \ProcessOptions\relax 39 | \ifwindows 40 | \providecommand\DeleteFile[1]{\immediate\write18{del #1}} 41 | \else 42 | \providecommand\DeleteFile[1]{\immediate\write18{rm #1}} 43 | \fi 44 | \newboolean{AppExists} 45 | \newcommand\TestAppExists[1]{ 46 | \ifwindows 47 | \DeleteFile{\jobname.aex} 48 | \immediate\write18{for \string^\@percentchar i in (#1.exe #1.bat #1.cmd) 49 | do set >\jobname.aex >\jobname.aex} %$ 50 | \newread\@appexistsfile 51 | \immediate\openin\@appexistsfile\jobname.aex 52 | \expandafter\def\expandafter\@tmp@cr\expandafter{\the\endlinechar} 53 | \endlinechar=-1\relax 54 | \readline\@appexistsfile to \@apppathifexists 55 | \endlinechar=\@tmp@cr 56 | \ifthenelse{\equal{\@apppathifexists}{}} 57 | {\AppExistsfalse} 58 | {\AppExiststrue} 59 | \immediate\closein\@appexistsfile 60 | \DeleteFile{\jobname.aex} 61 | \immediate\typeout{file deleted} 62 | \else 63 | \immediate\write18{which #1 && touch \jobname.aex} 64 | \IfFileExists{\jobname.aex} 65 | {\AppExiststrue 66 | \DeleteFile{\jobname.aex}} 67 | {\AppExistsfalse} 68 | \fi} 69 | \newcommand\minted@resetoptions{} 70 | \newcommand\minted@defopt[1]{ 71 | \expandafter\def\expandafter\minted@resetoptions\expandafter{% 72 | \minted@resetoptions 73 | \@namedef{minted@opt@#1}{}}} 74 | \newcommand\minted@opt[1]{ 75 | \expandafter\detokenize% 76 | \expandafter\expandafter\expandafter{\csname minted@opt@#1\endcsname}} 77 | \newcommand\minted@define@opt[3][]{ 78 | \minted@defopt{#2} 79 | \ifthenelse{\equal{#1}{}}{ 80 | \define@key{minted@opt}{#2}{\@namedef{minted@opt@#2}{#3}}} 81 | {\define@key{minted@opt}{#2}[#1]{\@namedef{minted@opt@#2}{#3}}}} 82 | \newcommand\minted@define@switch[3][]{ 83 | \minted@defopt{#2} 84 | \define@booleankey{minted@opt}{#2} 85 | {\@namedef{minted@opt@#2}{#3}} 86 | {\@namedef{minted@opt@#2}{#1}}} 87 | \minted@defopt{extra} 88 | \newcommand\minted@define@extra[1]{ 89 | \define@key{minted@opt}{#1}{ 90 | \expandafter\def\expandafter\minted@opt@extra\expandafter{% 91 | \minted@opt@extra,#1=##1}}} 92 | \newcommand\minted@define@extra@switch[1]{ 93 | \define@booleankey{minted@opt}{#1} 94 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 95 | \minted@opt@extra,#1}} 96 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 97 | \minted@opt@extra,#1=false}}} 98 | \minted@define@switch{texcl}{-P texcomments} 99 | \minted@define@switch{mathescape}{-P mathescape} 100 | \minted@define@switch{linenos}{-P linenos} 101 | \minted@define@switch{startinline}{-P startinline} 102 | \minted@define@switch[-P funcnamehighlighting=False]% 103 | {funcnamehighlighting}{-P funcnamehighlighting} 104 | \minted@define@opt{gobble}{-F gobble:n=#1} 105 | \minted@define@opt{bgcolor}{#1} 106 | \minted@define@extra{frame} 107 | \minted@define@extra{framesep} 108 | \minted@define@extra{framerule} 109 | \minted@define@extra{rulecolor} 110 | \minted@define@extra{numbersep} 111 | \minted@define@extra{firstnumber} 112 | \minted@define@extra{stepnumber} 113 | \minted@define@extra{firstline} 114 | \minted@define@extra{lastline} 115 | \minted@define@extra{baselinestretch} 116 | 
\minted@define@extra{xleftmargin} 117 | \minted@define@extra{xrightmargin} 118 | \minted@define@extra{fillcolor} 119 | \minted@define@extra{tabsize} 120 | \minted@define@extra{fontfamily} 121 | \minted@define@extra{fontsize} 122 | \minted@define@extra{fontshape} 123 | \minted@define@extra{fontseries} 124 | \minted@define@extra{formatcom} 125 | \minted@define@extra{label} 126 | \minted@define@extra@switch{numberblanklines} 127 | \minted@define@extra@switch{showspaces} 128 | \minted@define@extra@switch{resetmargins} 129 | \minted@define@extra@switch{samepage} 130 | \minted@define@extra@switch{showtabs} 131 | \minted@define@extra@switch{obeytabs} 132 | \newsavebox{\minted@bgbox} 133 | \newenvironment{minted@colorbg}[1]{ 134 | \def\minted@bgcol{#1} 135 | \noindent 136 | \begin{lrbox}{\minted@bgbox} 137 | \begin{minipage}{\linewidth-2\fboxsep}} 138 | {\end{minipage} 139 | \end{lrbox}% 140 | \colorbox{\minted@bgcol}{\usebox{\minted@bgbox}}} 141 | \newwrite\minted@code 142 | \newcommand\minted@savecode[1]{ 143 | \immediate\openout\minted@code\jobname.pyg 144 | \immediate\write\minted@code{#1} 145 | \immediate\closeout\minted@code} 146 | \newcommand\minted@pygmentize[2][\jobname.pyg]{ 147 | \def\minted@cmd{pygmentize -l #2 -f latex -F tokenmerge 148 | \minted@opt{gobble} \minted@opt{texcl} \minted@opt{mathescape} 149 | \minted@opt{startinline} \minted@opt{funcnamehighlighting} 150 | \minted@opt{linenos} -P "verboptions=\minted@opt{extra}" 151 | -o \jobname.out.pyg #1} 152 | \immediate\write18{\minted@cmd} 153 | % For debugging, uncomment: 154 | %\immediate\typeout{\minted@cmd} 155 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 156 | {} 157 | {\begin{minted@colorbg}{\minted@opt@bgcolor}} 158 | \input{\jobname.out.pyg} 159 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 160 | {} 161 | {\end{minted@colorbg}} 162 | \DeleteFile{\jobname.out.pyg}} 163 | \newcommand\minted@usedefaultstyle{\usemintedstyle{default}} 164 | \newcommand\usemintedstyle[1]{ 165 | \renewcommand\minted@usedefaultstyle{} 166 | \immediate\write18{pygmentize -S #1 -f latex > \jobname.pyg} 167 | \input{\jobname.pyg}} 168 | \newcommand\mint[3][]{ 169 | \DefineShortVerb{#3} 170 | \minted@resetoptions 171 | \setkeys{minted@opt}{#1} 172 | \SaveVerb[aftersave={ 173 | \UndefineShortVerb{#3} 174 | \minted@savecode{\FV@SV@minted@verb} 175 | \minted@pygmentize{#2} 176 | \DeleteFile{\jobname.pyg}}]{minted@verb}#3} 177 | \newcommand\minted@proglang[1]{} 178 | \newenvironment{minted}[2][] 179 | {\VerbatimEnvironment 180 | \renewcommand{\minted@proglang}[1]{#2} 181 | \minted@resetoptions 182 | \setkeys{minted@opt}{#1} 183 | \begin{VerbatimOut}[codes={\catcode`\^^I=12}]{\jobname.pyg}}% 184 | {\end{VerbatimOut} 185 | \minted@pygmentize{\minted@proglang{}} 186 | \DeleteFile{\jobname.pyg}} 187 | \newcommand\inputminted[3][]{ 188 | \minted@resetoptions 189 | \setkeys{minted@opt}{#1} 190 | \minted@pygmentize[#3]{#2}} 191 | \newcommand\newminted[3][]{ 192 | \ifthenelse{\equal{#1}{}} 193 | {\def\minted@envname{#2code}} 194 | {\def\minted@envname{#1}} 195 | \newenvironment{\minted@envname} 196 | {\VerbatimEnvironment\begin{minted}[#3]{#2}} 197 | {\end{minted}} 198 | \newenvironment{\minted@envname *}[1] 199 | {\VerbatimEnvironment\begin{minted}[#3,##1]{#2}} 200 | {\end{minted}}} 201 | \newcommand\newmint[3][]{ 202 | \ifthenelse{\equal{#1}{}} 203 | {\def\minted@shortname{#2}} 204 | {\def\minted@shortname{#1}} 205 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 206 | \mint[#3,##1]{#2}##2}} 207 | \newcommand\newmintedfile[3][]{ 208 | 
\ifthenelse{\equal{#1}{}}
209 | {\def\minted@shortname{#2file}}
210 | {\def\minted@shortname{#1}}
211 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{
212 | \inputminted[#3,##1]{#2}{##2}}}
213 | \@ifundefined{minted@float@within}
214 | {\newfloat{listing}{h}{lol}}
215 | {\newfloat{listing}{h}{lol}[\minted@float@within]}
216 | \newcommand\listingscaption{Listing}
217 | \floatname{listing}{\listingscaption}
218 | \newcommand\listoflistingscaption{List of listings}
219 | \providecommand\listoflistings{\listof{listing}{\listoflistingscaption}}
220 | \AtBeginDocument{
221 | \minted@usedefaultstyle}
222 | \AtEndOfPackage{
223 | \ifnum\pdf@shellescape=1\relax\else
224 | \PackageError{minted}
225 | {You must invoke LaTeX with the
226 | -shell-escape flag}
227 | {Pass the -shell-escape flag to LaTeX. Refer to the minted.sty
228 | documentation for more information.}\fi
229 | \TestAppExists{pygmentize}
230 | \ifAppExists\else
231 | \PackageError{minted}
232 | {You must have `pygmentize' installed
233 | to use this package}
234 | {Refer to the installation instructions in the minted
235 | documentation for more information.}
236 | \fi}
237 | \endinput
238 | %%
239 | %% End of file `minted.sty'.
240 | --------------------------------------------------------------------------------
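Editor's sketch (not part of the repository): a minimal usage example for the package above. It assumes LaTeX is invoked with -shell-escape and that Pygments' pygmentize is on the PATH, which are exactly the two conditions enforced by the \AtEndOfPackage checks.

    \documentclass{article}
    \usepackage{minted}
    \begin{document}
    \usemintedstyle{default}
    % Inline form, with \verb-like delimiters:
    \mint{python}|print("hello")|
    % Environment form; linenos, mathescape and bgcolor map onto the
    % option hooks declared above:
    \begin{minted}[linenos, mathescape, bgcolor=lightgray]{python}
    def square(x):
        return x ** 2  # computes $x^2$
    \end{minted}
    \end{document}

Each snippet is written out to \jobname.pyg, run through pygmentize -l <lexer> -f latex, and the highlighted result is \input back into the document (see \minted@pygmentize above).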
/scripts/demo.py: --------------------------------------------------------------------------------
1 | """
2 | Understanding variable importances in forests of randomized trees.
3 | Gilles Louppe, Louis Wehenkel, Antonio Sutera and Pierre Geurts
4 | NIPS, Lake Tahoe, United States, 2013
5 | http://orbi.ulg.ac.be/handle/2268/155642
6 |
7 | This demo reproduces Table 2 from the paper. It also shows that Extra-
8 | Trees from Scikit-Learn and an ensemble of randomized ID3 trees (see ID3.py)
9 | give identical results.
10 |
11 | Figure 2 from the paper can be obtained using the 2d array of importance values
12 | yielded by a `RandomizedID3Ensemble` (see the commented code at the bottom).
13 |
14 | Author: Gilles Louppe
15 | License: BSD 3 clause
16 | """
17 | import itertools
18 | import numpy as np
19 |
20 | from sklearn.utils import check_random_state
21 |
22 |
23 | # Datasets ====================================================================
24 |
25 | def make_led(irrelevant=0):
26 |     """Exhaustively generate all samples from the 7-segment problem.
27 |
28 |     Parameters
29 |     ----------
30 |     irrelevant : int, optional (default=0)
31 |         The number of irrelevant binary features to add. Since samples are
32 |         generated exhaustively, this makes the size of the resulting dataset
33 |         2^(irrelevant) times larger.
34 |
35 |     Returns
36 |     -------
37 |     X, y
38 |     """
39 |     data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
40 |                      [1, 0, 1, 1, 1, 0, 1, 2],
41 |                      [1, 0, 1, 1, 0, 1, 1, 3],
42 |                      [0, 1, 1, 1, 0, 1, 0, 4],
43 |                      [1, 1, 0, 1, 0, 1, 1, 5],
44 |                      [1, 1, 0, 1, 1, 1, 1, 6],
45 |                      [1, 0, 1, 0, 0, 1, 0, 7],
46 |                      [1, 1, 1, 1, 1, 1, 1, 8],
47 |                      [1, 1, 1, 1, 0, 1, 1, 9],
48 |                      [1, 1, 1, 0, 1, 1, 1, 0]])
49 |
50 |     X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
51 |
52 |     if irrelevant > 0:
53 |         X_ = []
54 |         y_ = []
55 |
56 |         for i in xrange(10):
57 |             for s in itertools.product(range(2), repeat=irrelevant):
58 |                 X_.append(np.concatenate((X[i], s)))
59 |                 y_.append(i)
60 |
61 |         X = np.array(X_, dtype=np.bool)
62 |         y = np.array(y_)
63 |
64 |     return X, y
65 |
66 |
67 | def make_led_sample(n_samples=200, irrelevant=0, random_state=None):
68 |     """Generate random samples from the 7-segment problem.
69 |
70 |     Parameters
71 |     ----------
72 |     n_samples : int, optional (default=200)
73 |         The number of samples to generate.
74 |
75 |     irrelevant : int, optional (default=0)
76 |         The number of irrelevant binary features to add.
77 |
78 |     Returns
79 |     -------
80 |     X, y
81 |     """
82 |
83 |     random_state = check_random_state(random_state)
84 |
85 |     data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
86 |                      [1, 0, 1, 1, 1, 0, 1, 2],
87 |                      [1, 0, 1, 1, 0, 1, 1, 3],
88 |                      [0, 1, 1, 1, 0, 1, 0, 4],
89 |                      [1, 1, 0, 1, 0, 1, 1, 5],
90 |                      [1, 1, 0, 1, 1, 1, 1, 6],
91 |                      [1, 0, 1, 0, 0, 1, 0, 7],
92 |                      [1, 1, 1, 1, 1, 1, 1, 8],
93 |                      [1, 1, 1, 1, 0, 1, 1, 9],
94 |                      [1, 1, 1, 0, 1, 1, 1, 0]])
95 |
96 |     data = data[random_state.randint(0, 10, n_samples)]
97 |     X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
98 |
99 |     if irrelevant > 0:
100 |         X = np.hstack((X, random_state.rand(n_samples, irrelevant) > 0.5))
101 |
102 |     return X, y
103 |
104 |
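# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original script). `mdi_importance` in the
# Formulae section below is a direct implementation of Equation 3 / Theorem 1
# of the paper, which in the paper's notation reads
#
#     Imp(X_m) = \sum_{k=0}^{p-1} \frac{1}{C_p^k (p - k)}
#                \sum_{B \in {\cal P}_k(V^{-m})} I(X_m; Y | B),
#
# i.e. a sum of conditional mutual information terms I(X_m; Y | B) over all
# conditioning subsets B of the other variables, where the combinatorial
# factor weights each subset of size k. A quick sanity check of the
# generators above and of the identity quoted in the demo:
#
#     >>> X, y = make_led()
#     >>> X.shape, y.shape
#     ((10, 7), (10,))
#     >>> entropy(y)   # 10 equiprobable classes: log2(10), about 3.3219
#     3.3219...
#
# (`entropy` is defined in the Formulae section just below.)
# ---------------------------------------------------------------------------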
105 | # Formulae ====================================================================
106 |
107 | from gmpy import comb
108 |
109 | def binomial(k, n):
110 |     """Return the number of combinations of k elements among a collection of
111 |     size n."""
112 |     if k < 0:
113 |         return 0
114 |     elif k > n:
115 |         return 0
116 |     else:
117 |         return comb(int(n), int(k))
118 |
119 |
120 | def entropy(X):
121 |     """Return the entropy (in base 2) of a discrete variable X, encoded as a
122 |     1d array."""
123 |     e = 0.
124 |     n_samples = len(X)
125 |
126 |     for count in np.bincount(X):
127 |         p = 1. * count / n_samples
128 |
129 |         if p > 0:
130 |             e -= p * np.log2(p)
131 |
132 |     return e
133 |
134 | def mdi_importance(X_m, X, y):
135 |     """The MDI importance of X_m for Y, as computed with an infinite ensemble
136 |     of fully developed totally randomized trees.
137 |
138 |     This is a direct implementation of Equation 3 from the paper.
139 |
140 |     Parameters
141 |     ----------
142 |     X_m : int
143 |         The variable for which the importance is computed. It corresponds
144 |         to the column in X (from 0 to p-1).
145 |
146 |     X : array of shape (N, p)
147 |         The input data (X_0, X_1, ... X_{p-1}). X should be large enough
148 |         to accurately represent the actual data distribution.
149 |
150 |     y : array of shape (N,)
151 |         The Y variable.
152 |
153 |     Returns
154 |     -------
155 |     imp : array of size (p,)
156 |         The decomposition of the importance of X_m along its degree of
157 |         interaction with the other input variables, i.e. the p outer terms
158 |         in Equation 3. The actual importance Imp(X_m) amounts to np.sum(imp).
159 |     """
160 |     n_samples, p = X.shape
161 |
162 |     variables = range(p)
163 |     variables.pop(X_m)
164 |     imp = np.zeros(p)
165 |
166 |     values = []
167 |     for i in xrange(p):
168 |         values.append(np.unique(X[:, i]))
169 |
170 |     for k in xrange(p):
171 |         # Weight of each B of size k
172 |         coef = 1. / (binomial(k, p) * (p - k))
173 |
174 |         # For all B of size k
175 |         for B in itertools.combinations(variables, k):
176 |             # For all values B=b
177 |             for b in itertools.product(*[values[B[j]] for j in xrange(k)]):
178 |                 mask_b = np.ones(n_samples, dtype=np.bool)
179 |
180 |                 for j in xrange(k):
181 |                     mask_b &= X[:, B[j]] == b[j]
182 |
183 |                 X_, y_ = X[mask_b, :], y[mask_b]
184 |                 n_samples_b = len(X_)
185 |
186 |                 if n_samples_b > 0:
187 |                     children = []
188 |
189 |                     for xi in values[X_m]:
190 |                         mask_xi = X_[:, X_m] == xi
191 |                         children.append(y_[mask_xi])
192 |
193 |                     imp[k] += (coef
194 |                                * (1. * n_samples_b / n_samples)  # P(B=b)
195 |                                * (entropy(y_) -
196 |                                   sum([entropy(c) * len(c) / n_samples_b
197 |                                        for c in children])))
198 |
199 |     return imp
200 |
201 |
202 | # Demo ========================================================================
203 |
204 | if __name__ == "__main__":
205 |     # Generate data
206 |     n_trees = 5000
207 |
208 |     X, y = make_led()
209 |     p = X.shape[1]
210 |
211 |     results = np.empty((p, p + 1))
212 |
213 |     # Theoretical values
214 |     for i in range(p):
215 |         results[i, 0] = sum(mdi_importance(i, X, y))
216 |
217 |     # Empirical results
218 |     for i in range(p):
219 |         # Using scikit-learn
220 |         from sklearn.ensemble import ExtraTreesClassifier
221 |         clf = ExtraTreesClassifier(n_estimators=n_trees,
222 |                                    max_features=i + 1,
223 |                                    criterion="entropy",
224 |                                    n_jobs=-1).fit(X, y)
225 |
226 |         # Note: Variable importances in Scikit-Learn are normalized by
227 |         # default. Use normalize=False to disable normalization.
228 |
229 |         results[:, i + 1] = sum(tree.tree_.compute_feature_importances(normalize=False)
230 |                                 for tree in clf.estimators_) / clf.n_estimators
231 |
232 |         # # Using a simplistic (but slower) randomized ID3 tree classifier
233 |         # from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble
234 |         # clf = RandomizedID3Ensemble(n_estimators=n_trees,
235 |         #                             base_estimator=RandomizedID3Classifier(k=i + 1)).fit(X, y)
236 |
237 |         # # Note: Here clf.feature_importances_ is a 2d array of shape (p, p).
238 |         # #       In particular, it could be used to regenerate Figure 2 from
239 |         # #       the paper.
240 |
241 |         # results[:, i + 1] = np.sum(clf.feature_importances_, axis=1)
242 |
243 |
244 |     # Print results
245 |     print "Table 2:"
246 |     print "Variable importances as computed with an ensemble of randomized " \
247 |           "trees, for increasing values of $K$. Importances at $K=1$ follow " \
248 |           "their theoretical values, as predicted by Equation 3 in Theorem 1. " \
249 |           "However, as $K$ increases, importances diverge due to masking " \
250 |           "effects. In accordance with Theorem 2, their sum is also always " \
251 |           "equal to $I(X_{1}, ..., X_{7}; Y) = H(Y) = log2(10) = 3.321$ " \
252 |           "since the inputs allow the output to be perfectly predicted."
253 | print 254 | 255 | print "\tEqn.3", 256 | for m in range(p): 257 | print "\tK=%d" % (m + 1), 258 | print 259 | 260 | for m in range(p): 261 | print "X_%d" % (m + 1), 262 | for j in range(p + 1): 263 | print "\t%.4f" % results[m, j], 264 | print 265 | 266 | print "Sum", 267 | for j in range(p + 1): 268 | print "\t%.4f" % sum(results[:, j]), 269 | -------------------------------------------------------------------------------- /tex/frontback/notations.tex: -------------------------------------------------------------------------------- 1 | % Notations ==================================================================== 2 | 3 | \chapter{Notations} 4 | 5 | \begin{tabularx}{\textwidth}{ l X } 6 | ${\cal A}$ & A supervised learning algorithm \dotfill \pageref{ntn:A}\\ 7 | ${\cal A}(\theta, {\cal L})$ & The model $\varphi_{\cal L}$ produced by algorithm ${\cal A}$ over ${\cal L}$ and hyper-parameters $\theta$ \dotfill \pageref{ntn:A-func}\\ 8 | $\alpha_s$ & The proportion of samples in a random patch \dotfill \pageref{ntn:alpha_s}\\ 9 | $\alpha_f$ & The proportion of features in a random patch \dotfill \pageref{ntn:alpha_f}\\ 10 | $b_l$ & The $l$-th value of a categorical variable \dotfill \pageref{ntn:b_l}\\ 11 | $B$ & A subset $B \subseteq V$ of variables \dotfill \pageref{ntn:B}\\ 12 | $c_k$ & The $k$-th class \dotfill \pageref{ntn:c_k}\\ 13 | $C^k_p$ & The number of $k$-combinations from a set of $p$ elements \dotfill \pageref{ntn:C_k_p}\\ 14 | $C(N)$ & The time complexity for splitting $N$ samples \dotfill \pageref{ntn:cN}\\ 15 | $\mathbb{E}$ & Expectation \dotfill \\ 16 | $\overline{E}(\varphi_{\cal L}, {\cal L}^\prime)$ & The average prediction error of $\varphi_{\cal L}$ over ${\cal L}^\prime$ \dotfill \pageref{ntn:E_bar}\\ 17 | $Err(\varphi_{\cal L})$ & The generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:generalization-error}, \pageref{eqn:4:generalization-error}\\ 18 | %$\widehat{Err}^\text{train}(\varphi_{\cal L})$ & The resubstitution estimate or training sample estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:training-error}\\ 19 | %$\widehat{Err}^\text{test}(\varphi_{\cal L})$ & The test sample estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:test-error}\\ 20 | %$\widehat{Err}^\text{CV}(\varphi_{\cal L})$ & The cross-validation estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:cv-error}\\ 21 | %$\widehat{Err}^\text{OOB}(\psi_{\cal L})$ & The out-of-bag estimate of the generalization error of $\psi_{\cal L}$ \dotfill \pageref{eqn:oob-error}\\ 22 | $H(X)$ & The Shannon entropy of $X$ \dotfill \pageref{eqn:6:entropy}\\ 23 | $H(X|Y)$ & The Shannon entropy of $X$ conditional to $Y$\dotfill \pageref{eqn:6:entropy-cond}\\ 24 | ${\cal H}$ & The space of candidate models \dotfill \pageref{ntn:H}\\ 25 | $i(t)$ & The impurity of node $t$ \dotfill \pageref{ntn:i_t}, \pageref{ntn:i_t2}\\ 26 | $i_R(t)$ & The impurity of node $t$ based on the local resubstitution estimate \dotfill \pageref{eqn:impurity:error},~\pageref{eqn:impurity:variance}\\ 27 | $i_H(t)$ & The entropy impurity of node $t$ \dotfill \pageref{eqn:impurity:shannon}\\ 28 | $i_G(t)$ & The Gini impurity of node $t$ \dotfill \pageref{eqn:impurity:gini}\\ 29 | $\Delta i(s, t)$ & The impurity decrease of the split $s$ at node $t$ \dotfill \pageref{def:impurity-decrease}\\ 30 | $I(X;Y)$ & The mutual information between $X$ and $Y$ \dotfill \pageref{eqn:6:mi}\\ 31 | $\text{Imp}(X_j)$ & The variable importance of $X_j$ 
\dotfill \pageref{eq:mdi}, \pageref{eq:mda}\\ 32 | $J$ & The number of classes \dotfill \pageref{ntn:J}\\ 33 | $K$ & The number of folds in cross-validation \dotfill \pageref{ntn:K-cv} \newline The number of input variables drawn at each node for finding a split \dotfill \pageref{ntn:K-split} \\ 34 | $K(\mathbf{x}_i, \mathbf{x}_j)$ & The kernel of $\mathbf{x}_i$ and $\mathbf{x}_j$ \dotfill \pageref{ntn:kernel}, \pageref{ntn:kernel2}\\ 35 | $L$ & A loss function \dotfill \pageref{ntn:L}\newline The number of values of a categorical variable \dotfill \pageref{ntn:L2}\\ 36 | ${\cal L}$ & A learning set $(\mathbf{X}, \mathbf{y})$ \dotfill \pageref{ntn:learning-set}\\ 37 | ${\cal L}^m$ & The $m$-th bootstrap replicate of ${\cal L}$ \dotfill \pageref{ntn:L_m}\\ 38 | ${\cal L}_t$ & The subset of node samples falling into node $t$ \dotfill \pageref{ntn:L_t}\\ 39 | $M$ & The number of base models in an ensemble \dotfill \pageref{ntn:M}\\ 40 | $\mu_{{\cal L},\theta_m}(\mathbf{x})$ & The mean prediction at $X = \mathbf{x}$ of $\varphi_{{\cal L},\theta_m}$ \dotfill \pageref{eqn:4:mu} \\ 41 | $N$ & The number of input samples \dotfill \pageref{ntn:N}\\ 42 | $N_t$ & The number of node samples in node $t$ \dotfill \pageref{ntn:N_t}\\ 43 | $N_{ct}$ & The number of node samples of class $c$ in node $t$ \dotfill \pageref{ntn:N_ct}\\ 44 | $\Omega$ & The universe, or population, from which cases are sampled \dotfill \pageref{ntn:omega}\\ 45 | $p$ & The number of input variables \dotfill \pageref{ntn:p}\\ 46 | $p_L$ & The proportion of node samples going to $t_L$ \dotfill \pageref{ntn:p_L}\\ 47 | $p_R$ & The proportion of node samples going to $t_R$ \dotfill \pageref{ntn:p_R}\\ 48 | $p(t)$ & The estimated probability $p(X \in {\cal X}_t)=\tfrac{N_t}{N}$ \dotfill \pageref{ntn:p_t}\\ 49 | $p(c|t)$ & The empirical probability estimate $p(Y=c | X \in {\cal X}_t)=\tfrac{N_{ct}}{N_t}$ of class $c$ at node $t$ \dotfill \pageref{ntn:p_ct}\\ 50 | $\widehat{p}_{\cal L}$ & An empirical probability estimate computed from the learning set ${\cal L}$\dotfill \pageref{eqn:4:proba-estimates}\\ 51 | $P(X,Y)$ & The joint probability distribution of the input variables $X=(X_1,\dots,X_p)$ and the output variable $Y$ \dotfill \pageref{ntn:P_XY}\\ 52 | ${\cal P}_k(V)$ & The set of subsets of $V$ of size $k$ \dotfill \pageref{ntn:P_k}\\ 53 | $\varphi$ & A model or function ${\cal X} \mapsto {\cal Y}$ \dotfill \pageref{ntn:varphi}\newline A single decision tree \dotfill \pageref{ntn:tree}\\ 54 | $\widetilde{\varphi}$ & The set of terminal nodes in $\varphi$ \dotfill \pageref{ntn:varphi-leafs}\\ 55 | $\varphi(\mathbf{x})$ & The prediction of $\varphi$ for the sample $\mathbf{x}$ \dotfill \pageref{ntn:varphi-x}\\ 56 | $\varphi_{\cal L}$ & A model built from ${\cal L}$ \dotfill \pageref{ntn:varphi-L}\\ 57 | $\varphi_{{\cal L},\theta}$ & A model built from ${\cal L}$ with random seed $\theta$ \dotfill \pageref{ntn:varphi-Ltheta}\\ 58 | $\varphi_B$ & A Bayes model \dotfill \pageref{ntn:varphi-B}\\ 59 | $\psi_{{\cal L},\theta_1,\dots,\theta_M}$ & An ensemble of $M$ models built from ${\cal L}$ and random seeds $\theta_1, \dots, \theta_M$ \dotfill \pageref{ntn:psi} \\ 60 | ${\cal Q}$ & A set ${\cal Q} \subseteq {\cal S}$ of splits of restricted structure \dotfill \pageref{ntn:Q}, \pageref{ntn:Q2}\\ 61 | ${\cal Q}(X_j)$ & The set ${\cal Q}(X_j) \subseteq {\cal Q}$ of univariate binary splits that can be defined on variable $X_j$ \dotfill \pageref{eqn:q:ordered}, \pageref{eqn:q:categorical-cart}\\ 62 | $\rho(\mathbf{x})$ & The correlation 
coefficient between the predictions at $X=\mathbf{x}$ of two randomized models \dotfill \pageref{eqn:4:correlation} \\
63 | $s$ & A split \dotfill \pageref{ntn:s}, \pageref{ntn:s2}\\
64 | $s^*$ & The best split \dotfill \pageref{ntn:s-star}, \pageref{eqn:best-best-split}\\
65 | $s^*_j$ & The best binary split defined on variable $X_j$ \dotfill \pageref{ntn:s-star}, \pageref{eqn:best-split-single}\\
66 | $s_j^v$ & The binary split $(\{\mathbf{x}|x_j \leq v\}, \{\mathbf{x}|x_j > v\})$ defined on variable $X_j$ with discretization threshold $v$ \dotfill \pageref{ntn:s_jv}\\
67 | $s_t$ & The split labeling node $t$ \dotfill \pageref{ntn:s_t}\\
68 | $\tilde{s}^j_t$ & The best surrogate split for $s_t$ defined from $X_j$ \dotfill \pageref{ntn:s-surrogate}\\
69 | ${\cal S}$ & The set of all possible splits $s$ \dotfill \pageref{ntn:S-all}\\
70 | $\sigma^2_{{\cal L},\theta_m}(\mathbf{x})$ & The prediction variance at $X = \mathbf{x}$ of $\varphi_{{\cal L},\theta_m}$ \dotfill \pageref{eqn:4:sigma} \\
71 | $t$ & A node in a decision tree \dotfill \pageref{ntn:node}\\
72 | $t_L$ & The left child of node $t$ \dotfill \pageref{ntn:t_L}, \pageref{ntn:t_L2}\\
73 | $t_R$ & The right child of node $t$ \dotfill \pageref{ntn:t_R}, \pageref{ntn:t_R2}\\
74 | $\theta$ & A vector of hyper-parameter values \dotfill \pageref{ntn:theta}\newline A random seed \dotfill \pageref{ntn:theta-seed}\\
75 | $\theta^*$ & The optimal hyper-parameters \dotfill \pageref{ntn:theta-star}\\
76 | $\widehat{\theta}^*$ & The approximately optimal hyper-parameters \dotfill \pageref{ntn:theta-star-approx}\\
77 | $\theta_m$ & The seed of the $m$-th model in an ensemble \dotfill \pageref{ntn:theta-seed-m}\\
78 | $v$ & A discretization threshold in a binary split \dotfill \pageref{ntn:v}\\
79 | $v_k$ & The $k$-th value of an ordered variable, when node samples are in sorted order \dotfill \pageref{ntn:v_k}\\
80 | $v_k^\prime$ & The mid-cut point between $v_k$ and $v_{k+1}$ \dotfill \pageref{ntn:v_k_prime}\\
81 | $V$ & The set $\{X_1, \dots, X_p\}$ of input variables \dotfill \pageref{ntn:V}\\
82 | $V^{-j}$ & $V \setminus \{X_j\}$ \dotfill \pageref{ntn:V-j}\\
83 | $\mathbb{V}$ & Variance \dotfill \\
84 | $\textbf{x}$ & A case, sample or input vector $(x_1, \dots, x_p)$ \dotfill \pageref{ntn:sample-x}\\
85 | $\textbf{x}_i$ & The $i$-th input sample in ${\cal L}$ \dotfill \pageref{ntn:sample-x_i}\\
86 | $x_j$ & The value of variable $X_j$ for the sample $\textbf{x}$ \dotfill \pageref{ntn:value-x_j}\\
87 | $\textbf{X}$ & The $N\times p$ matrix representing the values of all $N$ samples for all $p$ input variables \dotfill \pageref{ntn:matrix-X}\\
88 | $X_j$ & The $j$-th input variable or feature \dotfill \pageref{ntn:var-X_j}, \pageref{ntn:var-X_j2}\\
89 | $X$ & The random vector $(X_1,\dots,X_p)$ \dotfill \pageref{ntn:vector-X}\\
90 | ${\cal X}_j$ & The domain or space of variable $X_j$ \dotfill \pageref{ntn:space-X_j}\\
91 | ${\cal X}$ & The input space ${\cal X}_1 \times \dots \times {\cal X}_p$ \dotfill \pageref{ntn:space-X}\\
92 | ${\cal X}_t$ & The subspace ${\cal X}_t \subseteq {\cal X}$ represented by node $t$ \dotfill \pageref{ntn:node-space}\\
93 | $y$ & A value of the output variable $Y$ \dotfill \pageref{ntn:value-y}\\
94 | $\widehat{y}_t$ & The value labelling node $t$ \dotfill \pageref{ntn:y_t}\\
95 | $\widehat{y}_t^*$ & The optimal value labelling node $t$ \dotfill \pageref{ntn:y_t-star}\\
96 | $\mathbf{y}$ & The output values $(y_1,\dots,y_N)$ \dotfill \pageref{ntn:vector-y}\\
97 | $Y$ & The output or response variable $Y$ \dotfill \pageref{ntn:var-Y}\\
98 | ${\cal Y}$ & The domain or space of variable $Y$ \dotfill \pageref{ntn:space-Y}\\
99 | \end{tabularx}
100 | --------------------------------------------------------------------------------
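A worked illustration tying several of the entries above together (editor's addition; it uses only quantities defined in this table): a binary split $s$ of node $t$ sends a proportion $p_L$ of the node samples to $t_L$ and a proportion $p_R$ to $t_R$, so its impurity decrease is

    \Delta i(s, t) = i(t) - p_L \, i(t_L) - p_R \, i(t_R),

and the best binary split on variable $X_j$ is $s^*_j = \arg\max_{s \in {\cal Q}(X_j)} \Delta i(s, t)$.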
/benchmarks/visualize.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import glob
3 | import json
4 | import sys
5 |
6 | import matplotlib
7 | import matplotlib.pyplot as plt
8 | import brewer2mpl
9 |
10 | cmap_curve = [(1.0, 0, 0), (0, 0, 1.0)]  # one color per plotted series (red, blue)
11 | cmap_bar = brewer2mpl.get_map('RdYlGn', 'diverging', 9).mpl_colors
12 |
13 | layout = {
14 |     "RandomForestClassifier": {"name": "Scikit-Learn-RF", "order": 0},
15 |     "RandomForestRegressor": {"name": "Scikit-Learn-RF", "order": 0},
16 |     "ExtraTreesClassifier": {"name": "Scikit-Learn-ETs", "order": 1},
17 |     "ExtraTreesRegressor": {"name": "Scikit-Learn-ETs", "order": 1},
18 |     "OpenCV": {"name": "OpenCV-RF", "order": 2},
19 |     "OpenCV-ETs": {"name": "OpenCV-ETs", "order": 3},
20 |     "OK3-RandomForest": {"name": "OK3-RF", "order": 4},
21 |     "OK3-ExtraTrees": {"name": "OK3-ETs", "order": 5},
22 |     "R-randomForest": {"name": "R-RF", "order": 7},
23 |     "Weka": {"name": "Weka-RF", "order": 6},
24 |     "Orange": {"name": "Orange-RF", "order": 8},
25 | }
26 |
27 |
28 | def get(data, field, first=True):  # walk a nested dict along a "__"-separated path
29 |     d = data
30 |     for token in field.split("__"):
31 |         d = d[token]
32 |
33 |     if isinstance(d, list) and first:
34 |         return d[0]
35 |     else:
36 |         return d
37 |
38 |
39 | def groupby(filenames, group_fields, param_field, stat_field):  # -> {group_key: sorted [(param, stat)]}
40 |     all_data = {}
41 |
42 |     for filename in filenames:
43 |         with open(filename, "r") as fd:
44 |             data = json.load(fd)
45 |
46 |         key = []
47 |         for field in group_fields:
48 |             key.append(get(data, field))
49 |         key = tuple(key)
50 |
51 |         if key not in all_data:
52 |             all_data[key] = []
53 |
54 |         all_data[key].append((get(data, param_field), get(data, stat_field, first=False)))
55 |
56 |     for key in all_data:
57 |         all_data[key] = sorted(all_data[key])
58 |
59 |     return all_data
60 |
61 |
62 | def plot_curve(all_data, x_label=None, y_label=None, width=0.2, curve=True, filename=None):
63 |     matplotlib.rc("font", size=13)
64 |     title = all_data.keys()[0][1]
65 |     title = title.split(".")[0]
66 |
67 |     all_data = sorted([(layout[key[0]]["order"], layout[key[0]]["name"], all_data[key]) for key in all_data])
68 |     offset = len(all_data) * width + width/2.0
69 |
70 |     fig, ax = plt.subplots()
71 |
72 |     for i, (key, name, data) in enumerate(all_data):
73 |         xticks = [t[0] for t in data]
74 |         x = [offset*t[0]+i*width for t in data]
75 |         y = [np.mean(t[1]) for t in data]
76 |
77 |         if x_label == "n_estimators":  # keep only the runs with fewer than 1000 trees
78 |             xticks = [t[0] for t in data if t[0] < 1000]
79 |             x = [offset*t[0]+i*width for t in data if t[0] < 1000]
80 |             y = [np.mean(t[1]) for t in data if t[0] < 1000]
81 |
82 |         if y_label == "MSE":  # the scorer stores negated MSE; flip the sign back
83 |             y = [-y_i for y_i in y]
84 |
85 |         if curve:
86 |             ax.plot(xticks, y, label=name, color=cmap_curve[i])
87 |         else:
88 |             ax.bar(x, y, width=width, label=name, color=cmap_curve[i])
89 |
90 |     if curve:
91 |         ax.set_xlim(xticks[0], xticks[-1])
92 |     else:
93 |         ax.set_xlim(-2*width+x[0], x[-1]+2*width)
94 |         ax.set_xticks(x)
95 |         ax.set_xticklabels(xticks)
96 |
97 |     if x_label is not None: ax.set_xlabel(x_label)
98 |     if y_label is not None: ax.set_ylabel(y_label)
99 |
100 |     ax.set_title(title)
101 |     ax.legend(loc="best")
102 |
103 |     if filename:
104 |         plt.savefig("%s.pdf" % filename)
105 |         plt.savefig("%s.jpg" % filename)
106 |         plt.close("all")
107 |     else:
108 |         plt.show()
109 |
110 |
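# ---------------------------------------------------------------------------
# Editor's sketch (hypothetical file names, not part of the original script):
# plot_curve is typically fed by groupby, e.g. to compare fit times across
# libraries as the number of trees grows on a given dataset:
#
#     files = glob.glob("output/n_estimators_*_mnist.npz*")
#     data = groupby(files, ["estimator", "generator"],
#                    "params__n_estimators", "stats__time_fit")
#     plot_curve(data, x_label="n_estimators", y_label="Fit time (s)")
#
# Field paths use "__" to walk nested JSON keys (see get above), so that
# "stats__time_fit" reads data["stats"]["time_fit"].
# ---------------------------------------------------------------------------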
111 | def plot_bar(all_data, y_label=None, width=0.2, filename=None):
112 |     title = all_data.keys()[0][1]
113 |     title = title.split(".")[0]
114 |
115 |     all_data = sorted([(layout[key[0]]["order"], layout[key[0]]["name"], all_data[key]) for key in all_data])
116 |     fig, ax = plt.subplots()
117 |
118 |     for i, (key, name, data) in enumerate(all_data):
119 |         y_mean = np.mean(data[0][1])
120 |         rects = ax.bar([i*width], [y_mean], width=width, label=name, color=cmap_bar[key])
121 |         rect = rects[0]
122 |         plt.text(rect.get_x() + rect.get_width() / 2.0, rect.get_height(), '%.2f' % y_mean, ha='center', va='bottom', fontsize=9)
123 |
124 |     if y_label is not None: ax.set_ylabel(y_label)
125 |     ax.set_title(title)
126 |     ax.set_xticks([])
127 |     ax.set_xlim(-width, len(layout)*width-width)
128 |     ax.legend(loc="best", prop={"size": 9})
129 |
130 |     if filename:
131 |         plt.savefig("%s.pdf" % filename)
132 |         plt.savefig("%s.jpg" % filename)
133 |         plt.close("all")
134 |     else:
135 |         plt.show()
136 |
137 |
138 | def make_5_4_1():
139 |     # Plot results on artificial data
140 |     regression = ["make_friedman1", "make_friedman2", "make_friedman3"]
141 |     classification = ["make_hastie_10_2", "make_waveforms", "make_twonorm", "make_threenorm", "make_ringnorm"]
142 |
143 |     params = [("n_estimators", "params__n_estimators", True),
144 |               ("max_features", "params__max_features", False),
145 |               ("bootstrap", "params__bootstrap", False),
146 |               ("n_train", "stats__n_train", True),
147 |               ("n_features", "stats__n_features", True)]
148 |
149 |     stats = [("time_fit", "Fit time (s)"),
150 |              ("time_predict", "Predict time (s)"),
151 |              ("score_make_scorer(accuracy_score)", "Accuracy"),
152 |              ("score_make_scorer(roc_auc_score, needs_threshold=True)", "AUC"),
153 |              ("score_make_scorer(mean_squared_error, greater_is_better=False)", "MSE"),
154 |              ("score_make_scorer(r2_score)", "R2"),
155 |              ("leaves", "Leaves"),
156 |              ("average_depth", "Average depth")]
157 |
158 |     for dataset in regression+classification:
159 |         for prefix, param_field, curve in params:
160 |             files = [f for f in glob.glob("output/%s_*_%s*" % (prefix, dataset))]
161 |
162 |             if len(files) == 0:
163 |                 continue
164 |
165 |             for stat_field, label in stats:
166 |                 print dataset, prefix, stat_field
167 |
168 |                 try:
169 |                     plot_curve(groupby(files, ["estimator", "generator"], param_field, "stats__%s" % stat_field),
170 |                                x_label=prefix,
171 |                                y_label=label,
172 |                                filename="figs/generators/%s/%s_%s" % (dataset, prefix, stat_field),
173 |                                curve=curve)
174 |                 except:  # not every stat exists for every run; skip missing combinations
175 |                     print "Failed!"
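# ---------------------------------------------------------------------------
# Editor's sketch: the JSON files consumed above are assumed to look roughly
# like the record below (field names inferred from the get/groupby calls in
# this script; lists hold one value per benchmark repetition and are averaged
# downstream with np.mean):
#
#     {"estimator": "RandomForestClassifier",
#      "generator": "mnist.npz",
#      "params": {"n_estimators": 100, "bootstrap": true},
#      "stats": {"time_fit": [12.3], "time_predict": [0.8],
#                "score_make_scorer(accuracy_score)": [0.97]}}
# ---------------------------------------------------------------------------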
176 | 177 | def make_5_4_2_plots(): 178 | # Plot results on datasets 179 | datasets = ["diabetes.npz", "dig44.npz", "ionosphere.npz", "pendigits.npz", 180 | "letter.npz", "liver.npz", "musk2.npz", "ring-norm.npz", "satellite.npz", 181 | "segment.npz", "sonar.npz", "spambase.npz", "two-norm.npz", "vehicle.npz", 182 | "vowel.npz", "waveform.npz", "cifar10.npz", "mnist3vs8.npz", "mnist4vs9.npz", "mnist.npz", 183 | "isolet.npz", "arcene.npz", "breast2.npz", "madelon.npz", "marti0.npz", 184 | "reged0.npz", "secom.npz", "tis.npz", "sido0.npz"] 185 | 186 | for dataset in datasets: 187 | print dataset 188 | files = glob.glob("output/default_*_%s*" % dataset) 189 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__time_fit"), y_label="Fit time (s)", filename="figs/datasets/%s_fit" % dataset) 190 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__time_predict"), y_label="Predict time (s)", filename="figs/datasets/%s_predict" % dataset) 191 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__score_make_scorer(accuracy_score)"), y_label="Accuracy", filename="figs/datasets/%s_accuracy" % dataset) 192 | 193 | 194 | def make_5_4_2_table(): 195 | impls = ["RandomForestClassifier", 196 | "ExtraTreesClassifier", 197 | "OpenCV", 198 | "OpenCV-ETs", 199 | "OK3-RandomForest", 200 | "OK3-ExtraTrees", 201 | "Weka", 202 | "R-randomForest", 203 | "Orange"] 204 | 205 | datasets = ["diabetes.npz", "dig44.npz", "ionosphere.npz", "pendigits.npz", 206 | "letter.npz", "liver.npz", "musk2.npz", "ring-norm.npz", "satellite.npz", 207 | "segment.npz", "sonar.npz", "spambase.npz", "two-norm.npz", "vehicle.npz", 208 | "vowel.npz", "waveform.npz", "cifar10.npz", "mnist3vs8.npz", "mnist4vs9.npz", "mnist.npz", 209 | "isolet.npz", "arcene.npz", "breast2.npz", "madelon.npz", "marti0.npz", 210 | "reged0.npz", "secom.npz", "tis.npz", "sido0.npz"] 211 | 212 | all_stats = {} 213 | 214 | for dataset in datasets: 215 | all_stats[dataset] = {} 216 | files = glob.glob("output/default_*_%s*" % dataset) 217 | data = groupby(files, ["estimator", "generator"], "estimator", "stats__time_predict") 218 | 219 | for (estimator, _), s in data.items(): 220 | all_stats[dataset][estimator] = np.mean(s[0][1]) 221 | 222 | table = np.zeros((len(datasets), len(impls))) 223 | 224 | for i, (dataset, stats) in enumerate(sorted(all_stats.items())): 225 | for j, impl in enumerate(impls): 226 | if impls[j] in stats: 227 | table[i, j] = stats[impls[j]] 228 | else: 229 | table[i, j] = np.inf 230 | 231 | speedups = np.zeros(table.shape) 232 | 233 | for i, dataset in enumerate(sorted(datasets)): 234 | for j, impl in enumerate(impls): 235 | speedups[i, j] = table[i, j] / table[i, 0] 236 | 237 | speedups = np.ma.masked_array(speedups, np.isinf(speedups)) 238 | 239 | print "\\begin{tabular}{|c|", 240 | for j, impl in enumerate(impls): 241 | print "c", 242 | print "|}" 243 | print "\\hline" 244 | 245 | for j, impl in enumerate(impls): 246 | print "&", layout[impl]["name"], 247 | print "\\\\" 248 | print "\\hline" 249 | print "\\hline" 250 | 251 | for i, dataset in enumerate(sorted(datasets)): 252 | print "\\textsc{%s}" % dataset.split(".")[0], 253 | min_j = np.argmin(speedups[i]) 254 | 255 | for j, impl in enumerate(impls): 256 | if j == min_j: 257 | print "& \\textbf{%.2f}" % speedups[i, j], 258 | else: 259 | print "& %.2f" % speedups[i, j], 260 | print "\\\\" 261 | print "\\hline" 262 | print "\\hline" 263 | 264 | print "\\textit{Average}", 265 | means = speedups.mean(axis=0) 266 | min_j = 
np.argmin(means)
267 |     for j, m in enumerate(means):
268 |         if j == min_j:
269 |             print "& \\textbf{%.2f}" % m,
270 |         else:
271 |             print "& %.2f" % m,
272 |     print "\\\\"
273 |     print "\\textit{Median}",
274 |     medians = np.ma.median(speedups, axis=0)
275 |     min_j = np.argmin(medians)
276 |     for j, m in enumerate(medians):
277 |         if j == min_j:
278 |             print "& \\textbf{%.2f}" % m,
279 |         else:
280 |             print "& %.2f" % m,
281 |     print "\\\\"
282 |     print "\\hline"
283 |     print "\\end{tabular}"
284 |
285 | if __name__ == "__main__":
286 |     # make_5_4_1()
287 |     make_5_4_2_plots()
288 |     #make_5_4_2_table()
289 |
290 |
-------------------------------------------------------------------------------- /tex/chapters/chapter01.tex: --------------------------------------------------------------------------------
1 | \chapter{Introduction}\label{ch:introduction}
2 |
3 | In various fields of science, technology and the humanities, such as biology,
4 | meteorology, medicine or finance, to cite a few, experts aim at predicting a
5 | phenomenon based on past observations or measurements. For instance,
6 | meteorologists try to forecast the weather for the next days from the climatic
7 | conditions of the previous days. In medicine, practitioners collect measurements
8 | and information such as blood pressure, age or history for diagnosing the
9 | condition of incoming patients. Similarly, in chemistry, compounds are analyzed
10 | using mass spectrometry measurements in order to determine whether they contain
11 | a given type of molecule or atom. In all of these cases, the goal is the
12 | prediction of a response variable based on a set of observed predictor
13 | variables.
14 |
15 | For centuries, scientists have addressed such problems by deriving theoretical
16 | frameworks from first principles or have accumulated knowledge in order to
17 | model, analyze and understand the pheno\-menon under study. For example,
18 | practitioners know from past experience that elderly heart attack patients with
19 | low blood pressure are generally at high risk. Similarly, meteorologists know from
20 | elementary climate models that one hot, high-pollution day is likely to be
21 | followed by another. For an increasing number of problems, however, standard
22 | approaches are starting to show their limits. For example, identifying the genetic
23 | risk factors for heart disease, where knowledge is still very sparse, is nearly
24 | beyond the cognitive abilities of humans, given the high complexity and
25 | intricacy of the interactions that exist between genes. Likewise, for very
26 | fine-grained meteorological forecasts, a large number of variables need to be taken
27 | into account, which quickly exceeds the capability of experts to put them
28 | all into a system of equations. To break this cognitive barrier and further
29 | advance science, machines of increasing speed and capacity have been designed and
30 | built since the mid-twentieth century to assist humans in their
31 | calculations. Remarkably, alongside this progress in hardware,
32 | developments in theoretical computer science, artificial intelligence and
33 | statistics have turned machines into more than mere calculators. Recent advances
34 | have made them experts of their own kind, capable of learning from data and of
35 | uncovering by themselves the predictive structure of problems.
Techniques and
36 | algorithms that have stemmed from the field of {\it machine learning} have
37 | indeed become a powerful tool for the analysis of large and complex data,
38 | successfully assisting scientists in numerous breakthroughs across various fields
39 | of science and technology. Famous public examples include the use of
40 | boosted decision trees in the statistical analysis that led to the detection of
41 | the Higgs boson at CERN~\citep{chatrchyan:2012}, the use of random forests for
42 | human pose detection in the Microsoft Kinect~\citep{criminisi:2013} and the
43 | implementation of various machine learning techniques for building the IBM
44 | Watson system~\citep{ferrucci:2010}, capable of competing at the level of human
45 | champions on the American TV quiz show Jeopardy.
46 |
47 | Formally, machine learning can be defined as the study of systems that can
48 | learn from data without being explicitly programmed. According to
49 | \citet{mitchell:1997}, a computer program is said to learn from data, with
50 | respect to some class of tasks and performance measure, if its performance at
51 | those tasks improves with data. In particular, machine learning provides
52 | algorithms that are able to solve classification or regression tasks, hence
53 | now bringing automated procedures for the prediction of a phenomenon based on
54 | past observations. However, the goal of machine learning is not only to produce
55 | algorithms making accurate predictions; it is also to provide insights into the
56 | predictive structure of the data~\citep{breiman:1984}. If we are aiming at the
57 | latter, then our goal is to understand what variables or interactions of
58 | variables drive the phenomenon. For practitioners who are not experts in
59 | machine learning, interpretability is indeed often as important as prediction
60 | accuracy. It allows for a better understanding of the phenomenon under study, a
61 | finer exploration of the data and an easier appropriation of the results.
62 | By contrast, when an algorithm is used as a black box, yielding results
63 | seemingly out of nowhere, its results may be difficult to trust or accept if it
64 | cannot be understood how and why the procedure came to them. Unfortunately, the
65 | current state of the art in machine learning often makes it difficult for
66 | non-experts to understand and interpret the results of an algorithm. While
67 | considerable efforts have been put into improving their prediction accuracy, it is
68 | still not clearly understood what makes machine learning algorithms truly work,
69 | and under what assumptions. Likewise, few of them actually provide clear and
70 | insightful explanations about the results they generate.
71 |
72 | In this context, the goal of this thesis is to provide a comprehensive and
73 | self-contained analysis of a class of algorithms known as decision
74 | trees~\citep{breiman:1984} and random forests~\citep{breiman:2001}. While these
75 | methods have proven to be a robust, accurate and successful tool for solving
76 | countless machine learning tasks, including classification, regression,
77 | density estimation, manifold learning or semi-supervised
78 | learning~\citep{criminisi:2013}, there remain many gray areas in their
79 | understanding:
80 | \begin{enumerate}
81 | \item First, the theoretical properties and statistical mechanisms that drive
82 | the algorithm are still not clearly and entirely understood.
Random forests
83 | have indeed evolved from empirical successes rather than from a sound
84 | theory. As such, various parts of the algorithm remain heuristic rather than
85 | theoretically motivated. For example, preliminary
86 | results have proven the consistency of variants ranging from strongly simplified
87 | to very close to the original random forests, but the consistency of the original
88 | algorithm itself remains unproven in a general setting.
89 | \item Second, while the construction process of a single decision tree can
90 | easily be described within half a page, implementing this algorithm properly
91 | and efficiently remains a challenging task involving issues that are easily
92 | overlooked. Unfortunately, implementation details are often omitted in the
93 | scientific literature and can often only be found by diving into
94 | (unevenly documented) existing software implementations. As far as we know,
95 | there is indeed no comprehensive survey covering the implementation details of
96 | random forests, nor of their respective effects in terms of runtime and space
97 | complexity or learning ability.
98 | \item Third, interpreting the resulting model remains a difficult task,
99 | at which even machine learning experts still fail when finely analyzing and
100 | uncovering the precise predictive structure learned by the procedure.
101 | In particular, despite their extensive use in a wide range of applications, little
102 | is still known regarding the variable importance measures computed by random forests.
103 | Empirical evidence suggests that they are appropriate for identifying
104 | relevant variables, but their statistical mechanisms and properties are
105 | still far from being understood.
106 | \end{enumerate}
107 | Throughout this dissertation, our objective is therefore to call into
108 | question each and every part of the random forests methodology, both from a
109 | theoretical and practical point of view. Accordingly, this work aims at
110 | revisiting decision trees and random forests in the hope of shedding new light on
111 | their learning capabilities, inner workings and interpretability.
112 |
113 | \section{Outline and contributions}
114 |
115 | Part~\textsc{\ref{part:1}} of this manuscript is first dedicated to a thorough
116 | treatment of decision trees and forests of randomized trees. We begin in
117 | Chapter~\ref{ch:background} by outlining fundamental concepts of machine
118 | learning, and then proceed in Chapters~\ref{ch:cart} and \ref{ch:forest} with a
119 | comprehensive review of the algorithms at the core of decision trees and random
120 | forests. We discuss the learning capabilities of these models and carefully
121 | study all parts of the algorithm and their complementary effects. In particular,
122 | Chapter~\ref{ch:forest} includes original contributions on the bias-variance
123 | analysis of ensemble methods, highlighting how randomization can help improve
124 | performance. Chapter~\ref{ch:complexity} concludes this first part with an
125 | original space and time complexity analysis of random forests (and their
126 | variants), along with an in-depth discussion of implementation details,
127 | as contributed within the open source Scikit-Learn library.
128 | Overall, Part~\textsc{\ref{part:1}} therefore presents a comprehensive review
129 | of previous work on random forests, including some original contributions
130 | both from a theoretical and practical point of view.
131 |
132 | Part~\textsc{\ref{part:2}} analyzes and discusses the interpretability of
133 | random forests.
In Chapter~\ref{ch:importances}, we study variable importances
134 | as computed with a forest of randomized trees and examine how these scores can be
135 | interpreted in order to reveal the underlying predictive structure learned from
136 | the data. In particular, we derive a theoretical framework within which we prove
137 | theoretical and practical properties of variable importances. In
138 | Chapter~\ref{ch:applications}, we then exploit this framework to further study
139 | variable importances as derived from actual random forests and present
140 | successful applications of variable importance measures.
141 | Part~\textsc{\ref{part:2}} constitutes the main contributions of this
142 | dissertation.
143 |
144 | Finally, Part~\textsc{\ref{part:3}} addresses limitations of random forests in
145 | the context of large datasets. Through extensive experiments, we show in
146 | Chapter~\ref{ch:random-patches} that subsampling strategies provide on-par
147 | performance while simultaneously lowering the memory requirements. This
148 | chapter presents original work.
149 |
150 | \section{Publications}
151 |
152 | This dissertation summarizes several contributions to random forests
153 | algorithms. Publications that have directly stemmed from this work include:
154 |
155 | \begin{itemize}
156 | \item \citep{geurts:2011} \textit{Learning to rank with extremely randomized trees},
157 | Geurts Pierre and Louppe Gilles.
158 | In JMLR: Workshop and Conference Proceedings, volume 14, 2011.
159 |
160 | \item \citep{louppe:2012} \textit{Ensembles on random patches},
161 | Louppe Gilles and Geurts Pierre.
162 | In Machine Learning and Knowledge Discovery in Databases, pages 346--361. Springer, 2012.
163 |
164 | \item \citep{louppe:2013} \textit{Understanding variable importances in forests of randomized trees},
165 | Louppe Gilles, Wehenkel Louis, Sutera Antonio and Geurts Pierre.
166 | In Advances in Neural Information Processing Systems, pages 431--439, 2013.
167 |
168 | \item \citep{buitinck:2013} \textit{API design for machine learning software: experiences from the scikit-learn project},
169 | Buitinck Lars, Louppe Gilles, Blondel Mathieu et al.
170 | In ECML-PKDD 2013 Workshop: Languages for Data Mining and Machine Learning, 2013.
171 |
172 | \item \citep{botta:2014} \textit{Exploiting SNP Correlations within Random Forest for Genome-Wide Association Studies},
173 | Botta Vincent, Louppe Gilles, Geurts Pierre and Wehenkel Louis.
174 | PloS ONE, 9(4):e93379, 2014.
175 |
176 | \end{itemize}
177 |
178 | During the course of this thesis, several fruitful collaborations have also
179 | led to the following publications. These are not discussed within
180 | this dissertation.
181 |
182 | \begin{itemize}
183 |
184 | \item \citep{louppe:2010} \textit{A zealous parallel gradient descent algorithm},
185 | Louppe Gilles and Geurts Pierre.
186 | In Learning on Cores, Clusters and Clouds workshop, NIPS, 2010.
187 |
188 | \item \citep{maree:2014} \textit{A hybrid human-computer approach for large-scale image-based measurements using web services and machine learning},
189 | Mar{\'e}e Rapha{\"e}l, Rollus Loic, Stevens Benjamin et al.
190 | In Proceedings of the IEEE International Symposium on Biomedical Imaging, 2014.
191 |
192 | \item \citep{amy:2014} \textit{Solar Energy Prediction: An International Contest to Initiate Interdisciplinary Research on Compelling Meteorological Problems},
193 | Amy McGovern, David John Gagne II, Lucas Eustaquio et al., 2014.
\textit{Submitted.} 194 | 195 | \item \citep{sutera:2014} \textit{Simple connectome inference from partial correlation statistics in calcium imaging}, 196 | Antonio Sutera, Arnaud Joly, Vincent Francois-Lavet et al., 2014. \textit{Submitted.} 197 | 198 | \end{itemize} 199 | -------------------------------------------------------------------------------- /tex/classicthesis-config.tex: -------------------------------------------------------------------------------- 1 | % **************************************************************************************************** 2 | % classicthesis-config.tex 3 | % formerly known as loadpackages.sty, classicthesis-ldpkg.sty, and classicthesis-preamble.sty 4 | % Use it at the beginning of your ClassicThesis.tex, or as a LaTeX Preamble 5 | % in your ClassicThesis.{tex,lyx} with \input{classicthesis-config} 6 | % **************************************************************************************************** 7 | % If you like the classicthesis, then I would appreciate a postcard. 8 | % My address can be found in the file ClassicThesis.pdf. A collection 9 | % of the postcards I received so far is available online at 10 | % http://postcards.miede.de 11 | % **************************************************************************************************** 12 | 13 | % **************************************************************************************************** 14 | % 1. Configure classicthesis for your needs here, e.g., remove "drafting" below 15 | % in order to deactivate the time-stamp on the pages 16 | % **************************************************************************************************** 17 | \PassOptionsToPackage{eulerchapternumbers,listings,%drafting,% 18 | pdfspacing,eulermath,%floatperchapter,%linedheaders,% 19 | subfig,parts,dottedtoc}{classicthesis} 20 | % ******************************************************************** 21 | % Available options for classicthesis.sty 22 | % (see ClassicThesis.pdf for more information): 23 | % drafting 24 | % parts nochapters linedheaders 25 | % eulerchapternumbers beramono eulermath pdfspacing minionprospacing 26 | % tocaligned dottedtoc manychapters 27 | % listings floatperchapter subfig 28 | % ******************************************************************** 29 | 30 | % ******************************************************************** 31 | % Triggers for this config 32 | % ******************************************************************** 33 | \usepackage{ifthen} 34 | \newboolean{enable-backrefs} % enable backrefs in the bibliography 35 | \setboolean{enable-backrefs}{false} % true false 36 | % **************************************************************************************************** 37 | 38 | 39 | % **************************************************************************************************** 40 | % 2. 
Personal data and user ad-hoc commands 41 | % **************************************************************************************************** 42 | \newcommand{\myTitle}{Understanding Random Forests\xspace} 43 | \newcommand{\mySubtitle}{From Theory to Practice\xspace} 44 | \newcommand{\myDegree}{Doktor-Ingenieur (Dr.-Ing.)\xspace} 45 | \newcommand{\myName}{Gilles Louppe\xspace} 46 | \newcommand{\myProf}{Put name here\xspace} 47 | \newcommand{\myOtherProf}{Put name here\xspace} 48 | \newcommand{\mySupervisor}{Pierre Geurts\xspace} 49 | \newcommand{\myFaculty}{Faculty of Applied Sciences\xspace} 50 | \newcommand{\myDepartment}{Department of EE and CS\xspace} 51 | \newcommand{\myUni}{University of Liege\xspace} 52 | \newcommand{\myLocation}{Liege, Belgium\xspace} 53 | \newcommand{\myTime}{June 2014\xspace} 54 | \newcommand{\myVersion}{version 1.0\xspace} 55 | 56 | % ******************************************************************** 57 | % Setup, finetuning, and useful commands 58 | % ******************************************************************** 59 | \newcounter{dummy} % necessary for correct hyperlinks (to index, bib, etc.) 60 | \newlength{\abcd} % for ab..z string length calculation 61 | \providecommand{\mLyX}{L\kern-.1667em\lower.25em\hbox{Y}\kern-.125emX\@} 62 | \newcommand{\ie}{i.\,e.} 63 | \newcommand{\Ie}{I.\,e.} 64 | \newcommand{\eg}{e.\,g.} 65 | \newcommand{\Eg}{E.\,g.} 66 | % **************************************************************************************************** 67 | 68 | 69 | % **************************************************************************************************** 70 | % 3. Loading some handy packages 71 | % **************************************************************************************************** 72 | % ******************************************************************** 73 | % Packages with options that might require adjustments 74 | % ******************************************************************** 75 | \PassOptionsToPackage{latin9}{inputenc} % latin9 (ISO-8859-9) = latin1+"Euro sign" 76 | \usepackage{inputenc} 77 | 78 | %\PassOptionsToPackage{ngerman,american}{babel} % change this to your language(s) 79 | % Spanish languages need extra options in order to work with this template 80 | %\PassOptionsToPackage{spanish,es-lcroman}{babel} 81 | \usepackage{babel} 82 | 83 | \PassOptionsToPackage{square,authoryear}{natbib} 84 | \usepackage{natbib} 85 | 86 | \PassOptionsToPackage{fleqn}{amsmath} % math environments and more by the AMS 87 | \usepackage{amsmath} 88 | 89 | % ******************************************************************** 90 | % General useful packages 91 | % ******************************************************************** 92 | \PassOptionsToPackage{T1}{fontenc} % T2A for cyrillics 93 | \usepackage{fontenc} 94 | \usepackage{lipsum} 95 | \usepackage{textcomp} % fix warning with missing font shapes 96 | %\usepackage{scrhack} % fix warnings when using KOMA with listings package 97 | \usepackage{xspace} % to get the spacing after macros right 98 | \usepackage{mparhack} % get marginpar right 99 | \usepackage{fixltx2e} % fixes some LaTeX stuff 100 | \PassOptionsToPackage{printonlyused,smaller}{acronym} 101 | \usepackage{acronym} % nice macros for handling all acronyms in the thesis 102 | %\renewcommand*{\acsfont}[1]{\textssc{#1}} % for MinionPro 103 | \renewcommand{\bflabel}[1]{{#1}\hfill} % fix the list of acronyms 104 | % 
**************************************************************************************************** 105 | 106 | 107 | % **************************************************************************************************** 108 | % 4. Setup floats: tables, (sub)figures, and captions 109 | % **************************************************************************************************** 110 | \usepackage{tabularx} % better tables 111 | \setlength{\extrarowheight}{3pt} % increase table row height 112 | \newcommand{\tableheadline}[1]{\multicolumn{1}{c}{\spacedlowsmallcaps{#1}}} 113 | \newcommand{\myfloatalign}{\centering} % to be used with each float for alignment 114 | \usepackage{caption} 115 | \captionsetup{format=hang,font=small} 116 | \usepackage{subfig} 117 | % **************************************************************************************************** 118 | 119 | 120 | % **************************************************************************************************** 121 | % 5. Setup code listings 122 | % **************************************************************************************************** 123 | \usepackage{listings} 124 | %\lstset{emph={trueIndex,root},emphstyle=\color{BlueViolet}}%\underbar} % for special keywords 125 | \lstset{language=[LaTeX]Tex,%C++, 126 | keywordstyle=\color{RoyalBlue},%\bfseries, 127 | basicstyle=\small\ttfamily, 128 | %identifierstyle=\color{NavyBlue}, 129 | commentstyle=\color{Green}\ttfamily, 130 | stringstyle=\rmfamily, 131 | numbers=none,%left,% 132 | numberstyle=\scriptsize,%\tiny 133 | stepnumber=5, 134 | numbersep=8pt, 135 | showstringspaces=false, 136 | breaklines=true, 137 | frameround=ftff, 138 | frame=single, 139 | belowcaptionskip=.75\baselineskip 140 | %frame=L 141 | } 142 | % **************************************************************************************************** 143 | 144 | 145 | % **************************************************************************************************** 146 | % 6. PDFLaTeX, hyperreferences and citation backreferences 147 | % **************************************************************************************************** 148 | % ******************************************************************** 149 | % Using PDFLaTeX 150 | % ******************************************************************** 151 | \PassOptionsToPackage{pdftex,hyperfootnotes=true,pdfpagelabels}{hyperref} 152 | \usepackage{hyperref} % backref linktocpage pagebackref 153 | \pdfcompresslevel=9 154 | \pdfadjustspacing=1 155 | \PassOptionsToPackage{pdftex}{graphicx} 156 | \usepackage{graphicx} 157 | 158 | % ******************************************************************** 159 | % Setup the style of the backrefs from the bibliography 160 | % (translate the options to any language you use) 161 | % ******************************************************************** 162 | \newcommand{\backrefnotcitedstring}{\relax}%(Not cited.) 
163 | \newcommand{\backrefcitedsinglestring}[1]{(Cited on page~#1.)} 164 | \newcommand{\backrefcitedmultistring}[1]{(Cited on pages~#1.)} 165 | \ifthenelse{\boolean{enable-backrefs}}% 166 | {% 167 | \PassOptionsToPackage{hyperpageref}{backref} 168 | \usepackage{backref} % to be loaded after hyperref package 169 | \renewcommand{\backreftwosep}{ and~} % separate 2 pages 170 | \renewcommand{\backreflastsep}{, and~} % separate last of longer list 171 | \renewcommand*{\backref}[1]{} % disable standard 172 | \renewcommand*{\backrefalt}[4]{% detailed backref 173 | \ifcase #1 % 174 | \backrefnotcitedstring% 175 | \or% 176 | \backrefcitedsinglestring{#2}% 177 | \else% 178 | \backrefcitedmultistring{#2}% 179 | \fi}% 180 | }{\relax} 181 | 182 | % ******************************************************************** 183 | % Hyperreferences 184 | % ******************************************************************** 185 | \hypersetup{% 186 | %draft, % = no hyperlinking at all (useful in b/w printouts) 187 | colorlinks=true, linktocpage=true, pdfstartpage=3, pdfstartview=FitV,% 188 | % uncomment the following line if you want to have black links (e.g., for printing) 189 | %colorlinks=false, linktocpage=false, pdfborder={0 0 0}, pdfstartpage=3, pdfstartview=FitV,% 190 | breaklinks=true, pdfpagemode=UseNone, pageanchor=true, pdfpagemode=UseOutlines,% 191 | plainpages=false, bookmarksnumbered, bookmarksopen=true, bookmarksopenlevel=1,% 192 | hypertexnames=true, pdfhighlight=/O,%nesting=true,%frenchlinks,% 193 | urlcolor=webbrown, linkcolor=RoyalBlue, citecolor=webgreen, %pagecolor=RoyalBlue,% 194 | %urlcolor=Black, linkcolor=Black, citecolor=Black, %pagecolor=Black,% 195 | pdftitle={\myTitle},% 196 | pdfauthor={\textcopyright\ \myName, \myUni, \myFaculty},% 197 | pdfsubject={},% 198 | pdfkeywords={},% 199 | pdfcreator={pdfLaTeX},% 200 | pdfproducer={LaTeX with hyperref and classicthesis}% 201 | } 202 | 203 | % ******************************************************************** 204 | % Setup autoreferences 205 | % ******************************************************************** 206 | % There are some issues regarding autorefnames 207 | % http://www.ureader.de/msg/136221647.aspx 208 | % http://www.tex.ac.uk/cgi-bin/texfaq2html?label=latexwords 209 | % you have to redefine the makros for the 210 | % language you use, e.g., american, ngerman 211 | % (as chosen when loading babel/AtBeginDocument) 212 | % ******************************************************************** 213 | \makeatletter 214 | \@ifpackageloaded{babel}% 215 | {% 216 | \addto\extrasamerican{% 217 | \renewcommand*{\figureautorefname}{Figure}% 218 | \renewcommand*{\tableautorefname}{Table}% 219 | \renewcommand*{\partautorefname}{Part}% 220 | \renewcommand*{\chapterautorefname}{Chapter}% 221 | \renewcommand*{\sectionautorefname}{Section}% 222 | \renewcommand*{\subsectionautorefname}{Section}% 223 | \renewcommand*{\subsubsectionautorefname}{Section}% 224 | }% 225 | \addto\extrasngerman{% 226 | \renewcommand*{\paragraphautorefname}{Absatz}% 227 | \renewcommand*{\subparagraphautorefname}{Unterabsatz}% 228 | \renewcommand*{\footnoteautorefname}{Fu\"snote}% 229 | \renewcommand*{\FancyVerbLineautorefname}{Zeile}% 230 | \renewcommand*{\theoremautorefname}{Theorem}% 231 | \renewcommand*{\appendixautorefname}{Anhang}% 232 | \renewcommand*{\equationautorefname}{Gleichung}% 233 | \renewcommand*{\itemautorefname}{Punkt}% 234 | }% 235 | % Fix to getting autorefs for subfigures right (thanks to Belinda Vogt for changing the definition) 236 | 
\providecommand{\subfigureautorefname}{\figureautorefname}% 237 | }{\relax} 238 | \makeatother 239 | 240 | 241 | % **************************************************************************************************** 242 | % 7. Last calls before the bar closes 243 | % **************************************************************************************************** 244 | % ******************************************************************** 245 | % Development Stuff 246 | % ******************************************************************** 247 | \listfiles 248 | %\PassOptionsToPackage{l2tabu,orthodox,abort}{nag} 249 | % \usepackage{nag} 250 | %\PassOptionsToPackage{warning, all}{onlyamsmath} 251 | % \usepackage{onlyamsmath} 252 | 253 | % ******************************************************************** 254 | % Last, but not least... 255 | % ******************************************************************** 256 | \usepackage{classicthesis} 257 | % **************************************************************************************************** 258 | 259 | 260 | % **************************************************************************************************** 261 | % 8. Further adjustments (experimental) 262 | % **************************************************************************************************** 263 | % ******************************************************************** 264 | % Changing the text area 265 | % ******************************************************************** 266 | %\linespread{1.05} % a bit more for Palatino 267 | %\areaset[current]{312pt}{761pt} % 686 (factor 2.2) + 33 head + 42 head \the\footskip 268 | %\setlength{\marginparwidth}{7em}% 269 | %\setlength{\marginparsep}{2em}% 270 | 271 | % ******************************************************************** 272 | % Using different fonts 273 | % ******************************************************************** 274 | %\usepackage[oldstylenums]{kpfonts} % oldstyle notextcomp 275 | %\usepackage[osf]{libertine} 276 | %\usepackage{hfoldsty} % Computer Modern with osf 277 | %\usepackage[light,condensed,math]{iwona} 278 | %\renewcommand{\sfdefault}{iwona} 279 | %\usepackage{lmodern} % <-- no osf support :-( 280 | % \usepackage[T1]{fontenc} 281 | % \usepackage{textcomp} 282 | %\usepackage[urw-garamond]{mathdesign} <-- no osf support :-( 283 | % **************************************************************************************************** 284 | --------------------------------------------------------------------------------