├── LICENSE
├── thesis.pdf
├── slides
│   ├── slides.pdf
│   ├── figures
│   │   ├── mdi.pdf
│   │   ├── bench.pdf
│   │   ├── rp-1.pdf
│   │   ├── tree.pdf
│   │   ├── wine.jpg
│   │   ├── blackbox.jpg
│   │   ├── fit-time.pdf
│   │   ├── forest.pdf
│   │   ├── imp-wine.pdf
│   │   ├── led-fig.pdf
│   │   ├── led-fig.png
│   │   ├── led-imp.pdf
│   │   ├── condorcet.png
│   │   ├── imp-wine2.pdf
│   │   ├── motivation.png
│   │   ├── rp-memory.pdf
│   │   ├── tree-wine.pdf
│   │   ├── avatars
│   │   │   ├── bholt.jpg
│   │   │   ├── joel.jpg
│   │   │   ├── lars.png
│   │   │   ├── ndawe.jpg
│   │   │   ├── satra.jpg
│   │   │   ├── arjoly.jpg
│   │   │   ├── glouppe.jpg
│   │   │   ├── ogrisel.jpg
│   │   │   ├── pprett.jpg
│   │   │   └── amueller.jpg
│   │   ├── bias-variance.pdf
│   │   ├── blackbox-open.jpg
│   │   ├── tree-simple.pdf
│   │   ├── scikit-learn-logo.pdf
│   │   ├── tree-partition-a.pdf
│   │   ├── tree-partition-b.pdf
│   │   ├── tree-partition-c.pdf
│   │   ├── tree-partition-d.pdf
│   │   └── bias-variance-darts.jpg
│   └── minted.sty
├── tex
│   ├── figures
│   │   ├── blason.pdf
│   │   ├── ch2_mlp.pdf
│   │   ├── ch3_tree.pdf
│   │   ├── ch5_sort.pdf
│   │   ├── ch5_tree.pdf
│   │   ├── ch6_led.pdf
│   │   ├── ch3_splits.pdf
│   │   ├── ch6_imp_led.pdf
│   │   ├── ch6_order.pdf
│   │   ├── ch7_network.png
│   │   ├── ch7_red_led.pdf
│   │   ├── ch7_red_xor.pdf
│   │   ├── ch7_splits.pdf
│   │   ├── ch3_goodness.pdf
│   │   ├── ch3_partition.pdf
│   │   ├── ch4_variance.pdf
│   │   ├── ch5_mnist_fit.pdf
│   │   ├── ch7_bias_null.pdf
│   │   ├── ch7_trees_ets.pdf
│   │   ├── ch7_trees_id3.pdf
│   │   ├── ch2_hyperplane.pdf
│   │   ├── ch3_toy_x1_error.pdf
│   │   ├── ch3_toy_x1_gini.pdf
│   │   ├── ch3_toy_x2_gini.pdf
│   │   ├── ch3_toy_x3_gini.pdf
│   │   ├── ch4_correlation.pdf
│   │   ├── ch4_overfitting.pdf
│   │   ├── ch5_learningset.pdf
│   │   ├── ch7_bias_depth.pdf
│   │   ├── ch7_bias_trees.pdf
│   │   ├── ch7_trees_ets2.pdf
│   │   ├── ch8
│   │   │   ├── figure4-mem.pdf
│   │   │   ├── figure4-none.pdf
│   │   │   ├── figure5-c-tis.pdf
│   │   │   ├── figure5-a-arcene.pdf
│   │   │   ├── figure5-b-cifar10.pdf
│   │   │   ├── figure5-d-madelon.pdf
│   │   │   ├── figure5-e-isolet.pdf
│   │   │   ├── figure5-f-mnist.pdf
│   │   │   ├── figure3-c-tis-rp-dt.pdf
│   │   │   ├── figure3-c-tis-rp-et.pdf
│   │   │   ├── figure3-a-arcene-rp-dt.pdf
│   │   │   ├── figure3-a-arcene-rp-et.pdf
│   │   │   ├── figure3-e-isolet-rp-dt.pdf
│   │   │   ├── figure3-e-isolet-rp-et.pdf
│   │   │   ├── figure3-f-mnist-rp-dt.pdf
│   │   │   ├── figure3-f-mnist-rp-et.pdf
│   │   │   ├── figure3-b-cifar10-rp-dt.pdf
│   │   │   ├── figure3-b-cifar10-rp-et.pdf
│   │   │   ├── figure3-d-madelon-rp-dt.pdf
│   │   │   └── figure3-d-madelon-rp-et.pdf
│   │   ├── ch8_rank_large.pdf
│   │   ├── ch8_rank_small.pdf
│   │   ├── ch3_split_ordered.pdf
│   │   ├── ch4_bias_variance.pdf
│   │   ├── ch4_proximity_plot.pdf
│   │   ├── ch5_mnist_predict.pdf
│   │   ├── ch7_bias_depth_rf.pdf
│   │   ├── ch2_train_test_error.pdf
│   │   ├── ch3_impurity_comparison.pdf
│   │   ├── ch4_estimate_distribution.pdf
│   │   ├── make_friedman1
│   │   │   ├── n_train_mse.pdf
│   │   │   ├── bootstrap_mse.pdf
│   │   │   ├── max_features_mse.pdf
│   │   │   ├── n_estimators_mse.pdf
│   │   │   ├── n_features_mse.pdf
│   │   │   ├── n_train_time_fit.pdf
│   │   │   ├── bootstrap_time_fit.pdf
│   │   │   ├── max_features_time_fit.pdf
│   │   │   ├── n_estimators_time_fit.pdf
│   │   │   ├── n_features_time_fit.pdf
│   │   │   ├── n_train_average_depth.pdf
│   │   │   ├── bootstrap_average_depth.pdf
│   │   │   ├── max_features_average_depth.pdf
│   │   │   ├── n_estimators_average_depth.pdf
│   │   │   └── n_features_average_depth.pdf
│   │   ├── ch3_split_ordered_invariant.pdf
│   │   ├── generate.sh
│   │   ├── ch7_trees_ets2.tex
│   │   ├── ch5_sort.tex
│   │   ├── ch3_tree.tex
│   │   ├── ch7_trees_id3.tex
│   │   ├── ch3_splits.tex
│   │   ├── ch7_splits.tex
│   │   ├── ch7_trees_ets.tex
│   │   ├── ch5_tree.tex
│   │   ├── ch2_mlp.tex
│   │   ├── ch3_goodness.tex
│   │   ├── ch8_rank_large.tex
│   │   └── ch8_rank_small.tex
│   ├── frontback
│   │   ├── bibliography.tex
│   │   ├── disclaimer.tex
│   │   ├── toc.tex
│   │   ├── jury.tex
│   │   ├── titlepage.tex
│   │   ├── acknowledgments.tex
│   │   ├── abstract.tex
│   │   └── notations.tex
│   ├── Makefile
│   ├── summary.tex
│   ├── thesis.tex
│   ├── chapters
│   │   ├── chapter09.tex
│   │   └── chapter01.tex
│   ├── minted.sty
│   └── classicthesis-config.tex
├── .gitignore
├── scripts
│   ├── ch6_order.py
│   ├── ch5_tree.py
│   ├── ch2_hyperplane.py
│   ├── ch6_decomposition1.py
│   ├── ch4_proximity.py
│   ├── ch4_estimate_distribution.py
│   ├── ch3_partition.py
│   ├── ch6_decomposition2.py
│   ├── ch7_redundant.py
│   ├── ch4_bias_variance.py
│   ├── ch3_split_ordered.py
│   ├── ch7_bias_tree.py
│   ├── ch2_train_test_error.py
│   ├── ch7_bias_depth.py
│   ├── ch3_split_ordered_invariant.py
│   ├── ch7_bias_null.py
│   ├── ch4_correlation_plot.py
│   ├── ch3_impurity.py
│   ├── ch4_correlation.py
│   ├── ch4_overfitting.py
│   ├── ch4_correlation_plot2.py
│   ├── ID3.py
│   └── demo.py
├── benchmarks
│   ├── data.py
│   ├── resources
│   │   └── bench_randomforest.py
│   └── visualize.py
└── README.md

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
[Note: binary assets (*.pdf, *.png, *.jpg) are not inlined in this dump. Each
one can be fetched from
https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/<path>, where <path>
is the file's location in the tree above (for example,
https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/thesis.pdf). Only
text files are reproduced below.]
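[Note: a minimal sketch, not a file from this repository, showing how one of
the binary assets above can be retrieved with the URL scheme just described;
the target path is only an example taken from the tree.]

from urllib.request import urlretrieve

BASE = "https://raw.githubusercontent.com/glouppe/phd-thesis/HEAD/"
urlretrieve(BASE + "tex/figures/ch3_tree.pdf", "ch3_tree.pdf")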
--------------------------------------------------------------------------------
/tex/figures/generate.sh:
--------------------------------------------------------------------------------
latex $1.tex
dvips $1.dvi
ps2pdf $1.ps
pdfcrop $1.pdf
rm $1.aux $1.dvi $1.log $1.ps $1.pdf
mv $1-crop.pdf $1.pdf
--------------------------------------------------------------------------------
/tex/frontback/bibliography.tex:
--------------------------------------------------------------------------------
% Bibliography ================================================================

\chapter{References}

\begingroup
\def\chapter*#1{}
\bibliographystyle{abbrvnat}
\renewcommand{\bibname}{}
\label{app:bibliography}
\bibliography{bibliography}
\endgroup
--------------------------------------------------------------------------------
/tex/frontback/disclaimer.tex:
--------------------------------------------------------------------------------
% Disclaimer ==================================================================

\vspace*{\fill}
\begin{center}
{\it This dissertation has been submitted in partial fulfillment of
the requirements for the Degree of Doctor of Philosophy in
Computer Science.

\vskip1cm

This version of the manuscript is pending the approval
of the jury.}
\end{center}
\vspace*{\fill}
\vspace*{\fill}
\vspace*{\fill}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.brf
*.acn
*.pyc
*.acr
*.alg
*.aux
*.bbl
*.blg
*.dvi
*.fdb_latexmk
*.glg
*.glo
*.gls
*.idx
*.ilg
*.ind
*.ist
*.lof
*.lol
*.log
*.lot
*.maf
*.mtc
*.mtc0
*.nav
*.nlo
*.out
*.pdfsync
*.ps
*.snm
*.synctex.gz
*.toc
*.vrb
*.xdy
Thumbs.db
*.tdo
thesis.pdf
classicthesis/*
benchmarks/output/*
benchmarks/ok3/*
benchmarks/figs/*
benchmarks/resources/*
scripts/.ipynb_checkpoints/*
tex/arxiv/*
--------------------------------------------------------------------------------
/tex/Makefile:
--------------------------------------------------------------------------------
summary.pdf: summary.tex classicthesis-config.tex summary/*.tex frontback/*.tex
	pdflatex -shell-escape summary
	bibtex summary
	pdflatex -shell-escape summary
	pdflatex -shell-escape summary

thesis.pdf: bibliography.bib thesis.tex classicthesis-config.tex chapters/*.tex frontback/*.tex
	pdflatex -shell-escape thesis
	bibtex thesis
	pdflatex -shell-escape thesis
	pdflatex -shell-escape thesis

partial:
	bibtex thesis
	pdflatex -shell-escape thesis

clean:
	rm -f *.lot *.lof *.lol *.toc *.log *.out *.aux *.blg *.bbl thesis.pdf chapters/*.aux frontback/*.aux

rebuild: clean thesis.pdf
--------------------------------------------------------------------------------
/tex/frontback/toc.tex:
--------------------------------------------------------------------------------
% Table of contents ===========================================================

\refstepcounter{dummy}
\pdfbookmark[1]{\contentsname}{tableofcontents}
\setcounter{tocdepth}{3} % <-- 3 includes up to subsubsections in the ToC
\setcounter{secnumdepth}{3} % <-- 3 numbers up to subsubsections
\manualmark
\markboth{\spacedlowsmallcaps{\contentsname}}{\spacedlowsmallcaps{\contentsname}}
\tableofcontents
\automark[section]{chapter}
\renewcommand{\chaptermark}[1]{\markboth{\spacedlowsmallcaps{#1}}{\spacedlowsmallcaps{#1}}}
\renewcommand{\sectionmark}[1]{\markright{\thesection\enspace\spacedlowsmallcaps{#1}}}

\cleardoublepage
--------------------------------------------------------------------------------
/tex/frontback/jury.tex:
--------------------------------------------------------------------------------
% Jury ====================================================================

\pdfbookmark[1]{Jury members}{Jury members}
\chapter*{Jury members}

\noindent \textsc{Louis Wehenkel}, Professor at the Universit{\'e} de Li{\`e}ge (President); \\

\noindent \textsc{Pierre Geurts}, Professor at the Universit{\'e} de Li{\`e}ge (Advisor); \\

\noindent \textsc{Bernard Boigelot}, Professor at the Universit{\'e} de Li{\`e}ge; \\

\noindent \textsc{Renaud Detry}, Postdoctoral Researcher at the Universit{\'e} de Li{\`e}ge; \\

\noindent \textsc{Gianluca Bontempi}, Professor at the Universit{\'e} Libre de Bruxelles; \\

\noindent \textsc{G{\'e}rard Biau}, Professor at the Universit{\'e} Pierre et Marie Curie (France).
--------------------------------------------------------------------------------
/scripts/ch6_order.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import brewer2mpl
cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 7).mpl_colors
#cmap = [(0, 0, 1.0), (1.0, 0, 0)]

# Hard-coded importance values of features X1..X7, one row per feature.
all_importances = np.array(
    [[0.414, 0.362, 0.327, 0.309, 0.304, 0.305, 0.306],
     [0.583, 0.663, 0.715, 0.757, 0.787, 0.801, 0.799],
     [0.532, 0.512, 0.496, 0.489, 0.483, 0.475, 0.475],
     [0.543, 0.525, 0.484, 0.445, 0.414, 0.409, 0.412],
     [0.658, 0.731, 0.778, 0.810, 0.827, 0.831, 0.835],
     [0.221, 0.140, 0.126, 0.122, 0.122, 0.121, 0.120],
     [0.368, 0.385, 0.392, 0.387, 0.382, 0.375, 0.372]])

n_features = all_importances.shape[0]
for m in range(n_features):
    plt.plot(range(1, n_features + 1), all_importances[m, :], "o-",
             label="X%d" % (m + 1), color=cmap[m])

plt.legend(loc="best")
plt.show()
--------------------------------------------------------------------------------
/scripts/ch5_tree.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
X = np.random.rand(300, 2)
y = (X[:, 0] > 0.3) & (X[:, 0] < 0.7) & (X[:, 1] > 0.3) & (X[:, 1] < 0.7)

# randomly flip some labels
mask = np.random.permutation(len(X))[:5]
y[mask] = ~y[mask]

X_c1 = X[y == 0]
plt.scatter(X_c1[:, 0], X_c1[:, 1], color=(1.0, 0, 0))

X_c2 = X[y == 1]
plt.scatter(X_c2[:, 0], X_c2[:, 1], color=(0, 0, 1.0))

# decision tree
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_leaf_nodes=5).fit(X, y)
print("children_left =", clf.tree_.children_left)
print("children_right =", clf.tree_.children_right)
print("feature =", clf.tree_.feature)
print("threshold =", clf.tree_.threshold)
print("impurity =", clf.tree_.impurity)
print("n_samples =", clf.tree_.n_node_samples)
print("value =", clf.tree_.value)

plt.show()
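[Note: a minimal sketch, not a file from this repository. The arrays printed
by ch5_tree.py fully specify the fitted tree; walking them recovers the
decision rules. `clf` is the classifier fitted above; in scikit-learn's
array-based representation, leaves are marked by children_left[node] == -1.]

def print_rules(tree, node=0, depth=0):
    indent = "    " * depth
    if tree.children_left[node] == -1:
        # leaf: value[node] holds the per-class sample counts
        print(indent + "return", tree.value[node])
    else:
        print(indent + "if X[%d] <= %.3f:"
              % (tree.feature[node], tree.threshold[node]))
        print_rules(tree, tree.children_left[node], depth + 1)
        print(indent + "else:")
        print_rules(tree, tree.children_right[node], depth + 1)

print_rules(clf.tree_)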
--------------------------------------------------------------------------------
/tex/frontback/titlepage.tex:
--------------------------------------------------------------------------------
% Front page ==================================================================

\begin{titlepage}
\begin{addmargin}[-1cm]{-3cm}
\begin{center}
\large
{\Large \textsc{University of Li{\`e}ge}}\\[1ex]
Faculty of Applied Sciences\\
Department of Electrical Engineering \& Computer Science\\

\vfill

PhD dissertation\\ \vskip1cm
\rule{14cm}{0.4pt}\\ \bigskip
\begingroup
\Large
\color{Maroon}\spacedallcaps{\myTitle} \\ \bigskip
\endgroup
\spacedlowsmallcaps{\mySubtitle} \\ \bigskip
\rule{14cm}{0.4pt}\\ \vskip1cm
by \textsc{Gilles Louppe}

\vfill
\vfill
\vfill

\hfill Advisor: Prof. \textsc{Pierre Geurts}\\
\hfill July 2014
\end{center}
\vspace{-3.5cm}\includegraphics[width=0.25\textwidth]{figures/blason.pdf}
\end{addmargin}
\end{titlepage}
--------------------------------------------------------------------------------
/scripts/ch2_hyperplane.py:
--------------------------------------------------------------------------------
import numpy as np
import pylab as pl
from sklearn import svm

blue = (0, 0, 1.0)
red = (1.0, 0, 0)
gray = (0.7, 0.7, 0.7)

# we create 40 separable points
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# fit the model
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# get the separating hyperplane
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]

# plot the parallels to the separating hyperplane that pass through the
# support vectors
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

# plot the line, the points, and the nearest vectors to the plane
pl.plot(xx, yy, 'k-')
pl.plot(xx, yy_down, 'k--')
pl.plot(xx, yy_up, 'k--')

pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')

pl.scatter(X[:20, 0], X[:20, 1], color=blue)
pl.scatter(X[20:, 0], X[20:, 1], color=red)

pl.axis('tight')
pl.show()
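[Note: a quick check, not a file from this repository, reusing `clf` from
ch2_hyperplane.py above. The distance between the two dashed lines in the
plot is the margin of the linear SVM, 2 / ||w||.]

import numpy as np

w = clf.coef_[0]
print("margin width = %.3f" % (2.0 / np.linalg.norm(w)))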
--------------------------------------------------------------------------------
/tex/figures/ch7_trees_ets2.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(16,16)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
% ETs 3
% Arrows
\psline[linewidth=0.5pt]{->}(9,8)(8,6.3)
\psline[linewidth=0.5pt]{->}(9,8)(10,6.3)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,8){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](7.7,5.7)(8.3,6.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,5.7)(10.3,6.3)
% Text
\rput(7.9,7.3){$X_1 \leq 0$}
\rput(10.1,7.3){$X_1 > 0$}
% ETs 3
% Arrows
\psline[linewidth=0.5pt]{->}(14,8)(13,6.3)
\psline[linewidth=0.5pt]{->}(14,8)(15,6.3)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](14,8){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,5.7)(13.3,6.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](14.7,5.7)(15.3,6.3)
% Text
\rput(12.9,7.3){$X_2 = 0$}
\rput(15.1,7.3){$X_2 = 1$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/scripts/ch6_decomposition1.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import cm

from demo import make_led
from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble

n_trees = 3000

X, y = make_led()
fig, axs = plt.subplots(1, 2)

ax = axs[0]
clf = RandomizedID3Ensemble(n_estimators=n_trees,
                            base_estimator=RandomizedID3Classifier(k=1)).fit(X, y)
imp = clf.feature_importances_
ax.imshow(imp, cmap=cm.gist_heat_r, interpolation="nearest", vmin=0, vmax=0.4)
ax.set_yticklabels(["$X_%d$" % (i) for i in range(X.shape[1] + 1)])
ax.set_title("$K=1$")

ax = axs[1]
clf = RandomizedID3Ensemble(n_estimators=n_trees,
                            base_estimator=RandomizedID3Classifier(k=X.shape[1])).fit(X, y)
imp = clf.feature_importances_
img = ax.imshow(imp, cmap=cm.gist_heat_r, interpolation="nearest", vmin=0, vmax=0.4)
ax.set_yticklabels(["$X_%d$" % (i) for i in range(X.shape[1] + 1)])
ax.set_title("$K=%d$" % X.shape[1])

cax, kw = matplotlib.colorbar.make_axes([ax for ax in axs.flat])
cb = plt.colorbar(img, cax=cax, **kw)
cb.set_ticks([0, 0.2, 0.4])

plt.show()
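[Note: a rough analogue of the K = 1 run above using scikit-learn only, not a
file from this repository; X, y as returned by make_led() from the
repository's demo module. With max_features=1 the split variable is drawn at
random, the same configuration ch7_redundant.py uses below, and
feature_importances_ is the library's normalized MDI score per feature.]

from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier(n_estimators=1000, max_features=1,
                           criterion="entropy").fit(X, y)
print(clf.feature_importances_)  # normalized to sum to 1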
--------------------------------------------------------------------------------
/scripts/ch4_proximity.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

from itertools import cycle
from scipy.spatial.distance import pdist, squareform

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import MDS


def rf_proximities(forest, X):
    # Fraction of trees in which two samples fall in the same leaf.
    prox = pdist(forest.apply(X), lambda u, v: (u == v).sum()) / forest.n_estimators
    prox = squareform(prox)
    return prox


data = load_digits()
X, y = data.data, data.target

indices = np.argsort(y)
X = X[indices]
y = y[indices]

# X = X[y < 2]
# y = y[y < 2]

forest = RandomForestClassifier(n_estimators=50, n_jobs=2, random_state=1).fit(X, y)
prox = rf_proximities(forest, X)

plt.matshow(prox, cmap="Reds")
plt.show()

model = MDS(dissimilarity="precomputed", n_jobs=2)
coords = model.fit_transform(1. - prox)

n_classes = forest.n_classes_
cm = plt.get_cmap("hsv")
colors = (cm(1. * i / n_classes) for i in range(n_classes))

for k, c in zip(range(n_classes), colors):
    plt.plot(coords[y == k, 0], coords[y == k, 1], '.', label=k, color=c)

plt.legend(loc="best")
plt.show()
--------------------------------------------------------------------------------
/scripts/ch4_estimate_distribution.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

blue = (0, 0, 1.0)
green = (0, 0.8, 0)
red = (1.0, 0, 0)
red_alpha = (1.0, 0, 0, 0.1)
gray = (0.7, 0.7, 0.7)

x = np.arange(0, 1, 0.0001)
p_y = norm.pdf(x, 0.6, 0.1)

plt.plot(x, p_y, color=red)
plt.plot([0.5, 0.5], [0.0, np.max(p_y)], '-', color=gray)
plt.plot([0.6, 0.6], [0.0, np.max(p_y) + 0.01], ':', color=gray)
plt.text(0.6, np.max(p_y) + 0.2, r"$\mathbb{E}_{\cal L} \{ p_{\cal L}(Y=\varphi_B(x)) \}$", fontsize=15, horizontalalignment='center')
plt.text(0.6, 1.7, r"$Var_{\cal L}\{ p_{\cal L}(Y=\varphi_B(x)) \}$", fontsize=15, horizontalalignment='left')
plt.annotate(
    '', xy=(0.45, 2.0), xycoords='data',
    xytext=(0.75, 2.0), textcoords='data',
    arrowprops={'arrowstyle': '<->'})
plt.annotate(r"$P_{\cal L}(\varphi_{\cal L}(x)\neq \varphi_B(x))$", xy=(0.475, 1.0), xycoords='data', fontsize=15, xytext=(0.2, 1.7), textcoords='data', arrowprops={'arrowstyle': '->'})

plt.fill_between(x, p_y, y2=0, where=x < 0.5, color=red_alpha)

plt.ylabel("$P$")
plt.ylim((0., 4.5))
plt.xlim((0., 1.0))
plt.xticks([0.0, 0.5, 1.0])
plt.yticks([])

plt.show()
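[Note: a numerical check, not a file from this repository. In the figure
produced by ch4_estimate_distribution.py, the shaded area is the mass of the
N(0.6, 0.1) density below the 0.5 threshold, i.e. the probability that
phi_L(x) disagrees with the Bayes prediction phi_B(x).]

from scipy.stats import norm

print(norm.cdf(0.5, loc=0.6, scale=0.1))  # ~0.159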
--------------------------------------------------------------------------------
/tex/figures/ch5_sort.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\definecolor{gray}{rgb}{0.95,0.95,0.95}
\definecolor{darkgray}{rgb}{0.8,0.8,0.8}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(20,15)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=black](3,13)(13,14)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=gray](3.1,13.1)(4.9,13.9)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=gray](11.1,13.1)(12.9,13.9)
\psframe[fillstyle=solid,linewidth=0.5pt,linecolor=white,fillcolor=darkgray](8.1,13.1)(9.4,13.9)
\psline[linewidth=0.5pt]{-}(5,13)(5,14.5)
\psline[linewidth=0.5pt]{-}(11,13)(11,14.5)
\psline[linewidth=0.5pt]{-}(7,14)(7,12.5)
\psline[linewidth=0.5pt]{-}(8,14)(8,12.5)
\psline[linewidth=0.5pt]{-}(9.5,14)(9.5,12.5)
\rput(2,13.5){\texttt{samples}}
\rput[l](5.1,14.5){\texttt{start}}
\rput[l](11.1,14.5){\texttt{end}}
\rput[l](7.1,12.5){\texttt{l}}
\rput[l](8.1,12.5){\texttt{i}}
\rput[l](9.6,12.5){\texttt{r}}
\rput(6,13.5){$<$}
\rput(7.5,13.5){$=$}
\rput(10.25,13.5){$>$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/tex/figures/ch3_tree.tex:
--------------------------------------------------------------------------------
\documentclass{article}
\usepackage{pstricks}
\usepackage{pst-plot}
\usepackage{color}
\definecolor{myblue}{rgb}{0.0,0.0,1.0}
\pagestyle{empty}
\begin{document}
\begin{pspicture}(20,15)
%\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt]
% Arrows
\psline[linewidth=0.5pt,linecolor=myblue]{->}(10,12)(8.3,11.3)
\psline[linewidth=0.5pt]{->}(10,12)(11.7,11.3)
\psline[linewidth=0.5pt]{->}(8,11)(7,9.35)
\psline[linewidth=0.5pt,linecolor=myblue]{->}(8,11)(9,9.35)
% Nodes
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=myblue](10,12){0.4}
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=myblue](8,11){0.4}
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](11.7,10.7)(12.3,11.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,8.7)(7.3,9.3)
\psframe[fillstyle=solid,linewidth=1pt,linecolor=myblue](8.7,8.7)(9.3,9.3)
% Text
\rput(10,12){$t_0$}
\rput(8,11){$t_1$}
\rput(12,11){$t_2$}
\rput(7,9){$t_3$}
\rput(9,9){$t_4$}
\rput(10,11.3){{\small $X_1 \leq 0.7$}}
\rput(8,9.7){{\small $X_2 \leq 0.5$}}
\rput(12,10.5){$c_2$}
\rput(7,8.5){$c_2$}
\rput(9,8.5){$c_1$}
\end{pspicture}
\end{document}
--------------------------------------------------------------------------------
/scripts/ch3_partition.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

X = np.random.rand(300, 2)
y = (X[:, 0] < 0.7) & (X[:, 1] > 0.5)

# randomly flip some labels
mask = np.random.permutation(len(X))[:15]
y[mask] = ~y[mask]

X_c1 = X[y == 0]
plt.scatter(X_c1[:, 0], X_c1[:, 1], color=(1.0, 0, 0))

X_c2 = X[y == 1]
plt.scatter(X_c2[:, 0], X_c2[:, 1], color=(0, 0, 1.0))

# draw lines + text
plt.plot([0, 1], [0, 0], color='k', linestyle='-', linewidth=1)
plt.plot([0, 0], [0, 1], color='k', linestyle='-', linewidth=1)
plt.plot([1, 1], [0, 1], color='k', linestyle='-', linewidth=1)
plt.plot([0, 1], [1, 1], color='k', linestyle='-', linewidth=1)

plt.plot([0.7, 0.7], [0, 1.0], color='k', linestyle='-', linewidth=1)
plt.plot([0, 0.7], [0.5, 0.5], color='k', linestyle='-', linewidth=1)

plt.text(0.95, 0.93, r"$t_2$", fontsize=15)
plt.text(0.65, 0.43, r"$t_3$", fontsize=15)
plt.text(0.65, 0.93, r"$t_4$", fontsize=15)

plt.text(0.7, -0.07, "$0.7$", fontsize=15, horizontalalignment='center')
plt.text(-0.07, 0.5, "$0.5$", fontsize=15, verticalalignment='center')

plt.text(1.0, -0.07, "$X_1$", fontsize=15, horizontalalignment='center')
plt.text(-0.07, 1.0, "$X_2$", fontsize=15, verticalalignment='center')
plt.show()
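[Note: a minimal sketch, not a file from this repository, reusing X and y
from ch3_partition.py above. A depth-2 decision tree fitted on this sample
should recover cut points close to the generative thresholds 0.7 (on X1) and
0.5 (on X2) drawn in the figure and used in ch3_tree.tex.]

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
print(clf.tree_.feature)    # split variables (-2 marks a leaf)
print(clf.tree_.threshold)  # learned thresholds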
\rput(2.5,11.3){$X_1 = 0$} 20 | \rput(4,11){$X_1 = 1$} 21 | \rput(5.5,11.3){$X_1 = 2$} 22 | % ID3 2 23 | % Arrows 24 | \psline[linewidth=0.5pt]{->}(10,12)(9,10.3) 25 | \psline[linewidth=0.5pt]{->}(10,12)(11,10.3) 26 | % Nodes 27 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 28 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](8.7,9.7)(9.3,10.3) 29 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](10.7,9.7)(11.3,10.3) 30 | % Text 31 | \rput(8.9,11.3){$X_2 = 0$} 32 | \rput(11.1,11.3){$X_2 = 1$} 33 | \end{pspicture} 34 | \end{document} 35 | -------------------------------------------------------------------------------- /scripts/ch6_decomposition2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | import matplotlib 5 | from matplotlib import cm 6 | 7 | from demo import make_led 8 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 9 | 10 | 11 | def feature_importance_tree(clf): 12 | def _visit(tree, conditioning): 13 | conditioning = conditioning + [tree[0]] 14 | 15 | if len(tree) == 2: 16 | pass 17 | 18 | else: 19 | for X in conditioning: 20 | imp[tree[0], X] += tree[1] 21 | 22 | for c in tree[2]: 23 | _visit(c, conditioning) 24 | 25 | imp = np.zeros((clf.n_features_, clf.n_features_)) 26 | _visit(clf.tree_, []) 27 | 28 | return imp 29 | 30 | def feature_importances_ensemble(clf): 31 | importances = np.zeros((clf.p, clf.p)) 32 | 33 | for i, tree in enumerate(clf.estimators_): 34 | importances += feature_importance_tree(tree) 35 | 36 | importances /= clf.n_estimators 37 | 38 | return importances 39 | 40 | 41 | n_trees = 1000 42 | 43 | X, y = make_led() 44 | n_features = X.shape[1] 45 | 46 | clf = RandomizedID3Ensemble(n_estimators=n_trees, 47 | base_estimator=RandomizedID3Classifier(k=1)).fit(X, y) 48 | 49 | imp = feature_importances_ensemble(clf) 50 | plt.imshow(imp, interpolation="nearest", cmap=cm.gist_heat_r) 51 | plt.show() 52 | 53 | -------------------------------------------------------------------------------- /tex/figures/ch3_splits.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(15,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Binary split 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(5,12)(4.2,10.4) 11 | \psline[linewidth=0.5pt]{->}(5,12)(5.8,10.4) 12 | % Nodes 13 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,12){0.4} 14 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,10){0.4} 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](6,10){0.4} 16 | % Text 17 | \rput(5,12){$t$} 18 | \rput(4,10){$t_L$} 19 | \rput(6,10){$t_R$} 20 | % N-ary split 21 | % Arrows 22 | \psline[linewidth=0.5pt]{->}(10,12)(8.2,10.4) 23 | \psline[linewidth=0.5pt]{->}(10,12)(9.2,10.4) 24 | \psline[linewidth=0.5pt]{->}(10,12)(11.8,10.4) 25 | % Nodes 26 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 27 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,10){0.4} 28 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,10){0.4} 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,10){0.4} 30 | % Text 31 | \rput(10,12){$t$} 32 | \rput(8,10){$t_{i_1}$} 33 | \rput(9,10){$t_{i_2}$} 34 | \rput(10.5,10){$...$} 35 | \rput(12,10){$t_{i_N}$} 36 | \end{pspicture} 37 | \end{document} 38 | 
-------------------------------------------------------------------------------- /scripts/ch7_redundant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import brewer2mpl 4 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 7).mpl_colors 5 | #cmap = [(0, 0, 1.0), (1.0, 0, 0)] 6 | 7 | def feature_importances(X, y, n_trees=500): 8 | from sklearn.ensemble import ExtraTreesClassifier 9 | clf = ExtraTreesClassifier(n_estimators=n_trees, max_features=1, criterion="entropy").fit(X, y) 10 | imp = np.zeros(X.shape[1]) 11 | 12 | for tree in clf.estimators_: 13 | imp += tree.tree_.compute_feature_importances(normalize=False) 14 | imp = imp / n_trees 15 | return imp 16 | 17 | def plot_with_duplicate(X, y, duplicate=0, n_copies=10): 18 | n_features = X.shape[1] 19 | all_importances = [] 20 | X_new = np.hstack([X] + [X[:, duplicate:duplicate+1] for i in range(n_copies)]) 21 | 22 | for i in range(n_copies+1): 23 | all_importances.append(feature_importances(X_new[:, :n_features + i], y)[:n_features]) 24 | 25 | all_importances = np.array(all_importances) 26 | 27 | for m in range(n_features): 28 | plt.plot(range(n_copies+1), all_importances[:, m], "o-", label="X%d" % (m+1), color=cmap[m]) 29 | 30 | plt.title("Adding copies of X%d" % (duplicate+1)) 31 | plt.legend(loc="best") 32 | plt.show() 33 | 34 | from demo import make_led 35 | X, y = make_led() 36 | # X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 37 | # y = np.array([1, 0, 0, 1]) 38 | plot_with_duplicate(X, y, duplicate=4, n_copies=100) 39 | -------------------------------------------------------------------------------- /scripts/ch4_bias_variance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from scipy.stats import norm 4 | 5 | blue = (0, 0, 1.0) 6 | red = (1.0, 0, 0) 7 | gray = (0.7, 0.7, 0.7) 8 | 9 | x = np.arange(-10, 10, 0.0001) 10 | p_y = norm.pdf(x, -3.0, 1) 11 | p_y_hat = norm.pdf(x, 3.0, 1.8) 12 | 13 | plt.plot(x, p_y, color=blue) 14 | plt.plot(x, p_y_hat, color=red) 15 | 16 | plt.plot([-3,-3], [0.0, np.max(p_y)+0.01], ':', color=gray) 17 | plt.text(-3, np.max(p_y) + 0.02, r"$\varphi_B(x)$", fontsize=15, horizontalalignment='center') 18 | 19 | plt.plot([3,3], [0.0, np.max(p_y_hat)+0.01], ':', color=gray) 20 | plt.text(3, np.max(p_y_hat) + 0.02, r"$\mathbb{E}_{\cal L} \{ \varphi_{\cal L}(x) \}$", fontsize=15, horizontalalignment='center') 21 | 22 | plt.text(0, 0.11, r"$bias^2(x)$", fontsize=15, horizontalalignment='center') 23 | plt.annotate( 24 | '', xy=(-3, 0.1), xycoords = 'data', 25 | xytext = (3, 0.1), textcoords = 'data', 26 | arrowprops = {'arrowstyle':'<->'}) 27 | 28 | plt.text(-5.1, 0.21, r"$noise(x)$", fontsize=15, horizontalalignment='right') 29 | plt.annotate( 30 | '', xy=(-5, 0.2), xycoords = 'data', 31 | xytext = (-1, 0.2), textcoords = 'data', 32 | arrowprops = {'arrowstyle':'<->'}) 33 | 34 | plt.text(5.1, 0.21, r"$var(x)$", fontsize=15, horizontalalignment='left') 35 | plt.annotate( 36 | '', xy=(6, 0.2), xycoords = 'data', 37 | xytext = (0, 0.2), textcoords = 'data', 38 | arrowprops = {'arrowstyle':'<->'}) 39 | 40 | plt.tick_params(axis="x", which="both", bottom="off", top="off", labelbottom="off") 41 | plt.xlabel("$y$") 42 | plt.ylabel("$P$") 43 | 44 | plt.show() 45 | -------------------------------------------------------------------------------- /tex/figures/ch7_splits.tex:
-------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(15,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Binary split 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(4,12)(3.2,10.4) 11 | \psline[linewidth=0.5pt]{->}(4,12)(4.8,10.4) 12 | \psline[linewidth=0.5pt]{->}(3,10)(3.8,8.4) 13 | \psline[linewidth=0.5pt]{->}(3,10)(2.2,8.4) 14 | % Nodes 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,12){0.4} 16 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,10){0.4} 17 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,10){0.4} 18 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](2,8){0.4} 19 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,8){0.4} 20 | % Text 21 | \rput(3,11.3){$X_j \leq 1$} 22 | \rput(5,11.3){$X_j > 1$} 23 | \rput(2,9.3){$X_j \leq 0$} 24 | \rput(4,9.3){$X_j > 0$} 25 | % N-ary split 26 | % Arrows 27 | \psline[linewidth=0.5pt]{->}(10,12)(8.2,10.4) 28 | \psline[linewidth=0.5pt]{->}(10,12)(10,10.5) 29 | \psline[linewidth=0.5pt]{->}(10,12)(11.8,10.4) 30 | % Nodes 31 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 32 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,10){0.4} 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,10){0.4} 34 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,10){0.4} 35 | % Text 36 | \rput(8.5,11.3){$X_j = 0$} 37 | \rput(10,11){$X_j = 1$} 38 | \rput(11.5,11.3){$X_j = 2$} 39 | \end{pspicture} 40 | \end{document} 41 | -------------------------------------------------------------------------------- /tex/figures/ch7_trees_ets.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(16,16) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % ETs 1 9 | % Arrows 10 | \psline[linewidth=0.5pt]{->}(4,8)(3,6.4) 11 | \psline[linewidth=0.5pt]{->}(4,8)(5,6.3) 12 | \psline[linewidth=0.5pt]{->}(3,6)(4,4.3) 13 | \psline[linewidth=0.5pt]{->}(3,6)(2,4.3) 14 | % Nodes 15 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](4,8){0.4} 16 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,6){0.4} 17 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](4.7,5.7)(5.3,6.3) 18 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](1.7,3.7)(2.3,4.3) 19 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](3.7,3.7)(4.3,4.3) 20 | % Text 21 | \rput(2.9,7.3){$X_1 \leq 1$} 22 | \rput(5.1,7.3){$X_1 > 1$} 23 | \rput(1.9,5.3){$X_1 \leq 0$} 24 | \rput(4.1,5.3){$X_1 > 0$} 25 | % ETs 2 26 | % Arrows 27 | \psline[linewidth=0.5pt]{->}(9,8)(8,6.4) 28 | \psline[linewidth=0.5pt]{->}(9,8)(10,6.3) 29 | \psline[linewidth=0.5pt]{->}(8,6)(9,4.3) 30 | \psline[linewidth=0.5pt]{->}(8,6)(7,4.3) 31 | % Nodes 32 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](9,8){0.4} 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](8,6){0.4} 34 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,5.7)(10.3,6.3) 35 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,3.7)(7.3,4.3) 36 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](8.7,3.7)(9.3,4.3) 37 | % Text 38 | \rput(7.9,7.3){$X_1 \leq 1$} 39 | \rput(10.1,7.3){$X_1 > 1$} 40 | \rput(6.9,5.3){$X_2 = 0$} 41 | 
\rput(9.1,5.3){$X_2 = 1$} 42 | \end{pspicture} 43 | \end{document} 44 | -------------------------------------------------------------------------------- /tex/frontback/acknowledgments.tex: -------------------------------------------------------------------------------- 1 | % Acknowledgements ============================================================ 2 | 3 | \pdfbookmark[1]{Acknowledgments}{acknowledgments} 4 | \chapter*{Acknowledgments} 5 | 6 | As the saying goes, good premises do not entail good stories. Yet, this 7 | dissertation would certainly not have come to its successful conclusion 8 | without the help, support and trust of colleagues, friends and family. 9 | 10 | First and foremost, I would like to sincerely thank my advisor Pierre Geurts 11 | for his help, guidance and for the freedom I was granted throughout these 12 | years. 13 | 14 | I am grateful to all members of the jury for their interest in this work 15 | and for taking the time to evaluate this dissertation. 16 | 17 | In alphabetical order, I would also like to thank my colleagues who all 18 | contributed to creating and maintaining a pleasant and stimulating working 19 | environment: Antonio, Arnaud, Benjamin, Damien, Fabien, Julien, Lo\"{i}c, 20 | Louis, Marie, Olivier, Rapha\"{e}l, Van Anh, Vincent. Special thanks go to 21 | Antonio, Arnaud and Vincent, who agreed to proofread parts of this manuscript. 22 | 23 | I want to take this opportunity to thank the Scikit-Learn team and all its 24 | contributors. This experience within the open source world really contributed 25 | to shaping my vision of science and software development towards a model 26 | of rigor, pragmatism and openness. Thanks go to Ga\"{e}l, Olivier, Lars, 27 | Mathieu, Andreas, Alexandre and Peter. 28 | 29 | Special thanks go to the rowing team of the RCAE, for their friendship 30 | and good mood in all circumstances. Guys, I thank you all. 31 | 32 | Even if I never succeeded in fully explaining my research topics, I would finally 33 | like to warmly thank my dear friend J\'er\^{o}me and my family for their help 34 | in moments of doubt. 35 | 36 | Last but not least, Laura, I am forever grateful for your unconditional support 37 | and love.
38 | -------------------------------------------------------------------------------- /scripts/ch3_split_ordered.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | np.random.seed(54) 5 | 6 | blue = (0, 0, 1.0) 7 | red = (1.0, 0, 0) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | n_samples = 10 11 | 12 | X = np.empty(n_samples) 13 | X[:n_samples / 2] = np.sort(np.random.normal(loc=-1.0, size=n_samples / 2)) 14 | X[n_samples / 2:] = np.sort(np.random.normal(loc=1.0, size=n_samples / 2)) 15 | y = np.zeros(n_samples) 16 | y[n_samples / 2:] = 1 17 | 18 | plt.plot([-3,3], [0,0], '-', color='k') 19 | 20 | X_ = np.sort(X) 21 | for i in range(len(X_) - 1): 22 | s = X[i] 23 | plt.plot([s,s], [0.00001/2, 0], '-', color=gray) 24 | 25 | plt.plot([-3,-3], [0.00001, 0], '-', color=gray) 26 | 27 | plt.scatter(X[:n_samples / 2], np.zeros(n_samples / 2), color=blue) 28 | plt.scatter(X[n_samples / 2:], np.zeros(n_samples / 2), color=red) 29 | 30 | s1 = X_[6] 31 | s2 = X_[7] 32 | smid = (s1+s2) / 2.0 33 | 34 | plt.plot([s1,s1], [0.00001/2, 0], '-', color='k') 35 | plt.text(s1, 0.0000055, "$v_k$", fontsize=15, horizontalalignment='center') 36 | plt.text(s2, 0.0000055, "$v_{k+1}$", fontsize=15, horizontalalignment='center') 37 | plt.text(smid, 0.000001, "$v^\prime_k$", fontsize=15, horizontalalignment='center') 38 | plt.plot([smid,smid], [0.00001/2, 0], ':', color=gray) 39 | plt.text((s1+(-3)) / 2.0, -0.000001, "${\cal L}^{v_k}_{t_L}$", fontsize=15, horizontalalignment='center') 40 | plt.text((s1+3) / 2.0, -0.000001, "${\cal L}^{v_k}_{t_R}$", fontsize=15, horizontalalignment='center') 41 | 42 | plt.annotate( 43 | '', xy=(-3, -0.0000015), xycoords = 'data', 44 | xytext = (s1, -0.0000015), textcoords = 'data', 45 | arrowprops = {'arrowstyle':'<->'}) 46 | plt.annotate( 47 | '', xy=(s1, -0.0000015), xycoords = 'data', 48 | xytext = (3, -0.0000015), textcoords = 'data', 49 | arrowprops = {'arrowstyle':'<->'}) 50 | 51 | #plt.plot([s2,s2], [0.00003, -0.00003], '-', color='k') 52 | 53 | 54 | plt.text(3, -0.0000007, "$X_j$", fontsize=15) 55 | 56 | plt.show() 57 | 58 | -------------------------------------------------------------------------------- /tex/figures/ch5_tree.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(20,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | \psline[linewidth=0.5pt]{->}(10,12)(8.3,11.3) 10 | \psline[linewidth=0.5pt]{->}(10,12)(11.7,11.3) 11 | \psline[linewidth=0.5pt]{->}(12,11)(11,9.4) 12 | \psline[linewidth=0.5pt]{->}(12,11)(13,9.3) 13 | \psline[linewidth=0.5pt]{->}(11,9)(10,7.3) 14 | \psline[linewidth=0.5pt]{->}(11,9)(12,7.4) 15 | \psline[linewidth=0.5pt]{->}(12,7)(11,5.3) 16 | \psline[linewidth=0.5pt]{->}(12,7)(13,5.3) 17 | % Nodes 18 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](10,12){0.4} 19 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](7.7,10.7)(8.3,11.3) 20 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,11){0.4} 21 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](11,9){0.4} 22 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,8.7)(13.3,9.3) 23 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](9.7,6.7)(10.3,7.3) 24 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](12,7){0.4} 25 | 
\psframe[fillstyle=solid,linewidth=1pt,linecolor=black](10.7,4.7)(11.3,5.3) 26 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](12.7,4.7)(13.3,5.3) 27 | % Text 28 | \rput(10,12){$t_0$} 29 | \rput(8,11){$t_1$} 30 | \rput(12,11){$t_2$} 31 | \rput(11,9){$t_3$} 32 | \rput(13,9){$t_4$} 33 | \rput(10,7){$t_5$} 34 | \rput(12,7){$t_6$} 35 | \rput(11,5){$t_7$} 36 | \rput(13,5){$t_8$} 37 | \rput(10,11.3){{\small $X_2 \leq 0.303$}} 38 | \rput(12,9.65){{\small $X_2 \leq 0.696$}} 39 | \rput(11,7.65){{\small $X_1 \leq 0.296$}} 40 | \rput(12,5.65){{\small $X_1 \leq 0.703$}} 41 | \rput(8,10.5){$c_1$} 42 | \rput(13,8.5){$c_1$} 43 | \rput(10,6.5){$c_1$} 44 | \rput(11,4.5){$c_2$} 45 | \rput(13,4.5){$c_1$} 46 | \end{pspicture} 47 | \end{document} 48 | -------------------------------------------------------------------------------- /scripts/ch7_bias_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | def feature_importances(X, y, cls, n_trees=5000): 15 | clf = cls(n_estimators=n_trees).fit(X, y) 16 | 17 | if isinstance(clf, RandomizedID3Ensemble): 18 | imp = np.sum(clf.feature_importances_, axis=1) 19 | 20 | else: 21 | imp = np.zeros(X.shape[1]) 22 | 23 | for tree in clf.estimators_: 24 | imp += tree.tree_.compute_feature_importances(normalize=False) 25 | 26 | imp = imp / n_trees 27 | 28 | return imp 29 | 30 | def generate_copy(n1=20, n2=2): 31 | X = np.array([np.arange(n1), np.arange(n1)]).T 32 | X[:, 1] = X[:, 0] >= n1/2 33 | y = X[:, 1] 34 | return X, y 35 | 36 | import brewer2mpl 37 | cmap = [(1., 0, 0), (0, 0, 1)] 38 | 39 | r = {} 40 | g = generate_copy 41 | 42 | for name, cls in [("ETs", partial(ExtraTreesClassifier, max_features=1, criterion="entropy")), 43 | ("RF", partial(RandomForestClassifier, max_features=1, bootstrap=False, criterion="entropy"))]: 44 | f = [] 45 | for n1 in range(2, 20+1, 2): 46 | X, y = g(n1=n1, n2=2) 47 | f.append(feature_importances(X, y, cls=cls)) 48 | r[name] = np.array(f) 49 | 50 | 51 | models = ["ETs", "RF"] 52 | 53 | plt.subplot(1, 2, 1) 54 | 55 | for i, name in enumerate(models): 56 | f = r[name] 57 | plt.plot(range(2, 20+1, 2), f[:, 0], "o-", label="%s" % name, color=cmap[i]) 58 | plt.ylim([0., 1.0]) 59 | plt.title("$X_1$") 60 | plt.legend(loc="best") 61 | 62 | plt.subplot(1, 2, 2) 63 | 64 | for i, name in enumerate(models): 65 | f = r[name] 66 | plt.plot(range(2, 20+1, 2), f[:, 1], "o-", label="%s" % name, color=cmap[i]) 67 | plt.title("$X_2$") 68 | plt.ylim([0., 1.0]) 69 | 70 | plt.show() 71 | 72 | -------------------------------------------------------------------------------- /tex/figures/ch2_mlp.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(20,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | % Layer 0 to 1 10 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,8.6) 11 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,7.6) 12 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,6.6) 13 | \psline[linewidth=0.5pt]{->}(3.2,8)(4.6,5.6) 14 | 
\psline[linewidth=0.5pt]{->}(3.2,7)(4.6,8.5) 15 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,7.5) 16 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,6.5) 17 | \psline[linewidth=0.5pt]{->}(3.2,7)(4.6,5.5) 18 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,8.4) 19 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,7.4) 20 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,6.4) 21 | \psline[linewidth=0.5pt]{->}(3.2,6)(4.6,5.4) 22 | \psline[linewidth=0.5pt]{->}(7,7)(7.6,7) 23 | % Layer 1 to 2 24 | \psline[linewidth=0.5pt]{->}(5.2,8.5)(6.6,7.075) 25 | \psline[linewidth=0.5pt]{->}(5.2,7.5)(6.6,7.025) 26 | \psline[linewidth=0.5pt]{->}(5.2,6.5)(6.6,6.975) 27 | \psline[linewidth=0.5pt]{->}(5.2,5.5)(6.6,6.925) 28 | % Layer 0 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,8){0.4} 30 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,7){0.4} 31 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,6){0.4} 32 | % Layer 1 33 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,8.5){0.4} 34 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,7.5){0.4} 35 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,6.5){0.4} 36 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,5.5){0.4} 37 | % Layer 2 38 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](7,7){0.4} 39 | % Text 40 | \rput(4,8.75){$w_{ij}$} 41 | \rput(3,8){$x_1$} 42 | \rput(3,7){$x_2$} 43 | \rput(3,6){$x_3$} 44 | \rput(5,8.5){$h_1$} 45 | \rput(5,7.5){$h_2$} 46 | \rput(5,6.5){$h_3$} 47 | \rput(5,5.5){$h_4$} 48 | \rput(7,7){$h_5$} 49 | \rput(7.8,7){$\hat{y}$} 50 | \end{pspicture} 51 | \end{document} 52 | -------------------------------------------------------------------------------- /tex/figures/ch3_goodness.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(10,15) 7 | %\psgrid[subgriddiv=1,griddots=10,gridlabels=7pt] 8 | % Arrows 9 | \psline[linewidth=0.5pt]{->}(5,12)(3.3,11.3) 10 | \psline[linewidth=0.5pt]{->}(5,12)(6.7,11.3) 11 | % Nodes 12 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,12){0.4} 13 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,11){0.4} 14 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,10.7)(7.3,11.3) 15 | % Text 16 | \rput(5,12){$t_0$} 17 | \rput(3,11){$t_1$} 18 | \rput(7,11){$t_2$} 19 | \rput(5,11.3){{\small $X_1$}} 20 | \rput(3,10.25){$p(y=c_1|t_1)=\frac{2}{5}$} 21 | \rput(3,9.75){$p(y=c_2|t_1)=\frac{3}{5}$} 22 | \rput(7,10.25){$p(y=c_1|t_2)=\frac{0}{5}$} 23 | \rput(7,9.75){$p(y=c_2|t_2)=\frac{5}{5}$} 24 | % Arrows 25 | \psline[linewidth=0.5pt]{->}(5,9)(3.3,8.3) 26 | \psline[linewidth=0.5pt]{->}(5,9)(6.7,8.3) 27 | % Nodes 28 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,9){0.4} 29 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,8){0.4} 30 | \psframe[fillstyle=solid,linewidth=1pt,linecolor=black](6.7,7.7)(7.3,8.3) 31 | % Text 32 | \rput(5,9){$t_0$} 33 | \rput(3,8){$t_1$} 34 | \rput(7,8){$t_2$} 35 | \rput(5,8.3){{\small $X_2$}} 36 | \rput(3,7.25){$p(y=c_1|t_1)=\frac{2}{6}$} 37 | \rput(3,6.75){$p(y=c_2|t_1)=\frac{4}{6}$} 38 | \rput(7,7.25){$p(y=c_1|t_2)=\frac{0}{4}$} 39 | \rput(7,6.75){$p(y=c_2|t_2)=\frac{4}{4}$} 40 | % Arrows 41 | \psline[linewidth=0.5pt]{->}(5,6)(3.3,5.3) 42 | \psline[linewidth=0.5pt]{->}(5,6)(6.7,5.3) 43 | % Nodes 44 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](5,6){0.4} 45 | 
\pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](3,5){0.4} 46 | \pscircle[fillstyle=solid,linewidth=1pt,linecolor=black](7,5){0.4} 47 | % Text 48 | \rput(5,6){$t_0$} 49 | \rput(3,5){$t_1$} 50 | \rput(7,5){$t_2$} 51 | \rput(5,5.3){{\small $X_3$}} 52 | \rput(3,4.25){$p(y=c_1|t_1)=\frac{1}{6}$} 53 | \rput(3,3.75){$p(y=c_2|t_1)=\frac{5}{6}$} 54 | \rput(7,4.25){$p(y=c_1|t_2)=\frac{1}{4}$} 55 | \rput(7,3.75){$p(y=c_2|t_2)=\frac{3}{4}$} 56 | \end{pspicture} 57 | \end{document} 58 | -------------------------------------------------------------------------------- /tex/summary.tex: -------------------------------------------------------------------------------- 1 | \documentclass[oneside,openright,titlepage,numbers=noenddot,headinclude,% 2 | footinclude=true,cleardoublepage=empty,abstractoff,BCOR=5mm,% 3 | paper=a4,fontsize=11pt,ngerman,american]{scrreprt} 4 | 5 | % Custom config =============================================================== 6 | 7 | % Classic thesis 8 | \usepackage{amssymb} 9 | \input{classicthesis-config} 10 | 11 | % Theorems and definitions 12 | \usepackage{amsthm} 13 | \newtheorem{theorem}{Theorem} 14 | \newtheorem{lemma}[theorem]{Lemma} 15 | \newtheorem{proposition}[theorem]{Proposition} 16 | \newtheorem{corollary}[theorem]{Corollary} 17 | \newtheorem{definition}{Definition} 18 | 19 | \newtheorem{algorithm}{Algorithm} 20 | \usepackage{algpseudocode} 21 | 22 | % Counters 23 | \renewcommand{\labelenumi}{{\color{halfgray}(\alph{enumi})}} 24 | \renewcommand{\labelenumii}{\color{halfgray}{\roman{enumii}.}} 25 | \renewcommand{\labelitemi}{{\color{halfgray}-}}%\raisebox{0.3ex}{\tiny$\blacksquare$}}} 26 | 27 | \numberwithin{theorem}{chapter} 28 | \numberwithin{definition}{chapter} 29 | \numberwithin{algorithm}{chapter} 30 | \numberwithin{figure}{chapter} 31 | \numberwithin{table}{chapter} 32 | 33 | % Maths 34 | \DeclareMathOperator*{\argmin}{arg\,min} 35 | \DeclareMathOperator*{\argmax}{arg\,max} 36 | 37 | \numberwithin{equation}{chapter} 38 | \allowdisplaybreaks 39 | 40 | % Shaded boxes 41 | \usepackage{framed} 42 | \newenvironment{remark}[1]{% 43 | \definecolor{shadecolor}{gray}{0.9}% 44 | \begin{shaded}{\color{Maroon}\noindent\textsc{#1}}\\% 45 | }{% 46 | \end{shaded}% 47 | } 48 | 49 | % Code snippets 50 | \usepackage{minted} 51 | \definecolor{rulecolor}{rgb}{0.80,0.80,0.80} 52 | \definecolor{bgcolor}{rgb}{1.0,1.0,1.0} 53 | \newminted{python}{bgcolor=bgcolor} 54 | 55 | % Todo 56 | \newcommand{\todo}[1]{\textcolor{red}{[TODO] #1}} 57 | 58 | % PS pictures 59 | \usepackage{pstricks,auto-pst-pdf} 60 | 61 | % Landscape tables 62 | \usepackage{rotating} 63 | 64 | % Checkmarks 65 | \usepackage{pifont}% http://ctan.org/pkg/pifont 66 | \newcommand{\cmark}{\ding{51}}% 67 | \newcommand{\xmark}{\ding{55}}% 68 | 69 | % Wide tables 70 | \usepackage{ltablex} 71 | 72 | 73 | % ----------------------------------------------------------------------------- 74 | 75 | \begin{document} 76 | \frenchspacing 77 | \raggedbottom 78 | \selectlanguage{american} 79 | \pagenumbering{roman} 80 | \pagestyle{plain} 81 | 82 | 83 | \pagenumbering{arabic} 84 | 85 | \include{summary/summary} 86 | 87 | 88 | \end{document} 89 | -------------------------------------------------------------------------------- /scripts/ch2_train_test_error.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.datasets import 
make_friedman1 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.tree import DecisionTreeRegressor 9 | 10 | 11 | # Compute train/test error curves on Friedman1 12 | def error_curves(estimator, parameter, parameter_values, n_repeat=100): 13 | all_train_errors = [] 14 | all_test_errors = [] 15 | 16 | for i in range(n_repeat): 17 | X, y = make_friedman1(n_samples=200) 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7) 19 | 20 | train_errors = [] 21 | test_errors = [] 22 | 23 | for j, p in enumerate(parameter_values): 24 | est = estimator(**{parameter: p}) 25 | est.fit(X_train, y_train) 26 | 27 | train_errors.append(mean_squared_error(y_train, est.predict(X_train))) 28 | test_errors.append(mean_squared_error(y_test, est.predict(X_test))) 29 | 30 | all_train_errors.append(train_errors) 31 | all_test_errors.append(test_errors) 32 | 33 | return all_train_errors, all_test_errors 34 | 35 | parameter_values = np.arange(1, 100, dtype=np.int) 36 | all_train_errors, all_test_errors = error_curves(DecisionTreeRegressor, 37 | "min_samples_split", 38 | parameter_values) 39 | 40 | 41 | # Plot the error curves 42 | all_train_errors = np.array(all_train_errors) 43 | all_test_errors = np.array(all_test_errors) 44 | 45 | for i, train_errors in enumerate(all_train_errors): 46 | plt.plot(parameter_values[::-1], train_errors, color=(0, 0, 1, 0.1)) 47 | plt.plot(parameter_values[::-1], np.mean(all_train_errors, axis=0), 48 | color=(0, 0, 1), label="Training error") 49 | 50 | for i, test_errors in enumerate(all_test_errors): 51 | plt.plot(parameter_values[::-1], test_errors, color=(1, 0, 0, 0.1)) 52 | plt.plot(parameter_values[::-1], np.mean(all_test_errors, axis=0), 53 | color=(1, 0, 0), label="Test error") 54 | 55 | m = np.mean(all_test_errors, axis=0) 56 | i = np.argmin(m) 57 | plt.vlines((parameter_values[::-1])[i], 0, 30, color=(0.7, 0.7, 0.7)) 58 | plt.ylim([0, 30]) 59 | 60 | plt.tick_params(axis="x", which="both", bottom="off", top="off", labelbottom="off") 61 | plt.xlabel("Model complexity") 62 | plt.ylabel("Mean square error") 63 | plt.legend(loc="best") 64 | 65 | plt.show() 66 | -------------------------------------------------------------------------------- /scripts/ch7_bias_depth.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | import brewer2mpl 15 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', 5).mpl_colors 16 | 17 | 18 | def feature_importances(X, y, cls, n_trees=500): 19 | clf = cls(n_estimators=n_trees).fit(X, y) 20 | 21 | if isinstance(clf, RandomizedID3Ensemble): 22 | imp = np.sum(clf.feature_importances_, axis=1) 23 | 24 | else: 25 | imp = np.zeros(X.shape[1]) 26 | 27 | for tree in clf.estimators_: 28 | imp += tree.tree_.compute_feature_importances(normalize=False) 29 | 30 | imp = imp / n_trees 31 | 32 | return imp 33 | 34 | def generate_strobl_power(n_samples=120, relevance=0.2): 35 | X = np.array([v for v in product(range(2), range(4), range(10), range(20))]).astype(np.int32) 36 | X = np.hstack((np.random.rand(len(X), 1), X)) 37 | 38 | y = np.zeros(len(X)) 39 | mask = (X[:, 1] == 1) 40 | y[mask] = 
np.random.rand(mask.sum()) < 0.5-relevance 41 | y[~mask] = np.random.rand((~mask).sum()) < 0.5+relevance 42 | 43 | indices = np.random.permutation(X.shape[0])[:n_samples] 44 | return X[indices], y[indices].astype(np.int32) 45 | 46 | return X, y 47 | 48 | # Generate all importances 49 | #cls = partial(ExtraTreesClassifier, max_features=1, criterion="entropy") 50 | cls = partial(RandomForestClassifier, max_features=5, criterion="entropy") 51 | 52 | relevances = [0.0, 0.1, 0.2, 0.3] 53 | depths = range(1, 10) 54 | 55 | 56 | for i, relevance in enumerate(relevances): 57 | imp_all = [] 58 | 59 | for n in range(10): 60 | imp = [] 61 | X, y = generate_strobl_power(relevance=relevance) 62 | 63 | for q in depths: 64 | c = partial(cls, max_depth=q) 65 | imp.append(feature_importances(X, y, cls=c)) 66 | 67 | imp = np.array(imp) 68 | imp_all.append(imp) 69 | 70 | imp = np.mean(imp_all, axis=0) 71 | 72 | for q in range(imp.shape[0]): 73 | imp[q] /= np.sum(imp[q, :]) 74 | 75 | plt.subplot(2, 2, i + 1) 76 | 77 | for j in range(X.shape[1]): 78 | plt.plot(depths, imp[:, j], "o-", label="$X_%d$" % j, color=cmap[j]) 79 | 80 | plt.ylim([0., 1.0]) 81 | plt.title("Relevance = %.1f" % relevance) 82 | 83 | if i == 0: 84 | plt.legend(loc="best") 85 | 86 | plt.show() 87 | -------------------------------------------------------------------------------- /scripts/ch3_split_ordered_invariant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | np.random.seed(54) 5 | 6 | blue = (0, 0, 1.0) 7 | red = (1.0, 0, 0) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | n_samples = 10 11 | 12 | X = np.empty(n_samples) 13 | X[:n_samples / 2] = np.sort(np.random.normal(loc=-1.0, size=n_samples / 2)) 14 | X[n_samples / 2:] = np.sort(np.random.normal(loc=1.0, size=n_samples / 2)) 15 | y = np.zeros(n_samples) 16 | y[n_samples / 2:] = 1 17 | 18 | plt.plot([-3,3], [0,0], '-', color='k') 19 | 20 | indices = np.argsort(X) 21 | X_ = X[indices] 22 | for i in range(len(X_) - 1): 23 | s = (X_[i]+X_[i+1]) / 2.0 24 | plt.plot([s,s], [0.00001, 0], ':', color=gray) 25 | 26 | plt.plot([-3,-3], [0.00001, 0], '-', color='k') 27 | 28 | y = y[indices] 29 | 30 | def gini(p): 31 | p1 = 1.0 * np.sum(p) / len(p) 32 | p0 = 1.0 - p1 33 | return p0 * (1 - p0) + p1 * (1 - p1) 34 | 35 | s = [] 36 | delta = [] 37 | 38 | for i in range(1, 8): 39 | i_t = gini(y) 40 | i_t_L = gini(y[:i]) 41 | i_t_R = gini(y[i:]) 42 | p_L = 1.0 * i / n_samples 43 | p_R = 1.0 - p_L 44 | 45 | s.append((X_[i-1] + X_[i]) / 2.0) 46 | delta.append(i_t - p_L * i_t_L - p_R * i_t_R) 47 | 48 | delta = np.array(delta) 49 | delta /= np.max(delta) 50 | delta *= 0.00001 51 | 52 | plt.plot(s, delta, "o-") 53 | 54 | 55 | plt.scatter(X[:n_samples / 2], np.zeros(n_samples / 2), color=blue) 56 | plt.scatter(X[n_samples / 2:], np.zeros(n_samples / 2), color=red) 57 | 58 | s1 = X_[6] 59 | s2 = X_[7] 60 | smid = (s1+s2) / 2.0 61 | 62 | #plt.plot([s2,s2], [0.00001, -0.00001], '-', color=gray) 63 | plt.text(-3, 0.0000105, "$\Delta i(s_j^v, t)$", fontsize=15, horizontalalignment='center') 64 | plt.text(s1, 0.00000095, "$x_{i-1,j}$", fontsize=15, horizontalalignment='center') 65 | plt.text(s2, 0.00000095, "$x_{i,j}$", fontsize=15, horizontalalignment='center') 66 | plt.text(smid, 0.0000105, "$v^\prime_k$", fontsize=15, horizontalalignment='center') 67 | plt.text((smid+(-3)) / 2.0, -0.000001, "${\cal L}^{v^\prime_k}_{t_L}$", fontsize=15, horizontalalignment='center') 68 | plt.text((smid+3) / 2.0, -0.000001, "${\cal 
L}^{v^\prime_k}_{t_R}$", fontsize=15, horizontalalignment='center') 69 | 70 | plt.annotate( 71 | '', xy=(-3, -0.0000015), xycoords = 'data', 72 | xytext = (smid, -0.0000015), textcoords = 'data', 73 | arrowprops = {'arrowstyle':'<->'}) 74 | plt.annotate( 75 | '', xy=(smid, -0.0000015), xycoords = 'data', 76 | xytext = (3, -0.0000015), textcoords = 'data', 77 | arrowprops = {'arrowstyle':'<->'}) 78 | 79 | plt.annotate("$\Delta$", xy=(s[3], delta[3]), xycoords='data', xytext=(s[3]-0.5, delta[3]-0.000001), textcoords='data', arrowprops={'arrowstyle':'->'}) 80 | 81 | plt.text(3, -0.0000007, "$X_j$", fontsize=15) 82 | 83 | plt.show() 84 | 85 | -------------------------------------------------------------------------------- /tex/figures/ch8_rank_large.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(0,-2.5)(7.0,1.5) 7 | \usefont{T1}{ptm}{m}{n} 8 | \psline[linewidth=0.02cm](0.0,1.0)(2.91,1.0) 9 | \psline[linewidth=0.02cm](0.0,1.1)(0.0,0.9) 10 | \psline[linewidth=0.02cm](2.91,1.1)(2.91,0.9) 11 | \rput(1.4505,1.25){CD} 12 | 13 | \psline[linewidth=0.02cm](0.0,0.0)(7.0,0.0) 14 | \psline[linewidth=0.02cm](0.0,0.25)(0.0,0.0) \rput(0.0,0.5){8} 15 | \psline[linewidth=0.02cm](1.0,0.25)(1.0,0.0) \rput(1.0,0.5){7} 16 | \psline[linewidth=0.02cm](2.0,0.25)(2.0,0.0) \rput(2.0,0.5){6} 17 | \psline[linewidth=0.02cm](3.0,0.25)(3.0,0.0) \rput(3.0,0.5){5} 18 | \psline[linewidth=0.02cm](4.0,0.25)(4.0,0.0) \rput(4.0,0.5){4} 19 | \psline[linewidth=0.02cm](5.0,0.25)(5.0,0.0) \rput(5.0,0.5){3} 20 | \psline[linewidth=0.02cm](6.0,0.25)(6.0,0.0) \rput(6.0,0.5){2} 21 | \psline[linewidth=0.02cm](7.0,0.25)(7.0,0.0) \rput(7.0,0.5){1} 22 | 23 | \psline[linewidth=0.02cm](5.62,0.0)(5.62,-1.0) 24 | \psline[linewidth=0.02cm](5.62,-1.0)(7.0,-1.0) 25 | \rput(7.5,-1.0){RS-ET} 26 | 27 | \psline[linewidth=0.02cm](4.85,0.0)(4.85,-1.5) 28 | \psline[linewidth=0.02cm](4.85,-1.5)(7.0,-1.5) 29 | \rput(7.5,-1.5){ET} 30 | 31 | \psline[linewidth=0.02cm](4.47,0.0)(4.47,-2.0) 32 | \psline[linewidth=0.02cm](4.47,-2.0)(7.0,-2.0) 33 | \rput(7.5,-2.0){RS-DT} 34 | 35 | \psline[linewidth=0.02cm](4.16,0.0)(4.16,-2.5) 36 | \psline[linewidth=0.02cm](4.16,-2.5)(7.0,-2.5) 37 | \rput(7.5,-2.5){RP-ET} 38 | 39 | \psline[linewidth=0.02cm](1.39,0.0)(1.39,-1.0) 40 | \psline[linewidth=0.02cm](1.39,-1.0)(0.0,-1.0) 41 | \rput(-0.5,-1.0){P-DT} 42 | 43 | \psline[linewidth=0.02cm](1.77,0.0)(1.77,-1.5) 44 | \psline[linewidth=0.02cm](1.77,-1.5)(0.0,-1.5) 45 | \rput(-0.5,-1.5){RF} 46 | 47 | \psline[linewidth=0.02cm](2.31,0.0)(2.31,-2.0) 48 | \psline[linewidth=0.02cm](2.31,-2.0)(0.0,-2.0) 49 | \rput(-0.5,-2.0){P-ET} 50 | 51 | \psline[linewidth=0.02cm](3.47,0.0)(3.47,-2.5) 52 | \psline[linewidth=0.02cm](3.47,-2.5)(0.0,-2.5) 53 | \rput(-0.5,-2.5){RP-DT} 54 | 55 | \psline[linewidth=0.05cm](5.72,-0.25)(3.37,-0.25) 56 | \psline[linewidth=0.05cm](1.29,-0.5)(4.26,-0.5) 57 | \psline[linewidth=0.05cm](1.67,-0.75)(4.57,-0.75) 58 | \psline[linewidth=0.05cm](2.21,-1.0)(4.95,-1.0) 59 | \end{pspicture} 60 | \end{document} 61 | -------------------------------------------------------------------------------- /tex/figures/ch8_rank_small.tex: -------------------------------------------------------------------------------- 1 | \documentclass{minimal} 2 | \usepackage{pstricks} 3 | \usepackage{pst-plot} 4 | \pagestyle{empty} 5 | \begin{document} 6 | \begin{pspicture}(0,-2.5)(7.0,1.5) 7 | \usefont{T1}{ptm}{m}{n} 8 | 
\psline[linewidth=0.02cm](0.0,1.0)(2.62,1.0) 9 | \psline[linewidth=0.02cm](0.0,1.1)(0.0,0.9) 10 | \psline[linewidth=0.02cm](2.62,1.1)(2.62,0.9) 11 | \rput(1.31,1.25){CD} 12 | 13 | \psline[linewidth=0.02cm](0.0,0.0)(7.0,0.0) 14 | \psline[linewidth=0.02cm](0.0,0.25)(0.0,0.0) \rput(0.0,0.5){8} 15 | \psline[linewidth=0.02cm](1.0,0.25)(1.0,0.0) \rput(1.0,0.5){7} 16 | \psline[linewidth=0.02cm](2.0,0.25)(2.0,0.0) \rput(2.0,0.5){6} 17 | \psline[linewidth=0.02cm](3.0,0.25)(3.0,0.0) \rput(3.0,0.5){5} 18 | \psline[linewidth=0.02cm](4.0,0.25)(4.0,0.0) \rput(4.0,0.5){4} 19 | \psline[linewidth=0.02cm](5.0,0.25)(5.0,0.0) \rput(5.0,0.5){3} 20 | \psline[linewidth=0.02cm](6.0,0.25)(6.0,0.0) \rput(6.0,0.5){2} 21 | \psline[linewidth=0.02cm](7.0,0.25)(7.0,0.0) \rput(7.0,0.5){1} 22 | 23 | \psline[linewidth=0.02cm](5.88,0.0)(5.88,-1.0) 24 | \psline[linewidth=0.02cm](5.88,-1.0)(7.0,-1.0) 25 | \rput(7.5,-1.0){ET} 26 | 27 | \psline[linewidth=0.02cm](5.19,0.0)(5.19,-1.5) 28 | \psline[linewidth=0.02cm](5.19,-1.5)(7.0,-1.5) 29 | \rput(7.5,-1.5){RS-ET} 30 | 31 | \psline[linewidth=0.02cm](5.07,0.0)(5.07,-2.0) 32 | \psline[linewidth=0.02cm](5.07,-2.0)(7.0,-2.0) 33 | \rput(7.5,-2.0){RP-ET} 34 | 35 | \psline[linewidth=0.02cm](4.25,0.0)(4.25,-2.5) 36 | \psline[linewidth=0.02cm](4.25,-2.5)(7.0,-2.5) 37 | \rput(7.5,-2.5){P-ET} 38 | 39 | \psline[linewidth=0.02cm](0.94,0.0)(0.94,-1.0) 40 | \psline[linewidth=0.02cm](0.94,-1.0)(0.0,-1.0) 41 | \rput(-0.5,-1.0){P-DT} 42 | 43 | \psline[linewidth=0.02cm](1.88,0.0)(1.88,-1.5) 44 | \psline[linewidth=0.02cm](1.88,-1.5)(0.0,-1.5) 45 | \rput(-0.5,-1.5){RS-DT} 46 | 47 | \psline[linewidth=0.02cm](2.13,0.0)(2.13,-2.0) 48 | \psline[linewidth=0.02cm](2.13,-2.0)(0.0,-2.0) 49 | \rput(-0.5,-2.0){RF} 50 | 51 | \psline[linewidth=0.02cm](2.69,0.0)(2.69,-2.5) 52 | \psline[linewidth=0.02cm](2.69,-2.5)(0.0,-2.5) 53 | \rput(-0.5,-2.5){RP-DT} 54 | 55 | \psline[linewidth=0.05cm](5.98,-0.25)(4.15,-0.25) 56 | \psline[linewidth=0.05cm](0.84,-0.25)(2.79,-0.25) 57 | \psline[linewidth=0.05cm](1.78,-0.5)(4.35,-0.5) 58 | \psline[linewidth=0.05cm](2.59,-0.75)(5.29,-0.75) 59 | \end{pspicture} 60 | \end{document} 61 | -------------------------------------------------------------------------------- /tex/frontback/abstract.tex: -------------------------------------------------------------------------------- 1 | % Abstract ==================================================================== 2 | 3 | \pdfbookmark[1]{Abstract}{Abstract} 4 | \chapter*{Abstract} 5 | 6 | Data analysis and machine learning have become an integral part of the 7 | modern scientific methodology, offering automated procedures for the prediction 8 | of a phenomenon based on past observations, unraveling underlying patterns in 9 | data and providing insights about the problem. Yet, care should be taken 10 | not to use machine learning as a black-box tool, but rather to consider it as a 11 | methodology, with a rational thought process that is entirely dependent on the 12 | problem under study. In particular, the use of algorithms 13 | should ideally rely on a reasonable understanding of their 14 | mechanisms, properties and limitations, in order to better apprehend and 15 | interpret their results. 16 | 17 | Accordingly, the goal of this thesis is to provide an in-depth 18 | analysis of random forests, consistently calling into 19 | question each and every part of the algorithm, in order to shed new light on 20 | its learning capabilities, inner workings and interpretability.
The first 21 | part of this work studies the induction of decision trees and the construction of 22 | ensembles of randomized trees, motivating their design and purpose whenever 23 | possible. Our contributions follow with an original complexity 24 | analysis of random forests, showing their good computational performance 25 | and scalability, along with an in-depth discussion of their 26 | implementation details, as contributed within Scikit-Learn. 27 | 28 | In the second part of this work, we analyze and discuss the interpretability of 29 | random forests through the lens of variable importance measures. The core of our 30 | contributions rests in the theoretical characterization of the Mean Decrease of 31 | Impurity variable importance measure, from which we prove and derive some of 32 | its properties in the case of multiway totally randomized trees and in 33 | asymptotic conditions. As a consequence of this work, our analysis demonstrates 34 | that variable importances as computed from non-totally randomized trees (e.g., 35 | standard Random Forest) suffer from a combination of defects, due to masking 36 | effects, to misestimations of node impurity, or to the binary structure of 37 | decision trees. 38 | 39 | Finally, the last part of this dissertation addresses limitations of random 40 | forests in the context of large datasets. Through extensive experiments, we 41 | show that subsampling both samples and features simultaneously provides on-par 42 | performance while at the same time lowering the memory requirements. Overall, 43 | this paradigm highlights an intriguing practical fact: there is often no need 44 | to build single models over immensely large datasets. Good performance can 45 | often be achieved by building models on (very) small random parts of the data 46 | and then combining them all in an ensemble, thereby avoiding all practical 47 | burdens of making large data fit into memory.
48 | -------------------------------------------------------------------------------- /scripts/ch7_bias_null.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import brewer2mpl 5 | 6 | from itertools import product 7 | from functools import partial 8 | from demo import entropy 9 | 10 | from sklearn.ensemble import ExtraTreesClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble 13 | 14 | def feature_importances(X, y, cls, n_trees=500): 15 | clf = cls(n_estimators=n_trees).fit(X, y) 16 | 17 | if isinstance(clf, RandomizedID3Ensemble): 18 | imp = np.sum(clf.feature_importances_, axis=1) 19 | 20 | else: 21 | imp = np.zeros(X.shape[1]) 22 | 23 | for tree in clf.estimators_: 24 | imp += tree.tree_.compute_feature_importances(normalize=False) 25 | 26 | imp = imp / n_trees 27 | 28 | return imp 29 | 30 | def generate_strobl_null(n_samples=120): 31 | X = np.array([v for v in product(range(2), 32 | range(4), 33 | range(10), 34 | range(20), 35 | range(2))]).astype(np.int32) 36 | X, y = X[:, :-1], X[:, -1] 37 | 38 | indices = np.random.randint(0, X.shape[0], n_samples) 39 | X, y = X[indices], y[indices].astype(np.int32) 40 | X = np.hstack((np.random.rand(len(X), 1), X)) 41 | 42 | return X, y 43 | 44 | # Generate all importances 45 | models = [("TRT", partial(RandomizedID3Ensemble, base_estimator=RandomizedID3Classifier(k=1))), 46 | ("ETs K=1", partial(ExtraTreesClassifier, max_features=1, criterion="entropy")), 47 | ("ETs K=3", partial(ExtraTreesClassifier, max_features=3, criterion="entropy")), 48 | ("ETs K=5", partial(ExtraTreesClassifier, max_features=5, criterion="entropy")), 49 | ("RF K=1", partial(RandomForestClassifier, max_features=1, bootstrap=True, criterion="entropy")), 50 | ("RF K=3", partial(RandomForestClassifier, max_features=3, bootstrap=True, criterion="entropy")), 51 | ("RF K=5", partial(RandomForestClassifier, max_features=5, bootstrap=True, criterion="entropy")),] 52 | 53 | n_repeat = 5 54 | r = {} 55 | 56 | for i in range(n_repeat): 57 | print "Iteration", i 58 | 59 | X, y = generate_strobl_null(n_samples=120) 60 | print entropy(y) 61 | 62 | for name, cls in models: 63 | f = feature_importances(X, y, cls=cls, n_trees=500) 64 | 65 | if i == 0: 66 | r[name] = np.array(f) 67 | else: 68 | r[name] += np.array(f) 69 | 70 | print name, np.sum(f) 71 | 72 | for name in r: 73 | r[name] /= n_repeat 74 | 75 | # Convert to pandas and plot 76 | df = pd.DataFrame(r, index=["X%d" % (i+1) for i in range(X.shape[1])]) 77 | df = df.reindex_axis([name for name, _ in models], axis=1) 78 | 79 | import brewer2mpl 80 | cmap = brewer2mpl.get_map('RdYlGn', 'diverging', len(r)) 81 | df.plot(kind="bar", colormap=cmap.mpl_colormap, legend="best", grid=False) 82 | plt.show() 83 | -------------------------------------------------------------------------------- /scripts/ch4_correlation_plot.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | blue = (0, 0, 1.0) 6 | green = (0, 0.8, 0) 7 | red = (1.0, 0, 0) 8 | red_alpha = (1.0, 0, 0, 0.001) 9 | gray = (0.7, 0.7, 0.7) 10 | 11 | results = [[],[], 12 | ["RandomForestRegressor-K=1",3.527128,2.820386,0.706743,0.063868,0.009973,0.286104,0.420639], 13 | ["RandomForestRegressor-K=2",3.036291,2.333874,0.702417,0.075537,0.011347,0.314841,0.387576], 14 
| ["RandomForestRegressor-K=3",2.823907,2.109897,0.714009,0.087809,0.012335,0.349486,0.364523], 15 | ["RandomForestRegressor-K=4",2.715613,1.979086,0.736527,0.102472,0.014302,0.391750,0.344778], 16 | ["RandomForestRegressor-K=5",2.643232,1.887080,0.756151,0.111790,0.015411,0.421380,0.334772], 17 | ["RandomForestRegressor-K=6",2.642354,1.851498,0.790856,0.125342,0.016268,0.466556,0.324300], 18 | ["RandomForestRegressor-K=7",2.636296,1.822316,0.813980,0.134200,0.017159,0.495746,0.318234], 19 | ["RandomForestRegressor-K=8",2.623646,1.784344,0.839303,0.146081,0.018631,0.531100,0.308202], 20 | ["RandomForestRegressor-K=9",2.645439,1.780447,0.864992,0.152977,0.019492,0.558601,0.306390], 21 | ["RandomForestRegressor-K=10",2.638901,1.753437,0.885464,0.160371,0.020184,0.583494,0.301970], 22 | ["ExtraTreesRegressor-K=1",3.376099,2.723586,0.652514,0.051864,0.009532,0.230752,0.421761], 23 | ["ExtraTreesRegressor-K=2",2.801100,2.146534,0.654566,0.060858,0.011926,0.258086,0.396480], 24 | ["ExtraTreesRegressor-K=3",2.536644,1.886837,0.649807,0.067322,0.012756,0.273424,0.376383], 25 | ["ExtraTreesRegressor-K=4",2.409943,1.745583,0.664360,0.076519,0.016511,0.302962,0.361399], 26 | ["ExtraTreesRegressor-K=5",2.330165,1.651706,0.678459,0.086137,0.017063,0.331515,0.346944], 27 | ["ExtraTreesRegressor-K=6",2.285386,1.597063,0.688323,0.092147,0.019216,0.349667,0.338655], 28 | ["ExtraTreesRegressor-K=7",2.263983,1.553772,0.710211,0.100322,0.020510,0.378116,0.332094], 29 | ["ExtraTreesRegressor-K=8",2.246997,1.528167,0.718831,0.107167,0.021703,0.396323,0.322507], 30 | ["ExtraTreesRegressor-K=9",2.236845,1.495768,0.741077,0.115699,0.023020,0.423894,0.317183], 31 | ["ExtraTreesRegressor-K=10",2.232862,1.469781,0.763081,0.123849,0.024420,0.451778,0.311304]] 32 | 33 | max_features = range(1, 10+1) 34 | 35 | ax = plt.subplot(1, 2, 1) 36 | plt.plot(max_features, [results[1+k][1] for k in max_features], 'o-', color=blue, label='Random Forest') 37 | plt.plot(max_features, [results[1+k][2] for k in max_features], 'o--', color=blue) 38 | plt.plot(max_features, [results[1+k][3] for k in max_features], 'o:', color=blue) 39 | plt.plot(max_features, [results[11+k][1] for k in max_features], 'o-', color=red, label='Extremely Randomized Trees') 40 | plt.plot(max_features, [results[11+k][2] for k in max_features], 'o--', color=red) 41 | plt.plot(max_features, [results[11+k][3] for k in max_features], 'o:', color=red) 42 | plt.legend(loc="best") 43 | plt.xlabel("$K$") 44 | 45 | plt.subplot(1, 2, 2, sharex=ax) 46 | plt.plot(max_features, [results[1+k][4] for k in max_features], 'o-', color=blue) 47 | plt.plot(max_features, [results[11+k][4] for k in max_features], 'o-', color=red) 48 | plt.xlabel("$K$") 49 | plt.ylabel("$\\rho$") 50 | 51 | plt.show() 52 | -------------------------------------------------------------------------------- /scripts/ch3_impurity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | blue = (0, 0, 1.0) 5 | red = (1.0, 0, 0) 6 | gray = (0.7, 0.7, 0.7) 7 | 8 | # Criterion 9 | 10 | def impurity_error(p1, p2): 11 | return min(p1, p2) 12 | 13 | def impurity_entropy(p1, p2): 14 | if p1 == 0.0 or p1 == 1.0 or p2 == 0.0 or p2 == 1.0: 15 | return 0.0 16 | else: 17 | return -(p1 * np.log2(p1) + p2 * np.log2(p2)) 18 | 19 | def impurity_gini(p1, p2): 20 | return p1 * (1 - p1) + p2 * (1 - p2) 21 | 22 | # Split 23 | 24 | def p(y_t): 25 | return 1.0 * y_t / np.sum(y_t) 26 | 27 | impurity = impurity_gini 28 | y_t = 
np.array([2, 8], dtype=np.float) 29 | y_t_L = np.array([1, 5], dtype=np.float) 30 | y_t_R = y_t - y_t_L 31 | p_y_t = p(y_t) 32 | p_y_t_L = p(y_t_L) 33 | p_y_t_R = p(y_t_R) 34 | 35 | p_L = y_t_L.sum() / y_t.sum() 36 | p_R = y_t_R.sum() / y_t.sum() 37 | 38 | i_t = impurity(*p_y_t) 39 | i_t_L = impurity(*p_y_t_L) 40 | i_t_R = impurity(*p_y_t_R) 41 | 42 | print "Delta i(s, t) = i(t) - p_L * i(t_L) - p_R * i (t_R)" 43 | print " = %f - %f * %f - %f * %f" % (i_t, p_L, i_t_L, p_R, i_t_R) 44 | print " = %f" % (i_t - p_L * i_t_L - p_R * i_t_R, ) 45 | 46 | 47 | fig = plt.figure() 48 | ax = fig.add_subplot(111) 49 | 50 | x = np.linspace(0.0, 1.0, num=300) 51 | # ax.plot(x, map(impurity, x, 1-x), label="entropy", color=blue) 52 | ax.plot(x, map(impurity_error, x, 1-x), label="$i_E(t)$", color=gray) 53 | ax.plot(x, map(impurity_entropy, x, 1-x), label="$i_H(t)$", color=blue) 54 | ax.plot(x, map(impurity_gini, x, 1-x), label="$i_G(t)$", color=red) 55 | ax.legend(loc="best") 56 | plt.show() 57 | 58 | ax.plot(p_y_t[0], i_t, marker="o", color=red) 59 | ax.plot(p_y_t_L[0], i_t_L, marker="o", color=red) 60 | ax.plot(p_y_t_R[0], i_t_R, marker="o", color=red) 61 | 62 | ax.plot((p_y_t[0], p_y_t[0]), (0, i_t), ":", color=gray) 63 | ax.plot((0, p_y_t[0]), (i_t, i_t), ":", color=gray) 64 | ax.annotate("$i(t)$", xy=(0, i_t), xytext=(0+0.01, i_t), va="center") 65 | ax.annotate("$p(c_1|t)$", xy=(p_y_t[0], 0), xytext=(p_y_t[0], 0+0.025), ha="center") 66 | 67 | ax.plot((p_y_t_L[0], p_y_t_L[0]), (0, i_t_L), ":", color=gray) 68 | ax.plot((0, p_y_t_L[0]), (i_t_L, i_t_L), ":", color=gray) 69 | ax.annotate("$i(t_L)$", xy=(0, i_t_L), xytext=(0+0.01, i_t_L), va="center") 70 | ax.annotate("$p(c_1|t_L)$", xy=(p_y_t_L[0], 0), xytext=(p_y_t_L[0], 0+0.025), ha="center") 71 | 72 | ax.plot((p_y_t_R[0], p_y_t_R[0]), (0, i_t_R), ":", color=gray) 73 | ax.plot((0, p_y_t_R[0]), (i_t_R, i_t_R), ":", color=gray) 74 | ax.annotate("$i(t_R)$", xy=(0, i_t_R), xytext=(0+0.01, i_t_R), va="center") 75 | ax.annotate("$p(c_1|t_R)$", xy=(p_y_t_R[0], 0), xytext=(p_y_t_R[0], 0+0.025), ha="center") 76 | 77 | ax.plot((p_y_t_L[0], p_y_t_R[0]), (i_t_L, i_t_R), "-", color=gray) 78 | ax.plot((p_y_t[0], p_y_t[0]), (i_t, p_L * i_t_L + p_R * i_t_R), "-", color=red) 79 | ax.plot(p_y_t[0], p_L * i_t_L + p_R * i_t_R, marker="o", color=gray) 80 | ax.annotate("$\Delta i(s, t) = %.3f$" % abs(i_t - p_L * i_t_L - p_R * i_t_R), xy=(p_y_t[0], i_t - 0.5*(i_t - p_L * i_t_L - p_R * i_t_R)), xytext=(p_y_t[0]+0.05, i_t - 0.5*(i_t - p_L * i_t_L - p_R * i_t_R)), arrowprops=dict(arrowstyle="->"), va="center") 81 | 82 | #ax.legend(loc="best") 83 | plt.show() 84 | -------------------------------------------------------------------------------- /benchmarks/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sklearn.utils import check_random_state 4 | 5 | 6 | def make_waveforms(n_samples=300, random_state=None): 7 | """Make the waveforms dataset. (CART)""" 8 | random_state = check_random_state(random_state) 9 | 10 | def h1(x): 11 | if x < 7: 12 | return x 13 | elif x < 13: 14 | return 13.-x 15 | else: 16 | return 0. 17 | 18 | def h2(x): 19 | if x < 9: 20 | return 0. 21 | elif x < 15: 22 | return x-9. 23 | else: 24 | return 21.-x 25 | 26 | def h3(x): 27 | if x < 5: 28 | return 0. 29 | elif x < 11: 30 | return x-5. 31 | elif x < 17: 32 | return 17.-x 33 | else: 34 | return 0. 
35 | 36 | u = random_state.rand(n_samples) 37 | y = random_state.randint(low=0, high=3, size=n_samples) 38 | X = random_state.normal(size=(n_samples, 21)) 39 | 40 | for i in range(n_samples): 41 | if y[i] == 0: 42 | ha = h1 43 | hb = h2 44 | elif y[i] == 1: 45 | ha = h1 46 | hb = h3 47 | else: 48 | ha = h2 49 | hb = h3 50 | 51 | for m in np.arange(1, 21+1): 52 | X[i, m-1] += u[i] * ha(m) + (1 - u[i]) * hb(m) 53 | 54 | return X, y 55 | 56 | 57 | def make_ringnorm(n_samples=300, random_state=None): 58 | """Make the ring-norm dataset. (Breiman, Tech. report 460.)""" 59 | random_state = check_random_state(random_state) 60 | a = 1. / 20.**0.5 61 | 62 | y = random_state.randint(low=0, high=2, size=n_samples) 63 | X = np.zeros((n_samples, 20)) 64 | 65 | negatives = (y == 0) 66 | positives = (y == 1) 67 | 68 | X[negatives] = random_state.multivariate_normal(mean=np.zeros(20), cov=4.*np.eye(20), size=negatives.sum()) 69 | X[positives] = random_state.normal(loc=[a]*20, size=(positives.sum(), 20)) 70 | 71 | return X, y 72 | 73 | 74 | def make_twonorm(n_samples=300, random_state=None): 75 | """Make the two-norm dataset. (Breiman, Tech. report 460.)""" 76 | random_state = check_random_state(random_state) 77 | a = 2. / 20.**0.5 78 | 79 | y = random_state.randint(low=0, high=2, size=n_samples) 80 | X = np.zeros((n_samples, 20)) 81 | 82 | negatives = (y == 0) 83 | positives = (y == 1) 84 | 85 | X[negatives] = random_state.normal(loc=[a]*20, size=(negatives.sum(), 20)) 86 | X[positives] = random_state.normal(loc=[-a]*20, size=(positives.sum(), 20)) 87 | 88 | return X, y 89 | 90 | def make_threenorm(n_samples=300, random_state=None): 91 | """Make the three-norm dataset. (Breiman, Tech. report 460.)""" 92 | random_state = check_random_state(random_state) 93 | a = 2. / 20.**0.5 94 | 95 | y = random_state.randint(low=0, high=4, size=n_samples) 96 | X = np.zeros((n_samples, 20)) 97 | 98 | class0 = (y == 0) 99 | class1 = (y == 1) 100 | class2 = (y >= 2) 101 | 102 | X[class0] = random_state.normal(loc=[a]*20, size=(class0.sum(), 20)) 103 | X[class1] = random_state.normal(loc=[-a]*20, size=(class1.sum(), 20)) 104 | X[class2] = random_state.normal(loc=[a,-a]*10, size=(class2.sum(), 20)) 105 | 106 | y[class0] = 0 107 | y[class1] = 0 108 | y[class2] = 1 109 | 110 | return X, y 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Understanding Random Forests 2 | ============================ 3 | 4 | PhD dissertation, Gilles Louppe, July 2014. Defended on October 9, 2014. 5 | 6 | _arXiv:_ http://arxiv.org/abs/1407.7502 7 | 8 | _Mirrors:_ 9 | - http://hdl.handle.net/2268/170309 10 | - http://www.montefiore.ulg.ac.be/~glouppe/pdf/phd-thesis.pdf 11 | 12 | _License:_ BSD 3 clause 13 | 14 | _Contact:_ Gilles Louppe ([@glouppe](https://twitter.com/glouppe/)) 15 | 16 | Please cite using the following BibTeX entry: 17 | 18 | ``` 19 | @phdthesis{louppe2014understanding, 20 | title={Understanding Random Forests: From Theory to Practice}, 21 | author={Louppe, Gilles}, 22 | school={University of Liege, Belgium}, 23 | year=2014, 24 | month=10, 25 | note={arXiv:1407.7502} 26 | } 27 | ``` 28 | 29 | --- 30 | 31 | Data analysis and machine learning have become an integral part of the 32 | modern scientific methodology, offering automated procedures for the prediction 33 | of a phenomenon based on past observations, unraveling underlying patterns in 34 | data and providing insights about the problem.
Yet, care should be taken 35 | not to use machine learning as a black-box tool, but rather to consider it as a 36 | methodology, with a rational thought process that is entirely dependent on the 37 | problem under study. In particular, the use of algorithms 38 | should ideally require a reasonable understanding of their 39 | mechanisms, properties and limitations, in order to better apprehend and 40 | interpret their results. 41 | 42 | Accordingly, the goal of this thesis is to provide an in-depth 43 | analysis of random forests, consistently calling into 44 | question each and every part of the algorithm, in order to shed new light on 45 | its learning capabilities, inner workings and interpretability. The first 46 | part of this work studies the induction of decision trees and the construction of 47 | ensembles of randomized trees, motivating their design and purpose whenever 48 | possible. Our contributions follow with an original complexity 49 | analysis of random forests, showing their good computational performance 50 | and scalability, along with an in-depth discussion of their 51 | implementation details, as contributed within Scikit-Learn. 52 | 53 | In the second part of this work, we analyze and discuss the interpretability of 54 | random forests through the lens of variable importance measures. The core of our 55 | contributions rests in the theoretical characterization of the Mean Decrease of 56 | Impurity variable importance measure, from which we prove and derive some of 57 | its properties in the case of multiway totally randomized trees and in 58 | asymptotic conditions. As a consequence of this work, our analysis demonstrates 59 | that variable importances as computed from non-totally randomized trees (e.g., 60 | standard Random Forest) suffer from a combination of defects, due to masking 61 | effects, misestimation of node impurity, or the binary structure of 62 | decision trees. 63 | 64 | Finally, the last part of this dissertation addresses limitations of random 65 | forests in the context of large datasets. Through extensive experiments, we 66 | show that subsampling both samples and features simultaneously provides on-par 67 | performance while at the same time lowering the memory requirements. Overall, 68 | this paradigm highlights an intriguing practical fact: there is often no need 69 | to build single models over immensely large datasets. Good performance can 70 | often be achieved by building models on (very) small random parts of the data 71 | and then combining them all in an ensemble, thereby avoiding all practical 72 | burdens of making large data fit into memory.
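To make this last point concrete, here is a minimal sketch of the subsampling paradigm using scikit-learn's `BaggingClassifier` (the dataset, subsampling rates and parameter values below are illustrative only, not those used in the thesis):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

# A synthetic stand-in for a "large" dataset.
X, y = make_classification(n_samples=20000, n_features=50, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Each base tree sees only 5% of the samples and half of the features,
# so no single model is ever fit on the full dataset.
ensemble = BaggingClassifier(
    n_estimators=100,
    max_samples=0.05,   # random subsample of the rows
    max_features=0.5,   # random subsample of the columns
    bootstrap=False,    # sample without replacement
    random_state=0,
).fit(X_train, y_train)

print(ensemble.score(X_test, y_test))
```

Each of the 100 trees is fit on roughly 750 of the 15,000 training rows, yet the aggregated vote typically recovers most of the accuracy of an ensemble trained on everything.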
73 | -------------------------------------------------------------------------------- /tex/thesis.tex: -------------------------------------------------------------------------------- 1 | \documentclass[twoside,openright,titlepage,numbers=noenddot,headinclude,% 2 | footinclude=true,cleardoublepage=empty,abstractoff,BCOR=5mm,% 3 | paper=a4,fontsize=11pt,ngerman,american]{scrreprt} 4 | 5 | % Custom config =============================================================== 6 | 7 | % Classic thesis 8 | \usepackage{amssymb} 9 | \input{classicthesis-config} 10 | 11 | % Theorems and definitions 12 | \usepackage{amsthm} 13 | \newtheorem{theorem}{Theorem} 14 | \newtheorem{lemma}[theorem]{Lemma} 15 | \newtheorem{proposition}[theorem]{Proposition} 16 | \newtheorem{corollary}[theorem]{Corollary} 17 | \newtheorem{definition}{Definition} 18 | 19 | \newtheorem{algorithm}{Algorithm} 20 | \usepackage{algpseudocode} 21 | 22 | % Counters 23 | \renewcommand{\labelenumi}{{\color{halfgray}(\alph{enumi})}} 24 | \renewcommand{\labelenumii}{\color{halfgray}{\roman{enumii}.}} 25 | \renewcommand{\labelitemi}{{\color{halfgray}-}}%\raisebox{0.3ex}{\tiny$\blacksquare$}}} 26 | 27 | \numberwithin{theorem}{chapter} 28 | \numberwithin{definition}{chapter} 29 | \numberwithin{algorithm}{chapter} 30 | \numberwithin{figure}{chapter} 31 | \numberwithin{table}{chapter} 32 | 33 | % Maths 34 | \DeclareMathOperator*{\argmin}{arg\,min} 35 | \DeclareMathOperator*{\argmax}{arg\,max} 36 | 37 | \numberwithin{equation}{chapter} 38 | \allowdisplaybreaks 39 | 40 | % Shaded boxes 41 | \usepackage{framed} 42 | \newenvironment{remark}[1]{% 43 | \definecolor{shadecolor}{gray}{0.9}% 44 | \begin{shaded}{\color{Maroon}\noindent\textsc{#1}}\\% 45 | }{% 46 | \end{shaded}% 47 | } 48 | 49 | % Code snippets 50 | \usepackage{minted} 51 | \definecolor{rulecolor}{rgb}{0.80,0.80,0.80} 52 | \definecolor{bgcolor}{rgb}{1.0,1.0,1.0} 53 | \newminted{python}{bgcolor=bgcolor} 54 | 55 | % Todo 56 | \newcommand{\todo}[1]{\textcolor{red}{[TODO] #1}} 57 | 58 | % PS pictures 59 | \usepackage{pstricks,auto-pst-pdf} 60 | 61 | % Landscape tables 62 | \usepackage{rotating} 63 | 64 | % Checkmarks 65 | \usepackage{pifont}% http://ctan.org/pkg/pifont 66 | \newcommand{\cmark}{\ding{51}}% 67 | \newcommand{\xmark}{\ding{55}}% 68 | 69 | % Wide tables 70 | \usepackage{ltablex} 71 | 72 | 73 | % ----------------------------------------------------------------------------- 74 | 75 | \begin{document} 76 | \frenchspacing 77 | \raggedbottom 78 | \selectlanguage{american} 79 | \pagenumbering{roman} 80 | \pagestyle{plain} 81 | 82 | 83 | % Front pages ================================================================= 84 | \include{frontback/titlepage} 85 | %\cleardoublepage\include{frontback/disclaimer} 86 | \cleardoublepage\include{frontback/jury} 87 | \cleardoublepage\include{frontback/abstract} 88 | %\cleardoublepage\include{frontback/publications} 89 | \cleardoublepage\include{frontback/acknowledgments} 90 | \pagestyle{scrheadings} 91 | \cleardoublepage\include{frontback/toc} 92 | 93 | 94 | % Content ===================================================================== 95 | \pagenumbering{arabic} 96 | 97 | \cleardoublepage 98 | \include{chapters/chapter01}\cleardoublepage 99 | 100 | %\ctparttex{} 101 | \part{Growing Decision Trees}\label{part:1} 102 | \include{chapters/chapter02}\cleardoublepage 103 | \include{chapters/chapter03}\cleardoublepage 104 | \include{chapters/chapter04}\cleardoublepage 105 | \include{chapters/chapter05}\cleardoublepage 106 | 107 | %\ctparttex{} 108 | 
\part{Interpreting Random Forests}\label{part:2} 109 | \include{chapters/chapter06}\cleardoublepage 110 | \include{chapters/chapter07}\cleardoublepage 111 | 112 | %\ctparttex{} 113 | \part{Subsampling data}\label{part:3} 114 | \include{chapters/chapter08} 115 | 116 | \addtocontents{toc}{\protect\vspace*{\baselineskip}\protect} 117 | \cleardoublepage 118 | \makeatletter 119 | \def\toclevel@chapter{-1} 120 | \makeatother 121 | \include{chapters/chapter09} 122 | 123 | 124 | % Back pages ================================================================== 125 | \appendix 126 | \cleardoublepage 127 | \part{Appendix} 128 | 129 | \cleardoublepage\include{frontback/notations} 130 | \cleardoublepage\include{frontback/bibliography} 131 | 132 | 133 | \end{document} 134 | -------------------------------------------------------------------------------- /scripts/ch4_correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from functools import partial 4 | from sklearn.utils import check_random_state 5 | 6 | 7 | def make(n_samples, n_features=5, noise_features=5, random_state=None): 8 | X = check_random_state(random_state).normal(size=(n_samples, n_features+noise_features)) 9 | y = np.sum(X[:, :n_features], axis=1) 10 | return X, y 11 | 12 | 13 | # from sklearn.datasets import make_friedman1 as make 14 | # # make = partial(make,) 15 | 16 | from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor #, PERTRegressor 17 | from sklearn.tree import DecisionTreeRegressor 18 | from sklearn.metrics import mean_squared_error 19 | 20 | n_train = 50 21 | n_test = 600 22 | n_estimators = 10 # number of trees per forest 23 | n_sets = 100 # number of learning sets 24 | n_trees = 50 # number of trees per learning set, for estimating statistics 25 | 26 | # The block below referenced PERTRegressor, which is not imported above (see
# the commented import) and would raise a NameError if executed; it is
# superseded by the list rebuilt just below, so it is kept commented out for
# reference only.
# estimators = [("PERTRegressor", PERTRegressor), 27 | #               ("Bagging", partial(RandomForestRegressor, max_features=1.0, bootstrap=True))] 28 | # estimators.extend([("RandomForestRegressor-K=%d" % i, partial(RandomForestRegressor, max_features=i)) for i in range(1, 10+1)]) 29 | # estimators.extend([("ExtraTreesRegressor-K=%d" % i, partial(ExtraTreesRegressor, max_features=i)) for i in range(1, 10+1)]) 30 | 31 | estimators = [] 32 | estimators.extend([("RandomForestRegressor-M=%d" % i, partial(RandomForestRegressor, n_estimators=i, max_features=1)) for i in range(1, 50+1)]) 33 | #estimators.extend([("ExtraTreesRegressor-M=%d" % i, partial(ExtraTreesRegressor, n_estimators=i, max_features=1)) for i in range(1, 50+1)]) 34 | 35 | 36 | train = [make(n_samples=n_train, random_state=i) for i in range(n_sets)] 37 | X_test, y_test = make(n_samples=n_test) 38 | 39 | for m in range(1, 50+1): 40 | n_estimators = m 41 | estimator = partial(RandomForestRegressor, n_estimators=m, max_features=1) 42 | method = "RandomForestRegressor-M=%d" % m 43 | 44 | # Compute bias/variance on forest predictions 45 | forests = [] 46 | 47 | for k, (X_train, y_train) in enumerate(train): 48 | #forests.append(estimator(n_estimators=n_estimators, random_state=k).fit(X_train, y_train)) 49 | forests.append(estimator(random_state=k).fit(X_train, y_train)) 50 | 51 | pred_forest = np.zeros((n_test, n_sets)) 52 | 53 | error = 0.0 54 | for k, forest in enumerate(forests): 55 | pred_forest[:, k] = forest.predict(X_test) 56 | error += mean_squared_error(y_test, pred_forest[:, k]) 57 | error /= n_sets 58 | 59 | bias_forest = (y_test - np.mean(pred_forest, axis=1)) ** 2 60 | var_forest = np.var(pred_forest,
axis=1) 61 | 62 | # Estimate bias/variance from tree predictions 63 | trees = [] 64 | 65 | for k, (X_train, y_train) in enumerate(train): 66 | #trees.extend(estimator(n_estimators=n_trees, random_state=n_sets+k).fit(X_train, y_train).estimators_) 67 | trees.extend(RandomForestRegressor(n_estimators=n_trees, max_features=1, random_state=n_sets+k).fit(X_train, y_train).estimators_) 68 | 69 | pred_trees = np.zeros((n_test, n_sets * n_trees)) 70 | 71 | for m, tree in enumerate(trees): 72 | pred_trees[:, m] = tree.predict(X_test) 73 | 74 | mu = np.mean(pred_trees, axis=1) 75 | sigma = np.var(pred_trees, axis=1) 76 | rho = np.zeros(n_test) 77 | 78 | for i in range(n_test): 79 | e_prod = 0.0 80 | for k in range(n_sets): 81 | p = pred_trees[i, k*n_trees:(k+1)*n_trees] 82 | p = p.reshape((n_trees, 1)) 83 | e_prod += np.dot(p, p.T).mean() 84 | e_prod /= n_sets 85 | rho[i] = (e_prod - mu[i]**2) / sigma[i] 86 | 87 | bias = (y_test - mu) ** 2 88 | var = rho * sigma + (1 - rho) / n_estimators * sigma 89 | 90 | print "%s,%f,%f,%f,%f,%f,%f,%f" % (method, bias.mean()+var.mean(), bias.mean(), var.mean(), rho.mean(), rho.std(), (rho*sigma).mean(), ((1 - rho) / n_estimators * sigma).mean()) 91 | 92 | # print "%f (error) = %f (b^2) + %f (var)" % (error, bias_forest.mean(), var_forest.mean()) 93 | # print "%f (error) = %f (b^2) + %f (rho*sigma + (1-rho)/M*sigma)" % (bias.mean()+var.mean(), bias.mean(), var.mean()) 94 | # print "var = %f (rho*sigma) + %f (1-rho)/M*sigma ; rho = %f" % ((rho*sigma).mean(), ((1 - rho) / n_estimators * sigma).mean(), rho.mean()) 95 | # print "---" 96 | -------------------------------------------------------------------------------- /scripts/ch4_overfitting.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | blue = (0, 0, 1.0) 5 | green = (0, 0.8, 0) 6 | red = (1.0, 0, 0) 7 | red_alpha = (1.0, 0, 0, 0.001) 8 | gray = (0.7, 0.7, 0.7) 9 | 10 | # Settings 11 | n_repeat = 100 # Number of iterations for computing expectations 12 | n_train = 30 # Size of the training set 13 | n_test = 1000 # Size of the test set 14 | noise = 0.1**0.5 # Standard deviation of the noise 15 | np.random.seed(0) 16 | 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.preprocessing import PolynomialFeatures 19 | from sklearn.linear_model import LinearRegression 20 | 21 | estimators = [("Degree = 1", Pipeline([("polynomial_features", PolynomialFeatures(degree=1, include_bias=False)), ("linear_regression", LinearRegression())])), 22 | ("Degree = 5", Pipeline([("polynomial_features", PolynomialFeatures(degree=5, include_bias=False)), ("linear_regression", LinearRegression())])), 23 | ("Degree = 15", Pipeline([("polynomial_features", PolynomialFeatures(degree=15, include_bias=False)), ("linear_regression", LinearRegression())])),] 24 | 25 | n_estimators = len(estimators) 26 | 27 | # Generate data 28 | def f(x): 29 | x = x.ravel() 30 | 31 | return np.cos(2.5 * np.pi * x) 32 | 33 | def generate(n_samples, noise, n_repeat=1): 34 | X = np.random.rand(n_samples) 35 | X = np.sort(X) 36 | 37 | if n_repeat == 1: 38 | y = f(X) + np.random.normal(0.0, noise, n_samples) 39 | else: 40 | y = np.zeros((n_samples, n_repeat)) 41 | 42 | for i in range(n_repeat): 43 | y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples) 44 | 45 | X = X.reshape((n_samples, 1)) 46 | 47 | return X, y 48 | 49 | X_train = [] 50 | y_train = [] 51 | 52 | for i in range(n_repeat): 53 | X, y = generate(n_samples=n_train, noise=noise) 54 | 
X_train.append(X) 55 | y_train.append(y) 56 | 57 | X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat) 58 | 59 | plt.figure(figsize=(14, 8)) 60 | 61 | # Loop over estimators to compare 62 | for n, (name, estimator) in enumerate(estimators): 63 | # Compute predictions 64 | y_predict = np.zeros((n_test, n_repeat)) 65 | 66 | for i in xrange(n_repeat): 67 | estimator.fit(X_train[i], y_train[i]) 68 | y_predict[:, i] = estimator.predict(X_test) 69 | 70 | # Bias^2 + Variance + Noise decomposition of the mean squared error 71 | y_error = np.zeros(n_test) 72 | 73 | for i in range(n_repeat): 74 | for j in range(n_repeat): 75 | y_error += (y_test[:, j] - y_predict[:, i]) ** 2 76 | 77 | y_error /= (n_repeat * n_repeat) 78 | 79 | y_noise = np.var(y_test, axis=1) 80 | y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2 81 | y_var = np.var(y_predict, axis=1) 82 | 83 | print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) " 84 | " + {3:.4f} (var) + {4:.4f} (noise)".format(name, 85 | np.mean(y_error), 86 | np.mean(y_bias), 87 | np.mean(y_var), 88 | np.mean(y_noise))) 89 | 90 | # Plot figures 91 | ax = plt.subplot(2, n_estimators, n + 1) 92 | plt.setp(ax, xticks=(), yticks=()) 93 | plt.plot(X_test, f(X_test), color=blue) 94 | plt.plot(X_train[0], y_train[0], ".b") 95 | plt.plot(X_test, y_predict[:, 0], color=gray) 96 | 97 | for i in range(1, n_repeat): 98 | plt.plot(X_test, y_predict[:, i], color=red_alpha, alpha=0.05) 99 | 100 | plt.plot(X_test, np.mean(y_predict, axis=1), color=red, 101 | label="$\mathbb{E}_{LS} \^y(x)$") 102 | 103 | plt.xlabel("x") 104 | plt.ylabel("y") 105 | plt.xlim((0., 1.0)) 106 | plt.ylim((-2, 2)) 107 | plt.title(name) 108 | 109 | ax = plt.subplot(2, n_estimators, n_estimators + n + 1) 110 | plt.setp(ax, xticks=(), yticks=()) 111 | plt.plot(X_test, y_error, color=gray, label="$error(x)$") 112 | plt.plot(X_test, y_bias, color=blue, label="$bias^2(x)$"), 113 | plt.plot(X_test, y_var, color=red, label="$var(x)$"), 114 | plt.plot(X_test, y_noise, color=green, label="$noise(x)$") 115 | plt.xlabel("x") 116 | plt.xlim((0., 1.0)) 117 | plt.ylim((0, 2.0)) 118 | 119 | if n == 0: 120 | plt.legend(loc="upper left", prop={"size": 11}) 121 | 122 | plt.show() 123 | -------------------------------------------------------------------------------- /scripts/ch4_correlation_plot2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | blue = (0, 0, 1.0) 6 | green = (0, 0.8, 0) 7 | red = (1.0, 0, 0) 8 | red_alpha = (1.0, 0, 0, 0.7) 9 | gray = (0.7, 0.7, 0.7) 10 | 11 | results=[[], 12 | ["RandomForestRegressor-M=1",7.134504,2.629471,4.505033,0.063198,0.009473,0.283799,4.221234], 13 | ["RandomForestRegressor-M=2",5.023887,2.629471,2.394416,0.063198,0.009473,0.283799,2.110617], 14 | ["RandomForestRegressor-M=3",4.320348,2.629471,1.690877,0.063198,0.009473,0.283799,1.407078], 15 | ["RandomForestRegressor-M=4",3.968578,2.629471,1.339107,0.063198,0.009473,0.283799,1.055309], 16 | ["RandomForestRegressor-M=5",3.757516,2.629471,1.128046,0.063198,0.009473,0.283799,0.844247], 17 | ["RandomForestRegressor-M=6",3.616809,2.629471,0.987338,0.063198,0.009473,0.283799,0.703539], 18 | ["RandomForestRegressor-M=7",3.516303,2.629471,0.886832,0.063198,0.009473,0.283799,0.603033], 19 | ["RandomForestRegressor-M=8",3.440924,2.629471,0.811453,0.063198,0.009473,0.283799,0.527654], 20 | ["RandomForestRegressor-M=9",3.382296,2.629471,0.752825,0.063198,0.009473,0.283799,0.469026], 21 | 
["RandomForestRegressor-M=10",3.335393,2.629471,0.705922,0.063198,0.009473,0.283799,0.422123], 22 | ["RandomForestRegressor-M=11",3.297018,2.629471,0.667547,0.063198,0.009473,0.283799,0.383749], 23 | ["RandomForestRegressor-M=12",3.265039,2.629471,0.635568,0.063198,0.009473,0.283799,0.351770], 24 | ["RandomForestRegressor-M=13",3.237980,2.629471,0.608509,0.063198,0.009473,0.283799,0.324710], 25 | ["RandomForestRegressor-M=14",3.214786,2.629471,0.585316,0.063198,0.009473,0.283799,0.301517], 26 | ["RandomForestRegressor-M=15",3.194685,2.629471,0.565215,0.063198,0.009473,0.283799,0.281416], 27 | ["RandomForestRegressor-M=16",3.177097,2.629471,0.547626,0.063198,0.009473,0.283799,0.263827], 28 | ["RandomForestRegressor-M=17",3.161577,2.629471,0.532107,0.063198,0.009473,0.283799,0.248308], 29 | ["RandomForestRegressor-M=18",3.147783,2.629471,0.518312,0.063198,0.009473,0.283799,0.234513], 30 | ["RandomForestRegressor-M=19",3.135440,2.629471,0.505969,0.063198,0.009473,0.283799,0.222170], 31 | ["RandomForestRegressor-M=20",3.124331,2.629471,0.494861,0.063198,0.009473,0.283799,0.211062], 32 | ["ExtraTreesRegressor-M=1",6.931454,2.484647,4.446807,0.051816,0.009069,0.230366,4.216441], 33 | ["ExtraTreesRegressor-M=2",4.823234,2.484647,2.338587,0.051816,0.009069,0.230366,2.108220], 34 | ["ExtraTreesRegressor-M=3",4.120494,2.484647,1.635847,0.051816,0.009069,0.230366,1.405480], 35 | ["ExtraTreesRegressor-M=4",3.769124,2.484647,1.284476,0.051816,0.009069,0.230366,1.054110], 36 | ["ExtraTreesRegressor-M=5",3.558302,2.484647,1.073654,0.051816,0.009069,0.230366,0.843288], 37 | ["ExtraTreesRegressor-M=6",3.417754,2.484647,0.933106,0.051816,0.009069,0.230366,0.702740], 38 | ["ExtraTreesRegressor-M=7",3.317362,2.484647,0.832715,0.051816,0.009069,0.230366,0.602349], 39 | ["ExtraTreesRegressor-M=8",3.242068,2.484647,0.757421,0.051816,0.009069,0.230366,0.527055], 40 | ["ExtraTreesRegressor-M=9",3.183507,2.484647,0.698860,0.051816,0.009069,0.230366,0.468493], 41 | ["ExtraTreesRegressor-M=10",3.136657,2.484647,0.652010,0.051816,0.009069,0.230366,0.421644], 42 | ["ExtraTreesRegressor-M=11",3.098326,2.484647,0.613679,0.051816,0.009069,0.230366,0.383313], 43 | ["ExtraTreesRegressor-M=12",3.066383,2.484647,0.581736,0.051816,0.009069,0.230366,0.351370], 44 | ["ExtraTreesRegressor-M=13",3.039355,2.484647,0.554708,0.051816,0.009069,0.230366,0.324342], 45 | ["ExtraTreesRegressor-M=14",3.016188,2.484647,0.531541,0.051816,0.009069,0.230366,0.301174], 46 | ["ExtraTreesRegressor-M=15",2.996109,2.484647,0.511462,0.051816,0.009069,0.230366,0.281096], 47 | ["ExtraTreesRegressor-M=16",2.978541,2.484647,0.493894,0.051816,0.009069,0.230366,0.263528], 48 | ["ExtraTreesRegressor-M=17",2.963039,2.484647,0.478392,0.051816,0.009069,0.230366,0.248026], 49 | ["ExtraTreesRegressor-M=18",2.949260,2.484647,0.464613,0.051816,0.009069,0.230366,0.234247], 50 | ["ExtraTreesRegressor-M=19",2.936931,2.484647,0.452284,0.051816,0.009069,0.230366,0.221918], 51 | ["ExtraTreesRegressor-M=20",2.925835,2.484647,0.441188,0.051816,0.009069,0.230366,0.210822]] 52 | 53 | n_estimators = range(1, 20+1) 54 | 55 | ax = plt.subplot() 56 | 57 | plt.plot(n_estimators, [results[m][3] for m in n_estimators], 'o-', color=red, label=r'$var(x)$') 58 | plt.plot(n_estimators, [results[m][6] for m in n_estimators], ':', color=red_alpha, label=r'$\rho(x) \sigma^2_{{\cal L},\theta}$') 59 | plt.plot(n_estimators, [results[m][7] for m in n_estimators], '--', color=red_alpha, label=r'$\frac{1-\rho(x)}{M} \sigma^2_{{\cal L},\theta}$') 60 | 61 | plt.xlabel("$M$") 62 | 63 | 
ax.set_xlim([1, 20]) 64 | plt.legend() 65 | 66 | #plt.plot(n_estimators, [results[20+m][3] for m in n_estimators], 'o-', color=red, label='Extremely Randomized Trees') 67 | 68 | plt.show() 69 | -------------------------------------------------------------------------------- /scripts/ID3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Understanding variable importances in forests of randomized trees. 3 | Gilles Louppe, Louis Wehenkel, Antonio Sutera and Pierre Geurts 4 | NIPS, Lake Tahoe, United States, 2013 5 | http://orbi.ulg.ac.be/handle/2268/155642 6 | 7 | This module implements a simplistic randomized ID3 tree classifier 8 | (`RandomizedID3Classifier`), along with its ensemble counter-part 9 | (`RandomizedID3Ensemble`). 10 | 11 | Warning: These classes implement `fit` and `feature_importances_`, but do not 12 | provide any `predict` method. They only serve as a proof-of-concept. 13 | 14 | Author: Gilles Louppe 15 | License: BSD 3 clause 16 | """ 17 | import copy 18 | import itertools 19 | import numpy as np 20 | 21 | from sklearn.base import BaseEstimator, ClassifierMixin 22 | from sklearn.ensemble import BaseEnsemble 23 | from sklearn.utils import check_random_state 24 | 25 | from demo import entropy 26 | 27 | MAX_INT = np.iinfo("i").max 28 | 29 | 30 | class RandomizedID3Classifier(BaseEstimator, ClassifierMixin): 31 | """Simplistic implementation of an ID3 randomized tree.""" 32 | 33 | def __init__(self, k=1, max_depth=None, random_state=None): 34 | self.k = k 35 | self.max_depth = max_depth 36 | self.random_state = random_state 37 | self.tree_ = None 38 | 39 | def fit(self, X, y): 40 | self.n_features_ = X.shape[1] 41 | self.classes_ = np.unique(y) 42 | self.n_classes_ = len(self.classes_) 43 | self.random_state_ = check_random_state(self.random_state) 44 | 45 | self.values_ = [] 46 | for i in xrange(self.n_features_): 47 | self.values_.append(np.unique(X[:, i])) 48 | 49 | self.tree_ = self._partition(X, 50 | np.searchsorted(self.classes_, y), 51 | range(self.n_features_), 52 | X.shape[0]) 53 | 54 | return self 55 | 56 | def predict(self, X): 57 | raise NotImplementedError 58 | 59 | def _partition(self, X, y, variables, n_samples, depth=0): 60 | rng = self.random_state_ 61 | 62 | # Leaf 63 | if len(variables) == 0 or (self.max_depth is not None and depth >= self.max_depth): 64 | values = 1. * np.bincount(y, minlength=self.n_classes_) / len(y) 65 | return (values, len(y)) 66 | 67 | # Internal node 68 | else: 69 | variables = copy.copy(variables) 70 | n_variables = len(variables) 71 | n_node = len(X) 72 | 73 | best = None 74 | best_score = -np.inf 75 | best_children = None 76 | 77 | features = (rng.permutation(n_variables))[:min(self.k, 78 | n_variables)] 79 | 80 | for i in features: 81 | X_i = variables[i] 82 | 83 | children = [] 84 | 85 | for xi in self.values_[X_i]: 86 | mask_xi = X[:, X_i] == xi 87 | if sum(mask_xi) > 0: 88 | children.append((X[mask_xi], y[mask_xi], sum(mask_xi))) 89 | 90 | score = ((1. * n_node / n_samples) # P(B=b) 91 | * (entropy(y) - sum([1. 
* entropy(c_y) * c_n / n_node 92 | for _, c_y, c_n in children]))) 93 | 94 | if score > best_score: 95 | best = i 96 | best_score = score 97 | best_children = children 98 | 99 | X_i = variables.pop(best) 100 | 101 | return (X_i, 102 | best_score, 103 | [self._partition(c_X, 104 | c_y, 105 | variables, 106 | n_samples, 107 | depth=depth+1) for c_X, 108 | c_y, 109 | _ in best_children]) 110 | 111 | @property 112 | def feature_importances_(self): 113 | def _visit(tree, depth): 114 | if len(tree) == 2: 115 | pass 116 | 117 | else: 118 | imp[tree[0], depth] += tree[1] 119 | 120 | for c in tree[2]: 121 | _visit(c, depth+1) 122 | 123 | imp = np.zeros((self.n_features_, self.n_features_)) 124 | _visit(self.tree_, 0) 125 | 126 | return imp 127 | 128 | 129 | class RandomizedID3Ensemble(BaseEnsemble, ClassifierMixin): 130 | """Simplistic implementation of an ensemble of ID3 randomized trees.""" 131 | 132 | def __init__(self, base_estimator=None, n_estimators=10, max_depth=None, random_state=None): 133 | super(RandomizedID3Ensemble, self).__init__( 134 | base_estimator=base_estimator, 135 | n_estimators=n_estimators, 136 | estimator_params=("max_depth",)) 137 | 138 | self.max_depth = max_depth 139 | self.random_state = random_state 140 | 141 | def _validate_estimator(self): 142 | super(RandomizedID3Ensemble, self)._validate_estimator( 143 | default=RandomizedID3Classifier()) 144 | 145 | def fit(self, X, y): 146 | random_state = check_random_state(self.random_state) 147 | self._validate_estimator() 148 | self.p = X.shape[1] 149 | 150 | for i in xrange(self.n_estimators): 151 | tree = self._make_estimator() 152 | tree.set_params(random_state=random_state.randint(MAX_INT)) 153 | tree.fit(X, y) 154 | 155 | return self 156 | 157 | def predict(self, X): 158 | raise NotImplementedError 159 | 160 | @property 161 | def feature_importances_(self): 162 | importances = np.zeros((self.p, self.p)) 163 | 164 | for i, tree in enumerate(self.estimators_): 165 | importances += tree.feature_importances_ 166 | 167 | importances /= self.n_estimators 168 | 169 | return importances 170 | -------------------------------------------------------------------------------- /tex/chapters/chapter09.tex: -------------------------------------------------------------------------------- 1 | \chapter{Conclusions}\label{ch:conclusions} 2 | 3 | By and large, machine learning remains an open field of research for which many 4 | questions are still left unanswered, even regarding well-established methods. 5 | In this dissertation, we have revisited decision trees and random forests, 6 | consistently calling into question each and every part of these algorithms, in 7 | order to shed new light on their learning capabilities, inner workings and 8 | interpretability. 9 | 10 | In Part~\textsc{\ref{part:1}} of this work, we laid out the decision trees and 11 | random forests methodology in the context of classification and regression 12 | tasks. Our treatment first considered the induction of individual decision 13 | trees and put them into a unified and composable framework. In particular, our 14 | analysis reviewed assignment rules, stopping criteria and splitting rules, 15 | theoretically motivating their design and purpose whenever possible. We then 16 | proceeded with a systematic study of randomized ensemble methods within the 17 | bias-variance framework. 
We established that variance depends on the 18 | correlation between individual tree predictions, thereby showing why 19 | randomization acts as a mechanism for reducing the generalization error of an 20 | ensemble. Random Forest and its variants were then presented within the 21 | framework previously introduced, and their properties and features discussed 22 | and reviewed. Our contributions followed with an original time and space 23 | complexity analysis of random forests, hence showing their good computational 24 | performance and scalability to larger problems. Finally, the first part of this 25 | work concluded with an in-depth discussion of implementation details of random 26 | forests, highlighting and discussing considerations that are critical, yet 27 | easily overlooked, for guaranteeing good computational performance. While not 28 | directly apparent within this manuscript, this discussion also underlined our 29 | contributions in terms of software, within the open source Scikit-Learn library. 30 | As open science and reproducibility concerns are gaining momentum, we indeed 31 | believe that good quality software should be an integral part, acknowledged 32 | for its own value and impact, of any modern scientific research activity. 33 | 34 | Part~\textsc{\ref{part:2}} of this dissertation analyzed and discussed the 35 | interpretability of random forests through the lens of variable importance measures. 36 | The core of our contributions rests in the theoretical characterization of the 37 | Mean Decrease of Impurity variable importance measure, from which we have then 38 | proved and derived some of its properties in the case of multiway totally 39 | randomized trees and in asymptotic conditions. In particular, we have shown 40 | that variable importances offer a three-level decomposition of the information 41 | jointly provided by the input variables about the output, accounting for all 42 | possible interaction terms in a fair and exhaustive way. More interestingly, we 43 | have also shown that variable importances only depend on relevant variables and 44 | that the importance of irrelevant variables is strictly equal to zero, thereby 45 | making importances a sound and appropriate criterion for assessing the 46 | usefulness of variables. As a consequence of this work, our analysis then 47 | demonstrated that variable importances as computed from non-totally randomized 48 | trees (e.g., standard Random Forest or Extremely Randomized Trees) suffer from 49 | a combination of defects, due to masking effects, misestimation of node 50 | impurity, or the binary structure of decision trees. Overall, we believe 51 | that our analysis should bring helpful insights in a wide range of 52 | applications, by shedding new light on variable importances. In particular, we 53 | advise complementing their interpretation and analysis with a systematic 54 | decomposition of their terms, in order to better understand why variables are 55 | (or are not) important. 56 | 57 | This preliminary work unveils various directions of future work, both from a 58 | theoretical and practical point of view. In our view, the most interesting 59 | theoretical open question would be the characterization of the distribution of 60 | variable importances in the finite setting. Such a characterization would 61 | indeed allow one to more reliably distinguish irrelevant variables (whose 62 | importances are positive in the finite case) from relevant variables.
Another 63 | interesting direction of future work would be to derive a proper 64 | characterization of variable importances in the case of binary trees -- even if we 65 | believe, as pointed out earlier, that variable importances derived from such 66 | ensembles may in fact not be as appropriate as desired. From a more practical 67 | point of view, this study also calls for a re-analysis of previous empirical 68 | studies. We indeed believe that variable importances along with their 69 | decomposition should yield new insights in many cases, providing a better 70 | understanding of the interactions between the input variables and the output, 71 | but also between the input variables themselves. Again, we recommend multiway 72 | totally randomized trees to mitigate sources of bias as much as possible. 73 | 74 | Finally, Part~\textsc{\ref{part:3}} addressed limitations of random forests in 75 | the context of large datasets. Through extensive experiments, we have shown 76 | that subsampling either samples, features or both simultaneously provides on-par 77 | performance while at the same time lowering the memory requirements. 78 | Overall, this paradigm highlights an intriguing practical fact: there is often 79 | no need to build single models over immensely large datasets. Good performance 80 | can often more simply be achieved by building models on small random parts of the 81 | data and then combining them all in an ensemble, thereby avoiding all practical and 82 | computational burdens of making large data fit into memory. Again, this work 83 | raises interesting questions for further work. From a theoretical point of view, 84 | one would be to identify the statistical properties of the learning problem 85 | that are necessary for guaranteeing subsampling strategies to work. In 86 | particular, in which cases is it better to subsample examples rather than 87 | features? From a more practical point of view, other directions of research 88 | also include the study of smarter sampling strategies or the empirical 89 | verification that conclusions extend to non-tree-based methods. 90 | 91 | Overall, this thesis calls for a permanent re-assessment of machine learning 92 | methods and algorithms. It is only through a better understanding of their 93 | mechanisms that algorithms will advance in a consistent and reliable way. 94 | Always seek the what and the why. In conclusion, machine learning should not be 95 | considered as a black-box tool, but as a methodology, with a rational thought 96 | process that is entirely dependent on the problem we are trying to solve. 97 | -------------------------------------------------------------------------------- /benchmarks/resources/bench_randomforest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmark script comparing scikit-learn's RandomForestClassifier 3 | against R's randomForest. 4 | 5 | It uses rpy2 to call R from Python. Timings for randomForest are 6 | pessimistic due to a constant overhead incurred by wrapping numpy matrices 7 | in R data_frames. The effect of the overhead can be reduced 8 | by increasing the number of trees.
9 | 10 | Note: make sure the LD_LIBRARY_PATH is set for rpy2:: 11 | 12 | $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib 13 | """ 14 | 15 | import numpy as np 16 | 17 | from time import time 18 | from functools import wraps 19 | from collections import defaultdict 20 | 21 | from sklearn import datasets as sk_datasets 22 | from sklearn.utils import shuffle 23 | from sklearn.utils import check_random_state 24 | from sklearn.ensemble import RandomForestClassifier 25 | from sklearn.base import BaseEstimator, ClassifierMixin 26 | 27 | from rpy2.robjects.numpy2ri import numpy2ri 28 | from rpy2.robjects.packages import importr 29 | from rpy2 import robjects as ro 30 | 31 | import pylab as pl 32 | 33 | rf = importr('randomForest') 34 | 35 | data_path = '/home/pprett/corpora' 36 | 37 | 38 | class RRandomForestClassifier(BaseEstimator, ClassifierMixin): 39 | 40 | def __init__(self, **kargs): 41 | self.params = kargs 42 | 43 | def fit(self, X, y): 44 | self.classes_ = np.unique(y) 45 | y = np.searchsorted(self.classes_, y) + 1 46 | X = numpy2ri(X) 47 | y = ro.FactorVector(numpy2ri(y)) 48 | self.model_ = rf.randomForest(X, y, **self.params) 49 | return self 50 | 51 | def predict(self, X): 52 | X = numpy2ri(X) 53 | pred = rf.predict_randomForest(self.model_, X) 54 | # R maps class labels 55 | pred = np.array(pred, dtype=np.int32) - 1 56 | return self.classes_[pred] 57 | 58 | 59 | def repeat(n_repetitions=3): 60 | def wrap(f): 61 | def wrapper(*args, **kargs): 62 | scores = [] 63 | for i in range(n_repetitions): 64 | scores.append(f(*args, random_state=i, **kargs)) 65 | scores = np.array(scores) 66 | return scores.mean(axis=0), scores.std(axis=0) 67 | return wraps(f)(wrapper) 68 | return wrap 69 | 70 | 71 | @repeat() 72 | def bench_hastie_10_2(clf, random_state=None): 73 | X, y = sk_datasets.make_hastie_10_2(random_state=random_state) 74 | X_train, X_test = X[:2000], X[2000:] 75 | y_train, y_test = y[:2000], y[2000:] 76 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 77 | X_test = np.asarray(X_test, dtype=np.float32) 78 | t0 = time() 79 | clf.fit(X_train, y_train) 80 | train_time = time() - t0 81 | t0 = time() 82 | pred = clf.predict(X_test) 83 | test_time = time() - t0 84 | error_rate = np.mean(pred != y_test) 85 | return error_rate, train_time, test_time 86 | 87 | 88 | @repeat() 89 | def bench_random_gaussian(clf, random_state=None): 90 | rs = check_random_state(random_state) 91 | shape = (12000, 10) 92 | X = rs.normal(size=shape).reshape(shape) 93 | y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.int32) 94 | 95 | X_train, X_test = X[:2000], X[2000:] 96 | y_train, y_test = y[:2000], y[2000:] 97 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 98 | X_test = np.asarray(X_test, dtype=np.float32) 99 | 100 | t0 = time() 101 | clf.fit(X_train, y_train) 102 | train_time = time() - t0 103 | t0 = time() 104 | pred = clf.predict(X_test) 105 | test_time = time() - t0 106 | error_rate = np.mean(pred != y_test) 107 | return error_rate, train_time, test_time 108 | 109 | 110 | @repeat() 111 | def bench_spam(clf, random_state=None): 112 | X = np.loadtxt(data_path + "/spam/spambase.data", delimiter=",") 113 | y = X[:, -1].ravel() 114 | X = X[:, :-1] 115 | f = open(data_path + "/spam/spambase.names") 116 | feature_names = np.array([l.split(":")[0] for l in f]) 117 | 118 | X, y = shuffle(X, y, random_state=random_state) 119 | X_test, y_test = X[:1536], y[:1536] 120 | X_train, y_train = X[1536:], y[1536:] 121 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 122 | X_test 
= np.asarray(X_test, dtype=np.float32) 123 | 124 | t0 = time() 125 | clf.fit(X_train, y_train) 126 | train_time = time() - t0 127 | t0 = time() 128 | error_rate = (1.0 - clf.score(X_test, y_test)) 129 | test_time = time() - t0 130 | return error_rate, train_time, test_time 131 | 132 | 133 | @repeat() 134 | def bench_madelon(clf, random_state=None): 135 | X_train = np.loadtxt(data_path + "/madelon/madelon_train.data") 136 | y_train = np.loadtxt(data_path + "/madelon/madelon_train.labels") 137 | X_test = np.loadtxt(data_path + "/madelon/madelon_valid.data") 138 | y_test = np.loadtxt(data_path + "/madelon/madelon_valid.labels") 139 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 140 | X_test = np.asarray(X_test, dtype=np.float32) 141 | t0 = time() 142 | clf.fit(X_train, y_train) 143 | train_time = time() - t0 144 | t0 = time() 145 | error_rate = (1.0 - clf.score(X_test, y_test)) 146 | test_time = time() - t0 147 | return error_rate, train_time, test_time 148 | 149 | 150 | @repeat() 151 | def bench_arcene(clf, random_state=None): 152 | X_train = np.loadtxt(data_path + "/arcene/arcene_train.data") 153 | y_train = np.loadtxt(data_path + "/arcene/arcene_train.labels") 154 | X_test = np.loadtxt(data_path + "/arcene/arcene_valid.data") 155 | y_test = np.loadtxt(data_path + "/arcene/arcene_valid.labels") 156 | X_train = np.asarray(X_train, order='f', dtype=np.float32) 157 | X_test = np.asarray(X_test, dtype=np.float32) 158 | t0 = time() 159 | clf.fit(X_train, y_train) 160 | train_time = time() - t0 161 | t0 = time() 162 | error_rate = (1.0 - clf.score(X_test, y_test)) 163 | test_time = time() - t0 164 | return error_rate, train_time, test_time 165 | 166 | 167 | @repeat() 168 | def bench_landsat(clf, random_state=None): 169 | landsat = sk_datasets.load_landsat() 170 | X = np.asarray(landsat.data, order='f', dtype=np.float32) 171 | y = landsat.target 172 | t0 = time() 173 | clf.fit(X, y) 174 | train_time = time() - t0 175 | t0 = time() 176 | error_rate = (1.0 - clf.score(X, y)) 177 | test_time = time() - t0 178 | return error_rate, train_time, test_time 179 | 180 | 181 | @repeat(1) 182 | def bench_mnist(clf, random_state=None): 183 | rs = check_random_state(random_state) 184 | mnist = sk_datasets.fetch_mldata('MNIST original') 185 | inds = np.arange(len(mnist.data)) 186 | rs.shuffle(inds) 187 | cut_off = int(0.9 * len(inds)) 188 | train_i = inds[:cut_off] 189 | test_i = inds[cut_off:] 190 | 191 | X_train = mnist.data[train_i].astype(np.float32) 192 | y_train = mnist.target[train_i].astype(np.float64) 193 | 194 | X_test = mnist.data[test_i].astype(np.float32) 195 | y_test = mnist.target[test_i].astype(np.float64) 196 | 197 | t0 = time() 198 | clf.fit(X_train, y_train) 199 | train_time = time() - t0 200 | t0 = time() 201 | error_rate = (1.0 - clf.score(X_test, y_test)) 202 | test_time = time() - t0 203 | return error_rate, train_time, test_time 204 | 205 | 206 | if __name__ == '__main__': 207 | res = defaultdict(dict) 208 | 209 | clfs = {'r': RRandomForestClassifier(ntree=100, mtry=3, nodesize=1), 210 | 'py': RandomForestClassifier(n_estimators=100, max_features=3, 211 | min_samples_leaf=1, 212 | n_jobs=1)} 213 | datasets = {'random_gaussian': bench_random_gaussian, 214 | 'spam': bench_spam, 215 | 'madelon': bench_madelon, 216 | 'arcene': bench_arcene, 217 | 'landsat': bench_landsat, 218 | 'hastie_10_2': bench_hastie_10_2} 219 | 220 | for impl, clf in clfs.iteritems(): 221 | for dataset, ds_bench in datasets.iteritems(): 222 | mean, std = ds_bench(clf) 223 | res[dataset][impl] = (mean, std) 
224 | 225 | clfs = {'r': RRandomForestClassifier(ntree=10, mtry=3, nodesize=1), 226 | 'py': RandomForestClassifier(n_estimators=10, max_features=3, 227 | min_samples_leaf=1, 228 | n_jobs=1)} 229 | datasets = {'mnist': bench_mnist} 230 | for impl, clf in clfs.iteritems(): 231 | for dataset, ds_bench in datasets.iteritems(): 232 | mean, std = ds_bench(clf) 233 | res[dataset][impl] = (mean, std) 234 | 235 | for ds in res: 236 | print('_' * 80) 237 | print(ds) 238 | print 239 | print("%s\t%s\t%s" % (' '*4, 'r'.center(13), 'py'.center(13))) 240 | for i, metric in enumerate(['score', 'train', 'test']): 241 | print("%s\t%.4f (%.2f)\t%.4f (%.2f)" % 242 | (metric, res[ds]['r'][0][i], res[ds]['r'][1][i], 243 | res[ds]['py'][0][i], res[ds]['py'][1][i])) 244 | print 245 | 246 | -------------------------------------------------------------------------------- /slides/minted.sty: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `minted.sty', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% minted.dtx (with options: `package') 8 | %% Copyright 2010--2011 Konrad Rudolph 9 | %% 10 | %% This work may be distributed and/or modified under the 11 | %% conditions of the LaTeX Project Public License, either version 1.3 12 | %% of this license or (at your option) any later version. 13 | %% The latest version of this license is in 14 | %% http://www.latex-project.org/lppl.txt 15 | %% and version 1.3 or later is part of all distributions of LaTeX 16 | %% version 2005/12/01 or later. 17 | %% 18 | %% Additionally, the project may be distributed under the terms of the new BSD 19 | %% license. 20 | %% 21 | %% This work has the LPPL maintenance status `maintained'. 22 | %% 23 | %% The Current Maintainer of this work is Konrad Rudolph. 24 | %% 25 | %% This work consists of the files minted.dtx and minted.ins 26 | %% and the derived file minted.sty. 
27 | \NeedsTeXFormat{LaTeX2e} 28 | \ProvidesPackage{minted}[2011/09/17 v1.7 Yet another Pygments shim for LaTeX] 29 | \RequirePackage{keyval} 30 | \RequirePackage{fancyvrb} 31 | \RequirePackage{xcolor} 32 | \RequirePackage{float} 33 | \RequirePackage{ifthen} 34 | \RequirePackage{calc} 35 | \RequirePackage{ifplatform} 36 | \DeclareOption{chapter}{\def\minted@float@within{chapter}} 37 | \DeclareOption{section}{\def\minted@float@within{section}} 38 | \ProcessOptions\relax 39 | \ifwindows 40 | \providecommand\DeleteFile[1]{\immediate\write18{del #1}} 41 | \else 42 | \providecommand\DeleteFile[1]{\immediate\write18{rm #1}} 43 | \fi 44 | \newboolean{AppExists} 45 | \newcommand\TestAppExists[1]{ 46 | \ifwindows 47 | \DeleteFile{\jobname.aex} 48 | \immediate\write18{for \string^\@percentchar i in (#1.exe #1.bat #1.cmd) 49 | do set >\jobname.aex >\jobname.aex} %$ 50 | \newread\@appexistsfile 51 | \immediate\openin\@appexistsfile\jobname.aex 52 | \expandafter\def\expandafter\@tmp@cr\expandafter{\the\endlinechar} 53 | \endlinechar=-1\relax 54 | \readline\@appexistsfile to \@apppathifexists 55 | \endlinechar=\@tmp@cr 56 | \ifthenelse{\equal{\@apppathifexists}{}} 57 | {\AppExistsfalse} 58 | {\AppExiststrue} 59 | \immediate\closein\@appexistsfile 60 | \DeleteFile{\jobname.aex} 61 | \immediate\typeout{file deleted} 62 | \else 63 | \immediate\write18{which #1 && touch \jobname.aex} 64 | \IfFileExists{\jobname.aex} 65 | {\AppExiststrue 66 | \DeleteFile{\jobname.aex}} 67 | {\AppExistsfalse} 68 | \fi} 69 | \newcommand\minted@resetoptions{} 70 | \newcommand\minted@defopt[1]{ 71 | \expandafter\def\expandafter\minted@resetoptions\expandafter{% 72 | \minted@resetoptions 73 | \@namedef{minted@opt@#1}{}}} 74 | \newcommand\minted@opt[1]{ 75 | \expandafter\detokenize% 76 | \expandafter\expandafter\expandafter{\csname minted@opt@#1\endcsname}} 77 | \newcommand\minted@define@opt[3][]{ 78 | \minted@defopt{#2} 79 | \ifthenelse{\equal{#1}{}}{ 80 | \define@key{minted@opt}{#2}{\@namedef{minted@opt@#2}{#3}}} 81 | {\define@key{minted@opt}{#2}[#1]{\@namedef{minted@opt@#2}{#3}}}} 82 | \newcommand\minted@define@switch[3][]{ 83 | \minted@defopt{#2} 84 | \define@booleankey{minted@opt}{#2} 85 | {\@namedef{minted@opt@#2}{#3}} 86 | {\@namedef{minted@opt@#2}{#1}}} 87 | \minted@defopt{extra} 88 | \newcommand\minted@define@extra[1]{ 89 | \define@key{minted@opt}{#1}{ 90 | \expandafter\def\expandafter\minted@opt@extra\expandafter{% 91 | \minted@opt@extra,#1=##1}}} 92 | \newcommand\minted@define@extra@switch[1]{ 93 | \define@booleankey{minted@opt}{#1} 94 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 95 | \minted@opt@extra,#1}} 96 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 97 | \minted@opt@extra,#1=false}}} 98 | \minted@define@switch{texcl}{-P texcomments} 99 | \minted@define@switch{mathescape}{-P mathescape} 100 | \minted@define@switch{linenos}{-P linenos} 101 | \minted@define@switch{startinline}{-P startinline} 102 | \minted@define@switch[-P funcnamehighlighting=False]% 103 | {funcnamehighlighting}{-P funcnamehighlighting} 104 | \minted@define@opt{gobble}{-F gobble:n=#1} 105 | \minted@define@opt{bgcolor}{#1} 106 | \minted@define@extra{frame} 107 | \minted@define@extra{framesep} 108 | \minted@define@extra{framerule} 109 | \minted@define@extra{rulecolor} 110 | \minted@define@extra{numbersep} 111 | \minted@define@extra{firstnumber} 112 | \minted@define@extra{stepnumber} 113 | \minted@define@extra{firstline} 114 | \minted@define@extra{lastline} 115 | \minted@define@extra{baselinestretch} 116 | 
\minted@define@extra{xleftmargin} 117 | \minted@define@extra{xrightmargin} 118 | \minted@define@extra{fillcolor} 119 | \minted@define@extra{tabsize} 120 | \minted@define@extra{fontfamily} 121 | \minted@define@extra{fontsize} 122 | \minted@define@extra{fontshape} 123 | \minted@define@extra{fontseries} 124 | \minted@define@extra{formatcom} 125 | \minted@define@extra{label} 126 | \minted@define@extra@switch{numberblanklines} 127 | \minted@define@extra@switch{showspaces} 128 | \minted@define@extra@switch{resetmargins} 129 | \minted@define@extra@switch{samepage} 130 | \minted@define@extra@switch{showtabs} 131 | \minted@define@extra@switch{obeytabs} 132 | \newsavebox{\minted@bgbox} 133 | \newenvironment{minted@colorbg}[1]{ 134 | \def\minted@bgcol{#1} 135 | \noindent 136 | \begin{lrbox}{\minted@bgbox} 137 | \begin{minipage}{\linewidth-2\fboxsep}} 138 | {\end{minipage} 139 | \end{lrbox}% 140 | \colorbox{\minted@bgcol}{\usebox{\minted@bgbox}}} 141 | \newwrite\minted@code 142 | \newcommand\minted@savecode[1]{ 143 | \immediate\openout\minted@code\jobname.pyg 144 | \immediate\write\minted@code{#1} 145 | \immediate\closeout\minted@code} 146 | \newcommand\minted@pygmentize[2][\jobname.pyg]{ 147 | \def\minted@cmd{pygmentize -l #2 -f latex -F tokenmerge 148 | \minted@opt{gobble} \minted@opt{texcl} \minted@opt{mathescape} 149 | \minted@opt{startinline} \minted@opt{funcnamehighlighting} 150 | \minted@opt{linenos} -P "verboptions=\minted@opt{extra}" 151 | -o \jobname.out.pyg #1} 152 | \immediate\write18{\minted@cmd} 153 | % For debugging, uncomment: 154 | %\immediate\typeout{\minted@cmd} 155 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 156 | {} 157 | {\begin{minted@colorbg}{\minted@opt@bgcolor}} 158 | \input{\jobname.out.pyg} 159 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 160 | {} 161 | {\end{minted@colorbg}} 162 | \DeleteFile{\jobname.out.pyg}} 163 | \newcommand\minted@usedefaultstyle{\usemintedstyle{default}} 164 | \newcommand\usemintedstyle[1]{ 165 | \renewcommand\minted@usedefaultstyle{} 166 | \immediate\write18{pygmentize -S #1 -f latex > \jobname.pyg} 167 | \input{\jobname.pyg}} 168 | \newcommand\mint[3][]{ 169 | \DefineShortVerb{#3} 170 | \minted@resetoptions 171 | \setkeys{minted@opt}{#1} 172 | \SaveVerb[aftersave={ 173 | \UndefineShortVerb{#3} 174 | \minted@savecode{\FV@SV@minted@verb} 175 | \minted@pygmentize{#2} 176 | \DeleteFile{\jobname.pyg}}]{minted@verb}#3} 177 | \newcommand\minted@proglang[1]{} 178 | \newenvironment{minted}[2][] 179 | {\VerbatimEnvironment 180 | \renewcommand{\minted@proglang}[1]{#2} 181 | \minted@resetoptions 182 | \setkeys{minted@opt}{#1} 183 | \begin{VerbatimOut}[codes={\catcode`\^^I=12}]{\jobname.pyg}}% 184 | {\end{VerbatimOut} 185 | \minted@pygmentize{\minted@proglang{}} 186 | \DeleteFile{\jobname.pyg}} 187 | \newcommand\inputminted[3][]{ 188 | \minted@resetoptions 189 | \setkeys{minted@opt}{#1} 190 | \minted@pygmentize[#3]{#2}} 191 | \newcommand\newminted[3][]{ 192 | \ifthenelse{\equal{#1}{}} 193 | {\def\minted@envname{#2code}} 194 | {\def\minted@envname{#1}} 195 | \newenvironment{\minted@envname} 196 | {\VerbatimEnvironment\begin{minted}[#3]{#2}} 197 | {\end{minted}} 198 | \newenvironment{\minted@envname *}[1] 199 | {\VerbatimEnvironment\begin{minted}[#3,##1]{#2}} 200 | {\end{minted}}} 201 | \newcommand\newmint[3][]{ 202 | \ifthenelse{\equal{#1}{}} 203 | {\def\minted@shortname{#2}} 204 | {\def\minted@shortname{#1}} 205 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 206 | \mint[#3,##1]{#2}##2}} 207 | \newcommand\newmintedfile[3][]{ 208 | 
\ifthenelse{\equal{#1}{}} 209 | {\def\minted@shortname{#2file}} 210 | {\def\minted@shortname{#1}} 211 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 212 | \inputminted[#3,##1]{#2}{##2}}} 213 | \@ifundefined{minted@float@within} 214 | {\newfloat{listing}{h}{lol}} 215 | {\newfloat{listing}{h}{lol}[\minted@float@within]} 216 | \newcommand\listingscaption{Listing} 217 | \floatname{listing}{\listingscaption} 218 | \newcommand\listoflistingscaption{List of listings} 219 | \providecommand\listoflistings{\listof{listing}{\listoflistingscaption}} 220 | \AtBeginDocument{ 221 | \minted@usedefaultstyle} 222 | \AtEndOfPackage{ 223 | \ifnum\pdf@shellescape=1\relax\else 224 | \PackageError{minted} 225 | {You must invoke LaTeX with the 226 | -shell-escape flag} 227 | {Pass the -shell-escape flag to LaTeX. Refer to the minted.sty 228 | documentation for more information.}\fi 229 | \TestAppExists{pygmentize} 230 | \ifAppExists\else 231 | \PackageError{minted} 232 | {You must have `pygmentize' installed 233 | to use this package} 234 | {Refer to the installation instructions in the minted 235 | documentation for more information.} 236 | \fi} 237 | \endinput 238 | %% 239 | %% End of file `minted.sty'. 240 | -------------------------------------------------------------------------------- /tex/minted.sty: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `minted.sty', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% minted.dtx (with options: `package') 8 | %% Copyright 2010--2011 Konrad Rudolph 9 | %% 10 | %% This work may be distributed and/or modified under the 11 | %% conditions of the LaTeX Project Public License, either version 1.3 12 | %% of this license or (at your option) any later version. 13 | %% The latest version of this license is in 14 | %% http://www.latex-project.org/lppl.txt 15 | %% and version 1.3 or later is part of all distributions of LaTeX 16 | %% version 2005/12/01 or later. 17 | %% 18 | %% Additionally, the project may be distributed under the terms of the new BSD 19 | %% license. 20 | %% 21 | %% This work has the LPPL maintenance status `maintained'. 22 | %% 23 | %% The Current Maintainer of this work is Konrad Rudolph. 24 | %% 25 | %% This work consists of the files minted.dtx and minted.ins 26 | %% and the derived file minted.sty. 
27 | \NeedsTeXFormat{LaTeX2e} 28 | \ProvidesPackage{minted}[2011/09/17 v1.7 Yet another Pygments shim for LaTeX] 29 | \RequirePackage{keyval} 30 | \RequirePackage{fancyvrb} 31 | \RequirePackage{xcolor} 32 | \RequirePackage{float} 33 | \RequirePackage{ifthen} 34 | \RequirePackage{calc} 35 | \RequirePackage{ifplatform} 36 | \DeclareOption{chapter}{\def\minted@float@within{chapter}} 37 | \DeclareOption{section}{\def\minted@float@within{section}} 38 | \ProcessOptions\relax 39 | \ifwindows 40 | \providecommand\DeleteFile[1]{\immediate\write18{del #1}} 41 | \else 42 | \providecommand\DeleteFile[1]{\immediate\write18{rm #1}} 43 | \fi 44 | \newboolean{AppExists} 45 | \newcommand\TestAppExists[1]{ 46 | \ifwindows 47 | \DeleteFile{\jobname.aex} 48 | \immediate\write18{for \string^\@percentchar i in (#1.exe #1.bat #1.cmd) 49 | do set >\jobname.aex >\jobname.aex} %$ 50 | \newread\@appexistsfile 51 | \immediate\openin\@appexistsfile\jobname.aex 52 | \expandafter\def\expandafter\@tmp@cr\expandafter{\the\endlinechar} 53 | \endlinechar=-1\relax 54 | \readline\@appexistsfile to \@apppathifexists 55 | \endlinechar=\@tmp@cr 56 | \ifthenelse{\equal{\@apppathifexists}{}} 57 | {\AppExistsfalse} 58 | {\AppExiststrue} 59 | \immediate\closein\@appexistsfile 60 | \DeleteFile{\jobname.aex} 61 | \immediate\typeout{file deleted} 62 | \else 63 | \immediate\write18{which #1 && touch \jobname.aex} 64 | \IfFileExists{\jobname.aex} 65 | {\AppExiststrue 66 | \DeleteFile{\jobname.aex}} 67 | {\AppExistsfalse} 68 | \fi} 69 | \newcommand\minted@resetoptions{} 70 | \newcommand\minted@defopt[1]{ 71 | \expandafter\def\expandafter\minted@resetoptions\expandafter{% 72 | \minted@resetoptions 73 | \@namedef{minted@opt@#1}{}}} 74 | \newcommand\minted@opt[1]{ 75 | \expandafter\detokenize% 76 | \expandafter\expandafter\expandafter{\csname minted@opt@#1\endcsname}} 77 | \newcommand\minted@define@opt[3][]{ 78 | \minted@defopt{#2} 79 | \ifthenelse{\equal{#1}{}}{ 80 | \define@key{minted@opt}{#2}{\@namedef{minted@opt@#2}{#3}}} 81 | {\define@key{minted@opt}{#2}[#1]{\@namedef{minted@opt@#2}{#3}}}} 82 | \newcommand\minted@define@switch[3][]{ 83 | \minted@defopt{#2} 84 | \define@booleankey{minted@opt}{#2} 85 | {\@namedef{minted@opt@#2}{#3}} 86 | {\@namedef{minted@opt@#2}{#1}}} 87 | \minted@defopt{extra} 88 | \newcommand\minted@define@extra[1]{ 89 | \define@key{minted@opt}{#1}{ 90 | \expandafter\def\expandafter\minted@opt@extra\expandafter{% 91 | \minted@opt@extra,#1=##1}}} 92 | \newcommand\minted@define@extra@switch[1]{ 93 | \define@booleankey{minted@opt}{#1} 94 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 95 | \minted@opt@extra,#1}} 96 | {\expandafter\def\expandafter\minted@opt@extra\expandafter{% 97 | \minted@opt@extra,#1=false}}} 98 | \minted@define@switch{texcl}{-P texcomments} 99 | \minted@define@switch{mathescape}{-P mathescape} 100 | \minted@define@switch{linenos}{-P linenos} 101 | \minted@define@switch{startinline}{-P startinline} 102 | \minted@define@switch[-P funcnamehighlighting=False]% 103 | {funcnamehighlighting}{-P funcnamehighlighting} 104 | \minted@define@opt{gobble}{-F gobble:n=#1} 105 | \minted@define@opt{bgcolor}{#1} 106 | \minted@define@extra{frame} 107 | \minted@define@extra{framesep} 108 | \minted@define@extra{framerule} 109 | \minted@define@extra{rulecolor} 110 | \minted@define@extra{numbersep} 111 | \minted@define@extra{firstnumber} 112 | \minted@define@extra{stepnumber} 113 | \minted@define@extra{firstline} 114 | \minted@define@extra{lastline} 115 | \minted@define@extra{baselinestretch} 116 | 
\minted@define@extra{xleftmargin} 117 | \minted@define@extra{xrightmargin} 118 | \minted@define@extra{fillcolor} 119 | \minted@define@extra{tabsize} 120 | \minted@define@extra{fontfamily} 121 | \minted@define@extra{fontsize} 122 | \minted@define@extra{fontshape} 123 | \minted@define@extra{fontseries} 124 | \minted@define@extra{formatcom} 125 | \minted@define@extra{label} 126 | \minted@define@extra@switch{numberblanklines} 127 | \minted@define@extra@switch{showspaces} 128 | \minted@define@extra@switch{resetmargins} 129 | \minted@define@extra@switch{samepage} 130 | \minted@define@extra@switch{showtabs} 131 | \minted@define@extra@switch{obeytabs} 132 | \newsavebox{\minted@bgbox} 133 | \newenvironment{minted@colorbg}[1]{ 134 | \def\minted@bgcol{#1} 135 | \noindent 136 | \begin{lrbox}{\minted@bgbox} 137 | \begin{minipage}{\linewidth-2\fboxsep}} 138 | {\end{minipage} 139 | \end{lrbox}% 140 | \colorbox{\minted@bgcol}{\usebox{\minted@bgbox}}} 141 | \newwrite\minted@code 142 | \newcommand\minted@savecode[1]{ 143 | \immediate\openout\minted@code\jobname.pyg 144 | \immediate\write\minted@code{#1} 145 | \immediate\closeout\minted@code} 146 | \newcommand\minted@pygmentize[2][\jobname.pyg]{ 147 | \def\minted@cmd{pygmentize -l #2 -f latex -F tokenmerge 148 | \minted@opt{gobble} \minted@opt{texcl} \minted@opt{mathescape} 149 | \minted@opt{startinline} \minted@opt{funcnamehighlighting} 150 | \minted@opt{linenos} -P "verboptions=\minted@opt{extra}" 151 | -o \jobname.out.pyg #1} 152 | \immediate\write18{\minted@cmd} 153 | % For debugging, uncomment: 154 | %\immediate\typeout{\minted@cmd} 155 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 156 | {} 157 | {\begin{minted@colorbg}{\minted@opt@bgcolor}} 158 | \input{\jobname.out.pyg} 159 | \ifthenelse{\equal{\minted@opt@bgcolor}{}} 160 | {} 161 | {\end{minted@colorbg}} 162 | \DeleteFile{\jobname.out.pyg}} 163 | \newcommand\minted@usedefaultstyle{\usemintedstyle{default}} 164 | \newcommand\usemintedstyle[1]{ 165 | \renewcommand\minted@usedefaultstyle{} 166 | \immediate\write18{pygmentize -S #1 -f latex > \jobname.pyg} 167 | \input{\jobname.pyg}} 168 | \newcommand\mint[3][]{ 169 | \DefineShortVerb{#3} 170 | \minted@resetoptions 171 | \setkeys{minted@opt}{#1} 172 | \SaveVerb[aftersave={ 173 | \UndefineShortVerb{#3} 174 | \minted@savecode{\FV@SV@minted@verb} 175 | \minted@pygmentize{#2} 176 | \DeleteFile{\jobname.pyg}}]{minted@verb}#3} 177 | \newcommand\minted@proglang[1]{} 178 | \newenvironment{minted}[2][] 179 | {\VerbatimEnvironment 180 | \renewcommand{\minted@proglang}[1]{#2} 181 | \minted@resetoptions 182 | \setkeys{minted@opt}{#1} 183 | \begin{VerbatimOut}[codes={\catcode`\^^I=12}]{\jobname.pyg}}% 184 | {\end{VerbatimOut} 185 | \minted@pygmentize{\minted@proglang{}} 186 | \DeleteFile{\jobname.pyg}} 187 | \newcommand\inputminted[3][]{ 188 | \minted@resetoptions 189 | \setkeys{minted@opt}{#1} 190 | \minted@pygmentize[#3]{#2}} 191 | \newcommand\newminted[3][]{ 192 | \ifthenelse{\equal{#1}{}} 193 | {\def\minted@envname{#2code}} 194 | {\def\minted@envname{#1}} 195 | \newenvironment{\minted@envname} 196 | {\VerbatimEnvironment\begin{minted}[#3]{#2}} 197 | {\end{minted}} 198 | \newenvironment{\minted@envname *}[1] 199 | {\VerbatimEnvironment\begin{minted}[#3,##1]{#2}} 200 | {\end{minted}}} 201 | \newcommand\newmint[3][]{ 202 | \ifthenelse{\equal{#1}{}} 203 | {\def\minted@shortname{#2}} 204 | {\def\minted@shortname{#1}} 205 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{ 206 | \mint[#3,##1]{#2}##2}} 207 | \newcommand\newmintedfile[3][]{ 208 | 
\ifthenelse{\equal{#1}{}}
209 | {\def\minted@shortname{#2file}}
210 | {\def\minted@shortname{#1}}
211 | \expandafter\newcommand\csname\minted@shortname\endcsname[2][]{
212 | \inputminted[#3,##1]{#2}{##2}}}
213 | \@ifundefined{minted@float@within}
214 | {\newfloat{listing}{h}{lol}}
215 | {\newfloat{listing}{h}{lol}[\minted@float@within]}
216 | \newcommand\listingscaption{Listing}
217 | \floatname{listing}{\listingscaption}
218 | \newcommand\listoflistingscaption{List of listings}
219 | \providecommand\listoflistings{\listof{listing}{\listoflistingscaption}}
220 | \AtBeginDocument{
221 | \minted@usedefaultstyle}
222 | \AtEndOfPackage{
223 | \ifnum\pdf@shellescape=1\relax\else
224 | \PackageError{minted}
225 | {You must invoke LaTeX with the
226 | -shell-escape flag}
227 | {Pass the -shell-escape flag to LaTeX. Refer to the minted.sty
228 | documentation for more information.}\fi
229 | \TestAppExists{pygmentize}
230 | \ifAppExists\else
231 | \PackageError{minted}
232 | {You must have `pygmentize' installed
233 | to use this package}
234 | {Refer to the installation instructions in the minted
235 | documentation for more information.}
236 | \fi}
237 | \endinput
238 | %%
239 | %% End of file `minted.sty'.
240 | --------------------------------------------------------------------------------
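Editor's sketch (not part of the repository): a minimal usage example for the package above. It assumes LaTeX is invoked with -shell-escape and that Pygments' pygmentize is on the PATH, which are exactly the two conditions enforced by the \AtEndOfPackage checks.

    \documentclass{article}
    \usepackage{minted}
    \begin{document}
    \usemintedstyle{default}
    % Inline form, with \verb-like delimiters:
    \mint{python}|print("hello")|
    % Environment form; linenos, mathescape and bgcolor map onto the
    % option hooks declared above:
    \begin{minted}[linenos, mathescape, bgcolor=lightgray]{python}
    def square(x):
        return x ** 2  # computes $x^2$
    \end{minted}
    \end{document}

Each snippet is written out to \jobname.pyg, run through pygmentize -l <lexer> -f latex, and the highlighted result is \input back into the document (see \minted@pygmentize above).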
/scripts/demo.py: --------------------------------------------------------------------------------
1 | """
2 | Understanding variable importances in forests of randomized trees.
3 | Gilles Louppe, Louis Wehenkel, Antonio Sutera and Pierre Geurts
4 | NIPS, Lake Tahoe, United States, 2013
5 | http://orbi.ulg.ac.be/handle/2268/155642
6 |
7 | This demo reproduces Table 2 from the paper. It also shows that Extra-
8 | Trees from Scikit-Learn and an ensemble of randomized ID3 trees (see ID3.py)
9 | give identical results.
10 |
11 | Figure 2 from the paper can be obtained using the 2d array of importance values
12 | yielded by a `RandomizedID3Ensemble` (see the commented code at the bottom).
13 |
14 | Author: Gilles Louppe
15 | License: BSD 3 clause
16 | """
17 | import itertools
18 | import numpy as np
19 |
20 | from sklearn.utils import check_random_state
21 |
22 |
23 | # Datasets ====================================================================
24 |
25 | def make_led(irrelevant=0):
26 |     """Exhaustively generate all samples from the 7-segment problem.
27 |
28 |     Parameters
29 |     ----------
30 |     irrelevant : int, optional (default=0)
31 |         The number of irrelevant binary features to add. Since samples are
32 |         generated exhaustively, this makes the size of the resulting dataset
33 |         2^(irrelevant) times larger.
34 |
35 |     Returns
36 |     -------
37 |     X, y
38 |     """
39 |     data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
40 |                      [1, 0, 1, 1, 1, 0, 1, 2],
41 |                      [1, 0, 1, 1, 0, 1, 1, 3],
42 |                      [0, 1, 1, 1, 0, 1, 0, 4],
43 |                      [1, 1, 0, 1, 0, 1, 1, 5],
44 |                      [1, 1, 0, 1, 1, 1, 1, 6],
45 |                      [1, 0, 1, 0, 0, 1, 0, 7],
46 |                      [1, 1, 1, 1, 1, 1, 1, 8],
47 |                      [1, 1, 1, 1, 0, 1, 1, 9],
48 |                      [1, 1, 1, 0, 1, 1, 1, 0]])
49 |
50 |     X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
51 |
52 |     if irrelevant > 0:
53 |         X_ = []
54 |         y_ = []
55 |
56 |         for i in xrange(10):
57 |             for s in itertools.product(range(2), repeat=irrelevant):
58 |                 X_.append(np.concatenate((X[i], s)))
59 |                 y_.append(i)
60 |
61 |         X = np.array(X_, dtype=np.bool)
62 |         y = np.array(y_)
63 |
64 |     return X, y
65 |
66 |
67 | def make_led_sample(n_samples=200, irrelevant=0, random_state=None):
68 |     """Generate random samples from the 7-segment problem.
69 |
70 |     Parameters
71 |     ----------
72 |     n_samples : int, optional (default=200)
73 |         The number of samples to generate.
74 |
75 |     irrelevant : int, optional (default=0)
76 |         The number of irrelevant binary features to add.
77 |
78 |     Returns
79 |     -------
80 |     X, y
81 |     """
82 |
83 |     random_state = check_random_state(random_state)
84 |
85 |     data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
86 |                      [1, 0, 1, 1, 1, 0, 1, 2],
87 |                      [1, 0, 1, 1, 0, 1, 1, 3],
88 |                      [0, 1, 1, 1, 0, 1, 0, 4],
89 |                      [1, 1, 0, 1, 0, 1, 1, 5],
90 |                      [1, 1, 0, 1, 1, 1, 1, 6],
91 |                      [1, 0, 1, 0, 0, 1, 0, 7],
92 |                      [1, 1, 1, 1, 1, 1, 1, 8],
93 |                      [1, 1, 1, 1, 0, 1, 1, 9],
94 |                      [1, 1, 1, 0, 1, 1, 1, 0]])
95 |
96 |     data = data[random_state.randint(0, 10, n_samples)]
97 |     X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7]
98 |
99 |     if irrelevant > 0:
100 |         X = np.hstack((X, random_state.rand(n_samples, irrelevant) > 0.5))
101 |
102 |     return X, y
103 |
104 |
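# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original script). `mdi_importance` in the
# Formulae section below is a direct implementation of Equation 3 / Theorem 1
# of the paper, which in the paper's notation reads
#
#     Imp(X_m) = \sum_{k=0}^{p-1} \frac{1}{C_p^k (p - k)}
#                \sum_{B \in {\cal P}_k(V^{-m})} I(X_m; Y | B),
#
# i.e. a sum of conditional mutual information terms I(X_m; Y | B) over all
# conditioning subsets B of the other variables, where the combinatorial
# factor weights each subset of size k. A quick sanity check of the
# generators above and of the identity quoted in the demo:
#
#     >>> X, y = make_led()
#     >>> X.shape, y.shape
#     ((10, 7), (10,))
#     >>> entropy(y)   # 10 equiprobable classes: log2(10), about 3.3219
#     3.3219...
#
# (`entropy` is defined in the Formulae section just below.)
# ---------------------------------------------------------------------------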
105 | # Formulae ====================================================================
106 |
107 | from gmpy import comb
108 |
109 | def binomial(k, n):
110 |     """Return the number of combinations of k elements among a collection of
111 |     size n."""
112 |     if k < 0:
113 |         return 0
114 |     elif k > n:
115 |         return 0
116 |     else:
117 |         return comb(int(n), int(k))
118 |
119 |
120 | def entropy(X):
121 |     """Return the entropy (in base 2) of a discrete variable X, encoded as a
122 |     1d array."""
123 |     e = 0.
124 |     n_samples = len(X)
125 |
126 |     for count in np.bincount(X):
127 |         p = 1. * count / n_samples
128 |
129 |         if p > 0:
130 |             e -= p * np.log2(p)
131 |
132 |     return e
133 |
134 | def mdi_importance(X_m, X, y):
135 |     """The MDI importance of X_m for Y, as computed with an infinite ensemble
136 |     of fully developed totally randomized trees.
137 |
138 |     This is a direct implementation of Equation 3 from the paper.
139 |
140 |     Parameters
141 |     ----------
142 |     X_m : int
143 |         The variable for which the importance is computed. It corresponds
144 |         to the column in X (from 0 to p-1).
145 |
146 |     X : array of shape (N, p)
147 |         The input data (X_0, X_1, ... X_{p-1}). X should be large enough
148 |         to accurately represent the actual data distribution.
149 |
150 |     y : array of shape (N,)
151 |         The Y variable.
152 |
153 |     Returns
154 |     -------
155 |     imp : array of size (p,)
156 |         The decomposition of the importance of X_m along its degree of
157 |         interaction with the other input variables, i.e. the p outer terms
158 |         in Equation 3. The actual importance Imp(X_m) amounts to np.sum(imp).
159 |     """
160 |     n_samples, p = X.shape
161 |
162 |     variables = range(p)
163 |     variables.pop(X_m)
164 |     imp = np.zeros(p)
165 |
166 |     values = []
167 |     for i in xrange(p):
168 |         values.append(np.unique(X[:, i]))
169 |
170 |     for k in xrange(p):
171 |         # Weight of each B of size k
172 |         coef = 1. / (binomial(k, p) * (p - k))
173 |
174 |         # For all B of size k
175 |         for B in itertools.combinations(variables, k):
176 |             # For all values B=b
177 |             for b in itertools.product(*[values[B[j]] for j in xrange(k)]):
178 |                 mask_b = np.ones(n_samples, dtype=np.bool)
179 |
180 |                 for j in xrange(k):
181 |                     mask_b &= X[:, B[j]] == b[j]
182 |
183 |                 X_, y_ = X[mask_b, :], y[mask_b]
184 |                 n_samples_b = len(X_)
185 |
186 |                 if n_samples_b > 0:
187 |                     children = []
188 |
189 |                     for xi in values[X_m]:
190 |                         mask_xi = X_[:, X_m] == xi
191 |                         children.append(y_[mask_xi])
192 |
193 |                     imp[k] += (coef
194 |                                * (1. * n_samples_b / n_samples)  # P(B=b)
195 |                                * (entropy(y_) -
196 |                                   sum([entropy(c) * len(c) / n_samples_b
197 |                                        for c in children])))
198 |
199 |     return imp
200 |
201 |
202 | # Demo ========================================================================
203 |
204 | if __name__ == "__main__":
205 |     # Generate data
206 |     n_trees = 5000
207 |
208 |     X, y = make_led()
209 |     p = X.shape[1]
210 |
211 |     results = np.empty((p, p + 1))
212 |
213 |     # Theoretical values
214 |     for i in range(p):
215 |         results[i, 0] = sum(mdi_importance(i, X, y))
216 |
217 |     # Empirical results
218 |     for i in range(p):
219 |         # Using scikit-learn
220 |         from sklearn.ensemble import ExtraTreesClassifier
221 |         clf = ExtraTreesClassifier(n_estimators=n_trees,
222 |                                    max_features=i + 1,
223 |                                    criterion="entropy",
224 |                                    n_jobs=-1).fit(X, y)
225 |
226 |         # Note: Variable importances in Scikit-Learn are normalized by
227 |         # default. Use normalize=False to disable normalization.
228 |
229 |         results[:, i + 1] = sum(tree.tree_.compute_feature_importances(normalize=False)
230 |                                 for tree in clf.estimators_) / clf.n_estimators
231 |
232 |         # # Using a simplistic (but slower) randomized ID3 tree classifier
233 |         # from ID3 import RandomizedID3Classifier, RandomizedID3Ensemble
234 |         # clf = RandomizedID3Ensemble(n_estimators=n_trees,
235 |         #                             base_estimator=RandomizedID3Classifier(k=i + 1)).fit(X, y)
236 |
237 |         # # Note: Here clf.feature_importances_ is a 2d array of shape (p, p).
238 |         # #       In particular, it could be used to regenerate Figure 2 from
239 |         # #       the paper.
240 |
241 |         # results[:, i + 1] = np.sum(clf.feature_importances_, axis=1)
242 |
243 |
244 |     # Print results
245 |     print "Table 2:"
246 |     print "Variable importances as computed with an ensemble of randomized " \
247 |           "trees, for increasing values of $K$. Importances at $K=1$ follow " \
248 |           "their theoretical values, as predicted by Equation 3 in Theorem 1. " \
249 |           "However, as $K$ increases, importances diverge due to masking " \
250 |           "effects. In accordance with Theorem 2, their sum is also always " \
251 |           "equal to $I(X_{1}, ..., X_{7}; Y) = H(Y) = log2(10) = 3.321$ " \
252 |           "since the inputs allow the output to be perfectly predicted."
253 | print 254 | 255 | print "\tEqn.3", 256 | for m in range(p): 257 | print "\tK=%d" % (m + 1), 258 | print 259 | 260 | for m in range(p): 261 | print "X_%d" % (m + 1), 262 | for j in range(p + 1): 263 | print "\t%.4f" % results[m, j], 264 | print 265 | 266 | print "Sum", 267 | for j in range(p + 1): 268 | print "\t%.4f" % sum(results[:, j]), 269 | -------------------------------------------------------------------------------- /tex/frontback/notations.tex: -------------------------------------------------------------------------------- 1 | % Notations ==================================================================== 2 | 3 | \chapter{Notations} 4 | 5 | \begin{tabularx}{\textwidth}{ l X } 6 | ${\cal A}$ & A supervised learning algorithm \dotfill \pageref{ntn:A}\\ 7 | ${\cal A}(\theta, {\cal L})$ & The model $\varphi_{\cal L}$ produced by algorithm ${\cal A}$ over ${\cal L}$ and hyper-parameters $\theta$ \dotfill \pageref{ntn:A-func}\\ 8 | $\alpha_s$ & The proportion of samples in a random patch \dotfill \pageref{ntn:alpha_s}\\ 9 | $\alpha_f$ & The proportion of features in a random patch \dotfill \pageref{ntn:alpha_f}\\ 10 | $b_l$ & The $l$-th value of a categorical variable \dotfill \pageref{ntn:b_l}\\ 11 | $B$ & A subset $B \subseteq V$ of variables \dotfill \pageref{ntn:B}\\ 12 | $c_k$ & The $k$-th class \dotfill \pageref{ntn:c_k}\\ 13 | $C^k_p$ & The number of $k$-combinations from a set of $p$ elements \dotfill \pageref{ntn:C_k_p}\\ 14 | $C(N)$ & The time complexity for splitting $N$ samples \dotfill \pageref{ntn:cN}\\ 15 | $\mathbb{E}$ & Expectation \dotfill \\ 16 | $\overline{E}(\varphi_{\cal L}, {\cal L}^\prime)$ & The average prediction error of $\varphi_{\cal L}$ over ${\cal L}^\prime$ \dotfill \pageref{ntn:E_bar}\\ 17 | $Err(\varphi_{\cal L})$ & The generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:generalization-error}, \pageref{eqn:4:generalization-error}\\ 18 | %$\widehat{Err}^\text{train}(\varphi_{\cal L})$ & The resubstitution estimate or training sample estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:training-error}\\ 19 | %$\widehat{Err}^\text{test}(\varphi_{\cal L})$ & The test sample estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:test-error}\\ 20 | %$\widehat{Err}^\text{CV}(\varphi_{\cal L})$ & The cross-validation estimate of the generalization error of $\varphi_{\cal L}$ \dotfill \pageref{eqn:cv-error}\\ 21 | %$\widehat{Err}^\text{OOB}(\psi_{\cal L})$ & The out-of-bag estimate of the generalization error of $\psi_{\cal L}$ \dotfill \pageref{eqn:oob-error}\\ 22 | $H(X)$ & The Shannon entropy of $X$ \dotfill \pageref{eqn:6:entropy}\\ 23 | $H(X|Y)$ & The Shannon entropy of $X$ conditional to $Y$\dotfill \pageref{eqn:6:entropy-cond}\\ 24 | ${\cal H}$ & The space of candidate models \dotfill \pageref{ntn:H}\\ 25 | $i(t)$ & The impurity of node $t$ \dotfill \pageref{ntn:i_t}, \pageref{ntn:i_t2}\\ 26 | $i_R(t)$ & The impurity of node $t$ based on the local resubstitution estimate \dotfill \pageref{eqn:impurity:error},~\pageref{eqn:impurity:variance}\\ 27 | $i_H(t)$ & The entropy impurity of node $t$ \dotfill \pageref{eqn:impurity:shannon}\\ 28 | $i_G(t)$ & The Gini impurity of node $t$ \dotfill \pageref{eqn:impurity:gini}\\ 29 | $\Delta i(s, t)$ & The impurity decrease of the split $s$ at node $t$ \dotfill \pageref{def:impurity-decrease}\\ 30 | $I(X;Y)$ & The mutual information between $X$ and $Y$ \dotfill \pageref{eqn:6:mi}\\ 31 | $\text{Imp}(X_j)$ & The variable importance of $X_j$ 
\dotfill \pageref{eq:mdi}, \pageref{eq:mda}\\ 32 | $J$ & The number of classes \dotfill \pageref{ntn:J}\\ 33 | $K$ & The number of folds in cross-validation \dotfill \pageref{ntn:K-cv} \newline The number of input variables drawn at each node for finding a split \dotfill \pageref{ntn:K-split} \\ 34 | $K(\mathbf{x}_i, \mathbf{x}_j)$ & The kernel of $\mathbf{x}_i$ and $\mathbf{x}_j$ \dotfill \pageref{ntn:kernel}, \pageref{ntn:kernel2}\\ 35 | $L$ & A loss function \dotfill \pageref{ntn:L}\newline The number of values of a categorical variable \dotfill \pageref{ntn:L2}\\ 36 | ${\cal L}$ & A learning set $(\mathbf{X}, \mathbf{y})$ \dotfill \pageref{ntn:learning-set}\\ 37 | ${\cal L}^m$ & The $m$-th bootstrap replicate of ${\cal L}$ \dotfill \pageref{ntn:L_m}\\ 38 | ${\cal L}_t$ & The subset of node samples falling into node $t$ \dotfill \pageref{ntn:L_t}\\ 39 | $M$ & The number of base models in an ensemble \dotfill \pageref{ntn:M}\\ 40 | $\mu_{{\cal L},\theta_m}(\mathbf{x})$ & The mean prediction at $X = \mathbf{x}$ of $\varphi_{{\cal L},\theta_m}$ \dotfill \pageref{eqn:4:mu} \\ 41 | $N$ & The number of input samples \dotfill \pageref{ntn:N}\\ 42 | $N_t$ & The number of node samples in node $t$ \dotfill \pageref{ntn:N_t}\\ 43 | $N_{ct}$ & The number of node samples of class $c$ in node $t$ \dotfill \pageref{ntn:N_ct}\\ 44 | $\Omega$ & The universe, or population, from which cases are sampled \dotfill \pageref{ntn:omega}\\ 45 | $p$ & The number of input variables \dotfill \pageref{ntn:p}\\ 46 | $p_L$ & The proportion of node samples going to $t_L$ \dotfill \pageref{ntn:p_L}\\ 47 | $p_R$ & The proportion of node samples going to $t_R$ \dotfill \pageref{ntn:p_R}\\ 48 | $p(t)$ & The estimated probability $p(X \in {\cal X}_t)=\tfrac{N_t}{N}$ \dotfill \pageref{ntn:p_t}\\ 49 | $p(c|t)$ & The empirical probability estimate $p(Y=c | X \in {\cal X}_t)=\tfrac{N_{ct}}{N_t}$ of class $c$ at node $t$ \dotfill \pageref{ntn:p_ct}\\ 50 | $\widehat{p}_{\cal L}$ & An empirical probability estimate computed from the learning set ${\cal L}$\dotfill \pageref{eqn:4:proba-estimates}\\ 51 | $P(X,Y)$ & The joint probability distribution of the input variables $X=(X_1,\dots,X_p)$ and the output variable $Y$ \dotfill \pageref{ntn:P_XY}\\ 52 | ${\cal P}_k(V)$ & The set of subsets of $V$ of size $k$ \dotfill \pageref{ntn:P_k}\\ 53 | $\varphi$ & A model or function ${\cal X} \mapsto {\cal Y}$ \dotfill \pageref{ntn:varphi}\newline A single decision tree \dotfill \pageref{ntn:tree}\\ 54 | $\widetilde{\varphi}$ & The set of terminal nodes in $\varphi$ \dotfill \pageref{ntn:varphi-leafs}\\ 55 | $\varphi(\mathbf{x})$ & The prediction of $\varphi$ for the sample $\mathbf{x}$ \dotfill \pageref{ntn:varphi-x}\\ 56 | $\varphi_{\cal L}$ & A model built from ${\cal L}$ \dotfill \pageref{ntn:varphi-L}\\ 57 | $\varphi_{{\cal L},\theta}$ & A model built from ${\cal L}$ with random seed $\theta$ \dotfill \pageref{ntn:varphi-Ltheta}\\ 58 | $\varphi_B$ & A Bayes model \dotfill \pageref{ntn:varphi-B}\\ 59 | $\psi_{{\cal L},\theta_1,\dots,\theta_M}$ & An ensemble of $M$ models built from ${\cal L}$ and random seeds $\theta_1, \dots, \theta_M$ \dotfill \pageref{ntn:psi} \\ 60 | ${\cal Q}$ & A set ${\cal Q} \subseteq {\cal S}$ of splits of restricted structure \dotfill \pageref{ntn:Q}, \pageref{ntn:Q2}\\ 61 | ${\cal Q}(X_j)$ & The set ${\cal Q}(X_j) \subseteq {\cal Q}$ of univariate binary splits that can be defined on variable $X_j$ \dotfill \pageref{eqn:q:ordered}, \pageref{eqn:q:categorical-cart}\\ 62 | $\rho(\mathbf{x})$ & The correlation 
coefficient between the predictions at $X=\mathbf{x}$ of two randomized models \dotfill \pageref{eqn:4:correlation} \\
63 | $s$ & A split \dotfill \pageref{ntn:s}, \pageref{ntn:s2}\\
64 | $s^*$ & The best split \dotfill \pageref{ntn:s-star}, \pageref{eqn:best-best-split}\\
65 | $s^*_j$ & The best binary split defined on variable $X_j$ \dotfill \pageref{ntn:s-star}, \pageref{eqn:best-split-single}\\
66 | $s_j^v$ & The binary split $(\{\mathbf{x}|x_j \leq v\}, \{\mathbf{x}|x_j > v\})$ defined on variable $X_j$ with discretization threshold $v$ \dotfill \pageref{ntn:s_jv}\\
67 | $s_t$ & The split labeling node $t$ \dotfill \pageref{ntn:s_t}\\
68 | $\tilde{s}^j_t$ & The best surrogate split for $s_t$ defined from $X_j$ \dotfill \pageref{ntn:s-surrogate}\\
69 | ${\cal S}$ & The set of all possible splits $s$ \dotfill \pageref{ntn:S-all}\\
70 | $\sigma^2_{{\cal L},\theta_m}(\mathbf{x})$ & The prediction variance at $X = \mathbf{x}$ of $\varphi_{{\cal L},\theta_m}$ \dotfill \pageref{eqn:4:sigma} \\
71 | $t$ & A node in a decision tree \dotfill \pageref{ntn:node}\\
72 | $t_L$ & The left child of node $t$ \dotfill \pageref{ntn:t_L}, \pageref{ntn:t_L2}\\
73 | $t_R$ & The right child of node $t$ \dotfill \pageref{ntn:t_R}, \pageref{ntn:t_R2}\\
74 | $\theta$ & A vector of hyper-parameter values \dotfill \pageref{ntn:theta}\newline A random seed \dotfill \pageref{ntn:theta-seed}\\
75 | $\theta^*$ & The optimal hyper-parameters \dotfill \pageref{ntn:theta-star}\\
76 | $\widehat{\theta}^*$ & The approximately optimal hyper-parameters \dotfill \pageref{ntn:theta-star-approx}\\
77 | $\theta_m$ & The seed of the $m$-th model in an ensemble \dotfill \pageref{ntn:theta-seed-m}\\
78 | $v$ & A discretization threshold in a binary split \dotfill \pageref{ntn:v}\\
79 | $v_k$ & The $k$-th value of an ordered variable, when node samples are in sorted order \dotfill \pageref{ntn:v_k}\\
80 | $v_k^\prime$ & The mid-cut point between $v_k$ and $v_{k+1}$ \dotfill \pageref{ntn:v_k_prime}\\
81 | $V$ & The set $\{X_1, \dots, X_p\}$ of input variables \dotfill \pageref{ntn:V}\\
82 | $V^{-j}$ & $V \setminus \{X_j\}$ \dotfill \pageref{ntn:V-j}\\
83 | $\mathbb{V}$ & Variance \dotfill \\
84 | $\textbf{x}$ & A case, sample or input vector $(x_1, \dots, x_p)$ \dotfill \pageref{ntn:sample-x}\\
85 | $\textbf{x}_i$ & The $i$-th input sample in ${\cal L}$ \dotfill \pageref{ntn:sample-x_i}\\
86 | $x_j$ & The value of variable $X_j$ for the sample $\textbf{x}$ \dotfill \pageref{ntn:value-x_j}\\
87 | $\textbf{X}$ & The $N\times p$ matrix representing the values of all $N$ samples for all $p$ input variables \dotfill \pageref{ntn:matrix-X}\\
88 | $X_j$ & The $j$-th input variable or feature \dotfill \pageref{ntn:var-X_j}, \pageref{ntn:var-X_j2}\\
89 | $X$ & The random vector $(X_1,\dots,X_p)$ \dotfill \pageref{ntn:vector-X}\\
90 | ${\cal X}_j$ & The domain or space of variable $X_j$ \dotfill \pageref{ntn:space-X_j}\\
91 | ${\cal X}$ & The input space ${\cal X}_1 \times \dots \times {\cal X}_p$ \dotfill \pageref{ntn:space-X}\\
92 | ${\cal X}_t$ & The subspace ${\cal X}_t \subseteq {\cal X}$ represented by node $t$ \dotfill \pageref{ntn:node-space}\\
93 | $y$ & A value of the output variable $Y$ \dotfill \pageref{ntn:value-y}\\
94 | $\widehat{y}_t$ & The value labelling node $t$ \dotfill \pageref{ntn:y_t}\\
95 | $\widehat{y}_t^*$ & The optimal value labelling node $t$ \dotfill \pageref{ntn:y_t-star}\\
96 | $\mathbf{y}$ & The output values $(y_1,\dots,y_N)$ \dotfill \pageref{ntn:vector-y}\\
97 | $Y$ & The output or response variable $Y$ \dotfill \pageref{ntn:var-Y}\\
98 | ${\cal Y}$ & The domain or space of variable $Y$ \dotfill \pageref{ntn:space-Y}\\
99 | \end{tabularx}
100 | --------------------------------------------------------------------------------
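A worked illustration tying several of the entries above together (editor's addition; it uses only quantities defined in this table): a binary split $s$ of node $t$ sends a proportion $p_L$ of the node samples to $t_L$ and a proportion $p_R$ to $t_R$, so its impurity decrease is

    \Delta i(s, t) = i(t) - p_L \, i(t_L) - p_R \, i(t_R),

and the best binary split on variable $X_j$ is $s^*_j = \arg\max_{s \in {\cal Q}(X_j)} \Delta i(s, t)$.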
/benchmarks/visualize.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import glob
3 | import json
4 | import sys
5 |
6 | import matplotlib
7 | import matplotlib.pyplot as plt
8 | import brewer2mpl
9 |
10 | cmap_curve = [(1.0, 0, 0), (0, 0, 1.0)]  # one color per plotted series (red, blue)
11 | cmap_bar = brewer2mpl.get_map('RdYlGn', 'diverging', 9).mpl_colors
12 |
13 | layout = {
14 |     "RandomForestClassifier": {"name": "Scikit-Learn-RF", "order": 0},
15 |     "RandomForestRegressor": {"name": "Scikit-Learn-RF", "order": 0},
16 |     "ExtraTreesClassifier": {"name": "Scikit-Learn-ETs", "order": 1},
17 |     "ExtraTreesRegressor": {"name": "Scikit-Learn-ETs", "order": 1},
18 |     "OpenCV": {"name": "OpenCV-RF", "order": 2},
19 |     "OpenCV-ETs": {"name": "OpenCV-ETs", "order": 3},
20 |     "OK3-RandomForest": {"name": "OK3-RF", "order": 4},
21 |     "OK3-ExtraTrees": {"name": "OK3-ETs", "order": 5},
22 |     "R-randomForest": {"name": "R-RF", "order": 7},
23 |     "Weka": {"name": "Weka-RF", "order": 6},
24 |     "Orange": {"name": "Orange-RF", "order": 8},
25 | }
26 |
27 |
28 | def get(data, field, first=True):  # walk a nested dict along a "__"-separated path
29 |     d = data
30 |     for token in field.split("__"):
31 |         d = d[token]
32 |
33 |     if isinstance(d, list) and first:
34 |         return d[0]
35 |     else:
36 |         return d
37 |
38 |
39 | def groupby(filenames, group_fields, param_field, stat_field):  # -> {group_key: sorted [(param, stat)]}
40 |     all_data = {}
41 |
42 |     for filename in filenames:
43 |         with open(filename, "r") as fd:
44 |             data = json.load(fd)
45 |
46 |         key = []
47 |         for field in group_fields:
48 |             key.append(get(data, field))
49 |         key = tuple(key)
50 |
51 |         if key not in all_data:
52 |             all_data[key] = []
53 |
54 |         all_data[key].append((get(data, param_field), get(data, stat_field, first=False)))
55 |
56 |     for key in all_data:
57 |         all_data[key] = sorted(all_data[key])
58 |
59 |     return all_data
60 |
61 |
62 | def plot_curve(all_data, x_label=None, y_label=None, width=0.2, curve=True, filename=None):
63 |     matplotlib.rc("font", size=13)
64 |     title = all_data.keys()[0][1]
65 |     title = title.split(".")[0]
66 |
67 |     all_data = sorted([(layout[key[0]]["order"], layout[key[0]]["name"], all_data[key]) for key in all_data])
68 |     offset = len(all_data) * width + width/2.0
69 |
70 |     fig, ax = plt.subplots()
71 |
72 |     for i, (key, name, data) in enumerate(all_data):
73 |         xticks = [t[0] for t in data]
74 |         x = [offset*t[0]+i*width for t in data]
75 |         y = [np.mean(t[1]) for t in data]
76 |
77 |         if x_label == "n_estimators":  # keep only the runs with fewer than 1000 trees
78 |             xticks = [t[0] for t in data if t[0] < 1000]
79 |             x = [offset*t[0]+i*width for t in data if t[0] < 1000]
80 |             y = [np.mean(t[1]) for t in data if t[0] < 1000]
81 |
82 |         if y_label == "MSE":  # the scorer stores negated MSE; flip the sign back
83 |             y = [-y_i for y_i in y]
84 |
85 |         if curve:
86 |             ax.plot(xticks, y, label=name, color=cmap_curve[i])
87 |         else:
88 |             ax.bar(x, y, width=width, label=name, color=cmap_curve[i])
89 |
90 |     if curve:
91 |         ax.set_xlim(xticks[0], xticks[-1])
92 |     else:
93 |         ax.set_xlim(-2*width+x[0], x[-1]+2*width)
94 |         ax.set_xticks(x)
95 |         ax.set_xticklabels(xticks)
96 |
97 |     if x_label is not None: ax.set_xlabel(x_label)
98 |     if y_label is not None: ax.set_ylabel(y_label)
99 |
100 |     ax.set_title(title)
101 |     ax.legend(loc="best")
102 |
103 |     if filename:
104 |         plt.savefig("%s.pdf" % filename)
105 |         plt.savefig("%s.jpg" % filename)
106 |         plt.close("all")
107 |     else:
108 |         plt.show()
109 |
110 |
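# ---------------------------------------------------------------------------
# Editor's sketch (hypothetical file names, not part of the original script):
# plot_curve is typically fed by groupby, e.g. to compare fit times across
# libraries as the number of trees grows on a given dataset:
#
#     files = glob.glob("output/n_estimators_*_mnist.npz*")
#     data = groupby(files, ["estimator", "generator"],
#                    "params__n_estimators", "stats__time_fit")
#     plot_curve(data, x_label="n_estimators", y_label="Fit time (s)")
#
# Field paths use "__" to walk nested JSON keys (see get above), so that
# "stats__time_fit" reads data["stats"]["time_fit"].
# ---------------------------------------------------------------------------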
111 | def plot_bar(all_data, y_label=None, width=0.2, filename=None):
112 |     title = all_data.keys()[0][1]
113 |     title = title.split(".")[0]
114 |
115 |     all_data = sorted([(layout[key[0]]["order"], layout[key[0]]["name"], all_data[key]) for key in all_data])
116 |     fig, ax = plt.subplots()
117 |
118 |     for i, (key, name, data) in enumerate(all_data):
119 |         y_mean = np.mean(data[0][1])
120 |         rects = ax.bar([i*width], [y_mean], width=width, label=name, color=cmap_bar[key])
121 |         rect = rects[0]
122 |         plt.text(rect.get_x() + rect.get_width() / 2.0, rect.get_height(), '%.2f' % y_mean, ha='center', va='bottom', fontsize=9)
123 |
124 |     if y_label is not None: ax.set_ylabel(y_label)
125 |     ax.set_title(title)
126 |     ax.set_xticks([])
127 |     ax.set_xlim(-width, len(layout)*width-width)
128 |     ax.legend(loc="best", prop={"size": 9})
129 |
130 |     if filename:
131 |         plt.savefig("%s.pdf" % filename)
132 |         plt.savefig("%s.jpg" % filename)
133 |         plt.close("all")
134 |     else:
135 |         plt.show()
136 |
137 |
138 | def make_5_4_1():
139 |     # Plot results on artificial data
140 |     regression = ["make_friedman1", "make_friedman2", "make_friedman3"]
141 |     classification = ["make_hastie_10_2", "make_waveforms", "make_twonorm", "make_threenorm", "make_ringnorm"]
142 |
143 |     params = [("n_estimators", "params__n_estimators", True),
144 |               ("max_features", "params__max_features", False),
145 |               ("bootstrap", "params__bootstrap", False),
146 |               ("n_train", "stats__n_train", True),
147 |               ("n_features", "stats__n_features", True)]
148 |
149 |     stats = [("time_fit", "Fit time (s)"),
150 |              ("time_predict", "Predict time (s)"),
151 |              ("score_make_scorer(accuracy_score)", "Accuracy"),
152 |              ("score_make_scorer(roc_auc_score, needs_threshold=True)", "AUC"),
153 |              ("score_make_scorer(mean_squared_error, greater_is_better=False)", "MSE"),
154 |              ("score_make_scorer(r2_score)", "R2"),
155 |              ("leaves", "Leaves"),
156 |              ("average_depth", "Average depth")]
157 |
158 |     for dataset in regression+classification:
159 |         for prefix, param_field, curve in params:
160 |             files = [f for f in glob.glob("output/%s_*_%s*" % (prefix, dataset))]
161 |
162 |             if len(files) == 0:
163 |                 continue
164 |
165 |             for stat_field, label in stats:
166 |                 print dataset, prefix, stat_field
167 |
168 |                 try:
169 |                     plot_curve(groupby(files, ["estimator", "generator"], param_field, "stats__%s" % stat_field),
170 |                                x_label=prefix,
171 |                                y_label=label,
172 |                                filename="figs/generators/%s/%s_%s" % (dataset, prefix, stat_field),
173 |                                curve=curve)
174 |                 except:  # not every stat exists for every run; skip missing combinations
175 |                     print "Failed!"
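# ---------------------------------------------------------------------------
# Editor's sketch: the JSON files consumed above are assumed to look roughly
# like the record below (field names inferred from the get/groupby calls in
# this script; lists hold one value per benchmark repetition and are averaged
# downstream with np.mean):
#
#     {"estimator": "RandomForestClassifier",
#      "generator": "mnist.npz",
#      "params": {"n_estimators": 100, "bootstrap": true},
#      "stats": {"time_fit": [12.3], "time_predict": [0.8],
#                "score_make_scorer(accuracy_score)": [0.97]}}
# ---------------------------------------------------------------------------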
176 | 177 | def make_5_4_2_plots(): 178 | # Plot results on datasets 179 | datasets = ["diabetes.npz", "dig44.npz", "ionosphere.npz", "pendigits.npz", 180 | "letter.npz", "liver.npz", "musk2.npz", "ring-norm.npz", "satellite.npz", 181 | "segment.npz", "sonar.npz", "spambase.npz", "two-norm.npz", "vehicle.npz", 182 | "vowel.npz", "waveform.npz", "cifar10.npz", "mnist3vs8.npz", "mnist4vs9.npz", "mnist.npz", 183 | "isolet.npz", "arcene.npz", "breast2.npz", "madelon.npz", "marti0.npz", 184 | "reged0.npz", "secom.npz", "tis.npz", "sido0.npz"] 185 | 186 | for dataset in datasets: 187 | print dataset 188 | files = glob.glob("output/default_*_%s*" % dataset) 189 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__time_fit"), y_label="Fit time (s)", filename="figs/datasets/%s_fit" % dataset) 190 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__time_predict"), y_label="Predict time (s)", filename="figs/datasets/%s_predict" % dataset) 191 | plot_bar(groupby(files, ["estimator", "generator"], "estimator", "stats__score_make_scorer(accuracy_score)"), y_label="Accuracy", filename="figs/datasets/%s_accuracy" % dataset) 192 | 193 | 194 | def make_5_4_2_table(): 195 | impls = ["RandomForestClassifier", 196 | "ExtraTreesClassifier", 197 | "OpenCV", 198 | "OpenCV-ETs", 199 | "OK3-RandomForest", 200 | "OK3-ExtraTrees", 201 | "Weka", 202 | "R-randomForest", 203 | "Orange"] 204 | 205 | datasets = ["diabetes.npz", "dig44.npz", "ionosphere.npz", "pendigits.npz", 206 | "letter.npz", "liver.npz", "musk2.npz", "ring-norm.npz", "satellite.npz", 207 | "segment.npz", "sonar.npz", "spambase.npz", "two-norm.npz", "vehicle.npz", 208 | "vowel.npz", "waveform.npz", "cifar10.npz", "mnist3vs8.npz", "mnist4vs9.npz", "mnist.npz", 209 | "isolet.npz", "arcene.npz", "breast2.npz", "madelon.npz", "marti0.npz", 210 | "reged0.npz", "secom.npz", "tis.npz", "sido0.npz"] 211 | 212 | all_stats = {} 213 | 214 | for dataset in datasets: 215 | all_stats[dataset] = {} 216 | files = glob.glob("output/default_*_%s*" % dataset) 217 | data = groupby(files, ["estimator", "generator"], "estimator", "stats__time_predict") 218 | 219 | for (estimator, _), s in data.items(): 220 | all_stats[dataset][estimator] = np.mean(s[0][1]) 221 | 222 | table = np.zeros((len(datasets), len(impls))) 223 | 224 | for i, (dataset, stats) in enumerate(sorted(all_stats.items())): 225 | for j, impl in enumerate(impls): 226 | if impls[j] in stats: 227 | table[i, j] = stats[impls[j]] 228 | else: 229 | table[i, j] = np.inf 230 | 231 | speedups = np.zeros(table.shape) 232 | 233 | for i, dataset in enumerate(sorted(datasets)): 234 | for j, impl in enumerate(impls): 235 | speedups[i, j] = table[i, j] / table[i, 0] 236 | 237 | speedups = np.ma.masked_array(speedups, np.isinf(speedups)) 238 | 239 | print "\\begin{tabular}{|c|", 240 | for j, impl in enumerate(impls): 241 | print "c", 242 | print "|}" 243 | print "\\hline" 244 | 245 | for j, impl in enumerate(impls): 246 | print "&", layout[impl]["name"], 247 | print "\\\\" 248 | print "\\hline" 249 | print "\\hline" 250 | 251 | for i, dataset in enumerate(sorted(datasets)): 252 | print "\\textsc{%s}" % dataset.split(".")[0], 253 | min_j = np.argmin(speedups[i]) 254 | 255 | for j, impl in enumerate(impls): 256 | if j == min_j: 257 | print "& \\textbf{%.2f}" % speedups[i, j], 258 | else: 259 | print "& %.2f" % speedups[i, j], 260 | print "\\\\" 261 | print "\\hline" 262 | print "\\hline" 263 | 264 | print "\\textit{Average}", 265 | means = speedups.mean(axis=0) 266 | min_j = 
np.argmin(means)
267 |     for j, m in enumerate(means):
268 |         if j == min_j:
269 |             print "& \\textbf{%.2f}" % m,
270 |         else:
271 |             print "& %.2f" % m,
272 |     print "\\\\"
273 |     print "\\textit{Median}",
274 |     medians = np.ma.median(speedups, axis=0)
275 |     min_j = np.argmin(medians)
276 |     for j, m in enumerate(medians):
277 |         if j == min_j:
278 |             print "& \\textbf{%.2f}" % m,
279 |         else:
280 |             print "& %.2f" % m,
281 |     print "\\\\"
282 |     print "\\hline"
283 |     print "\\end{tabular}"
284 |
285 | if __name__ == "__main__":
286 |     # make_5_4_1()
287 |     make_5_4_2_plots()
288 |     #make_5_4_2_table()
289 |
290 |
-------------------------------------------------------------------------------- /tex/chapters/chapter01.tex: --------------------------------------------------------------------------------
1 | \chapter{Introduction}\label{ch:introduction}
2 |
3 | In various fields of science, technology and the humanities, such as biology,
4 | meteorology, medicine or finance, to cite a few, experts aim at predicting a
5 | phenomenon based on past observations or measurements. For instance,
6 | meteorologists try to forecast the weather for the next days from the climatic
7 | conditions of the previous days. In medicine, practitioners collect measurements
8 | and information such as blood pressure, age or history for diagnosing the
9 | condition of incoming patients. Similarly, in chemistry, compounds are analyzed
10 | using mass spectrometry measurements in order to determine whether they contain
11 | a given type of molecule or atom. In all of these cases, the goal is the
12 | prediction of a response variable based on a set of observed predictor
13 | variables.
14 |
15 | For centuries, scientists have addressed such problems by deriving theoretical
16 | frameworks from first principles or have accumulated knowledge in order to
17 | model, analyze and understand the pheno\-menon under study. For example,
18 | practitioners know from past experience that elderly heart attack patients with
19 | low blood pressure are generally at high risk. Similarly, meteorologists know from
20 | elementary climate models that one hot, high-pollution day is likely to be
21 | followed by another. For an increasing number of problems, however, standard
22 | approaches are starting to show their limits. For example, identifying the genetic
23 | risk factors for heart disease, where knowledge is still very sparse, is nearly
24 | beyond the cognitive abilities of humans, given the high complexity and
25 | intricacy of the interactions that exist between genes. Likewise, for very
26 | fine-grained meteorological forecasts, a large number of variables need to be taken
27 | into account, which quickly exceeds the capability of experts to put them
28 | all into a system of equations. To break this cognitive barrier and further
29 | advance science, machines of increasing speed and capacity have been designed and
30 | built since the mid-twentieth century to assist humans in their
31 | calculations. Remarkably, alongside this progress in hardware,
32 | developments in theoretical computer science, artificial intelligence and
33 | statistics have turned machines into more than mere calculators. Recent advances
34 | have made them experts of their own kind, capable of learning from data and of
35 | uncovering by themselves the predictive structure of problems.
Techniques and
36 | algorithms that have stemmed from the field of {\it machine learning} have
37 | indeed become a powerful tool for the analysis of large and complex data,
38 | successfully assisting scientists in numerous breakthroughs across various fields
39 | of science and technology. Famous public examples include the use of
40 | boosted decision trees in the statistical analysis that led to the detection of
41 | the Higgs boson at CERN~\citep{chatrchyan:2012}, the use of random forests for
42 | human pose detection in the Microsoft Kinect~\citep{criminisi:2013} and the
43 | implementation of various machine learning techniques for building the IBM
44 | Watson system~\citep{ferrucci:2010}, capable of competing at the level of human
45 | champions on the American TV quiz show Jeopardy.
46 |
47 | Formally, machine learning can be defined as the study of systems that can
48 | learn from data without being explicitly programmed. According to
49 | \citet{mitchell:1997}, a computer program is said to learn from data, with
50 | respect to some class of tasks and performance measure, if its performance at
51 | those tasks improves with data. In particular, machine learning provides
52 | algorithms that are able to solve classification or regression tasks, hence
53 | now bringing automated procedures for the prediction of a phenomenon based on
54 | past observations. However, the goal of machine learning is not only to produce
55 | algorithms making accurate predictions; it is also to provide insights into the
56 | predictive structure of the data~\citep{breiman:1984}. If we are aiming at the
57 | latter, then our goal is to understand what variables or interactions of
58 | variables drive the phenomenon. For practitioners who are not experts in
59 | machine learning, interpretability is indeed often as important as prediction
60 | accuracy. It allows for a better understanding of the phenomenon under study, a
61 | finer exploration of the data and an easier appropriation of the results.
62 | By contrast, when an algorithm is used as a black box, yielding results
63 | seemingly out of nowhere, its results may be difficult to trust or accept if it
64 | cannot be understood how and why the procedure came to them. Unfortunately, the
65 | current state of the art in machine learning often makes it difficult for
66 | non-experts to understand and interpret the results of an algorithm. While
67 | considerable efforts have been put into improving their prediction accuracy, it is
68 | still not clearly understood what makes machine learning algorithms truly work,
69 | and under what assumptions. Likewise, few of them actually provide clear and
70 | insightful explanations about the results they generate.
71 |
72 | In this context, the goal of this thesis is to provide a comprehensive and
73 | self-contained analysis of a class of algorithms known as decision
74 | trees~\citep{breiman:1984} and random forests~\citep{breiman:2001}. While these
75 | methods have proven to be a robust, accurate and successful tool for solving
76 | countless machine learning tasks, including classification, regression,
77 | density estimation, manifold learning or semi-supervised
78 | learning~\citep{criminisi:2013}, there remain many gray areas in their
79 | understanding:
80 | \begin{enumerate}
81 | \item First, the theoretical properties and statistical mechanisms that drive
82 | the algorithm are still not clearly and entirely understood.
Random forests
83 | have indeed evolved from empirical successes rather than from a sound
84 | theory. As such, various parts of the algorithm remain heuristic rather than
85 | theoretically motivated. For example, preliminary
86 | results have proven the consistency of variants ranging from strongly simplified
87 | to very close to the original random forests, but the consistency of the original
88 | algorithm itself remains unproven in a general setting.
89 | \item Second, while the construction process of a single decision tree can
90 | easily be described within half a page, implementing this algorithm properly
91 | and efficiently remains a challenging task involving issues that are easily
92 | overlooked. Unfortunately, implementation details are often omitted in the
93 | scientific literature and can often only be found by diving into
94 | (unevenly documented) existing software implementations. As far as we know,
95 | there is indeed no comprehensive survey covering the implementation details of
96 | random forests, nor of their respective effects in terms of runtime and space
97 | complexity or learning ability.
98 | \item Third, interpreting the resulting model remains a difficult task,
99 | at which even machine learning experts still fail when finely analyzing and
100 | uncovering the precise predictive structure learned by the procedure.
101 | In particular, despite their extensive use in a wide range of applications, little
102 | is still known regarding the variable importance measures computed by random forests.
103 | Empirical evidence suggests that they are appropriate for identifying
104 | relevant variables, but their statistical mechanisms and properties are
105 | still far from being understood.
106 | \end{enumerate}
107 | Throughout this dissertation, our objective is therefore to call into
108 | question each and every part of the random forests methodology, both from a
109 | theoretical and practical point of view. Accordingly, this work aims at
110 | revisiting decision trees and random forests in the hope of shedding new light on
111 | their learning capabilities, inner workings and interpretability.
112 |
113 | \section{Outline and contributions}
114 |
115 | Part~\textsc{\ref{part:1}} of this manuscript is first dedicated to a thorough
116 | treatment of decision trees and forests of randomized trees. We begin in
117 | Chapter~\ref{ch:background} by outlining fundamental concepts of machine
118 | learning, and then proceed in Chapters~\ref{ch:cart} and \ref{ch:forest} with a
119 | comprehensive review of the algorithms at the core of decision trees and random
120 | forests. We discuss the learning capabilities of these models and carefully
121 | study all parts of the algorithm and their complementary effects. In particular,
122 | Chapter~\ref{ch:forest} includes original contributions on the bias-variance
123 | analysis of ensemble methods, highlighting how randomization can help improve
124 | performance. Chapter~\ref{ch:complexity} concludes this first part with an
125 | original space and time complexity analysis of random forests (and their
126 | variants), along with an in-depth discussion of implementation details,
127 | as contributed within the open source Scikit-Learn library.
128 | Overall, Part~\textsc{\ref{part:1}} therefore presents a comprehensive review
129 | of previous work on random forests, including some original contributions
130 | both from a theoretical and practical point of view.
131 |
132 | Part~\textsc{\ref{part:2}} analyzes and discusses the interpretability of
133 | random forests.
In Chapter~\ref{ch:importances}, we study variable importances
134 | as computed with a forest of randomized trees and examine how these scores can be
135 | interpreted in order to reveal the underlying predictive structure learned from
136 | the data. In particular, we derive a theoretical framework within which we prove
137 | theoretical and practical properties of variable importances. In
138 | Chapter~\ref{ch:applications}, we then exploit this framework to further study
139 | variable importances as derived from actual random forests and present
140 | successful applications of variable importance measures.
141 | Part~\textsc{\ref{part:2}} constitutes the main contributions of this
142 | dissertation.
143 |
144 | Finally, Part~\textsc{\ref{part:3}} addresses limitations of random forests in
145 | the context of large datasets. Through extensive experiments, we show in
146 | Chapter~\ref{ch:random-patches} that subsampling strategies provide on-par
147 | performance while simultaneously lowering the memory requirements. This
148 | chapter presents original work.
149 |
150 | \section{Publications}
151 |
152 | This dissertation summarizes several contributions to random forests
153 | algorithms. Publications that have directly stemmed from this work include:
154 |
155 | \begin{itemize}
156 | \item \citep{geurts:2011} \textit{Learning to rank with extremely randomized trees},
157 | Geurts Pierre and Louppe Gilles.
158 | In JMLR: Workshop and Conference Proceedings, volume 14, 2011.
159 |
160 | \item \citep{louppe:2012} \textit{Ensembles on random patches},
161 | Louppe Gilles and Geurts Pierre.
162 | In Machine Learning and Knowledge Discovery in Databases, pages 346--361. Springer, 2012.
163 |
164 | \item \citep{louppe:2013} \textit{Understanding variable importances in forests of randomized trees},
165 | Louppe Gilles, Wehenkel Louis, Sutera Antonio and Geurts Pierre.
166 | In Advances in Neural Information Processing Systems, pages 431--439, 2013.
167 |
168 | \item \citep{buitinck:2013} \textit{API design for machine learning software: experiences from the scikit-learn project},
169 | Buitinck Lars, Louppe Gilles, Blondel Mathieu et al.
170 | In ECML-PKDD 2013 Workshop: Languages for Data Mining and Machine Learning, 2013.
171 |
172 | \item \citep{botta:2014} \textit{Exploiting SNP Correlations within Random Forest for Genome-Wide Association Studies},
173 | Botta Vincent, Louppe Gilles, Geurts Pierre and Wehenkel Louis.
174 | PloS ONE, 9(4):e93379, 2014.
175 |
176 | \end{itemize}
177 |
178 | During the course of this thesis, several fruitful collaborations have also
179 | led to the following publications. These are not discussed within
180 | this dissertation.
181 |
182 | \begin{itemize}
183 |
184 | \item \citep{louppe:2010} \textit{A zealous parallel gradient descent algorithm},
185 | Louppe Gilles and Geurts Pierre.
186 | In Learning on Cores, Clusters and Clouds workshop, NIPS, 2010.
187 |
188 | \item \citep{maree:2014} \textit{A hybrid human-computer approach for large-scale image-based measurements using web services and machine learning},
189 | Mar{\'e}e Rapha{\"e}l, Rollus Loic, Stevens Benjamin et al.
190 | In Proceedings of the IEEE International Symposium on Biomedical Imaging, 2014.
191 |
192 | \item \citep{amy:2014} \textit{Solar Energy Prediction: An International Contest to Initiate Interdisciplinary Research on Compelling Meteorological Problems},
193 | Amy McGovern, David John Gagne II, Lucas Eustaquio et al., 2014.
\textit{Submitted.} 194 | 195 | \item \citep{sutera:2014} \textit{Simple connectome inference from partial correlation statistics in calcium imaging}, 196 | Antonio Sutera, Arnaud Joly, Vincent Francois-Lavet et al., 2014. \textit{Submitted.} 197 | 198 | \end{itemize} 199 | -------------------------------------------------------------------------------- /tex/classicthesis-config.tex: -------------------------------------------------------------------------------- 1 | % **************************************************************************************************** 2 | % classicthesis-config.tex 3 | % formerly known as loadpackages.sty, classicthesis-ldpkg.sty, and classicthesis-preamble.sty 4 | % Use it at the beginning of your ClassicThesis.tex, or as a LaTeX Preamble 5 | % in your ClassicThesis.{tex,lyx} with \input{classicthesis-config} 6 | % **************************************************************************************************** 7 | % If you like the classicthesis, then I would appreciate a postcard. 8 | % My address can be found in the file ClassicThesis.pdf. A collection 9 | % of the postcards I received so far is available online at 10 | % http://postcards.miede.de 11 | % **************************************************************************************************** 12 | 13 | % **************************************************************************************************** 14 | % 1. Configure classicthesis for your needs here, e.g., remove "drafting" below 15 | % in order to deactivate the time-stamp on the pages 16 | % **************************************************************************************************** 17 | \PassOptionsToPackage{eulerchapternumbers,listings,%drafting,% 18 | pdfspacing,eulermath,%floatperchapter,%linedheaders,% 19 | subfig,parts,dottedtoc}{classicthesis} 20 | % ******************************************************************** 21 | % Available options for classicthesis.sty 22 | % (see ClassicThesis.pdf for more information): 23 | % drafting 24 | % parts nochapters linedheaders 25 | % eulerchapternumbers beramono eulermath pdfspacing minionprospacing 26 | % tocaligned dottedtoc manychapters 27 | % listings floatperchapter subfig 28 | % ******************************************************************** 29 | 30 | % ******************************************************************** 31 | % Triggers for this config 32 | % ******************************************************************** 33 | \usepackage{ifthen} 34 | \newboolean{enable-backrefs} % enable backrefs in the bibliography 35 | \setboolean{enable-backrefs}{false} % true false 36 | % **************************************************************************************************** 37 | 38 | 39 | % **************************************************************************************************** 40 | % 2. 
Personal data and user ad-hoc commands 41 | % **************************************************************************************************** 42 | \newcommand{\myTitle}{Understanding Random Forests\xspace} 43 | \newcommand{\mySubtitle}{From Theory to Practice\xspace} 44 | \newcommand{\myDegree}{Doktor-Ingenieur (Dr.-Ing.)\xspace} 45 | \newcommand{\myName}{Gilles Louppe\xspace} 46 | \newcommand{\myProf}{Put name here\xspace} 47 | \newcommand{\myOtherProf}{Put name here\xspace} 48 | \newcommand{\mySupervisor}{Pierre Geurts\xspace} 49 | \newcommand{\myFaculty}{Faculty of Applied Sciences\xspace} 50 | \newcommand{\myDepartment}{Department of EE and CS\xspace} 51 | \newcommand{\myUni}{University of Liege\xspace} 52 | \newcommand{\myLocation}{Liege, Belgium\xspace} 53 | \newcommand{\myTime}{June 2014\xspace} 54 | \newcommand{\myVersion}{version 1.0\xspace} 55 | 56 | % ******************************************************************** 57 | % Setup, finetuning, and useful commands 58 | % ******************************************************************** 59 | \newcounter{dummy} % necessary for correct hyperlinks (to index, bib, etc.) 60 | \newlength{\abcd} % for ab..z string length calculation 61 | \providecommand{\mLyX}{L\kern-.1667em\lower.25em\hbox{Y}\kern-.125emX\@} 62 | \newcommand{\ie}{i.\,e.} 63 | \newcommand{\Ie}{I.\,e.} 64 | \newcommand{\eg}{e.\,g.} 65 | \newcommand{\Eg}{E.\,g.} 66 | % **************************************************************************************************** 67 | 68 | 69 | % **************************************************************************************************** 70 | % 3. Loading some handy packages 71 | % **************************************************************************************************** 72 | % ******************************************************************** 73 | % Packages with options that might require adjustments 74 | % ******************************************************************** 75 | \PassOptionsToPackage{latin9}{inputenc} % latin9 (ISO-8859-9) = latin1+"Euro sign" 76 | \usepackage{inputenc} 77 | 78 | %\PassOptionsToPackage{ngerman,american}{babel} % change this to your language(s) 79 | % Spanish languages need extra options in order to work with this template 80 | %\PassOptionsToPackage{spanish,es-lcroman}{babel} 81 | \usepackage{babel} 82 | 83 | \PassOptionsToPackage{square,authoryear}{natbib} 84 | \usepackage{natbib} 85 | 86 | \PassOptionsToPackage{fleqn}{amsmath} % math environments and more by the AMS 87 | \usepackage{amsmath} 88 | 89 | % ******************************************************************** 90 | % General useful packages 91 | % ******************************************************************** 92 | \PassOptionsToPackage{T1}{fontenc} % T2A for cyrillics 93 | \usepackage{fontenc} 94 | \usepackage{lipsum} 95 | \usepackage{textcomp} % fix warning with missing font shapes 96 | %\usepackage{scrhack} % fix warnings when using KOMA with listings package 97 | \usepackage{xspace} % to get the spacing after macros right 98 | \usepackage{mparhack} % get marginpar right 99 | \usepackage{fixltx2e} % fixes some LaTeX stuff 100 | \PassOptionsToPackage{printonlyused,smaller}{acronym} 101 | \usepackage{acronym} % nice macros for handling all acronyms in the thesis 102 | %\renewcommand*{\acsfont}[1]{\textssc{#1}} % for MinionPro 103 | \renewcommand{\bflabel}[1]{{#1}\hfill} % fix the list of acronyms 104 | % 
**************************************************************************************************** 105 | 106 | 107 | % **************************************************************************************************** 108 | % 4. Setup floats: tables, (sub)figures, and captions 109 | % **************************************************************************************************** 110 | \usepackage{tabularx} % better tables 111 | \setlength{\extrarowheight}{3pt} % increase table row height 112 | \newcommand{\tableheadline}[1]{\multicolumn{1}{c}{\spacedlowsmallcaps{#1}}} 113 | \newcommand{\myfloatalign}{\centering} % to be used with each float for alignment 114 | \usepackage{caption} 115 | \captionsetup{format=hang,font=small} 116 | \usepackage{subfig} 117 | % **************************************************************************************************** 118 | 119 | 120 | % **************************************************************************************************** 121 | % 5. Setup code listings 122 | % **************************************************************************************************** 123 | \usepackage{listings} 124 | %\lstset{emph={trueIndex,root},emphstyle=\color{BlueViolet}}%\underbar} % for special keywords 125 | \lstset{language=[LaTeX]Tex,%C++, 126 | keywordstyle=\color{RoyalBlue},%\bfseries, 127 | basicstyle=\small\ttfamily, 128 | %identifierstyle=\color{NavyBlue}, 129 | commentstyle=\color{Green}\ttfamily, 130 | stringstyle=\rmfamily, 131 | numbers=none,%left,% 132 | numberstyle=\scriptsize,%\tiny 133 | stepnumber=5, 134 | numbersep=8pt, 135 | showstringspaces=false, 136 | breaklines=true, 137 | frameround=ftff, 138 | frame=single, 139 | belowcaptionskip=.75\baselineskip 140 | %frame=L 141 | } 142 | % **************************************************************************************************** 143 | 144 | 145 | % **************************************************************************************************** 146 | % 6. PDFLaTeX, hyperreferences and citation backreferences 147 | % **************************************************************************************************** 148 | % ******************************************************************** 149 | % Using PDFLaTeX 150 | % ******************************************************************** 151 | \PassOptionsToPackage{pdftex,hyperfootnotes=true,pdfpagelabels}{hyperref} 152 | \usepackage{hyperref} % backref linktocpage pagebackref 153 | \pdfcompresslevel=9 154 | \pdfadjustspacing=1 155 | \PassOptionsToPackage{pdftex}{graphicx} 156 | \usepackage{graphicx} 157 | 158 | % ******************************************************************** 159 | % Setup the style of the backrefs from the bibliography 160 | % (translate the options to any language you use) 161 | % ******************************************************************** 162 | \newcommand{\backrefnotcitedstring}{\relax}%(Not cited.) 
163 | \newcommand{\backrefcitedsinglestring}[1]{(Cited on page~#1.)} 164 | \newcommand{\backrefcitedmultistring}[1]{(Cited on pages~#1.)} 165 | \ifthenelse{\boolean{enable-backrefs}}% 166 | {% 167 | \PassOptionsToPackage{hyperpageref}{backref} 168 | \usepackage{backref} % to be loaded after hyperref package 169 | \renewcommand{\backreftwosep}{ and~} % separate 2 pages 170 | \renewcommand{\backreflastsep}{, and~} % separate last of longer list 171 | \renewcommand*{\backref}[1]{} % disable standard 172 | \renewcommand*{\backrefalt}[4]{% detailed backref 173 | \ifcase #1 % 174 | \backrefnotcitedstring% 175 | \or% 176 | \backrefcitedsinglestring{#2}% 177 | \else% 178 | \backrefcitedmultistring{#2}% 179 | \fi}% 180 | }{\relax} 181 | 182 | % ******************************************************************** 183 | % Hyperreferences 184 | % ******************************************************************** 185 | \hypersetup{% 186 | %draft, % = no hyperlinking at all (useful in b/w printouts) 187 | colorlinks=true, linktocpage=true, pdfstartpage=3, pdfstartview=FitV,% 188 | % uncomment the following line if you want to have black links (e.g., for printing) 189 | %colorlinks=false, linktocpage=false, pdfborder={0 0 0}, pdfstartpage=3, pdfstartview=FitV,% 190 | breaklinks=true, pdfpagemode=UseNone, pageanchor=true, pdfpagemode=UseOutlines,% 191 | plainpages=false, bookmarksnumbered, bookmarksopen=true, bookmarksopenlevel=1,% 192 | hypertexnames=true, pdfhighlight=/O,%nesting=true,%frenchlinks,% 193 | urlcolor=webbrown, linkcolor=RoyalBlue, citecolor=webgreen, %pagecolor=RoyalBlue,% 194 | %urlcolor=Black, linkcolor=Black, citecolor=Black, %pagecolor=Black,% 195 | pdftitle={\myTitle},% 196 | pdfauthor={\textcopyright\ \myName, \myUni, \myFaculty},% 197 | pdfsubject={},% 198 | pdfkeywords={},% 199 | pdfcreator={pdfLaTeX},% 200 | pdfproducer={LaTeX with hyperref and classicthesis}% 201 | } 202 | 203 | % ******************************************************************** 204 | % Setup autoreferences 205 | % ******************************************************************** 206 | % There are some issues regarding autorefnames 207 | % http://www.ureader.de/msg/136221647.aspx 208 | % http://www.tex.ac.uk/cgi-bin/texfaq2html?label=latexwords 209 | % you have to redefine the makros for the 210 | % language you use, e.g., american, ngerman 211 | % (as chosen when loading babel/AtBeginDocument) 212 | % ******************************************************************** 213 | \makeatletter 214 | \@ifpackageloaded{babel}% 215 | {% 216 | \addto\extrasamerican{% 217 | \renewcommand*{\figureautorefname}{Figure}% 218 | \renewcommand*{\tableautorefname}{Table}% 219 | \renewcommand*{\partautorefname}{Part}% 220 | \renewcommand*{\chapterautorefname}{Chapter}% 221 | \renewcommand*{\sectionautorefname}{Section}% 222 | \renewcommand*{\subsectionautorefname}{Section}% 223 | \renewcommand*{\subsubsectionautorefname}{Section}% 224 | }% 225 | \addto\extrasngerman{% 226 | \renewcommand*{\paragraphautorefname}{Absatz}% 227 | \renewcommand*{\subparagraphautorefname}{Unterabsatz}% 228 | \renewcommand*{\footnoteautorefname}{Fu\"snote}% 229 | \renewcommand*{\FancyVerbLineautorefname}{Zeile}% 230 | \renewcommand*{\theoremautorefname}{Theorem}% 231 | \renewcommand*{\appendixautorefname}{Anhang}% 232 | \renewcommand*{\equationautorefname}{Gleichung}% 233 | \renewcommand*{\itemautorefname}{Punkt}% 234 | }% 235 | % Fix to getting autorefs for subfigures right (thanks to Belinda Vogt for changing the definition) 236 | 
\providecommand{\subfigureautorefname}{\figureautorefname}% 237 | }{\relax} 238 | \makeatother 239 | 240 | 241 | % **************************************************************************************************** 242 | % 7. Last calls before the bar closes 243 | % **************************************************************************************************** 244 | % ******************************************************************** 245 | % Development Stuff 246 | % ******************************************************************** 247 | \listfiles 248 | %\PassOptionsToPackage{l2tabu,orthodox,abort}{nag} 249 | % \usepackage{nag} 250 | %\PassOptionsToPackage{warning, all}{onlyamsmath} 251 | % \usepackage{onlyamsmath} 252 | 253 | % ******************************************************************** 254 | % Last, but not least... 255 | % ******************************************************************** 256 | \usepackage{classicthesis} 257 | % **************************************************************************************************** 258 | 259 | 260 | % **************************************************************************************************** 261 | % 8. Further adjustments (experimental) 262 | % **************************************************************************************************** 263 | % ******************************************************************** 264 | % Changing the text area 265 | % ******************************************************************** 266 | %\linespread{1.05} % a bit more for Palatino 267 | %\areaset[current]{312pt}{761pt} % 686 (factor 2.2) + 33 head + 42 head \the\footskip 268 | %\setlength{\marginparwidth}{7em}% 269 | %\setlength{\marginparsep}{2em}% 270 | 271 | % ******************************************************************** 272 | % Using different fonts 273 | % ******************************************************************** 274 | %\usepackage[oldstylenums]{kpfonts} % oldstyle notextcomp 275 | %\usepackage[osf]{libertine} 276 | %\usepackage{hfoldsty} % Computer Modern with osf 277 | %\usepackage[light,condensed,math]{iwona} 278 | %\renewcommand{\sfdefault}{iwona} 279 | %\usepackage{lmodern} % <-- no osf support :-( 280 | % \usepackage[T1]{fontenc} 281 | % \usepackage{textcomp} 282 | %\usepackage[urw-garamond]{mathdesign} <-- no osf support :-( 283 | % **************************************************************************************************** 284 | --------------------------------------------------------------------------------