is one of"
44 | @echo " pdf to make LaTeX files and run them through pdflatex"
45 | @echo " html to make standalone HTML files"
46 | @echo " exe        to run jupyter notebooks, except those in deep_learning, which require a GPU."
47 | @echo " clean rm BUILDDIR, auto_gallery, rst files"
48 | @echo " cleanall rm BUILDDIR, auto_gallery, rst files and clear output of notebooks"
49 | @echo " dirhtml to make HTML files named index.html in directories"
50 | @echo " singlehtml to make a single large HTML file"
51 | @echo " epub to make an epub"
52 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
53 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
54 | @echo " text to make text files"
55 | @echo " changes to make an overview of all changed/added/deprecated items"
56 | @echo " linkcheck to check all external links for integrity"
57 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
58 | @echo " coverage to run coverage check of the documentation (if enabled)"
59 |
60 | # Rule to convert notebook to rst
61 | #.ipynb.rst:
62 | %.rst : %.ipynb
63 | jupyter nbconvert --to rst $<
64 | mv $@ $@.filtered
65 | cat $@.filtered|bin/filter_fix_rst.py > $@
66 | rm -f $@.filtered
67 |
68 | # jupyter nbconvert --to rst --stdout $< | bin/filter_fix_rst.py > $@
69 | # jupyter nbconvert --to rst $< --output $@
70 |
71 | debug:
72 | @echo $(RST)
73 |
74 |
75 | rst: $(RST)
76 |
77 | clean:
78 | rm -rf $(BUILDDIR)/*
79 | rm -rf auto_gallery/
80 | rm -f $(RST)
81 | rm -rf $(NTBOOK_FILES)
82 |
83 | cleanall:
84 | rm -rf $(BUILDDIR)/*
85 | rm -rf auto_gallery/
86 | rm -f $(RST)
87 | rm -rf $(NTBOOK_FILES)
88 | for nb in $(NTBOOK) ; do jupyter nbconvert --clear-output $$nb; done
89 |
90 | exe:
91 | @echo "Execute notebooks"
92 | for nb in $(NTBOOK_TO_EXE) ; do jupyter nbconvert --to notebook --execute $$nb --output $$(basename $$nb); done
93 | # $(EXEIPYNB) $(NTBOOK)
94 | # @echo toto nbconvert --to notebook --execute $< --output $(basename $<)
95 |
96 | html: rst
97 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
98 | @echo
99 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
100 |
101 | dirhtml: rst
102 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
103 | @echo
104 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
105 |
106 | singlehtml: rst
107 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
108 | @echo
109 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
110 |
111 | docx: rst
112 | $(SPHINXBUILD) -b docx $(ALLSPHINXOPTS) $(BUILDDIR)/docx
113 | @echo
114 | @echo "Build finished. The docx page is in $(BUILDDIR)/docx."
115 |
116 | epub: rst
117 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
118 | @echo
119 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
120 |
121 | latex: rst
122 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
123 | @echo
124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
126 | "(use \`make latexpdf' here to do that automatically)."
127 |
128 | latexpdf: rst
129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
130 | @echo "Running LaTeX files through pdflatex..."
131 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
132 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
133 | cp build/latex/StatisticsMachineLearningPython.pdf StatisticsMachineLearningPython.pdf
134 |
135 | pdf: latexpdf
136 |
137 | text: rst
138 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
139 | @echo
140 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
141 |
142 | changes: rst
143 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
144 | @echo
145 | @echo "The overview file is in $(BUILDDIR)/changes."
146 |
147 | linkcheck: rst
148 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
149 | @echo
150 | @echo "Link check complete; look for any errors in the above output " \
151 | "or in $(BUILDDIR)/linkcheck/output.txt."
152 |
153 |
--------------------------------------------------------------------------------
/info.rst:
--------------------------------------------------------------------------------
1 | gh-pages
2 | --------
3 |
4 | TODO: do it with: circleci
5 |
6 | - https://circleci.com/blog/deploying-documentation-to-github-pages-with-continuous-integration/
7 | - https://github.com/jklukas/docs-on-gh-pages
8 |
9 |
10 | Publishing sphinx-generated docs on github:
11 |
12 | https://daler.github.io/sphinxdoc-test/includeme.html
13 |
14 |
15 |
16 | Upload to github
17 | ----------------
18 |
19 |
20 | "$WD/build/html" contains the pystsamsl website. Now we start to upload to github server. Clone from github to a temporary directory, and checkout gh-pages branch
21 |
22 | First time
23 | ```
24 | WD=~/git/pystatsml
25 | cd ~/git
26 | mv pystatsml_gh-pages pystatsml_gh-pages.bak
27 | git clone git@github.com:duchesnay/pystatsml.git pystatsml_gh-pages
28 | git symbolic-ref HEAD refs/heads/gh-pages
29 | rm .git/index
30 | git clean -fdx
31 | cp -r $WD/build/html/* ./
32 | cp -r $WD/auto_gallery ./
33 | git add .
34 | git add -f auto_gallery
35 | git add -f _sources
36 | git add -f _static
37 | git add -f _images
38 | touch .nojekyll
39 | gedit index.html # see below
40 | git commit -am "gh-pages First commit"
41 | git push origin gh-pages
42 | firefox index.html
43 | ```
44 |
45 | Update
46 | ```
47 | WD=~/git/pystatsml
48 | cd $WD
49 | make pdf html singlehtml
50 | cd ~/git/pystatsml_gh-pages
51 | git checkout gh-pages
52 | rsync -avu $WD/build/html/* ./
53 | rsync -avu $WD/auto_gallery ./
54 | git add .
55 | git add -f auto_gallery
56 | git add -f _sources
57 | git add -f _static
58 | git add -f _images
59 | meld index.html index.html.save
60 | #gedit # see below
61 | git commit -am "gh-pages update commit"
62 | git push origin gh-pages
63 | firefox index.html
64 | ```
65 |
66 | Then edit ``index.html``:
67 | ```
68 | gedit index.html
69 | ```
70 |
71 | Replace the placeholder title:
72 | ```
73 | Phantom
74 | ```
75 | by
76 | ```
77 | Statistics and Machine Learning in Python
78 |
79 | Edouard Duchesnay, Tommy Löfstedt, Feki Younes
80 | ```
88 |
89 | Then
90 |
91 | ```
92 | git commit -am "Title and authors"
93 | git push origin gh-pages
94 | firefox $WD/build/html/index.html
95 | ```
96 |
97 | Now, you can visit your updated website at https://duchesnay.github.io/pystatsml.
98 |
99 |
100 | ML Resources
101 | ------------
102 |
103 | - **my_tech_resources**
104 | https://github.com/JamesLavin/my_tech_resources
105 |
106 | - **Practical Machine Learning Course Notes (in R)**
107 | https://sux13.github.io/DataScienceSpCourseNotes/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html
108 |
109 | - **Computational Statistics in Python**
110 | https://people.duke.edu/~ccc14/sta-663/index.html
111 |
112 | - **scipy-lectures**
113 |
114 | https://github.com/scipy-lectures/scipy-lecture-notes
115 |
116 | - **Scientific Python & Software engineering best practices**
117 | https://github.com/paris-saclay-cds/python-workshop
118 |
119 | - **Deep Learning course in python**
120 | https://github.com/m2dsupsdlclass/lectures-labs
121 |
122 | - **Others**
123 | https://github.com/justmarkham/DAT4
124 |
125 | http://statweb.stanford.edu/~jtaylo/courses/stats202/index.html
126 |
127 | http://www.dataschool.io/
128 |
129 | https://onlinecourses.science.psu.edu/stat857/node/141
130 |
131 | https://github.com/rasbt/python-machine-learning-book
132 |
133 | https://onlinecourses.science.psu.edu/stat505/
134 |
135 | http://www.kdnuggets.com/2016/04/top-10-ipython-nb-tutorials.html
136 |
137 |
138 | Jupyter Notebooks
139 | -----------------
140 |
141 | https://jupyterbook.org/advanced/advanced.html#jupyter-cell-tags
142 |
143 |
144 | Markdown
145 | --------
146 | http://daringfireball.net/projects/markdown/basics
147 |
148 | R with Jupyter
149 | ~~~~~~~~~~~~~~~
150 |
151 | conda install -c r r-essentials
152 |
153 | Sphinx
154 | ------
155 |
156 | http://sphinx-doc.org/
157 |
158 | IPython notebooks + Sphinx
159 | --------------------------
160 |
161 | http://sphinx-ipynb.readthedocs.org/en/latest/howto.html
162 |
163 |
164 | nbsphinx: Jupyter Notebook Tools for Sphinx
165 |
166 | https://nbsphinx.readthedocs.io/en/0.3.3/
167 |
168 | nbsphinx is a Sphinx extension that provides a source parser for *.ipynb files. Custom Sphinx directives are used to show Jupyter Notebook code cells (and of course their results) in both HTML and LaTeX output. Un-evaluated notebooks – i.e. notebooks without stored output cells – will be automatically executed during the Sphinx build process.
169 |
170 | conda install -c conda-forge nbsphinx
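
To enable it, add the extension in ``conf.py`` (minimal sketch)::

    # conf.py
    extensions = [
        'nbsphinx',
        # ... other extensions
    ]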
171 |
172 | sphinx-gallery
173 | --------------
174 |
175 | https://sphinx-gallery.readthedocs.io/en/latest/
176 |
177 | ``pip install sphinx-gallery``
178 |
179 | http://www.scipy-lectures.org
180 |
181 | https://github.com/scipy-lectures/scipy-lecture-notes
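
Minimal ``conf.py`` setup sketch (the directory names here are placeholders, not necessarily the ones used in this repository)::

    extensions = ['sphinx_gallery.gen_gallery']
    sphinx_gallery_conf = {
        'examples_dirs': 'examples',     # scripts to convert
        'gallery_dirs': 'auto_gallery',  # output directory
    }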
182 |
183 | strip jupyter output before submission
184 | --------------------------------------
185 |
186 | https://github.com/kynan/nbstripout
187 |
188 | ``conda install -c conda-forge nbstripout``
189 |
190 | Set up the git filter and attributes as described in the manual installation instructions below:
191 |
192 | ``cd pystatsml``
193 | ``nbstripout --install``
194 |
195 |
196 | rst
197 | ---
198 |
199 | http://docutils.sourceforge.net/rst.html
200 | http://docutils.sourceforge.net/docs/ref/rst/
201 |
202 |
203 |
204 | R vs Python
205 | -----------
206 |
207 | https://www.datacamp.com/community/tutorials/r-or-python-for-data-analysis
208 | http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html
209 |
210 | Mail to share the course
211 | ------------------------
212 |
213 | Please find below the link to my Machine Learning course in Python; it is a draft version:
214 | ftp://ftp.cea.fr//pub/unati/people/educhesnay/pystatml/StatisticsMachineLearningPython.pdf
215 |
216 | Below is the link to GitHub:
217 | https://github.com/duchesnay/pystatsml
218 |
219 |
220 | git clone https://github.com/duchesnay/pystatsml.git
221 |
222 |
223 | Basically, it uses Jupyter notebooks and pure Python; everything is converted to rst and assembled into html or pdf using Sphinx.
224 |
225 | It is a draft version, not finished yet, with many spelling mistakes.
226 |
227 | Please fork and submit pull requests if you are willing to contribute.
228 |
229 |
230 |
231 |
--------------------------------------------------------------------------------
/utils/ml_non_linear_prediction.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Mar 31 09:54:25 2016
4 |
5 | @author: edouard.duchesnay@cea.fr
6 | """
7 |
8 | '''
9 | SVM & Kernel methods
10 | ====================
11 | '''
12 | import numpy as np
13 | from numpy.linalg import norm
14 |
15 | from mpl_toolkits.mplot3d import Axes3D
16 | import matplotlib.pyplot as plt
17 | import sklearn.metrics as metrics
18 | #%matplotlib inline
19 | #%matplotlib qt
20 |
21 |
22 |
23 | class KernDensity:
24 | def __init__(self, sigma=1):
25 | self.sigma = sigma
26 |
27 | def fit(self, X, y, alphas=None):
28 | self.X = X
29 | self.y = y
30 | if alphas is None:
31 | alphas = np.ones(X.shape[0])
32 | self.alphas = alphas
33 |
34 | def predict(self, X):
35 | y_pred = np.zeros((X.shape[0]))
36 | for j, x in enumerate(X):
37 | for i in range(self.X.shape[0]):
38 | #print(j, i, x)
39 | y_pred[j] += self.alphas[i] * self.y[i] * np.exp( - (norm(self.X[i, :] - x) ** 2) / (2 * self.sigma ** 2))
40 | return(y_pred)
41 |
42 |
43 | ## Plot 3D
44 | def plot3d(coord_x, coord_y, coord_z, points, y, zlim=None, ax=None, fig=None, xylabelsize=33):
45 | # Plot
46 | from matplotlib import cm
47 | if fig is None:
48 | fig = plt.figure()
49 | if ax is None:
50 | ax = fig.add_subplot(111, projection='3d')
51 | z_min = np.min(coord_z) - np.max(coord_z) * 2
52 | ax.plot_surface(coord_x, coord_y, coord_z, rstride=2, cstride=2,
53 | #vmin=Z.min(), vmax=Z.max(),
54 | cmap=cm.coolwarm,
55 | linewidth=1, antialiased=True)
56 | cset = ax.contourf(coord_x, coord_y, coord_z, zdir='z', offset=z_min-10,
57 | cmap=cm.coolwarm)
58 | argmin = coord_x.ravel()[coord_z.argmin()], coord_y.ravel()[coord_z.argmin()]
59 | print("argmin", argmin)
60 | # add point and cross at defined point
61 | colors = {-1:'b', 1:'r'}
62 | for lev in np.unique(y):
63 | pts = points[y==lev, :]
64 | ax.plot(pts[:, 0], pts[:, 1], 'o', color=colors[lev], zs=[z_min]*pts.shape[0], ms=10)
65 | ax.set_xlabel(r'$x^0$', size=xylabelsize)
66 | ax.set_ylabel(r'$x^1$', size=xylabelsize)
67 | #ax.set_zlabel(r'$Kernel density$', size=xylabelsize)
68 | ax.set_zlim(z_min, np.max(coord_z))
69 | return ax, z_min, argmin
70 |
71 |
72 | ## Dataset
73 | ##########
74 |
75 | im = np.array(
76 | [[ 1., 1., 1., 1., 0., 0., 0., 0.],
77 | [ 1., 1., 1., 1., 0., 0., 0., 0.],
78 | [ 1., 1., 1., 1., 0., 0., 0., 0.],
79 | [ 1., 1., 1., 1., 1., 0., 0., 0.],
80 | [ 0., 0., 0., 1., 1., 1., 1., 1.],
81 | [ 0., 0., 0., 0., 1., 1., 1., 1.],
82 | [ 0., 0., 0., 0., 1., 1., 1., 1.],
83 | [ 0., 0., 0., 0., 1., 1., 1., 1.]])
84 |
85 | x0, y0 = np.where(im == 0)
86 | x1, y1 = np.where(im == 1)
87 |
88 | X = np.column_stack([
89 | np.concatenate([x0, x1]),
90 | np.concatenate([y0, y1])])
91 | y = np.array([-1] * len(x0) + [1] * len(x1))
92 |
93 | xmin, xmax, ymin, ymax = 0, im.shape[0]-1, 0, im.shape[1]-1
94 | coord_x, coord_y = np.mgrid[xmin:xmax:50j, ymin:ymax:50j]
95 | XX = np.column_stack([coord_x.ravel(), coord_y.ravel()])
96 |
97 |
98 | # Kernel mapping
99 | ################
100 |
101 | kde = KernDensity(sigma=.2)
102 | kde.fit(X, y)
103 | y_pred_kde = kde.predict(XX)
104 | coord_z_kde = y_pred_kde.reshape(coord_x.shape)
105 | points = X
106 |
107 | # View 2D
108 | if False:
109 | plt.imshow(np.rot90(coord_z_kde), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto')
110 | plt.plot(X[y==1, 0], X[y==1, 1], 'o', color='r')#, zs=[z_min], ms=20)
111 | plt.plot(X[y==-1, 0], X[y==-1, 1], 'o', color='b')#, zs=[z_min], ms=20)
112 |
113 |
114 | fig = plt.figure(figsize=(30, 15))
115 |
116 | ax=fig.add_subplot(121, projection='3d')
117 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_kde, points=X, y=y, ax=ax, fig=fig)
118 | plt.title(r'$x \rightarrow K(x_i, x) = \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)$', size=33)
119 | # set camera to fixed point of view
120 | print(ax.azim, ax.elev, ax.dist)
121 | #(-152.49214958606902, 21.717791411042867, 10)
122 | #ax.view_init(azim=-152, elev=21) #Reproduce view
123 | #ax.view_init(azim=-14.1935483871, elev=29.6875, dist=10)
124 |
125 | # SV
126 | #####
127 |
128 | from sklearn.svm import SVC
129 | #1.0 / X.shape[1] 0.5
130 | #(1/(2 *.2)) : 2.5
131 | clf = SVC(kernel='rbf')#, gamma=1)
132 | clf.fit(X, y)
133 | clf.support_vectors_.shape
134 |
135 | print(clf.support_.shape)
136 |
137 | np.all(X[clf.support_,:] == clf.support_vectors_)
138 |
139 | Xsv = clf.support_vectors_
140 | y_sv = y[clf.support_]
141 |
142 | y_pred_svm = clf.predict(XX)
143 | #self = KernDensity(sigma=.2)
144 | #self.fit(X, y)
145 | #y_pred = self.predict(XX)
146 | coord_z_svm = y_pred_svm.reshape(coord_x.shape)
147 |
148 | # View 2D
149 | if False:
150 | plt.imshow(np.rot90(coord_z_svm), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto')
151 | plt.plot(Xsv[y_sv==1, 0], Xsv[y_sv==1, 1], 'o', color='r')#, zs=[z_min], ms=20)
152 | plt.plot(Xsv[y_sv==-1, 0], Xsv[y_sv==-1, 1], 'o', color='b')#, zs=[z_min], ms=20)
153 |
154 |
155 |
156 | #fig = plt.figure(figsize=(15, 15))
157 | ax=fig.add_subplot(122, projection='3d')
158 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_svm, points=Xsv, y=y_sv, ax=ax, fig=fig)
159 | plt.title(r'$f(x) = sign \left(\sum_{i \in SV}\alpha_i y_i \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)\right)$', size=33)
160 | # set camera to fixed point of view
161 | #ax.azim, ax.elev, ax.dist
162 | #(-152.49214958606902, 21.717791411042867, 10)
163 | #ax.view_init(azim=-152, elev=21) #Reproduce view
164 |
165 | ############
166 |
167 | import numpy as np
168 | from sklearn.svm import SVC
169 | from sklearn import datasets
170 | import matplotlib.pyplot as plt
171 |
172 | # dataset
173 | X, y = datasets.make_classification(n_samples=10, n_features=2,n_redundant=0,
174 | n_classes=2,
175 | random_state=1,
176 | shuffle=False)
177 | clf = SVC(kernel='rbf')#, gamma=1)
178 | clf.fit(X, y)
179 | print("#Errors: %i" % np.sum(y != clf.predict(X)))
180 |
181 | clf.decision_function(X)
182 |
183 | # Useful internals:
184 | # Array of support vectors
185 | clf.support_vectors_
186 |
187 | # indices of support vectors within original X
188 | np.all(X[clf.support_,:] == clf.support_vectors_)
189 |
190 |
191 | ########################
192 |
193 |
194 | from sklearn.ensemble import RandomForestClassifier
195 |
196 | forest = RandomForestClassifier(n_estimators = 100)
197 | forest.fit(X, y)
198 |
199 | print("#Errors: %i" % np.sum(y != forest.predict(X)))
200 |
201 |
202 |
--------------------------------------------------------------------------------
/python_lang/python_lang_solutions.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jan 16 10:03:29 2016
4 |
5 | @author: edouard.duchesnay@gmail.com
6 | """
7 |
8 | ###############################################################################
9 | # Exercise 1: functions
10 | # ~~~~~~~~~~~~~~~~~~~~~
11 | #
12 | # Create a function that acts as a simple calculator. If the operation is
13 | # not specified, default to addition. If the operation is misspecified,
14 | # return a prompt message. Ex: ``calc(4, 5, "multiply")`` returns 20,
15 | # ``calc(3, 5)`` returns 8, ``calc(1, 2, "something")`` returns an error
16 | # message.
17 | #
18 |
19 | def calc(a, b, op='add'):
20 | if op == 'add':
21 | return a + b
22 | elif op == 'sub':
23 | return a - b
24 | else:
25 | print('valid operations are add and sub')
26 |
27 |
28 | # call the function
29 | calc(10, 4, op='add') # returns 14
30 | calc(10, 4, 'add') # also returns 14: unnamed arguments are inferred by position
31 | calc(10, 4) # also returns 14: default for 'op' is 'add'
32 | calc(10, 4, 'sub') # returns 6
33 | calc(10, 4, 'div') # prints 'valid operations are add and sub'
34 |
35 | a, b, op = 2, 3, "+"
36 |
37 |
38 | def calc2(a, b, op='+'):
39 | st = "%.f %s %.f" % (a, op, b)
40 | return eval(st)
41 |
42 |
43 | calc2(3, 3, "+")
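
# An added sketch covering the "multiply" case mentioned in the exercise
# statement (not part of the original solution):
def calc3(a, b, op='add'):
    operations = {'add': a + b, 'sub': a - b, 'multiply': a * b}
    if op not in operations:
        return 'valid operations are add, sub and multiply'
    return operations[op]

calc3(4, 5, 'multiply')  # returns 20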
44 |
45 |
46 | ###############################################################################
47 | # Exercise 2: functions + list + loop
48 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
49 | #
50 | # Given a list of numbers, return a list where all adjacent duplicate
51 | # elements have been reduced to a single element. Ex: ``[1, 2, 2, 3, 2]``
52 | # returns ``[1, 2, 3, 2]``. You may create a new list or modify the passed
53 | # in list.
54 | #
55 | # Remove all duplicate values (adjacent or not) Ex: ``[1, 2, 2, 3, 2]``
56 | # returns ``[1, 2, 3]``
57 | #
58 |
59 |
60 | def remove_adjacent_duplicates(original_list):
61 | new_list = []
62 | new_list.append(original_list[0])
63 | for num in original_list[1:]:
64 | if num != new_list[-1]:
65 | new_list.append(num)
66 | return new_list
67 |
68 | remove_adjacent_duplicates([1, 2, 2, 3, 2])
69 |
70 | def remove_duplicates(original_list):
71 | new_list = []
72 | for num in original_list:
73 | if num not in new_list:
74 | new_list.append(num)
75 | return new_list
76 |
77 | remove_duplicates([3, 2, 2, 1, 2])
78 |
79 | # or this solution, which might modify the order
80 |
81 | def remove_duplicates(original_list):
82 | return(list(set(original_list)))
83 |
84 | remove_duplicates([3, 2, 2, 1, 2])
85 |
86 |
87 | ###############################################################################
88 | # Exercise 3: File I/O
89 | # ~~~~~~~~~~~~~~~~~~~~
90 | #
91 | # 1. Copy/paste the BSD 4-clause license (https://en.wikipedia.org/wiki/BSD_licenses)
92 | # into a text file. Read the file and count the occurrences of each
93 | # word within the file. Store each word's occurrence count in a dictionary.
94 | #
95 | # 2. Write an executable python command ``count_words.py`` that parses
96 | # a list of input files provided after the ``--input`` parameter.
97 | # The dictionary of occurrences is saved in a csv file provided by ``--output``,
98 | # with default value word_count.csv.
99 | # Use:
100 | # - open
101 | # - regular expressions
102 | # - argparse (https://docs.python.org/3/howto/argparse.html)
103 |
104 |
105 | bsd_4clause = """
106 | Copyright (c) ,
107 | All rights reserved.
108 |
109 | Redistribution and use in source and binary forms, with or without
110 | modification, are permitted provided that the following conditions are met:
111 | 1. Redistributions of source code must retain the above copyright
112 | notice, this list of conditions and the following disclaimer.
113 | 2. Redistributions in binary form must reproduce the above copyright
114 | notice, this list of conditions and the following disclaimer in the
115 | documentation and/or other materials provided with the distribution.
116 | 3. All advertising materials mentioning features or use of this software
117 | must display the following acknowledgement:
118 | This product includes software developed by the .
119 | 4. Neither the name of the nor the
120 | names of its contributors may be used to endorse or promote products
121 | derived from this software without specific prior written permission.
122 |
123 | THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY
124 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
125 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
126 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
127 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
128 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
129 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
130 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
131 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
132 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
133 | """
134 |
135 | import os
136 | import tempfile
137 |
138 | tmpfilename = os.path.join(tempfile.gettempdir(),
139 | "bsd.txt")
140 |
141 | fd = open(tmpfilename, "w")
142 | fd.write(bsd_4clause)
143 | fd.close()
144 |
145 | fd = open(tmpfilename, "r")
146 |
147 | count = dict()
148 | for line in fd:
149 | line = line.lower()
150 | for word in line.split():
151 | if not word in count:
152 | count[word] = 1
153 | else:
154 | count[word] += 1
155 |
156 | print(count)
157 |
158 | """
159 | Kept as a comment to deal with the missing urllib2 import (urllib2 is Python 2 only)
160 |
161 | import urllib2
162 | url = "https://www.gnu.org/licenses/gpl-3.0.txt"
163 | f = urllib2.urlopen(url)
164 | content = f.read()
165 | f.close()
166 | content = content.replace("\n", " ")
167 | content = content.lower()
168 | c = content.split(' ')
169 | print(len(c))
170 | from collections import Counter
171 | print(Counter(c))
172 | """
173 |
174 | ###############################################################################
175 | # Exercise 4: OOP
176 | # ~~~~~~~~~~~~~~~
177 | #
178 | # 1. Create a class ``Employee`` with 2 attributes provided in the
179 | # constructor: ``name`` and ``years_of_service``, and one method
180 | # ``salary`` which returns ``1500 + 100 * years_of_service``.
181 | #
182 | # 2. Create a subclass ``Manager`` which redefines the ``salary`` method
183 | # as ``2500 + 120 * years_of_service``.
184 | #
185 | # 3. Create a small dictionary database where the key is the
186 | # employee's name. Populate the database with: samples =
187 | # Employee('lucy', 3), Employee('john', 1), Manager('julie', 10),
188 | # Manager('paul', 3)
189 | #
190 | # 4. Return a table of (name, salary) rows, i.e. a list of lists [[name,
191 | # salary]]
192 | #
193 | # 5. Compute the average salary
194 |
195 | import pandas as pd
196 |
197 |
198 | class Employee:
199 | def __init__(self, name, years_of_service):
200 | self.name = name
201 | self.years_of_service = years_of_service
202 |
203 | def salary(self):
204 | return 1500 + 100 * self.years_of_service
205 |
206 |
207 | class Manager(Employee):
208 | def salary(self):
209 | return 2500 + 120 * self.years_of_service
210 |
211 |
212 | samples = [Employee("lucy", 3),
213 |            Employee("john", 1),
214 |            Manager('julie', 10),
215 |            Manager('paul', 3)]
216 |
217 | employees = {e.name: e for e in samples}
218 |
219 | employees.keys()
220 |
221 | df = pd.DataFrame([[name, obj.salary()] for name, obj in employees.items()],
222 | columns=['name', 'salary'])
223 |
224 | [[name, employees[name].salary()] for name
225 | in employees]
226 |
227 | sum([e.salary() for e in employees.values()]) / len(employees)
228 |
--------------------------------------------------------------------------------
/introduction/python_ecosystem.rst:
--------------------------------------------------------------------------------
1 | Python ecosystem for data-science
2 | ---------------------------------
3 |
4 | .. RST https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html
5 |
6 | .. image:: images/python_ecosystem.png
7 | :scale: 100
8 | :align: center
9 |
10 | Python language
11 | ~~~~~~~~~~~~~~~
12 |
13 | - Interpreted
14 | - Garbage collected (does not prevent memory leaks)
15 | - Dynamically-typed language (Java is statically typed)
16 |
17 |
18 | Anaconda
19 | ~~~~~~~~
20 |
21 | Anaconda is a Python distribution that ships most of the Python tools and libraries.
22 |
23 | **Installation**
24 |
25 |
26 | 1. Download anaconda (Python 3.x) http://continuum.io/downloads
27 |
28 | 2. Install it, on Linux
29 | ::
30 |
31 | bash Anaconda3-2.4.1-Linux-x86_64.sh
32 |
33 | 3. Add the Anaconda path to your PATH variable in your ``.bashrc`` file:
34 | ::
35 |
36 | export PATH="${HOME}/anaconda3/bin:$PATH"
37 |
38 | **Managing with ``conda``**
39 |
40 |
41 | Update conda package and environment manager to current version
42 |
43 | ::
44 |
45 | conda update conda
46 |
47 |
48 | Install additional packages. These commands install the Qt back-end (fixes a temporary issue when running Spyder)
49 |
50 | ::
51 |
52 | conda install pyqt
53 | conda install PyOpenGL
54 | conda update --all
55 |
56 |
57 | Install seaborn for graphics
58 |
59 | ::
60 |
61 | conda install seaborn
62 | # install a specific version from the anaconda channel
63 | conda install -c anaconda pyqt=4.11.4
64 |
65 | List installed packages
66 |
67 | ::
68 |
69 | conda list
70 |
71 | Search available packages
72 |
73 | ::
74 |
75 | conda search pyqt
76 | conda search scikit-learn
77 |
78 |
79 |
80 | **Environments**
81 |
82 |
83 | - A conda environment is a directory that contains a specific collection of conda packages that you have installed.
84 | - Control the package environment for a specific purpose: collaborating with someone else, delivering an application to your client, etc.
85 | - Switch between environments
86 |
87 | List of all environments
88 |
89 | ::
90 |
90 |     conda info --envs
91 |
92 | 1. Create new environment
93 | 2. Activate
94 | 3. Install new package
95 |
96 | ::
97 |
98 | conda create --name test
99 | # Or
100 | conda env create -f environment.yml
101 | source activate test
102 | conda info --envs
103 | conda list
104 | conda search -f numpy
105 | conda install numpy
106 |
107 | **Miniconda**
108 |
109 | Anaconda without the collection of (>700) packages.
110 | With Miniconda you download only the packages you want with the conda command: ``conda install PACKAGENAME``
111 |
112 |
113 |
114 | 1. Download Miniconda (Python 3.x) https://conda.io/miniconda.html
115 |
116 | 2. Install it, on Linux
117 |
118 | ::
119 |
120 | bash Miniconda3-latest-Linux-x86_64.sh
121 |
122 | 3. Add the Miniconda path to your PATH variable in your ``.bashrc`` file:
123 |
124 | ::
125 |
126 | export PATH=${HOME}/miniconda3/bin:$PATH
127 |
128 | 4. Install required packages
129 |
130 | ::
131 |
132 | conda install -y scipy
133 | conda install -y pandas
134 | conda install -y matplotlib
135 | conda install -y statsmodels
136 | conda install -y scikit-learn
137 | conda install -y sqlite
138 | conda install -y spyder
139 | conda install -y jupyter
140 |
141 |
142 | Commands
143 | ~~~~~~~~
144 |
145 | **python**: the Python interpreter. On the dos/unix command line, execute a whole file::
146 |
147 | python file.py
148 |
149 | Interactive mode::
150 |
151 | python
152 |
153 | Quit with ``CTRL-D``
154 |
155 | **ipython**: advanced interactive python interpreter::
156 |
157 | ipython
158 |
159 | Quit with ``CTRL-D``
160 |
161 | **pip**: alternative for package management (``-U`` to update, ``--user`` to install in the user directory):
162 |
163 | ::
164 |
165 | pip install -U --user seaborn
166 |
167 | For neuroimaging:
168 |
169 | ::
170 |
171 | pip install -U --user nibabel
172 | pip install -U --user nilearn
173 |
174 |
175 | **spyder**: IDE (integrated development environment):
176 |
177 | - Syntax highlighting.
178 | - Code introspection for code completion (use ``TAB``).
179 | - Support for multiple Python consoles (including IPython).
180 | - Explore and edit variables from a GUI.
181 | - Debugging.
182 | - Navigate in the code (go to function definition) with ``CTRL`` + click.
183 |
184 | 3 or 4 panels:
185 |
186 | +-------------+-------------------------+
187 | | text editor | help/variable explorer |
188 | +-------------+-------------------------+
189 | | | ipython interpreter |
190 | +-------------+-------------------------+
191 |
192 | Shortcuts:
193 | - ``F9`` run line/selection
194 |
195 | Libraries
196 | ~~~~~~~~~
197 |
198 | scipy.org: https://www.scipy.org/
199 |
200 |
201 | **Numpy**: Basic numerical operations. Matrix operations plus some basic solvers::
202 |
203 | import numpy as np
204 | X = np.array([[1, 2], [3, 4]])
205 | #v = np.array([1, 2]).reshape((2, 1))
206 | v = np.array([1, 2])
207 | np.dot(X, v) # no broadcasting
208 | X * v # broadcasting
209 | np.dot(v, X)
210 | X - X.mean(axis=0)
211 |
212 | **Scipy**: General scientific library with advanced solvers::
213 |
214 | import scipy
215 | import scipy.linalg
216 | scipy.linalg.svd(X, full_matrices=False)
217 |
218 | **Matplotlib**: visualization::
219 |
220 | import numpy as np
221 | import matplotlib.pyplot as plt
222 | #%matplotlib qt
223 | x = np.linspace(0, 10, 50)
224 | sinus = np.sin(x)
225 | plt.plot(x, sinus)
226 | plt.show()
227 |
228 | **Pandas**: Manipulation of structured data (tables). Input/output of excel files, etc.
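
A minimal usage sketch (the small DataFrame below is made up for illustration)::

    import pandas as pd
    df = pd.DataFrame({"name": ["alice", "bob", "carl"], "age": [31, 44, 27]})
    print(df.describe())    # summary statistics of numerical columns
    print(df[df.age > 30])  # row selection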
229 |
230 | **Statsmodels**: Advanced statistics
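
A minimal OLS sketch (simulated data, for illustration only)::

    import numpy as np
    import statsmodels.api as sm
    x = np.random.randn(100)
    y = 2 * x + np.random.randn(100)
    model = sm.OLS(y, sm.add_constant(x)).fit()
    print(model.summary())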
231 |
232 | **Scikit-learn**: Machine learning
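
A minimal classification sketch (iris dataset shipped with scikit-learn)::

    from sklearn import datasets
    from sklearn.linear_model import LogisticRegression
    X, y = datasets.load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000).fit(X, y)
    print(clf.score(X, y))  # training accuracy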
233 |
234 | .. http://truben.no/table/
235 |
236 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
237 | | library | Arrays data, Num. comp, I/O | Structured data, I/O | Solvers: basic | Solvers: advanced | Stats: basic | Stats: advanced | Machine learning |
238 | +==============+=============================+======================+================+===================+==============+=================+==================+
239 | | Numpy | X | | X | | | | |
240 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
241 | | Scipy | | | X | X | X | | |
242 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
243 | | Pandas | | X | | | | | |
244 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
245 | | Statsmodels  |                             |                      |                |                   | X            | X               |                  |
246 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
247 | | Scikit-learn | | | | | | | X |
248 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+
249 |
250 |
--------------------------------------------------------------------------------
/R/ml_dimensionality_reduction_exo.R:
--------------------------------------------------------------------------------
1 | ######
2 | ## PCA
3 | ######
4 |
5 | # Write a class `BasicPCA` with two methods: `fit(X)` that estimates the data mean
6 | # and principal component directions, and `transform(X)` that projects new data
7 | # onto the principal components.
8 | #
9 | # Check that your `BasicPCA` performs similarly to the one from sklearn:
10 | # `from sklearn.decomposition import PCA`
11 |
12 |
13 | BasicPCA <- function(X, scale=FALSE){
14 | obj = list()
15 | Xc <- scale(X, center=TRUE, scale=scale)
16 | obj$mean <- attr(Xc, "scaled:center")
17 | s <- svd(Xc, nu = 0)
18 | # v [K x P] a matrix whose columns contain the right singular vectors of x
19 | obj$V = s$v
20 | obj$var = 1 / (nrow(X) - 1) * s$d ^2
21 | return(obj)
22 | }
23 |
24 | BasicPCA.transform <- function(obj, X){
25 | Xc <- scale(X, center=obj$mean, scale=FALSE)
26 | return(Xc %*% obj$V)
27 | }
28 |
29 | # https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/
30 | # dataset
31 | n_samples = 10
32 | experience = rnorm(n_samples)
33 | salary = 1500 + experience + .5 * rnorm(n_samples)
34 | other = rnorm(n_samples)
35 | X = cbind(experience, salary, other)
36 |
37 | # Optional: standardize data
38 | Xcs = scale(X, center=TRUE, scale=FALSE)
39 | attr(Xcs, "scaled:center") = NULL
40 | attr(Xcs, "scaled:scale") = NULL
41 |
42 | basic_pca = BasicPCA(Xcs)
43 | BasicPCA.transform(basic_pca, Xcs)
44 |
45 | # PCA with prcomp
46 | pca = prcomp(Xcs, center=TRUE, scale.=FALSE)
47 | names(pca)
48 |
49 | # Compare
50 | all(pca$rotation == basic_pca$V)
51 | all(predict(pca, Xcs) == BasicPCA.transform(basic_pca, Xcs))
52 |
53 | # "https://raw.github.com/neurospin/pystatsml/master/data/iris.csv"
54 | #
55 | # Describe the data set. Should the dataset be standardized?
56 | #
57 | # Retrieve the explained variance ratio. Determine $K$ the number of components.
58 | #
59 | # Print the $K$ principal components direction and correlation of the $K$ principal
60 | # components with original variables. Interpret the contribution of original variables
61 | # into the PC.
62 | #
63 | # Plot samples projected into the $K$ first PCs.
64 | #
65 | # Color samples with their species.
66 | #
67 |
68 | url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
69 | data = read.csv(url)
70 | #setwd("/home/ed203246/git/pystatsml/notebooks")
71 | data = read.csv("../data/iris.csv")
72 |
73 | # Describe the data set. Should the dataset be standardized?
74 |
75 | summary(data)
76 | # sepal_length sepal_width petal_length petal_width species
77 | # Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
78 | # 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
79 | # Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
80 | # Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
81 | # 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
82 | # Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
83 |
84 | numcols = colnames(data)[unlist(lapply(data, is.numeric))]
85 | apply(data[, numcols], 2, sd)
86 | #sepal_length sepal_width petal_length petal_width
87 | #0.8280661 0.4358663 1.7652982 0.7622377
88 |
89 |
90 | # Describe the structure of correlation among variables.
91 | X = data[, numcols]
92 | cor(X)
93 |
94 | # Compute a PCA with the maximum number of components.
95 | Xcs = scale(X, center=TRUE, scale=TRUE)
96 | attr(Xcs, "scaled:center") = NULL
97 | attr(Xcs, "scaled:scale") = NULL
98 | apply(Xcs, 2, sd)
99 | apply(Xcs, 2, mean)
100 |
101 | # Compute a PCA with the maximum number of components.
102 | pca = prcomp(Xcs)
103 |
104 | # Variance ratio by component
105 | (pca$sdev ** 2) / sum(pca$sdev ** 2)
106 | #[1] 0.729624454 0.228507618 0.036689219 0.005178709
107 |
108 | # cumulative explained variance
109 | cumsum(pca$sdev ** 2) / sum(pca$sdev ** 2)
110 |
111 | # K = 2
112 | names(pca)
113 | pca$rotation
114 |
115 | PC = predict(pca, Xcs)
116 | t(cor(Xcs, PC[, 1:2]))
117 | # sepal_length sepal_width petal_length petal_width
118 | # PC1 0.8901688 -0.4601427 0.99155518 0.96497896
119 | # PC2 -0.3608299 -0.8827163 -0.02341519 -0.06399985
120 |
121 | data = cbind(data, PC)
122 |
123 | # Plot samples projected into the K first PCs
124 | # Color samples with their species.
125 | library(ggplot2)
126 |
127 | qplot(PC1, PC2, data=data, colour=species)
128 |
129 | ####################################################################
130 | ## MDS
131 | ####################################################################
132 |
133 | ##############
134 | ## eurodist ##
135 | ##############
136 |
137 | # Perform a similar analysis on the eurodist dataset in R, using:
138 | # - MDS: cmdscale.
139 | # - Euclidean pairwise distance: dist
140 | #
141 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv'
142 | #data = read.csv(url)
143 |
144 | setwd("~/git/pystatsml/notebooks")
145 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv'
146 | data = read.csv("../data/eurodist.csv")
147 |
148 | city = data[["city"]]
149 | D = data[, 2:ncol(data)]
150 |
151 | print(data[1:5, 1:5])
152 |
153 | # Arbitrary choice of K=2 components
154 | mds = cmdscale(D, k=2, eig=T)
155 |
156 | # Recover the coordinates of the cities in a Euclidean coordinate system whose orientation is arbitrary.
157 | print(as.matrix(dist(mds$points))[1:5, 1:5])
158 |
159 | plot(mds$points[,1], -mds$points[,2])
160 | text(mds$points[,1], -mds$points[,2], city, cex=0.8)
161 |
162 |
163 | # Apply MDS using cmdscale
164 | k_range = 1:min(5, nrow(D) - 1)
165 | stress <- rep(0, max(k_range))
166 | for (kk in k_range){
167 | mds <- cmdscale(D, k=kk, eig=T)
168 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5
169 | }
170 | plot(k_range, stress, type="l", xlab="k", ylab="stress")
171 | #cbind(1:max.k,P.k)
172 |
173 | # Ressources
174 | # http://people.stat.sc.edu/Hitchcock/chapter5_R_examples.txt
175 |
176 | ##########
177 | ## iris ##
178 | ##########
179 |
180 | # Perform a similar analysis on the iris dataset in R, using:
181 | # - MDS: cmdscale.
182 | # - Euclidean pairwise distance: dist
183 | #
184 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
185 | #data = read.csv(url)
186 |
187 | setwd("~/git/pystatsml/notebooks")
188 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv'
189 | data = read.csv("../data/iris.csv")
190 |
191 | species = data[["species"]]
192 | X = scale(data[, 1:4])
193 | attr(X, "scaled:center") = NULL
194 | attr(X, "scaled:scale") = NULL
195 | D = as.matrix(dist(X))
196 | print(D[1:5, 1:5])
197 |
198 | # Select K
199 | k_range = 1:min(5, nrow(D) - 1)
200 | stress <- rep(0, max(k_range))
201 | for (kk in k_range){
202 | mds <- cmdscale(D, k=kk, eig=T)
203 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5
204 | }
205 | plot(k_range, stress, type="l", xlab="k", ylab="stress")
206 |
207 | K = 2 # components
208 | mds = cmdscale(D, k=K , eig=T)
209 |
210 | # Recover the coordinates of the samples in a Euclidean coordinate system whose orientation is arbitrary.
211 | print(as.matrix(dist(mds$points))[1:5, 1:5])
212 |
213 | plot(mds$points[,1], -mds$points[,2], col=species)
214 |
215 | # PCA with prcomp
216 | pca = prcomp(X, center=TRUE, scale.=FALSE)
217 | names(pca)
218 | PC = predict(pca, X)[, 1:K]
219 |
220 | # Compute correlation between PCA and MDS components
221 | cor(cbind(mds$points, PC))
222 |
223 | # 1.000000e+00 1.551000e-16 1.000000e+00 4.766625e-16
224 | # 1.551000e-16 1.000000e+00 4.474091e-16 -1.000000e+00
225 | # PC1 1.000000e+00 4.474091e-16 1.000000e+00 1.842964e-16
226 | # PC2 4.766625e-16 -1.000000e+00 1.842964e-16 1.000000e+00
227 |
228 |
229 | ####################################################################
230 | ## isomap
231 | ####################################################################
232 | install.packages("vegan")
233 |
234 | s_curve = read.csv("../data/s_curve.csv")
235 | colnames(s_curve)
236 |
237 | X = as.matrix(s_curve[, c("x", "y", "z")])
238 | color = s_curve[["color"]]
239 | D <- dist(X, method="euclidean")
240 |
241 | library(vegan)
242 |
243 | iso = isomap(D, ndim=2, k=10)
244 |
245 | #install.packages("ggplot2")
246 | library(ggplot2)
247 |
248 | qplot(iso$points[,1], iso$points[,2], col=color) + scale_colour_gradientn(colours=rainbow(4))
249 | # Alternative palette: + scale_fill_distiller(palette = "Spectral")
--------------------------------------------------------------------------------
/utils/ml_processing_pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 11 15:40:35 2016
4 |
5 | @author: edouard.duchesnay@cea.fr
6 | """
7 | from sklearn import preprocessing
8 | preprocessing.OneHotEncoder
9 |
10 |
11 | '''
12 | Regression pipelines
13 | ====================
14 | '''
15 | import numpy as np
16 | from sklearn import datasets
17 | import sklearn.linear_model as lm
18 | from sklearn import preprocessing
19 | from sklearn.model_selection import cross_val_score
20 | from sklearn.feature_selection import SelectKBest
21 | from sklearn.feature_selection import f_regression
22 | from sklearn.pipeline import Pipeline
23 | from sklearn.model_selection import GridSearchCV
24 | import sklearn.metrics as metrics
25 |
26 | # Datasets
27 | n_samples, n_features, noise_sd = 100, 100, 20
28 | X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=n_features,
29 | noise=noise_sd, n_informative=5,
30 | random_state=42, coef=True)
31 |
32 | # Use this to tune the noise parameter such that snr < 5
33 | print("SNR:", np.std(np.dot(X, coef)) / noise_sd)
34 |
35 | print("=============================")
36 | print("== Basic linear regression ==")
37 | print("=============================")
38 |
39 | scores = cross_val_score(estimator=lm.LinearRegression(), X=X, y=y, cv=5)
40 | print("Test r2:%.2f" % scores.mean())
41 |
42 | print("==============================================")
43 | print("== Scaler + anova filter + ridge regression ==")
44 | print("==============================================")
45 |
46 | anova_ridge = Pipeline([
47 | ('standardscaler', preprocessing.StandardScaler()),
48 | ('selectkbest', SelectKBest(f_regression)),
49 | ('ridge', lm.Ridge())
50 | ])
51 | param_grid = {'selectkbest__k':np.arange(10, 110, 10),
52 | 'ridge__alpha':[.001, .01, .1, 1, 10, 100] }
53 |
54 | # Expected to be executed in IPython; for plain Python remove the %time magics.
55 | print("----------------------------")
56 | print("-- Parallelize inner loop --")
57 | print("----------------------------")
58 |
59 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, n_jobs=-1)
60 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5)
61 | print("Test r2:%.2f" % scores.mean())
62 |
63 | print("----------------------------")
64 | print("-- Parallelize outer loop --")
65 | print("----------------------------")
66 |
67 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid)
68 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, n_jobs=-1)
69 | print("Test r2:%.2f" % scores.mean())
70 |
71 |
72 | print("=====================================")
73 | print("== Scaler + Elastic-net regression ==")
74 | print("=====================================")
75 |
76 | alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000]
77 | l1_ratio = [.1, .5, .9]
78 |
79 | print("----------------------------")
80 | print("-- Parallelize outer loop --")
81 | print("----------------------------")
82 |
83 | enet = Pipeline([
84 | ('standardscaler', preprocessing.StandardScaler()),
85 | ('enet', lm.ElasticNet(max_iter=10000)),
86 | ])
87 | param_grid = {'enet__alpha':alphas ,
88 | 'enet__l1_ratio':l1_ratio}
89 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid)
90 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1)
91 | print("Test r2:%.2f" % scores.mean())
92 |
93 | print("-----------------------------------------------")
94 | print("-- Parallelize outer loop + built-in CV --")
95 | print("-- Remark: scaler is only done on outer loop --")
96 | print("-----------------------------------------------")
97 |
98 | enet_cv = Pipeline([
99 | ('standardscaler', preprocessing.StandardScaler()),
100 | ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas)),
101 | ])
102 |
103 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5)
104 | print("Test r2:%.2f" % scores.mean())
105 |
106 | '''
107 | Classification pipelines
108 | ========================
109 | '''
110 | import numpy as np
111 | from sklearn import datasets
112 | import sklearn.linear_model as lm
113 | from sklearn import preprocessing
114 | from sklearn.model_selection import cross_val_score
115 | from sklearn.feature_selection import SelectKBest
116 | from sklearn.feature_selection import f_classif
117 | from sklearn.pipeline import Pipeline
118 | from sklearn.model_selection import GridSearchCV
119 | import sklearn.metrics as metrics
120 |
121 | # Datasets
122 | n_samples, n_features, noise_sd = 100, 100, 20
123 | X, y = datasets.make_classification(n_samples=n_samples, n_features=n_features,
124 | n_informative=5, random_state=42)
125 |
126 |
127 | def balanced_acc(estimator, X, y, **kwargs):
128 | '''
129 | Balanced accuracy scorer
130 | '''
131 | return metrics.recall_score(y, estimator.predict(X), average=None).mean()
132 |
133 | print("===============================")
134 | print("== Basic logistic regression ==")
135 | print("===============================")
136 |
137 | scores = cross_val_score(estimator=lm.LogisticRegression(C=1e8, class_weight='balanced'),
138 | X=X, y=y, cv=5, scoring=balanced_acc)
139 | print("Test bACC:%.2f" % scores.mean())
140 |
141 | print("=======================================================")
142 | print("== Scaler + anova filter + ridge logistic regression ==")
143 | print("=======================================================")
144 |
145 | anova_ridge = Pipeline([
146 | ('standardscaler', preprocessing.StandardScaler()),
147 | ('selectkbest', SelectKBest(f_classif)),
148 | ('ridge', lm.LogisticRegression(penalty='l2', class_weight='balanced'))
149 | ])
150 | param_grid = {'selectkbest__k':np.arange(10, 110, 10),
151 | 'ridge__C':[.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]}
152 |
153 |
154 | # Expected to be executed in IPython; for plain Python remove the %time magics.
155 | print("----------------------------")
156 | print("-- Parallelize inner loop --")
157 | print("----------------------------")
158 |
159 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid,
160 | scoring=balanced_acc, n_jobs=-1)
161 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\
162 | scoring=balanced_acc)
163 | print("Test bACC:%.2f" % scores.mean())
164 |
165 | print("----------------------------")
166 | print("-- Parallelize outer loop --")
167 | print("----------------------------")
168 |
169 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid,
170 | scoring=balanced_acc)
171 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\
172 | scoring=balanced_acc, n_jobs=-1)
173 | print("Test bACC:%.2f" % scores.mean())
174 |
175 |
176 | print("========================================")
177 | print("== Scaler + lasso logistic regression ==")
178 | print("========================================")
179 |
180 | Cs = np.array([.0001, .001, .01, .1, 1, 10, 100, 1000, 10000])
181 | alphas = 1 / Cs
182 | l1_ratio = [.1, .5, .9]
183 |
184 | print("----------------------------")
185 | print("-- Parallelize outer loop --")
186 | print("----------------------------")
187 |
188 | lasso = Pipeline([
189 | ('standardscaler', preprocessing.StandardScaler()),
190 | ('lasso', lm.LogisticRegression(penalty='l1', class_weight='balanced')),
191 | ])
192 | param_grid = {'lasso__C':Cs}
193 | enet_cv = GridSearchCV(lasso, cv=5, param_grid=param_grid, scoring=balanced_acc)
194 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\
195 | scoring=balanced_acc, n_jobs=-1)
196 | print("Test bACC:%.2f" % scores.mean())
197 |
198 |
199 | print("-----------------------------------------------")
200 | print("-- Parallelize outer loop + built-in CV --")
201 | print("-- Remark: scaler is only done on outer loop --")
202 | print("-----------------------------------------------")
203 |
204 | lasso_cv = Pipeline([
205 | ('standardscaler', preprocessing.StandardScaler()),
206 | ('lasso', lm.LogisticRegressionCV(Cs=Cs, scoring=balanced_acc)),
207 | ])
208 |
209 | %time scores = cross_val_score(estimator=lasso_cv, X=X, y=y, cv=5)
210 | print("Test bACC:%.2f" % scores.mean())
211 |
212 |
213 | print("=============================================")
214 | print("== Scaler + Elasticnet logistic regression ==")
215 | print("=============================================")
216 |
217 | print("----------------------------")
218 | print("-- Parallelize outer loop --")
219 | print("----------------------------")
220 |
221 | enet = Pipeline([
222 | ('standardscaler', preprocessing.StandardScaler()),
223 | ('enet', lm.SGDClassifier(loss="log", penalty="elasticnet",
224 | alpha=0.0001, l1_ratio=0.15, class_weight='balanced')),
225 | ])
226 |
227 | param_grid = {'enet__alpha':alphas,
228 | 'enet__l1_ratio':l1_ratio}
229 |
230 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid, scoring=balanced_acc)
231 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\
232 | scoring=balanced_acc, n_jobs=-1)
233 | print("Test bACC:%.2f" % scores.mean())
234 |
--------------------------------------------------------------------------------
/machine_learning/ml_supervized_nonlinear.py:
--------------------------------------------------------------------------------
1 | '''
2 | Non-linear models
3 | =================
4 |
5 | Here we focus on non-linear models for classification. Nevertheless, each
6 | classification model has its regression counterpart.
7 | '''
8 |
9 | # get_ipython().run_line_magic('matplotlib', 'inline')
10 | import matplotlib.pyplot as plt
11 |
12 | import numpy as np
13 | import pandas as pd
14 | import seaborn as sns
15 | import matplotlib.pyplot as plt
16 |
17 | from sklearn.svm import SVC
18 | from sklearn.preprocessing import StandardScaler
19 |
20 | from sklearn import datasets
21 | from sklearn import metrics
22 | from sklearn.model_selection import train_test_split
23 |
24 | np.set_printoptions(precision=2)
25 | pd.set_option('display.precision', 2)
26 |
27 | # %%
28 | # Support Vector Machines (SVM)
29 | # -----------------------------
30 | #
31 | # SVMs are kernel-based methods that require only a user-specified kernel function
32 | # :math:`K(x_i, x_j)`, i.e., a **similarity function** over pairs of data
33 | # points :math:`(x_i, x_j)`, mapping them into a kernel (dual) space in which learning
34 | # algorithms operate linearly, i.e. every operation on points is a linear
35 | # combination of :math:`K(x_i, x_j)`.
36 | # Outline of the SVM algorithm:
37 | #
38 | # 1. Map points :math:`x` into kernel space using a kernel function:
39 | # :math:`x \rightarrow K(x, .)`.
40 | # 2. The learning algorithm operates linearly, i.e. by dot products in the
41 | #    kernel space :math:`K(., x_i) \cdot K(., x_j)`.
42 | #    - Using the kernel trick (Mercer’s Theorem), the dot product in the high
43 | #      dimensional space is replaced by a simpler operation such that
44 | #      :math:`K(., x_i) \cdot K(., x_j) = K(x_i, x_j)`.
45 | #      Thus we only need to compute a similarity measure for each pair of
46 | #      points and store them in an :math:`N \times N` Gram matrix.
47 | #    - Finally, the learning process consists of estimating the :math:`\alpha_i` of
48 | #      the decision function that minimises the hinge loss (of :math:`f(x)`)
49 | #      plus some penalty, evaluated on all training points.
50 | #
51 | # .. math::
52 | #
53 | # f(x) = \text{sign} \left(\sum_i^N \alpha_i~y_i~K(x_i, x)\right).
54 | #
55 | # 3. Predict a new point $x$ using the decision function.
56 | #
57 | # .. figure:: ../images/svm_rbf_kernel_mapping_and_decision_function.png
58 | # :alt: Support Vector Machines.
59 | #
60 | # Gaussian kernel (RBF, Radial Basis Function):
61 | #
62 | # One of the most commonly used kernel is the Radial Basis Function (RBF) Kernel.
63 | # For a pair of points :math:`x_i, x_j` the RBF kernel is defined as:
64 | #
65 | # .. raw:: latex
66 | #
67 | # \begin{align}
68 | # K(x_i, x_j) &= \exp\left(-\frac{\|x_i - x_j\|^2}{2\sigma^2}\right)\\
69 | # &= \exp\left(-\gamma~\|x_i - x_j\|^2\right)
70 | # \end{align}
71 | #
72 | # Where :math:`\sigma` (or :math:`\gamma`) defines the kernel width parameter.
73 | # Basically, we consider a Gaussian function centered on each training sample
74 | # :math:`x_i`. It has a ready interpretation as a similarity measure, as it
75 | # decreases with the squared Euclidean distance between the two feature vectors.
76 | #
77 | # Non-linear SVMs also exist for regression problems.
78 |
79 |
80 | # %%
81 | # dataset
82 |
83 | X, y = datasets.load_breast_cancer(return_X_y=True)
84 | X_train, X_test, y_train, y_test = \
85 | train_test_split(X, y, test_size=0.5, stratify=y, random_state=42)
86 |
87 | # %%
88 | # Preprocessing: input features have unequal variances, which requires scaling for SVM.
89 |
90 | ax = sns.displot(x=X_train.std(axis=0), kind="kde", bw_adjust=.2, cut=0,
91 | fill=True, height=3, aspect=1.5,)
92 | _ = ax.set_xlabels("Std-dev").tight_layout()
93 |
94 | scaler = StandardScaler()
95 | X_train = scaler.fit_transform(X_train)
96 | X_test = scaler.transform(X_test)  # reuse the scaler fitted on the training set
97 |
98 | # %%
99 | # Fit-predict
100 | # Probability is a logistic function of the decision_function
101 |
102 | svm = SVC(kernel='rbf', probability=True).fit(X_train, y_train)
103 | y_pred = svm.predict(X_test)
104 | y_score = svm.decision_function(X_test)
105 | y_prob = svm.predict_proba(X_test)[:, 1]
106 |
107 | ax = sns.relplot(x=y_score, y=y_prob, hue=y_pred, height=2, aspect=1.5)
108 | _ = ax.set_axis_labels("decision function", "Probability").tight_layout()
109 |
110 | # %% Scores
111 |
112 | print("bAcc: %.2f, AUC: %.2f (AUC with proba: %.2f)" % (
113 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
114 | metrics.roc_auc_score(y_true=y_test, y_score=y_score),
115 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
116 |
117 | # Useful internals: indices of support vectors within the original X
118 | np.all(X_train[svm.support_, :] == svm.support_vectors_)
119 |
120 |
121 | # %%
122 | # Random forest
123 | # -------------
124 | #
125 | # Decision tree
126 | # ~~~~~~~~~~~~~
127 | #
128 | # A tree can be "learned" by splitting the training dataset into subsets based on a test of a feature's value.
129 | # Each internal node represents a "test" on a feature resulting in a split of the current sample. At each step, the algorithm selects the feature and a cutoff value that maximise a given metric. Different metrics exist for regression trees (the target is continuous) and classification trees (the target is qualitative).
130 | # This process is repeated on each derived subset in a recursive manner called recursive partitioning. The recursion is completed when the subset at a node has all the same value of the target variable, or when splitting no longer adds value to the predictions. This general principle is implemented by many recursive partitioning tree algorithms.
131 | #
132 | # .. figure:: ../images/classification_tree.png
133 | # :width: 400
134 | # :alt: Classification tree.
135 | #
136 | # Decision trees are simple to understand and interpret, however they tend to overfit the training set. Leo Breiman proposed random forests to deal with this issue.
137 | #
138 | # A single decision tree usually overfits the data it is learning from because it learns from only one pathway of decisions, so its predictions are usually not accurate on new data.
139 | #
140 | # Forest
141 | # ~~~~~~
142 | #
143 | # A random forest is a meta estimator that fits a number of **decision tree learners** on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
144 | # Random forest models reduce the risk of overfitting by introducing randomness:
145 | #
146 | # .. figure:: ../images/random_forest.png
147 | # :width: 300
148 | # :alt: Random forest.
149 | #
150 | # - building multiple trees (n_estimators)
151 | # - drawing observations with replacement (i.e., a bootstrapped sample)
152 | # - splitting nodes on the best split among a random subset of the features selected at every node
153 | #
154 |
155 | from sklearn.ensemble import RandomForestClassifier
156 |
157 | forest = RandomForestClassifier(n_estimators=100)
158 | forest.fit(X_train, y_train)
159 |
160 | y_pred = forest.predict(X_test)
161 | y_prob = forest.predict_proba(X_test)[:, 1]
162 |
163 |
164 | # %% Scores
165 |
166 | print("bAcc: %.2f, AUC: %.2f " % (
167 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
168 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
169 |
170 | # %%
171 | # Extra Trees (Low Variance)
172 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
173 | #
174 | # Extra Trees is like Random Forest in that it builds multiple trees and splits nodes using random subsets of features, but with two key differences: it does not bootstrap observations (it samples without replacement), and nodes are split on random splits rather than on the best splits. In summary, ExtraTrees:
175 | #
176 | # - builds multiple trees with bootstrap = False by default, which means it samples without replacement
177 | # - splits nodes on random splits among a random subset of the features selected at every node
178 | #
179 | # In Extra Trees, randomness does not come from bootstrapping the data, but from the random splits of all observations. The name ExtraTrees stands for Extremely Randomized Trees (see the sketch below).
178 |
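# %%
# A minimal Extra Trees usage sketch (not in the original script; it reuses the
# same scaled train/test split as above, with illustrative hyper-parameters):

from sklearn.ensemble import ExtraTreesClassifier

extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees.fit(X_train, y_train)

y_pred = extra_trees.predict(X_test)
y_prob = extra_trees.predict_proba(X_test)[:, 1]

print("bAcc: %.2f, AUC: %.2f " % (
    metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
    metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))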
179 |
180 | # %%
181 | # Gradient boosting
182 | # -----------------
183 | #
184 | # Gradient boosting is a meta estimator that fits a sequence of **weak learners**.
185 | # Each learner aims to reduce the residuals (errors) produced by the previous learner.
186 | # The two main hyper-parameters are:
187 | #
188 | # - The **learning rate** (*lr*) controls over-fitting:
189 | #   decreasing the *lr* limits the capacity of a learner to overfit the residuals, i.e.,
190 | #   it slows down the learning speed and thus increases the **regularisation**
191 | #   (illustrated in the sketch after the example below).
192 | #
193 | # - The **sub-sampling fraction** controls the fraction of samples used for
194 | #   fitting the learners. Values smaller than 1 lead to **Stochastic Gradient Boosting**.
195 | #   It thus controls over-fitting by reducing variance and increasing bias.
195 | #
196 | # .. figure:: ../images/gradient_boosting.png
197 | # :width: 500
198 | # :alt: Gradient boosting.
199 | #
200 |
201 |
202 | from sklearn.ensemble import GradientBoostingClassifier
203 |
204 | gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
205 | subsample=0.5, random_state=0)
206 | gb.fit(X_train, y_train)
207 |
208 | y_pred = gb.predict(X_test)
209 | y_prob = gb.predict_proba(X_test)[:, 1]
210 |
211 | print("bAcc: %.2f, AUC: %.2f " % (
212 | metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
213 | metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
214 |
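
# %%
# To illustrate the regularisation effect of the learning rate described above, a
# small sketch (hyper-parameter values are illustrative, not from the original
# script) compares a few learning rates with 5-fold cross-validation on the
# training set:

from sklearn.model_selection import cross_val_score

for lr in [1.0, 0.1, 0.01]:  # illustrative learning rates
    scores = cross_val_score(
        GradientBoostingClassifier(n_estimators=100, learning_rate=lr,
                                   subsample=0.5, random_state=0),
        X_train, y_train, cv=5, scoring="balanced_accuracy")
    print("lr=%.2f, CV bAcc: %.2f (+/- %.2f)" % (lr, scores.mean(), scores.std()))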
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Machine Learning documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Nov 30 16:25:34 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 | import shlex
18 |
19 | # If extensions (or modules to document with autodoc) are in another directory,
20 | # add these directories to sys.path here. If the directory is relative to the
21 | # documentation root, use os.path.abspath to make it absolute, like shown here.
22 | #sys.path.insert(0, os.path.abspath('.'))
23 |
24 | # -- General configuration ------------------------------------------------
25 |
26 | # If your documentation needs a minimal Sphinx version, state it here.
27 | #needs_sphinx = '1.0'
28 |
29 | # Add any Sphinx extension module names here, as strings. They can be
30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
31 | # ones.
32 | extensions = [
33 | 'sphinx.ext.mathjax',
34 | 'sphinx_gallery.gen_gallery',
35 | 'docxbuilder',
36 | ]
37 |
38 | # Add any paths that contain templates here, relative to this directory.
39 | templates_path = ['_templates']
40 |
41 | # The suffix(es) of source filenames.
42 | # You can specify multiple suffix as a list of string:
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = '.rst'
45 |
46 | # The encoding of source files.
47 | #source_encoding = 'utf-8-sig'
48 |
49 | # The master toctree document.
50 | master_doc = 'index'
51 |
52 | # General information about the project.
53 | project = u'Statistics and Machine Learning in Python'
54 | copyright = u'2020, Edouard Duchesnay, NeuroSpin CEA Université Paris-Saclay, France'
55 | author = u'Edouard Duchesnay, Tommy Löfstedt, Younes Feki'
56 |
57 | # The version info for the project you're documenting, acts as replacement for
58 | # |version| and |release|, also used in various other places throughout the
59 | # built documents.
60 | #
61 | # The short X.Y version.
62 | version = '0.5'
63 | # The full version, including alpha/beta/rc tags.
64 | release = '0.5'
65 |
66 | # The language for content autogenerated by Sphinx. Refer to documentation
67 | # for a list of supported languages.
68 | #
69 | # This is also used if you do content translation via gettext catalogs.
70 | # Usually you set "language" from the command line for these cases.
71 | language = None
72 |
73 | # There are two options for replacing |today|: either, you set today to some
74 | # non-false value, then it is used:
75 | #today = ''
76 | # Else, today_fmt is used as the format for a strftime call.
77 | #today_fmt = '%B %d, %Y'
78 |
79 | # List of patterns, relative to source directory, that match files and
80 | # directories to ignore when looking for source files.
81 | exclude_patterns = ["notebooks/notebooks"]
82 |
83 | # The reST default role (used for this markup: `text`) to use for all
84 | # documents.
85 | #default_role = None
86 |
87 | # If true, '()' will be appended to :func: etc. cross-reference text.
88 | #add_function_parentheses = True
89 |
90 | # If true, the current module name will be prepended to all description
91 | # unit titles (such as .. function::).
92 | #add_module_names = True
93 |
94 | # If true, sectionauthor and moduleauthor directives will be shown in the
95 | # output. They are ignored by default.
96 | #show_authors = False
97 |
98 | # The name of the Pygments (syntax highlighting) style to use.
99 | pygments_style = 'sphinx'
100 |
101 | # A list of ignored prefixes for module index sorting.
102 | #modindex_common_prefix = []
103 |
104 | # If true, keep warnings as "system message" paragraphs in the built documents.
105 | keep_warnings = False
106 |
107 | # If true, `todo` and `todoList` produce output, else they produce nothing.
108 | todo_include_todos = False
109 |
110 |
111 | # -- Options for HTML output ----------------------------------------------
112 |
113 | # The theme to use for HTML and HTML Help pages. See the documentation for
114 | # a list of builtin themes.
115 | html_theme = 'alabaster'
116 |
117 | # Theme options are theme-specific and customize the look and feel of a theme
118 | # further. For a list of options available for each theme, see the
119 | # documentation.
120 | #html_theme_options = {}
121 |
122 | # Add any paths that contain custom themes here, relative to this directory.
123 | #html_theme_path = []
124 |
125 | # The name for this set of Sphinx documents. If None, it defaults to
126 | # " v documentation".
127 | #html_title = None
128 |
129 | # A shorter title for the navigation bar. Default is the same as html_title.
130 | #html_short_title = None
131 |
132 | # The name of an image file (relative to this directory) to place at the top
133 | # of the sidebar.
134 | #html_logo = None
135 |
136 | # The name of an image file (within the static path) to use as favicon of the
137 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
138 | # pixels large.
139 | #html_favicon = None
140 |
141 | # Add any paths that contain custom static files (such as style sheets) here,
142 | # relative to this directory. They are copied after the builtin static files,
143 | # so a file named "default.css" will overwrite the builtin "default.css".
144 | html_static_path = ['_static']
145 |
146 | # Add any extra paths that contain custom files (such as robots.txt or
147 | # .htaccess) here, relative to this directory. These files are copied
148 | # directly to the root of the documentation.
149 | #html_extra_path = []
150 |
151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
152 | # using the given strftime format.
153 | #html_last_updated_fmt = '%b %d, %Y'
154 |
155 | # If true, SmartyPants will be used to convert quotes and dashes to
156 | # typographically correct entities.
157 | #html_use_smartypants = True
158 |
159 | # Custom sidebar templates, maps document names to template names.
160 | #html_sidebars = {}
161 |
162 | # Additional templates that should be rendered to pages, maps page names to
163 | # template names.
164 | #html_additional_pages = {}
165 |
166 | # If false, no module index is generated.
167 | #html_domain_indices = True
168 |
169 | # If false, no index is generated.
170 | #html_use_index = True
171 |
172 | # If true, the index is split into individual pages for each letter.
173 | #html_split_index = False
174 |
175 | # If true, links to the reST sources are added to the pages.
176 | #html_show_sourcelink = True
177 |
178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
179 | #html_show_sphinx = True
180 |
181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
182 | #html_show_copyright = True
183 |
184 | # If true, an OpenSearch description file will be output, and all pages will
185 | # contain a tag referring to it. The value of this option must be the
186 | # base URL from which the finished HTML is served.
187 | html_use_opensearch = 'https://duchesnay.github.io/pystatsml/'
188 |
189 | # This is the file name suffix for HTML files (e.g. ".xhtml").
190 | #html_file_suffix = None
191 |
192 | # Language to be used for generating the HTML full-text search index.
193 | # Sphinx supports the following languages:
194 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
195 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
196 | #html_search_language = 'en'
197 |
198 | # A dictionary with options for the search language support, empty by default.
199 | # Now only 'ja' uses this config value
200 | #html_search_options = {'type': 'default'}
201 |
202 | # The name of a javascript file (relative to the configuration directory) that
203 | # implements a search results scorer. If empty, the default will be used.
204 | #html_search_scorer = 'scorer.js'
205 |
206 | # Output file base name for HTML help builder.
207 | htmlhelp_basename = 'StatisticsMachineLearningPython'
208 |
209 | # -- Options for LaTeX output ---------------------------------------------
210 |
211 | latex_elements = {
212 | # The paper size ('letterpaper' or 'a4paper').
213 | 'papersize': 'a4paper',
214 |
215 | # The font size ('10pt', '11pt' or '12pt').
216 | #'pointsize': '10pt',
217 | 'pointsize': '11pt',
218 | # Additional stuff for the LaTeX preamble.
219 | # 'preamble': '''
220 | # \\usepackage{amsfonts}
221 | # ''',
222 | 'preamble': r'''
223 | \usepackage{charter}
224 | \usepackage[defaultsans]{lato}
225 | \usepackage{inconsolata}
226 | ''',
227 |
228 | # Latex figure (float) alignment
229 | #'figure_align': 'htbp',
230 | }
231 |
232 | # Grouping the document tree into LaTeX files. List of tuples
233 | # (source start file, target name, title,
234 | # author, documentclass [howto, manual, or own class]).
235 | latex_documents = [
236 | (master_doc, 'StatisticsMachineLearningPython.tex', u'Statistics and Machine Learning in Python',
237 | # (master_doc, 'StatisticsMachineLearningPython.tex', u'Python fundamentals and advanced',
238 | u'Edouard Duchesnay, Tommy Löfstedt, Feki Younes', 'manual'),
239 | ]
240 |
241 | # The name of an image file (relative to this directory) to place at the top of
242 | # the title page.
243 | #latex_logo = None
244 |
245 | # For "manual" documents, if this is true, then toplevel headings are parts,
246 | # not chapters.
247 | #latex_use_parts = False
248 |
249 | # If true, show page references after internal links.
250 | #latex_show_pagerefs = False
251 |
252 | # If true, show URL addresses after external links.
253 | # latex_show_urls = True
254 |
255 | # Documents to append as an appendix to all manuals.
256 | #latex_appendices = []
257 |
258 | # If false, no module index is generated.
259 | #latex_domain_indices = True
260 |
261 |
262 | # -- Options for manual page output ---------------------------------------
263 |
264 | # One entry per manual page. List of tuples
265 | # (source start file, name, description, authors, manual section).
266 | man_pages = [
267 | (master_doc, 'statisticsmachinelearning', u'Statistics and Machine Learning in Python',
268 | [author], 1)
269 | ]
270 |
271 | # If true, show URL addresses after external links.
272 | #man_show_urls = False
273 |
274 |
275 | # -- Options for Texinfo output -------------------------------------------
276 |
277 | # Grouping the document tree into Texinfo files. List of tuples
278 | # (source start file, target name, title, author,
279 | # dir menu entry, description, category)
280 | texinfo_documents = [
281 | (master_doc, 'StatisticsMachineLearningPython', u'Statistics and Machine Learning in Python',
282 | author, 'MachineLearning', 'One line description of project.',
283 | 'Miscellaneous'),
284 | ]
285 |
286 | # Documents to append as an appendix to all manuals.
287 | #texinfo_appendices = []
288 |
289 | # If false, no module index is generated.
290 | #texinfo_domain_indices = True
291 |
292 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
293 | #texinfo_show_urls = 'footnote'
294 |
295 | # If true, do not generate a @detailmenu in the "Top" node's menu.
296 | #texinfo_no_detailmenu = False
297 |
298 |
299 | # -- Options for sphinx gallery -------------------------------------------
300 |
301 | sphinx_gallery_conf = {
302 | # path to your examples scripts
303 | 'examples_dirs' : ['python_lang', 'scientific_python', 'statistics', 'machine_learning', 'labs'],
304 | 'filename_pattern': '/',
305 | # path where to save gallery generated examples
306 | 'gallery_dirs' : ['auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery'],
307 | 'backreferences_dir': False}
308 |
309 |
310 |
--------------------------------------------------------------------------------
/machine_learning/decomposition_solutions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Dimension reduction and feature extraction\n",
8 | "\n",
9 | "## Principal Component Analysis\n",
10 | "\n",
11 | "### Implement PCA\n",
12 | "\n",
13 |     "- Write a class `BasicPCA` with two methods: `fit(X)` that estimates the data mean and principal component directions, and `transform(X)` that projects new data onto the principal components.\n",
14 | "\n",
15 |     "- Check that your `BasicPCA` performs similarly to the one from sklearn:\n",
16 | "`from sklearn.decomposition import PCA`"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "metadata": {
23 | "execution": {
24 | "iopub.execute_input": "2020-10-11T22:53:14.585085Z",
25 | "iopub.status.busy": "2020-10-11T22:53:14.584709Z",
26 | "iopub.status.idle": "2020-10-11T22:53:15.274591Z",
27 | "shell.execute_reply": "2020-10-11T22:53:15.274226Z"
28 | }
29 | },
30 | "outputs": [],
31 | "source": [
32 | "import numpy as np\n",
33 | "import scipy\n",
34 | "import matplotlib.pyplot as plt\n",
35 | "import seaborn as sns\n",
36 | "%matplotlib inline\n",
37 | "#%matplotlib qt\n",
38 | "\n",
39 | "np.random.seed(42)\n",
40 | "\n",
41 | "\n",
42 | "import numpy as np\n",
43 | "from sklearn.decomposition import PCA\n",
44 | "\n",
45 | "\n",
46 | "class BasicPCA():\n",
47 | " def fit(self, X):\n",
48 | " # U : Unitary matrix having left singular vectors as columns.\n",
49 | " # Of shape (n_samples,n_samples) or (n_samples,n_comps), depending on\n",
50 | " # full_matrices.\n",
51 | " #\n",
52 | " # s : The singular values, sorted in non-increasing order. Of shape (n_comps,), \n",
53 | " # with n_comps = min(n_samples, n_features).\n",
54 | " #\n",
55 | " # Vh: Unitary matrix having right singular vectors as rows. \n",
56 | " # Of shape (n_features, n_features) or (n_comps, n_features) depending on full_matrices.\n",
57 | " self.mean = X.mean(axis=0)\n",
58 | " Xc = X - self.mean # Centering is required\n",
59 | " U, s, V = scipy.linalg.svd(Xc, full_matrices=False)\n",
60 | " self.explained_variance_ = (s ** 2) / X.shape[0]\n",
61 | " self.explained_variance_ratio_ = (self.explained_variance_ /\n",
62 | " self.explained_variance_.sum())\n",
63 | " self.princ_comp_dir = V\n",
64 | "\n",
65 | " def transform(self, X):\n",
66 | " Xc = X - self.mean\n",
67 | " return(np.dot(Xc, self.princ_comp_dir.T))\n",
68 | "\n",
69 | "# test\n",
70 | "np.random.seed(42)\n",
71 | " \n",
72 | "# dataset\n",
73 | "n_samples = 100\n",
74 | "experience = np.random.normal(size=n_samples)\n",
75 | "salary = 1500 + experience + np.random.normal(size=n_samples, scale=.5)\n",
76 | "X = np.column_stack([experience, salary])\n",
77 | "\n",
79 | "pca = PCA(n_components=2)\n",
80 | "pca.fit(X)\n",
81 | "\n",
82 | "basic_pca = BasicPCA()\n",
83 | "basic_pca.fit(X)\n",
84 | "\n",
85 | "print(pca.explained_variance_ratio_)\n",
86 |     "assert np.allclose(np.abs(basic_pca.transform(X)), np.abs(pca.transform(X)))  # equal up to sign and float precision\n"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "### Apply PCA on iris dataset\n",
94 | "\n",
95 |     "Apply sklearn PCA on the `iris` dataset available at: 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "execution": {
103 | "iopub.execute_input": "2020-10-11T22:53:15.278801Z",
104 | "iopub.status.busy": "2020-10-11T22:53:15.278467Z",
105 | "iopub.status.idle": "2020-10-11T22:53:16.236441Z",
106 | "shell.execute_reply": "2020-10-11T22:53:16.234869Z"
107 | }
108 | },
109 | "outputs": [],
110 | "source": [
111 | "import matplotlib.pyplot as plt\n",
112 | "\n",
113 | "from sklearn.decomposition import PCA\n",
114 | "# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/\n",
115 | "\n",
116 | "import numpy as np\n",
117 | "import pandas as pd\n",
118 | "\n",
119 | "try:\n",
120 |     " df = pd.read_csv('datasets/iris.csv')\n",
121 | "except:\n",
122 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'\n",
123 | " df = pd.read_csv(url)\n",
124 | "\n",
125 | "print(df.head())"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 |     "Describe the dataset. Should the dataset be standardized?"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "execution": {
140 | "iopub.execute_input": "2020-10-11T22:53:16.256201Z",
141 | "iopub.status.busy": "2020-10-11T22:53:16.255386Z",
142 | "iopub.status.idle": "2020-10-11T22:53:16.269795Z",
143 | "shell.execute_reply": "2020-10-11T22:53:16.269211Z"
144 | }
145 | },
146 | "outputs": [],
147 | "source": [
148 | "print(df.describe())"
149 | ]
150 | },
151 | {
152 | "cell_type": "markdown",
153 | "metadata": {},
154 | "source": [
155 | "Describe the structure of correlation among variables."
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": null,
161 | "metadata": {
162 | "execution": {
163 | "iopub.execute_input": "2020-10-11T22:53:16.273240Z",
164 | "iopub.status.busy": "2020-10-11T22:53:16.272789Z",
165 | "iopub.status.idle": "2020-10-11T22:53:16.275060Z",
166 | "shell.execute_reply": "2020-10-11T22:53:16.274585Z"
167 | }
168 | },
169 | "outputs": [],
170 | "source": [
171 | "X = np.array(df.iloc[:, :4])\n",
172 | "#np.around(np.corrcoef(X.T), 3)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {
179 | "execution": {
180 | "iopub.execute_input": "2020-10-11T22:53:16.279201Z",
181 | "iopub.status.busy": "2020-10-11T22:53:16.278783Z",
182 | "iopub.status.idle": "2020-10-11T22:53:16.283272Z",
183 | "shell.execute_reply": "2020-10-11T22:53:16.282896Z"
184 | }
185 | },
186 | "outputs": [],
187 | "source": [
188 | "# Center and standardize\n",
189 | "\n",
190 | "X = np.array(df.iloc[:, :4])\n",
191 | "X -= np.mean(X, axis=0)\n",
192 | "X /= np.std(X, axis=0, ddof=1)\n",
193 | "np.around(np.corrcoef(X.T), 3)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "Compute a PCA with the maximum number of components."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {
207 | "execution": {
208 | "iopub.execute_input": "2020-10-11T22:53:16.286362Z",
209 | "iopub.status.busy": "2020-10-11T22:53:16.285897Z",
210 | "iopub.status.idle": "2020-10-11T22:53:16.288689Z",
211 | "shell.execute_reply": "2020-10-11T22:53:16.288349Z"
212 | }
213 | },
214 | "outputs": [],
215 | "source": [
216 | "pca = PCA(n_components=X.shape[1])\n",
217 | "pca.fit(X)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "Retrieve the explained variance ratio. Determine $K$ the number of components."
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {
231 | "execution": {
232 | "iopub.execute_input": "2020-10-11T22:53:16.291425Z",
233 | "iopub.status.busy": "2020-10-11T22:53:16.291098Z",
234 | "iopub.status.idle": "2020-10-11T22:53:16.293764Z",
235 | "shell.execute_reply": "2020-10-11T22:53:16.294048Z"
236 | }
237 | },
238 | "outputs": [],
239 | "source": [
240 | "print(pca.explained_variance_ratio_)\n",
241 | "\n",
242 | "K = 2\n",
243 | "pca = PCA(n_components=X.shape[1])\n",
244 | "pca.fit(X)\n",
245 | "PC = pca.transform(X)\n",
246 | "#print(PC)"
247 | ]
248 | },
249 | {
250 | "cell_type": "markdown",
251 | "metadata": {},
252 | "source": [
253 |     "Print the $K$ principal component directions and the correlations of the $K$ principal\n",
254 |     "components with the original variables. Interpret the contribution of the original variables\n",
255 |     "to the PCs.\n"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {
262 | "execution": {
263 | "iopub.execute_input": "2020-10-11T22:53:16.297928Z",
264 | "iopub.status.busy": "2020-10-11T22:53:16.297500Z",
265 | "iopub.status.idle": "2020-10-11T22:53:16.302829Z",
266 | "shell.execute_reply": "2020-10-11T22:53:16.302482Z"
267 | }
268 | },
269 | "outputs": [],
270 | "source": [
271 | "print(pca.components_)\n",
272 | "CorPC = pd.DataFrame(\n",
273 | " [[np.corrcoef(X[:, j], PC[:, k])[0, 1] for j in range(X.shape[1])]\n",
274 | " for k in range(K)],\n",
275 | " columns = df.columns[:4],\n",
276 | " index = [\"PC %i\"%k for k in range(K)]\n",
277 | ")\n",
278 | "\n",
279 | "print(CorPC)"
280 | ]
281 | },
282 | {
283 | "cell_type": "markdown",
284 | "metadata": {},
285 | "source": [
286 |     "Plot samples projected onto the first $K$ PCs. Color samples by their species."
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "execution": {
294 | "iopub.execute_input": "2020-10-11T22:53:16.316818Z",
295 | "iopub.status.busy": "2020-10-11T22:53:16.316510Z",
296 | "iopub.status.idle": "2020-10-11T22:53:16.396495Z",
297 | "shell.execute_reply": "2020-10-11T22:53:16.396182Z"
298 | }
299 | },
300 | "outputs": [],
301 | "source": [
302 | "colors = {'setosa':'r', 'versicolor':'g', 'virginica':'blue'}\n",
303 | "print(df[\"species\"].unique())\n",
304 | "#plt.scatter(df['experience'], df['salary'], c=df['education'].apply(lambda x: colors[x]), s=100)\n",
305 | "plt.scatter(PC[:, 0], PC[:, 1], c=df[\"species\"].apply(lambda x: colors[x]))\n",
306 | "plt.xlabel(\"PC1\")\n",
307 | "plt.ylabel(\"PC2\")"
308 | ]
309 | },
310 | {
311 | "cell_type": "markdown",
312 | "metadata": {},
313 | "source": [
314 |     "Pairwise plot"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "execution": {
322 | "iopub.execute_input": "2020-10-11T22:53:16.442119Z",
323 | "iopub.status.busy": "2020-10-11T22:53:16.441495Z",
324 | "iopub.status.idle": "2020-10-11T22:53:23.105722Z",
325 | "shell.execute_reply": "2020-10-11T22:53:23.106018Z"
326 | }
327 | },
328 | "outputs": [],
329 | "source": [
330 | "import seaborn as sns\n",
331 | "\n",
332 | "df[\"PC1\"] = PC[:, 0]\n",
333 | "df[\"PC2\"] = PC[:, 1]\n",
334 | "\n",
335 | "ax = sns.pairplot(df, hue=\"species\")"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": []
344 | }
345 | ],
346 | "metadata": {
347 | "anaconda-cloud": {},
348 | "kernelspec": {
349 | "display_name": "Python 3",
350 | "language": "python",
351 | "name": "python3"
352 | },
353 | "language_info": {
354 | "codemirror_mode": {
355 | "name": "ipython",
356 | "version": 3
357 | },
358 | "file_extension": ".py",
359 | "mimetype": "text/x-python",
360 | "name": "python",
361 | "nbconvert_exporter": "python",
362 | "pygments_lexer": "ipython3",
363 | "version": "3.7.9"
364 | }
365 | },
366 | "nbformat": 4,
367 | "nbformat_minor": 2
368 | }
369 |
--------------------------------------------------------------------------------
/scientific_python/scipy_matplotlib.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Data visualization: matplotlib & seaborn \n",
8 | "\n",
9 | "\n",
10 | "## Basic plots"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "execution": {
18 | "iopub.execute_input": "2020-10-11T22:54:06.283262Z",
19 | "iopub.status.busy": "2020-10-11T22:54:06.281496Z",
20 | "iopub.status.idle": "2020-10-11T22:54:06.619890Z",
21 | "shell.execute_reply": "2020-10-11T22:54:06.619484Z"
22 | }
23 | },
24 | "outputs": [],
25 | "source": [
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "import seaborn as sns\n",
29 | "\n",
30 | "# inline plot (for jupyter)\n",
31 | "%matplotlib inline\n",
32 | "\n",
33 | "plt.figure(figsize=(9, 3))\n",
34 | "x = np.linspace(0, 10, 50)\n",
35 | "sinus = np.sin(x)\n",
36 | "\n",
37 | "plt.plot(x, sinus)\n",
38 | "plt.show()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {
45 | "execution": {
46 | "iopub.execute_input": "2020-10-11T22:54:06.631218Z",
47 | "iopub.status.busy": "2020-10-11T22:54:06.630138Z",
48 | "iopub.status.idle": "2020-10-11T22:54:06.715538Z",
49 | "shell.execute_reply": "2020-10-11T22:54:06.715894Z"
50 | }
51 | },
52 | "outputs": [],
53 | "source": [
54 | "plt.figure(figsize=(9, 3))\n",
55 | "\n",
56 | "plt.plot(x, sinus, \"o\")\n",
57 | "plt.show()\n",
58 |     "# see help(plt.plot) for color / marker abbreviations"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {
65 | "execution": {
66 | "iopub.execute_input": "2020-10-11T22:54:06.728139Z",
67 | "iopub.status.busy": "2020-10-11T22:54:06.727746Z",
68 | "iopub.status.idle": "2020-10-11T22:54:06.834198Z",
69 | "shell.execute_reply": "2020-10-11T22:54:06.833848Z"
70 | }
71 | },
72 | "outputs": [],
73 | "source": [
74 | "# Rapid multiplot\n",
75 | "\n",
76 | "plt.figure(figsize=(9, 3))\n",
77 | "cosinus = np.cos(x)\n",
78 | "plt.plot(x, sinus, \"-b\", x, sinus, \"ob\", x, cosinus, \"-r\", x, cosinus, \"or\")\n",
79 | "plt.xlabel('this is x!')\n",
80 | "plt.ylabel('this is y!')\n",
81 | "plt.title('My First Plot')\n",
82 | "plt.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "execution": {
90 | "iopub.execute_input": "2020-10-11T22:54:06.847651Z",
91 | "iopub.status.busy": "2020-10-11T22:54:06.846622Z",
92 | "iopub.status.idle": "2020-10-11T22:54:06.953662Z",
93 | "shell.execute_reply": "2020-10-11T22:54:06.953293Z"
94 | }
95 | },
96 | "outputs": [],
97 | "source": [
98 | "# Step by step\n",
99 | "\n",
100 | "plt.figure(figsize=(9, 3))\n",
101 | "plt.plot(x, sinus, label='sinus', color='blue', linestyle='--', linewidth=2)\n",
102 | "plt.plot(x, cosinus, label='cosinus', color='red', linestyle='-', linewidth=2)\n",
103 | "plt.legend()\n",
104 | "plt.show()"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Scatter (2D) plots\n",
112 | "\n",
113 | "Load dataset"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "metadata": {
120 | "execution": {
121 | "iopub.execute_input": "2020-10-11T22:54:06.956572Z",
122 | "iopub.status.busy": "2020-10-11T22:54:06.956237Z",
123 | "iopub.status.idle": "2020-10-11T22:54:07.103716Z",
124 | "shell.execute_reply": "2020-10-11T22:54:07.103342Z"
125 | }
126 | },
127 | "outputs": [],
128 | "source": [
129 | "import pandas as pd\n",
130 | "try:\n",
131 | " salary = pd.read_csv(\"../datasets/salary_table.csv\")\n",
132 | "except:\n",
133 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'\n",
134 | " salary = pd.read_csv(url)\n",
135 | "\n",
136 | "df = salary\n",
137 | "print(df.head())"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 | "### Simple scatter with colors"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": null,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "plt.figure(figsize=(3, 3), dpi=100)\n",
154 | "_ = sns.scatterplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "Legend outside"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### Linear model"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "ax = sns.lmplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "### Scatter plot with colors and symbols"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", style='management', data=salary)"
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
209 | "## Saving Figures"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "metadata": {
216 | "execution": {
217 | "iopub.execute_input": "2020-10-11T22:54:07.420427Z",
218 | "iopub.status.busy": "2020-10-11T22:54:07.419445Z",
219 | "iopub.status.idle": "2020-10-11T22:54:07.649956Z",
220 | "shell.execute_reply": "2020-10-11T22:54:07.649633Z"
221 | }
222 | },
223 | "outputs": [],
224 | "source": [
225 | "### bitmap format\n",
226 | "plt.plot(x, sinus)\n",
227 | "plt.savefig(\"sinus.png\")\n",
228 | "plt.close()\n",
229 | "\n",
230 | "# Prefer vectorial format (SVG: Scalable Vector Graphics) can be edited with \n",
231 | "# Inkscape, Adobe Illustrator, Blender, etc.\n",
232 | "plt.plot(x, sinus)\n",
233 | "plt.savefig(\"sinus.svg\")\n",
234 | "plt.close()\n",
235 | "\n",
236 | "# Or pdf\n",
237 | "plt.plot(x, sinus)\n",
238 | "plt.savefig(\"sinus.pdf\")\n",
239 | "plt.close()"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "metadata": {},
245 | "source": [
246 | "### Boxplot and violin plot: one factor\n",
247 | "\n",
248 |     "Box plots are non-parametric: they display variation in samples of a statistical population without making any assumptions of the underlying statistical distribution."
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": null,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "ax = sns.boxplot(x=\"management\", y=\"salary\", data=salary)\n",
260 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"black\")"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {},
267 | "outputs": [],
268 | "source": [
269 | "ax = sns.violinplot(x=\"management\", y=\"salary\", data=salary)\n",
270 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"white\")"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "### Boxplot and violin plot: two factors"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {},
284 | "outputs": [],
285 | "source": [
286 | "ax = sns.boxplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n",
287 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {
294 | "execution": {
295 | "iopub.execute_input": "2020-10-11T22:54:07.652516Z",
296 | "iopub.status.busy": "2020-10-11T22:54:07.652175Z",
297 | "iopub.status.idle": "2020-10-11T22:54:08.055323Z",
298 | "shell.execute_reply": "2020-10-11T22:54:08.054906Z"
299 | }
300 | },
301 | "outputs": [],
302 | "source": [
303 | "ax = sns.violinplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n",
304 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)"
305 | ]
306 | },
307 | {
308 | "cell_type": "markdown",
309 | "metadata": {},
310 | "source": [
311 | "### Distributions and density plot\n",
312 | "\n",
313 | "[Distributions with seaborn](https://seaborn.pydata.org/tutorial/distributions.html)\n"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "ax = sns.displot(x=\"salary\", hue=\"management\", kind=\"kde\", data=salary, fill=True)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "## Multiple axis"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "fig, axes = plt.subplots(3, 1, figsize=(9, 9), sharex=True)\n",
339 | "\n",
340 | "i = 0\n",
341 | "for edu, d in salary.groupby(['education']):\n",
342 | " sns.kdeplot(x=\"salary\", hue=\"management\", data=d, fill=True, ax=axes[i], palette=\"muted\")\n",
343 | " axes[i].set_title(edu)\n",
344 | " i += 1"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "## Pairwise scatter plots"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {},
358 | "outputs": [],
359 | "source": [
360 | "ax = sns.pairplot(salary, hue=\"management\")"
361 | ]
362 | },
363 | {
364 | "cell_type": "markdown",
365 | "metadata": {},
366 | "source": [
367 | "## Time series"
368 | ]
369 | },
370 | {
371 | "cell_type": "code",
372 | "execution_count": null,
373 | "metadata": {
374 | "execution": {
375 | "iopub.execute_input": "2020-10-11T22:54:10.349932Z",
376 | "iopub.status.busy": "2020-10-11T22:54:10.349585Z",
377 | "iopub.status.idle": "2020-10-11T22:54:11.426751Z",
378 | "shell.execute_reply": "2020-10-11T22:54:11.426337Z"
379 | }
380 | },
381 | "outputs": [],
382 | "source": [
383 | "import seaborn as sns\n",
384 | "sns.set(style=\"darkgrid\")\n",
385 | "\n",
386 | "# Load an example dataset with long-form data\n",
387 | "fmri = sns.load_dataset(\"fmri\")\n",
388 | "\n",
389 | "# Plot the responses for different events and regions\n",
390 |     "# pointplot does not accept a style parameter; lineplot does\n",
391 |     "ax = sns.lineplot(x=\"timepoint\", y=\"signal\",\n",
392 |     "                  hue=\"region\", style=\"event\",\n",
393 |     "                  data=fmri)"
393 | ]
394 | }
395 | ],
396 | "metadata": {
397 | "anaconda-cloud": {},
398 | "kernelspec": {
399 | "display_name": "Python 3",
400 | "language": "python",
401 | "name": "python3"
402 | },
403 | "language_info": {
404 | "codemirror_mode": {
405 | "name": "ipython",
406 | "version": 3
407 | },
408 | "file_extension": ".py",
409 | "mimetype": "text/x-python",
410 | "name": "python",
411 | "nbconvert_exporter": "python",
412 | "pygments_lexer": "ipython3",
413 | "version": "3.7.9"
414 | }
415 | },
416 | "nbformat": 4,
417 | "nbformat_minor": 2
418 | }
419 |
--------------------------------------------------------------------------------
/utils/time_series.py:
--------------------------------------------------------------------------------
1 | '''
2 | # Time Series in python
3 |
4 | Two libraries:
5 |
6 | - Pandas: https://pandas.pydata.org/pandas-docs/stable/timeseries.html
7 | - statsmodels: http://www.statsmodels.org/devel/tsa.html
8 | '''
9 |
10 | '''
11 | ## Stationarity
12 |
13 | A TS is said to be stationary if its statistical properties such as mean, variance remain constant over time.
14 |
15 | - constant mean
16 | - constant variance
17 | - an autocovariance that does not depend on time.
18 |
19 | What makes a TS non-stationary? There are two major reasons behind non-stationarity of a TS:
20 | 
21 | 1. Trend – varying mean over time. For example, the number of airline passengers typically grows over time on average.
22 | 
23 | 2. Seasonality – variations at specific time-frames, e.g., people might have a tendency to buy cars in a particular month because of pay increments or festivals.
24 | 
25 | A quick way to check stationarity, the augmented Dickey-Fuller test, is sketched below.
24 | '''
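
'''
A quick sketch of the stationarity check mentioned above (not in the original script):
the augmented Dickey-Fuller (ADF) test from statsmodels, whose null hypothesis is that
the series has a unit root (i.e., is non-stationary). Toy data are used here.
'''

import numpy as np
from statsmodels.tsa.stattools import adfuller

rng = np.random.RandomState(42)
white_noise = rng.normal(size=200)     # stationary by construction
random_walk = np.cumsum(white_noise)   # non-stationary (unit root)

for name, series in [("white noise", white_noise), ("random walk", random_walk)]:
    stat, pvalue = adfuller(series)[:2]
    print("%s: ADF statistic=%.2f, p-value=%.3f" % (name, stat, pvalue))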
25 |
26 | '''
27 | ## Pandas Time Series Data Structure
28 |
29 | A Series is similar to a list or an array in Python.
30 | It represents a series of values (numeric or otherwise) such as a column of data.
31 | It provides additional functionality, methods, and operators, which make it a more powerful version of a list.
32 | '''
33 |
34 | import pandas as pd
35 | import numpy as np
36 |
37 | # Create a Series from a list
38 | ser = pd.Series([1, 3])
39 | print(ser)
40 |
41 | # String as index
42 | prices = {'apple': 4.99,
43 | 'banana': 1.99,
44 | 'orange': 3.99}
45 | ser = pd.Series(prices)
46 | print(ser)
47 |
48 | x = pd.Series(np.arange(1,3), index=[x for x in 'ab'])
49 | print(x)
50 | print(x['b'])
51 |
52 | '''
53 | ## Time Series Analysis of Google Trends
54 |
55 | source: https://www.datacamp.com/community/tutorials/time-series-analysis-tutorial
56 |
57 | Get Google Trends data of keywords such as 'diet' and 'gym' and see how they vary over time while learning about trends and seasonality in time series data.
58 |
59 | In the Facebook Live code along session on the 4th of January, we checked out Google trends data of keywords 'diet', 'gym' and 'finance' to see how they vary over time. We asked ourselves if there could be more searches for these terms in January when we're all trying to turn over a new leaf?
60 |
61 | In this tutorial, you'll go through the code that we put together during the session step by step. You're not going to do much mathematics but you are going to do the following:
62 |
63 | - Read data
64 | - Recode data
65 | - Exploratory Data Analysis
66 |
67 | '''
68 |
69 |
70 | '''
71 | ## Read data
72 | '''
73 |
74 | import numpy as np
75 | import pandas as pd
76 | import matplotlib.pyplot as plt
77 | import seaborn as sns
78 |
79 | # Plot appears on its own windows
80 | # %matplotlib qt  (IPython magic; only valid in an IPython/Jupyter session)
81 | # Tools / Preferences / Ipython Console / Graphics / Graphics Backend / Backend: “automatic”
82 | # Interactive Matplotlib Jupyter Notebook
83 | # %matplotlib inline
84 |
85 | try:
86 | url = "https://raw.githubusercontent.com/datacamp/datacamp_facebook_live_ny_resolution/master/data/multiTimeline.csv"
87 | df = pd.read_csv(url, skiprows=2)
88 | except:
89 | df = pd.read_csv("../data/multiTimeline.csv", skiprows=2)
90 |
91 | print(df.head())
92 |
93 | # Rename columns
94 | df.columns = ['month', 'diet', 'gym', 'finance']
95 |
96 | # Describe
97 | print(df.describe())
98 |
99 | '''
100 | ## Recode data
101 |
102 | Next, you'll turn the 'month' column into a DateTime data type and make it the index of the DataFrame.
103 |
104 | Note that you do this because you saw in the result of the .info() method that the 'Month' column was actually of data type object. That generic data type encapsulates everything from strings to integers, etc., which is not what you want when working with time series data. That's why you'll use .to_datetime() to convert the 'month' column in your DataFrame to a DateTime.
105 |
106 | Be careful! Make sure to include the inplace argument when you're setting the index of the DataFrame df so that you actually alter the original index and set it to the 'month' column.
107 | '''
108 | df.month = pd.to_datetime(df.month)
109 | df.set_index('month', inplace=True)
110 |
111 | print(df.head())
112 |
113 | '''
114 | ## Exploratory Data Analysis
115 |
116 | You can use a built-in pandas visualization method .plot() to plot your
117 | data as 3 line plots on a single
118 | figure (one for each column, namely, 'diet', 'gym', and 'finance').
119 | '''
120 | df.plot()
121 | plt.xlabel('Year');
122 |
123 | # change figure parameters
124 | # df.plot(figsize=(20,10), linewidth=5, fontsize=20)
125 |
126 | # Plot single column
127 | # df[['diet']].plot(figsize=(20,10), linewidth=5, fontsize=20)
128 | # plt.xlabel('Year', fontsize=20);
129 |
130 | '''
131 | Note that this data is relative. As you can read on Google trends:
132 |
133 | Numbers represent search interest relative to the highest point on the chart
134 | for the given region and time.
135 | A value of 100 is the peak popularity for the term.
136 | A value of 50 means that the term is half as popular.
137 | Likewise a score of 0 means the term was less than 1% as popular as the peak.
138 |
139 | '''
140 |
141 |
142 | '''
143 | ## Resampling, Smoothing, Windowing, Rolling average: Trends
144 |
145 | Rolling average: for each time point, take the average of the points on either side of it.
146 | Note that the number of points is specified by a window size.
147 |
148 | Remove Seasonality with pandas Series.
149 |
150 | See: http://pandas.pydata.org/pandas-docs/stable/timeseries.html
151 | 'A': year-end (annual) frequency
152 | '''
153 | diet = df['diet']
154 |
155 | diet_resamp_yr = diet.resample('A').mean()
156 | diet_roll_yr = diet.rolling(12).mean()
157 |
158 | ax = diet.plot(alpha=0.5, style='-') # store axis (ax) for later plots
159 | diet_resamp_yr.plot(style=':', label='Resample at year frequency', ax=ax)
160 | diet_roll_yr.plot(style='--', label='Rolling average (smooth), window size=12', ax=ax)
161 | ax.legend()
162 |
163 |
164 | '''
165 | Rolling average (smoothing) with Numpy
166 | '''
167 |
168 | x = np.asarray(df[['diet']])
169 | win = 12
170 | win_half = int(win / 2)
171 | # print([((idx-win_half), (idx+win_half)) for idx in np.arange(win_half, len(x))])
172 |
173 | diet_smooth = np.array([x[(idx-win_half):(idx+win_half)].mean() for idx in np.arange(win_half, len(x))])
174 | plt.plot(diet_smooth)
175 |
176 | '''
177 | Trends Plot Diet and Gym
178 |
179 | Build a new DataFrame which is the concatenation of the smoothed diet and gym data
180 | '''
181 | gym = df['gym']
182 |
183 | df_avg = pd.concat([diet.rolling(12).mean(), gym.rolling(12).mean()], axis=1)
184 | df_avg.plot()
185 | plt.xlabel('Year')
186 |
187 | '''
188 | Detrending
189 | '''
190 |
191 | df_dtrend = df[["diet", "gym"]] - df_avg
192 | df_dtrend.plot()
193 | plt.xlabel('Year')
194 |
195 | '''
196 | ## First-order differencing: Seasonal Patterns
197 |
198 | '''
199 |
200 | # diff = original - shifted data
201 | # (exclude first term for some implementation details)
202 | assert np.all((diet.diff() == diet - diet.shift())[1:])
203 |
204 | df.diff().plot()
205 | plt.xlabel('Year')
206 |
207 | '''
208 | ## Periodicity and Correlation
209 | '''
210 |
211 | df.plot()
212 | plt.xlabel('Year');
213 | print(df.corr())
214 |
215 | '''
216 | Plot correlation matrix
217 | '''
218 |
219 | sns.heatmap(df.corr(), cmap="coolwarm")
220 |
221 |
222 | '''
223 | 'diet' and 'gym' are negatively correlated!
224 | Remember that you have a seasonal and a trend component.
225 | From the correlation coefficient, 'diet' and 'gym' are negatively correlated:
226 |
227 | - trend components are negatively correlated.
228 | - seasonal components would be positively correlated.
229 | 
230 | The actual correlation coefficient captures both of these effects.
231 |
232 | Seasonal correlation: correlation of the first-order differences of these time series
233 | '''
234 |
235 | df.diff().plot()
236 | plt.xlabel('Year');
237 |
238 | print(df.diff().corr())
239 |
240 | '''
241 | Plot correlation matrix
242 | '''
243 |
244 | sns.heatmap(df.diff().corr(), cmap="coolwarm")
245 |
246 | '''
247 | Decomposing a time series into trend, seasonality and residuals
248 | '''
249 |
250 | from statsmodels.tsa.seasonal import seasonal_decompose
251 |
252 | x = gym
253 |
254 | x = x.astype(float) # force float
255 | decomposition = seasonal_decompose(x)
256 | trend = decomposition.trend
257 | seasonal = decomposition.seasonal
258 | residual = decomposition.resid
259 |
260 | plt.subplot(411)
261 | plt.plot(x, label='Original')
262 | plt.legend(loc='best')
263 | plt.subplot(412)
264 | plt.plot(trend, label='Trend')
265 | plt.legend(loc='best')
266 | plt.subplot(413)
267 | plt.plot(seasonal,label='Seasonality')
268 | plt.legend(loc='best')
269 | plt.subplot(414)
270 | plt.plot(residual, label='Residuals')
271 | plt.legend(loc='best')
272 | plt.tight_layout()
273 |
274 |
275 | '''
276 | ## Autocorrelation
277 |
278 | A time series is periodic if it repeats itself at equally spaced intervals, say, every 12 months.
279 | Autocorrelation Function (ACF): a measure of the correlation between the TS and a
280 | lagged version of itself. For instance, at lag 5 the ACF would compare the series at time instants t1...t2
281 | with the series at instants t1-5...t2-5 (t1-5 and t2-5 being the end points).
282 |
283 | Plot
284 | '''
285 | from pandas.plotting import autocorrelation_plot
286 | # from pandas.tools.plotting import autocorrelation_plot  # older pandas (< 0.25)
287 |
288 | x = df["diet"].astype(float)
289 | autocorrelation_plot(x)
290 |
291 | '''
292 | Compute Autocorrelation Function (ACF)
293 | '''
294 |
295 | from statsmodels.tsa.stattools import acf
296 |
297 | x_diff = x.diff().dropna() # first item is NA
298 | lag_acf = acf(x_diff, nlags=36)
299 | plt.plot(lag_acf)
300 | plt.title('Autocorrelation Function')
301 |
302 | '''
303 | ACF peaks every 12 months: Time series is correlated with itself shifted by 12 months.
304 | '''
305 |
306 | '''
307 | ## Time Series Forecasting with Python using Autoregressive Moving Average (ARMA) models
308 |
309 | Source:
310 |
311 | - https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783553358/7/ch07lvl1sec77/arma-models
312 |
313 | - http://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model
314 |
315 | - ARIMA: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
316 |
317 | ARMA models are often used to forecast a time series.
318 | These models combine autoregressive and moving average models.
319 | In moving average models, we assume that a variable is the sum of the mean of the
320 | time series and a linear combination of noise components.
321 |
322 | The autoregressive and moving average models can have different orders. In general, we can define an ARMA model with p autoregressive terms and q moving average terms as follows:
323 |
324 | $$
325 | x_t = \sum_{i=1}^p a_i x_{t-i} + \sum_{i=1}^q b_i \varepsilon_{t-i} + \varepsilon_t
326 | $$
327 | '''
328 |
329 | '''
330 | ### Choosing p and q
331 |
332 | Plot the partial autocorrelation functions for an estimate of p, and likewise using the autocorrelation functions for an estimate of q.
333 |
334 | Partial Autocorrelation Function (PACF): This measures the correlation between the TS with a lagged version of itself but after eliminating the variations already explained by the intervening comparisons. Eg at lag 5, it will check the correlation but remove the effects already explained by lags 1 to 4.
335 | '''
336 | from statsmodels.tsa.stattools import acf, pacf
337 |
338 | x = df["gym"].astype(float)
339 |
340 | x_diff = x.diff().dropna() # first item is NA
341 | # ACF and PACF plots:
342 |
343 | lag_acf = acf(x_diff, nlags=20)
344 | lag_pacf = pacf(x_diff, nlags=20, method='ols')
345 |
346 | #Plot ACF:
347 | plt.subplot(121)
348 | plt.plot(lag_acf)
349 | plt.axhline(y=0,linestyle='--',color='gray')
350 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
351 | plt.axhline(y=1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
352 | plt.title('Autocorrelation Function (q=1)')
353 |
354 | #Plot PACF:
355 | plt.subplot(122)
356 | plt.plot(lag_pacf)
357 | plt.axhline(y=0,linestyle='--',color='gray')
358 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
359 | plt.axhline(y=1.96/np.sqrt(len(x_diff)),linestyle='--',color='gray')
360 | plt.title('Partial Autocorrelation Function (p=1)')
361 | plt.tight_layout()
362 |
363 | '''
364 | In this plot, the two dotted lines on either side of 0 are the confidence intervals.
365 | These can be used to determine the p and q values as follows:
366 |
367 | - p: The lag value where the PACF chart crosses the upper confidence interval for the first time, in this case p=1.
368 |
369 | - q: The lag value where the ACF chart crosses the upper confidence interval for the first time, in this case q=1.
370 | '''
371 |
372 | '''
373 | ### Fit ARMA model with statsmodels
374 |
375 | 1. Define the model by calling `ARMA()` and passing in the p and q parameters.
376 |
377 | 2. The model is prepared on the training data by calling the `fit()` function.
378 |
379 | 3. Predictions can be made by calling the `predict()` function and specifying the index of the time or times to be predicted.
380 | '''
381 |
382 | from statsmodels.tsa.arima_model import ARMA  # removed in statsmodels >= 0.13; use statsmodels.tsa.arima.model.ARIMA(x, order=(p, 0, q)) instead
383 |
384 |
385 | model = ARMA(x, order=(1,1)).fit() # fit model
386 |
387 | print(model.summary())
388 | plt.plot(x)
389 | plt.plot(model.predict(), color='red')
390 | plt.title('RSS: %.4f'% sum((model.fittedvalues-x)**2))
--------------------------------------------------------------------------------