├── introduction ├── images ├── machine_learning.rst └── python_ecosystem.rst ├── lib └── pystatsml │ ├── __init__.py │ └── plot_utils.py ├── statistics ├── images ├── pystatsml ├── README.txt └── stat_multiv_solutions.py ├── machine_learning ├── images ├── pystatsml ├── README.txt ├── img_sources │ ├── bias_variance.png │ ├── ada_boost_steps.png │ ├── bagging_overview.png │ ├── bootstrap_overview.png │ ├── bagging_architecture.png │ ├── boosting_architecture.png │ ├── stacking_architecture.png │ ├── boost_algo_weighted_sum.png │ ├── gradient_boosting_steps.png │ ├── bagging_model_aggregation.png │ ├── gradient_desceent_boosting.png │ ├── multi_stacking_architecture.png │ ├── step_size_gradient_boosting.png │ ├── loss_l_step_adaptative_boosting.png │ ├── usage_bootstrapping_in_variance.png │ └── architecture_adaptative_boosting.png ├── examples │ └── boot_clustering.py ├── resampling_solution.py ├── manifold_solutions.ipynb ├── ml_supervized_nonlinear.py └── decomposition_solutions.ipynb ├── scientific_python ├── images ├── README.txt ├── scipy_numpy_solutions.py ├── scipy_pandas_solutions.py └── scipy_matplotlib.ipynb ├── images ├── svm.png ├── boxplot.png ├── linear.png ├── model_lm.png ├── Dot_Product.png ├── data_science.png ├── numpy_array3d.png ├── random_forest.png ├── linear_logistic.png ├── svd_mixing_dict.png ├── trees_elements.odg ├── fisher_linear_disc.png ├── gradient_boosting.png ├── linear_regression.png ├── machine_learning.png ├── model_two-sample.png ├── numpy_broadcasting.png ├── python_ecosystem.odg ├── python_ecosystem.pdf ├── python_ecosystem.png ├── train_val_test_cv.odg ├── train_val_test_cv.png ├── two_samples_ttest.png ├── classification_tree.png ├── shrinkage │ ├── l1_sparse.png │ ├── ols_l1_l2.png │ └── ols_multicollinearity.png ├── stat_tests_flowchart.png ├── linear_regression_plan.png ├── vc_dimension_linear_2d.png ├── ridge_fisher_linear_disc.png ├── Coefficient_of_Determination.png ├── linear_regression_penalties.png └── svm_rbf_kernel_mapping_and_decision_function.png ├── labs └── README.txt ├── python_lang ├── README.txt ├── scripts │ ├── count_words.py │ └── replace.py └── python_lang_solutions.py ├── deep_learning ├── figures │ ├── vgg.png │ ├── alexnet.png │ ├── dropout.png │ ├── logistic.png │ ├── resnet18.png │ ├── resnet_vgg.png │ ├── nn_two_layers.png │ ├── vgg_param_tab.png │ ├── inception_block.png │ ├── alexnet_param_tab.png │ ├── resnet_param_tab.png │ ├── LeNet_Original_Image.jpg │ ├── logistic_multinominal.png │ ├── resnets_modelvariants.png │ └── logistic_multinominal_MNIST.png ├── train_val_model.py └── README.md ├── optimization └── images │ ├── nestrov.PNG │ ├── sgd_momentum.png │ ├── SGD_fluctuation.PNG │ ├── learning_rate_choice.png │ ├── grad_descent_momentum.png │ ├── gradient_descent_goals.png │ ├── grad_descent_no_momentum.png │ └── gradient_descent_intuition.png ├── datasets ├── brain_volumes │ ├── brain_volumes.xlsx │ ├── csf.csv │ ├── demo.csv │ ├── gm.csv │ ├── wm.csv │ └── brain_volumes.csv ├── default of credit card clients.xls ├── iris.csv ├── birthwt.csv ├── eurodist.csv ├── s_curve.csv ├── Advertising.csv ├── multiTimeline.csv ├── salary_table.csv ├── brain_anat_ixi │ ├── train_rois.csv │ ├── train_participants.csv │ ├── validation_rois.csv │ └── validation_participants.csv ├── readme.rst └── birthwt.txt ├── utils ├── README.txt ├── ml_sklearn_pipelines.py ├── plot_ml_linear_regression_multicolinearity.py ├── datasets.py ├── mahalanobis.py ├── ml_resampling.py ├── plot_ml_linear_regression_overfitting.py ├── 
plot_ml_linear_classification_overfitting.py ├── stat_univar_statmodels.py ├── ml_non_linear_prediction.py ├── ml_processing_pipelines.py └── time_series.py ├── .gitattributes ├── LICENSE ├── AUTHORS.rst ├── .gitignore ├── tests ├── test_build.py └── test_notebook.ipynb ├── .circleci └── config.yml ├── COPYING ├── R ├── tools_R_exo.R ├── stat_multiv_exo.R └── ml_dimensionality_reduction_exo.R ├── bin ├── conv_python_to_rst.py └── filter_fix_rst.py ├── index.rst ├── README.md ├── Makefile ├── info.rst └── conf.py /introduction/images: -------------------------------------------------------------------------------- 1 | ../images -------------------------------------------------------------------------------- /lib/pystatsml/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /statistics/images: -------------------------------------------------------------------------------- 1 | ../images -------------------------------------------------------------------------------- /machine_learning/images: -------------------------------------------------------------------------------- 1 | ../images -------------------------------------------------------------------------------- /scientific_python/images: -------------------------------------------------------------------------------- 1 | ../images -------------------------------------------------------------------------------- /statistics/pystatsml: -------------------------------------------------------------------------------- 1 | ../lib/pystatsml -------------------------------------------------------------------------------- /machine_learning/pystatsml: -------------------------------------------------------------------------------- 1 | ../lib/pystatsml -------------------------------------------------------------------------------- /images/svm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/svm.png -------------------------------------------------------------------------------- /images/boxplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/boxplot.png -------------------------------------------------------------------------------- /images/linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/linear.png -------------------------------------------------------------------------------- /images/model_lm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/model_lm.png -------------------------------------------------------------------------------- /images/Dot_Product.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/Dot_Product.png -------------------------------------------------------------------------------- /images/data_science.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/data_science.png -------------------------------------------------------------------------------- /images/numpy_array3d.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/numpy_array3d.png -------------------------------------------------------------------------------- /images/random_forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/random_forest.png -------------------------------------------------------------------------------- /labs/README.txt: -------------------------------------------------------------------------------- 1 | Labs 2 | ==== 3 | 4 | - Supervized classification: face recognition 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /python_lang/README.txt: -------------------------------------------------------------------------------- 1 | Scientific Python 2 | ================= 3 | 4 | - Python language 5 | 6 | 7 | -------------------------------------------------------------------------------- /images/linear_logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/linear_logistic.png -------------------------------------------------------------------------------- /images/svd_mixing_dict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/svd_mixing_dict.png -------------------------------------------------------------------------------- /images/trees_elements.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/trees_elements.odg -------------------------------------------------------------------------------- /deep_learning/figures/vgg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/vgg.png -------------------------------------------------------------------------------- /images/fisher_linear_disc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/fisher_linear_disc.png -------------------------------------------------------------------------------- /images/gradient_boosting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/gradient_boosting.png -------------------------------------------------------------------------------- /images/linear_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/linear_regression.png -------------------------------------------------------------------------------- /images/machine_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/machine_learning.png -------------------------------------------------------------------------------- /images/model_two-sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/model_two-sample.png 
-------------------------------------------------------------------------------- /images/numpy_broadcasting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/numpy_broadcasting.png -------------------------------------------------------------------------------- /images/python_ecosystem.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/python_ecosystem.odg -------------------------------------------------------------------------------- /images/python_ecosystem.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/python_ecosystem.pdf -------------------------------------------------------------------------------- /images/python_ecosystem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/python_ecosystem.png -------------------------------------------------------------------------------- /images/train_val_test_cv.odg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/train_val_test_cv.odg -------------------------------------------------------------------------------- /images/train_val_test_cv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/train_val_test_cv.png -------------------------------------------------------------------------------- /images/two_samples_ttest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/two_samples_ttest.png -------------------------------------------------------------------------------- /scientific_python/README.txt: -------------------------------------------------------------------------------- 1 | Data Manipulaiion 2 | ================= 3 | 4 | - Numpy 5 | - Pandas 6 | 7 | 8 | -------------------------------------------------------------------------------- /images/classification_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/classification_tree.png -------------------------------------------------------------------------------- /images/shrinkage/l1_sparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/shrinkage/l1_sparse.png -------------------------------------------------------------------------------- /images/shrinkage/ols_l1_l2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/shrinkage/ols_l1_l2.png -------------------------------------------------------------------------------- /images/stat_tests_flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/stat_tests_flowchart.png -------------------------------------------------------------------------------- /optimization/images/nestrov.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/nestrov.PNG -------------------------------------------------------------------------------- /deep_learning/figures/alexnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/alexnet.png -------------------------------------------------------------------------------- /deep_learning/figures/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/dropout.png -------------------------------------------------------------------------------- /deep_learning/figures/logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/logistic.png -------------------------------------------------------------------------------- /deep_learning/figures/resnet18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/resnet18.png -------------------------------------------------------------------------------- /images/linear_regression_plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/linear_regression_plan.png -------------------------------------------------------------------------------- /images/vc_dimension_linear_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/vc_dimension_linear_2d.png -------------------------------------------------------------------------------- /machine_learning/README.txt: -------------------------------------------------------------------------------- 1 | Machine Learning 2 | ==== 3 | 4 | - Supervized non-linear classification 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /deep_learning/figures/resnet_vgg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/resnet_vgg.png -------------------------------------------------------------------------------- /images/ridge_fisher_linear_disc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/ridge_fisher_linear_disc.png -------------------------------------------------------------------------------- /optimization/images/sgd_momentum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/sgd_momentum.png -------------------------------------------------------------------------------- /deep_learning/figures/nn_two_layers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/nn_two_layers.png -------------------------------------------------------------------------------- /deep_learning/figures/vgg_param_tab.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/vgg_param_tab.png -------------------------------------------------------------------------------- /images/Coefficient_of_Determination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/Coefficient_of_Determination.png -------------------------------------------------------------------------------- /images/linear_regression_penalties.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/linear_regression_penalties.png -------------------------------------------------------------------------------- /optimization/images/SGD_fluctuation.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/SGD_fluctuation.PNG -------------------------------------------------------------------------------- /datasets/brain_volumes/brain_volumes.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/datasets/brain_volumes/brain_volumes.xlsx -------------------------------------------------------------------------------- /deep_learning/figures/inception_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/inception_block.png -------------------------------------------------------------------------------- /datasets/default of credit card clients.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/datasets/default of credit card clients.xls -------------------------------------------------------------------------------- /deep_learning/figures/alexnet_param_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/alexnet_param_tab.png -------------------------------------------------------------------------------- /deep_learning/figures/resnet_param_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/resnet_param_tab.png -------------------------------------------------------------------------------- /images/shrinkage/ols_multicollinearity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/shrinkage/ols_multicollinearity.png -------------------------------------------------------------------------------- /optimization/images/learning_rate_choice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/learning_rate_choice.png -------------------------------------------------------------------------------- /deep_learning/figures/LeNet_Original_Image.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/LeNet_Original_Image.jpg -------------------------------------------------------------------------------- /machine_learning/img_sources/bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/bias_variance.png -------------------------------------------------------------------------------- /optimization/images/grad_descent_momentum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/grad_descent_momentum.png -------------------------------------------------------------------------------- /optimization/images/gradient_descent_goals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/gradient_descent_goals.png -------------------------------------------------------------------------------- /utils/README.txt: -------------------------------------------------------------------------------- 1 | Miscellaneous python scripts used in the document. 2 | Files prefixed by `plot_` are actually used to generate some figures. 3 | -------------------------------------------------------------------------------- /deep_learning/figures/logistic_multinominal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/logistic_multinominal.png -------------------------------------------------------------------------------- /deep_learning/figures/resnets_modelvariants.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/resnets_modelvariants.png -------------------------------------------------------------------------------- /machine_learning/img_sources/ada_boost_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/ada_boost_steps.png -------------------------------------------------------------------------------- /machine_learning/img_sources/bagging_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/bagging_overview.png -------------------------------------------------------------------------------- /optimization/images/grad_descent_no_momentum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/grad_descent_no_momentum.png -------------------------------------------------------------------------------- /datasets/iris.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9cc1c345c71bcc9b486b74cbf6063fa66f4bb5e0f603a4b3c3471ec2e5e8e355 3 | size 3858 4 | -------------------------------------------------------------------------------- /machine_learning/img_sources/bootstrap_overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/bootstrap_overview.png -------------------------------------------------------------------------------- /optimization/images/gradient_descent_intuition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/optimization/images/gradient_descent_intuition.png -------------------------------------------------------------------------------- /datasets/birthwt.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:50102db24a1fc05d351f83b341d71fa6b5847493bbb1a709811f9910a38f62ca 3 | size 4955 4 | -------------------------------------------------------------------------------- /datasets/eurodist.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d42d9cc12cee29073581135909fbc0f058209b2b989c1be4c7ffa2d26ef35154 3 | size 2443 4 | -------------------------------------------------------------------------------- /datasets/s_curve.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:28efe138024294daa58a8d76ab08d9065bad7072502e703474981a3560ab9c12 3 | size 77352 4 | -------------------------------------------------------------------------------- /deep_learning/figures/logistic_multinominal_MNIST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/deep_learning/figures/logistic_multinominal_MNIST.png -------------------------------------------------------------------------------- /machine_learning/img_sources/bagging_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/bagging_architecture.png -------------------------------------------------------------------------------- /machine_learning/img_sources/boosting_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/boosting_architecture.png -------------------------------------------------------------------------------- /machine_learning/img_sources/stacking_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/stacking_architecture.png -------------------------------------------------------------------------------- /utils/ml_sklearn_pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 5 15:52:46 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | 8 | -------------------------------------------------------------------------------- /datasets/Advertising.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69104adc017e75d7019f61fe66ca2eb4ab014ee6f2a9b39b452943f209352010 3 | size 5166 4 | -------------------------------------------------------------------------------- /datasets/multiTimeline.csv: 
-------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c4c9d7976a39894d3d5245fa8a52dfbd87e37453fdfb0bf9e4e4ef765014139f 3 | size 2945 4 | -------------------------------------------------------------------------------- /datasets/salary_table.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f7aa35faea436cbdc5e74be0916067f864e2695530bb1fe2e9ede06ad1425886 3 | size 840 4 | -------------------------------------------------------------------------------- /images/svm_rbf_kernel_mapping_and_decision_function.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/images/svm_rbf_kernel_mapping_and_decision_function.png -------------------------------------------------------------------------------- /machine_learning/img_sources/boost_algo_weighted_sum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/boost_algo_weighted_sum.png -------------------------------------------------------------------------------- /machine_learning/img_sources/gradient_boosting_steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/gradient_boosting_steps.png -------------------------------------------------------------------------------- /datasets/brain_volumes/csf.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f592eb619fb1b42d99a6db79864833b157022cdbd7e5aaee5369f6100954da16 3 | size 30941 4 | -------------------------------------------------------------------------------- /datasets/brain_volumes/demo.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1b8a87f396dde4b3db728dd9761dfa3cc86a60ad2b6e142acf47c0003ae64876 3 | size 9130 4 | -------------------------------------------------------------------------------- /datasets/brain_volumes/gm.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:75e02abc0f9d13a24b2b476056c73db6d006462cdec7b8d9617c5248992a6d89 3 | size 30691 4 | -------------------------------------------------------------------------------- /datasets/brain_volumes/wm.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1367e63d1d4973d45ad69937538010afc4299dbc95cf6ff434c31438415d81d8 3 | size 30786 4 | -------------------------------------------------------------------------------- /machine_learning/img_sources/bagging_model_aggregation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/bagging_model_aggregation.png -------------------------------------------------------------------------------- /machine_learning/img_sources/gradient_desceent_boosting.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/gradient_desceent_boosting.png -------------------------------------------------------------------------------- /machine_learning/img_sources/multi_stacking_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/multi_stacking_architecture.png -------------------------------------------------------------------------------- /machine_learning/img_sources/step_size_gradient_boosting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/step_size_gradient_boosting.png -------------------------------------------------------------------------------- /datasets/brain_anat_ixi/train_rois.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cfebdcbe5bd954c5eefd04ec477c16e1cad6c35e7408284c2ac3d39275407848 3 | size 1820688 4 | -------------------------------------------------------------------------------- /datasets/brain_volumes/brain_volumes.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5f816d737e16362a3692a628813432cdead0fdede0e9e23e70ebdda18bc6db45 3 | size 75687 4 | -------------------------------------------------------------------------------- /machine_learning/img_sources/loss_l_step_adaptative_boosting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/loss_l_step_adaptative_boosting.png -------------------------------------------------------------------------------- /machine_learning/img_sources/usage_bootstrapping_in_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/usage_bootstrapping_in_variance.png -------------------------------------------------------------------------------- /datasets/brain_anat_ixi/train_participants.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:87331681947797e0e0ed5ec1cb14747571b0f39bfca47c833576740a7bd8ce4b 3 | size 20036 4 | -------------------------------------------------------------------------------- /datasets/brain_anat_ixi/validation_rois.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:999c29e6203744be64654b0be26988b6653023104a0c451f484fca5fcfc8c4f3 3 | size 462442 4 | -------------------------------------------------------------------------------- /datasets/readme.rst: -------------------------------------------------------------------------------- 1 | default of credit card clients Data Set 2 | ======================================= 3 | 4 | http://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients 5 | -------------------------------------------------------------------------------- /machine_learning/img_sources/architecture_adaptative_boosting.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/neurospin/pystatsml/HEAD/machine_learning/img_sources/architecture_adaptative_boosting.png -------------------------------------------------------------------------------- /datasets/brain_anat_ixi/validation_participants.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e60bc182bf37af64e9e406de684ad587b49c8120408ccab16d63a538c05db4ad 3 | size 5099 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.npz filter=lfs diff=lfs merge=lfs -text 2 | *.npy filter=lfs diff=lfs merge=lfs -text 3 | *.nii filter=lfs diff=lfs merge=lfs -text 4 | *.nii.gz filter=lfs diff=lfs merge=lfs -text 5 | *.csv filter=lfs diff=lfs merge=lfs -text 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | License 2 | 3 | All code and material is licensed under a 4 | 5 | Creative Commons Attribution 4.0 International License (CC-by) 6 | 7 | http://creativecommons.org/licenses/by/4.0/ 8 | 9 | See the AUTHORS.rst file for a list of contributors. 10 | 11 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | Authors 2 | ======== 3 | 4 | Editors 5 | -------- 6 | 7 | - Edouard Duchesnay (edouard.duchesnay@gmail.com) 8 | 9 | Chapter authors 10 | ---------------- 11 | 12 | Listed by alphabetical order. 13 | 14 | - Younes Feki (younesfkih@gmail.com) 15 | 16 | - Tommy Löfstedt (lofstedt.tommy@gmail.com) 17 | 18 | 19 | -------------------------------------------------------------------------------- /statistics/README.txt: -------------------------------------------------------------------------------- 1 | Univariate statistics 2 | ===================== 3 | 4 | - Estimators of the main statistical measuresx 5 | - Main distributions 6 | - Hypothesis Testing 7 | - Testing pairwise associations 8 | - Non-parametric test of pairwise associations 9 | - Linear model 10 | - Linear model with statsmodels 11 | - Multiple comparisons 12 | - Labs 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | .nfs* 4 | ~$* 5 | *.Rhistory 6 | *.aux 7 | *.bbl 8 | *.blg 9 | *.log 10 | .*.swp 11 | .ipynb_checkpoints 12 | notebooks/*.rst 13 | python/*.rst 14 | .spyderproject 15 | build/ 16 | auto_gallery/ 17 | .spyproject 18 | statistics/*.rst 19 | machine_learning/*.rst 20 | deep_learning/*.rst 21 | scientific_python/*.rst 22 | machine_learning/*_files 23 | deep_learning/*_files 24 | scientific_python/*files 25 | statistics/*_files 26 | -------------------------------------------------------------------------------- /tests/test_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Apr 17 13:52:03 2019 5 | 6 | @author: edouard 7 | """ 8 | 9 | ## TODO rewrite test function 10 | """ 11 | Manual check, run command line 12 | 13 | nb=tests/test_notebook.ipynb 14 | rst=tests/test_notebook.rst 15 | 16 | # Run notebook 17 | jupyter nbconvert --to notebook --execute $nb --output 
$(basename $nb) 18 | 19 | # Convert to rst 20 | jupyter nbconvert --to rst --stdout $nb 21 | jupyter nbconvert --to rst --stdout $nb | bin/filter_fix_rst.py > $rst 22 | """ -------------------------------------------------------------------------------- /scientific_python/scipy_numpy_solutions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 29 16:54:32 2016 4 | 5 | @author: ed203246 6 | """ 7 | 8 | import numpy as np 9 | X = np.random.randn(4, 2) 10 | print(X) 11 | 12 | ''' 13 | - For each column find the row indices of the minimum value. 14 | ''' 15 | [np.argmin(X[:, j]) 16 | for j in range(X.shape[1])] 17 | 18 | np.argmin(X, axis=0) 19 | 20 | ''' 21 | - Write a function ``scale(X)`` that returns an array whose columns are centered and scaled (by std-dev). 22 | ''' 23 | 24 | def scale(X): 25 | return (X - X.mean(axis=0)) / X.std(axis=0) 26 | 27 | X = np.random.randn(5, 3) 28 | Xs = scale(X) 29 | 30 | Xs.mean(axis=0) 31 | Xs.std(axis=0) 32 | -------------------------------------------------------------------------------- /datasets/birthwt.txt: -------------------------------------------------------------------------------- 1 | Risk Factors Associated with Low Infant Birth Weight 2 | 3 | Description: 4 | 5 | The ‘birthwt’ data frame has 189 rows and 10 columns. The data 6 | were collected at Baystate Medical Center, Springfield, Mass 7 | during 1986. 8 | 9 | 10 | Format: 11 | 12 | This data frame contains the following columns: 13 | 14 | ‘low’ indicator of birth weight less than 2.5 kg. 15 | 16 | ‘age’ mother's age in years. 17 | 18 | ‘lwt’ mother's weight in pounds at last menstrual period. 19 | 20 | ‘race’ mother's race (‘1’ = white, ‘2’ = black, ‘3’ = other). 21 | 22 | ‘smoke’ smoking status during pregnancy. 23 | 24 | ‘ptl’ number of previous premature labours. 25 | 26 | ‘ht’ history of hypertension. 27 | 28 | ‘ui’ presence of uterine irritability. 29 | 30 | ‘ftv’ number of physician visits during the first trimester. 31 | 32 | ‘bwt’ birth weight in grams. 33 | 34 | Source: 35 | 36 | Hosmer, D.W. and Lemeshow, S. 
(1989) _Applied Logistic 37 | Regression._ New York: Wiley 38 | 39 | -------------------------------------------------------------------------------- /tests/test_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Lorenz Equations\n", 8 | "\n", 9 | "\\begin{align}\n", 10 | "\\dot{x} & = \\sigma(y-x) \\\\\n", 11 | "\\dot{z} & = -\\beta z + xy\n", 12 | "\\end{align}\n", 13 | "\n", 14 | "toto $a=3$.\n", 15 | "\n", 16 | "\\begin{align}\n", 17 | "\\dot{x} & = \\sigma(y-x) \\\\\n", 18 | "\\dot{z} & = -\\beta z + xy\n", 19 | "\\end{align}.\n", 20 | "\n", 21 | "titi\n", 22 | "\n", 23 | "$$\n", 24 | "\\dot{x} & = \\sigma(y-x) \\\\\n", 25 | "\\dot{y} & = \\rho x - y - xz \\\\\n", 26 | "\\dot{z} & = -\\beta z + xy\n", 27 | "$$" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [] 36 | } 37 | ], 38 | "metadata": { 39 | "kernelspec": { 40 | "display_name": "Python 3", 41 | "language": "python", 42 | "name": "python3" 43 | }, 44 | "language_info": { 45 | "codemirror_mode": { 46 | "name": "ipython", 47 | "version": 3 48 | }, 49 | "file_extension": ".py", 50 | "mimetype": "text/x-python", 51 | "name": "python", 52 | "nbconvert_exporter": "python", 53 | "pygments_lexer": "ipython3", 54 | "version": "3.6.8" 55 | } 56 | }, 57 | "nbformat": 4, 58 | "nbformat_minor": 2 59 | } 60 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | # initial CircleCI setup 3 | jobs: 4 | build: 5 | docker: 6 | - image: circleci/python:3.6.1 7 | working_directory: ~/repo 8 | 9 | steps: 10 | - checkout 11 | - run: | 12 | sudo apt-get install pandoc 13 | sudo apt-get install --no-install-recommends -y \ 14 | texlive-latex-recommended texlive-latex-extra \ 15 | texlive-fonts-recommended latexmk 16 | - run: 17 | name: install dependencies 18 | command: | 19 | python3 -m venv /tmp/venv 20 | . /tmp/venv/bin/activate 21 | pip install ipython notebook sphinx 22 | - run: 23 | name: build HTML 24 | command: | 25 | . /tmp/venv/bin/activate 26 | make html 27 | - run: 28 | name: build PDF 29 | command: | 30 | . /tmp/venv/bin/activate 31 | # TODO: the nonstopmode option is necessary to avoid timeouts 32 | # when compilation errors occurs (e.g. some images are not found). 
33 | make pdf LATEXOPTS="--interaction=nonstopmode" 34 | - store_artifacts: 35 | path: build/html/ 36 | destination: html/ 37 | - store_artifacts: 38 | path: build/latex/StatisticsMachineLearningPython.pdf 39 | destination: pdf/StatisticsMachineLearningPython.pdf 40 | -------------------------------------------------------------------------------- /machine_learning/examples/boot_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import seaborn as sns 4 | from sklearn import cluster, datasets 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | import seaborn as sns # nice color 8 | iris = datasets.load_iris() 9 | X = iris.data[:, :2] # use only 'sepal length and sepal width' 10 | y_iris = iris.target 11 | kmr = cluster.KMeans(n_clusters=3, random_state=42).fit(X) 12 | labels_r = kmr.predict(X) 13 | %matplotlib qt 14 | 15 | nboot = 500 16 | orig_all = np.arange(X.shape[0]) 17 | scores_boot = np.zeros(nboot) 18 | for boot_i in range(nboot): 19 | # boot_i = 43 20 | np.random.seed(boot_i) 21 | boot_idx = np.random.choice(orig_all, size=len(orig_all), replace=True) 22 | # boot_idx = orig_all 23 | kmb = cluster.KMeans(n_clusters=3, random_state=42).fit(X[boot_idx, :]) 24 | dist = scipy.spatial.distance.cdist(kmb.cluster_centers_, kmr.cluster_centers_) 25 | reorder = np.argmin(dist, axis=1) 26 | #print(reorder) 27 | # kmb.cluster_centers_ = kmb.cluster_centers_[reorder] 28 | labels_b = kmb.predict(X) 29 | labels_b = np.array([reorder[lab] for lab in labels_b]) 30 | scores_boot[boot_i] = np.sum(labels_b == labels_r) / len(labels_b) 31 | 32 | sns.distplot(scores_boot) 33 | plt.show() 34 | 35 | print(np.min(scores_boot), np.argmin(scores_boot)) 36 | 37 | pd.Series(scores_boot).describe(percentiles=[.975, .5, .025]) 38 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2007-2021 Edouard Duchesnay. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /utils/plot_ml_linear_regression_multicolinearity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri Dec 4 10:23:37 2020 5 | 6 | @author: ed203246 7 | """ 8 | 9 | %matplotlib inline 10 | 11 | import numpy as np 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | 15 | from sklearn import datasets 16 | import sklearn.linear_model as lm 17 | import sklearn.metrics as metrics 18 | 19 | from mpl_toolkits.mplot3d import Axes3D 20 | 21 | np.set_printoptions(precision=2) 22 | pd.set_option('precision', 2) 23 | 24 | # %% Plot linear regression plan (in 2d) 25 | 26 | # Fit Ordinary Least Squares: OLS 27 | csv = pd.read_csv('https://github.com/duchesnay/pystatsml/raw/master/datasets/Advertising.csv', index_col=0) 28 | X = csv[['TV', 'Radio']] 29 | y = csv['Sales'] 30 | 31 | lr = lm.LinearRegression().fit(X, y) 32 | y_pred = lr.predict(X) 33 | print("R-squared =", metrics.r2_score(y, y_pred)) 34 | 35 | print("Coefficients =", lr.coef_, lr.intercept_) 36 | 37 | # Plot 38 | fig = plt.figure(figsize=(9, 9)) 39 | #fig = plt.figure() 40 | ax = fig.add_subplot(111, projection='3d') 41 | 42 | ax.scatter(csv['TV'], csv['Radio'], csv['Sales'], c='r', marker='o') 43 | 44 | xx1, xx2 = np.meshgrid( 45 | np.linspace(csv['TV'].min(), csv['TV'].max(), num=10), 46 | np.linspace(csv['Radio'].min(), csv['Radio'].max(), num=10)) 47 | 48 | XX = np.column_stack([xx1.ravel(), xx2.ravel()]) 49 | 50 | yy = lr.predict(XX) 51 | ax.plot_wireframe(xx1, xx2, yy.reshape(xx1.shape)) 52 | ax.set_xlabel('TV') 53 | ax.set_ylabel('Radio') 54 | _ = ax.set_zlabel('Sales') 55 | 56 | plt.savefig("/home/ed203246/git/pystatsml/images/linear_regression_plan.png") 57 | -------------------------------------------------------------------------------- /statistics/stat_multiv_solutions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Munivariate statistics exercises 3 | ================================ 4 | ''' 5 | import pandas as pd 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | #%matplotlib inline 9 | np.random.seed(seed=42) # make the example reproducible 10 | 11 | ''' 12 | ### Dot product and Euclidean norm 13 | ''' 14 | 15 | a = np.array([2,1]) 16 | b = np.array([1,1]) 17 | 18 | def euclidian(x): 19 | return np.sqrt(np.dot(x, x)) 20 | 21 | euclidian(a) 22 | 23 | euclidian(a - b) 24 | 25 | np.dot(b, a / euclidian(a)) 26 | 27 | X = np.random.randn(100, 2) 28 | np.dot(X, a / euclidian(a)) 29 | 30 | ''' 31 | ### Covariance matrix and Mahalanobis norm 32 | ''' 33 | 34 | N = 100 35 | mu = np.array([1, 1]) 36 | Cov = np.array([[1, .8], 37 | [.8, 1]]) 38 | 39 | X = np.random.multivariate_normal(mu, Cov, N) 40 | 41 | xbar = np.mean(X, axis=0) 42 | print(xbar) 43 | 44 | Xc = (X - xbar) 45 | 46 | np.mean(Xc, axis=0) 47 | 48 | S = 1 / (N - 1) * 
np.dot(Xc.T, Xc) 49 | print(S) 50 | 51 | #import scipy 52 | 53 | Sinv = np.linalg.inv(S) 54 | 55 | 56 | def mahalanobis(x, xbar, Sinv): 57 | xc = x - xbar 58 | return np.sqrt(np.dot(np.dot(xc, Sinv), xc)) 59 | 60 | dists = pd.DataFrame( 61 | [[mahalanobis(X[i, :], xbar, Sinv), 62 | euclidian(X[i, :] - xbar)] for i in range(X.shape[0])], 63 | columns = ['Mahalanobis', 'Euclidean']) 64 | 65 | print(dists[:10]) 66 | 67 | x = X[0, :] 68 | 69 | import scipy.spatial 70 | assert(mahalanobis(X[0, :], xbar, Sinv) == scipy.spatial.distance.mahalanobis(xbar, X[0, :], Sinv)) 71 | assert(mahalanobis(X[1, :], xbar, Sinv) == scipy.spatial.distance.mahalanobis(xbar, X[1, :], Sinv)) 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /lib/pystatsml/plot_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Aug 29 10:58:31 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | 8 | 9 | import numpy as np 10 | import scipy 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | 14 | from matplotlib.patches import Ellipse 15 | 16 | def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs): 17 | """ 18 | Plots an `nstd` sigma error ellipse based on the specified covariance 19 | matrix (`cov`). Additional keyword arguments are passed on to the 20 | ellipse patch artist. 21 | 22 | Parameters 23 | ---------- 24 | cov : The 2x2 covariance matrix to base the ellipse on 25 | pos : The location of the center of the ellipse. Expects a 2-element 26 | sequence of [x0, y0]. 27 | nstd : The radius of the ellipse in numbers of standard deviations. 28 | Defaults to 2 standard deviations. 29 | ax : The axis that the ellipse will be plotted on. Defaults to the 30 | current axis. 31 | Additional keyword arguments are pass on to the ellipse patch. 32 | 33 | Returns 34 | ------- 35 | A matplotlib ellipse artist 36 | """ 37 | def eigsorted(cov): 38 | vals, vecs = np.linalg.eigh(cov) 39 | order = vals.argsort()[::-1] 40 | return vals[order], vecs[:,order] 41 | 42 | if ax is None: 43 | ax = plt.gca() 44 | 45 | vals, vecs = eigsorted(cov) 46 | theta = np.degrees(np.arctan2(*vecs[:,0][::-1])) 47 | 48 | # Width and height are "full" widths, not radius 49 | width, height = 2 * nstd * np.sqrt(vals) 50 | ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs) 51 | 52 | ax.add_artist(ellip) 53 | return ellip 54 | -------------------------------------------------------------------------------- /scientific_python/scipy_pandas_solutions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Exercises: Pandas: data manipulation 3 | ------------------------------------ 4 | 5 | Data Frame 6 | ~~~~~~~~~~ 7 | 8 | 1. Read the iris dataset at 'https://github.com/neurospin/pystatsml/tree/master/datasets/iris.csv' 9 | 10 | 2. Print column names 11 | 12 | 3. Get numerical columns 13 | 14 | 4. 
For each species compute the mean of numerical columns and store it in a ``stats`` table like: 15 | 16 | :: 17 | 18 | species sepal_length sepal_width petal_length petal_width 19 | 0 setosa 5.006 3.428 1.462 0.246 20 | 1 versicolor 5.936 2.770 4.260 1.326 21 | 2 virginica 6.588 2.974 5.552 2.026 22 | 23 | 24 | """ 25 | 26 | import pandas as pd 27 | import numpy as np 28 | import matplotlib.pyplot as plt 29 | 30 | 31 | url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv' 32 | df = pd.read_csv(url) 33 | 34 | num_cols = df._get_numeric_data().columns 35 | 36 | stats = list() 37 | 38 | for grp, d in df.groupby("species"): 39 | print(grp) 40 | #print() 41 | stats.append( [grp] + d.loc[:, num_cols].mean(axis=0).tolist()) 42 | 43 | stats = pd.DataFrame(stats, columns=["species"] + num_cols.tolist()) 44 | print(stats) 45 | 46 | # or 47 | df.groupby("species").mean() 48 | 49 | ## 50 | 51 | df.loc[[0, 1] ,"petal_width"] = None 52 | 53 | df.petal_width 54 | 55 | df["petal_width"][df["petal_width"].isnull()] = \ 56 | df["petal_width"][df["petal_width"].notnull()].median() 57 | 58 | 59 | # 60 | 61 | l = [(1, "a", 1), (2, "b", 2)] 62 | 63 | for x, y, z in l: 64 | print(x, y, z) 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /R/tools_R_exo.R: -------------------------------------------------------------------------------- 1 | # Set current working directory 2 | WD = "." 3 | setwd(WD) 4 | 5 | df = read.csv("../datasets/iris.csv") 6 | 7 | # Print column names 8 | colnames(df) 9 | 10 | # Get numerical columns 11 | unlist(lapply(df, is.numeric)) 12 | num_cols = colnames(df)[unlist(lapply(df, is.numeric))] 13 | 14 | # For each species compute the mean of numerical columns and store it in a stats table like: 15 | stats = NULL 16 | for (grp in levels(df$species)) { 17 | m = as.matrix(df[df$species == grp, num_cols]) 18 | line = data.frame(species = grp, as.list(colMeans(m))) 19 | stats = rbind(stats, line) 20 | } 21 | 22 | print(stats) 23 | 24 | ## shorter version 25 | aggregate(. 
~ species, data = df, mean) 26 | 27 | ## Merging database 28 | user1 = data.frame(name = c("eric", "sophie"), 29 | age = c(22, 48), 30 | gender = c("M", "F"), 31 | job = c("engineer", "scientist")) 32 | user2 = data.frame(name = c("alice", "john", "peter", "julie", "christine"), 33 | age = c(19, 26, 33, 44, 35), 34 | gender = c("F", "M", "M", "F", "F"), 35 | job = c("student", "student", "engineer", "scientist", "scientist")) 36 | user3 = rbind(user1, user2) 37 | salary = data.frame(name = c("alice", "john", "peter", "julie"), 38 | salary = c(2200, 2400, 3500, 4300)) 39 | 40 | user = merge(user3, salary, by = "name", all = TRUE) 41 | 42 | 43 | df = user 44 | 45 | fillmissing_with_mean <- function(df){ 46 | num_cols = colnames(df)[unlist(lapply(df, is.numeric))] 47 | for (n in num_cols) { 48 | x = df[, n] 49 | df[is.na(x), n] = mean(x[!is.na(x)]) # mean(x, na.rm=TRUE) 50 | } 51 | return(df) 52 | } 53 | 54 | user_imputed = fillmissing_with_mean(user) 55 | 56 | write.csv(user_imputed, "users_imputed.csv", row.names = FALSE) 57 | -------------------------------------------------------------------------------- /bin/conv_python_to_rst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 6 12:19:07 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | from __future__ import print_function 8 | import sys, os, argparse 9 | 10 | doc_tag = "'''" 11 | skip_tag = '## SKIP' 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('input', help='Input python file') 16 | 17 | options = parser.parse_args() 18 | 19 | if not options.input: 20 | print >> sys.stderr, 'Required input file' 21 | sys.exit(os.EX_USAGE) 22 | input_filename = options.input 23 | #input_filename = "/home/ed203246/git/pylearn-doc/src/tools_numpy.py" 24 | output_filename = os.path.splitext(input_filename)[0] + ".rst" 25 | input_fd = open(input_filename, 'r') 26 | output_fd = open(output_filename, 'w') 27 | 28 | #line_in = '## Pandas data manipulation' 29 | code_block = True 30 | skip = False 31 | for line_in in input_fd: 32 | #print(line_in) 33 | ## Switch state 34 | if skip_tag in line_in: 35 | skip = not skip 36 | continue 37 | if skip: 38 | continue 39 | if doc_tag in line_in and not code_block: # end doc start code block 40 | code_block = True 41 | output_fd.write('\n') # write new line instead of doc_tag 42 | #line_in = line_in.replace(doc_tag, '') 43 | output_fd.write('.. 
code:: python\n') 44 | continue 45 | elif doc_tag in line_in and code_block: # start doc end code block 46 | code_block = False 47 | line_in = line_in.replace(doc_tag, '') 48 | 49 | if code_block: 50 | output_fd.write(' ' + line_in) 51 | else: 52 | output_fd.write(line_in) 53 | 54 | input_fd.close() 55 | output_fd.close() 56 | -------------------------------------------------------------------------------- /R/stat_multiv_exo.R: -------------------------------------------------------------------------------- 1 | set.seed(42) 2 | 3 | # http://www.statmethods.net/advstats/matrix.html 4 | 5 | ### Dot product and Euclidean norm 6 | 7 | a = c(2, 1) 8 | b = c(1, 1) 9 | 10 | euclidian <- function(x){ 11 | return(sqrt(x %*% x)[1]) 12 | } 13 | 14 | euclidian(a) 15 | 16 | euclidian(a - b) 17 | 18 | b %*% (a / euclidian(a)) 19 | 20 | X = matrix(rnorm(100 * 2), 100, 2) 21 | dim(X) 22 | X %*% (a / euclidian(a)) 23 | 24 | ### Compute row means and store them into a vector 25 | 26 | row_means = function(X) { 27 | means = NULL 28 | for (i in 1:dim(X)[1]) { 29 | means = c(means, mean(X[i, ])) 30 | } 31 | return(means) 32 | } 33 | row_means(X) 34 | 35 | ## version 2, using built-in accessors 36 | 37 | row_means <- function(X) { 38 | n <- nrow(X) 39 | means <- numeric(n) 40 | for (i in 1:n) 41 | means[i] <- mean(X[i, ]) 42 | 43 | return(means) 44 | } 45 | 46 | ## version 3, using apply 47 | apply(X, 1, mean) 48 | 49 | ## version 4, more efficient 50 | rowMeans(X) 51 | 52 | ### Covariance matrix and Mahalanobis norm 53 | 54 | N = 100 55 | mu = c(1, 1) 56 | Cov = matrix(c(1, .8, 57 | .8, 1), 2, 2) 58 | 59 | library(MASS) 60 | X = mvrnorm(N, mu, Cov) 61 | 62 | xbar = colMeans(X) 63 | print(xbar) 64 | 65 | Xc = (X - xbar) 66 | 67 | colMeans(Xc) 68 | 69 | S = 1 / (N - 1) * (t(Xc) %*% Xc) 70 | print(S) 71 | 72 | 73 | Sinv = solve(S) 74 | 75 | x = X[1, ] 76 | 77 | mahalanobis <- function(x, xbar, Sinv){ 78 | xc = x - xbar 79 | return(sqrt( (xc %*% Sinv) %*% xc)) 80 | } 81 | 82 | 83 | dist = matrix(nrow = N, ncol = 2) 84 | 85 | for(i in 1:nrow(X)){ 86 | dist[i, 1] = mahalanobis(X[i, ], xbar, Sinv) 87 | dist[i, 2] = euclidian(X[i, ] - xbar) 88 | } 89 | colnames(dist) = c("Mahalanobis", "Euclidian") 90 | 91 | print(dist[1:10, ]) 92 | 93 | x = X[1, ] 94 | 95 | print(sqrt(stats::mahalanobis(X[1, ], xbar, Sinv, inverted = TRUE)) - 96 | mahalanobis(X[1, ], xbar, Sinv)) 97 | -------------------------------------------------------------------------------- /machine_learning/resampling_solution.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jan 18 10:40:44 2017 5 | 6 | @author: edouard.duchesnay@cea.fr 7 | """ 8 | 9 | """ 10 | Exercise 11 | 12 | Given the logistic regression presented above and its validation given a 5 folds CV. 13 | 14 | Compute the p-value associated with the prediction accuracy using a permutation test. 15 | 16 | Compute the p-value associated with the prediction accuracy using a parametric test. 
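Hint for the parametric test: under the null hypothesis of chance-level prediction with two balanced classes, the number of correct test predictions follows a binomial distribution B(n, 0.5). A minimal sketch (assuming ``y`` and ``y_test_pred`` hold the true labels and the cross-validated predictions computed below, and scipy >= 1.7 for ``stats.binomtest``; older versions expose ``stats.binom_test``):

    import scipy.stats as stats
    n_correct = int(np.sum(y == y_test_pred))
    # one-sided binomial test against the chance level p0 = 0.5
    pval_parametric = stats.binomtest(n_correct, n=len(y), p=0.5,
                                      alternative='greater').pvalue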
17 | 18 | """ 19 | import numpy as np 20 | from sklearn import datasets 21 | import sklearn.linear_model as lm 22 | import sklearn.metrics as metrics 23 | from sklearn.model_selection import StratifiedKFold 24 | 25 | X, y = datasets.make_classification(n_samples=100, n_features=100, 26 | n_informative=10, random_state=42) 27 | 28 | model = lm.LogisticRegression(C=1) 29 | nperm = 100 30 | scores_perm= np.zeros((nperm, 3)) # 3 scores acc, recall0, recall1 31 | 32 | for perm in range(0, nperm): 33 | # perm = 0; y == yp 34 | # first run on non-permuted samples 35 | yp = y if perm == 0 else np.random.permutation(y) 36 | # CV loop 37 | y_test_pred = np.zeros(len(yp)) 38 | cv = StratifiedKFold(5) 39 | for train, test in cv.split(X, y): 40 | X_train, X_test, y_train, y_test = X[train, :], X[test, :], yp[train], yp[test] 41 | model.fit(X_train, y_train) 42 | y_test_pred[test] = model.predict(X_test) 43 | scores_perm[perm, 0] = metrics.accuracy_score(yp, y_test_pred) 44 | scores_perm[perm, [1, 2]] = metrics.recall_score(yp, y_test_pred, average=None) 45 | 46 | # Empirical permutation based p-values 47 | pval = np.sum(scores_perm >= scores_perm[0, :], axis=0) / nperm 48 | 49 | print("ACC:%.2f(P=%.3f); SPC:%.2f(P=%.3f); SEN:%.2f(P=%.3f)" %\ 50 | (scores_perm[0, 0], pval[0], 51 | scores_perm[0, 1], pval[1], 52 | scores_perm[0, 2], pval[2])) 53 | 54 | -------------------------------------------------------------------------------- /bin/filter_fix_rst.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on mercredi 17 avril 2019, 11:01:21 (UTC+0200) 5 | 6 | @author: edouard.duchesnay@gmail.com 7 | 8 | Filter that fix rst files generated by jupyter nbconvert 9 | 10 | It is called in the Makefile: 11 | jupyter nbconvert --to rst --stdout $< | bin/filter_fix_rst.py > $@ 12 | """ 13 | 14 | import sys 15 | import re 16 | 17 | if __name__ == "__main__": 18 | 19 | """ 20 | filename = 'test.rst' 21 | fd = open(filename, mode='r') 22 | in_str = fd.read() 23 | fd.close() 24 | """ 25 | #in_str = sys.stdin.read() 26 | lines = sys.stdin.readlines() 27 | 28 | # %% 29 | # FILTER 1: 30 | # CONVERT 31 | # :raw-latex:`\begin{align} 32 | # \dot{x} & = \sigma(y-x) \\ 33 | # ... 34 | # \end{align}` 35 | # 36 | # TO 37 | # .. raw:: latex 38 | # 39 | # \begin{align} 40 | # \dot{x} & = \sigma(y-x) \\ 41 | # ... 42 | # \end{align} 43 | """ 44 | regex = re.compile(r":raw-latex:`(.+?)`", re.MULTILINE|re.DOTALL) 45 | out_str = regex.sub(r'.. raw:: latex\n\n\1', in_str) 46 | """ 47 | match_start = re.compile(r":raw-latex:`(.+?)$") 48 | match_stop = re.compile(r"`") 49 | indent = 0 50 | for i in range(len(lines)): 51 | #print(lines[i]) 52 | if len(match_start.findall(lines[i])) > 0: 53 | #print(i, "match_start", lines[i]) 54 | indent = 3 55 | lines[i] = match_start.sub(r'.. 
raw:: latex\n\n%s\1' % (r" " * indent), lines[i]) 56 | elif indent and len(match_stop.findall(lines[i])) > 0: 57 | lines[i] = r" " * indent + match_stop.sub(r'', lines[i]) 58 | indent = 0 59 | elif indent: 60 | #print(i, "indent", lines[i]) 61 | lines[i] = r" " * indent + lines[i] 62 | 63 | """ 64 | :math:`\mathbf{v}_k` associated to the singular value :math:`d_k 65 | """ 66 | for line in lines: 67 | sys.stdout.write(line) -------------------------------------------------------------------------------- /python_lang/scripts/count_words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jun 20 18:05:38 2018 5 | 6 | @author: edouard.duchesnay@gmail.com 7 | 8 | ./count_words.py -i /tmp/bsd.txt 9 | """ 10 | 11 | import os 12 | import os.path 13 | import argparse 14 | import re 15 | import pandas as pd 16 | 17 | if __name__ == "__main__": 18 | # parse command line options 19 | output = "word_count.csv" 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-i', '--input', 22 | help='list of input files.', 23 | nargs='+', type=str) 24 | parser.add_argument('-o', '--output', 25 | help='output csv file (default %s)' % output, 26 | type=str, default=output) 27 | options = parser.parse_args() 28 | 29 | if options.input is None : 30 | parser.print_help() 31 | raise SystemExit("Error: input files are missing") 32 | else: 33 | filenames = [f for f in options.input if os.path.isfile(f)] 34 | 35 | # Match words 36 | #regex = re.compile("[^ \t\n\r\f\v,\._> 0: 52 | touch = True 53 | line = regex.sub(options.replacement, line) 54 | # print(line) 55 | lines += line 56 | except Exception as e: 57 | print(filename, ":", e) 58 | 59 | if touch and not options.noaction: 60 | shutil.copy(filename, filename + ".bak") 61 | with open(filename, 'w') as f: 62 | f.write(lines) 63 | -------------------------------------------------------------------------------- /machine_learning/manifold_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## MDS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "from sklearn.decomposition import PCA\n", 19 | "from sklearn.manifold import MDS\n", 20 | "%matplotlib inline\n", 21 | "\n", 22 | "# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/\n", 23 | "\n", 24 | "import pandas as pd\n", 25 | "\n", 26 | "try:\n", 27 | " salary = pd.read_csv('datasets/iris.csv')\n", 28 | "except:\n", 29 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'\n", 30 | " df = pd.read_csv(url)\n", 31 | "\n", 32 | "X = np.asarray(df.iloc[:, :4])\n", 33 | "X -= np.mean(X, axis=0)\n", 34 | "X /= np.std(X, axis=0, ddof=1)\n", 35 | "\n", 36 | "from sklearn import metrics\n", 37 | "D = metrics.pairwise.pairwise_distances(X, metric='euclidean')\n", 38 | "\n", 39 | "\n", 40 | "stress = [MDS(dissimilarity='precomputed', n_components=k,\n", 41 | " random_state=42, max_iter=300, eps=1e-9).fit(D).stress_ for k in range(1, X.shape[1]+1)]\n", 42 | "\n", 43 | "print(\"Stress\", stress)\n", 44 | "plt.plot(range(1, 5), stress)\n", 45 | "\n", 46 | "K = 2\n", 47 | "mds = MDS(dissimilarity='precomputed', n_components=K,\n", 48 | " random_state=42, 
max_iter=300, eps=1e-9)\n", 49 | "Xmds = mds.fit_transform(D)\n", 50 | "\n", 51 | "pca = PCA(n_components=K)\n", 52 | "pca.fit(X)\n", 53 | "PC = pca.transform(X)\n", 54 | "\n", 55 | "print(\"Correlation between PCA and MDS\")\n", 56 | "cor = [np.corrcoef(Xmds[:, j], PC[:, j])[0, 1] for j in range(min(Xmds.shape[1], PC.shape[1]))]\n", 57 | "print(cor)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [] 66 | } 67 | ], 68 | "metadata": { 69 | "kernelspec": { 70 | "display_name": "Python 3.7.4 64-bit ('base': conda)", 71 | "language": "python", 72 | "name": "python37464bitbaseconda862d4bef370e4cc79b56518e37d84318" 73 | }, 74 | "language_info": { 75 | "codemirror_mode": { 76 | "name": "ipython", 77 | "version": 3 78 | }, 79 | "file_extension": ".py", 80 | "mimetype": "text/x-python", 81 | "name": "python", 82 | "nbconvert_exporter": "python", 83 | "pygments_lexer": "ipython3", 84 | "version": "3.7.9" 85 | } 86 | }, 87 | "nbformat": 4, 88 | "nbformat_minor": 4 89 | } 90 | -------------------------------------------------------------------------------- /utils/datasets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 4 14:44:13 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | 8 | 9 | ''' 10 | Regression 11 | ========== 12 | ''' 13 | 14 | ''' 15 | Do it yourself 16 | -------------- 17 | ''' 18 | import numpy as np 19 | n_features = 5 20 | n_features_info = 2 21 | X = np.random.randn(n_samples * 2, n_features) 22 | beta = np.zeros(n_features) 23 | beta[:n_features_info] = 1 24 | Xbeta = np.dot(X, beta) 25 | eps = np.random.randn(n_samples * 2) 26 | y = Xbeta + eps 27 | 28 | ''' 29 | sklearn 30 | ------- 31 | ''' 32 | from sklearn import datasets 33 | import sklearn.linear_model as lm 34 | import sklearn.metrics as metrics 35 | from sklearn.cross_validation import KFold 36 | 37 | X, y = datasets.make_regression(n_samples=100, n_features=100, 38 | n_informative=10, random_state=42) 39 | 40 | 41 | 42 | ''' 43 | Classification 44 | ============== 45 | ''' 46 | 47 | ''' 48 | Do it yourself 49 | -------------- 50 | ''' 51 | import numpy as np 52 | import scipy 53 | 54 | ############# 55 | # 2D: Dataset 56 | ############# 57 | 58 | n_samples, n_features = 100, 2 59 | mean0, mean1 = np.array([0, 0]), np.array([0, 2]) 60 | Cov = np.array([[1, .8],[.8, 1]]) 61 | np.random.seed(42) 62 | X0 = np.random.multivariate_normal(mean0, Cov, n_samples) 63 | X1 = np.random.multivariate_normal(mean1, Cov, n_samples) 64 | X = np.vstack([X0, X1]) 65 | y = np.array([0] * X0.shape[0] + [1] * X1.shape[0]) 66 | 67 | n_samples, n_features, = 100, 2 68 | 69 | np.random.randn() 70 | 71 | ############################################## 72 | # Large Dataset with block diagonal covariance 73 | ############################################## 74 | import numpy as np 75 | import scipy 76 | 77 | n_samples = 100 78 | block_size = 3 79 | n_block = 2 80 | n_features = block_size * n_block 81 | n_informatives = 2 82 | cov = .8 83 | var = 1 84 | 85 | # Block diagonal covariance 86 | Cov = scipy.linalg.block_diag( 87 | *[np.zeros((block_size, block_size)) + cov for i in range(n_block)]) 88 | np.fill_diagonal(Cov, var) 89 | 90 | 91 | mean0, mean1 = np.zeros(n_features), np.zeros(n_features) 92 | mean1[:n_informatives] = 1 93 | 94 | np.random.seed(42) 95 | X0 = np.random.multivariate_normal(mean0, Cov, n_samples) 96 | X1 = np.random.multivariate_normal(mean1, 
Cov, n_samples) 97 | X = np.vstack([X0, X1]) 98 | y = np.array([0] * X0.shape[0] + [1] * X1.shape[0]) 99 | 100 | 101 | ''' 102 | sklearn 103 | ------- 104 | ''' 105 | from sklearn import datasets 106 | import sklearn.linear_model as lm 107 | import sklearn.metrics as metrics 108 | from sklearn.cross_validation import StratifiedKFold 109 | 110 | X, y = datasets.make_classification(n_samples=100, n_features=100, 111 | n_informative=10, random_state=42) 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /index.rst: -------------------------------------------------------------------------------- 1 | .. Machine Learning documentation master file, created by 2 | sphinx-quickstart on Mon Nov 30 16:25:34 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ======= 7 | Phantom 8 | ======= 9 | 10 | ============ 11 | Introduction 12 | ============ 13 | 14 | Important links: 15 | 16 | - `Web page `_ 17 | - `Github `_ 18 | - `Latest pdf `_ 19 | - `Official deposit for citation `_. 20 | 21 | This document describes statistics and machine learning in Python using: 22 | 23 | - `Scikit-learn `_ for machine learning. 24 | - `Pytorch `_ for deep learning. 25 | - `Statsmodels `_ for statistics. 26 | 27 | 28 | .. toctree:: 29 | :maxdepth: 2 30 | 31 | introduction/python_ecosystem.rst 32 | introduction/machine_learning.rst 33 | 34 | =============== 35 | Python language 36 | =============== 37 | 38 | .. toctree:: 39 | :maxdepth: 2 40 | 41 | auto_gallery/python_lang.rst 42 | 43 | ================= 44 | Scientific Python 45 | ================= 46 | 47 | .. toctree:: 48 | :maxdepth: 2 49 | 50 | auto_gallery/scipy_numpy.rst 51 | auto_gallery/scipy_pandas.rst 52 | scientific_python/scipy_matplotlib.rst 53 | 54 | ========== 55 | Statistics 56 | ========== 57 | 58 | .. toctree:: 59 | :maxdepth: 2 60 | 61 | statistics/stat_univ.rst 62 | auto_gallery/stat_univ_lab_brain-volume.rst 63 | statistics/stat_multiv.rst 64 | statistics/time_series.rst 65 | 66 | ================ 67 | Machine Learning 68 | ================ 69 | 70 | .. toctree:: 71 | :maxdepth: 2 72 | 73 | machine_learning/decomposition.rst 74 | machine_learning/manifold.rst 75 | machine_learning/clustering.rst 76 | machine_learning/linear_regression.rst 77 | machine_learning/linear_classification.rst 78 | auto_gallery/ml_supervized_nonlinear.rst 79 | auto_gallery/ml_resampling.rst 80 | machine_learning/ensemble_learning.rst 81 | optimization/optim_gradient_descent.rst 82 | auto_gallery/ml_lab_face_recognition.rst 83 | 84 | ============= 85 | Deep Learning 86 | ============= 87 | 88 | .. 
toctree:: 89 | :maxdepth: 2 90 | 91 | deep_learning/dl_backprop_numpy-pytorch-sklearn.rst 92 | deep_learning/dl_mlp_mnist_pytorch.rst 93 | deep_learning/dl_cnn_cifar10_pytorch.rst 94 | deep_learning/dl_transfer-learning_cifar10-ants-bees_pytorch.rst 95 | 96 | ================== 97 | Indices and tables 98 | ================== 99 | 100 | * :ref:`genindex` 101 | * :ref:`modindex` 102 | * :ref:`search` 103 | 104 | -------------------------------------------------------------------------------- /deep_learning/train_val_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | import copy 5 | 6 | 7 | def train_val_model(model, criterion, optimizer, dataloaders, num_epochs=25, 8 | scheduler=None, log_interval=None): 9 | since = time.time() 10 | 11 | best_model_wts = copy.deepcopy(model.state_dict()) 12 | best_acc = 0.0 13 | 14 | # Store losses and accuracies accross epochs 15 | losses, accuracies = dict(train=[], val=[]), dict(train=[], val=[]) 16 | 17 | for epoch in range(num_epochs): 18 | if log_interval is not None and epoch % log_interval == 0: 19 | print('Epoch {}/{}'.format(epoch, num_epochs - 1)) 20 | print('-' * 10) 21 | 22 | # Each epoch has a training and validation phase 23 | for phase in ['train', 'val']: 24 | if phase == 'train': 25 | model.train() # Set model to training mode 26 | else: 27 | model.eval() # Set model to evaluate mode 28 | 29 | running_loss = 0.0 30 | running_corrects = 0 31 | 32 | # Iterate over data. 33 | nsamples = 0 34 | for inputs, labels in dataloaders[phase]: 35 | inputs = inputs.to(device) 36 | labels = labels.to(device) 37 | nsamples += inputs.shape[0] 38 | 39 | # zero the parameter gradients 40 | optimizer.zero_grad() 41 | 42 | # forward 43 | # track history if only in train 44 | with torch.set_grad_enabled(phase == 'train'): 45 | outputs = model(inputs) 46 | _, preds = torch.max(outputs, 1) 47 | loss = criterion(outputs, labels) 48 | 49 | # backward + optimize only if in training phase 50 | if phase == 'train': 51 | loss.backward() 52 | optimizer.step() 53 | 54 | # statistics 55 | running_loss += loss.item() * inputs.size(0) 56 | running_corrects += torch.sum(preds == labels.data) 57 | 58 | if scheduler is not None and phase == 'train': 59 | scheduler.step() 60 | 61 | #nsamples = dataloaders[phase].dataset.data.shape[0] 62 | epoch_loss = running_loss / nsamples 63 | epoch_acc = running_corrects.double() / nsamples 64 | 65 | losses[phase].append(epoch_loss) 66 | accuracies[phase].append(epoch_acc) 67 | if log_interval is not None and epoch % log_interval == 0: 68 | print('{} Loss: {:.4f} Acc: {:.2f}%'.format( 69 | phase, epoch_loss, 100 * epoch_acc)) 70 | 71 | # deep copy the model 72 | if phase == 'val' and epoch_acc > best_acc: 73 | best_acc = epoch_acc 74 | best_model_wts = copy.deepcopy(model.state_dict()) 75 | if log_interval is not None and epoch % log_interval == 0: 76 | print() 77 | 78 | time_elapsed = time.time() - since 79 | print('Training complete in {:.0f}m {:.0f}s'.format( 80 | time_elapsed // 60, time_elapsed % 60)) 81 | print('Best val Acc: {:.2f}%'.format(100 * best_acc)) 82 | 83 | # load best model weights 84 | model.load_state_dict(best_model_wts) 85 | 86 | return model, losses, accuracies 87 | -------------------------------------------------------------------------------- /introduction/machine_learning.rst: -------------------------------------------------------------------------------- 1 | 2 | Introduction to Machine Learning 3 | 
-------------------------------- 4 | 5 | 6 | Machine learning within data science 7 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 8 | 9 | .. image:: images/data_science.png 10 | :scale: 25 11 | :align: center 12 | 13 | Machine learning covers two main types of data analysis: 14 | 15 | 1. Exploratory analysis: **Unsupervised learning**. Discover the structure within the data. E.g.: experience (in years in a company) and salary are correlated. 16 | 2. Predictive analysis: **Supervised learning**. This is sometimes described as **"learn from the past to predict the future"**. Scenario: a company wants to detect potential future clients among a base of prospects. Retrospective data analysis: we go through the data on previously prospected companies, with their characteristics (size, domain, location, etc.). Some of these companies became clients, others did not. The question is: can we predict which of the new companies are more likely to become clients, based on their characteristics and on these previous observations? In this example, the training data consists of a set of *n* training samples. Each sample, :math:`x_i`, is a vector of *p* input features (company characteristics) and a target feature :math:`y_i \in \{Yes, No\}` (whether they became a client or not). 17 | 18 | 19 | .. image:: images/machine_learning.png 20 | :scale: 50 21 | :align: center 22 | 23 | 24 | IT/computing science tools 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 26 | 27 | - High Performance Computing (HPC) 28 | - Data flow, data base, file I/O, etc. 29 | - Python: the programming language. 30 | - Numpy: Python library particularly useful for handling raw numerical data (matrices, mathematical operations). 31 | - Pandas: input/output, manipulation of structured data (tables). 32 | 33 | Statistics and applied mathematics 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | - Linear model. 37 | - Non-parametric statistics. 38 | - Linear algebra: matrix operations, inversion, eigenvalues. 39 | 40 | 41 | Data analysis methodology 42 | ------------------------- 43 | 44 | 1. Formalize customer's needs into a learning problem: 45 | * A target variable: supervised problem. 46 | - Target is qualitative: classification. 47 | - Target is quantitative: regression. 48 | * No target variable: unsupervised problem. 49 | - Visualization of high-dimensional samples: PCA, manifold learning, etc. 50 | - Finding groups of samples (hidden structure): clustering. 51 | 52 | 2. Ask questions about the dataset: 53 | * Number of samples. 54 | * Number of variables, types of each variable. 55 | 56 | 57 | 3. Define the sample: 58 | * For a prospective study, formalize the experimental design: inclusion/exclusion criteria. The conditions that define the acquisition of the dataset. 59 | * For a retrospective study, formalize the experimental design: inclusion/exclusion criteria. The conditions that define the selection of the dataset. 60 | 61 | 4. In a document, formalize (i) the project objectives; (ii) the required learning dataset (more specifically the input data and the target variables); (iii) the conditions that define the acquisition of the dataset. In this document, warn the customer that the learned algorithms may not work on new data acquired under different conditions. 62 | 63 | 5. Read the learning dataset. 64 | 65 | 6. (i) Sanity check (basic descriptive statistics); (ii) data cleaning (impute missing data, recoding); (iii) final Quality Control (QC): perform descriptive statistics and think! (remove possible confounding variables, etc.). 66 | 67 | 7. 
Explore data (visualization, PCA) and perform basic univariate statistics for association between the target an input variables. 68 | 69 | 8. Perform more complex multivariate-machine learning. 70 | 71 | 9. Model validation using a left-out-sample strategy (cross-validation, etc.). 72 | 73 | 10. Apply on new data. 74 | 75 | -------------------------------------------------------------------------------- /utils/mahalanobis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Feb 4 16:09:56 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | import numpy as np 8 | import scipy 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | #%matplotlib inline 12 | 13 | ''' 14 | Mahalanobis distance 15 | ==================== 16 | ''' 17 | 18 | from matplotlib.patches import Ellipse 19 | def plot_cov_ellipse(cov, pos, nstd=2, ax=None, **kwargs): 20 | """ 21 | Plots an `nstd` sigma error ellipse based on the specified covariance 22 | matrix (`cov`). Additional keyword arguments are passed on to the 23 | ellipse patch artist. 24 | 25 | Parameters 26 | ---------- 27 | cov : The 2x2 covariance matrix to base the ellipse on 28 | pos : The location of the center of the ellipse. Expects a 2-element 29 | sequence of [x0, y0]. 30 | nstd : The radius of the ellipse in numbers of standard deviations. 31 | Defaults to 2 standard deviations. 32 | ax : The axis that the ellipse will be plotted on. Defaults to the 33 | current axis. 34 | Additional keyword arguments are pass on to the ellipse patch. 35 | 36 | Returns 37 | ------- 38 | A matplotlib ellipse artist 39 | """ 40 | def eigsorted(cov): 41 | vals, vecs = np.linalg.eigh(cov) 42 | order = vals.argsort()[::-1] 43 | return vals[order], vecs[:,order] 44 | 45 | if ax is None: 46 | ax = plt.gca() 47 | 48 | vals, vecs = eigsorted(cov) 49 | theta = np.degrees(np.arctan2(*vecs[:,0][::-1])) 50 | 51 | # Width and height are "full" widths, not radius 52 | width, height = 2 * nstd * np.sqrt(vals) 53 | ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwargs) 54 | 55 | ax.add_artist(ellip) 56 | return ellip 57 | 58 | n_samples, n_features = 100, 2 59 | mean0, mean1 = np.array([0, 0]), np.array([0, 2]) 60 | Cov = np.array([[1, .8],[.8, 1]]) 61 | np.random.seed(42) 62 | X0 = np.random.multivariate_normal(mean0, Cov, n_samples) 63 | X1 = np.random.multivariate_normal(mean1, Cov, n_samples) 64 | 65 | x = np.array([2, 2]) 66 | 67 | plt.scatter(X0[:, 0], X0[:, 1], color='b') 68 | plt.scatter(X1[:, 0], X1[:, 1], color='r') 69 | plt.scatter(mean0[0], mean0[1], color='b', s=200, label="m0") 70 | plt.scatter(mean1[0], mean1[1], color='r', s=200, label="m2") 71 | plt.scatter(x[0], x[1], color='k', s=200, label="x") 72 | plot_cov_ellipse(Cov, pos=mean0, facecolor='none', linewidth=2, edgecolor='b') 73 | plot_cov_ellipse(Cov, pos=mean1, facecolor='none', linewidth=2, edgecolor='r') 74 | plt.legend(loc='upper left') 75 | 76 | # 77 | d2_m0x = scipy.spatial.distance.euclidean(mean0, x) 78 | d2_m0m2 = scipy.spatial.distance.euclidean(mean0, mean1) 79 | 80 | Covi = scipy.linalg.inv(Cov) 81 | dm_m0x = scipy.spatial.distance.mahalanobis(mean0, x, Covi) 82 | dm_m0m2 = scipy.spatial.distance.mahalanobis(mean0, mean1, Covi) 83 | 84 | print('Euclidean dist(m0, x)=%.2f > dist(m0, m2)=%.2f' % (d2_m0x, d2_m0m2)) 85 | print('Mahalanobis dist(m0, x)=%.2f < dist(m0, m2)=%.2f' % (dm_m0x, dm_m0m2)) 86 | 87 | 88 | ''' 89 | ## Exercise 90 | 91 | - Write a function `euclidean(a, b)` 
that compute the euclidean distance 92 | - Write a function `mahalanobis(a, b, Covi)` that compute the euclidean 93 | distance, with the inverse of the covariance matrix. Use `scipy.linalg.inv(Cov)` 94 | to invert your matrix. 95 | ''' 96 | def euclidian(a, b): 97 | return np.sqrt(np.sum((a - b) ** 2)) 98 | 99 | def mahalanobis(a, b, cov_inv): 100 | return np.sqrt(np.dot(np.dot((a - b), cov_inv), (a - b).T)) 101 | 102 | assert mahalanobis(mean0, mean1, Covi) == dm_m0m2 103 | assert euclidian(mean0, mean1) == d2_m0m2 104 | 105 | mahalanobis(X0, mean0, Covi) 106 | X = X0 107 | mean = mean0 108 | covi= Covi 109 | 110 | np.sqrt(np.dot(np.dot((X - mean), covi), (X - mean).T)) 111 | 112 | def mahalanobis(X, mean, covi): 113 | """ 114 | from scipy.spatial.distance import mahalanobis 115 | d2= np.array([mahalanobis(X[i], mean, covi) for i in range(X.shape[0])]) 116 | np.all(mahalanobis(X, mean, covi) == d2) 117 | """ 118 | return np.sqrt(np.sum(np.dot((X - mean), covi) * (X - mean), axis=1)) 119 | 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Statistics and Machine Learning in Python 2 | ========================================= 3 | 4 | - [pdf](ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/StatisticsMachineLearningPython.pdf) 5 | - [www](https://duchesnay.github.io/pystatsml) 6 | 7 | 8 | Structure 9 | --------- 10 | 11 | Courses are available in three formats: 12 | 13 | 1. Jupyter notebooks. 14 | 15 | 2. Python files using sphinx-gallery. 16 | 17 | 3. ReStructuredText files. 18 | 19 | All notebooks and python files are converted into `rst` format and then assembled together using sphinx. 20 | 21 | Directories and main files: 22 | 23 | introduction/ 24 | ├── machine_learning.rst 25 | └── python_ecosystem.rst 26 | 27 | python_lang/ # (Python language) 28 | ├── python_lang.py # (main file) 29 | └── python_lang_solutions.py 30 | 31 | scientific_python/ 32 | ├── matplotlib.ipynb 33 | ├── scipy_numpy.py 34 | ├── scipy_numpy_solutions.py 35 | ├── scipy_pandas.py 36 | └── scipy_pandas_solutions.py 37 | 38 | statistics/ # (Statistics) 39 | ├── stat_multiv.ipynb # (multivariate statistics) 40 | ├── stat_univ.ipynb # (univariate statistics) 41 | ├── stat_univ_solutions.ipynb 42 | ├── stat_univ_lab01_brain-volume.py # (lab) 43 | ├── stat_univ_solutions.ipynb 44 | └── time_series.ipynb 45 | 46 | machine_learning/ # (Machine learning) 47 | ├── clustering.ipynb 48 | ├── decomposition.ipynb 49 | ├── decomposition_solutions.ipynb 50 | ├── linear_classification.ipynb 51 | ├── linear_regression.ipynb 52 | ├── non_linear_prediction.ipynb 53 | ├── resampling.ipynb 54 | ├── resampling_solution.py 55 | └── sklearn.ipynb 56 | 57 | optimization/ 58 | ├── optim_gradient_descent.ipynb 59 | └── optim_gradient_descent_lab.ipynb 60 | 61 | deep_learning/ 62 | ├── dl_backprop_numpy-pytorch-sklearn.ipynb 63 | ├── dl_cnn_cifar10_pytorch.ipynb 64 | ├── dl_mlp_mnist_pytorch.ipynb 65 | └── dl_transfer-learning_cifar10-ants- 66 | 67 | 68 | Build 69 | ----- 70 | 71 | After pulling the repository execute Jupyter notebooks (outputs are expected to be removed before git submission). 
72 | ``` 73 | make exe 74 | ``` 75 | 76 | Build the pdf file (requires LaTeX): 77 | ``` 78 | make pdf 79 | ``` 80 | 81 | Build the html files: 82 | ``` 83 | make html 84 | ``` 85 | 86 | Clean everything and strip output from Jupyter notebooks (unnecessary if you installed the nbstripout hook): 87 | ``` 88 | make clean 89 | ``` 90 | 91 | Dependencies 92 | ------------ 93 | The easiest way is to install Anaconda from https://www.continuum.io with Python >= 3. Anaconda provides: 94 | 95 | - python 3 96 | - ipython 97 | - Jupyter 98 | - pandoc 99 | - LaTeX to generate pdf 100 | 101 | Then install: 102 | 103 | 1. [sphinx-gallery](https://sphinx-gallery.readthedocs.io) 104 | 105 | ``` 106 | pip install sphinx-gallery 107 | ``` 108 | 109 | 2. [nbstripout](https://github.com/kynan/nbstripout) 110 | 111 | ``` 112 | conda install -c conda-forge nbstripout 113 | ``` 114 | 115 | Configure your git repository with the nbstripout pre-commit hook if you don't want to track notebook output in VCS. 116 | 117 | ``` 118 | cd pystatsml 119 | nbstripout --install 120 | ``` 121 | 122 | 3. Git [LFS](https://git-lfs.github.com/) for datasets 123 | 124 | a. Install Git LFS 125 | 126 | ``` 127 | git lfs install 128 | ``` 129 | 130 | b. Select the file types you'd like Git LFS to manage 131 | 132 | ``` 133 | git lfs track "*.npz" 134 | git lfs track "*.npy" 135 | git lfs track "*.nii" 136 | git lfs track "*.nii.gz" 137 | git lfs track "*.csv" 138 | ``` 139 | 140 | c. Now make sure .gitattributes is tracked: 141 | 142 | ``` 143 | git add .gitattributes 144 | ``` 145 | 146 | 4. LaTeX (optional, for pdf) 147 | 148 | For Debian-like Linux distributions: 149 | 150 | ``` 151 | sudo apt-get install latexmk texlive-latex-extra 152 | ``` 153 | 154 | 5. MS docx (optional) 155 | 156 | [docxbuilder](https://docxbuilder.readthedocs.io/en/latest/docxbuilder.html) 157 | 158 | a. Install 159 | 160 | ``` 161 | pip install docxbuilder 162 | pip install docxbuilder[math] 163 | ``` 164 | 165 | b. Build 166 | 167 | ``` 168 | make docx 169 | ``` 170 | -------------------------------------------------------------------------------- /deep_learning/README.md: -------------------------------------------------------------------------------- 1 | # Course 2 | 3 | ## Introduction to Deep Learning 4 | 5 | - [Slides:](https://m2dsupsdlclass.github.io/lectures-labs/slides/01_intro_to_deep_learning/index.html) 6 | 7 | ## 1. Optimisation: Gradient descent and Backpropagation 8 | 9 | - [Slides:](https://m2dsupsdlclass.github.io/lectures-labs/slides/02_backprop/index.html) 10 | 11 | - [Lab: `dl_optim-backprop_numpy-pytorch-sklearn.ipynb`](https://github.com/duchesnay/pystatsml/tree/master/deep_learning/dl_optim-backprop_numpy-pytorch-sklearn.ipynb) 12 | 13 | ## 2. Multi-Layer Perceptron 14 | 15 | - [Lab: `dl_mlp_mnist_pytorch.ipynb`](https://github.com/duchesnay/pystatsml/tree/master/deep_learning/dl_mlp_mnist_pytorch.ipynb) 16 | 17 | 18 | ## 3. Convolutional Neural Networks (CNN) 19 | 20 | - [Slides:](https://m2dsupsdlclass.github.io/lectures-labs/slides/04_conv_nets/index.html) 21 | 22 | - [Lab: `dl_cnn_mnist_pytorch.ipynb`](https://github.com/duchesnay/pystatsml/tree/master/deep_learning/dl_cnn_mnist_pytorch.ipynb) 23 | 24 | 25 | ## 4. 
Transfer Learning 26 | 27 | - [Lab: `dl_transfer-learning_ants-bees_pytorch.ipynb`](https://github.com/duchesnay/pystatsml/tree/master/deep_learning/dl_transfer-learning_ants-bees_pytorch.ipynb) 28 | 29 | # Resources 30 | 31 | ## Deep Learning class, Master Datascience Paris Saclay 32 | 33 | [Deep Learning class, Master Datascience Paris Saclay](https://github.com/m2dsupsdlclass/lectures-labs) 34 | 35 | ## Stanford ML courses 36 | 37 | - [Deep learning - cs-231n @stanford.edu](http://cs231n.stanford.edu/) 38 | 39 | - [Deep Learning Cheatsheet - cs-230 @stanford.edu](https://stanford.edu/~shervine/teaching/cs-230/) 40 | 41 | - [Machine Learning Cheatsheet - cs-229 @stanford.edu](https://stanford.edu/~shervine/teaching/cs-229/) 42 | 43 | 44 | ## Anaconda 45 | 46 | Download from [www.anaconda.com](https://www.anaconda.com/) 47 | 48 | Choose Python 3.x 49 | 50 | Update conda: 51 | 52 | conda update -n base -c defaults conda 53 | 54 | ## Pytorch 55 | 56 | 57 | - [WWW tutorials](https://pytorch.org/tutorials/) 58 | 59 | - [github tutorials](https://github.com/pytorch/tutorials) 60 | 61 | - [github examples](https://github.com/pytorch/examples) 62 | 63 | 64 | ### Installation 65 | 66 | [pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/) 67 | 68 | 69 | **Anaconda + No CUDA** 70 | 71 | conda install pytorch-cpu torchvision-cpu -c pytorch 72 | 73 | Check that torch can be loaded. If CUDA is not available, the CPU will be used instead of the GPU. 74 | 75 | python3 -c "import torch; print(torch.__version__, torch.cuda.is_available())" 76 | 77 | **Anaconda + CUDA 10:** 78 | 79 | conda install pytorch torchvision cudatoolkit=10.0 -c pytorch 80 | 81 | 82 | ## Optional: install Keras for the Deep Learning class, Master Datascience Paris Saclay 83 | 84 | [Deep Learning class](https://github.com/m2dsupsdlclass/lectures-labs) 85 | 86 | Create a new environment called ``py36`` where we will install Python 3.6 for Keras and TensorFlow: 87 | 88 | conda create --name py36 89 | conda activate py36 90 | 91 | 92 | [installation instructions](https://github.com/m2dsupsdlclass/lectures-labs/blob/master/installation_instructions.md) 93 | 94 | Open a console / terminal and update the following packages with conda: 95 | 96 | conda activate py36 97 | conda install python=3.6 numpy scikit-learn jupyter ipykernel matplotlib pip 98 | conda install pandas h5py pillow scikit-image lxml tensorflow keras 99 | 100 | Check that you can import tensorflow with the python from anaconda: 101 | 102 | python3 -c "import tensorflow as tf; print(tf.__version__)" 103 | 104 | If you have several installations of Python on your system (virtualenv, conda environments...), it can be confusing to select the correct Python environment from the Jupyter interface. 
You can name this environment for instance "py36" and reference it as a Jupyter kernel: 105 | 106 | python3 -m ipykernel install --user --name py36 --display-name py36 107 | 108 | To take pictures with the webcam we will also need opencv-python: 109 | 110 | python3 -m pip install opencv-python 111 | 112 | Clone Repository: 113 | 114 | git clone https://github.com/m2dsupsdlclass/lectures-labs 115 | 116 | 117 | # Misc 118 | 119 | ## Draw neural net 120 | 121 | [Draw neural net](http://alexlenail.me/NN-SVG/index.html) 122 | 123 | -------------------------------------------------------------------------------- /utils/ml_resampling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Apr 5 10:48:25 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | 8 | 9 | ''' 10 | Regression 11 | ========== 12 | ''' 13 | 14 | import numpy as np 15 | from sklearn import datasets 16 | import sklearn.linear_model as lm 17 | from sklearn.grid_search import GridSearchCV 18 | import sklearn.metrics as metrics 19 | from sklearn.cross_validation import KFold 20 | 21 | # Dataset 22 | noise_sd = 10 23 | X, y, coef = datasets.make_regression(n_samples=50, n_features=100, noise=noise_sd, 24 | n_informative=2, random_state=42, coef=True) 25 | 26 | # Use this to tune the noise parameter such that snr < 5 27 | print("SNR:", np.std(np.dot(X, coef)) / noise_sd) 28 | 29 | # param grid over alpha & l1_ratio 30 | param_grid = {'alpha': 10. ** np.arange(-3, 3), 'l1_ratio':[.1, .5, .9]} 31 | 32 | 33 | # Warp 34 | model = GridSearchCV(lm.ElasticNet(max_iter=10000), param_grid, cv=5) 35 | 36 | # 1) Biased usage: fit on all data, ommit outer CV loop 37 | model.fit(X, y) 38 | print("Train r2:%.2f" % metrics.r2_score(y, model.predict(X))) 39 | print(model.best_params_) 40 | 41 | # 2) User made outer CV, useful to extract specific information 42 | cv = KFold(len(y), n_folds=5, random_state=42) 43 | y_test_pred = np.zeros(len(y)) 44 | y_train_pred = np.zeros(len(y)) 45 | alphas = list() 46 | 47 | for train, test in cv: 48 | X_train, X_test, y_train, y_test = X[train, :], X[test, :], y[train], y[test] 49 | model.fit(X_train, y_train) 50 | y_test_pred[test] = model.predict(X_test) 51 | y_train_pred[train] = model.predict(X_train) 52 | alphas.append(model.best_params_) 53 | 54 | print("Train r2:%.2f" % metrics.r2_score(y, y_train_pred)) 55 | print("Test r2:%.2f" % metrics.r2_score(y, y_test_pred)) 56 | print("Selected alphas:", alphas) 57 | 58 | # 3.) user-friendly sklearn for outer CV 59 | from sklearn.cross_validation import cross_val_score 60 | scores = cross_val_score(estimator=model, X=X, y=y, cv=cv) 61 | print("Test r2:%.2f" % scores.mean()) 62 | 63 | 64 | ''' 65 | 3.2.3.1. Specifying an objective metric 66 | 67 | By default, parameter search uses the score function of the estimator to evaluate a parameter setting. These are the sklearn.metrics.accuracy_score for classification and sklearn.metrics.r2_score for regression. For some applications, other scoring functions are better suited (for example in unbalanced classification, the accuracy score is often uninformative). An alternative scoring function can be specified via the scoring parameter to GridSearchCV, RandomizedSearchCV and many of the specialized cross-validation tools described below. See The scoring parameter: defining model evaluation rules for more details. 
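For instance, a minimal sketch (assuming the ``param_grid`` defined above, and a recent scikit-learn where GridSearchCV lives in ``sklearn.model_selection``):

    from sklearn.model_selection import GridSearchCV
    # select hyper-parameters with mean absolute error instead of the default r2
    model = GridSearchCV(lm.ElasticNet(max_iter=10000), param_grid,
                         cv=5, scoring='neg_mean_absolute_error')
    model.fit(X, y)
    print(model.best_params_)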
68 | ''' 69 | 70 | ''' 71 | Bootstrapping 72 | ''' 73 | 74 | import numpy as np 75 | from sklearn import datasets 76 | import sklearn.linear_model as lm 77 | import sklearn.metrics as metrics 78 | import pandas as pd 79 | 80 | # Dataset 81 | n_features = 5 82 | n_features_info = 2 83 | n_samples = 100 84 | X = np.random.randn(n_samples, n_features) 85 | beta = np.zeros(n_features) 86 | beta[:n_features_info] = 1 87 | Xbeta = np.dot(X, beta) 88 | eps = np.random.randn(n_samples) 89 | y = Xbeta + eps 90 | 91 | 92 | model = lm.RidgeCV() 93 | model.fit(X, y) 94 | print("Coefficients on all data:") 95 | print(model.coef_) 96 | 97 | nboot = 100 # !! Should be at least 1000 98 | scores_names = ["r2"] 99 | scores_boot = np.zeros((nboot, len(scores_names))) 100 | coefs_boot = np.zeros((nboot, X.shape[1])) 101 | 102 | orig_all = np.arange(X.shape[0]) 103 | for boot_i in range(nboot): 104 | boot_tr = np.random.choice(orig_all, size=len(orig_all), replace=True) 105 | boot_te = np.setdiff1d(orig_all, boot_tr, assume_unique=False) 106 | Xtr, ytr = X[boot_tr, :], y[boot_tr] 107 | Xte, yte = X[boot_te, :], y[boot_te] 108 | model.fit(Xtr, ytr) 109 | y_pred = model.predict(Xte).ravel() 110 | #y_pred.shape, prob_pred.shape, yte.shape 111 | scores_boot[boot_i, :] = metrics.r2_score(yte, y_pred) 112 | coefs_boot[boot_i, :] = model.coef_ 113 | 114 | scores_boot = pd.DataFrame(scores_boot, columns=scores_names) 115 | scores_stat = scores_boot.describe(percentiles=[.99, .95, .5, .1, .05, 0.01]) 116 | 117 | print("r-squared: Mean=%.2f, SE=%.2f, CI=(%.2f %.2f)" %\ 118 | tuple(scores_stat.ix[["mean", "std", "5%", "95%"], "r2"])) 119 | 120 | 121 | coefs_boot = pd.DataFrame(coefs_boot) 122 | coefs_stat = coefs_boot.describe(percentiles=[.99, .95, .5, .1, .05, 0.01]) 123 | print("Coefficients distribution") 124 | print(coefs_stat) 125 | -------------------------------------------------------------------------------- /utils/plot_ml_linear_regression_overfitting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Dec 2 23:25:38 2020 5 | 6 | @author: ed203246 7 | """ 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns # nicer plots 11 | import sklearn.metrics as metrics 12 | import sklearn.linear_model as lm 13 | 14 | # %% Plot train/test with inreasing size 15 | 16 | def fit_on_increasing_size(model): 17 | n_samples = 100 18 | n_features_ = np.arange(10, 350, 20) 19 | r2_train, r2_test, snr = [], [], [] 20 | for n_features in n_features_: 21 | # Sample the dataset (* 2 nb of samples) 22 | n_features_info = int(n_features / 10) 23 | np.random.seed(27) # Make reproducible 27 24 | X = np.random.randn(n_samples * 2, n_features) 25 | beta = np.zeros(n_features) 26 | beta[:n_features_info] = .7 27 | Xbeta = np.dot(X, beta) 28 | eps = np.random.randn(n_samples * 2) 29 | y = Xbeta + eps 30 | # Split the dataset into train and test sample 31 | Xtrain, Xtest = X[:n_samples, :], X[n_samples:, :] 32 | ytrain, ytest = y[:n_samples], y[n_samples:] 33 | # fit/predict 34 | lr = model.fit(Xtrain, ytrain) 35 | y_pred_train = lr.predict(Xtrain) 36 | y_pred_test = lr.predict(Xtest) 37 | snr.append(Xbeta.std() / eps.std()) 38 | r2_train.append(metrics.r2_score(ytrain, y_pred_train)) 39 | r2_test.append(metrics.r2_score(ytest, y_pred_test)) 40 | return n_features_, np.array(r2_train), np.array(r2_test), np.array(snr) 41 | 42 | def plot_r2_snr(n_features_, r2_train, r2_test, xvline, snr, ax, 
title): 43 | """ 44 | Two scales plot. Left y-axis: train test r-squared. Right y-axis SNR. 45 | """ 46 | ax.plot(n_features_, r2_train, label="Train r2", linewidth=2, color=sns.color_palette()[0]) 47 | ax.plot(n_features_, r2_test, label="Test r2", linewidth=2, color=sns.color_palette()[1]) 48 | ax.axvline(x=xvline, linewidth=2, color='k', ls='--') 49 | ax.fill_between(n_features_, r2_test, 0, alpha=.3, color=sns.color_palette()[1]) 50 | ax.fill_between(n_features_, r2_test, r2_train, alpha=.3, color=sns.color_palette()[0]) 51 | ax.axhline(y=0, linewidth=1, color='k', ls='--') 52 | ax.set_ylim(-0.2, 1.1) 53 | ax.set_ylabel("r2", fontsize=16) 54 | ax.legend(loc='best') 55 | ax.grid(True) 56 | ax.set_title(title, fontsize=20) 57 | ax_right = ax.twinx() 58 | ax_right.plot(n_features_, snr, '--', color='gray', label="SNR", linewidth=1) 59 | ax_right.set_ylabel("SNR", color='gray') 60 | for tl in ax_right.get_yticklabels(): 61 | tl.set_color('gray') 62 | 63 | # plot 64 | fig, axis = plt.subplots(4, 1, figsize=(9, 12), sharex=True) 65 | 66 | 67 | # %% No regularization 68 | 69 | mod = lm.LinearRegression() 70 | n_features, r2_train, r2_test, snr = fit_on_increasing_size(model=mod) 71 | argmax = n_features[np.argmax(r2_test)] 72 | plot_r2_snr(n_features, r2_train, r2_test, argmax, snr, axis[0], 'Regression') 73 | 74 | # %% L2 regularization 75 | 76 | mod = lm.Ridge(alpha=10) # lambda is alpha! 77 | n_features, r2_train, r2_test, snr = fit_on_increasing_size(model=mod) 78 | argmax = n_features[np.argmax(r2_test)] 79 | plot_r2_snr(n_features, r2_train, r2_test, argmax, snr, axis[1], 'Ridge') 80 | 81 | # %% L1 regularization 82 | 83 | mod = lm.Lasso(alpha=.1) # lambda is alpha ! 84 | n_features, r2_train, r2_test, snr = fit_on_increasing_size(model=mod) 85 | argmax = n_features[np.argmax(r2_test)] 86 | plot_r2_snr(n_features, r2_train, r2_test, argmax, snr, axis[2], 'Lasso') 87 | 88 | 89 | # %% L1-L2 regularization 90 | 91 | mod = lm.ElasticNet(alpha=.5, l1_ratio=.5) 92 | n_features, r2_train, r2_test, snr = fit_on_increasing_size(model=mod) 93 | argmax = n_features[np.argmax(r2_test)] 94 | plot_r2_snr(n_features, r2_train, r2_test, argmax, snr, axis[3], 'ElasticNet') 95 | 96 | 97 | 98 | plt.tight_layout() 99 | axis[3].set_xlabel("Number of input features", fontsize=16) 100 | plt.savefig("/home/ed203246/git/pystatsml/images/linear_regression_penalties.png") 101 | 102 | # %% Codes examples: 103 | 104 | from sklearn import datasets 105 | import sklearn.linear_model as lm 106 | 107 | X, y = datasets.make_regression(n_features=5, n_informative=2, random_state=0) 108 | 109 | lr = lm.LinearRegression().fit(X, y) 110 | 111 | l2 = lm.Ridge(alpha=10).fit(X, y) # lambda is alpha! 112 | print(l2.coef_) 113 | 114 | l1 = lm.Lasso(alpha=1).fit(X, y) # lambda is alpha ! 
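# Unlike the Ridge fit above, the L1 penalty drives some coefficients exactly to zero (sparsity), which the print below illustrates.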
115 | print(l1.coef_) 116 | 117 | l1l2 = lm.ElasticNet(alpha=1, l1_ratio=.9).fit(X, y) 118 | -------------------------------------------------------------------------------- /utils/plot_ml_linear_classification_overfitting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Dec 2 23:25:38 2020 5 | 6 | @author: ed203246 7 | """ 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns # nicer plots 11 | import sklearn.metrics as metrics 12 | import sklearn.linear_model as lm 13 | 14 | import sklearn.datasets as datasets 15 | from sklearn.model_selection import train_test_split 16 | 17 | # %% Plot train/test with inreasing size 18 | 19 | def logistic(x): return 1 / (1 + np.exp(-x)) 20 | 21 | def fit_on_increasing_size(model): 22 | n_samples = 100 23 | n_features_ = np.arange(20, 2000, 100) 24 | bacc_train, bacc_test = [], [] 25 | for n_features in n_features_: 26 | n_features_info = int(n_features / 10) 27 | X, y = datasets.make_classification(n_samples=n_samples * 2, n_features=n_features, 28 | n_informative=n_features_info, n_redundant=int(n_features_info / 2), 29 | n_classes=2, 30 | n_clusters_per_class=1, 31 | weights=None, flip_y=0.01, 32 | class_sep=.5, 33 | hypercube=True, shift=0.0, scale=1.0, shuffle=True, 34 | random_state=1) 35 | """ 36 | # Sample the dataset (* 2 nb of samples) 37 | n_features_info = int(n_features / 10) 38 | np.random.seed(27) # Make reproducible 27 39 | X = np.random.randn(n_samples * 2, n_features) 40 | beta = np.zeros(n_features) 41 | beta[:n_features_info] = 1 42 | Xbeta = np.dot(X, beta) 43 | eps = np.random.randn(n_samples * 2) 44 | proba = logistic(Xbeta + eps) 45 | y = (proba >= 0.5).astype(int) 46 | """ 47 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=42) 48 | # fit/predict 49 | mod.fit(X_train, y_train) 50 | y_pred_train = mod.predict(X_train) 51 | y_pred_test = mod.predict(X_test) 52 | #snr.append(Xbeta.std() / eps.std()) 53 | bacc_train.append(metrics.balanced_accuracy_score(y_train, y_pred_train)) 54 | bacc_test.append(metrics.balanced_accuracy_score(y_test, y_pred_test)) 55 | return n_features_, np.array(bacc_train), np.array(bacc_test) 56 | 57 | def plot_bacc(n_features_, bacc_train, bacc_test, xvline, ax, title): 58 | """ 59 | Two scales plot. Left y-axis: train test r-squared. Right y-axis SNR. 60 | """ 61 | ax.plot(n_features_, bacc_train, label="Train Acc", linewidth=2, color=sns.color_palette()[0]) 62 | ax.plot(n_features_, bacc_test, label="Test Acc", linewidth=2, color=sns.color_palette()[1]) 63 | ax.axvline(x=xvline, linewidth=2, color='k', ls='--') 64 | ax.fill_between(n_features_, bacc_test, 0.5, alpha=.3, color=sns.color_palette()[1]) 65 | ax.fill_between(n_features_, bacc_test, bacc_train, alpha=.3, color=sns.color_palette()[0]) 66 | ax.axhline(y=0.5, linewidth=1, color='k', ls='--') 67 | ax.set_ylim(0.3, 1.1) 68 | ax.set_ylabel("r2", fontsize=16) 69 | ax.legend(loc='best') 70 | ax.grid(True) 71 | ax.set_title(title, fontsize=20) 72 | 73 | # plot 74 | fig, axis = plt.subplots(4, 1, figsize=(9, 12), sharex=True) 75 | 76 | 77 | # %% No regularization 78 | 79 | #fig, axis = plt.subplots(1, 1, figsize=(9, 12), sharex=True) 80 | #fig, axis = plt.subplots(1, 1, figsize=(9, 9), sharex=True) 81 | 82 | #mod = lm.LogisticRegression(penalty='none') 83 | mod = lm.LogisticRegression(penalty='l2', C=.1e16) # lambda = 1 / C! 
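# Note: C is the inverse of the regularization strength (C = 1 / lambda), so this very large value (.1e16 = 1e15) makes the L2 penalty negligible and approximates the unpenalized fit commented out above.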
84 | 85 | n_features, bacc_train, bacc_test = fit_on_increasing_size(model=mod) 86 | argmax = n_features[np.argmax(bacc_test)] 87 | plot_bacc(n_features, bacc_train, bacc_test, argmax, axis[0], 'Regression') 88 | 89 | # %% L2 regularization 90 | 91 | mod = lm.LogisticRegression(penalty='l2', C=1e-2) # lambda = 1 / C! 92 | n_features, bacc_train, bacc_test = fit_on_increasing_size(model=mod) 93 | argmax = n_features[np.argmax(bacc_test)] 94 | plot_bacc(n_features, bacc_train, bacc_test, argmax, axis[1], 'Ridge') 95 | 96 | # %% L1 regularization 97 | 98 | mod = lm.LogisticRegression(penalty='l1', C=.1, solver='saga') # lambda = 1 / C! 99 | n_features, bacc_train, bacc_test = fit_on_increasing_size(model=mod) 100 | argmax = n_features[np.argmax(bacc_test)] 101 | plot_bacc(n_features, bacc_train, bacc_test, argmax, axis[2], 'Lasso') 102 | 103 | 104 | # %% L1-L2 regularization 105 | 106 | mod = lm.LogisticRegression(penalty='elasticnet', C=.1, l1_ratio=0.5, solver='saga') 107 | n_features, bacc_train, bacc_test = fit_on_increasing_size(model=mod) 108 | argmax = n_features[np.argmax(bacc_test)] 109 | plot_bacc(n_features, bacc_train, bacc_test, argmax, axis[3], 'ElasticNet') 110 | 111 | 112 | 113 | plt.tight_layout() 114 | axis[3].set_xlabel("Number of input features", fontsize=16) 115 | #plt.savefig("/home/ed203246/git/pystatsml/images/linear_classification_penalties.png") 116 | 117 | # %% Codes examples: 118 | 119 | if False: 120 | from sklearn import datasets 121 | import sklearn.linear_model as lm 122 | 123 | X, y = datasets.make_regression(n_features=5, n_informative=2, random_state=0) 124 | 125 | lr = lm.LinearRegression().fit(X, y) 126 | 127 | l2 = lm.Ridge(alpha=10).fit(X, y) # lambda is alpha! 128 | print(l2.coef_) 129 | 130 | l1 = lm.Lasso(alpha=1).fit(X, y) # lambda is alpha ! 
131 | print(l1.coef_) 132 | 133 | l1l2 = lm.ElasticNet(alpha=1, l1_ratio=.9).fit(X, y) 134 | -------------------------------------------------------------------------------- /utils/stat_univar_statmodels.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | #import matplotlib.pyplot as plt 4 | #from statsmodels.sandbox.regression.predstd import wls_prediction_std 5 | 6 | np.random.seed(42) 7 | 8 | ''' 9 | Ordinary Least Squares 10 | ====================== 11 | ''' 12 | 13 | ''' 14 | Numpy 15 | ----- 16 | ''' 17 | import numpy as np 18 | import scipy 19 | np.random.seed(seed=42) # make the example reproducible 20 | 21 | # Dataset 22 | N, P = 50, 4 23 | X = np.random.normal(size= N * P).reshape((N, P)) 24 | ## Our model needs an intercept so we add a column of 1s: 25 | X[:, 0] = 1 26 | print(X[:5, :]) 27 | 28 | betastar = np.array([10, 1., .5, 0.1]) 29 | e = np.random.normal(size=N) 30 | y = np.dot(X, betastar) + e 31 | 32 | # Estimate the parameters 33 | Xpinv = scipy.linalg.pinv2(X) 34 | betahat = np.dot(Xpinv, y) 35 | print("Estimated beta:\n", betahat) 36 | 37 | ''' 38 | Linear model with statsmodel 39 | ---------------------------- 40 | ''' 41 | 42 | ''' 43 | Interfacing with numpy 44 | ~~~~~~~~~~~~~~~~~~~~~~ 45 | ''' 46 | import statsmodels.api as sm 47 | 48 | ## Fit and summary: 49 | model = sm.OLS(y, X).fit() 50 | print(model.summary()) 51 | 52 | # prediction of new values 53 | ypred = model.predict(X) 54 | 55 | # residuals + prediction == true values 56 | assert np.all(ypred + model.resid == y) 57 | 58 | ''' 59 | Interfacing with Pandas 60 | ~~~~~~~~~~~~~~~~~~~~~~ 61 | ''' 62 | import statsmodels.formula.api as smfrmla 63 | # Build a dataframe excluding the intercept 64 | df = pd.DataFrame(np.column_stack([X[:, 1:], y]), columns=['x1','x2', 'x3', 'y']) 65 | 66 | 67 | ## Fit and summary: 68 | model = smfrmla.ols("y ~ x1 + x2 + x2", df).fit() 69 | print(model.summary()) 70 | 71 | 72 | 73 | oneway = smfrmla.ols('salary ~ management + experience', salary).fit() 74 | 75 | twoway = smfrmla.ols('salary ~ education + management + experience', salary).fit() 76 | 77 | sm.stats.anova_lm(oneway, twoway) 78 | twoway.compare_f_test(oneway) 79 | 80 | oneway = smfrmla.ols('salary ~ management + experience', salary).fit() 81 | oneway.model.data.param_names 82 | oneway.model.data.exog 83 | 84 | print(twoway.model.data.param_names) 85 | print(twoway.model.data.exog[:10, :]) 86 | 87 | ttest_exp = oneway.t_test([0, 0, 1]) 88 | ttest_exp.pvalue, ttest_exp.tvalue 89 | print(ttest_exp) 90 | 91 | # Alternatively, you can specify the hypothesis tests using a string 92 | oneway.t_test('experience') 93 | 94 | ''' 95 | multiple comparison 96 | ''' 97 | 98 | import numpy as np 99 | np.random.seed(seed=42) # make example reproducible 100 | 101 | # Dataset 102 | import numpy as np 103 | np.random.seed(seed=42) # make example reproducible 104 | 105 | 106 | # Dataset 107 | n_samples, n_features = 100, 1000 108 | n_info = int(n_features/10) # number of features with information 109 | n1, n2 = int(n_samples/2), n_samples - int(n_samples/2) 110 | snr = .5 111 | Y = np.random.randn(n_samples, n_features) 112 | grp = np.array(["g1"] * n1 + ["g2"] * n2) 113 | 114 | # Add some group effect for Pinfo features 115 | Y[grp=="g1", :n_info] += snr 116 | 117 | # 118 | import scipy.stats as stats 119 | import matplotlib.pyplot as plt 120 | tvals, pvals = np.full(n_features, np.NAN), np.full(n_features, np.NAN) 121 | for j in range(n_features): 122 | 
tvals[j], pvals[j] = stats.ttest_ind(Y[grp=="g1", j], Y[grp=="g2", j], equal_var=True) 123 | 124 | fig, axis = plt.subplots(3, 1)#, sharex='col') 125 | 126 | axis[0].plot(range(n_features), tvals, 'o') 127 | axis[0].set_ylabel("t-value") 128 | 129 | axis[1].plot(range(n_features), pvals, 'o') 130 | axis[1].axhline(y=0.05, color='red', linewidth=3, label="p-value=0.05") 131 | #axis[1].axhline(y=0.05, label="toto", color='red') 132 | axis[1].set_ylabel("p-value") 133 | axis[1].legend() 134 | 135 | axis[2].hist([pvals[n_info:], pvals[:n_info]], 136 | stacked=True, bins=100, label=["Negatives", "Positives"]) 137 | axis[2].set_xlabel("p-value histogram") 138 | axis[2].set_ylabel("density") 139 | axis[2].legend() 140 | 141 | plt.tight_layout() 142 | 143 | 144 | 145 | ''' 146 | No correction 147 | ''' 148 | P, N = n_info, n_features - n_info # Positives, Negatives 149 | TP = np.sum(pvals[:n_info ] < 0.05) # True Positives 150 | FP = np.sum(pvals[n_info: ] < 0.05) # False Positives 151 | print("No correction, FP: %i (expected: %.2f), TP: %i" % (FP, N * 0.05, TP)) 152 | 153 | 154 | ''' 155 | False negative rate (FNR) 156 | FNR} = FN} / (TP} + FN}) = 1-TPR} 157 | ''' 158 | FNR = 159 | print("No correction, false positives: %i (expected value: %i)" % (FP, 0.05 * (n_features - TP))) 160 | 161 | 162 | 163 | ## Bonferoni 164 | import statsmodels.sandbox.stats.multicomp as multicomp 165 | _, pvals_fwer, _, _ = multicomp.multipletests(pvals, alpha=0.05, 166 | method='bonferroni') 167 | TP = np.sum(pvals_fwer[:n_info ] < 0.05) # True Positives 168 | FP = np.sum(pvals_fwer[n_info: ] < 0.05) # False Positives 169 | print("FWER correction, FP: %i, TP: %i" % (FP, TP)) 170 | 171 | 172 | ## FDR 173 | import statsmodels.sandbox.stats.multicomp as multicomp 174 | _, pvals_fdr, _, _ = multicomp.multipletests(pvals, alpha=0.05, 175 | method='fdr_bh') 176 | TP = np.sum(pvals_fdr[:n_info ] < 0.05) # True Positives 177 | FP = np.sum(pvals_fdr[n_info: ] < 0.05) # False Positives 178 | 179 | print("FDR correction, FP: %i, TP: %i" % (FP, TP)) 180 | 181 | ''' 182 | Binary classification measures: 183 | 184 | - **Sensitivity** or **true positive rate (TPR)**, eqv. with hit rate, recall: 185 | 186 | TPR = TP / P = TP / (TP+FN) 187 | 188 | - specificity (SPC) or true negative rate 189 | 190 | SPC = TN / N = TN / (TN+FP) 191 | 192 | - precision or positive predictive value (PPV) 193 | 194 | PPV = TP / (TP + FP) 195 | 196 | - negative predictive value (NPV) 197 | 198 | NPV = TN / (TN + FN) 199 | 200 | - fall-out or **false positive rate (FPR)** 201 | 202 | FPR = FP / N = FP / (FP + TN) = 1-SPC 203 | 204 | 205 | - false negative rate (FNR) 206 | 207 | FNR = FN / (TP + FN) = 1-TPR 208 | 209 | - false discovery rate (FDR) 210 | 211 | FDR = FP / (TP + FP) = 1 - PPV 212 | 213 | ''' 214 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = build 9 | NTBOOK = $(shell ls scientific_python/*.ipynb statistics/*.ipynb machine_learning/*.ipynb optimization/*.ipynb deep_learning/*.ipynb) 10 | # Notebook to execute. 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = build
9 | NTBOOK = $(shell ls scientific_python/*.ipynb statistics/*.ipynb machine_learning/*.ipynb optimization/*.ipynb deep_learning/*.ipynb)
10 | # Notebooks to execute: exclude the deep_learning notebooks (they require a GPU).
11 | NTBOOK_TO_EXE = $(shell ls scientific_python/*.ipynb statistics/*.ipynb machine_learning/*.ipynb optimization/*.ipynb)
12 | 
13 | #NTBOOK = $(shell ls statistics/*.ipynb)
14 | NTBOOK_FILES = $(NTBOOK:.ipynb=_files)
15 | #SRC = $(shell ls python/*.py)
16 | RST = $(NTBOOK:.ipynb=.rst) $(SRC:.py=.rst)
17 | #$(info $(NTBOOK))
18 | #$(info $(RST))
19 | #$(info $(NTBOOK_FILES))
20 | #$(info $(PYTORST))
21 | 
22 | # User-friendly check for sphinx-build
23 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
24 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
25 | endif
26 | 
27 | # Internal variables.
28 | PAPEROPT_a4 = -D latex_paper_size=a4
29 | PAPEROPT_letter = -D latex_paper_size=letter
30 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
31 | # the i18n builder cannot share the environment and doctrees with the others
32 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
33 | 
34 | 
35 | 
36 | #$(shell find notebooks -name "*.ipynb" -exec bash -c -exec sh -c 'echo "$${1%.ipynb}.rst"' _ {} \;)
37 | 
38 | .SUFFIXES: .rst .ipynb .py
39 | 
40 | .PHONY: help clean html dirhtml singlehtml htmlhelp epub latex latexpdf text changes linkcheck doctest coverage gettext exe
41 | 
42 | help:
43 | @echo "Please use \`make <target>' where <target> is one of"
44 | @echo " pdf to make LaTeX files and run them through pdflatex"
45 | @echo " html to make standalone HTML files"
46 | @echo " exe to run the jupyter notebooks, except those in deep_learning that require a GPU."
47 | @echo " clean rm BUILDDIR, auto_gallery, rst files" 48 | @echo " cleanall rm BUILDDIR, auto_gallery, rst files and clear output of notebooks" 49 | @echo " dirhtml to make HTML files named index.html in directories" 50 | @echo " singlehtml to make a single large HTML file" 51 | @echo " epub to make an epub" 52 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 53 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 54 | @echo " text to make text files" 55 | @echo " changes to make an overview of all changed/added/deprecated items" 56 | @echo " linkcheck to check all external links for integrity" 57 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 58 | @echo " coverage to run coverage check of the documentation (if enabled)" 59 | 60 | # Rule to convert notebook to rst 61 | #.ipynb.rst: 62 | %.rst : %.ipynb 63 | jupyter nbconvert --to rst $< 64 | mv $@ $@.filtered 65 | cat $@.filtered|bin/filter_fix_rst.py > $@ 66 | rm -f $@.filtered 67 | 68 | # jupyter nbconvert --to rst --stdout $< | bin/filter_fix_rst.py > $@ 69 | # jupyter nbconvert --to rst $< --output $@ 70 | 71 | debug: 72 | @echo $(RST) 73 | 74 | 75 | rst: $(RST) 76 | 77 | clean: 78 | rm -rf $(BUILDDIR)/* 79 | rm -rf auto_gallery/ 80 | rm -f $(RST) 81 | rm -rf $(NTBOOK_FILES) 82 | 83 | cleanall: 84 | rm -rf $(BUILDDIR)/* 85 | rm -rf auto_gallery/ 86 | rm -f $(RST) 87 | rm -rf $(NTBOOK_FILES) 88 | for nb in $(NTBOOK) ; do jupyter nbconvert --clear-output $$nb; done 89 | 90 | exe: 91 | @echo "Execute notebooks" 92 | for nb in $(NTBOOK_TO_EXE) ; do jupyter nbconvert --to notebook --execute $$nb --output $$(basename $$nb); done 93 | # $(EXEIPYNB) $(NTBOOK) 94 | # @echo toto nbconvert --to notebook --execute $< --output $(basename $<) 95 | 96 | html: rst 97 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 98 | @echo 99 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 100 | 101 | dirhtml: rst 102 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 103 | @echo 104 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 105 | 106 | singlehtml: rst 107 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 108 | @echo 109 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 110 | 111 | docx: rst 112 | $(SPHINXBUILD) -b docx $(ALLSPHINXOPTS) $(BUILDDIR)/docx 113 | @echo 114 | @echo "Build finished. The docx page is in $(BUILDDIR)/docx." 115 | 116 | epub: rst 117 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 118 | @echo 119 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 120 | 121 | latex: rst 122 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 123 | @echo 124 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 125 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 126 | "(use \`make latexpdf' here to do that automatically)." 127 | 128 | latexpdf: rst 129 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 130 | @echo "Running LaTeX files through pdflatex..." 131 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 132 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 133 | cp build/latex/StatisticsMachineLearningPython.pdf StatisticsMachineLearningPython.pdf 134 | 135 | pdf: latexpdf 136 | 137 | text: rst 138 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 139 | @echo 140 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 
141 | 
142 | changes: rst
143 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
144 | @echo
145 | @echo "The overview file is in $(BUILDDIR)/changes."
146 | 
147 | linkcheck: rst
148 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
149 | @echo
150 | @echo "Link check complete; look for any errors in the above output " \
151 | "or in $(BUILDDIR)/linkcheck/output.txt."
152 | 
-------------------------------------------------------------------------------- /info.rst: --------------------------------------------------------------------------------
1 | gh-pages
2 | --------
3 | 
4 | TODO: do it with circleci:
5 | 
6 | - https://circleci.com/blog/deploying-documentation-to-github-pages-with-continuous-integration/
7 | - https://github.com/jklukas/docs-on-gh-pages
8 | 
9 | 
10 | Publishing sphinx-generated docs on github:
11 | 
12 | https://daler.github.io/sphinxdoc-test/includeme.html
13 | 
14 | 
15 | 
16 | Upload to github
17 | ----------------
18 | 
19 | 
20 | "$WD/build/html" contains the pystatsml website. Now upload it to the GitHub server: clone the repository into a temporary directory and check out the gh-pages branch.
21 | 
22 | First time
23 | ```
24 | WD=~/git/pystatsml
25 | cd ~/git
26 | mv pystatsml_gh-pages pystatsml_gh-pages.bak
27 | git clone git@github.com:duchesnay/pystatsml.git pystatsml_gh-pages
28 | git symbolic-ref HEAD refs/heads/gh-pages
29 | rm .git/index
30 | git clean -fdx
31 | cp -r $WD/build/html/* ./
32 | cp -r $WD/auto_gallery ./
33 | git add .
34 | git add -f auto_gallery
35 | git add -f _sources
36 | git add -f _static
37 | git add -f _images
38 | touch .nojekyll
39 | gedit index.html # see below
40 | git commit -am "gh-pages First commit"
41 | git push origin gh-pages
42 | firefox index.html
43 | ```
44 | 
45 | Update
46 | ```
47 | WD=~/git/pystatsml
48 | cd $WD
49 | make pdf html singlehtml
50 | cd ~/git/pystatsml_gh-pages
51 | git checkout gh-pages
52 | rsync -avu $WD/build/html/* ./
53 | rsync -avu $WD/auto_gallery ./
54 | git add .
55 | git add -f auto_gallery
56 | git add -f _sources
57 | git add -f _static
58 | git add -f _images
59 | meld index.html index.html.save
60 | #gedit # see below
61 | git commit -am "gh-pages update commit"
62 | git push origin gh-pages
63 | firefox index.html
64 | ```
65 | 
66 | Then
67 | ```
68 | gedit index.html
69 | ```
70 | Replace:
71 | ```
72 | 
73 | [HTML heading block whose title reads "Phantom"]
74 | 
75 | ```
76 | by
77 | 
78 | ```
79 | 
80 | [HTML heading block with the title "Statistics and Machine Learning in
81 | Python"]
82 | 
83 | 
84 | 
85 | 
86 | [HTML block with the author line "Edouard Duchesnay, Tommy Löfstedt, Feki Younes"]
87 | ``` 88 | 89 | Then 90 | 91 | ``` 92 | git commit -am "Title and authors" 93 | git push origin gh-pages 94 | firefox $WD/build/html/index.html 95 | ``` 96 | 97 | Now, you can visit your updated website at https://duchesnay.github.io/pystatsml. 98 | 99 | 100 | ML Resources 101 | ------------ 102 | 103 | - **my_tech_resources** 104 | https://github.com/JamesLavin/my_tech_resources 105 | 106 | - **Practical Machine Learning Course Notes (in R)** 107 | https://sux13.github.io/DataScienceSpCourseNotes/8_PREDMACHLEARN/Practical_Machine_Learning_Course_Notes.html 108 | 109 | - **Computational Statistics in Python** 110 | https://people.duke.edu/~ccc14/sta-663/index.html 111 | 112 | - **scipy-lectures** 113 | 114 | https://github.com/scipy-lectures/scipy-lecture-notes 115 | 116 | - **Scientific Python & Software engineering best practices** 117 | https://github.com/paris-saclay-cds/python-workshop 118 | 119 | - **Deep Learning course in python** 120 | https://github.com/m2dsupsdlclass/lectures-labs 121 | 122 | - **Others** 123 | https://github.com/justmarkham/DAT4 124 | 125 | http://statweb.stanford.edu/~jtaylo/courses/stats202/index.html 126 | 127 | http://www.dataschool.io/ 128 | 129 | https://onlinecourses.science.psu.edu/stat857/node/141 130 | 131 | https://github.com/rasbt/python-machine-learning-book 132 | 133 | https://onlinecourses.science.psu.edu/stat505/ 134 | 135 | http://www.kdnuggets.com/2016/04/top-10-ipython-nb-tutorials.html 136 | 137 | 138 | Jupyter Notebooks 139 | ----------------- 140 | 141 | https://jupyterbook.org/advanced/advanced.html#jupyter-cell-tags 142 | 143 | 144 | Markdown 145 | -------- 146 | http://daringfireball.net/projects/markdown/basics 147 | 148 | R with Jupyther 149 | ~~~~~~~~~~~~~~~ 150 | 151 | conda install -c r r-essentials 152 | 153 | Sphinx 154 | ------ 155 | 156 | http://sphinx-doc.org/ 157 | 158 | IPython notebooks + Sphinx 159 | -------------------------- 160 | 161 | http://sphinx-ipynb.readthedocs.org/en/latest/howto.html 162 | 163 | 164 | nbsphinx: Jupyter Notebook Tools for Sphinx 165 | 166 | https://nbsphinx.readthedocs.io/en/0.3.3/ 167 | 168 | nbsphinx is a Sphinx extension that provides a source parser for *.ipynb files. Custom Sphinx directives are used to show Jupyter Notebook code cells (and of course their results) in both HTML and LaTeX output. Un-evaluated notebooks – i.e. notebooks without stored output cells – will be automatically executed during the Sphinx build process. 
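A minimal sketch of how nbsphinx would be enabled once installed: add it to the ``extensions`` list of ``conf.py`` (this project's ``conf.py`` currently lists ``sphinx_gallery`` instead, so treat the snippet below as an illustration, not the actual configuration):

```
# conf.py
extensions = [
    'sphinx.ext.mathjax',
    'nbsphinx',
]
# 'auto' (the default) only executes notebooks that have no stored outputs
nbsphinx_execute = 'auto'
```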
169 | 170 | conda install -c conda-forge nbsphinx 171 | 172 | sphinx-gallery 173 | -------------- 174 | 175 | https://sphinx-gallery.readthedocs.io/en/latest/ 176 | 177 | ``pip install sphinx-gallery`` 178 | 179 | http://www.scipy-lectures.org 180 | 181 | https://github.com/scipy-lectures/scipy-lecture-notes 182 | 183 | strip jupyter output before submission 184 | -------------------------------------- 185 | 186 | https://github.com/kynan/nbstripout 187 | 188 | ``conda install -c conda-forge nbstripout`` 189 | 190 | Set up the git filter and attributes as described in the manual installation instructions below: 191 | 192 | ``cd pystatsml`` 193 | ``nbstripout --install`` 194 | 195 | 196 | rst 197 | --- 198 | 199 | http://docutils.sourceforge.net/rst.html 200 | http://docutils.sourceforge.net/docs/ref/rst/ 201 | 202 | 203 | 204 | R vs Python 205 | ----------- 206 | 207 | https://www.datacamp.com/community/tutorials/r-or-python-for-data-analysis 208 | http://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html 209 | 210 | Mail to share the course 211 | ------------------------ 212 | 213 | Please find the link to my Machine Learning course in Python, it is a draft version: 214 | ftp://ftp.cea.fr//pub/unati/people/educhesnay/pystatml/StatisticsMachineLearningPython.pdf 215 | 216 | Below the link to github: 217 | https://github.com/duchesnay/pystatsml 218 | 219 | 220 | git clone https://github.com/duchesnay/pystatsml.git 221 | 222 | 223 | Basically, it uses Jupyter notebook and pure python, everything is converted to rst and assembled to html or pdf using sphynx. 224 | 225 | It is a draft version, not finished yet with many spelling mistakes. 226 | 227 | Please fork and perform some pull request. If you are willing to contribute. 228 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /utils/ml_non_linear_prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Mar 31 09:54:25 2016 4 | 5 | @author: edouard.duchesnay@cea.fr 6 | """ 7 | 8 | ''' 9 | SVM & Kernel methods 10 | ==================== 11 | ''' 12 | import numpy as np 13 | from numpy.linalg import norm 14 | 15 | from mpl_toolkits.mplot3d import Axes3D 16 | import matplotlib.pyplot as plt 17 | import sklearn.metrics as metrics 18 | #%matplotlib inline 19 | #%matplotlib qt 20 | 21 | 22 | 23 | class KernDensity: 24 | def __init__(self, sigma=1): 25 | self.sigma = sigma 26 | 27 | def fit(self, X, y, alphas=None): 28 | self.X = X 29 | self.y = y 30 | if alphas is None: 31 | alphas = np.ones(X.shape[0]) 32 | self.alphas = alphas 33 | 34 | def predict(self, X): 35 | y_pred = np.zeros((X.shape[0])) 36 | for j, x in enumerate(X): 37 | for i in range(self.X.shape[0]): 38 | #print(j, i, x) 39 | y_pred[j] += self.alphas[i] * self.y[i] * np.exp( - (norm(self.X[i, :] - x) ** 2) / (2 * self.sigma ** 2)) 40 | return(y_pred) 41 | 42 | 43 | ## Plot 3D 44 | def plot3d(coord_x, coord_y, coord_z, points, y, zlim=None, ax=None, fig=None, xylabelsize=33): 45 | # Plot 46 | from matplotlib import cm 47 | if fig is None: 48 | fig = plt.figure() 49 | if ax is None: 50 | ax = fig.add_subplot(111, projection='3d') 51 | z_min = np.min(coord_z) - np.max(coord_z) * 2 52 | ax.plot_surface(coord_x, coord_y, coord_z, rstride=2, cstride=2, 53 | #vmin=Z.min(), vmax=Z.max(), 54 | cmap=cm.coolwarm, 55 | linewidth=1, antialiased=True) 56 | cset = ax.contourf(coord_x, coord_y, coord_z, zdir='z', offset=z_min-10, 57 | 
cmap=cm.coolwarm) 58 | argmin = coord_x.ravel()[coord_z.argmin()], coord_y.ravel()[coord_z.argmin()] 59 | print("argmin", argmin) 60 | # add point and cross at defined point 61 | colors = {-1:'b', 1:'r'} 62 | for lev in np.unique(y): 63 | pts = points[y==lev, :] 64 | ax.plot(pts[:, 0], pts[:, 1], 'o', color=colors[lev], zs=[z_min]*pts.shape[0], ms=10) 65 | ax.set_xlabel(r'$x^0$', size=xylabelsize) 66 | ax.set_ylabel(r'$x^1$', size=xylabelsize) 67 | #ax.set_zlabel(r'$Kernel density$', size=xylabelsize) 68 | ax.set_zlim(z_min, np.max(coord_z)) 69 | return ax, z_min, argmin 70 | 71 | 72 | ## Dataset 73 | ########## 74 | 75 | im = np.array( 76 | [[ 1., 1., 1., 1., 0., 0., 0., 0.], 77 | [ 1., 1., 1., 1., 0., 0., 0., 0.], 78 | [ 1., 1., 1., 1., 0., 0., 0., 0.], 79 | [ 1., 1., 1., 1., 1., 0., 0., 0.], 80 | [ 0., 0., 0., 1., 1., 1., 1., 1.], 81 | [ 0., 0., 0., 0., 1., 1., 1., 1.], 82 | [ 0., 0., 0., 0., 1., 1., 1., 1.], 83 | [ 0., 0., 0., 0., 1., 1., 1., 1.]]) 84 | 85 | x0, y0 = np.where(im == 0) 86 | x1, y1 = np.where(im == 1) 87 | 88 | X = np.column_stack([ 89 | np.concatenate([x0, x1]), 90 | np.concatenate([y0, y1])]) 91 | y = np.array([-1] * len(x0) + [1] * len(x1)) 92 | 93 | xmin, xmax, ymin, ymax = 0, im.shape[0]-1, 0, im.shape[1]-1 94 | coord_x, coord_y = np.mgrid[xmin:xmax:50j, ymin:ymax:50j] 95 | XX = np.column_stack([coord_x.ravel(), coord_y.ravel()]) 96 | 97 | 98 | # Kernel mapping 99 | ################ 100 | 101 | self = KernDensity(sigma=.2) 102 | self.fit(X, y) 103 | y_pred_kde = self.predict(XX) 104 | coord_z_kde = y_pred_kde.reshape(coord_x.shape) 105 | points=X 106 | 107 | # View 2D 108 | if False: 109 | plt.imshow(np.rot90(coord_z_kde), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto') 110 | plt.plot(X[y==1, 0], X[y==1, 1], 'o', color='r')#, zs=[z_min], ms=20) 111 | plt.plot(X[y==-1, 0], X[y==-1, 1], 'o', color='b')#, zs=[z_min], ms=20) 112 | 113 | 114 | fig = plt.figure(figsize=(30, 15)) 115 | 116 | ax=fig.add_subplot(121, projection='3d') 117 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_kde, points=X, y=y, ax=ax, fig=fig) 118 | plt.title(r'$x \rightarrow K(x_i, x) = \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)$', size=33) 119 | # set camera to fixed point of view 120 | print(ax.azim, ax.elev, ax.dist) 121 | #(-152.49214958606902, 21.717791411042867, 10) 122 | #ax.view_init(azim=-152, elev=21) #Reproduce view 123 | #ax.view_init(azim=-14.1935483871, elev=29.6875, dist=10) 124 | 125 | # SV 126 | ##### 127 | 128 | from sklearn.svm import SVC 129 | #1.0 / X.shape[1] 0.5 130 | #(1/(2 *.2)) : 2.5 131 | clf = SVC(kernel='rbf')#, gamma=1) 132 | clf.fit(X, y) 133 | clf.support_vectors_.shape 134 | 135 | print(clf.support_.shape) 136 | 137 | np.all(X[clf.support_,:] == clf.support_vectors_) 138 | 139 | Xsv = clf.support_vectors_ 140 | y_sv = y[clf.support_] 141 | 142 | y_pred_svm = clf.predict(XX) 143 | #self = KernDensity(sigma=.2) 144 | #self.fit(X, y) 145 | #y_pred = self.predict(XX) 146 | coord_z_svm = y_pred_svm.reshape(coord_x.shape) 147 | 148 | # View 2D 149 | if False: 150 | plt.imshow(np.rot90(coord_z_svm), cmap=plt.cm.coolwarm, extent=[xmin, xmax, ymin, ymax], aspect='auto') 151 | plt.plot(Xsv[y_sv==1, 0], Xsv[y_sv==1, 1], 'o', color='r')#, zs=[z_min], ms=20) 152 | plt.plot(Xsv[y_sv==-1, 0], Xsv[y_sv==-1, 1], 'o', color='b')#, zs=[z_min], ms=20) 153 | 154 | 155 | 156 | #fig = plt.figure(figsize=(15, 15)) 157 | ax=fig.add_subplot(122, projection='3d') 158 | ax, z_min, argmin = plot3d(coord_x, coord_y, coord_z_svm, points=Xsv, y=y_sv, ax=ax, 
fig=fig) 159 | plt.title(r'$f(x) = sign \left(\sum_{i \in SV}\alpha_i y_i \exp\left(-\frac{||x_i - x_j||^2}{2\sigma^2}\right)\right)$', size=33) 160 | # set camera to fixed point of view 161 | #ax.azim, ax.elev, ax.dist 162 | #(-152.49214958606902, 21.717791411042867, 10) 163 | #ax.view_init(azim=-152, elev=21) #Reproduce view 164 | 165 | ############ 166 | 167 | import numpy as np 168 | from sklearn.svm import SVC 169 | from sklearn import datasets 170 | import matplotlib.pyplot as plt 171 | 172 | # dataset 173 | X, y = datasets.make_classification(n_samples=10, n_features=2,n_redundant=0, 174 | n_classes=2, 175 | random_state=1, 176 | shuffle=False) 177 | clf = SVC(kernel='rbf')#, gamma=1) 178 | clf.fit(X, y) 179 | print("#Errors: %i" % np.sum(y != clf.predict(X))) 180 | 181 | clf.decision_function(X) 182 | 183 | # Usefull internals: 184 | # Array of support vectors 185 | clf.support_vectors_ 186 | 187 | # indices of support vectors within original X 188 | np.all(X[clf.support_,:] == clf.support_vectors_) 189 | 190 | 191 | ######################## 192 | 193 | 194 | from sklearn.ensemble import RandomForestClassifier 195 | 196 | forest = RandomForestClassifier(n_estimators = 100) 197 | forest.fit(X, y) 198 | 199 | print("#Errors: %i" % np.sum(y != forest.predict(X))) 200 | 201 | 202 | -------------------------------------------------------------------------------- /python_lang/python_lang_solutions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 16 10:03:29 2016 4 | 5 | @author: edouard.duchesnay@gmail.com 6 | """ 7 | 8 | ############################################################################### 9 | # Exercise 1: functions 10 | # ~~~~~~~~~~~~~~~~~~~~~ 11 | # 12 | # Create a function that acts as a simple calulator If the operation is 13 | # not specified, default to addition If the operation is misspecified, 14 | # return an prompt message Ex: ``calc(4,5,"multiply")`` returns 20 Ex: 15 | # ``calc(3,5)`` returns 8 Ex: ``calc(1, 2, "something")`` returns error 16 | # message 17 | # 18 | 19 | def calc(a, b, op='add'): 20 | if op == 'add': 21 | return a + b 22 | elif op == 'sub': 23 | return a - b 24 | else: 25 | print('valid operations are add and sub') 26 | 27 | 28 | # call the function 29 | calc(10, 4, op='add') # returns 14 30 | calc(10, 4, 'add') # also returns 14: unnamed arguments are inferred by position 31 | calc(10, 4) # also returns 14: default for 'op' is 'add' 32 | calc(10, 4, 'sub') # returns 6 33 | calc(10, 4, 'div') # prints 'valid operations are add and sub' 34 | 35 | a, b, op = 2, 3, "+" 36 | 37 | 38 | def calc2(a, b, op='+'): 39 | st = "%.f %s %.f" % (a, op, b) 40 | return eval(st) 41 | 42 | 43 | calc2(3, 3, "+") 44 | 45 | 46 | ############################################################################### 47 | # Exercise 2: functions + list + loop 48 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 49 | # 50 | # Given a list of numbers, return a list where all adjacent duplicate 51 | # elements have been reduced to a single element. Ex: ``[1, 2, 2, 3, 2]`` 52 | # returns ``[1, 2, 3, 2]``. You may create a new list or modify the passed 53 | # in list. 
54 | # 55 | # Remove all duplicate values (adjacent or not) Ex: ``[1, 2, 2, 3, 2]`` 56 | # returns ``[1, 2, 3]`` 57 | # 58 | 59 | 60 | def remove_adjacent_duplicates(original_list): 61 | new_list = [] 62 | new_list.append(original_list[0]) 63 | for num in original_list[1:]: 64 | if num != new_list[-1]: 65 | new_list.append(num) 66 | return new_list 67 | 68 | remove_adjacent_duplicates([1, 2, 2, 3, 2]) 69 | 70 | def remove_duplicates(original_list): 71 | new_list = [] 72 | for num in original_list: 73 | if num not in new_list: 74 | new_list.append(num) 75 | return new_list 76 | 77 | remove_duplicates([3, 2, 2, 1, 2]) 78 | 79 | # or this solution mights modify the order 80 | 81 | def remove_duplicates(original_list): 82 | return(list(set(original_list))) 83 | 84 | remove_duplicates([3, 2, 2, 1, 2]) 85 | 86 | 87 | ############################################################################### 88 | # Exercise 3: File I/O 89 | # ~~~~~~~~~~~~~~~~~~~~ 90 | # 91 | # 1. Copy/paste the BSD 4 clause license (https://en.wikipedia.org/wiki/BSD_licenses) 92 | # into a text file. Read, the file and count the occurrences of each 93 | # word within the file. Store the words' occurrence number in a dictionary. 94 | # 95 | # 2. Write an executable python command ``count_words.py`` that parse 96 | # a list of input files provided after ``--input`` parameter. 97 | # The dictionary of occurrence is save in a csv file provides by ``--output``. 98 | # with default value word_count.csv. 99 | # Use: 100 | # - open 101 | # - regular expression 102 | # - argparse (https://docs.python.org/3/howto/argparse.html) 103 | 104 | 105 | bsd_4clause = """ 106 | Copyright (c) , 107 | All rights reserved. 108 | 109 | Redistribution and use in source and binary forms, with or without 110 | modification, are permitted provided that the following conditions are met: 111 | 1. Redistributions of source code must retain the above copyright 112 | notice, this list of conditions and the following disclaimer. 113 | 2. Redistributions in binary form must reproduce the above copyright 114 | notice, this list of conditions and the following disclaimer in the 115 | documentation and/or other materials provided with the distribution. 116 | 3. All advertising materials mentioning features or use of this software 117 | must display the following acknowledgement: 118 | This product includes software developed by the . 119 | 4. Neither the name of the nor the 120 | names of its contributors may be used to endorse or promote products 121 | derived from this software without specific prior written permission. 122 | 123 | THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY 124 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 125 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 126 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 127 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 128 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 129 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 130 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 131 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 132 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
133 | """ 134 | 135 | import os 136 | import tempfile 137 | 138 | tmpfilename = os.path.join(tempfile.gettempdir(), 139 | "bsd.txt") 140 | 141 | fd = open(tmpfilename, "w") 142 | fd.write(bsd_4clause) 143 | fd.close() 144 | 145 | fd = open(tmpfilename, "r") 146 | 147 | count = dict() 148 | for line in fd: 149 | line = line.lower() 150 | for word in line.split(): 151 | if not word in count: 152 | count[word] = 1 153 | else: 154 | count[word] += 1 155 | 156 | print(count) 157 | 158 | """ 159 | Comment to deal with missing import of urllib2 160 | 161 | import urllib2 162 | url = "https://www.gnu.org/licenses/gpl-3.0.txt" 163 | f = urllib2.urlopen(url) 164 | content = f.read() 165 | f.close() 166 | content = content.replace("\n", " ") 167 | content = content.lower() 168 | c = content.split(' ') 169 | print(len(c)) 170 | from collections import Counter 171 | print(Counter(c)) 172 | """ 173 | 174 | ############################################################################### 175 | # Exercise 4: OOP 176 | # ~~~~~~~~~~~~~~~ 177 | # 178 | # 1. Create a class ``Employee`` with 2 attributes provided in the 179 | # constructor: ``name``, ``years_of_service``. With one method 180 | # ``salary`` with is obtained by ``1500 + 100 * years_of_service``. 181 | # 182 | # 2. Create a subclass ``Manager`` which redefine ``salary`` method 183 | # ``2500 + 120 * years_of_service``. 184 | # 185 | # 3. Create a small dictionary database where the key is the 186 | # employee's name. Populate the database with: samples = 187 | # Employee('lucy', 3), Employee('john', 1), Manager('julie', 10), 188 | # Manager('paul', 3) 189 | # 190 | # 4. Return a table of made name, salary rows, i.e. a list of list [[name, 191 | # salary]] 192 | # 193 | # 5. Compute the average salary 194 | 195 | import pandas as pd 196 | 197 | 198 | class Employee: 199 | def __init__(self, name, years_of_service): 200 | self.name = name 201 | self.years_of_service = years_of_service 202 | 203 | def salary(self): 204 | return 1500 + 100 * self.years_of_service 205 | 206 | 207 | class Manager(Employee): 208 | def salary(self): 209 | return 2500 + 120 * self.years_of_service 210 | 211 | 212 | samples = [Employee("lucy", 3), 213 | Employee("john", 1), 214 | Manager('julie', 3), 215 | Manager('paul', 1)] 216 | 217 | employees = {e.name: e for e in samples} 218 | 219 | employees.keys() 220 | 221 | df = pd.DataFrame([[name, obj.salary()] for name, obj in employees.items()], 222 | columns=['name', 'salary']) 223 | 224 | [[name, employees[name].salary()] for name 225 | in employees] 226 | 227 | sum([e.salary() for e in employees.values()]) / len(employees) 228 | -------------------------------------------------------------------------------- /introduction/python_ecosystem.rst: -------------------------------------------------------------------------------- 1 | Python ecosystem for data-science 2 | --------------------------------- 3 | 4 | .. RST https://thomas-cokelaer.info/tutorials/sphinx/rest_syntax.html 5 | 6 | .. image:: images/python_ecosystem.png 7 | :scale: 100 8 | :align: center 9 | 10 | Python language 11 | ~~~~~~~~~~~~~~~ 12 | 13 | - Interpreted 14 | - Garbage collector (do not prevent from memory leak) 15 | - Dynamically-typed language (Java is statically typed) 16 | 17 | 18 | Anaconda 19 | ~~~~~~~~ 20 | 21 | Anaconda is a python distribution that ships most of python tools and libraries 22 | 23 | **Installation** 24 | 25 | 26 | 1. Download anaconda (Python 3.x) http://continuum.io/downloads 27 | 28 | 2. 
Install it, on Linux 29 | :: 30 | 31 | bash Anaconda3-2.4.1-Linux-x86_64.sh 32 | 33 | 3. Add anaconda path in your PATH variable in your ``.bashrc`` file: 34 | :: 35 | 36 | export PATH="${HOME}/anaconda3/bin:$PATH" 37 | 38 | **Managing with ``conda``** 39 | 40 | 41 | Update conda package and environment manager to current version 42 | 43 | :: 44 | 45 | conda update conda 46 | 47 | 48 | Install additional packages. Those commands install qt back-end (Fix a temporary issue to run spyder) 49 | 50 | :: 51 | 52 | conda install pyqt 53 | conda install PyOpenGL 54 | conda update --all 55 | 56 | 57 | Install seaborn for graphics 58 | 59 | :: 60 | 61 | conda install seaborn 62 | # install a specific version from anaconda chanel 63 | conda install -c anaconda pyqt=4.11.4 64 | 65 | List installed packages 66 | 67 | :: 68 | 69 | conda list 70 | 71 | Search available packages 72 | 73 | :: 74 | 75 | conda search pyqt 76 | conda search scikit-learn 77 | 78 | 79 | 80 | **Environments** 81 | 82 | 83 | - A conda environment is a directory that contains a specific collection of conda packages that you have installed. 84 | - Control packages environment for a specific purpose: collaborating with someone else, delivering an application to your client, 85 | - Switch between environments 86 | 87 | List of all environments 88 | 89 | :: 90 | conda info --envs 91 | 92 | 1. Create new environment 93 | 2. Activate 94 | 3. Install new package 95 | 96 | :: 97 | 98 | conda create --name test 99 | # Or 100 | conda env create -f environment.yml 101 | source activate test 102 | conda info --envs 103 | conda list 104 | conda search -f numpy 105 | conda install numpy 106 | 107 | **Miniconda** 108 | 109 | Anaconda without the collection of (>700) packages. 110 | With Miniconda you download only the packages you want with the conda command: ``conda install PACKAGENAME`` 111 | 112 | 113 | 114 | 1. Download anaconda (Python 3.x) https://conda.io/miniconda.html 115 | 116 | 2. Install it, on Linux 117 | 118 | :: 119 | 120 | bash Miniconda3-latest-Linux-x86_64.sh 121 | 122 | 3. Add anaconda path in your PATH variable in your ``.bashrc`` file: 123 | 124 | :: 125 | 126 | export PATH=${HOME}/miniconda3/bin:$PATH 127 | 128 | 4. Install required packages 129 | 130 | :: 131 | 132 | conda install -y scipy 133 | conda install -y pandas 134 | conda install -y matplotlib 135 | conda install -y statsmodels 136 | conda install -y scikit-learn 137 | conda install -y sqlite 138 | conda install -y spyder 139 | conda install -y jupyter 140 | 141 | 142 | Commands 143 | ~~~~~~~~ 144 | 145 | **python**: python interpreter. On the dos/unix command line execute wholes file:: 146 | 147 | python file.py 148 | 149 | Interactive mode:: 150 | 151 | python 152 | 153 | Quite with ``CTL-D`` 154 | 155 | **ipython**: advanced interactive python interpreter:: 156 | 157 | ipython 158 | 159 | Quite with ``CTL-D`` 160 | 161 | **pip** alternative for packages management (update ``-U`` in user directory ``--user``): 162 | 163 | :: 164 | 165 | pip install -U --user seaborn 166 | 167 | For neuroimaging: 168 | 169 | :: 170 | 171 | pip install -U --user nibabel 172 | pip install -U --user nilearn 173 | 174 | 175 | **spyder**: IDE (integrated development environment): 176 | 177 | - Syntax highlighting. 178 | - Code introspection for code completion (use ``TAB``). 179 | - Support for multiple Python consoles (including IPython). 180 | - Explore and edit variables from a GUI. 181 | - Debugging. 182 | - Navigate in code (go to function definition) ``CTL``. 
183 | 184 | 3 or 4 panels: 185 | 186 | +-------------+-------------------------+ 187 | | text editor | help/variable explorer | 188 | +-------------+-------------------------+ 189 | | | ipython interpreter | 190 | +-------------+-------------------------+ 191 | 192 | Shortcuts: 193 | - ``F9`` run line/selection 194 | 195 | Libraries 196 | ~~~~~~~~~ 197 | 198 | scipy.org: ``_ 199 | 200 | 201 | **Numpy**: Basic numerical operation. Matrix operation plus some basic solvers.:: 202 | 203 | import numpy as np 204 | X = np.array([[1, 2], [3, 4]]) 205 | #v = np.array([1, 2]).reshape((2, 1)) 206 | v = np.array([1, 2]) 207 | np.dot(X, v) # no broadcasting 208 | X * v # broadcasting 209 | np.dot(v, X) 210 | X - X.mean(axis=0) 211 | 212 | **Scipy**: general scientific libraries with advanced solver:: 213 | 214 | import scipy 215 | import scipy.linalg 216 | scipy.linalg.svd(X, full_matrices=False) 217 | 218 | **Matplotlib**: visualization:: 219 | 220 | import numpy as np 221 | import matplotlib.pyplot as plt 222 | #%matplotlib qt 223 | x = np.linspace(0, 10, 50) 224 | sinus = np.sin(x) 225 | plt.plot(x, sinus) 226 | plt.show() 227 | 228 | **Pandas**: Manipulation of structured data (tables). input/output excel files, etc. 229 | 230 | **Statsmodel**: Advanced statistics 231 | 232 | **Scikit-learn**: Machine learning 233 | 234 | .. http://truben.no/table/ 235 | 236 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 237 | | library | Arrays data, Num. comp, I/O | Structured data, I/O | Solvers: basic | Solvers: advanced | Stats: basic | Stats: advanced | Machine learning | 238 | +==============+=============================+======================+================+===================+==============+=================+==================+ 239 | | Numpy | X | | X | | | | | 240 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 241 | | Scipy | | | X | X | X | | | 242 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 243 | | Pandas | | X | | | | | | 244 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 245 | | Statmodels | | | | | X | X | | 246 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 247 | | Scikit-learn | | | | | | | X | 248 | +--------------+-----------------------------+----------------------+----------------+-------------------+--------------+-----------------+------------------+ 249 | 250 | -------------------------------------------------------------------------------- /R/ml_dimensionality_reduction_exo.R: -------------------------------------------------------------------------------- 1 | ###### 2 | ## PCA 3 | ###### 4 | 5 | # Write a class `BasicPCA` with two methods `fit(X)` that estimates the data mean 6 | # and principal components directions. `transform(X)` that project a new the data 7 | # into the principal components. 
8 | # 9 | # Check that your `BasicPCA` pfermed simillarly than the one from sklearn: 10 | # `from sklearn.decomposition import PCA` 11 | 12 | 13 | BasicPCA <- function(X, scale=FALSE){ 14 | obj = list() 15 | Xc <- scale(X, center=TRUE, scale=scale) 16 | obj$mean <- attr(Xc, "scaled:center") 17 | s <- svd(Xc, nu = 0) 18 | # v [K x P] a matrix whose columns contain the right singular vectors of x 19 | obj$V = s$v 20 | obj$var = 1 / (nrow(X) - 1) * s$d ^2 21 | return(obj) 22 | } 23 | 24 | BasicPCA.transform <- function(obj, X){ 25 | Xc <- scale(X, center=obj$mean, scale=FALSE) 26 | return(Xc %*% obj$V) 27 | } 28 | 29 | # https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/ 30 | # dataset 31 | n_samples = 10 32 | experience = rnorm(n_samples) 33 | salary = 1500 + experience + .5 * rnorm(n_samples) 34 | other = rnorm(n_samples) 35 | X = cbind(experience, salary, other) 36 | 37 | # Optional: standardize data 38 | Xcs = scale(X, center=TRUE, scale=FALSE) 39 | attr(Xcs, "scaled:center") = NULL 40 | attr(Xcs, "scaled:scale") = NULL 41 | 42 | basic_pca = BasicPCA(Xcs) 43 | BasicPCA.transform(basic_pca, Xcs) 44 | 45 | # PCA with prcomp 46 | pca = prcomp(Xcs, center=TRUE, scale.=FALSE) 47 | names(pca) 48 | 49 | # Compare 50 | all(pca$rotation == basic_pca$V) 51 | all(predict(pca, Xcs) == BasicPCA.transform(basic_pca, Xcs)) 52 | 53 | # "https://raw.github.com/neurospin/pystatsml/master/data/iris.csv" 54 | # 55 | # Describe the data set. Should the dataset been standardized ? 56 | # 57 | # Retrieve the explained variance ratio. Determine $K$ the number of components. 58 | # 59 | # Print the $K$ principal components direction and correlation of the $K$ principal 60 | # components with original variables. Interpret the contribution of original variables 61 | # into the PC. 62 | # 63 | # Plot samples projected into the $K$ first PCs. 64 | # 65 | # Color samples with their species. 66 | # 67 | 68 | url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv' 69 | data = read.csv(url) 70 | #setwd("/home/ed203246/git/pystatsml/notebooks") 71 | data = read.csv("../data/iris.csv") 72 | 73 | # Describe the data set. Should the dataset been standardized ? 74 | 75 | summary(data) 76 | # sepal_length sepal_width petal_length petal_width species 77 | # Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50 78 | # 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50 79 | # Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50 80 | # Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199 81 | # 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800 82 | # Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500 83 | 84 | numcols = colnames(data)[unlist(lapply(data, is.numeric))] 85 | apply(data[, numcols], 2, sd) 86 | #sepal_length sepal_width petal_length petal_width 87 | #0.8280661 0.4358663 1.7652982 0.7622377 88 | 89 | 90 | # Describe the structure of correlation among variables. 91 | X = data[, numcols] 92 | cor(X) 93 | 94 | # Compute a PCA with the maximum number of compoenents. 95 | Xcs = scale(X, center=TRUE, scale=TRUE) 96 | attr(Xcs, "scaled:center") = NULL 97 | attr(Xcs, "scaled:scale") = NULL 98 | apply(Xcs, 2, sd) 99 | apply(Xcs, 2, mean) 100 | 101 | #Compute a PCA with the maximum number of compoenents. 
102 | pca = prcomp(Xcs) 103 | 104 | # Variance ratio by component 105 | (pca$sdev ** 2) / sum(pca$sdev ** 2) 106 | #[1] 0.729624454 0.228507618 0.036689219 0.005178709 107 | 108 | # cumulative explained variance 109 | cumsum(pca$sdev ** 2) / sum(pca$sdev ** 2) 110 | 111 | # K = 2 112 | names(pca) 113 | pca$rotation 114 | 115 | PC = predict(pca, Xcs) 116 | t(cor(Xcs, PC[, 1:2])) 117 | # sepal_length sepal_width petal_length petal_width 118 | # PC1 0.8901688 -0.4601427 0.99155518 0.96497896 119 | # PC2 -0.3608299 -0.8827163 -0.02341519 -0.06399985 120 | 121 | data = cbind(data, PC) 122 | 123 | # Plot samples projected into the K first PCs 124 | # Color samples with their species. 125 | library(ggplot2) 126 | 127 | qplot(PC1, PC2, data=data, colour=species) 128 | 129 | #################################################################### 130 | ## MDS 131 | #################################################################### 132 | 133 | ############## 134 | ## eurodist ## 135 | ############## 136 | 137 | # Perform similar analysis on eurodist dataset using R, using: 138 | # - MDS: cmdscale. 139 | # - Euclidian parwise distance: dist 140 | # 141 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv' 142 | #data = read.csv(url) 143 | 144 | setwd("~/git/pystatsml/notebooks") 145 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/eurodist.csv' 146 | data = read.csv("../data/eurodist.csv") 147 | 148 | city = data[["city"]] 149 | D = data[, 2:ncol(data)] 150 | 151 | print(data[1:5, 1:5]) 152 | 153 | # Arbitrary choice of K=2 components 154 | mds = cmdscale(D, k=2, , eig=T) 155 | 156 | # Recover coordinates of the cities in Euclidian referential whose orientation is arbitrary. 157 | print(as.matrix(dist(mds$points))[1:5, 1:5]) 158 | 159 | plot(mds$points[,1], -mds$points[,2]) 160 | text(mds$points[,1], -mds$points[,2], city, cex=0.8) 161 | 162 | 163 | # Apply MDS using cmdscale 164 | k_range = 1:(min(5, dim(D)-1)) 165 | stress <- rep(0, max.k) 166 | for (kk in k_range){ 167 | mds <- cmdscale(D, k=kk, eig=T) 168 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5 169 | } 170 | plot(k_range, stress, type="l", xlab="k", ylab="stress") 171 | #cbind(1:max.k,P.k) 172 | 173 | # Ressources 174 | # http://people.stat.sc.edu/Hitchcock/chapter5_R_examples.txt 175 | 176 | ########## 177 | ## iris ## 178 | ########## 179 | 180 | # Perform similar analysis on eurodist dataset using R, using: 181 | # - MDS: cmdscale. 182 | # - Euclidian parwise distance: dist 183 | # 184 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv' 185 | #data = read.csv(url) 186 | 187 | setwd("~/git/pystatsml/notebooks") 188 | #url = 'ftp://ftp.cea.fr/pub/unati/people/educhesnay/pystatml/data/iris.csv' 189 | data = read.csv("../data/iris.csv") 190 | 191 | species = data[["species"]] 192 | X = scale(data[, 1:4]) 193 | attr(X, "scaled:center") = NULL 194 | attr(X, "scaled:scale") = NULL 195 | D = as.matrix(dist(X)) 196 | print(D[1:5, 1:5]) 197 | 198 | # Select K 199 | k_range = 1:(min(5, dim(D)-1)) 200 | stress <- rep(0, max.k) 201 | for (kk in k_range){ 202 | mds <- cmdscale(D, k=kk, eig=T) 203 | stress[kk] = (sum((D - as.matrix(dist(mds$points))) ^ 2)) ^ 0.5 204 | } 205 | plot(k_range, stress, type="l", xlab="k", ylab="stress") 206 | 207 | K = 2 # components 208 | mds = cmdscale(D, k=K , eig=T) 209 | 210 | # Recover coordinates of the cities in Euclidian referential whose orientation is arbitrary. 
211 | print(as.matrix(dist(mds$points))[1:5, 1:5]) 212 | 213 | plot(mds$points[,1], -mds$points[,2], col=species) 214 | 215 | # PCA with prcomp 216 | pca = prcomp(X, center=TRUE, scale.=FALSE) 217 | names(pca) 218 | PC = predict(pca, X)[, 1:K] 219 | 220 | # Compute correlation between PCA and MDS components 221 | cor(cbind(mds$points, PC)) 222 | 223 | # 1.000000e+00 1.551000e-16 1.000000e+00 4.766625e-16 224 | # 1.551000e-16 1.000000e+00 4.474091e-16 -1.000000e+00 225 | # PC1 1.000000e+00 4.474091e-16 1.000000e+00 1.842964e-16 226 | # PC2 4.766625e-16 -1.000000e+00 1.842964e-16 1.000000e+00 227 | 228 | 229 | #################################################################### 230 | ## isomap 231 | #################################################################### 232 | install.packages("vegan") 233 | 234 | s_curve = read.csv("../data/s_curve.csv") 235 | colnames(s_curve) 236 | 237 | X = as.matrix(s_curve[, c("x", "y", "z")]) 238 | color = s_curve[["color"]] 239 | D <- dist(X, method="euclidean") 240 | 241 | library(vegan) 242 | 243 | iso = isomap(D, ndim=2, k=10) 244 | 245 | #install.packages("ggplot2") 246 | library(ggplot2) 247 | 248 | qplot(iso$points[,1], iso$points[,2], col=color) + scale_colour_gradientn(colours=rainbow(4)) 249 | scale_fill_distiller(palette = "Spectral") -------------------------------------------------------------------------------- /utils/ml_processing_pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 11 15:40:35 2016 4 | 5 | @author: edoaurd.duchesnay@cea.fr 6 | """ 7 | from sklearn import preprocessing 8 | preprocessing.OneHotEncoder 9 | 10 | 11 | ''' 12 | Regression pipelines 13 | ==================== 14 | ''' 15 | import numpy as np 16 | from sklearn import datasets 17 | import sklearn.linear_model as lm 18 | from sklearn import preprocessing 19 | from sklearn.cross_validation import cross_val_score 20 | from sklearn.feature_selection import SelectKBest 21 | from sklearn.feature_selection import f_regression 22 | from sklearn.pipeline import Pipeline 23 | from sklearn.grid_search import GridSearchCV 24 | import sklearn.metrics as metrics 25 | 26 | # Datasets 27 | n_samples, n_features, noise_sd = 100, 100, 20 28 | X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=n_features, 29 | noise=noise_sd, n_informative=5, 30 | random_state=42, coef=True) 31 | 32 | # Use this to tune the noise parameter such that snr < 5 33 | print("SNR:", np.std(np.dot(X, coef)) / noise_sd) 34 | 35 | print("=============================") 36 | print("== Basic linear regression ==") 37 | print("=============================") 38 | 39 | scores = cross_val_score(estimator=lm.LinearRegression(), X=X, y=y, cv=5) 40 | print("Test r2:%.2f" % scores.mean()) 41 | 42 | print("==============================================") 43 | print("== Scaler + anova filter + ridge regression ==") 44 | print("==============================================") 45 | 46 | anova_ridge = Pipeline([ 47 | ('standardscaler', preprocessing.StandardScaler()), 48 | ('selectkbest', SelectKBest(f_regression)), 49 | ('ridge', lm.Ridge()) 50 | ]) 51 | param_grid = {'selectkbest__k':np.arange(10, 110, 10), 52 | 'ridge__alpha':[.001, .01, .1, 1, 10, 100] } 53 | 54 | # Expect execution in ipython, for python remove the %time 55 | print("----------------------------") 56 | print("-- Parallelize inner loop --") 57 | print("----------------------------") 58 | 59 | anova_ridge_cv = GridSearchCV(anova_ridge, 
cv=5, param_grid=param_grid, n_jobs=-1) 60 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5) 61 | print("Test r2:%.2f" % scores.mean()) 62 | 63 | print("----------------------------") 64 | print("-- Parallelize outer loop --") 65 | print("----------------------------") 66 | 67 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid) 68 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5, n_jobs=-1) 69 | print("Test r2:%.2f" % scores.mean()) 70 | 71 | 72 | print("=====================================") 73 | print("== Scaler + Elastic-net regression ==") 74 | print("=====================================") 75 | 76 | alphas = [.0001, .001, .01, .1, 1, 10, 100, 1000] 77 | l1_ratio = [.1, .5, .9] 78 | 79 | print("----------------------------") 80 | print("-- Parallelize outer loop --") 81 | print("----------------------------") 82 | 83 | enet = Pipeline([ 84 | ('standardscaler', preprocessing.StandardScaler()), 85 | ('enet', lm.ElasticNet(max_iter=10000)), 86 | ]) 87 | param_grid = {'enet__alpha':alphas , 88 | 'enet__l1_ratio':l1_ratio} 89 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid) 90 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5, n_jobs=-1) 91 | print("Test r2:%.2f" % scores.mean()) 92 | 93 | print("-----------------------------------------------") 94 | print("-- Parallelize outer loop + built-in CV --") 95 | print("-- Remark: scaler is only done on outer loop --") 96 | print("-----------------------------------------------") 97 | 98 | enet_cv = Pipeline([ 99 | ('standardscaler', preprocessing.StandardScaler()), 100 | ('enet', lm.ElasticNetCV(max_iter=10000, l1_ratio=l1_ratio, alphas=alphas)), 101 | ]) 102 | 103 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5) 104 | print("Test r2:%.2f" % scores.mean()) 105 | 106 | ''' 107 | Classification pipelines 108 | ======================== 109 | ''' 110 | import numpy as np 111 | from sklearn import datasets 112 | import sklearn.linear_model as lm 113 | from sklearn import preprocessing 114 | from sklearn.cross_validation import cross_val_score 115 | from sklearn.feature_selection import SelectKBest 116 | from sklearn.feature_selection import f_classif 117 | from sklearn.pipeline import Pipeline 118 | from sklearn.grid_search import GridSearchCV 119 | import sklearn.metrics as metrics 120 | 121 | # Datasets 122 | n_samples, n_features, noise_sd = 100, 100, 20 123 | X, y = datasets.make_classification(n_samples=n_samples, n_features=n_features, 124 | n_informative=5, random_state=42) 125 | 126 | 127 | def balanced_acc(estimator, X, y, **kwargs): 128 | ''' 129 | Balanced accuracy scorer 130 | ''' 131 | return metrics.recall_score(y, estimator.predict(X), average=None).mean() 132 | 133 | print("===============================") 134 | print("== Basic logistic regression ==") 135 | print("===============================") 136 | 137 | scores = cross_val_score(estimator=lm.LogisticRegression(C=1e8, class_weight='balanced'), 138 | X=X, y=y, cv=5, scoring=balanced_acc) 139 | print("Test bACC:%.2f" % scores.mean()) 140 | 141 | print("=======================================================") 142 | print("== Scaler + anova filter + ridge logistic regression ==") 143 | print("=======================================================") 144 | 145 | anova_ridge = Pipeline([ 146 | ('standardscaler', preprocessing.StandardScaler()), 147 | ('selectkbest', SelectKBest(f_classif)), 148 | ('ridge', lm.LogisticRegression(penalty='l2', class_weight='balanced')) 
149 | ]) 150 | param_grid = {'selectkbest__k':np.arange(10, 110, 10), 151 | 'ridge__C':[.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]} 152 | 153 | 154 | # Expect execution in ipython, for python remove the %time 155 | print("----------------------------") 156 | print("-- Parallelize inner loop --") 157 | print("----------------------------") 158 | 159 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, 160 | scoring=balanced_acc, n_jobs=-1) 161 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\ 162 | scoring=balanced_acc) 163 | print("Test bACC:%.2f" % scores.mean()) 164 | 165 | print("----------------------------") 166 | print("-- Parallelize outer loop --") 167 | print("----------------------------") 168 | 169 | anova_ridge_cv = GridSearchCV(anova_ridge, cv=5, param_grid=param_grid, 170 | scoring=balanced_acc) 171 | %time scores = cross_val_score(estimator=anova_ridge_cv, X=X, y=y, cv=5,\ 172 | scoring=balanced_acc, n_jobs=-1) 173 | print("Test bACC:%.2f" % scores.mean()) 174 | 175 | 176 | print("========================================") 177 | print("== Scaler + lasso logistic regression ==") 178 | print("========================================") 179 | 180 | Cs = np.array([.0001, .001, .01, .1, 1, 10, 100, 1000, 10000]) 181 | alphas = 1 / Cs 182 | l1_ratio = [.1, .5, .9] 183 | 184 | print("----------------------------") 185 | print("-- Parallelize outer loop --") 186 | print("----------------------------") 187 | 188 | lasso = Pipeline([ 189 | ('standardscaler', preprocessing.StandardScaler()), 190 | ('lasso', lm.LogisticRegression(penalty='l1', class_weight='balanced')), 191 | ]) 192 | param_grid = {'lasso__C':Cs} 193 | enet_cv = GridSearchCV(lasso, cv=5, param_grid=param_grid, scoring=balanced_acc) 194 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\ 195 | scoring=balanced_acc, n_jobs=-1) 196 | print("Test bACC:%.2f" % scores.mean()) 197 | 198 | 199 | print("-----------------------------------------------") 200 | print("-- Parallelize outer loop + built-in CV --") 201 | print("-- Remark: scaler is only done on outer loop --") 202 | print("-----------------------------------------------") 203 | 204 | lasso_cv = Pipeline([ 205 | ('standardscaler', preprocessing.StandardScaler()), 206 | ('lasso', lm.LogisticRegressionCV(Cs=Cs, scoring=balanced_acc)), 207 | ]) 208 | 209 | %time scores = cross_val_score(estimator=lasso_cv, X=X, y=y, cv=5) 210 | print("Test bACC:%.2f" % scores.mean()) 211 | 212 | 213 | print("=============================================") 214 | print("== Scaler + Elasticnet logistic regression ==") 215 | print("=============================================") 216 | 217 | print("----------------------------") 218 | print("-- Parallelize outer loop --") 219 | print("----------------------------") 220 | 221 | enet = Pipeline([ 222 | ('standardscaler', preprocessing.StandardScaler()), 223 | ('enet', lm.SGDClassifier(loss="log", penalty="elasticnet", 224 | alpha=0.0001, l1_ratio=0.15, class_weight='balanced')), 225 | ]) 226 | 227 | param_grid = {'enet__alpha':alphas, 228 | 'enet__l1_ratio':l1_ratio} 229 | 230 | enet_cv = GridSearchCV(enet, cv=5, param_grid=param_grid, scoring=balanced_acc) 231 | %time scores = cross_val_score(estimator=enet_cv, X=X, y=y, cv=5,\ 232 | scoring=balanced_acc, n_jobs=-1) 233 | print("Test bACC:%.2f" % scores.mean()) 234 | -------------------------------------------------------------------------------- /machine_learning/ml_supervized_nonlinear.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Non-linear models 3 | ================= 4 | 5 | Here we focuse on non-linear models for classification. Nevertheless, each 6 | classification model has its regression counterpart. 7 | ''' 8 | 9 | # get_ipython().run_line_magic('matplotlib', 'inline') 10 | import matplotlib.pyplot as plt 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import seaborn as sns 15 | import matplotlib.pyplot as plt 16 | 17 | from sklearn.svm import SVC 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from sklearn import datasets 21 | from sklearn import metrics 22 | from sklearn.model_selection import train_test_split 23 | 24 | np.set_printoptions(precision=2) 25 | pd.set_option('precision', 2) 26 | 27 | # %% 28 | # Support Vector Machines (SVM) 29 | # ----------------------------- 30 | # 31 | # SVM are based kernel methods require only a user-specified kernel function 32 | # :math:`K(x_i, x_j)`, i.e., a **similarity function** over pairs of data 33 | # points :math:`(x_i, x_j)` into kernel (dual) space on which learning 34 | # algorithms operate linearly, i.e. every operation on points is a linear 35 | # combination of :math:`K(x_i, x_j)`. 36 | # Outline of the SVM algorithm: 37 | # 38 | # 1. Map points :math:`x` into kernel space using a kernel function: 39 | # :math:`x \rightarrow K(x, .)`. 40 | # 2. Learning algorithms operates linearly by dot product into high-kernel 41 | # space :math:`K(., x_i) \cdot K(., x_j)`. 42 | # - Using the kernel trick (Mercer’s Theorem) replaces dot product in high 43 | # dimensional space by a simpler operation such that 44 | # :math:`K(., x_i) \cdot K(., x_j) = K(x_i, x_j)`. 45 | # Thus we only need to compute a similarity measure for each pairs of 46 | # point and store in a :math:`N \times N` Gram matrix. 47 | # - Finally, The learning process consist of estimating the $\alpha_i$ of 48 | # the decision function that maximises the hinge loss (of :math:`f(x)`) 49 | # plus some penalty when applied on all training points. 50 | # 51 | # .. math:: 52 | # 53 | # f(x) = \text{sign} \left(\sum_i^N \alpha_i~y_i~K(x_i, x)\right). 54 | # 55 | # 3. Predict a new point $x$ using the decision function. 56 | # 57 | # .. figure:: ../images/svm_rbf_kernel_mapping_and_decision_function.png 58 | # :alt: Support Vector Machines. 59 | # 60 | # Gaussian kernel (RBF, Radial Basis Function): 61 | # 62 | # One of the most commonly used kernel is the Radial Basis Function (RBF) Kernel. 63 | # For a pair of points :math:`x_i, x_j` the RBF kernel is defined as: 64 | # 65 | # .. raw:: latex 66 | # 67 | # \begin{align} 68 | # K(x_i, x_j) &= \exp\left(-\frac{\|x_i - x_j\|^2}{2\sigma^2}\right)\\ 69 | # &= \exp\left(-\gamma~\|x_i - x_j\|^2\right) 70 | # \end{align} 71 | # 72 | # Where :math:`\sigma` (or :math:`\gamma`) defines the kernel width parameter. 73 | # Basically, we consider a Gaussian function centered on each training sample 74 | # :math:`x_i`. it has a ready interpretation as a similarity measure as it 75 | # decreases with squared Euclidean distance between the two feature vectors. 76 | # 77 | # Non linear SVM also exists for regression problems. 78 | 79 | 80 | # %% 81 | # dataset 82 | 83 | X, y = datasets.load_breast_cancer(return_X_y=True) 84 | X_train, X_test, y_train, y_test = \ 85 | train_test_split(X, y, test_size=0.5, stratify=y, random_state=42) 86 | 87 | # %% 88 | # Preprocessing: unequal variance of input features, requires scaling for svm. 
89 | 
90 | ax = sns.displot(x=X_train.std(axis=0), kind="kde", bw_adjust=.2, cut=0,
91 |                  fill=True, height=3, aspect=1.5)
92 | _ = ax.set_xlabels("Std-dev").tight_layout()
93 | 
94 | scaler = StandardScaler()
95 | X_train = scaler.fit_transform(X_train)
96 | X_test = scaler.transform(X_test)  # reuse the scaler fitted on the training set
97 | 
98 | # %%
99 | # Fit-predict
100 | # The probability is a logistic (sigmoid) transform of the decision_function
101 | 
102 | svm = SVC(kernel='rbf', probability=True).fit(X_train, y_train)
103 | y_pred = svm.predict(X_test)
104 | y_score = svm.decision_function(X_test)
105 | y_prob = svm.predict_proba(X_test)[:, 1]
106 | 
107 | ax = sns.relplot(x=y_score, y=y_prob, hue=y_pred, height=2, aspect=1.5)
108 | _ = ax.set_axis_labels("decision function", "Probability").tight_layout()
109 | 
110 | # %% Scores
111 | 
112 | print("bAcc: %.2f, AUC: %.2f (AUC with proba: %.2f)" % (
113 |     metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
114 |     metrics.roc_auc_score(y_true=y_test, y_score=y_score),
115 |     metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
116 | 
117 | # Useful internals: indices of support vectors within original X
118 | np.all(X_train[svm.support_, :] == svm.support_vectors_)
119 | 
120 | 
121 | # %%
122 | # Random forest
123 | # -------------
124 | #
125 | # Decision tree
126 | # ~~~~~~~~~~~~~
127 | #
128 | # A tree can be "learned" by splitting the training dataset into subsets based on a test on a feature value.
129 | # Each internal node represents a "test" on a feature, resulting in a split of the current sample. At each step the algorithm selects the feature and a cutoff value that maximises a given metric. Different metrics exist for regression trees (continuous target) and classification trees (qualitative target).
130 | # This process is repeated on each derived subset in a recursive manner called recursive partitioning. The recursion is completed when the subset at a node has all the same value of the target variable, or when splitting no longer adds value to the predictions. This general principle is implemented by many recursive partitioning tree algorithms.
131 | #
132 | # .. figure:: ../images/classification_tree.png
133 | #    :width: 400
134 | #    :alt: Classification tree.
135 | #
136 | # Decision trees are simple to understand and interpret, but they tend to overfit the training set. Leo Breiman proposed random forests to deal with this issue.
137 | #
138 | # A single decision tree usually overfits the data it is learning from because it learns from only one pathway of decisions. Its predictions are therefore usually not accurate on new data.
139 | #
140 | # Forest
141 | # ~~~~~~
142 | #
143 | # A random forest is a meta estimator that fits a number of **decision tree learners** on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
144 | # Random forest models reduce the risk of overfitting by introducing randomness:
145 | #
146 | # .. figure:: ../images/random_forest.png
147 | #    :width: 300
148 | #    :alt: Random forest.
149 | #
150 | # - building multiple trees (n_estimators)
151 | # - drawing observations with replacement (i.e., a bootstrapped sample)
152 | # - splitting nodes on the best split among a random subset of the features selected at every node
153 | #
154 | 
155 | from sklearn.ensemble import RandomForestClassifier
156 | 
157 | forest = RandomForestClassifier(n_estimators=100)
158 | forest.fit(X_train, y_train)
159 | 
160 | y_pred = forest.predict(X_test)
161 | y_prob = forest.predict_proba(X_test)[:, 1]
162 | 
163 | 
164 | # %% Scores
165 | 
166 | print("bAcc: %.2f, AUC: %.2f " % (
167 |     metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
168 |     metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
169 | 
170 | # %%
171 | # Extra Trees (Low Variance)
172 | #
173 | # Extra Trees is like Random Forest, in that it builds multiple trees and splits nodes using random subsets of features, but with two key differences: it does not bootstrap observations (it samples without replacement), and nodes are split on random splits, not best splits. So, in summary, ExtraTrees:
174 | # - builds multiple trees with bootstrap=False by default, which means it samples without replacement
175 | # - splits nodes on random splits among a random subset of the features selected at every node
176 | # In Extra Trees, randomness doesn't come from bootstrapping the data, but from the random splits of all observations.
177 | # ExtraTrees is short for Extremely Randomized Trees.
178 | 
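# %%
# Illustrative sketch: ExtraTreesClassifier exposes the same interface as
# RandomForestClassifier, so it can be dropped into the example above. The
# settings below are library defaults, not tuned choices.

from sklearn.ensemble import ExtraTreesClassifier

extra = ExtraTreesClassifier(n_estimators=100, random_state=0)
extra.fit(X_train, y_train)

y_pred = extra.predict(X_test)
y_prob = extra.predict_proba(X_test)[:, 1]

print("bAcc: %.2f, AUC: %.2f " % (
    metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
    metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))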
140 | # Forest
141 | # ~~~~~~
142 | #
143 | # A random forest is a meta estimator that fits a number of **decision tree learners** on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
144 | # Random forest models reduce the risk of overfitting by introducing randomness:
145 | #
146 | # .. figure:: ../images/random_forest.png
147 | #    :width: 300
148 | #    :alt: Random forest.
149 | #
150 | # - building multiple trees (n_estimators)
151 | # - drawing observations with replacement (i.e., a bootstrapped sample)
152 | # - splitting nodes on the best split among a random subset of the features selected at every node
153 | #
154 | 
155 | from sklearn.ensemble import RandomForestClassifier
156 | 
157 | forest = RandomForestClassifier(n_estimators=100)
158 | forest.fit(X_train, y_train)
159 | 
160 | y_pred = forest.predict(X_test)
161 | y_prob = forest.predict_proba(X_test)[:, 1]
162 | 
163 | 
164 | # %% Scores
165 | 
166 | print("bAcc: %.2f, AUC: %.2f " % (
167 |     metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
168 |     metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
169 | 
170 | # %%
171 | # Extra Trees (Low Variance)
172 | # ~~~~~~~~~~~~~~~~~~~~~~~~~~
173 | #
174 | # Extra Trees is like Random Forest in that it builds multiple trees and splits nodes using random subsets of features, but with two key differences: it does not bootstrap observations (it samples without replacement), and nodes are split on random splits, not on the best split. In summary, Extra Trees
175 | # builds multiple trees with ``bootstrap=False`` by default (i.e., it samples without replacement), and
176 | # splits nodes on random splits among a random subset of the features selected at every node.
177 | # In Extra Trees, randomness does not come from bootstrapping the data, but from the random splits over all observations: hence the name, Extremely Randomized Trees.
178 | 
179 | 
180 | # %%
181 | # Gradient boosting
182 | # -----------------
183 | #
184 | # Gradient boosting is a meta estimator that fits a sequence of **weak learners**.
185 | # Each learner aims to reduce the residuals (errors) produced by the previous learner.
186 | # The two main hyper-parameters are:
187 | #
188 | # - The **learning rate** (*lr*) controls over-fitting:
189 | #   decreasing the *lr* limits the capacity of a learner to overfit the residuals, i.e.,
190 | #   it slows down the learning speed and thus increases the **regularisation**.
191 | #
192 | # - The **sub-sampling fraction** controls the fraction of samples to be used for
193 | #   fitting the learners. Values smaller than 1 lead to **Stochastic Gradient Boosting**.
194 | #   It thus controls over-fitting by reducing variance and increasing bias.
195 | #
196 | # .. figure:: ../images/gradient_boosting.png
197 | #    :width: 500
198 | #    :alt: Gradient boosting.
199 | #
200 | 
201 | 
202 | from sklearn.ensemble import GradientBoostingClassifier
203 | 
204 | gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
205 |                                 subsample=0.5, random_state=0)
206 | gb.fit(X_train, y_train)
207 | 
208 | y_pred = gb.predict(X_test)
209 | y_prob = gb.predict_proba(X_test)[:, 1]
210 | 
211 | print("bAcc: %.2f, AUC: %.2f " % (
212 |     metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
213 |     metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
214 | 
--------------------------------------------------------------------------------
/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Machine Learning documentation build configuration file, created by
4 | # sphinx-quickstart on Mon Nov 30 16:25:34 2015.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | import shlex 18 | 19 | # If extensions (or modules to document with autodoc) are in another directory, 20 | # add these directories to sys.path here. If the directory is relative to the 21 | # documentation root, use os.path.abspath to make it absolute, like shown here. 22 | #sys.path.insert(0, os.path.abspath('.')) 23 | 24 | # -- General configuration ------------------------------------------------ 25 | 26 | # If your documentation needs a minimal Sphinx version, state it here. 27 | #needs_sphinx = '1.0' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'sphinx.ext.mathjax', 34 | 'sphinx_gallery.gen_gallery', 35 | 'docxbuilder', 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The encoding of source files. 47 | #source_encoding = 'utf-8-sig' 48 | 49 | # The master toctree document. 50 | master_doc = 'index' 51 | 52 | # General information about the project. 53 | project = u'Statistics and Machine Learning in Python' 54 | copyright = u'2020, Edouard Duchesnay, NeuroSpin CEA Université Paris-Saclay, France' 55 | author = u'Edouard Duchesnay, Tommy Löfstedt, Younes Feki' 56 | 57 | # The version info for the project you're documenting, acts as replacement for 58 | # |version| and |release|, also used in various other places throughout the 59 | # built documents. 60 | # 61 | # The short X.Y version. 62 | version = '0.5' 63 | # The full version, including alpha/beta/rc tags. 64 | release = '0.5' 65 | 66 | # The language for content autogenerated by Sphinx. Refer to documentation 67 | # for a list of supported languages. 68 | # 69 | # This is also used if you do content translation via gettext catalogs. 70 | # Usually you set "language" from the command line for these cases. 71 | language = None 72 | 73 | # There are two options for replacing |today|: either, you set today to some 74 | # non-false value, then it is used: 75 | #today = '' 76 | # Else, today_fmt is used as the format for a strftime call. 77 | #today_fmt = '%B %d, %Y' 78 | 79 | # List of patterns, relative to source directory, that match files and 80 | # directories to ignore when looking for source files. 81 | exclude_patterns = ["notebooks/notebooks"] 82 | 83 | # The reST default role (used for this markup: `text`) to use for all 84 | # documents. 85 | #default_role = None 86 | 87 | # If true, '()' will be appended to :func: etc. cross-reference text. 88 | #add_function_parentheses = True 89 | 90 | # If true, the current module name will be prepended to all description 91 | # unit titles (such as .. function::). 92 | #add_module_names = True 93 | 94 | # If true, sectionauthor and moduleauthor directives will be shown in the 95 | # output. They are ignored by default. 96 | #show_authors = False 97 | 98 | # The name of the Pygments (syntax highlighting) style to use. 99 | pygments_style = 'sphinx' 100 | 101 | # A list of ignored prefixes for module index sorting. 
102 | #modindex_common_prefix = [] 103 | 104 | # If true, keep warnings as "system message" paragraphs in the built documents. 105 | keep_warnings = False 106 | 107 | # If true, `todo` and `todoList` produce output, else they produce nothing. 108 | todo_include_todos = False 109 | 110 | 111 | # -- Options for HTML output ---------------------------------------------- 112 | 113 | # The theme to use for HTML and HTML Help pages. See the documentation for 114 | # a list of builtin themes. 115 | html_theme = 'alabaster' 116 | 117 | # Theme options are theme-specific and customize the look and feel of a theme 118 | # further. For a list of options available for each theme, see the 119 | # documentation. 120 | #html_theme_options = {} 121 | 122 | # Add any paths that contain custom themes here, relative to this directory. 123 | #html_theme_path = [] 124 | 125 | # The name for this set of Sphinx documents. If None, it defaults to 126 | # " v documentation". 127 | #html_title = None 128 | 129 | # A shorter title for the navigation bar. Default is the same as html_title. 130 | #html_short_title = None 131 | 132 | # The name of an image file (relative to this directory) to place at the top 133 | # of the sidebar. 134 | #html_logo = None 135 | 136 | # The name of an image file (within the static path) to use as favicon of the 137 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 138 | # pixels large. 139 | #html_favicon = None 140 | 141 | # Add any paths that contain custom static files (such as style sheets) here, 142 | # relative to this directory. They are copied after the builtin static files, 143 | # so a file named "default.css" will overwrite the builtin "default.css". 144 | html_static_path = ['_static'] 145 | 146 | # Add any extra paths that contain custom files (such as robots.txt or 147 | # .htaccess) here, relative to this directory. These files are copied 148 | # directly to the root of the documentation. 149 | #html_extra_path = [] 150 | 151 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 152 | # using the given strftime format. 153 | #html_last_updated_fmt = '%b %d, %Y' 154 | 155 | # If true, SmartyPants will be used to convert quotes and dashes to 156 | # typographically correct entities. 157 | #html_use_smartypants = True 158 | 159 | # Custom sidebar templates, maps document names to template names. 160 | #html_sidebars = {} 161 | 162 | # Additional templates that should be rendered to pages, maps page names to 163 | # template names. 164 | #html_additional_pages = {} 165 | 166 | # If false, no module index is generated. 167 | #html_domain_indices = True 168 | 169 | # If false, no index is generated. 170 | #html_use_index = True 171 | 172 | # If true, the index is split into individual pages for each letter. 173 | #html_split_index = False 174 | 175 | # If true, links to the reST sources are added to the pages. 176 | #html_show_sourcelink = True 177 | 178 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 179 | #html_show_sphinx = True 180 | 181 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 182 | #html_show_copyright = True 183 | 184 | # If true, an OpenSearch description file will be output, and all pages will 185 | # contain a tag referring to it. The value of this option must be the 186 | # base URL from which the finished HTML is served. 
187 | html_use_opensearch = 'https://duchesnay.github.io/pystatsml/' 188 | 189 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 190 | #html_file_suffix = None 191 | 192 | # Language to be used for generating the HTML full-text search index. 193 | # Sphinx supports the following languages: 194 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 195 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 196 | #html_search_language = 'en' 197 | 198 | # A dictionary with options for the search language support, empty by default. 199 | # Now only 'ja' uses this config value 200 | #html_search_options = {'type': 'default'} 201 | 202 | # The name of a javascript file (relative to the configuration directory) that 203 | # implements a search results scorer. If empty, the default will be used. 204 | #html_search_scorer = 'scorer.js' 205 | 206 | # Output file base name for HTML help builder. 207 | htmlhelp_basename = 'StatisticsMachineLearningPython' 208 | 209 | # -- Options for LaTeX output --------------------------------------------- 210 | 211 | latex_elements = { 212 | # The paper size ('letterpaper' or 'a4paper'). 213 | 'papersize': 'a4paper', 214 | 215 | # The font size ('10pt', '11pt' or '12pt'). 216 | #'pointsize': '10pt', 217 | 'pointsize': '11pt', 218 | # Additional stuff for the LaTeX preamble. 219 | # 'preamble': ''' 220 | # \\usepackage{amsfonts} 221 | # ''', 222 | 'preamble': r''' 223 | \usepackage{charter} 224 | \usepackage[defaultsans]{lato} 225 | \usepackage{inconsolata} 226 | ''', 227 | 228 | # Latex figure (float) alignment 229 | #'figure_align': 'htbp', 230 | } 231 | 232 | # Grouping the document tree into LaTeX files. List of tuples 233 | # (source start file, target name, title, 234 | # author, documentclass [howto, manual, or own class]). 235 | latex_documents = [ 236 | (master_doc, 'StatisticsMachineLearningPython.tex', u'Statistics and Machine Learning in Python', 237 | # (master_doc, 'StatisticsMachineLearningPython.tex', u'Python fundamentals and advanced', 238 | u'Edouard Duchesnay, Tommy Löfstedt, Feki Younes', 'manual'), 239 | ] 240 | 241 | # The name of an image file (relative to this directory) to place at the top of 242 | # the title page. 243 | #latex_logo = None 244 | 245 | # For "manual" documents, if this is true, then toplevel headings are parts, 246 | # not chapters. 247 | #latex_use_parts = False 248 | 249 | # If true, show page references after internal links. 250 | #latex_show_pagerefs = False 251 | 252 | # If true, show URL addresses after external links. 253 | # latex_show_urls = True 254 | 255 | # Documents to append as an appendix to all manuals. 256 | #latex_appendices = [] 257 | 258 | # If false, no module index is generated. 259 | #latex_domain_indices = True 260 | 261 | 262 | # -- Options for manual page output --------------------------------------- 263 | 264 | # One entry per manual page. List of tuples 265 | # (source start file, name, description, authors, manual section). 266 | man_pages = [ 267 | (master_doc, 'statisticsmachinelearning', u'Statistics and Machine Learning in Python', 268 | [author], 1) 269 | ] 270 | 271 | # If true, show URL addresses after external links. 272 | #man_show_urls = False 273 | 274 | 275 | # -- Options for Texinfo output ------------------------------------------- 276 | 277 | # Grouping the document tree into Texinfo files. 
List of tuples 278 | # (source start file, target name, title, author, 279 | # dir menu entry, description, category) 280 | texinfo_documents = [ 281 | (master_doc, 'StatisticsMachineLearningPython', u'Statistics and Machine Learning in Python', 282 | author, 'MachineLearning', 'One line description of project.', 283 | 'Miscellaneous'), 284 | ] 285 | 286 | # Documents to append as an appendix to all manuals. 287 | #texinfo_appendices = [] 288 | 289 | # If false, no module index is generated. 290 | #texinfo_domain_indices = True 291 | 292 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 293 | #texinfo_show_urls = 'footnote' 294 | 295 | # If true, do not generate a @detailmenu in the "Top" node's menu. 296 | #texinfo_no_detailmenu = False 297 | 298 | 299 | # -- Options for sphinx gallery ------------------------------------------- 300 | 301 | sphinx_gallery_conf = { 302 | # path to your examples scripts 303 | 'examples_dirs' : ['python_lang', 'scientific_python', 'statistics', 'machine_learning', 'labs'], 304 | 'filename_pattern': '/', 305 | # path where to save gallery generated examples 306 | 'gallery_dirs' : ['auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery', 'auto_gallery'], 307 | 'backreferences_dir': False} 308 | 309 | 310 | -------------------------------------------------------------------------------- /machine_learning/decomposition_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dimension reduction and feature extraction\n", 8 | "\n", 9 | "## Principal Component Analysis\n", 10 | "\n", 11 | "### Implement PCA\n", 12 | "\n", 13 | "- Write a class `BasicPCA` with two methods `fit(X)` that estimates the data mean and principal components directions. `transform(X)` that project a new the data into the principal components.\n", 14 | "\n", 15 | "- Check that your `BasicPCA` performed similarly to the one from sklearn:\n", 16 | "`from sklearn.decomposition import PCA`" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "execution": { 24 | "iopub.execute_input": "2020-10-11T22:53:14.585085Z", 25 | "iopub.status.busy": "2020-10-11T22:53:14.584709Z", 26 | "iopub.status.idle": "2020-10-11T22:53:15.274591Z", 27 | "shell.execute_reply": "2020-10-11T22:53:15.274226Z" 28 | } 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import numpy as np\n", 33 | "import scipy\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns\n", 36 | "%matplotlib inline\n", 37 | "#%matplotlib qt\n", 38 | "\n", 39 | "np.random.seed(42)\n", 40 | "\n", 41 | "\n", 42 | "import numpy as np\n", 43 | "from sklearn.decomposition import PCA\n", 44 | "\n", 45 | "\n", 46 | "class BasicPCA():\n", 47 | " def fit(self, X):\n", 48 | " # U : Unitary matrix having left singular vectors as columns.\n", 49 | " # Of shape (n_samples,n_samples) or (n_samples,n_comps), depending on\n", 50 | " # full_matrices.\n", 51 | " #\n", 52 | " # s : The singular values, sorted in non-increasing order. Of shape (n_comps,), \n", 53 | " # with n_comps = min(n_samples, n_features).\n", 54 | " #\n", 55 | " # Vh: Unitary matrix having right singular vectors as rows. 
\n", 56 | " # Of shape (n_features, n_features) or (n_comps, n_features) depending on full_matrices.\n", 57 | " self.mean = X.mean(axis=0)\n", 58 | " Xc = X - self.mean # Centering is required\n", 59 | " U, s, V = scipy.linalg.svd(Xc, full_matrices=False)\n", 60 | " self.explained_variance_ = (s ** 2) / X.shape[0]\n", 61 | " self.explained_variance_ratio_ = (self.explained_variance_ /\n", 62 | " self.explained_variance_.sum())\n", 63 | " self.princ_comp_dir = V\n", 64 | "\n", 65 | " def transform(self, X):\n", 66 | " Xc = X - self.mean\n", 67 | " return(np.dot(Xc, self.princ_comp_dir.T))\n", 68 | "\n", 69 | "# test\n", 70 | "np.random.seed(42)\n", 71 | " \n", 72 | "# dataset\n", 73 | "n_samples = 100\n", 74 | "experience = np.random.normal(size=n_samples)\n", 75 | "salary = 1500 + experience + np.random.normal(size=n_samples, scale=.5)\n", 76 | "X = np.column_stack([experience, salary])\n", 77 | "\n", 78 | "X = np.column_stack([experience, salary])\n", 79 | "pca = PCA(n_components=2)\n", 80 | "pca.fit(X)\n", 81 | "\n", 82 | "basic_pca = BasicPCA()\n", 83 | "basic_pca.fit(X)\n", 84 | "\n", 85 | "print(pca.explained_variance_ratio_)\n", 86 | "assert np.all(basic_pca.transform(X) == pca.transform(X))\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "### Apply PCA on iris dataset\n", 94 | "\n", 95 | "Apply your sklearn PCA on `iris` dataset available at: 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "execution": { 103 | "iopub.execute_input": "2020-10-11T22:53:15.278801Z", 104 | "iopub.status.busy": "2020-10-11T22:53:15.278467Z", 105 | "iopub.status.idle": "2020-10-11T22:53:16.236441Z", 106 | "shell.execute_reply": "2020-10-11T22:53:16.234869Z" 107 | } 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import matplotlib.pyplot as plt\n", 112 | "\n", 113 | "from sklearn.decomposition import PCA\n", 114 | "# https://tgmstat.wordpress.com/2013/11/28/computing-and-visualizing-pca-in-r/\n", 115 | "\n", 116 | "import numpy as np\n", 117 | "import pandas as pd\n", 118 | "\n", 119 | "try:\n", 120 | " salary = pd.read_csv('datasets/iris.csv')\n", 121 | "except:\n", 122 | " url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/iris.csv'\n", 123 | " df = pd.read_csv(url)\n", 124 | "\n", 125 | "print(df.head())" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Describe the data set. Should the dataset been standardized ?" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "execution": { 140 | "iopub.execute_input": "2020-10-11T22:53:16.256201Z", 141 | "iopub.status.busy": "2020-10-11T22:53:16.255386Z", 142 | "iopub.status.idle": "2020-10-11T22:53:16.269795Z", 143 | "shell.execute_reply": "2020-10-11T22:53:16.269211Z" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "print(df.describe())" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "Describe the structure of correlation among variables." 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "execution": { 163 | "iopub.execute_input": "2020-10-11T22:53:16.273240Z", 164 | "iopub.status.busy": "2020-10-11T22:53:16.272789Z", 165 | "iopub.status.idle": "2020-10-11T22:53:16.275060Z", 166 | "shell.execute_reply": "2020-10-11T22:53:16.274585Z" 167 | } 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "X = np.array(df.iloc[:, :4])\n", 172 | "#np.around(np.corrcoef(X.T), 3)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "execution": { 180 | "iopub.execute_input": "2020-10-11T22:53:16.279201Z", 181 | "iopub.status.busy": "2020-10-11T22:53:16.278783Z", 182 | "iopub.status.idle": "2020-10-11T22:53:16.283272Z", 183 | "shell.execute_reply": "2020-10-11T22:53:16.282896Z" 184 | } 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# Center and standardize\n", 189 | "\n", 190 | "X = np.array(df.iloc[:, :4])\n", 191 | "X -= np.mean(X, axis=0)\n", 192 | "X /= np.std(X, axis=0, ddof=1)\n", 193 | "np.around(np.corrcoef(X.T), 3)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Compute a PCA with the maximum number of components." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "execution": { 208 | "iopub.execute_input": "2020-10-11T22:53:16.286362Z", 209 | "iopub.status.busy": "2020-10-11T22:53:16.285897Z", 210 | "iopub.status.idle": "2020-10-11T22:53:16.288689Z", 211 | "shell.execute_reply": "2020-10-11T22:53:16.288349Z" 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "pca = PCA(n_components=X.shape[1])\n", 217 | "pca.fit(X)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "Retrieve the explained variance ratio. Determine $K$ the number of components." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "execution": { 232 | "iopub.execute_input": "2020-10-11T22:53:16.291425Z", 233 | "iopub.status.busy": "2020-10-11T22:53:16.291098Z", 234 | "iopub.status.idle": "2020-10-11T22:53:16.293764Z", 235 | "shell.execute_reply": "2020-10-11T22:53:16.294048Z" 236 | } 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "print(pca.explained_variance_ratio_)\n", 241 | "\n", 242 | "K = 2\n", 243 | "pca = PCA(n_components=X.shape[1])\n", 244 | "pca.fit(X)\n", 245 | "PC = pca.transform(X)\n", 246 | "#print(PC)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "Print the $K$ principal components direction and correlation of the $K$ principal\n", 254 | "components with original variables. 
Interpret the contribution of original variables\n", 255 | "into the PC.\n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "execution": { 263 | "iopub.execute_input": "2020-10-11T22:53:16.297928Z", 264 | "iopub.status.busy": "2020-10-11T22:53:16.297500Z", 265 | "iopub.status.idle": "2020-10-11T22:53:16.302829Z", 266 | "shell.execute_reply": "2020-10-11T22:53:16.302482Z" 267 | } 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "print(pca.components_)\n", 272 | "CorPC = pd.DataFrame(\n", 273 | " [[np.corrcoef(X[:, j], PC[:, k])[0, 1] for j in range(X.shape[1])]\n", 274 | " for k in range(K)],\n", 275 | " columns = df.columns[:4],\n", 276 | " index = [\"PC %i\"%k for k in range(K)]\n", 277 | ")\n", 278 | "\n", 279 | "print(CorPC)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Plot samples projected into the $K$ first PCs. Color samples with their species." 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "execution": { 294 | "iopub.execute_input": "2020-10-11T22:53:16.316818Z", 295 | "iopub.status.busy": "2020-10-11T22:53:16.316510Z", 296 | "iopub.status.idle": "2020-10-11T22:53:16.396495Z", 297 | "shell.execute_reply": "2020-10-11T22:53:16.396182Z" 298 | } 299 | }, 300 | "outputs": [], 301 | "source": [ 302 | "colors = {'setosa':'r', 'versicolor':'g', 'virginica':'blue'}\n", 303 | "print(df[\"species\"].unique())\n", 304 | "#plt.scatter(df['experience'], df['salary'], c=df['education'].apply(lambda x: colors[x]), s=100)\n", 305 | "plt.scatter(PC[:, 0], PC[:, 1], c=df[\"species\"].apply(lambda x: colors[x]))\n", 306 | "plt.xlabel(\"PC1\")\n", 307 | "plt.ylabel(\"PC2\")" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Pairewise plot" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "execution": { 322 | "iopub.execute_input": "2020-10-11T22:53:16.442119Z", 323 | "iopub.status.busy": "2020-10-11T22:53:16.441495Z", 324 | "iopub.status.idle": "2020-10-11T22:53:23.105722Z", 325 | "shell.execute_reply": "2020-10-11T22:53:23.106018Z" 326 | } 327 | }, 328 | "outputs": [], 329 | "source": [ 330 | "import seaborn as sns\n", 331 | "\n", 332 | "df[\"PC1\"] = PC[:, 0]\n", 333 | "df[\"PC2\"] = PC[:, 1]\n", 334 | "\n", 335 | "ax = sns.pairplot(df, hue=\"species\")" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "anaconda-cloud": {}, 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.7.9" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /scientific_python/scipy_matplotlib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data visualization: matplotlib & seaborn \n", 8 | "\n", 9 | "\n", 
10 | "## Basic plots" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "execution": { 18 | "iopub.execute_input": "2020-10-11T22:54:06.283262Z", 19 | "iopub.status.busy": "2020-10-11T22:54:06.281496Z", 20 | "iopub.status.idle": "2020-10-11T22:54:06.619890Z", 21 | "shell.execute_reply": "2020-10-11T22:54:06.619484Z" 22 | } 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "import seaborn as sns\n", 29 | "\n", 30 | "# inline plot (for jupyter)\n", 31 | "%matplotlib inline\n", 32 | "\n", 33 | "plt.figure(figsize=(9, 3))\n", 34 | "x = np.linspace(0, 10, 50)\n", 35 | "sinus = np.sin(x)\n", 36 | "\n", 37 | "plt.plot(x, sinus)\n", 38 | "plt.show()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "execution": { 46 | "iopub.execute_input": "2020-10-11T22:54:06.631218Z", 47 | "iopub.status.busy": "2020-10-11T22:54:06.630138Z", 48 | "iopub.status.idle": "2020-10-11T22:54:06.715538Z", 49 | "shell.execute_reply": "2020-10-11T22:54:06.715894Z" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "plt.figure(figsize=(9, 3))\n", 55 | "\n", 56 | "plt.plot(x, sinus, \"o\")\n", 57 | "plt.show()\n", 58 | "# use plt.plot to get color / marker abbreviations" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "execution": { 66 | "iopub.execute_input": "2020-10-11T22:54:06.728139Z", 67 | "iopub.status.busy": "2020-10-11T22:54:06.727746Z", 68 | "iopub.status.idle": "2020-10-11T22:54:06.834198Z", 69 | "shell.execute_reply": "2020-10-11T22:54:06.833848Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Rapid multiplot\n", 75 | "\n", 76 | "plt.figure(figsize=(9, 3))\n", 77 | "cosinus = np.cos(x)\n", 78 | "plt.plot(x, sinus, \"-b\", x, sinus, \"ob\", x, cosinus, \"-r\", x, cosinus, \"or\")\n", 79 | "plt.xlabel('this is x!')\n", 80 | "plt.ylabel('this is y!')\n", 81 | "plt.title('My First Plot')\n", 82 | "plt.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "execution": { 90 | "iopub.execute_input": "2020-10-11T22:54:06.847651Z", 91 | "iopub.status.busy": "2020-10-11T22:54:06.846622Z", 92 | "iopub.status.idle": "2020-10-11T22:54:06.953662Z", 93 | "shell.execute_reply": "2020-10-11T22:54:06.953293Z" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Step by step\n", 99 | "\n", 100 | "plt.figure(figsize=(9, 3))\n", 101 | "plt.plot(x, sinus, label='sinus', color='blue', linestyle='--', linewidth=2)\n", 102 | "plt.plot(x, cosinus, label='cosinus', color='red', linestyle='-', linewidth=2)\n", 103 | "plt.legend()\n", 104 | "plt.show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Scatter (2D) plots\n", 112 | "\n", 113 | "Load dataset" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "execution": { 121 | "iopub.execute_input": "2020-10-11T22:54:06.956572Z", 122 | "iopub.status.busy": "2020-10-11T22:54:06.956237Z", 123 | "iopub.status.idle": "2020-10-11T22:54:07.103716Z", 124 | "shell.execute_reply": "2020-10-11T22:54:07.103342Z" 125 | } 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "import pandas as pd\n", 130 | "try:\n", 131 | " salary = pd.read_csv(\"../datasets/salary_table.csv\")\n", 132 | "except:\n", 133 | " url = 
'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'\n", 134 | " salary = pd.read_csv(url)\n", 135 | "\n", 136 | "df = salary\n", 137 | "print(df.head())" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### Simple scatter with colors" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "plt.figure(figsize=(3, 3), dpi=100)\n", 154 | "_ = sns.scatterplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Legend outside" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "### Linear model" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "ax = sns.lmplot(x=\"experience\", y=\"salary\", hue=\"education\", data=salary)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### Scatter plot with colors and symbols" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "ax = sns.relplot(x=\"experience\", y=\"salary\", hue=\"education\", style='management', data=salary)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## Saving Figures" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "execution": { 217 | "iopub.execute_input": "2020-10-11T22:54:07.420427Z", 218 | "iopub.status.busy": "2020-10-11T22:54:07.419445Z", 219 | "iopub.status.idle": "2020-10-11T22:54:07.649956Z", 220 | "shell.execute_reply": "2020-10-11T22:54:07.649633Z" 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "### bitmap format\n", 226 | "plt.plot(x, sinus)\n", 227 | "plt.savefig(\"sinus.png\")\n", 228 | "plt.close()\n", 229 | "\n", 230 | "# Prefer vectorial format (SVG: Scalable Vector Graphics) can be edited with \n", 231 | "# Inkscape, Adobe Illustrator, Blender, etc.\n", 232 | "plt.plot(x, sinus)\n", 233 | "plt.savefig(\"sinus.svg\")\n", 234 | "plt.close()\n", 235 | "\n", 236 | "# Or pdf\n", 237 | "plt.plot(x, sinus)\n", 238 | "plt.savefig(\"sinus.pdf\")\n", 239 | "plt.close()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "### Boxplot and violin plot: one factor\n", 247 | "\n", 248 | "Box plots are non-parametric: they display variation in samples of a statistical population without making any assumptions of the underlying statistical distribution.\n", 249 | "\n", 250 | "![title](images/boxplot.png){width=7cm}" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "ax = sns.boxplot(x=\"management\", y=\"salary\", data=salary)\n", 260 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"black\")" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | 
"outputs": [], 268 | "source": [ 269 | "ax = sns.violinplot(x=\"management\", y=\"salary\", data=salary)\n", 270 | "ax = sns.stripplot(x=\"management\", y=\"salary\", data=salary, jitter=True, color=\"white\")" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### Boxplot and violin plot: two factors" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "ax = sns.boxplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n", 287 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "execution": { 295 | "iopub.execute_input": "2020-10-11T22:54:07.652516Z", 296 | "iopub.status.busy": "2020-10-11T22:54:07.652175Z", 297 | "iopub.status.idle": "2020-10-11T22:54:08.055323Z", 298 | "shell.execute_reply": "2020-10-11T22:54:08.054906Z" 299 | } 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "ax = sns.violinplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary)\n", 304 | "ax = sns.stripplot(x=\"management\", y=\"salary\", hue=\"education\", data=salary, jitter=True, dodge=True, linewidth=1)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "### Distributions and density plot\n", 312 | "\n", 313 | "[Distributions with seaborn](https://seaborn.pydata.org/tutorial/distributions.html)\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "ax = sns.displot(x=\"salary\", hue=\"management\", kind=\"kde\", data=salary, fill=True)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "## Multiple axis" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "fig, axes = plt.subplots(3, 1, figsize=(9, 9), sharex=True)\n", 339 | "\n", 340 | "i = 0\n", 341 | "for edu, d in salary.groupby(['education']):\n", 342 | " sns.kdeplot(x=\"salary\", hue=\"management\", data=d, fill=True, ax=axes[i], palette=\"muted\")\n", 343 | " axes[i].set_title(edu)\n", 344 | " i += 1" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## Pairwise scatter plots" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "ax = sns.pairplot(salary, hue=\"management\")" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## Time series" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "execution": { 375 | "iopub.execute_input": "2020-10-11T22:54:10.349932Z", 376 | "iopub.status.busy": "2020-10-11T22:54:10.349585Z", 377 | "iopub.status.idle": "2020-10-11T22:54:11.426751Z", 378 | "shell.execute_reply": "2020-10-11T22:54:11.426337Z" 379 | } 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "import seaborn as sns\n", 384 | "sns.set(style=\"darkgrid\")\n", 385 | "\n", 386 | "# Load an example dataset with long-form data\n", 387 | "fmri = sns.load_dataset(\"fmri\")\n", 388 | "\n", 389 | "# Plot the responses for different events and regions\n", 
390 | "ax = sns.pointplot(x=\"timepoint\", y=\"signal\",\n", 391 | " hue=\"region\", style=\"event\",\n", 392 | " data=fmri)" 393 | ] 394 | } 395 | ], 396 | "metadata": { 397 | "anaconda-cloud": {}, 398 | "kernelspec": { 399 | "display_name": "Python 3", 400 | "language": "python", 401 | "name": "python3" 402 | }, 403 | "language_info": { 404 | "codemirror_mode": { 405 | "name": "ipython", 406 | "version": 3 407 | }, 408 | "file_extension": ".py", 409 | "mimetype": "text/x-python", 410 | "name": "python", 411 | "nbconvert_exporter": "python", 412 | "pygments_lexer": "ipython3", 413 | "version": "3.7.9" 414 | } 415 | }, 416 | "nbformat": 4, 417 | "nbformat_minor": 2 418 | } 419 | -------------------------------------------------------------------------------- /utils/time_series.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Time Series in python 3 | 4 | Two libraries: 5 | 6 | - Pandas: https://pandas.pydata.org/pandas-docs/stable/timeseries.html 7 | - scipy http://www.statsmodels.org/devel/tsa.html 8 | ''' 9 | 10 | ''' 11 | ## Stationarity 12 | 13 | A TS is said to be stationary if its statistical properties such as mean, variance remain constant over time. 14 | 15 | - constant mean 16 | - constant variance 17 | - an autocovariance that does not depend on time. 18 | 19 | what is making a TS non-stationary. There are 2 major reasons behind non-stationaruty of a TS: 20 | 21 | 1. Trend – varying mean over time. For eg, in this case we saw that on average, the number of passengers was growing over time. 22 | 23 | 2. Seasonality – variations at specific time-frames. eg people might have a tendency to buy cars in a particular month because of pay increment or festivals. 24 | ''' 25 | 26 | ''' 27 | ## Pandas Time Series Data Structure 28 | 29 | A Series is similar to a list or an array in Python. 30 | It represents a series of values (numeric or otherwise) such as a column of data. 31 | It provides additional functionality, methods, and operators, which make it a more powerful version of a list. 32 | ''' 33 | 34 | import pandas as pd 35 | import numpy as np 36 | 37 | # Create a Series from a list 38 | ser = pd.Series([1, 3]) 39 | print(ser) 40 | 41 | # String as index 42 | prices = {'apple': 4.99, 43 | 'banana': 1.99, 44 | 'orange': 3.99} 45 | ser = pd.Series(prices) 46 | print(ser) 47 | 48 | x = pd.Series(np.arange(1,3), index=[x for x in 'ab']) 49 | print(x) 50 | print(x['b']) 51 | 52 | ''' 53 | ## Time Series Analysis of Google Trends 54 | 55 | source: https://www.datacamp.com/community/tutorials/time-series-analysis-tutorial 56 | 57 | Get Google Trends data of keywords such as 'diet' and 'gym' and see how they vary over time while learning about trends and seasonality in time series data. 58 | 59 | In the Facebook Live code along session on the 4th of January, we checked out Google trends data of keywords 'diet', 'gym' and 'finance' to see how they vary over time. We asked ourselves if there could be more searches for these terms in January when we're all trying to turn over a new leaf? 60 | 61 | In this tutorial, you'll go through the code that we put together during the session step by step. 
You're not going to do much mathematics but you are going to do the following:
62 | 
63 | - Read data
64 | - Recode data
65 | - Exploratory Data Analysis
66 | 
67 | '''
68 | 
69 | 
70 | '''
71 | ## Read data
72 | '''
73 | 
74 | import numpy as np
75 | import pandas as pd
76 | import matplotlib.pyplot as plt
77 | import seaborn as sns
78 | 
79 | # Plots appear in their own window with the Qt backend (uncomment in an IPython console):
80 | # %matplotlib qt
81 | # Tools / Preferences / Ipython Console / Graphics / Graphics Backend / Backend: “automatic”
82 | # Interactive Matplotlib Jupyter Notebook
83 | # %matplotlib inline
84 | 
85 | try:
86 |     url = "https://raw.githubusercontent.com/datacamp/datacamp_facebook_live_ny_resolution/master/data/multiTimeline.csv"
87 |     df = pd.read_csv(url, skiprows=2)
88 | except:
89 |     df = pd.read_csv("../datasets/multiTimeline.csv", skiprows=2)
90 | 
91 | print(df.head())
92 | 
93 | # Rename columns
94 | df.columns = ['month', 'diet', 'gym', 'finance']
95 | 
96 | # Describe
97 | print(df.describe())
98 | 
99 | '''
100 | ## Recode data
101 | 
102 | Next, you'll turn the 'month' column into a DateTime data type and make it the index of the DataFrame.
103 | 
104 | Note that you do this because you saw in the result of the .info() method that the 'Month' column was actually of data type object. Now, that generic data type encapsulates everything from strings to integers, etc. That's not exactly what you want when you want to be looking at time series data. That's why you'll use .to_datetime() to convert the 'month' column in your DataFrame to a DateTime.
105 | 
106 | Be careful! Make sure to include the inplace argument when you're setting the index of the DataFrame df so that you actually alter the original index and set it to the 'month' column.
107 | '''
108 | df.month = pd.to_datetime(df.month)
109 | df.set_index('month', inplace=True)
110 | 
111 | print(df.head())
112 | 
113 | '''
114 | ## Exploratory Data Analysis
115 | 
116 | You can use a built-in pandas visualization method .plot() to plot your
117 | data as 3 line plots on a single
118 | figure (one for each column, namely, 'diet', 'gym', and 'finance').
119 | '''
120 | df.plot()
121 | plt.xlabel('Year');
122 | 
123 | # change figure parameters
124 | # df.plot(figsize=(20,10), linewidth=5, fontsize=20)
125 | 
126 | # Plot single column
127 | # df[['diet']].plot(figsize=(20,10), linewidth=5, fontsize=20)
128 | # plt.xlabel('Year', fontsize=20);
129 | 
130 | '''
131 | Note that this data is relative. As you can read on Google trends:
132 | 
133 | Numbers represent search interest relative to the highest point on the chart
134 | for the given region and time.
135 | A value of 100 is the peak popularity for the term.
136 | A value of 50 means that the term is half as popular.
137 | Likewise a score of 0 means the term was less than 1% as popular as the peak.
138 | 
139 | '''
140 | 
141 | 
142 | '''
143 | ## Resampling, Smoothing, Windowing, Rolling average: Trends
144 | 
145 | Rolling average: for each time point, take the average of the points on either side of it.
146 | Note that the number of points is specified by a window size.
147 | 
148 | Remove Seasonality with pandas Series.
149 | 
150 | See: http://pandas.pydata.org/pandas-docs/stable/timeseries.html
151 | 'A' is the alias for year-end (annual) frequency.
152 | '''
153 | diet = df['diet']
154 | 
155 | diet_resamp_yr = diet.resample('A').mean()
156 | diet_roll_yr = diet.rolling(12).mean()
157 | 
158 | ax = diet.plot(alpha=0.5, style='-') # store axis (ax) for later plots
159 | diet_resamp_yr.plot(style=':', label='Resample at year frequency', ax=ax)
160 | diet_roll_yr.plot(style='--', label='Rolling average (smooth), window size=12', ax=ax)
161 | ax.legend()
162 | 
163 | 
164 | '''
165 | Rolling average (smoothing) with Numpy
166 | '''
167 | 
168 | x = np.asarray(df[['diet']])
169 | win = 12
170 | win_half = int(win / 2)
171 | # print([((idx-win_half), (idx+win_half)) for idx in np.arange(win_half, len(x))])
172 | 
173 | diet_smooth = np.array([x[(idx-win_half):(idx+win_half)].mean() for idx in np.arange(win_half, len(x))])
174 | plt.plot(diet_smooth)
175 | 
176 | '''
177 | Trends: plot Diet and Gym
178 | 
179 | Build a new DataFrame which is the concatenation of the diet and gym smoothed data
180 | '''
181 | gym = df['gym']
182 | 
183 | df_avg = pd.concat([diet.rolling(12).mean(), gym.rolling(12).mean()], axis=1)
184 | df_avg.plot()
185 | plt.xlabel('Year')
186 | 
187 | '''
188 | Detrending
189 | '''
190 | 
191 | df_dtrend = df[["diet", "gym"]] - df_avg
192 | df_dtrend.plot()
193 | plt.xlabel('Year')
194 | 
195 | '''
196 | ## First-order differencing: Seasonal Patterns
197 | 
198 | '''
199 | 
200 | # diff = original - shifted data
201 | # (exclude first term for some implementation details)
202 | assert np.all((diet.diff() == diet - diet.shift())[1:])
203 | 
204 | df.diff().plot()
205 | plt.xlabel('Year')
206 | 
207 | '''
208 | ## Periodicity and Correlation
209 | '''
210 | 
211 | df.plot()
212 | plt.xlabel('Year');
213 | print(df.corr())
214 | 
215 | '''
216 | Plot correlation matrix
217 | '''
218 | 
219 | sns.heatmap(df.corr(), cmap="coolwarm")
220 | 
221 | 
222 | '''
223 | 'diet' and 'gym' are negatively correlated!
224 | Remember that you have a seasonal and a trend component.
225 | From the correlation coefficient, 'diet' and 'gym' are negatively correlated:
226 | 
227 | - the trend components are negatively correlated,
228 | - while the seasonal components would be positively correlated.
229 | 
230 | The overall correlation coefficient captures both of these effects.
231 | 
232 | Seasonal correlation: correlation of the first-order differences of these time series
233 | '''
234 | 
235 | df.diff().plot()
236 | plt.xlabel('Year');
237 | 
238 | print(df.diff().corr())
239 | 
240 | '''
241 | Plot correlation matrix
242 | '''
243 | 
244 | sns.heatmap(df.diff().corr(), cmap="coolwarm")
245 | 
246 | '''
247 | Decomposing a time series into trend, seasonality and residuals
248 | '''
249 | 
250 | from statsmodels.tsa.seasonal import seasonal_decompose
251 | 
252 | x = gym
253 | 
254 | x = x.astype(float) # force float
255 | decomposition = seasonal_decompose(x)
256 | trend = decomposition.trend
257 | seasonal = decomposition.seasonal
258 | residual = decomposition.resid
259 | 
260 | plt.subplot(411)
261 | plt.plot(x, label='Original')
262 | plt.legend(loc='best')
263 | plt.subplot(412)
264 | plt.plot(trend, label='Trend')
265 | plt.legend(loc='best')
266 | plt.subplot(413)
267 | plt.plot(seasonal, label='Seasonality')
268 | plt.legend(loc='best')
269 | plt.subplot(414)
270 | plt.plot(residual, label='Residuals')
271 | plt.legend(loc='best')
272 | plt.tight_layout()
273 | 
274 | 
275 | '''
276 | ## Autocorrelation
277 | 
278 | A time series is periodic if it repeats itself at equally spaced intervals, say, every 12 months.
279 | Autocorrelation Function (ACF): a measure of the correlation of the TS with a
280 | lagged version of itself. For instance at lag 5, ACF would compare the series at time instants t1...t2
281 | with the series at instants t1-5...t2-5 (t1-5 and t2 being end points).
282 | 
283 | Plot
284 | '''
285 | # pandas.tools.plotting was removed in recent pandas versions; use pandas.plotting:
286 | from pandas.plotting import autocorrelation_plot
287 | 
288 | x = df["diet"].astype(float)
289 | autocorrelation_plot(x)
290 | 
291 | '''
292 | Compute Autocorrelation Function (ACF)
293 | '''
294 | 
295 | from statsmodels.tsa.stattools import acf
296 | 
297 | x_diff = x.diff().dropna() # first item is NA
298 | lag_acf = acf(x_diff, nlags=36)
299 | plt.plot(lag_acf)
300 | plt.title('Autocorrelation Function')
301 | 
302 | '''
303 | ACF peaks every 12 months: the time series is correlated with itself shifted by 12 months.
304 | '''
305 | 
306 | '''
307 | ## Time Series Forecasting with Python using Autoregressive Moving Average (ARMA) models
308 | 
309 | Source:
310 | 
311 | - https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783553358/7/ch07lvl1sec77/arma-models
312 | 
313 | - http://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model
314 | 
315 | - ARIMA: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
316 | 
317 | ARMA models are often used to forecast a time series.
318 | These models combine autoregressive and moving average models.
319 | In moving average models, we assume that a variable is the sum of the mean of the
320 | time series and a linear combination of noise components.
321 | 
322 | The autoregressive and moving average models can have different orders. In general, we can define an ARMA model with p autoregressive terms and q moving average terms as follows:
323 | 
324 | $$
325 | x_t = \sum_{i=1}^p a_i x_{t-i} + \sum_{i=1}^q b_i \varepsilon_{t-i} + \varepsilon_t
326 | $$
327 | '''
328 | 
329 | '''
330 | ### Choosing p and q
331 | 
332 | Plot the partial autocorrelation functions for an estimate of p, and likewise use the autocorrelation functions for an estimate of q.
333 | 
334 | Partial Autocorrelation Function (PACF): this measures the correlation of the TS with a lagged version of itself, but after eliminating the variations already explained by the intervening comparisons. E.g., at lag 5, it will check the correlation but remove the effects already explained by lags 1 to 4.
335 | '''
336 | from statsmodels.tsa.stattools import acf, pacf
337 | 
338 | x = df["gym"].astype(float)
339 | 
340 | x_diff = x.diff().dropna() # first item is NA
341 | # ACF and PACF plots:
342 | 
343 | lag_acf = acf(x_diff, nlags=20)
344 | lag_pacf = pacf(x_diff, nlags=20, method='ols')
345 | 
346 | # Plot ACF:
347 | plt.subplot(121)
348 | plt.plot(lag_acf)
349 | plt.axhline(y=0, linestyle='--', color='gray')
350 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)), linestyle='--', color='gray')
351 | plt.axhline(y=1.96/np.sqrt(len(x_diff)), linestyle='--', color='gray')
352 | plt.title('Autocorrelation Function (q=1)')
353 | 
354 | # Plot PACF:
355 | plt.subplot(122)
356 | plt.plot(lag_pacf)
357 | plt.axhline(y=0, linestyle='--', color='gray')
358 | plt.axhline(y=-1.96/np.sqrt(len(x_diff)), linestyle='--', color='gray')
359 | plt.axhline(y=1.96/np.sqrt(len(x_diff)), linestyle='--', color='gray')
360 | plt.title('Partial Autocorrelation Function (p=1)')
361 | plt.tight_layout()
362 | 
363 | '''
364 | In this plot, the two dotted lines on either side of 0 are the confidence intervals.
365 | These can be used to determine the p and q values as:
366 | 
367 | - p: the lag value where the PACF chart crosses the upper confidence interval for the first time, in this case p=1.
368 | 
369 | - q: the lag value where the ACF chart crosses the upper confidence interval for the first time, in this case q=1.
370 | '''
371 | 
372 | '''
373 | ### Fit ARMA model with statsmodels
374 | 
375 | 1. Define the model by calling `ARMA()` and passing in the p and q parameters.
376 | 
377 | 2. The model is prepared on the training data by calling the `fit()` function.
378 | 
379 | 3. Predictions can be made by calling the `predict()` function and specifying the index of the time or times to be predicted.
380 | '''
381 | 
382 | from statsmodels.tsa.arima_model import ARMA
383 | 
384 | 
385 | model = ARMA(x, order=(1, 1)).fit()  # fit model
386 | 
387 | print(model.summary())
388 | plt.plot(x)
389 | plt.plot(model.predict(), color='red')
390 | plt.title('RSS: %.4f' % sum((model.fittedvalues - x)**2))
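'''
Note: recent versions of statsmodels removed the `ARMA` class; an ARMA(p, q)
model can be fitted as `ARIMA(order=(p, 0, q))` from
`statsmodels.tsa.arima.model`. Below is a minimal sketch of the equivalent fit
and an out-of-sample forecast (the 12-step horizon is an arbitrary
illustrative choice), kept commented out since it requires statsmodels >= 0.12:
'''

# from statsmodels.tsa.arima.model import ARIMA
#
# arma = ARIMA(x, order=(1, 0, 1)).fit()
# print(arma.forecast(steps=12))  # forecast the next 12 months

--------------------------------------------------------------------------------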