├── 1. An introduction to Machine Learning with Scikit-Learn.ipynb ├── 2. Robust and calibrated estimators with Scikit-Learn.ipynb ├── README.md ├── environment.yml ├── img ├── bv.png ├── classifiers.png ├── cross-validation.png ├── forest.png ├── ipython-logo.jpg ├── kfold.jpg ├── matplotlib-logo.png ├── motivation.png ├── numpy-logo.png ├── pandas-logo.png ├── scikit-learn-logo.png ├── scipy-logo.png ├── tree-partition.png └── tree-simple.png ├── robustness.py └── tutorial.py /README.md: -------------------------------------------------------------------------------- 1 | # Scikit-Learn tutorials 2 | 3 | 1. Tutorial on machine learning and Scikit-Learn (beginner level). 4 | 2. Tutorial on robust and calibrated estimators with Scikit-Learn (mid level) 5 | 6 | Contact: @glouppe | BSD 3-clause license 7 | 8 | ## Installation instructions 9 | 10 | 1) [Download](https://www.continuum.io/downloads) and install the latest Anaconda distribution, coming with Python 3.5 and the full scientific Python stack. 11 | 12 | 2) Install dependencies: 13 | ``` 14 | conda install numpy scipy scikit-learn jupyter matplotlib 15 | ``` 16 | 17 | 3) Clone this repository and start Jupyter 18 | ``` 19 | git clone https://github.com/glouppe/tutorial-scikit-learn.git 20 | cd tutorial-scikit-learn 21 | jupyter notebook 22 | ``` 23 | 24 | ## Launch on Binder without installing anything! 
25 | [![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/glouppe/tutorial-scikit-learn) 26 | 27 | 28 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: sklearn-tutorial 2 | 3 | dependencies: 4 | - python=3.5 5 | - numpy 6 | - scipy 7 | - matplotlib 8 | - pandas 9 | - scikit-learn 10 | -------------------------------------------------------------------------------- /img/bv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/bv.png -------------------------------------------------------------------------------- /img/classifiers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/classifiers.png -------------------------------------------------------------------------------- /img/cross-validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/cross-validation.png -------------------------------------------------------------------------------- /img/forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/forest.png -------------------------------------------------------------------------------- /img/ipython-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/ipython-logo.jpg 
-------------------------------------------------------------------------------- /img/kfold.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/kfold.jpg -------------------------------------------------------------------------------- /img/matplotlib-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/matplotlib-logo.png -------------------------------------------------------------------------------- /img/motivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/motivation.png -------------------------------------------------------------------------------- /img/numpy-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/numpy-logo.png -------------------------------------------------------------------------------- /img/pandas-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/pandas-logo.png -------------------------------------------------------------------------------- /img/scikit-learn-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/scikit-learn-logo.png -------------------------------------------------------------------------------- /img/scipy-logo.png: 
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/scipy-logo.png
# --------------------------------------------------------------------------------
# /img/tree-partition.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/tree-partition.png
# --------------------------------------------------------------------------------
# /img/tree-simple.png:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/glouppe/tutorials-scikit-learn/04963eb06f090413e801e8132ec36a7f95685b68/img/tree-simple.png
# --------------------------------------------------------------------------------
# /robustness.py:
# --------------------------------------------------------------------------------
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from scipy import stats
from sklearn.tree import DecisionTreeClassifier


def plot_surface(model, X, y):
    """Plot the decision surface of a tree or of an ensemble of trees.

    A ``DecisionTreeClassifier`` is drawn as one filled contour plot.  Any
    other model must expose ``estimators_``; each sub-estimator is drawn as a
    translucent layer whose alpha is ``1 / len(model.estimators_)``.  The
    model's own predictions are then overlaid on a coarser grid, followed by
    the samples of classes 0, 1 and 2 in red, yellow and blue.

    Parameters
    ----------
    model : fitted estimator
        Either a ``DecisionTreeClassifier`` or an object with ``estimators_``
        and ``predict``.
    X : array-like
        Samples; only columns 0 and 1 are used (as the x and y axes).
    y : array-like
        Class label of each row of ``X``; only labels 0, 1 and 2 are drawn.
    """
    n_classes = 3
    plot_colors = "ryb"
    cmap = plt.cm.RdYlBu
    plot_step = 0.02
    plot_step_coarser = 0.5

    X = np.asarray(X)
    y = np.asarray(y)

    # Pad the plotted area by one unit around the data on every side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    # Built once and reused for every estimator instead of once per tree.
    grid = np.c_[xx.ravel(), yy.ravel()]

    if isinstance(model, DecisionTreeClassifier):
        Z = model.predict(grid).reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=cmap)
    else:
        # Stack one translucent layer per sub-estimator so that the overall
        # opacity reflects how many of them agree.
        estimator_alpha = 1.0 / len(model.estimators_)
        for tree in model.estimators_:
            Z = tree.predict(grid).reshape(xx.shape)
            plt.contourf(xx, yy, Z, alpha=estimator_alpha, cmap=cmap)

    # Overlay the full model's predictions on a coarser grid.
    xx_coarser, yy_coarser = np.meshgrid(
        np.arange(x_min, x_max, plot_step_coarser),
        np.arange(y_min, y_max, plot_step_coarser))
    coarse_grid = np.c_[xx_coarser.ravel(), yy_coarser.ravel()]
    Z_points_coarser = model.predict(coarse_grid).reshape(xx_coarser.shape)
    plt.scatter(xx_coarser, yy_coarser, s=15,
                c=Z_points_coarser, cmap=cmap, edgecolors="none")

    # Draw the samples class by class.  A boolean mask keeps the selected
    # coordinates one-dimensional, and no colormap is passed because each
    # class uses a single named colour.
    for label, color in zip(range(n_classes), plot_colors):
        mask = y == label
        plt.scatter(X[mask, 0], X[mask, 1], c=color)

    plt.show()


def plot_outlier_detector(clf, X, ground_truth):
    """Plot an outlier detector's decision function and the true labels.

    The decision threshold is the percentile of ``clf.decision_function(X)``
    that matches the fraction of entries equal to 0 in ``ground_truth``.
    Regions below the threshold are shaded in blue, the threshold contour is
    drawn in red and the region above it is filled in orange.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``decision_function``.
    X : array-like
        Samples; only columns 0 and 1 are plotted.  The last ``n_outliers``
        rows are drawn as outliers and the preceding rows as inliers.
        NOTE(review): this relies on the caller ordering ``X`` with outliers
        last - confirm against the calling notebook.
    ground_truth : array-like
        One label per sample; entries equal to 0 are counted as outliers.
    """
    X = np.asarray(X)
    ground_truth = np.asarray(ground_truth)

    n_outliers = int((ground_truth == 0).sum())
    # Slice by inlier count: ``X[:-n_outliers]`` is empty when there are no
    # outliers, which would draw every sample as an outlier.
    n_inliers = len(X) - n_outliers
    outliers_fraction = float(n_outliers) / len(ground_truth)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                         np.linspace(y_min, y_max, 500))

    scores = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(scores, 100 * outliers_fraction)

    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                 cmap=plt.cm.Blues_r)
    plt.contour(xx, yy, Z, levels=[threshold],
                linewidths=2, colors='red')
    plt.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange')
    inliers = plt.scatter(X[:n_inliers, 0], X[:n_inliers, 1], c='white')
    outliers = plt.scatter(X[n_inliers:, 0], X[n_inliers:, 1], c='black')

    # Use a proxy artist for the legend rather than ``ContourSet.collections``
    # so the legend does not depend on that attribute being available.
    boundary = Line2D([], [], color='red', linewidth=2)
    plt.legend(
        [boundary, inliers, outliers],
        ['Learned decision function', 'True inliers', 'True outliers'])
    plt.show()


# --------------------------------------------------------------------------------
# /tutorial.py:
# --------------------------------------------------------------------------------
from matplotlib import pyplot as plt
import numpy as np


def _decision_scores(clf, X):
    """Return ``clf.decision_function(X)``, or column 1 of ``predict_proba``."""
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    return clf.predict_proba(X)[:, 1]


def _select_axes(subplot):
    """Open a new figure, or activate the subplot given as a tuple."""
    if subplot is None:
        plt.figure()
    else:
        plt.subplot(*subplot)


def plot_surface(clf, X, y,
                 xlim=(-10, 10), ylim=(-10, 10), n_steps=250,
                 subplot=None, show=True):
    """Plot a classifier's score surface with the samples on top.

    The score is ``decision_function`` when the classifier has one, otherwise
    column 1 of ``predict_proba``.

    Parameters
    ----------
    clf : fitted classifier
    X : array
        Samples; columns 0 and 1 are used as plot coordinates.
    y : array-like
        Per-sample colours, passed to ``plt.scatter`` as ``c``.
    xlim, ylim : tuple of two numbers
        Bounds of the evaluated grid and of the axes.
    n_steps : int
        Number of grid points along each axis.
    subplot : tuple or None
        Arguments for ``plt.subplot``; a new figure is created when None.
    show : bool
        Whether to call ``plt.show()`` at the end.
    """
    _select_axes(subplot)

    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], n_steps),
                         np.linspace(ylim[0], ylim[1], n_steps))

    z = _decision_scores(clf, np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)

    plt.contourf(xx, yy, z, alpha=0.8, cmap=plt.cm.RdBu_r)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.xlim(*xlim)
    plt.ylim(*ylim)

    if show:
        plt.show()


def plot_histogram(clf, X, y, subplot=None, show=True):
    """Plot normalised histograms of the scores for labels "b" and "r".

    Parameters
    ----------
    clf : fitted classifier
        Scored with ``decision_function`` when available, otherwise with
        column 1 of ``predict_proba``.
    X : array
        Samples to score.
    y : array-like
        Per-sample labels; only the values "b" and "r" are plotted, each in
        the colour of the same name.
    subplot : tuple or None
        Arguments for ``plt.subplot``; a new figure is created when None.
    show : bool
        Whether to call ``plt.show()`` at the end.
    """
    _select_axes(subplot)

    # Convert to arrays so ``y == label`` is an element-wise mask even when
    # ``y`` is given as a plain list.
    y = np.asarray(y)
    d = np.asarray(_decision_scores(clf, X))

    # ``density=True`` is the supported spelling of the former ``normed``.
    for label in ("b", "r"):
        plt.hist(d[y == label], bins=50, density=True, color=label, alpha=0.5)

    if show:
        plt.show()


def plot_clf(clf, X, y):
    """Show the score surface and the score histograms side by side."""
    plt.figure(figsize=(16, 8))
    plot_surface(clf, X, y, subplot=(1, 2, 1), show=False)
    plot_histogram(clf, X, y, subplot=(1, 2, 2), show=True)

# --------------------------------------------------------------------------------