├── README.md ├── __init__.py ├── ch10-linear-regression.py ├── ch10-liner_reg-decisionTrees.py ├── ch10-liner_reg-ploynomial.py ├── ch10-scikit-linear-regression.py ├── ch10-scikit-linear_reg-RANSAC.py ├── ch10-scikit-linear_reg-model_eval.py ├── ch11-clustering-DBSCAN.py ├── ch11-clustering-agglomerative.py ├── ch11-clustering-kmeans-analysis.py ├── ch11-clustering-kmeans.py ├── ch12-NeuralNet-MLP.py ├── ch12-import-MNIST.py ├── ch13-keras.py ├── ch13-theano.py ├── ch2-Adaline-BatchGD.py ├── ch2-Adaline-stochasticGD.py ├── ch2-perceptron.py ├── ch3-SVM-Kernel-on-Iris.py ├── ch3-SVM-Kernel.py ├── ch3-SVM.py ├── ch3-decisionTree-RandomForests.py ├── ch3-decisionTrees-InformationGain.py ├── ch3-decisionTrees.py ├── ch3-k-nearest-neighbors.py ├── ch3-logisticRegression.py ├── ch3-logisticregression-cost.py ├── ch3-scikit-learn-perceptron.py ├── ch3-sigmoid.py ├── ch4-categoricalData.py ├── ch4-featureSelection-randomForest.py ├── ch4-imputation.py ├── ch4-missingData.py ├── ch4-partitioningData.py ├── ch4-seq-feature-selection.py ├── ch5-LDA-scikit.py ├── ch5-PCA-Kernel-ex1.py ├── ch5-PCA-Kernel-ex2.py ├── ch5-PCA-Kernel-newRBF.py ├── ch5-PCA-Kernel-scikit.py ├── ch5-PCA-scikit.py ├── ch5-PCA1.py ├── ch6-F1-score.py ├── ch6-Kfold-CrossValidation.py ├── ch6-ModelSelect-ParamTune-Nested-Kfold-CrossValidation.py ├── ch6-ROC-curve.py ├── ch6-confusion-matrix.py ├── ch6-hyperparameterTuning-gridSearch.py ├── ch6-learningCurve.py ├── ch6-pipeline.py ├── ch6-scikit-Kfold-CrossValidation.py ├── ch6-validationCurve.py ├── ch7-AdaBoost.py ├── ch7-BaggingClassifiers.py ├── ch7-majorityVote-Classifier.py ├── ch8-Online-Sentiment-Analysis.py ├── ch8-Sentiment-Analysis.py ├── ch8-bagOfWords.py ├── ch9-pickle-model.py ├── movieclassifier ├── __pycache__ │ ├── update.cpython-35.pyc │ └── vectorizer.cpython-35.pyc ├── app.py ├── ch9-ex.py ├── pkl_objects │ ├── classifier.pkl │ └── stopwords.pkl ├── reviews.sqlite ├── static │ └── style.css ├── templates │ ├── _formhelpers.html │ ├── results.html │ ├── reviewform.html │ └── thanks.html ├── update.py └── vectorizer.py └── neuralnet.py /README.md: -------------------------------------------------------------------------------- 1 | # python-ML-book-Raschka 2 | 3 | Code snippets from 2015 book "Python Machine Learning" by Sebastian Raschka 4 | 5 | https://www.amazon.com/Python-Machine-Learning-Sebastian-Raschka/dp/1783555130/ref=sr_1_1?s=books&ie=UTF8&qid=1476259049&sr=1-1&keywords=python+MAchine+learning 6 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/__init__.py -------------------------------------------------------------------------------- /ch10-linear-regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | class LinearRegressionGD(object): 6 | def __init__(self, eta=0.001, n_iter=20): 7 | self.eta = eta 8 | self.n_iter = n_iter 9 | 10 | def net_input(self, X): 11 | return np.dot(X, self.w_[1:]) + self.w_[0] 12 | 13 | def fit(self, X, y): 14 | self.w_ = np.zeros(1 + X.shape[1]) 15 | self.cost_ = [] 16 | 17 | for i in range(self.n_iter): 18 | output = self.net_input(X) 19 | errors = (y - output) 20 | self.w_[1:] += self.eta * X.T.dot(errors) 21 | self.w_[0] += self.eta * errors.sum() 22 | cost = 
(errors**2).sum() / 2.0 23 | self.cost_.append(cost) 24 | return self 25 | 26 | def predict(self, X): 27 | return self.net_input(X) 28 | 29 | def lin_regplot(X, y, model): 30 | plt.scatter(X, y, c='blue') 31 | plt.plot(X, model.predict(X), color='red') 32 | return None 33 | 34 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 35 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 36 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 37 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 38 | 39 | 40 | import seaborn as sns 41 | sns.set(style='whitegrid', context='notebook') 42 | cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV'] 43 | sns.pairplot(df[cols], size=2.5) 44 | plt.show() 45 | 46 | X = df[['RM']].values 47 | y = df['MEDV'].values 48 | from sklearn.preprocessing import StandardScaler 49 | sc_x = StandardScaler() 50 | sc_y = StandardScaler() 51 | X_std = sc_x.fit_transform(X) 52 | y_std = sc_y.fit_transform(y) 53 | lr = LinearRegressionGD() 54 | lr.fit(X_std, y_std) 55 | 56 | #plt.plot(range(1, lr.n_iter+1), lr.cost_) 57 | #plt.ylabel('SSE') 58 | #plt.xlabel('Epoch') 59 | #plt.show() 60 | 61 | lin_regplot(X_std, y_std, lr) 62 | plt.xlabel('Average number of rooms [RM] (standardized)') 63 | plt.ylabel('Price in $1000\'s [MEDV] (standardized)') 64 | plt.show() 65 | 66 | num_rooms_std = sc_x.transform([5.0]) 67 | price_std = lr.predict(num_rooms_std) 68 | print("Price in $1000's: %.3f" % sc_y.inverse_transform(price_std)) 69 | 70 | 71 | print('Slope: %.3f' % lr.w_[1]) 72 | print('Intercept: %.3f' % lr.w_[0]) 73 | 74 | -------------------------------------------------------------------------------- /ch10-liner_reg-decisionTrees.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.cross_validation import train_test_split 4 | 5 | def lin_regplot(X, y, model): 6 | plt.scatter(X, y, c='blue') 7 | plt.plot(X, model.predict(X), color='red') 8 | return None 9 | 10 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 11 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 12 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 13 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 14 | 15 | # Decision Tree Regression 16 | from sklearn.tree import DecisionTreeRegressor 17 | X = df[['LSTAT']].values 18 | y = df['MEDV'].values 19 | tree = DecisionTreeRegressor(max_depth=3) 20 | tree.fit(X, y) 21 | sort_idx = X.flatten().argsort() 22 | lin_regplot(X[sort_idx], y[sort_idx], tree) 23 | plt.xlabel('% lower status of the population [LSTAT]') 24 | plt.ylabel('Price in $1000\'s [MEDV]') 25 | plt.show() 26 | 27 | 28 | # Random Forest Regression 29 | X = df.iloc[:, :-1].values 30 | y = df['MEDV'].values 31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1) 32 | 33 | from sklearn.ensemble import RandomForestRegressor 34 | from sklearn.metrics import mean_squared_error 35 | from sklearn.metrics import r2_score 36 | forest = RandomForestRegressor(n_estimators=1000, criterion='mse', random_state=1, n_jobs=-1) 37 | forest.fit(X_train, y_train) 38 | y_train_pred = forest.predict(X_train) 39 | y_test_pred = forest.predict(X_test) 40 | print('MSE train: %.3f, test: %.3f' % ( 41 | mean_squared_error(y_train, y_train_pred), 42 | mean_squared_error(y_test, y_test_pred))) 43 | 44 | print('R^2 train: %.3f, test: %.3f' % ( 45 | r2_score(y_train, y_train_pred), 46 | r2_score(y_test, 
y_test_pred))) 47 | 48 | plt.scatter(y_train_pred, 49 | y_train_pred - y_train, 50 | c='black', 51 | marker='o', 52 | s=35, 53 | alpha=0.5, 54 | label='Training data') 55 | 56 | plt.scatter(y_test_pred, 57 | y_test_pred - y_test, 58 | c='lightgreen', 59 | marker='s', 60 | s=35, 61 | alpha=0.7, 62 | label='Test data') 63 | 64 | plt.xlabel('Predicted values') 65 | plt.ylabel('Residuals') 66 | plt.legend(loc='upper left') 67 | plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red') 68 | plt.xlim([-10, 50]) 69 | plt.show() 70 | 71 | -------------------------------------------------------------------------------- /ch10-liner_reg-ploynomial.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | import numpy as np 5 | from sklearn.preprocessing import PolynomialFeatures 6 | from sklearn.metrics import r2_score 7 | 8 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 9 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 10 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 11 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 12 | 13 | X = df[['LSTAT']].values 14 | y = df['MEDV'].values 15 | regr = LinearRegression() 16 | 17 | # create polynomial features 18 | quadratic = PolynomialFeatures(degree=2) 19 | cubic = PolynomialFeatures(degree=3) 20 | X_quad = quadratic.fit_transform(X) 21 | X_cubic = cubic.fit_transform(X) 22 | 23 | # linear fit 24 | X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis] 25 | regr = regr.fit(X, y) 26 | y_lin_fit = regr.predict(X_fit) 27 | linear_r2 = r2_score(y, regr.predict(X)) 28 | 29 | # quadratic fit 30 | regr = regr.fit(X_quad, y) 31 | y_quad_fit = regr.predict(quadratic.fit_transform(X_fit)) 32 | quadratic_r2 = r2_score(y, regr.predict(X_quad)) 33 | 34 | # cubic fit 35 | regr = regr.fit(X_cubic, y) 36 | y_cubic_fit = regr.predict(cubic.fit_transform(X_fit)) 37 | cubic_r2 = r2_score(y, regr.predict(X_cubic)) 38 | 39 | # plot results 40 | plt.scatter(X, y, 41 | label='training points', 42 | color='lightgray') 43 | 44 | plt.plot(X_fit, y_lin_fit, 45 | label='linear (d=1), $R^2=%.2f$' % linear_r2, 46 | color='blue', 47 | lw=2, 48 | linestyle=':') 49 | 50 | 51 | plt.plot(X_fit, y_quad_fit, 52 | label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, 53 | color='red', 54 | lw=2, 55 | linestyle='-') 56 | 57 | plt.plot(X_fit, y_cubic_fit, 58 | label='cubic (d=3), $R^2=%.2f$'% cubic_r2, 59 | color='green', 60 | lw=2, 61 | linestyle='--') 62 | 63 | plt.xlabel('% lower status of the population [LSTAT]') 64 | plt.ylabel('Price in $1000\'s [MEDV]') 65 | plt.legend(loc='upper right') 66 | plt.show() 67 | 68 | 69 | 70 | # transform features 71 | X_log = np.log(X) 72 | y_sqrt = np.sqrt(y) 73 | 74 | # fit features 75 | X_fit = np.arange(X_log.min()-1, X_log.max()+1, 1)[:, np.newaxis] 76 | regr = regr.fit(X_log, y_sqrt) 77 | y_lin_fit = regr.predict(X_fit) 78 | linear_r2 = r2_score(y_sqrt, regr.predict(X_log)) 79 | 80 | # plot results 81 | plt.scatter(X_log, y_sqrt, 82 | label='training points', 83 | color='lightgray') 84 | 85 | plt.plot(X_fit, y_lin_fit, 86 | label='linear (d=1), $R^2=%.2f$' % linear_r2, 87 | color='blue', 88 | lw=2) 89 | 90 | plt.xlabel('log(% lower status of the population [LSTAT])') 91 | plt.ylabel('$\sqrt{Price \; in \; \$1000\'s [MEDV]}$') 92 | plt.legend(loc='lower left') 93 | plt.show() 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 
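Note on the polynomial fits above: they work by expanding the single LSTAT column into higher-order terms with PolynomialFeatures and then fitting an ordinary LinearRegression on the expanded matrix. A minimal sketch of that expansion on a tiny made-up array (illustrative only; it assumes scikit-learn's standard PolynomialFeatures API and is not part of the original repository):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X_toy = np.array([[2.0], [3.0]])        # one feature, two samples (made-up values)
quad = PolynomialFeatures(degree=2)
X_toy_quad = quad.fit_transform(X_toy)  # columns: bias term, x, x^2
print(X_toy_quad)                       # -> [[1. 2. 4.]
                                        #     [1. 3. 9.]]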
-------------------------------------------------------------------------------- /ch10-scikit-linear-regression.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | def lin_regplot(X, y, model): 6 | plt.scatter(X, y, c='blue') 7 | plt.plot(X, model.predict(X), color='red') 8 | return None 9 | 10 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 11 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 12 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 13 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 14 | 15 | 16 | 17 | X = df[['RM']].values 18 | y = df['MEDV'].values 19 | from sklearn.preprocessing import StandardScaler 20 | sc_x = StandardScaler() 21 | sc_y = StandardScaler() 22 | X_std = sc_x.fit_transform(X) 23 | y_std = sc_y.fit_transform(y) 24 | 25 | 26 | from sklearn.linear_model import LinearRegression 27 | slr = LinearRegression() 28 | slr.fit(X, y) 29 | print('Slope: %.3f' % slr.coef_[0]) 30 | print('Intercept: %.3f' % slr.intercept_) 31 | 32 | 33 | 34 | lin_regplot(X, y, slr) 35 | plt.xlabel('Average number of rooms [RM]') 36 | plt.ylabel('Price in $1000\'s [MEDV]') 37 | plt.show() 38 | -------------------------------------------------------------------------------- /ch10-scikit-linear_reg-RANSAC.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | def lin_regplot(X, y, model): 5 | plt.scatter(X, y, c='blue') 6 | plt.plot(X, model.predict(X), color='red') 7 | return None 8 | 9 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 10 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 11 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 12 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 13 | 14 | X = df[['RM']].values 15 | y = df['MEDV'].values 16 | from sklearn.preprocessing import StandardScaler 17 | sc_x = StandardScaler() 18 | sc_y = StandardScaler() 19 | X_std = sc_x.fit_transform(X) 20 | y_std = sc_y.fit_transform(y) 21 | 22 | from sklearn.linear_model import LinearRegression 23 | from sklearn.linear_model import RANSACRegressor 24 | import numpy as np 25 | 26 | ransac = RANSACRegressor(LinearRegression(), 27 | max_trials=100, 28 | min_samples=50, 29 | residual_metric=lambda x: np.sum(np.abs(x), axis=1), 30 | residual_threshold=5.0, 31 | random_state=0) 32 | 33 | ransac.fit(X, y) 34 | 35 | inlier_mask = ransac.inlier_mask_ 36 | outlier_mask = np.logical_not(inlier_mask) 37 | line_X = np.arange(3, 10, 1) 38 | line_y_ransac = ransac.predict(line_X[:, np.newaxis]) 39 | 40 | plt.scatter(X[inlier_mask], y[inlier_mask], c='blue', marker='o', label='Inliers') 41 | plt.scatter(X[outlier_mask], y[outlier_mask], c='lightgreen', marker='s', label='Outliers') 42 | plt.plot(line_X, line_y_ransac, color='red') 43 | plt.xlabel('Average number of rooms [RM]') 44 | plt.ylabel('Price in $1000\'s [MEDV]') 45 | plt.legend(loc='upper left') 46 | plt.show() 47 | 48 | print('Slope: %.3f' % ransac.estimator_.coef_[0]) 49 | print('Intercept: %.3f' % ransac.estimator_.intercept_) 50 | 51 | 52 | 53 | 54 | 55 | from sklearn.cross_validation import train_test_split 56 | X = df.iloc[:, :-1].values 57 | y = df['MEDV'].values 58 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 59 | slr = LinearRegression() 60 | slr.fit(X_train, y_train) 61 | y_train_pred = 
slr.predict(X_train) 62 | y_test_pred = slr.predict(X_test) 63 | 64 | plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data') 65 | plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data') 66 | plt.xlabel('Predicted values') 67 | plt.ylabel('Residuals') 68 | plt.legend(loc='upper left') 69 | plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red') 70 | plt.xlim([-10, 50]) 71 | plt.show() 72 | 73 | from sklearn.metrics import mean_squared_error 74 | print('MSE train: %.3f, test: %.3f' % ( 75 | mean_squared_error(y_train, y_train_pred), 76 | mean_squared_error(y_test, y_test_pred))) 77 | 78 | from sklearn.metrics import r2_score 79 | print('R^2 train: %.3f, test: %.3f' % 80 | (r2_score(y_train, y_train_pred), 81 | r2_score(y_test, y_test_pred))) -------------------------------------------------------------------------------- /ch10-scikit-linear_reg-model_eval.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | from sklearn.linear_model import LinearRegression 4 | import numpy as np 5 | from sklearn.cross_validation import train_test_split 6 | 7 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data', header=None, sep='\s+') 8 | df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 9 | 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 10 | 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'] 11 | 12 | 13 | X = df.iloc[:, :-1].values 14 | y = df['MEDV'].values 15 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 16 | slr = LinearRegression() 17 | slr.fit(X_train, y_train) 18 | y_train_pred = slr.predict(X_train) 19 | y_test_pred = slr.predict(X_test) 20 | 21 | plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data') 22 | plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data') 23 | plt.xlabel('Predicted values') 24 | plt.ylabel('Residuals') 25 | plt.legend(loc='upper left') 26 | plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red') 27 | plt.xlim([-10, 50]) 28 | plt.show() 29 | 30 | from sklearn.metrics import mean_squared_error 31 | print('MSE train: %.3f, test: %.3f' % ( 32 | mean_squared_error(y_train, y_train_pred), 33 | mean_squared_error(y_test, y_test_pred))) 34 | 35 | from sklearn.metrics import r2_score 36 | print('R^2 train: %.3f, test: %.3f' % 37 | (r2_score(y_train, y_train_pred), 38 | r2_score(y_test, y_test_pred))) 39 | 40 | 41 | 42 | # regularized regression models 43 | from sklearn.linear_model import Ridge 44 | ridge = Ridge(alpha=1.0) 45 | 46 | from sklearn.linear_model import Lasso 47 | lasso = Lasso(alpha=1.0) 48 | 49 | from sklearn.linear_model import ElasticNet 50 | lasso = ElasticNet(alpha=1.0, l1_ratio=0.5) 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /ch11-clustering-DBSCAN.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import make_moons 2 | import matplotlib.pyplot as plt 3 | 4 | # Generate data 5 | X, y = make_moons(n_samples=200, noise=0.05, random_state=0) 6 | plt.scatter(X[:,0], X[:,1]) 7 | plt.show() 8 | 9 | # Use Kmeans clustering 10 | f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,3)) 11 | from sklearn.cluster import KMeans 12 | km = KMeans(n_clusters=2, random_state=0) 13 | y_km = km.fit_predict(X) 14 | ax1.scatter(X[y_km==0,0], 
X[y_km==0,1], c='lightblue', marker='o', s=40, label='cluster 1') 15 | ax1.scatter(X[y_km==1,0], X[y_km==1,1], c='red', marker='s', s=40, label='cluster 2') 16 | ax1.set_title('K-means clustering') 17 | 18 | 19 | # Use Agglomerative Clustering 20 | from sklearn.cluster import AgglomerativeClustering 21 | ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete') 22 | y_ac = ac.fit_predict(X) 23 | ax2.scatter(X[y_ac==0,0], X[y_ac==0,1], c='lightblue', marker='o', s=40, label='cluster 1') 24 | ax2.scatter(X[y_ac==1,0], X[y_ac==1,1], c='red', marker='s', s=40, label='cluster 2') 25 | ax2.set_title('Agglomerative clustering') 26 | plt.legend() 27 | plt.show() 28 | 29 | 30 | # Use DBSCAN Clustering (Better at classifying non circular clusters) 31 | from sklearn.cluster import DBSCAN 32 | db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean') 33 | y_db = db.fit_predict(X) 34 | plt.scatter(X[y_db==0,0], X[y_db==0,1], c='lightblue', marker='o', s=40, label='cluster 1') 35 | plt.scatter(X[y_db==1,0], X[y_db==1,1], c='red', marker='s', s=40, label='cluster 2') 36 | plt.legend() 37 | plt.show() 38 | -------------------------------------------------------------------------------- /ch11-clustering-agglomerative.py: -------------------------------------------------------------------------------- 1 | # Generate random data 2 | import pandas as pd 3 | import numpy as np 4 | np.random.seed(123) 5 | variables = ['X', 'Y', 'Z'] 6 | labels = ['ID_0','ID_1','ID_2','ID_3','ID_4'] 7 | X = np.random.random_sample([5,3])*10 8 | df = pd.DataFrame(X, columns=variables, index=labels) 9 | 10 | #from scipy.spatial.distance import pdist, squareform 11 | #row_dist = pd.DataFrame(squareform(pdist(df, metric='euclidean')), columns=labels, index=labels) 12 | 13 | 14 | # apply the complete linkage agglomeration 15 | from scipy.cluster.hierarchy import linkage 16 | row_clusters = linkage(df.values, method='complete', metric='euclidean') 17 | pd.DataFrame(row_clusters, 18 | columns=['row label 1', 19 | 'row label 2', 20 | 'distance', 21 | 'no. 
of items in clust.'], 22 | index=['cluster %d' %(i+1) for i in range(row_clusters.shape[0])]) 23 | 24 | from scipy.cluster.hierarchy import dendrogram 25 | import matplotlib.pyplot as plt 26 | # make dendrogram black (part 1/2) 27 | # from scipy.cluster.hierarchy import set_link_color_palette 28 | # set_link_color_palette(['black']) 29 | row_dendr = dendrogram(row_clusters, 30 | labels=labels, 31 | # make dendrogram black (part 2/2) 32 | # color_threshold=np.inf 33 | ) 34 | plt.tight_layout() 35 | plt.ylabel('Euclidean distance') 36 | plt.show() 37 | 38 | #Attach Dendograms to a heat-map 39 | fig = plt.figure(figsize=(8,8), facecolor='white') 40 | axd = fig.add_axes([0.09,0.1,0.2,0.6]) 41 | row_dendr = dendrogram(row_clusters, orientation='left') 42 | df_rowclust = df.ix[row_dendr['leaves'][::-1]] 43 | axm = fig.add_axes([0.23,0.1,0.6,0.6]) 44 | cax = axm.matshow(df_rowclust, interpolation='nearest', cmap='hot_r') 45 | 46 | axd.set_xticks([]) 47 | axd.set_yticks([]) 48 | 49 | for i in axd.spines.values(): 50 | i.set_visible(False) 51 | 52 | fig.colorbar(cax) 53 | axm.set_xticklabels([''] + list(df_rowclust.columns)) 54 | axm.set_yticklabels([''] + list(df_rowclust.index)) 55 | plt.show() 56 | 57 | 58 | # Use scikit-learn, you can define number of clusters (for pruning) 59 | from sklearn.cluster import AgglomerativeClustering 60 | ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean', linkage='complete') 61 | labels = ac.fit_predict(X) 62 | print('Cluster labels: %s' % labels) 63 | -------------------------------------------------------------------------------- /ch11-clustering-kmeans-analysis.py: -------------------------------------------------------------------------------- 1 | # Generate random data 2 | from sklearn.datasets import make_blobs 3 | X, y = make_blobs(n_samples=150, n_features=2, centers=3, cluster_std=0.5, shuffle=True, random_state=0) 4 | 5 | # Use scikit Kmeans for clustering 6 | from sklearn.cluster import KMeans 7 | import matplotlib.pyplot as plt 8 | km = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=0) 9 | y_km = km.fit_predict(X) 10 | 11 | import numpy as np 12 | from matplotlib import cm 13 | from sklearn.metrics import silhouette_samples 14 | cluster_labels = np.unique(y_km) 15 | n_clusters = cluster_labels.shape[0] 16 | silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') 17 | y_ax_lower, y_ax_upper = 0, 0 18 | yticks = [] 19 | for i, c in enumerate(cluster_labels): 20 | c_silhouette_vals = silhouette_vals[y_km == c] 21 | c_silhouette_vals.sort() 22 | y_ax_upper += len(c_silhouette_vals) 23 | color = cm.jet(i / n_clusters) 24 | plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color) 25 | yticks.append((y_ax_lower + y_ax_upper) / 2) 26 | y_ax_lower += len(c_silhouette_vals) 27 | 28 | silhouette_avg = np.mean(silhouette_vals) 29 | plt.axvline(silhouette_avg,color="red", linestyle="--") 30 | plt.yticks(yticks, cluster_labels + 1) 31 | plt.ylabel('Cluster') 32 | plt.xlabel('Silhouette coefficient') 33 | plt.show() 34 | 35 | 36 | 37 | 38 | 39 | #Bad clustering example 40 | km = KMeans(n_clusters=2,init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=0) 41 | y_km = km.fit_predict(X) 42 | plt.scatter(X[y_km==0,0], X[y_km==0,1],s=50, c='lightgreen',marker='s',label='cluster 1') 43 | plt.scatter(X[y_km==1,0], X[y_km==1,1], s=50, c='orange', marker='o', label='cluster 2') 44 | plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=250, 
marker='*', c='red', label='centroids') 45 | plt.legend() 46 | plt.grid() 47 | plt.show() 48 | cluster_labels = np.unique(y_km) 49 | n_clusters = cluster_labels.shape[0] 50 | silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') 51 | y_ax_lower, y_ax_upper = 0, 0 52 | yticks = [] 53 | for i, c in enumerate(cluster_labels): 54 | c_silhouette_vals = silhouette_vals[y_km == c] 55 | c_silhouette_vals.sort() 56 | y_ax_upper += len(c_silhouette_vals) 57 | color = cm.jet(i / n_clusters) 58 | plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, edgecolor='none', color=color) 59 | yticks.append((y_ax_lower + y_ax_upper) / 2) 60 | y_ax_lower += len(c_silhouette_vals) 61 | 62 | silhouette_avg = np.mean(silhouette_vals) 63 | plt.axvline(silhouette_avg, color="red", linestyle="--") 64 | plt.yticks(yticks, cluster_labels + 1) 65 | plt.ylabel('Cluster') 66 | plt.xlabel('Silhouette coefficient') 67 | plt.show() 68 | 69 | -------------------------------------------------------------------------------- /ch11-clustering-kmeans.py: -------------------------------------------------------------------------------- 1 | # Generate random data 2 | from sklearn.datasets import make_blobs 3 | X, y = make_blobs(n_samples=150, 4 | n_features=2, 5 | centers=3, 6 | cluster_std=0.5, 7 | shuffle=True, 8 | random_state=0) 9 | 10 | # plot data 11 | import matplotlib.pyplot as plt 12 | plt.scatter(X[:,0],X[:,1], c='white', marker='o', s=50) 13 | plt.grid() 14 | plt.show() 15 | 16 | 17 | 18 | # Use scikit Kmeans for clustering 19 | from sklearn.cluster import KMeans 20 | km = KMeans(n_clusters=3, 21 | init='random', # use init='kmeans++' to initialize centroids using kmeans++ 22 | n_init=10, 23 | max_iter=300, 24 | tol=1e-04, 25 | random_state=0) 26 | 27 | # cluster 28 | y_km = km.fit_predict(X) 29 | # print SSE 30 | print('Distortion: %.2f' % km.inertia_) 31 | 32 | # Plot clustering 33 | plt.scatter(X[y_km==0,0],X[y_km==0,1],s=50,c='lightgreen',marker='s',label='cluster 1') 34 | plt.scatter(X[y_km==1,0],X[y_km==1,1],s=50,c='orange',marker='o',label='cluster 2') 35 | plt.scatter(X[y_km==2,0],X[y_km==2,1],s=50,c='lightblue',marker='v',label='cluster 3') 36 | plt.scatter(km.cluster_centers_[:,0],km.cluster_centers_[:,1],s=250,marker='*',c='red',label='centroids') 37 | plt.legend() 38 | plt.grid() 39 | plt.show() 40 | 41 | # Use elbow method to decide on value of k (no of clusters) 42 | distortions = [] 43 | for i in range(1, 11): 44 | km = KMeans(n_clusters=i, 45 | init='k-means++', 46 | n_init=10, 47 | max_iter=300, 48 | random_state=0) 49 | km.fit(X) 50 | distortions.append(km.inertia_) 51 | # plot 52 | plt.plot(range(1,11), distortions, marker='o') 53 | plt.xlabel('Number of clusters') 54 | plt.ylabel('Distortion') 55 | plt.show() 56 | 57 | -------------------------------------------------------------------------------- /ch12-NeuralNet-MLP.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | import numpy as np 4 | 5 | def load_mnist(path, kind='train'): 6 | """Load MNIST data from `path`""" 7 | labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind) 8 | images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind) 9 | 10 | with open(labels_path, 'rb') as lbpath: 11 | magic, n = struct.unpack('>II', lbpath.read(8)) 12 | labels = np.fromfile(lbpath, dtype=np.uint8) 13 | 14 | with open(images_path, 'rb') as imgpath: 15 | magic, num, rows, cols = struct.unpack(">IIII", imgpath.read(16)) 16 | images = np.fromfile(imgpath, 
dtype=np.uint8).reshape(len(labels), 784) 17 | 18 | 19 | return images, labels 20 | 21 | 22 | X_train, y_train = load_mnist('mnist', kind='train') 23 | X_test, y_test = load_mnist('mnist', kind='t10k') 24 | 25 | 26 | from neuralnet import NeuralNetMLP 27 | nn = NeuralNetMLP(n_output=10, 28 | n_features=X_train.shape[1], 29 | n_hidden=50, 30 | l2=0.1, 31 | l1=0.0, 32 | epochs=1000, 33 | eta=0.001, 34 | alpha=0.001, 35 | decrease_const=0.00001, 36 | shuffle=True, 37 | minibatches=50, 38 | random_state=1) 39 | 40 | 41 | 42 | 43 | 44 | nn.fit(X_train, y_train, print_progress=True) 45 | 46 | import matplotlib.pyplot as plt 47 | plt.plot(range(len(nn.cost_)), nn.cost_) 48 | plt.ylim([0, 2000]) 49 | plt.ylabel('Cost') 50 | plt.xlabel('Epochs * 50') 51 | plt.tight_layout() 52 | plt.show() 53 | 54 | batches = np.array_split(range(len(nn.cost_)), 1000) 55 | cost_ary = np.array(nn.cost_) 56 | cost_avgs = [np.mean(cost_ary[i]) for i in batches] 57 | 58 | plt.plot(range(len(cost_avgs)), cost_avgs, color='red') 59 | plt.ylim([0, 2000]) 60 | plt.ylabel('Cost') 61 | plt.xlabel('Epochs') 62 | plt.tight_layout() 63 | plt.show() 64 | 65 | y_train_pred = nn.predict(X_train) 66 | acc = np.sum(y_train == y_train_pred, axis=0) / X_train.shape[0] 67 | print('Training accuracy: %.2f%%' % (acc * 100)) 68 | 69 | 70 | y_test_pred = nn.predict(X_test) 71 | acc = np.sum(y_test == y_test_pred, axis=0) / X_test.shape[0] 72 | print('Test accuracy: %.2f%%' % (acc * 100)) 73 | 74 | 75 | 76 | miscl_img = X_test[y_test != y_test_pred][:25] 77 | correct_lab = y_test[y_test != y_test_pred][:25] 78 | miscl_lab= y_test_pred[y_test != y_test_pred][:25] 79 | 80 | fig, ax = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True,) 81 | ax = ax.flatten() 82 | for i in range(25): 83 | img = miscl_img[i].reshape(28, 28) 84 | ax[i].imshow(img, cmap='Greys', interpolation='nearest') 85 | ax[i].set_title('%d) t: %d p: %d' % (i+1, correct_lab[i], miscl_lab[i])) 86 | 87 | ax[0].set_xticks([]) 88 | ax[0].set_yticks([]) 89 | plt.tight_layout() 90 | plt.show() 91 | 92 | 93 | # Perform gradient checking 94 | from neuralnet import MLPGradientCheck 95 | nn_check = MLPGradientCheck(n_output=10, 96 | n_features=X_train.shape[1], 97 | n_hidden=10, 98 | l2=0.0, 99 | l1=0.0, 100 | epochs=10, 101 | eta=0.001, 102 | alpha=0.0, 103 | decrease_const=0.0, 104 | minibatches=1, 105 | shuffle=False, 106 | random_state=1) 107 | 108 | 109 | # Perform check for handful samples 110 | nn_check.fit(X_train[:5], y_train[:5], print_progress=False) 111 | 112 | -------------------------------------------------------------------------------- /ch12-import-MNIST.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | import numpy as np 4 | 5 | def load_mnist(path, kind='train'): 6 | """Load MNIST data from `path`""" 7 | labels_path = os.path.join(path, '%s-labels.idx1-ubyte' % kind) 8 | images_path = os.path.join(path, '%s-images.idx3-ubyte' % kind) 9 | 10 | with open(labels_path, 'rb') as lbpath: 11 | magic, n = struct.unpack('>II', lbpath.read(8)) 12 | labels = np.fromfile(lbpath, dtype=np.uint8) 13 | 14 | with open(images_path, 'rb') as imgpath: 15 | magic, num, rows, cols = struct.unpack(">IIII", imgpath.read(16)) 16 | images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784) 17 | 18 | 19 | return images, labels 20 | 21 | 22 | X_train, y_train = load_mnist('mnist', kind='train') 23 | print('Rows: %d, columns: %d' % (X_train.shape[0], X_train.shape[1])) 24 | 25 | X_test, y_test = 
load_mnist('mnist', kind='t10k') 26 | print('Rows: %d, columns: %d' % (X_test.shape[0], X_test.shape[1])) 27 | 28 | # visualize examples of MNIST data 29 | import matplotlib.pyplot as plt 30 | fig, ax = plt.subplots(nrows=2, ncols=5, sharex=True, sharey=True,) 31 | ax = ax.flatten() 32 | 33 | for i in range(10): 34 | img = X_train[y_train == i][0].reshape(28, 28) 35 | ax[i].imshow(img, cmap='Greys', interpolation='nearest') 36 | 37 | ax[0].set_xticks([]) 38 | ax[0].set_yticks([]) 39 | plt.tight_layout() 40 | plt.show() 41 | 42 | 43 | # plot multiple examples of the same digit to see how different those handwriting examples really are 44 | fig, ax = plt.subplots(nrows=5, ncols=5, sharex=True, sharey=True,) 45 | ax = ax.flatten() 46 | 47 | for i in range(25): 48 | img = X_train[y_train == 7][i].reshape(28, 28) 49 | ax[i].imshow(img, cmap='Greys', interpolation='nearest') 50 | 51 | ax[0].set_xticks([]) 52 | ax[0].set_yticks([]) 53 | plt.tight_layout() 54 | plt.show() 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /ch13-keras.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | import numpy as np 4 | 5 | def load_mnist(path, kind='train'): 6 | """Load MNIST data from `path`""" 7 | labels_path = os.path.join(path, 8 | '%s-labels.idx1-ubyte' 9 | % kind) 10 | images_path = os.path.join(path, 11 | '%s-images.idx3-ubyte' 12 | % kind) 13 | 14 | with open(labels_path, 'rb') as lbpath: 15 | magic, n = struct.unpack('>II', 16 | lbpath.read(8)) 17 | labels = np.fromfile(lbpath, 18 | dtype=np.uint8) 19 | 20 | with open(images_path, 'rb') as imgpath: 21 | magic, num, rows, cols = struct.unpack(">IIII", 22 | imgpath.read(16)) 23 | images = np.fromfile(imgpath, 24 | dtype=np.uint8).reshape(len(labels), 784) 25 | 26 | return images, labels 27 | 28 | X_train, y_train = load_mnist('mnist', kind='train') 29 | print('Rows: %d, columns: %d' % (X_train.shape[0], X_train.shape[1])) 30 | 31 | X_test, y_test = load_mnist('mnist', kind='t10k') 32 | print('Rows: %d, columns: %d' % (X_test.shape[0], X_test.shape[1])) 33 | 34 | import theano 35 | 36 | theano.config.floatX = 'float32' 37 | X_train = X_train.astype(theano.config.floatX) 38 | X_test = X_test.astype(theano.config.floatX) 39 | 40 | 41 | from keras.utils import np_utils 42 | 43 | print('First 3 labels: ', y_train[:3]) 44 | 45 | y_train_ohe = np_utils.to_categorical(y_train) 46 | print('\nFirst 3 labels (one-hot):\n', y_train_ohe[:3]) 47 | 48 | 49 | from keras.models import Sequential 50 | from keras.layers.core import Dense 51 | from keras.optimizers import SGD 52 | 53 | np.random.seed(1) 54 | 55 | model = Sequential() 56 | model.add(Dense(input_dim=X_train.shape[1], 57 | output_dim=50, 58 | init='uniform', 59 | activation='tanh')) 60 | 61 | model.add(Dense(input_dim=50, 62 | output_dim=50, 63 | init='uniform', 64 | activation='tanh')) 65 | 66 | model.add(Dense(input_dim=50, 67 | output_dim=y_train_ohe.shape[1], 68 | init='uniform', 69 | activation='softmax')) 70 | 71 | sgd = SGD(lr=0.001, decay=1e-7, momentum=.9) 72 | model.compile(loss='categorical_crossentropy', optimizer=sgd) 73 | 74 | model.fit(X_train, y_train_ohe, 75 | nb_epoch=50, 76 | batch_size=300, 77 | verbose=1, 78 | validation_split=0.1, 79 | show_accuracy=True) 80 | 81 | 82 | 83 | y_train_pred = model.predict_classes(X_train, verbose=0) 84 | print('First 3 predictions: ', y_train_pred[:3]) 85 | 86 | 87 | train_acc = np.sum(y_train == y_train_pred, axis=0) / X_train.shape[0] 
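# added note: predict_classes (older Keras Sequential API) returns integer class labels, so the accuracy is simply the count of matches with the integer targets divided by the number of samples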
88 | print('Training accuracy: %.2f%%' % (train_acc * 100)) 89 | 90 | 91 | y_test_pred = model.predict_classes(X_test, verbose=0) 92 | test_acc = np.sum(y_test == y_test_pred, axis=0) / X_test.shape[0] 93 | print('Test accuracy: %.2f%%' % (test_acc * 100)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /ch13-theano.py: -------------------------------------------------------------------------------- 1 | import theano 2 | from theano import tensor as T 3 | 4 | # initialize 5 | x1 = T.scalar() 6 | w1 = T.scalar() 7 | w0 = T.scalar() 8 | z1 = w1 * x1 + w0 9 | 10 | # compile 11 | net_input = theano.function(inputs=[w1, x1, w0], outputs=z1) 12 | 13 | # execute 14 | net_input(2.0, 1.0, 0.5) 15 | 16 | 17 | 18 | 19 | import numpy as np 20 | 21 | # initialize 22 | # if you are running Theano on 64 bit mode, 23 | # you need to use dmatrix instead of fmatrix 24 | x = T.fmatrix(name='x') 25 | x_sum = T.sum(x, axis=0) 26 | 27 | # compile 28 | calc_sum = theano.function(inputs=[x], outputs=x_sum) 29 | 30 | # execute (Python list) 31 | ary = [[1, 2, 3], [1, 2, 3]] 32 | print('Column sum:', calc_sum(ary)) 33 | 34 | # execute (NumPy array) 35 | ary = np.array([[1, 2, 3], [1, 2, 3]], dtype=theano.config.floatX) 36 | print('Column sum:', calc_sum(ary)) 37 | 38 | 39 | 40 | 41 | # initialize 42 | x = T.fmatrix(name='x') 43 | w = theano.shared(np.asarray([[0.0, 0.0, 0.0]], dtype=theano.config.floatX)) 44 | z = x.dot(w.T) 45 | update = [[w, w + 1.0]] 46 | 47 | # compile 48 | net_input = theano.function(inputs=[x], updates=update, outputs=z) 49 | 50 | # execute 51 | data = np.array([[1, 2, 3]], dtype=theano.config.floatX) 52 | for i in range(5): 53 | print('z%d:' % i, net_input(data)) 54 | 55 | 56 | 57 | # initialize 58 | data = np.array([[1, 2, 3]], 59 | dtype=theano.config.floatX) 60 | x = T.fmatrix(name='x') 61 | w = theano.shared(np.asarray([[0.0, 0.0, 0.0]], 62 | dtype=theano.config.floatX)) 63 | z = x.dot(w.T) 64 | update = [[w, w + 1.0]] 65 | 66 | # compile 67 | net_input = theano.function(inputs=[], 68 | updates=update, 69 | givens={x: data}, 70 | outputs=z) 71 | 72 | # execute 73 | for i in range(5): 74 | print('z:', net_input()) 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | # Wrapping things up: A linear regression example 84 | 85 | import numpy as np 86 | X_train = np.asarray([[0.0], [1.0], [2.0], [3.0], [4.0], 87 | [5.0], [6.0], [7.0], [8.0], [9.0]], 88 | dtype=theano.config.floatX) 89 | 90 | y_train = np.asarray([1.0, 1.3, 3.1, 2.0, 5.0, 91 | 6.3, 6.6, 7.4, 8.0, 9.0], 92 | dtype=theano.config.floatX) 93 | 94 | 95 | import theano 96 | from theano import tensor as T 97 | import numpy as np 98 | 99 | def train_linreg(X_train, y_train, eta, epochs): 100 | 101 | costs = [] 102 | # Initialize arrays 103 | eta0 = T.fscalar('eta0') 104 | y = T.fvector(name='y') 105 | X = T.fmatrix(name='X') 106 | w = theano.shared(np.zeros( 107 | shape=(X_train.shape[1] + 1), 108 | dtype=theano.config.floatX), 109 | name='w') 110 | 111 | # calculate cost 112 | net_input = T.dot(X, w[1:]) + w[0] 113 | errors = y - net_input 114 | cost = T.sum(T.pow(errors, 2)) 115 | 116 | # perform gradient update 117 | gradient = T.grad(cost, wrt=w) 118 | update = [(w, w - eta0 * gradient)] 119 | 120 | # compile model 121 | train = theano.function(inputs=[eta0], 122 | outputs=cost, 123 | updates=update, 124 | givens={X: X_train, 125 | y: y_train,}) 126 | 127 | for _ in range(epochs): 128 | costs.append(train(eta)) 129 | 130 | return costs, w 131 | 132 | 133 | 134 | 135 | 136 | import 
matplotlib.pyplot as plt 137 | 138 | costs, w = train_linreg(X_train, y_train, eta=0.001, epochs=10) 139 | 140 | plt.plot(range(1, len(costs)+1), costs) 141 | 142 | plt.tight_layout() 143 | plt.xlabel('Epoch') 144 | plt.ylabel('Cost') 145 | plt.tight_layout() 146 | # plt.savefig('./figures/cost_convergence.png', dpi=300) 147 | plt.show() 148 | 149 | def predict_linreg(X, w): 150 | Xt = T.matrix(name='X') 151 | net_input = T.dot(Xt, w[1:]) + w[0] 152 | predict = theano.function(inputs=[Xt], givens={w: w}, outputs=net_input) 153 | return predict(X) 154 | 155 | plt.scatter(X_train, y_train, marker='s', s=50) 156 | plt.plot(range(X_train.shape[0]), 157 | predict_linreg(X_train, w), 158 | color='gray', 159 | marker='o', 160 | markersize=4, 161 | linewidth=3) 162 | 163 | plt.xlabel('x') 164 | plt.ylabel('y') 165 | 166 | plt.tight_layout() 167 | # plt.savefig('./figures/linreg.png', dpi=300) 168 | plt.show() -------------------------------------------------------------------------------- /ch2-Adaline-BatchGD.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from matplotlib.colors import ListedColormap 5 | 6 | class AdalineGD(object): 7 | """ADAptive LInear NEuron classifier. 8 | 9 | Parameters 10 | ------------ 11 | eta : float 12 | Learning rate (between 0.0 and 1.0) 13 | n_iter : int 14 | Passes over the training dataset. 15 | 16 | Attributes 17 | ----------- 18 | w_ : 1d-array 19 | Weights after fitting. 20 | errors_ : list 21 | Number of misclassifications in every epoch. 22 | 23 | """ 24 | def __init__(self, eta=0.01, n_iter=50): 25 | self.eta = eta 26 | self.n_iter = n_iter 27 | 28 | def fit(self, X, y): 29 | """ Fit training data. 30 | 31 | Parameters 32 | ---------- 33 | X : {array-like}, shape = [n_samples, n_features] 34 | Training vectors, 35 | where n_samples is the number of samples and” 36 | 37 | where n_samples is the number of samples and 38 | n_features is the number of features. 39 | y : array-like, shape = [n_samples] 40 | Target values. 
41 | 42 | Returns 43 | ------- 44 | self : object 45 | 46 | """ 47 | self.w_ = np.zeros(1 + X.shape[1]) 48 | self.cost_ = [] 49 | 50 | for i in range(self.n_iter): 51 | output = self.net_input(X) 52 | errors = (y - output) 53 | self.w_[1:] += self.eta * X.T.dot(errors) 54 | self.w_[0] += self.eta * errors.sum() 55 | cost = (errors**2).sum() / 2.0 56 | self.cost_.append(cost) 57 | return self 58 | 59 | def net_input(self, X): 60 | """Calculate net input""" 61 | return np.dot(X, self.w_[1:]) + self.w_[0] 62 | 63 | def activation(self, X): 64 | """Compute linear activation""" 65 | return self.net_input(X) 66 | 67 | def predict(self, X): 68 | """Return class label after unit step""" 69 | return np.where(self.activation(X) >= 0.0, 1, -1) 70 | 71 | def plot_decision_regions(X, y, classifier, resolution=0.02): 72 | # setup marker generator and color map 73 | markers = ('s', 'x', 'o', '^', 'v') 74 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 75 | cmap = ListedColormap(colors[:len(np.unique(y))]) 76 | 77 | # plot the decision surface 78 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 79 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 80 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 81 | np.arange(x2_min, x2_max, resolution)) 82 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 83 | Z = Z.reshape(xx1.shape) 84 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 85 | plt.xlim(xx1.min(), xx1.max()) 86 | plt.ylim(xx2.min(), xx2.max()) 87 | 88 | # plot class samples 89 | for idx, cl in enumerate(np.unique(y)): 90 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 91 | alpha=0.8, c=cmap(idx), 92 | marker=markers[idx], label=cl) 93 | 94 | 95 | if __name__ == "__main__": 96 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) 97 | y = df.iloc[0:100, 4].values 98 | y = np.where(y == 'Iris-setosa', -1, 1) 99 | X = df.iloc[0:100, [0, 2]].values 100 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4)) 101 | ada1 = AdalineGD(n_iter=10, eta=0.01).fit(X, y) 102 | ax[0].plot(range(1, len(ada1.cost_) + 1), np.log10(ada1.cost_), marker='o') 103 | ax[0].set_xlabel('Epochs') 104 | ax[0].set_ylabel('log(Sum-squared-error)') 105 | ax[0].set_title('Adaline - Learning rate 0.01') 106 | ada2 = AdalineGD(n_iter=10, eta=0.0001).fit(X, y) 107 | ax[1].plot(range(1, len(ada2.cost_) + 1),ada2.cost_, marker='o') 108 | ax[1].set_xlabel('Epochs') 109 | ax[1].set_ylabel('Sum-squared-error') 110 | ax[1].set_title('Adaline - Learning rate 0.0001') 111 | plt.show() 112 | 113 | X_std = np.copy(X) 114 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std() 115 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std() 116 | 117 | ada = AdalineGD(n_iter=15, eta=0.01) 118 | ada.fit(X_std, y) 119 | plot_decision_regions(X_std, y, classifier=ada) 120 | plt.title('Adaline - Gradient Descent') 121 | plt.xlabel('sepal length [standardized]') 122 | plt.ylabel('petal length [standardized]') 123 | plt.legend(loc='upper left') 124 | plt.show() 125 | plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o') 126 | plt.xlabel('Epochs') 127 | plt.ylabel('Sum-squared-error') 128 | 129 | plt.show() 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /ch2-Adaline-stochasticGD.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from matplotlib.colors 
import ListedColormap 5 | from numpy.random import seed 6 | 7 | class AdalineSGD(object): 8 | """ADAptive LInear NEuron classifier. 9 | 10 | Parameters 11 | ------------ 12 | eta : float 13 | Learning rate (between 0.0 and 1.0) 14 | n_iter : int 15 | Passes over the training dataset. 16 | 17 | Attributes 18 | ----------- 19 | w_ : 1d-array 20 | Weights after fitting. 21 | errors_ : list 22 | Number of misclassifications in every epoch. 23 | shuffle : bool (default: True) 24 | Shuffles training data every epoch 25 | if True to prevent cycles. 26 | random_state : int (default: None) 27 | Set random state for shuffling 28 | and initializing the weights. 29 | 30 | """ 31 | def __init__(self, eta=0.01, n_iter=10, 32 | shuffle=True, random_state=None): 33 | self.eta = eta 34 | self.n_iter = n_iter 35 | self.w_initialized = False 36 | self.shuffle = shuffle 37 | if random_state: 38 | seed(random_state) 39 | 40 | def fit(self, X, y): 41 | """ Fit training data. 42 | 43 | Parameters 44 | ---------- 45 | X : {array-like}, shape = [n_samples, n_features] 46 | Training vectors, where n_samples 47 | is the number of samples and 48 | n_features is the number of features. 49 | y : array-like, shape = [n_samples] 50 | Target values. 51 | 52 | Returns 53 | ------- 54 | self : object 55 | 56 | """ 57 | self._initialize_weights(X.shape[1]) 58 | self.cost_ = [] 59 | for i in range(self.n_iter): 60 | if self.shuffle: 61 | X, y = self._shuffle(X, y) 62 | cost = [] 63 | for xi, target in zip(X, y): 64 | cost.append(self._update_weights(xi, target)) 65 | avg_cost = sum(cost)/len(y) 66 | self.cost_.append(avg_cost) 67 | return self 68 | 69 | def partial_fit(self, X, y): 70 | """Fit training data without reinitializing the weights""" 71 | if not self.w_initialized: 72 | self._initialize_weights(X.shape[1]) 73 | if y.ravel().shape[0] > 1: 74 | for xi, target in zip(X, y): 75 | self._update_weights(xi, target) 76 | else: 77 | self._update_weights(X, y) 78 | return self 79 | 80 | def _shuffle(self, X, y): 81 | """Shuffle training data""" 82 | r = np.random.permutation(len(y)) 83 | return X[r], y[r] 84 | 85 | def _initialize_weights(self, m): 86 | """Initialize weights to zeros""" 87 | self.w_ = np.zeros(1 + m) 88 | self.w_initialized = True 89 | 90 | def _update_weights(self, xi, target): 91 | """Apply Adaline learning rule to update the weights (one update per sample)""" 92 | output = self.net_input(xi) 93 | error = (target - output) 94 | self.w_[1:] += self.eta * xi.dot(error) 95 | self.w_[0] += self.eta * error 96 | cost = 0.5 * error**2 97 | return cost 98 | 99 | 100 | 101 | def net_input(self, X): 102 | """Calculate net input""" 103 | return np.dot(X, self.w_[1:]) + self.w_[0] 104 | 105 | def activation(self, X): 106 | """Compute linear activation""" 107 | return self.net_input(X) 108 | 109 | def predict(self, X): 110 | """Return class label after unit step""" 111 | return np.where(self.activation(X) >= 0.0, 1, -1) 112 | 113 | 114 | def plot_decision_regions(X, y, classifier, resolution=0.02): 115 | # setup marker generator and color map 116 | markers = ('s', 'x', 'o', '^', 'v') 117 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 118 | cmap = ListedColormap(colors[:len(np.unique(y))]) 119 | 120 | # plot the decision surface 121 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 122 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 123 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 124 | np.arange(x2_min, x2_max, resolution)) 125 | Z = 
classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 126 | Z = Z.reshape(xx1.shape) 127 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 128 | plt.xlim(xx1.min(), xx1.max()) 129 | plt.ylim(xx2.min(), xx2.max()) 130 | 131 | # plot class samples 132 | for idx, cl in enumerate(np.unique(y)): 133 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 134 | alpha=0.8, c=cmap(idx), 135 | marker=markers[idx], label=cl) 136 | 137 | 138 | if __name__ == "__main__": 139 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) 140 | y = df.iloc[0:100, 4].values 141 | y = np.where(y == 'Iris-setosa', -1, 1) 142 | X = df.iloc[0:100, [0, 2]].values 143 | X_std = np.copy(X) 144 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std() 145 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std() 146 | ada = AdalineSGD(n_iter=15, eta=0.01, random_state=1) 147 | ada.fit(X_std, y) 148 | plot_decision_regions(X_std, y, classifier=ada) 149 | plt.title('Adaline - Stochastic Gradient Descent') 150 | plt.xlabel('sepal length [standardized]') 151 | plt.ylabel('petal length [standardized]') 152 | plt.legend(loc='upper left') 153 | plt.show() 154 | plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o') 155 | plt.xlabel('Epochs') 156 | plt.ylabel('Average Cost') 157 | plt.show() 158 | 159 | -------------------------------------------------------------------------------- /ch2-perceptron.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from matplotlib.colors import ListedColormap 5 | 6 | class Perceptron(object): 7 | """Perceptron classifier. 8 | 9 | Parameters 10 | ------------ 11 | eta : float 12 | Learning rate (between 0.0 and 1.0) 13 | n_iter : int 14 | Passes over the training dataset. 15 | 16 | Attributes 17 | ----------- 18 | w_ : 1d-array 19 | Weights after fitting. 20 | errors_ : list 21 | Number of misclassifications in every epoch. 22 | 23 | """ 24 | def __init__(self, eta=0.01, n_iter=10): 25 | self.eta = eta 26 | self.n_iter = n_iter 27 | 28 | def fit(self, X, y): 29 | """Fit training data. 30 | 31 | Parameters 32 | ---------- 33 | X : {array-like}, shape = [n_samples, n_features] 34 | Training vectors, where n_samples is the number of samples and 35 | n_features is the number of features. 36 | y : array-like, shape = [n_samples] 37 | Target values. 
38 | 39 | Returns 40 | ------- 41 | self : object 42 | 43 | """ 44 | self.w_ = np.zeros(1 + X.shape[1]) 45 | self.errors_ = [] 46 | 47 | for _ in range(self.n_iter): 48 | errors = 0 49 | for xi, target in zip(X, y): 50 | update = self.eta * (target - self.predict(xi)) 51 | self.w_[1:] += update * xi 52 | self.w_[0] += update 53 | errors += int(update != 0.0) 54 | self.errors_.append(errors) 55 | return self 56 | 57 | def net_input(self, X): 58 | """Calculate net input""" 59 | return np.dot(X, self.w_[1:]) + self.w_[0] 60 | 61 | def predict(self, X): 62 | """Return class label after unit step""" 63 | return np.where(self.net_input(X) >= 0.0, 1, -1) 64 | 65 | 66 | def plot_decision_regions(X, y, classifier, resolution=0.02): 67 | # setup marker generator and color map 68 | markers = ('s', 'x', 'o', '^', 'v') 69 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 70 | cmap = ListedColormap(colors[:len(np.unique(y))]) 71 | 72 | # plot the decision surface 73 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 74 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 75 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 76 | np.arange(x2_min, x2_max, resolution)) 77 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 78 | Z = Z.reshape(xx1.shape) 79 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 80 | plt.xlim(xx1.min(), xx1.max()) 81 | plt.ylim(xx2.min(), xx2.max()) 82 | 83 | # plot class samples 84 | for idx, cl in enumerate(np.unique(y)): 85 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 86 | alpha=0.8, c=cmap(idx), 87 | marker=markers[idx], label=cl) 88 | 89 | 90 | if __name__ == "__main__": 91 | 92 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None) 93 | y = df.iloc[0:100, 4].values 94 | y = np.where(y == 'Iris-setosa', -1, 1) 95 | X = df.iloc[0:100, [0, 2]].values 96 | plt.scatter(X[:50, 0], X[:50, 1],color='red', marker='o', label='setosa') 97 | plt.scatter(X[50:100, 0], X[50:100, 1],color='blue', marker='x', label='versicolor') 98 | plt.xlabel('sepal length') 99 | plt.ylabel('petal length') 100 | plt.legend(loc='upper left') 101 | plt.show() 102 | 103 | ppn = Perceptron(eta=0.1, n_iter=10) 104 | ppn.fit(X, y) 105 | plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_,marker='o') 106 | plt.xlabel('Epochs') 107 | plt.ylabel('Number of misclassifications') 108 | plt.show() 109 | 110 | plot_decision_regions(X, y, classifier=ppn) 111 | plt.xlabel('sepal length [cm]') 112 | plt.ylabel('petal length [cm]') 113 | plt.legend(loc='upper left') 114 | plt.show() -------------------------------------------------------------------------------- /ch3-SVM-Kernel-on-Iris.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | def versiontuple(v): 6 | return tuple(map(int, (v.split(".")))) 7 | 8 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 9 | 10 | # setup marker generator and color map 11 | markers = ('s', 'x', 'o', '^', 'v') 12 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 13 | cmap = ListedColormap(colors[:len(np.unique(y))]) 14 | 15 | # plot the decision surface 16 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 17 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 18 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 19 | np.arange(x2_min, x2_max, resolution)) 20 | Z = 
classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 21 | Z = Z.reshape(xx1.shape) 22 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 23 | plt.xlim(xx1.min(), xx1.max()) 24 | plt.ylim(xx2.min(), xx2.max()) 25 | 26 | for idx, cl in enumerate(np.unique(y)): 27 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 28 | alpha=0.8, c=cmap(idx), 29 | marker=markers[idx], label=cl) 30 | 31 | # highlight test samples 32 | if test_idx: 33 | # plot all samples 34 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 35 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 36 | warnings.warn('Please update to NumPy 1.9.0 or newer') 37 | else: 38 | X_test, y_test = X[test_idx, :], y[test_idx] 39 | 40 | plt.scatter(X_test[:, 0], 41 | X_test[:, 1], 42 | c='', 43 | alpha=1.0, 44 | linewidths=1, 45 | marker='o', 46 | s=55, label='test set') 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | from sklearn import datasets 52 | import numpy as np 53 | 54 | # Loading Iris dataset 55 | iris = datasets.load_iris() 56 | X = iris.data[:, [2, 3]] 57 | y = iris.target 58 | print('Class labels:', np.unique(y)) 59 | 60 | # Splitting data into 70% training and 30% test data: 61 | from sklearn.cross_validation import train_test_split 62 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 63 | 64 | # Standardizing the features: 65 | from sklearn.preprocessing import StandardScaler 66 | sc = StandardScaler() 67 | sc.fit(X_train) 68 | X_train_std = sc.transform(X_train) 69 | X_test_std = sc.transform(X_test) 70 | 71 | X_combined_std = np.vstack((X_train_std, X_test_std)) 72 | y_combined = np.hstack((y_train, y_test)) 73 | 74 | from sklearn.svm import SVC 75 | 76 | # Use low Gamma 77 | svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0) 78 | svm.fit(X_train_std, y_train) 79 | plot_decision_regions(X_combined_std, y_combined, 80 | classifier=svm, test_idx=range(105,150)) 81 | plt.xlabel('petal length [standardized]') 82 | plt.ylabel('petal width [standardized]') 83 | plt.legend(loc='upper left') 84 | plt.tight_layout() 85 | # plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300) 86 | plt.show() 87 | 88 | # Use high Gamma 89 | svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0) 90 | svm.fit(X_train_std, y_train) 91 | plot_decision_regions(X_combined_std, y_combined, 92 | classifier=svm, test_idx=range(105, 150)) 93 | plt.xlabel('petal length [standardized]') 94 | plt.ylabel('petal width [standardized]') 95 | plt.legend(loc='upper left') 96 | plt.tight_layout() 97 | # plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300) 98 | plt.show() 99 | 100 | 101 | -------------------------------------------------------------------------------- /ch3-SVM-Kernel.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | import numpy as np 5 | 6 | 7 | def versiontuple(v): 8 | return tuple(map(int, (v.split(".")))) 9 | 10 | 11 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 12 | 13 | # setup marker generator and color map 14 | markers = ('s', 'x', 'o', '^', 'v') 15 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 16 | cmap = ListedColormap(colors[:len(np.unique(y))]) 17 | 18 | # plot the decision surface 19 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 20 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 21 | xx1, xx2 = np.meshgrid(np.arange(x1_min, 
x1_max, resolution), 22 | np.arange(x2_min, x2_max, resolution)) 23 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 24 | Z = Z.reshape(xx1.shape) 25 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 26 | plt.xlim(xx1.min(), xx1.max()) 27 | plt.ylim(xx2.min(), xx2.max()) 28 | 29 | for idx, cl in enumerate(np.unique(y)): 30 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 31 | alpha=0.8, c=cmap(idx), 32 | marker=markers[idx], label=cl) 33 | 34 | # highlight test samples 35 | if test_idx: 36 | # plot all samples 37 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 38 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 39 | warnings.warn('Please update to NumPy 1.9.0 or newer') 40 | else: 41 | X_test, y_test = X[test_idx, :], y[test_idx] 42 | 43 | plt.scatter(X_test[:, 0], 44 | X_test[:, 1], 45 | c='', 46 | alpha=1.0, 47 | linewidths=1, 48 | marker='o', 49 | s=55, label='test set') 50 | 51 | 52 | 53 | if __name__ == "__main__": 54 | 55 | np.random.seed(0) 56 | X_xor = np.random.randn(200, 2) 57 | y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0) 58 | y_xor = np.where(y_xor, 1, -1) 59 | plt.scatter(X_xor[y_xor == 1, 0], X_xor[y_xor == 1, 1], c='b', marker='x', label='1') 60 | plt.scatter(X_xor[y_xor == -1, 0], X_xor[y_xor == -1, 1], c='r', marker='s', label='-1') 61 | plt.xlim([-3, 3]) 62 | plt.ylim([-3, 3]) 63 | plt.legend(loc='best') 64 | plt.tight_layout() 65 | # plt.savefig('./figures/xor.png', dpi=300) 66 | plt.show() 67 | 68 | from sklearn.svm import SVC 69 | svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0) 70 | svm.fit(X_xor, y_xor) 71 | plot_decision_regions(X_xor, y_xor, classifier=svm) 72 | plt.legend(loc='upper left') 73 | plt.tight_layout() 74 | # plt.savefig('./figures/support_vector_machine_rbf_xor.png', dpi=300) 75 | plt.show() -------------------------------------------------------------------------------- /ch3-SVM.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | 6 | def versiontuple(v): 7 | return tuple(map(int, (v.split(".")))) 8 | 9 | 10 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 11 | 12 | # setup marker generator and color map 13 | markers = ('s', 'x', 'o', '^', 'v') 14 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 15 | cmap = ListedColormap(colors[:len(np.unique(y))]) 16 | 17 | # plot the decision surface 18 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 19 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 20 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 21 | np.arange(x2_min, x2_max, resolution)) 22 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 23 | Z = Z.reshape(xx1.shape) 24 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 25 | plt.xlim(xx1.min(), xx1.max()) 26 | plt.ylim(xx2.min(), xx2.max()) 27 | 28 | for idx, cl in enumerate(np.unique(y)): 29 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 30 | alpha=0.8, c=cmap(idx), 31 | marker=markers[idx], label=cl) 32 | 33 | # highlight test samples 34 | if test_idx: 35 | # plot all samples 36 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 37 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 38 | warnings.warn('Please update to NumPy 1.9.0 or newer') 39 | else: 40 | X_test, y_test = X[test_idx, :], y[test_idx] 41 | 42 | plt.scatter(X_test[:, 0], 43 | X_test[:, 1], 44 | c='', 45 | alpha=1.0, 46 | 
linewidths=1, 47 | marker='o', 48 | s=55, label='test set') 49 | 50 | 51 | if __name__ == "__main__": 52 | 53 | from sklearn import datasets 54 | import numpy as np 55 | 56 | # Loading Iris dataset 57 | iris = datasets.load_iris() 58 | X = iris.data[:, [2, 3]] 59 | y = iris.target 60 | print('Class labels:', np.unique(y)) 61 | 62 | # Splitting data into 70% training and 30% test data: 63 | from sklearn.cross_validation import train_test_split 64 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 65 | 66 | # Standardizing the features: 67 | from sklearn.preprocessing import StandardScaler 68 | sc = StandardScaler() 69 | sc.fit(X_train) 70 | X_train_std = sc.transform(X_train) 71 | X_test_std = sc.transform(X_test) 72 | 73 | from sklearn.svm import SVC 74 | svm = SVC(kernel='linear', C=1.0, random_state=0) 75 | svm.fit(X_train_std, y_train) 76 | 77 | X_combined_std = np.vstack((X_train_std, X_test_std)) 78 | y_combined = np.hstack((y_train, y_test)) 79 | 80 | plot_decision_regions(X_combined_std, y_combined,classifier=svm, test_idx=range(105, 150)) 81 | plt.xlabel('petal length [standardized]') 82 | plt.ylabel('petal width [standardized]') 83 | plt.legend(loc='upper left') 84 | plt.tight_layout() 85 | # plt.savefig('./figures/support_vector_machine_linear.png', dpi=300) 86 | plt.show() 87 | 88 | 89 | ''' 90 | SGDClassifier can be used for online learning by calling partial_fit method 91 | 92 | from sklearn.linear_model import SGDClassifier 93 | ppn = SGDClassifier(loss='perceptron') 94 | lr = SGDClassifier(loss='log') 95 | svm = SGDClassifier(loss='hinge') 96 | 97 | ''' -------------------------------------------------------------------------------- /ch3-decisionTree-RandomForests.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | def versiontuple(v): 6 | return tuple(map(int, (v.split(".")))) 7 | 8 | 9 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 10 | 11 | # setup marker generator and color map 12 | markers = ('s', 'x', 'o', '^', 'v') 13 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 14 | cmap = ListedColormap(colors[:len(np.unique(y))]) 15 | 16 | # plot the decision surface 17 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 18 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 19 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 20 | np.arange(x2_min, x2_max, resolution)) 21 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 22 | Z = Z.reshape(xx1.shape) 23 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 24 | plt.xlim(xx1.min(), xx1.max()) 25 | plt.ylim(xx2.min(), xx2.max()) 26 | 27 | for idx, cl in enumerate(np.unique(y)): 28 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 29 | alpha=0.8, c=cmap(idx), 30 | marker=markers[idx], label=cl) 31 | 32 | # highlight test samples 33 | if test_idx: 34 | # plot all samples 35 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 36 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 37 | warnings.warn('Please update to NumPy 1.9.0 or newer') 38 | else: 39 | X_test, y_test = X[test_idx, :], y[test_idx] 40 | 41 | plt.scatter(X_test[:, 0], 42 | X_test[:, 1], 43 | c='', 44 | alpha=1.0, 45 | linewidths=1, 46 | marker='o', 47 | s=55, label='test set') 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | from sklearn import datasets 53 | import numpy as np 54 | 55 | # 
Loading Iris dataset 56 | iris = datasets.load_iris() 57 | X = iris.data[:, [2, 3]] 58 | y = iris.target 59 | print('Class labels:', np.unique(y)) 60 | 61 | # Splitting data into 70% training and 30% test data: 62 | from sklearn.cross_validation import train_test_split 63 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 64 | 65 | X_combined = np.vstack((X_train, X_test)) 66 | y_combined = np.hstack((y_train, y_test)) 67 | 68 | # Note: No feature scaling required 69 | 70 | from sklearn.ensemble import RandomForestClassifier 71 | 72 | forest = RandomForestClassifier(criterion='entropy', 73 | n_estimators=10, 74 | random_state=1, 75 | n_jobs=2) 76 | forest.fit(X_train, y_train) 77 | 78 | plot_decision_regions(X_combined, y_combined, classifier=forest, test_idx=range(105,150)) 79 | plt.xlabel('petal length [cm]') 80 | plt.ylabel('petal width [cm]') 81 | plt.legend(loc='upper left') 82 | plt.tight_layout() 83 | # plt.savefig('./figures/random_forest.png', dpi=300) 84 | plt.show() -------------------------------------------------------------------------------- /ch3-decisionTrees-InformationGain.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def gini(p): 6 | return (p)*(1 - (p)) + (1-p)*(1 - (1-p)) 7 | 8 | 9 | def entropy(p): 10 | return - p*np.log2(p) - (1 - p)*np.log2((1 - p)) 11 | 12 | 13 | def error(p): 14 | return 1 - np.max([p, 1 - p]) 15 | 16 | x = np.arange(0.0, 1.0, 0.01) 17 | 18 | ent = [entropy(p) if p != 0 else None for p in x] 19 | sc_ent = [e*0.5 if e else None for e in ent] 20 | err = [error(i) for i in x] 21 | 22 | fig = plt.figure() 23 | ax = plt.subplot(111) 24 | for i, lab, ls, c, in zip([ent, sc_ent, gini(x), err], 25 | ['Entropy', 'Entropy (scaled)', 26 | 'Gini Impurity', 'Misclassification Error'], 27 | ['-', '-', '--', '-.'], 28 | ['black', 'lightgray', 'red', 'green', 'cyan']): 29 | line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c) 30 | 31 | ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), 32 | ncol=3, fancybox=True, shadow=False) 33 | 34 | ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--') 35 | ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--') 36 | plt.ylim([0, 1.1]) 37 | plt.xlabel('p(i=1)') 38 | plt.ylabel('Impurity Index') 39 | plt.tight_layout() 40 | #plt.savefig('./figures/impurity.png', dpi=300, bbox_inches='tight') 41 | plt.show() -------------------------------------------------------------------------------- /ch3-decisionTrees.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | def versiontuple(v): 6 | return tuple(map(int, (v.split(".")))) 7 | 8 | 9 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 10 | 11 | # setup marker generator and color map 12 | markers = ('s', 'x', 'o', '^', 'v') 13 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 14 | cmap = ListedColormap(colors[:len(np.unique(y))]) 15 | 16 | # plot the decision surface 17 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 18 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 19 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 20 | np.arange(x2_min, x2_max, resolution)) 21 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 22 | Z = Z.reshape(xx1.shape) 23 | plt.contourf(xx1, xx2, Z, alpha=0.4, 
cmap=cmap) 24 | plt.xlim(xx1.min(), xx1.max()) 25 | plt.ylim(xx2.min(), xx2.max()) 26 | 27 | for idx, cl in enumerate(np.unique(y)): 28 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 29 | alpha=0.8, c=cmap(idx), 30 | marker=markers[idx], label=cl) 31 | 32 | # highlight test samples 33 | if test_idx: 34 | # plot all samples 35 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 36 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 37 | warnings.warn('Please update to NumPy 1.9.0 or newer') 38 | else: 39 | X_test, y_test = X[test_idx, :], y[test_idx] 40 | 41 | plt.scatter(X_test[:, 0], 42 | X_test[:, 1], 43 | c='', 44 | alpha=1.0, 45 | linewidths=1, 46 | marker='o', 47 | s=55, label='test set') 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | from sklearn import datasets 53 | import numpy as np 54 | 55 | # Loading Iris dataset 56 | iris = datasets.load_iris() 57 | X = iris.data[:, [2, 3]] 58 | y = iris.target 59 | print('Class labels:', np.unique(y)) 60 | 61 | # Splitting data into 70% training and 30% test data: 62 | from sklearn.cross_validation import train_test_split 63 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 64 | 65 | # Note: No feature scaling required 66 | 67 | from sklearn.tree import DecisionTreeClassifier 68 | tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0) 69 | tree.fit(X_train, y_train) 70 | 71 | X_combined = np.vstack((X_train, X_test)) 72 | y_combined = np.hstack((y_train, y_test)) 73 | plot_decision_regions(X_combined, y_combined, classifier=tree, test_idx=range(105,150)) 74 | plt.xlabel('petal length [cm]') 75 | plt.ylabel('petal width [cm]') 76 | plt.legend(loc='upper left') 77 | plt.tight_layout() 78 | # plt.savefig('./figures/decision_tree_decision.png', dpi=300) 79 | plt.show() 80 | 81 | 82 | # export decision tree to an image 83 | from sklearn.tree import export_graphviz 84 | export_graphviz(tree, out_file='/tmp/tree.dot', feature_names=['petal length', 'petal width']) 85 | # Run below on your computer to create png from .dot file (need GraphViz installed) 86 | # “dot -Tpng /tmp/tree.dot -o /tmp/tree.png” 87 | -------------------------------------------------------------------------------- /ch3-k-nearest-neighbors.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | def versiontuple(v): 6 | return tuple(map(int, (v.split(".")))) 7 | 8 | 9 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 10 | 11 | # setup marker generator and color map 12 | markers = ('s', 'x', 'o', '^', 'v') 13 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 14 | cmap = ListedColormap(colors[:len(np.unique(y))]) 15 | 16 | # plot the decision surface 17 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 18 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 19 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 20 | np.arange(x2_min, x2_max, resolution)) 21 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 22 | Z = Z.reshape(xx1.shape) 23 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 24 | plt.xlim(xx1.min(), xx1.max()) 25 | plt.ylim(xx2.min(), xx2.max()) 26 | 27 | for idx, cl in enumerate(np.unique(y)): 28 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 29 | alpha=0.8, c=cmap(idx), 30 | marker=markers[idx], label=cl) 31 | 32 | # highlight test samples 33 | if test_idx: 34 | # plot 
all samples 35 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 36 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 37 | warnings.warn('Please update to NumPy 1.9.0 or newer') 38 | else: 39 | X_test, y_test = X[test_idx, :], y[test_idx] 40 | 41 | plt.scatter(X_test[:, 0], 42 | X_test[:, 1], 43 | c='', 44 | alpha=1.0, 45 | linewidths=1, 46 | marker='o', 47 | s=55, label='test set') 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | from sklearn import datasets 53 | import numpy as np 54 | 55 | # Loading Iris dataset 56 | iris = datasets.load_iris() 57 | X = iris.data[:, [2, 3]] 58 | y = iris.target 59 | print('Class labels:', np.unique(y)) 60 | 61 | # Splitting data into 70% training and 30% test data: 62 | from sklearn.cross_validation import train_test_split 63 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 64 | 65 | 66 | # Standardizing the features: 67 | from sklearn.preprocessing import StandardScaler 68 | sc = StandardScaler() 69 | sc.fit(X_train) 70 | X_train_std = sc.transform(X_train) 71 | X_test_std = sc.transform(X_test) 72 | 73 | X_combined_std = np.vstack((X_train_std, X_test_std)) 74 | y_combined = np.hstack((y_train, y_test)) 75 | 76 | from sklearn.neighbors import KNeighborsClassifier 77 | 78 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 79 | knn.fit(X_train_std, y_train) 80 | 81 | plot_decision_regions(X_combined_std, y_combined, classifier=knn, test_idx=range(105,150)) 82 | 83 | plt.xlabel('petal length [standardized]') 84 | plt.ylabel('petal width [standardized]') 85 | plt.legend(loc='upper left') 86 | plt.tight_layout() 87 | # plt.savefig('./figures/k_nearest_neighbors.png', dpi=300) 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /ch3-logisticRegression.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | from sklearn.linear_model import LogisticRegression 5 | 6 | def versiontuple(v): 7 | return tuple(map(int, (v.split(".")))) 8 | 9 | 10 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 11 | 12 | # setup marker generator and color map 13 | markers = ('s', 'x', 'o', '^', 'v') 14 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 15 | cmap = ListedColormap(colors[:len(np.unique(y))]) 16 | 17 | # plot the decision surface 18 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 19 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 20 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 21 | np.arange(x2_min, x2_max, resolution)) 22 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 23 | Z = Z.reshape(xx1.shape) 24 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 25 | plt.xlim(xx1.min(), xx1.max()) 26 | plt.ylim(xx2.min(), xx2.max()) 27 | 28 | for idx, cl in enumerate(np.unique(y)): 29 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 30 | alpha=0.8, c=cmap(idx), 31 | marker=markers[idx], label=cl) 32 | 33 | # highlight test samples 34 | if test_idx: 35 | # plot all samples 36 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 37 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 38 | warnings.warn('Please update to NumPy 1.9.0 or newer') 39 | else: 40 | X_test, y_test = X[test_idx, :], y[test_idx] 41 | 42 | plt.scatter(X_test[:, 0], 43 | X_test[:, 1], 44 | c='', 45 | alpha=1.0, 46 | linewidths=1, 47 
| marker='o', 48 | s=55, label='test set') 49 | 50 | 51 | if __name__ == "__main__": 52 | 53 | from sklearn import datasets 54 | import numpy as np 55 | 56 | # Loading Iris dataset 57 | iris = datasets.load_iris() 58 | X = iris.data[:, [2, 3]] 59 | y = iris.target 60 | print('Class labels:', np.unique(y)) 61 | 62 | # Splitting data into 70% training and 30% test data: 63 | from sklearn.cross_validation import train_test_split 64 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 65 | 66 | # Standardizing the features: 67 | from sklearn.preprocessing import StandardScaler 68 | sc = StandardScaler() 69 | sc.fit(X_train) 70 | X_train_std = sc.transform(X_train) 71 | X_test_std = sc.transform(X_test) 72 | 73 | lr = LogisticRegression(C=1000.0, random_state=0) 74 | lr.fit(X_train_std, y_train) 75 | 76 | X_combined_std = np.vstack((X_train_std, X_test_std)) 77 | y_combined = np.hstack((y_train, y_test)) 78 | 79 | plot_decision_regions(X_combined_std, y_combined, classifier=lr, test_idx=range(105,150)) 80 | plt.xlabel('petal length [standardized]') 81 | plt.ylabel('petal width [standardized]') 82 | plt.legend(loc='upper left') 83 | plt.show() 84 | 85 | # Display probability of x0 86 | lr.predict_proba(X_test_std[0,:]) -------------------------------------------------------------------------------- /ch3-logisticregression-cost.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | def sigmoid(z): 5 | return 1.0 / (1.0 + np.exp(-z)) 6 | 7 | def cost_1(z): 8 | return - np.log(sigmoid(z)) 9 | 10 | def cost_0(z): 11 | return - np.log(1 - sigmoid(z)) 12 | 13 | z = np.arange(-10, 10, 0.1) 14 | phi_z = sigmoid(z) 15 | 16 | c1 = [cost_1(x) for x in z] 17 | plt.plot(phi_z, c1, label='J(w) if y=1') 18 | 19 | c0 = [cost_0(x) for x in z] 20 | plt.plot(phi_z, c0, linestyle='--', label='J(w) if y=0') 21 | 22 | plt.ylim(0.0, 5.1) 23 | plt.xlim([0, 1]) 24 | plt.xlabel('$\phi$(z)') 25 | plt.ylabel('J(w)') 26 | plt.legend(loc='best') 27 | plt.tight_layout() 28 | # plt.savefig('./figures/log_cost.png', dpi=300) 29 | plt.show() -------------------------------------------------------------------------------- /ch3-scikit-learn-perceptron.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import warnings 4 | 5 | def versiontuple(v): 6 | return tuple(map(int, (v.split(".")))) 7 | 8 | 9 | def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): 10 | 11 | # setup marker generator and color map 12 | markers = ('s', 'x', 'o', '^', 'v') 13 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 14 | cmap = ListedColormap(colors[:len(np.unique(y))]) 15 | 16 | # plot the decision surface 17 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 18 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 19 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 20 | np.arange(x2_min, x2_max, resolution)) 21 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 22 | Z = Z.reshape(xx1.shape) 23 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 24 | plt.xlim(xx1.min(), xx1.max()) 25 | plt.ylim(xx2.min(), xx2.max()) 26 | 27 | for idx, cl in enumerate(np.unique(y)): 28 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 29 | alpha=0.8, c=cmap(idx), 30 | marker=markers[idx], label=cl) 31 | 32 | # highlight test samples 33 | if test_idx: 34 | # 
plot all samples 35 | if not versiontuple(np.__version__) >= versiontuple('1.9.0'): 36 | X_test, y_test = X[list(test_idx), :], y[list(test_idx)] 37 | warnings.warn('Please update to NumPy 1.9.0 or newer') 38 | else: 39 | X_test, y_test = X[test_idx, :], y[test_idx] 40 | 41 | plt.scatter(X_test[:, 0], 42 | X_test[:, 1], 43 | c='', 44 | alpha=1.0, 45 | linewidths=1, 46 | marker='o', 47 | s=55, label='test set') 48 | 49 | 50 | if __name__ == "__main__": 51 | 52 | from sklearn import datasets 53 | import numpy as np 54 | 55 | # Loading Iris dataset 56 | iris = datasets.load_iris() 57 | X = iris.data[:, [2, 3]] 58 | y = iris.target 59 | print('Class labels:', np.unique(y)) 60 | 61 | # Splitting data into 70% training and 30% test data: 62 | from sklearn.cross_validation import train_test_split 63 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 64 | 65 | # Standardizing the features: 66 | from sklearn.preprocessing import StandardScaler 67 | sc = StandardScaler() 68 | sc.fit(X_train) 69 | X_train_std = sc.transform(X_train) 70 | X_test_std = sc.transform(X_test) 71 | 72 | # Fit data to model 73 | from sklearn.linear_model import Perceptron 74 | ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0) 75 | ppn.fit(X_train_std, y_train) 76 | 77 | # Predict 78 | y_pred = ppn.predict(X_test_std) 79 | print('Misclassified samples: %d' % (y_test != y_pred).sum()) 80 | 81 | # Accuracy 82 | from sklearn.metrics import accuracy_score 83 | print('Accuracy: %.2f' % accuracy_score(y_test, y_pred)) 84 | 85 | X_combined_std = np.vstack((X_train_std, X_test_std)) 86 | y_combined = np.hstack((y_train, y_test)) 87 | 88 | plot_decision_regions(X=X_combined_std, y=y_combined, classifier=ppn, test_idx=range(105, 150)) 89 | plt.xlabel('petal length [standardized]') 90 | plt.ylabel('petal width [standardized]') 91 | plt.legend(loc='upper left') 92 | 93 | plt.tight_layout() 94 | # plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300) 95 | plt.show() 96 | -------------------------------------------------------------------------------- /ch3-sigmoid.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | 5 | def sigmoid(z): 6 | return 1.0 / (1.0 + np.exp(-z)) 7 | 8 | z = np.arange(-7, 7, 0.1) 9 | phi_z = sigmoid(z) 10 | 11 | plt.plot(z, phi_z) 12 | plt.axvline(0.0, color='k') 13 | plt.ylim(-0.1, 1.1) 14 | plt.xlabel('z') 15 | plt.ylabel('$\phi (z)$') 16 | 17 | # y axis ticks and gridline 18 | plt.yticks([0.0, 0.5, 1.0]) 19 | ax = plt.gca() 20 | ax.yaxis.grid(True) 21 | 22 | plt.tight_layout() 23 | # plt.savefig('./figures/sigmoid.png', dpi=300) 24 | plt.show() -------------------------------------------------------------------------------- /ch4-categoricalData.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df = pd.DataFrame([ 3 | ['green', 'M', 10.1, 'class1'], 4 | ['red', 'L', 13.5, 'class2'], 5 | ['blue', 'XL', 15.3, 'class1']]) 6 | 7 | df.columns = ['color', 'size', 'price', 'classlabel'] 8 | df 9 | 10 | # convert ordinal data: sizes to integers 11 | size_mapping = { 12 | 'XL': 3, 13 | 'L': 2, 14 | 'M': 1} 15 | 16 | df['size'] = df['size'].map(size_mapping) 17 | df 18 | inv_size_mapping = {v: k for k, v in size_mapping.items()} 19 | df['size'].map(inv_size_mapping) 20 | 21 | # class labels 22 | import numpy as np 23 | class_mapping = {label:idx for idx,label in enumerate(np.unique(df['classlabel']))} 24 | 
class_mapping 25 | df['classlabel'] = df['classlabel'].map(class_mapping) 26 | df 27 | inv_class_mapping = {v: k for k, v in class_mapping.items()} 28 | df['classlabel'] = df['classlabel'].map(inv_class_mapping) 29 | df 30 | 31 | 32 | # alternative way for class labels 33 | from sklearn.preprocessing import LabelEncoder 34 | class_le = LabelEncoder() 35 | y = class_le.fit_transform(df['classlabel'].values) 36 | y 37 | class_le.inverse_transform(y) 38 | 39 | 40 | 41 | X = df[['color', 'size', 'price']].values 42 | # Convert nominal data: color to numerical 43 | color_le = LabelEncoder() 44 | X[:, 0] = color_le.fit_transform(X[:, 0]) 45 | X 46 | 47 | 48 | # One hot encoding to derive extra features from color so color integer values 49 | # don't cause algorithm issues 50 | from sklearn.preprocessing import OneHotEncoder 51 | ohe = OneHotEncoder(categorical_features=[0]) 52 | ohe.fit_transform(X).toarray() 53 | 54 | # Alternative: One hot encoding using pandas 55 | pd.get_dummies(df[['price', 'color', 'size']]) 56 | -------------------------------------------------------------------------------- /ch4-featureSelection-randomForest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | if __name__ == "__main__": 6 | 7 | # Grab wines data set 8 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 9 | 10 | df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 11 | 'Alcalinity of ash', 'Magnesium', 'Total phenols', 12 | 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 13 | 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'] 14 | 15 | from sklearn.cross_validation import train_test_split 16 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) 18 | 19 | from sklearn.preprocessing import StandardScaler 20 | stdsc = StandardScaler() 21 | X_train_std = stdsc.fit_transform(X_train) 22 | X_test_std = stdsc.transform(X_test) 23 | 24 | from sklearn.ensemble import RandomForestClassifier 25 | feat_labels = df_wine.columns[1:] 26 | forest = RandomForestClassifier(n_estimators=10000, 27 | random_state=0, 28 | n_jobs=-1) 29 | 30 | forest.fit(X_train, y_train) 31 | importances = forest.feature_importances_ 32 | 33 | indices = np.argsort(importances)[::-1] 34 | 35 | for f in range(X_train.shape[1]): 36 | print("%2d) %-*s %f" % (f + 1, 30, 37 | feat_labels[indices[f]], 38 | importances[indices[f]])) 39 | 40 | plt.title('Feature Importances') 41 | plt.bar(range(X_train.shape[1]), 42 | importances[indices], 43 | color='lightblue', 44 | align='center') 45 | 46 | plt.xticks(range(X_train.shape[1]), 47 | feat_labels[indices], rotation=90) 48 | plt.xlim([-1, X_train.shape[1]]) 49 | plt.tight_layout() 50 | #plt.savefig('./random_forest.png', dpi=300) 51 | plt.show() 52 | -------------------------------------------------------------------------------- /ch4-imputation.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import Imputer 2 | import pandas as pd 3 | from io import StringIO 4 | 5 | csv_data = '''A,B,C,D 6 | 1.0,2.0,3.0,4.0 7 | 5.0,6.0,,8.0 8 | 10.0,11.0,12.0,''' 9 | 10 | # If you are using Python 2.7, you need 11 | # to convert the string to unicode: 12 | # csv_data = unicode(csv_data) 13 | 14 | df = pd.read_csv(StringIO(csv_data)) 
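# Note: scikit-learn 0.22+ removed the `Imputer` class used below in favour of
# `sklearn.impute.SimpleImputer`, which always works column-wise (the axis=0 behaviour here).
# A minimal sketch, assuming a newer scikit-learn release is installed:
#
#   import numpy as np
#   from sklearn.impute import SimpleImputer
#   imr = SimpleImputer(missing_values=np.nan, strategy='mean')
#   imputed_data = imr.fit_transform(df.values)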
15 | 16 | imr = Imputer(missing_values='NaN', strategy='mean', axis=0) 17 | imr = imr.fit(df) 18 | imputed_data = imr.transform(df.values) 19 | imputed_data 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /ch4-missingData.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from io import StringIO 3 | 4 | csv_data = '''A,B,C,D 5 | 1.0,2.0,3.0,4.0 6 | 5.0,6.0,,8.0 7 | 10.0,11.0,12.0,''' 8 | 9 | # If you are using Python 2.7, you need 10 | # to convert the string to unicode: 11 | # csv_data = unicode(csv_data) 12 | 13 | df = pd.read_csv(StringIO(csv_data)) 14 | 15 | # Show count of missing data 16 | df.isnull().sum() 17 | 18 | # Drop rows with missing data 19 | df.dropna() 20 | 21 | # Drop columns that contain a NaN 22 | df.dropna(axis=1) 23 | 24 | # only drop rows where all columns are NaN 25 | df.dropna(how='all') 26 | 27 | # drop rows that have fewer than 4 non-NaN values 28 | df.dropna(thresh=4) 29 | 30 | # only drop rows where NaN appear in specific columns (here: 'C') 31 | df.dropna(subset=['C']) 32 | 33 | -------------------------------------------------------------------------------- /ch4-partitioningData.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Grab wines data set 5 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 6 | 7 | df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 8 | 'Alcalinity of ash', 'Magnesium', 'Total phenols', 9 | 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 10 | 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'] 11 | 12 | print('Class labels', np.unique(df_wine['Class label'])) 13 | df_wine.head() 14 | 15 | from sklearn.cross_validation import train_test_split 16 | 17 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 18 | 19 | X_train, X_test, y_train, y_test = \ 20 | train_test_split(X, y, test_size=0.3, random_state=0) 21 | 22 | 23 | # normalization 24 | from sklearn.preprocessing import MinMaxScaler 25 | mms = MinMaxScaler() 26 | X_train_norm = mms.fit_transform(X_train) 27 | X_test_norm = mms.transform(X_test) 28 | 29 | 30 | # standardization (generally better suited to machine learning algorithms) 31 | from sklearn.preprocessing import StandardScaler 32 | stdsc = StandardScaler() 33 | X_train_std = stdsc.fit_transform(X_train) 34 | X_test_std = stdsc.transform(X_test) 35 | 36 | from sklearn.linear_model import LogisticRegression 37 | 38 | # Using the L1 regularization penalty 39 | lr = LogisticRegression(penalty='l1', C=0.1) 40 | lr.fit(X_train_std, y_train) 41 | print('Training accuracy:', lr.score(X_train_std, y_train)) 42 | print('Test accuracy:', lr.score(X_test_std, y_test)) 43 | 44 | 45 | lr.intercept_ 46 | 47 | # show weights (3 rows for three classes) 48 | lr.coef_ 49 | ''' 50 | We notice that the weight vectors are sparse, which means that they only have a 51 | few non-zero entries. 
As a result of the L1 regularization, which serves as a method 52 | for feature selection, we just trained a model that is robust to the potentially 53 | irrelevant features in this dataset.''' 54 | 55 | 56 | import matplotlib.pyplot as plt 57 | 58 | fig = plt.figure() 59 | ax = plt.subplot(111) 60 | 61 | colors = ['blue', 'green', 'red', 'cyan', 62 | 'magenta', 'yellow', 'black', 63 | 'pink', 'lightgreen', 'lightblue', 64 | 'gray', 'indigo', 'orange'] 65 | 66 | weights, params = [], [] 67 | for c in np.arange(-4, 6): 68 | lr = LogisticRegression(penalty='l1', C=10**c, random_state=0) 69 | lr.fit(X_train_std, y_train) 70 | weights.append(lr.coef_[1]) 71 | params.append(10**c) 72 | 73 | weights = np.array(weights) 74 | 75 | for column, color in zip(range(weights.shape[1]), colors): 76 | plt.plot(params, weights[:, column], 77 | label=df_wine.columns[column+1], 78 | color=color) 79 | plt.axhline(0, color='black', linestyle='--', linewidth=3) 80 | plt.xlim([10**(-5), 10**5]) 81 | plt.ylabel('weight coefficient') 82 | plt.xlabel('C') 83 | plt.xscale('log') 84 | plt.legend(loc='upper left') 85 | ax.legend(loc='upper center', 86 | bbox_to_anchor=(1.38, 1.03), 87 | ncol=1, fancybox=True) 88 | # plt.savefig('./figures/l1_path.png', dpi=300) 89 | plt.show() 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /ch4-seq-feature-selection.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import clone 2 | from itertools import combinations 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.metrics import accuracy_score 6 | 7 | # Sequential Backward Selection (SBS) 8 | class SBS(): 9 | def __init__(self, estimator, k_features, scoring=accuracy_score, 10 | test_size=0.25, random_state=1): 11 | self.scoring = scoring 12 | self.estimator = clone(estimator) 13 | self.k_features = k_features 14 | self.test_size = test_size 15 | self.random_state = random_state 16 | 17 | def fit(self, X, y): 18 | 19 | X_train, X_test, y_train, y_test = \ 20 | train_test_split(X, y, test_size=self.test_size, 21 | random_state=self.random_state) 22 | 23 | dim = X_train.shape[1] 24 | self.indices_ = tuple(range(dim)) 25 | self.subsets_ = [self.indices_] 26 | score = self._calc_score(X_train, y_train, 27 | X_test, y_test, self.indices_) 28 | self.scores_ = [score] 29 | 30 | while dim > self.k_features: 31 | scores = [] 32 | subsets = [] 33 | 34 | for p in combinations(self.indices_, r=dim-1): 35 | score = self._calc_score(X_train, y_train, 36 | X_test, y_test, p) 37 | scores.append(score) 38 | subsets.append(p) 39 | 40 | best = np.argmax(scores) 41 | self.indices_ = subsets[best] 42 | self.subsets_.append(self.indices_) 43 | dim -= 1 44 | 45 | self.scores_.append(scores[best]) 46 | self.k_score_ = self.scores_[-1] 47 | 48 | return self 49 | 50 | def transform(self, X): 51 | return X[:, self.indices_] 52 | 53 | def _calc_score(self, X_train, y_train, X_test, y_test, indices): 54 | self.estimator.fit(X_train[:, indices], y_train) 55 | y_pred = self.estimator.predict(X_test[:, indices]) 56 | score = self.scoring(y_test, y_pred) 57 | return score 58 | 59 | if __name__ == "__main__": 60 | 61 | # Grab wines data set 62 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 63 | 64 | df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 65 | 'Alcalinity of ash', 'Magnesium', 'Total phenols', 66 | 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
67 | 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline'] 68 | 69 | from sklearn.cross_validation import train_test_split 70 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 71 | 72 | X_train, X_test, y_train, y_test = \ 73 | train_test_split(X, y, test_size=0.3, random_state=0) 74 | 75 | 76 | from sklearn.preprocessing import StandardScaler 77 | stdsc = StandardScaler() 78 | X_train_std = stdsc.fit_transform(X_train) 79 | X_test_std = stdsc.transform(X_test) 80 | 81 | from sklearn.neighbors import KNeighborsClassifier 82 | import matplotlib.pyplot as plt 83 | knn = KNeighborsClassifier(n_neighbors=2) 84 | 85 | # selecting features using SBS 86 | sbs = SBS(knn, k_features=1) 87 | sbs.fit(X_train_std, y_train) 88 | 89 | # plotting performance of feature subsets 90 | k_feat = [len(k) for k in sbs.subsets_] 91 | 92 | plt.plot(k_feat, sbs.scores_, marker='o') 93 | plt.ylim([0.7, 1.1]) 94 | plt.ylabel('Accuracy') 95 | plt.xlabel('Number of features') 96 | plt.grid() 97 | plt.tight_layout() 98 | # plt.savefig('./sbs.png', dpi=300) 99 | plt.show() 100 | 101 | -------------------------------------------------------------------------------- /ch5-LDA-scikit.py: -------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def plot_decision_regions(X, y, classifier, resolution=0.02): 6 | # setup marker generator and color map 7 | markers = ('s', 'x', 'o', '^', 'v') 8 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 9 | cmap = ListedColormap(colors[:len(np.unique(y))]) 10 | 11 | # plot the decision surface 12 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 13 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 14 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 15 | np.arange(x2_min, x2_max, resolution)) 16 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 17 | Z = Z.reshape(xx1.shape) 18 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 19 | plt.xlim(xx1.min(), xx1.max()) 20 | plt.ylim(xx2.min(), xx2.max()) 21 | 22 | # plot class samples 23 | for idx, cl in enumerate(np.unique(y)): 24 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 25 | alpha=0.8, c=cmap(idx), 26 | marker=markers[idx], label=cl) 27 | 28 | import pandas as pd 29 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 30 | 31 | # Split and Normalize dataset 32 | from sklearn.cross_validation import train_test_split 33 | from sklearn.preprocessing import StandardScaler 34 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 35 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0) 36 | sc = StandardScaler() 37 | X_train_std = sc.fit_transform(X_train) 38 | X_test_std = sc.transform(X_test) 39 | 40 | from sklearn.linear_model import LogisticRegression 41 | from sklearn.lda import LDA 42 | 43 | 44 | lda = LDA(n_components=2) 45 | X_train_lda = lda.fit_transform(X_train_std, y_train) 46 | lr = LogisticRegression() 47 | lr = lr.fit(X_train_lda, y_train) 48 | 49 | 50 | plot_decision_regions(X_train_lda, y_train, classifier=lr) 51 | plt.xlabel('LD 1') 52 | plt.ylabel('LD 2') 53 | plt.legend(loc='lower left') 54 | plt.show() 55 | 56 | 57 | X_test_lda = lda.transform(X_test_std) 58 | plot_decision_regions(X_test_lda, y_test, classifier=lr) 59 | plt.xlabel('LD 1') 60 | plt.ylabel('LD 2') 61 | plt.legend(loc='lower left') 62 | 
plt.show() 63 | -------------------------------------------------------------------------------- /ch5-PCA-Kernel-ex1.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial.distance import pdist, squareform 2 | from scipy import exp 3 | from scipy.linalg import eigh 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def rbf_kernel_pca(X, gamma, n_components): 9 | """ 10 | RBF kernel PCA implementation. 11 | 12 | Parameters 13 | ------------ 14 | X: {NumPy ndarray}, shape = [n_samples, n_features] 15 | 16 | gamma: float 17 | Tuning parameter of the RBF kernel 18 | 19 | n_components: int 20 | Number of principal components to return 21 | 22 | Returns 23 | ------------ 24 | X_pc: {NumPy ndarray}, shape = [n_samples, k_features] 25 | Projected dataset 26 | 27 | """ 28 | # Calculate pairwise squared Euclidean distances 29 | # in the MxN dimensional dataset. 30 | sq_dists = pdist(X, 'sqeuclidean') 31 | 32 | # Convert pairwise distances into a square matrix. 33 | mat_sq_dists = squareform(sq_dists) 34 | 35 | # Compute the symmetric kernel matrix. 36 | K = exp(-gamma * mat_sq_dists) 37 | 38 | # Center the kernel matrix. 39 | N = K.shape[0] 40 | one_n = np.ones((N,N)) / N 41 | K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) 42 | 43 | # Obtaining eigenpairs from the centered kernel matrix 44 | # numpy.eigh returns them in sorted order 45 | eigvals, eigvecs = eigh(K) 46 | 47 | # Collect the top k eigenvectors (projected samples) 48 | X_pc = np.column_stack((eigvecs[:, -i] 49 | for i in range(1, n_components + 1))) 50 | 51 | return X_pc 52 | 53 | 54 | from sklearn.datasets import make_moons 55 | X, y = make_moons(n_samples=100, random_state=123) 56 | plt.scatter(X[y==0, 0], X[y==0, 1], color='red', marker='^', alpha=0.5) 57 | plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', marker='o', alpha=0.5) 58 | plt.show() 59 | 60 | from sklearn.decomposition import PCA 61 | scikit_pca = PCA(n_components=2) 62 | X_spca = scikit_pca.fit_transform(X) 63 | 64 | # Show effects of normal PCA 65 | fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(7,3)) 66 | ax[0].scatter(X_spca[y==0, 0], X_spca[y==0, 1], color='red', marker='^', alpha=0.5) 67 | ax[0].scatter(X_spca[y==1, 0], X_spca[y==1, 1], color='blue', marker='o', alpha=0.5) 68 | ax[1].scatter(X_spca[y==0, 0], np.zeros((50,1))+0.02, color='red', marker='^', alpha=0.5) 69 | ax[1].scatter(X_spca[y==1, 0], np.zeros((50,1))-0.02, color='blue', marker='o', alpha=0.5) 70 | ax[0].set_xlabel('PC1') 71 | ax[0].set_ylabel('PC2') 72 | ax[1].set_ylim([-1, 1]) 73 | ax[1].set_yticks([]) 74 | ax[1].set_xlabel('PC1') 75 | plt.show() 76 | 77 | 78 | # Apply RBF Kernel 79 | from matplotlib.ticker import FormatStrFormatter 80 | X_kpca = rbf_kernel_pca(X, gamma=15, n_components=2) 81 | 82 | # Show effects of RBF PCA Kernel 83 | fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(7,3)) 84 | ax[0].scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', marker='^', alpha=0.5) 85 | ax[0].scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', marker='o', alpha=0.5) 86 | ax[1].scatter(X_kpca[y==0, 0], np.zeros((50,1))+0.02, color='red', marker='^', alpha=0.5) 87 | ax[1].scatter(X_kpca[y==1, 0], np.zeros((50,1))-0.02, color='blue', marker='o', alpha=0.5) 88 | ax[0].set_xlabel('PC1') 89 | ax[0].set_ylabel('PC2') 90 | ax[1].set_ylim([-1, 1]) 91 | ax[1].set_yticks([]) 92 | ax[1].set_xlabel('PC1') 93 | ax[0].xaxis.set_major_formatter(FormatStrFormatter('%0.1f')) 94 | 
ax[1].xaxis.set_major_formatter(FormatStrFormatter('%0.1f')) 95 | plt.show() 96 | 97 | 98 | -------------------------------------------------------------------------------- /ch5-PCA-Kernel-ex2.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial.distance import pdist, squareform 2 | from scipy import exp 3 | from scipy.linalg import eigh 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def rbf_kernel_pca(X, gamma, n_components): 9 | """ 10 | RBF kernel PCA implementation. 11 | 12 | Parameters 13 | ------------ 14 | X: {NumPy ndarray}, shape = [n_samples, n_features] 15 | 16 | gamma: float 17 | Tuning parameter of the RBF kernel 18 | 19 | n_components: int 20 | Number of principal components to return 21 | 22 | Returns 23 | ------------ 24 | X_pc: {NumPy ndarray}, shape = [n_samples, k_features] 25 | Projected dataset 26 | 27 | """ 28 | # Calculate pairwise squared Euclidean distances 29 | # in the MxN dimensional dataset. 30 | sq_dists = pdist(X, 'sqeuclidean') 31 | 32 | # Convert pairwise distances into a square matrix. 33 | mat_sq_dists = squareform(sq_dists) 34 | 35 | # Compute the symmetric kernel matrix. 36 | K = exp(-gamma * mat_sq_dists) 37 | 38 | # Center the kernel matrix. 39 | N = K.shape[0] 40 | one_n = np.ones((N,N)) / N 41 | K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) 42 | 43 | # Obtaining eigenpairs from the centered kernel matrix 44 | # numpy.eigh returns them in sorted order 45 | eigvals, eigvecs = eigh(K) 46 | 47 | # Collect the top k eigenvectors (projected samples) 48 | X_pc = np.column_stack((eigvecs[:, -i] 49 | for i in range(1, n_components + 1))) 50 | 51 | return X_pc 52 | 53 | 54 | from sklearn.datasets import make_circles 55 | X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2) 56 | plt.scatter(X[y==0, 0], X[y==0, 1], color='red', marker='^', alpha=0.5) 57 | plt.scatter(X[y==1, 0], X[y==1, 1], color='blue', marker='o', alpha=0.5) 58 | plt.show() 59 | 60 | from sklearn.decomposition import PCA 61 | scikit_pca = PCA(n_components=2) 62 | X_spca = scikit_pca.fit_transform(X) 63 | fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(7,3)) 64 | ax[0].scatter(X_spca[y==0, 0], X_spca[y==0, 1], color='red', marker='^', alpha=0.5) 65 | ax[0].scatter(X_spca[y==1, 0], X_spca[y==1, 1], color='blue', marker='o', alpha=0.5) 66 | ax[1].scatter(X_spca[y==0, 0], np.zeros((500,1))+0.02, color='red', marker='^', alpha=0.5) 67 | ax[1].scatter(X_spca[y==1, 0], np.zeros((500,1))-0.02, color='blue', marker='o', alpha=0.5) 68 | ax[0].set_xlabel('PC1') 69 | ax[0].set_ylabel('PC2') 70 | ax[1].set_ylim([-1, 1]) 71 | ax[1].set_yticks([]) 72 | ax[1].set_xlabel('PC1') 73 | plt.show() 74 | 75 | 76 | X_kpca = rbf_kernel_pca(X, gamma=15, n_components=2) 77 | fig, ax = plt.subplots(nrows=1,ncols=2, figsize=(7,3)) 78 | ax[0].scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', marker='^', alpha=0.5) 79 | ax[0].scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', marker='o', alpha=0.5) 80 | ax[1].scatter(X_kpca[y==0, 0], np.zeros((500,1))+0.02, color='red', marker='^', alpha=0.5) 81 | ax[1].scatter(X_kpca[y==1, 0], np.zeros((500,1))-0.02, color='blue', marker='o', alpha=0.5) 82 | ax[0].set_xlabel('PC1') 83 | ax[0].set_ylabel('PC2') 84 | ax[1].set_ylim([-1, 1]) 85 | ax[1].set_yticks([]) 86 | ax[1].set_xlabel('PC1') 87 | plt.show() -------------------------------------------------------------------------------- /ch5-PCA-Kernel-newRBF.py: 
-------------------------------------------------------------------------------- 1 | from scipy.spatial.distance import pdist, squareform 2 | from scipy import exp 3 | from scipy.linalg import eigh 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | def rbf_kernel_pca(X, gamma, n_components): 8 | """ 9 | RBF kernel PCA implementation. 10 | 11 | Parameters 12 | ------------ 13 | X: {NumPy ndarray}, shape = [n_samples, n_features] 14 | 15 | gamma: float 16 | Tuning parameter of the RBF kernel 17 | 18 | n_components: int 19 | Number of principal components to return 20 | 21 | Returns 22 | ------------ 23 | X_pc: {NumPy ndarray}, shape = [n_samples, k_features] 24 | Projected dataset 25 | 26 | lambdas: list 27 | Eigenvalues 28 | 29 | """ 30 | # Calculate pairwise squared Euclidean distances 31 | # in the MxN dimensional dataset. 32 | sq_dists = pdist(X, 'sqeuclidean') 33 | 34 | # Convert pairwise distances into a square matrix. 35 | mat_sq_dists = squareform(sq_dists) 36 | 37 | # Compute the symmetric kernel matrix. 38 | K = exp(-gamma * mat_sq_dists) 39 | 40 | # Center the kernel matrix. 41 | N = K.shape[0] 42 | one_n = np.ones((N,N)) / N 43 | K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n) 44 | 45 | # Obtaining eigenpairs from the centered kernel matrix 46 | # numpy.eigh returns them in sorted order 47 | eigvals, eigvecs = eigh(K) 48 | 49 | # Collect the top k eigenvectors (projected samples) 50 | alphas = np.column_stack((eigvecs[:,-i] 51 | for i in range(1,n_components+1))) 52 | 53 | # Collect the corresponding eigenvalues 54 | lambdas = [eigvals[-i] for i in range(1,n_components+1)] 55 | 56 | return alphas, lambdas 57 | 58 | 59 | def project_x(x_new, X, gamma, alphas, lambdas): 60 | pair_dist = np.array([np.sum((x_new-row)**2) for row in X]) 61 | k = np.exp(-gamma * pair_dist) 62 | return k.dot(alphas / lambdas) 63 | 64 | 65 | from sklearn.datasets import make_moons 66 | X, y = make_moons(n_samples=100, random_state=123) 67 | alphas, lambdas =rbf_kernel_pca(X, gamma=15, n_components=1) 68 | x_new = X[25] 69 | x_proj = alphas[25] # original projection 70 | 71 | x_reproj = project_x(x_new, X, gamma=15, alphas=alphas, lambdas=lambdas) 72 | 73 | 74 | plt.scatter(alphas[y==0, 0], np.zeros((50)), color='red', marker='^',alpha=0.5) 75 | plt.scatter(alphas[y==1, 0], np.zeros((50)), color='blue', marker='o', alpha=0.5) 76 | plt.scatter(x_proj, 0, color='black', label='original projection of point X[25]', marker='^', s=100) 77 | plt.scatter(x_reproj, 0, color='green', label='remapped point X[25]', marker='x', s=500) 78 | plt.legend(scatterpoints=1) 79 | plt.show() 80 | 81 | 82 | -------------------------------------------------------------------------------- /ch5-PCA-Kernel-scikit.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import KernelPCA 2 | from sklearn.datasets import make_moons 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | X, y = make_moons(n_samples=100, random_state=123) 7 | scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15) 8 | X_skernpca = scikit_kpca.fit_transform(X) 9 | plt.scatter(X_skernpca[y==0, 0], X_skernpca[y==0, 1], color='red', marker='^', alpha=0.5) 10 | plt.scatter(X_skernpca[y==1, 0], X_skernpca[y==1, 1], color='blue', marker='o', alpha=0.5) 11 | plt.xlabel('PC1') 12 | plt.ylabel('PC2') 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch5-PCA-scikit.py: 
-------------------------------------------------------------------------------- 1 | from matplotlib.colors import ListedColormap 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | def plot_decision_regions(X, y, classifier, resolution=0.02): 6 | # setup marker generator and color map 7 | markers = ('s', 'x', 'o', '^', 'v') 8 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') 9 | cmap = ListedColormap(colors[:len(np.unique(y))]) 10 | 11 | # plot the decision surface 12 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 13 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 14 | xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution), 15 | np.arange(x2_min, x2_max, resolution)) 16 | Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T) 17 | Z = Z.reshape(xx1.shape) 18 | plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap) 19 | plt.xlim(xx1.min(), xx1.max()) 20 | plt.ylim(xx2.min(), xx2.max()) 21 | 22 | # plot class samples 23 | for idx, cl in enumerate(np.unique(y)): 24 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], 25 | alpha=0.8, c=cmap(idx), 26 | marker=markers[idx], label=cl) 27 | 28 | import pandas as pd 29 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 30 | 31 | # Split and Normalize dataset 32 | from sklearn.cross_validation import train_test_split 33 | from sklearn.preprocessing import StandardScaler 34 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 35 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0) 36 | sc = StandardScaler() 37 | X_train_std = sc.fit_transform(X_train) 38 | X_test_std = sc.transform(X_test) 39 | 40 | from sklearn.linear_model import LogisticRegression 41 | from sklearn.decomposition import PCA 42 | 43 | # set n_components equal to number of components (k). 
If set to None, all components are kept 44 | pca = PCA(n_components=2) 45 | lr = LogisticRegression() 46 | X_train_pca = pca.fit_transform(X_train_std) 47 | X_test_pca = pca.transform(X_test_std) 48 | lr.fit(X_train_pca, y_train) 49 | plot_decision_regions(X_train_pca, y_train, classifier=lr) 50 | plt.xlabel('PC1') 51 | plt.ylabel('PC2') 52 | plt.legend(loc='lower left') 53 | plt.show() 54 | -------------------------------------------------------------------------------- /ch5-PCA1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 3 | 4 | # Split and Normalize dataset 5 | from sklearn.cross_validation import train_test_split 6 | from sklearn.preprocessing import StandardScaler 7 | X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values 8 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=0) 9 | sc = StandardScaler() 10 | X_train_std = sc.fit_transform(X_train) 11 | X_test_std = sc.transform(X_test) 12 | 13 | 14 | # Calculate the covariance matrix and its eigenvalues/eigenvectors 15 | import numpy as np 16 | cov_mat = np.cov(X_train_std.T) 17 | eigen_vals, eigen_vecs = np.linalg.eig(cov_mat) 18 | print('\nEigenvalues \n%s' % eigen_vals) 19 | 20 | # Plot graph 21 | tot = sum(eigen_vals) 22 | var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)] 23 | cum_var_exp = np.cumsum(var_exp) 24 | import matplotlib.pyplot as plt 25 | plt.bar(range(1,14), var_exp, alpha=0.5, align='center', label='individual explained variance') 26 | plt.step(range(1,14), cum_var_exp, where='mid',label='cumulative explained variance') 27 | plt.ylabel('Explained variance ratio') 28 | plt.xlabel('Principal components') 29 | plt.legend(loc='best') 30 | plt.show() 31 | 32 | # Sort the eigenpairs by decreasing order of the eigenvalues (sort on the eigenvalue only so the eigenvector arrays are never compared): 33 | eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i]) for i in range(len(eigen_vals))] 34 | eigen_pairs.sort(key=lambda k: k[0], reverse=True) 35 | 36 | # Pick k eigenvectors (k=2 here) 37 | w = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis])) 38 | 39 | # Perform PCA 40 | X_train_pca = X_train_std.dot(w) 41 | 42 | 43 | # Plot the PCA-transformed data 44 | colors = ['r', 'b', 'g'] 45 | markers = ['s', 'x', 'o'] 46 | for l, c, m in zip(np.unique(y_train), colors, markers): 47 | plt.scatter(X_train_pca[y_train==l, 0], X_train_pca[y_train==l, 1], c=c, label=l, marker=m) 48 | plt.xlabel('PC 1') 49 | plt.ylabel('PC 2') 50 | plt.legend(loc='lower left') 51 | plt.show() 52 | 53 | 54 | -------------------------------------------------------------------------------- /ch6-F1-score.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | from sklearn.pipeline import Pipeline 3 | 4 | import pandas as pd 5 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 6 | 7 | from sklearn.preprocessing import LabelEncoder 8 | X = df.loc[:, 2:].values 9 | y = df.loc[:, 1].values 10 | le = LabelEncoder() 11 | y = le.fit_transform(y) 12 | 13 | from sklearn.cross_validation import train_test_split 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 15 | 16 | from sklearn.svm import SVC 17 | pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))]) 18 | 19 | 20 | pipe_svc.fit(X_train, y_train) 
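# The scores printed below treat class 1 (label 'M' after LabelEncoder, i.e. malignant)
# as the positive class: precision = TP / (TP + FP), recall = TP / (TP + FN),
# and F1 = 2 * precision * recall / (precision + recall).
# A minimal, optional sketch (assuming the GridSearchCV setup from
# ch6-hyperparameterTuning-gridSearch.py) of using F1 instead of accuracy as the
# grid-search scoring metric, with a custom positive label:
#
#   from sklearn.metrics import make_scorer, f1_score
#   scorer = make_scorer(f1_score, pos_label=0)
#   gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring=scorer, cv=10)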
21 | y_pred = pipe_svc.predict(X_test) 22 | 23 | from sklearn.metrics import precision_score 24 | from sklearn.metrics import recall_score, f1_score 25 | 26 | print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred)) 27 | print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred)) 28 | print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred)) -------------------------------------------------------------------------------- /ch6-Kfold-CrossValidation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | X = df.loc[:, 2:].values 6 | y = df.loc[:, 1].values 7 | le = LabelEncoder() 8 | y = le.fit_transform(y) 9 | 10 | from sklearn.cross_validation import train_test_split 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.decomposition import PCA 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.pipeline import Pipeline 16 | pipe_lr = Pipeline([('scl', StandardScaler()),('pca', PCA(n_components=2)),('clf', LogisticRegression(random_state=1))]) 17 | #pipe_lr.fit(X_train, y_train) 18 | 19 | 20 | import numpy as np 21 | from sklearn.cross_validation import StratifiedKFold 22 | kfold = StratifiedKFold(y=y_train, n_folds=10, random_state=1) 23 | scores = [] 24 | for k, (train, test) in enumerate(kfold): 25 | pipe_lr.fit(X_train[train], y_train[train]) 26 | score = pipe_lr.score(X_train[test], y_train[test]) 27 | scores.append(score) 28 | print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train]), score)) 29 | 30 | print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 31 | -------------------------------------------------------------------------------- /ch6-ModelSelect-ParamTune-Nested-Kfold-CrossValidation.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | from sklearn.pipeline import Pipeline 3 | import numpy as np 4 | 5 | import pandas as pd 6 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 7 | 8 | from sklearn.preprocessing import LabelEncoder 9 | X = df.loc[:, 2:].values 10 | y = df.loc[:, 1].values 11 | le = LabelEncoder() 12 | y = le.fit_transform(y) 13 | 14 | from sklearn.cross_validation import train_test_split 15 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 16 | 17 | 18 | from sklearn.grid_search import GridSearchCV 19 | from sklearn.svm import SVC 20 | pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))]) 21 | param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] 22 | param_grid = [{'clf__C': param_range, 23 | 'clf__kernel': ['linear']}, 24 | {'clf__C': param_range, 25 | 'clf__gamma': param_range, 26 | 'clf__kernel': ['rbf']}] 27 | 28 | from sklearn.cross_validation import cross_val_score 29 | # Inner Loop for parameter tuning 30 | gs = GridSearchCV(estimator=pipe_svc, 31 | param_grid=param_grid, 32 | scoring='accuracy', 33 | cv=2, 34 | n_jobs=-1) 35 | 36 | #Outer Loop for model selection 37 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 38 | print('CV accuracy: %.3f +/- 
%.3f' % (np.mean(scores), np.std(scores))) 39 | 40 | 41 | from sklearn.tree import DecisionTreeClassifier 42 | gs = GridSearchCV( 43 | estimator=DecisionTreeClassifier(random_state=0), 44 | param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}], 45 | scoring='accuracy', 46 | cv=2) 47 | 48 | scores = cross_val_score(gs, 49 | X_train, 50 | y_train, 51 | scoring='accuracy', 52 | cv=5) 53 | 54 | print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 55 | -------------------------------------------------------------------------------- /ch6-ROC-curve.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | from sklearn.pipeline import Pipeline 3 | from sklearn.decomposition import PCA 4 | import numpy as np 5 | from sklearn.cross_validation import StratifiedKFold 6 | import matplotlib.pyplot as plt 7 | from sklearn.linear_model import LogisticRegression 8 | 9 | import pandas as pd 10 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 11 | 12 | from sklearn.preprocessing import LabelEncoder 13 | X = df.loc[:, 2:].values 14 | y = df.loc[:, 1].values 15 | le = LabelEncoder() 16 | y = le.fit_transform(y) 17 | 18 | from sklearn.cross_validation import train_test_split 19 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 20 | 21 | from sklearn.metrics import roc_curve, auc 22 | from scipy import interp 23 | pipe_lr = Pipeline([('scl', StandardScaler()), 24 | ('pca', PCA(n_components=2)), 25 | ('clf', LogisticRegression(penalty='l2', random_state=0, C=100.0))]) 26 | 27 | X_train2 = X_train[:, [4, 14]] 28 | cv = StratifiedKFold(y_train,n_folds=3,random_state=1) 29 | 30 | fig = plt.figure(figsize=(7, 5)) 31 | mean_tpr = 0.0 32 | mean_fpr = np.linspace(0, 1, 100) 33 | all_tpr = [] 34 | 35 | 36 | for i, (train, test) in enumerate(cv): 37 | probas = pipe_lr.fit(X_train2[train], y_train[train]).predict_proba(X_train2[test]) 38 | fpr, tpr, thresholds = roc_curve(y_train[test],probas[:, 1],pos_label=1) 39 | mean_tpr += interp(mean_fpr, fpr, tpr) 40 | mean_tpr[0] = 0.0 41 | roc_auc = auc(fpr, tpr) 42 | plt.plot(fpr,tpr,lw=1,label='ROC fold %d (area = %0.2f)'% (i+1, roc_auc)) 43 | 44 | 45 | plt.plot([0, 1],[0, 1],linestyle='--',color=(0.6, 0.6, 0.6),label='random guessing') 46 | mean_tpr /= len(cv) 47 | mean_tpr[-1] = 1.0 48 | mean_auc = auc(mean_fpr, mean_tpr) 49 | plt.plot(mean_fpr, mean_tpr, 'k--',label='mean ROC (area = %0.2f)' % mean_auc, lw=2) 50 | plt.plot([0, 0, 1],[0, 1, 1],lw=2,linestyle=':',color='black',label='perfect performance') 51 | 52 | plt.xlim([-0.05, 1.05]) 53 | plt.ylim([-0.05, 1.05]) 54 | plt.xlabel('false positive rate') 55 | plt.ylabel('true positive rate') 56 | plt.title('Receiver Operator Characteristic') 57 | plt.legend(loc="lower right") 58 | plt.show() 59 | 60 | 61 | pipe_lr = pipe_lr.fit(X_train2, y_train) 62 | y_pred2 = pipe_lr.predict(X_test[:, [4, 14]]) 63 | 64 | from sklearn.metrics import roc_auc_score 65 | from sklearn.metrics import accuracy_score 66 | print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred2)) 67 | 68 | print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred2)) 69 | 70 | -------------------------------------------------------------------------------- /ch6-confusion-matrix.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | from 
sklearn.pipeline import Pipeline 3 | 4 | import pandas as pd 5 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 6 | 7 | from sklearn.preprocessing import LabelEncoder 8 | X = df.loc[:, 2:].values 9 | y = df.loc[:, 1].values 10 | le = LabelEncoder() 11 | y = le.fit_transform(y) 12 | 13 | from sklearn.cross_validation import train_test_split 14 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 15 | 16 | from sklearn.svm import SVC 17 | pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))]) 18 | 19 | from sklearn.metrics import confusion_matrix 20 | pipe_svc.fit(X_train, y_train) 21 | y_pred = pipe_svc.predict(X_test) 22 | confmat = confusion_matrix(y_true=y_test, y_pred=y_pred) 23 | # Print confusion Matrix 24 | print(confmat) 25 | 26 | 27 | # Plot 28 | import matplotlib.pyplot as plt 29 | fig, ax = plt.subplots(figsize=(2.5, 2.5)) 30 | ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3) 31 | for i in range(confmat.shape[0]): 32 | for j in range(confmat.shape[1]): 33 | ax.text(x=j, y=i,s=confmat[i, j], va='center', ha='center') 34 | 35 | plt.xlabel('predicted label') 36 | plt.ylabel('true label') 37 | plt.show() -------------------------------------------------------------------------------- /ch6-hyperparameterTuning-gridSearch.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import StandardScaler 2 | from sklearn.pipeline import Pipeline 3 | 4 | 5 | import pandas as pd 6 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 7 | 8 | from sklearn.preprocessing import LabelEncoder 9 | X = df.loc[:, 2:].values 10 | y = df.loc[:, 1].values 11 | le = LabelEncoder() 12 | y = le.fit_transform(y) 13 | 14 | from sklearn.cross_validation import train_test_split 15 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 16 | 17 | 18 | from sklearn.grid_search import GridSearchCV 19 | from sklearn.svm import SVC 20 | pipe_svc = Pipeline([('scl', StandardScaler()), ('clf', SVC(random_state=1))]) 21 | param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] 22 | param_grid = [{'clf__C': param_range, 23 | 'clf__kernel': ['linear']}, 24 | {'clf__C': param_range, 25 | 'clf__gamma': param_range, 26 | 'clf__kernel': ['rbf']}] 27 | 28 | gs = GridSearchCV(estimator=pipe_svc, 29 | param_grid=param_grid, 30 | scoring='accuracy', 31 | cv=10, 32 | n_jobs=-1) 33 | 34 | gs = gs.fit(X_train, y_train) 35 | 36 | print(gs.best_score_) 37 | print(gs.best_params_) 38 | 39 | clf = gs.best_estimator_ 40 | clf.fit(X_train, y_train) 41 | 42 | print('Test accuracy: %.3f' % clf.score(X_test, y_test)) 43 | 44 | -------------------------------------------------------------------------------- /ch6-learningCurve.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.learning_curve import learning_curve 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | import numpy as np 7 | 8 | import pandas as pd 9 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 10 | 11 | from sklearn.preprocessing import LabelEncoder 12 | X = df.loc[:, 2:].values 13 | y = df.loc[:, 
1].values 14 | le = LabelEncoder() 15 | y = le.fit_transform(y) 16 | 17 | from sklearn.cross_validation import train_test_split 18 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 19 | 20 | pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(penalty='l2', random_state=0))]) 21 | 22 | train_sizes, train_scores, test_scores = learning_curve(estimator=pipe_lr, 23 | X=X_train, 24 | y=y_train, 25 | train_sizes=np.linspace(0.1, 1.0, 10), 26 | cv=10, 27 | n_jobs=1) 28 | train_mean = np.mean(train_scores, axis=1) 29 | train_std = np.std(train_scores, axis=1) 30 | test_mean = np.mean(test_scores, axis=1) 31 | test_std = np.std(test_scores, axis=1) 32 | plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy') 33 | 34 | plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue') 35 | plt.plot(train_sizes, test_mean, color='green', linestyle='--',marker='s', markersize=5, label='validation accuracy') 36 | plt.fill_between(train_sizes,test_mean + test_std,test_mean - test_std,alpha=0.15, color='green') 37 | plt.grid() 38 | plt.xlabel('Number of training samples') 39 | plt.ylabel('Accuracy') 40 | plt.legend(loc='lower right') 41 | plt.ylim([0.8, 1.0]) 42 | plt.show() 43 | 44 | -------------------------------------------------------------------------------- /ch6-pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | X = df.loc[:, 2:].values 6 | y = df.loc[:, 1].values 7 | le = LabelEncoder() 8 | y = le.fit_transform(y) 9 | 10 | from sklearn.cross_validation import train_test_split 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 12 | 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.decomposition import PCA 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.pipeline import Pipeline 17 | pipe_lr = Pipeline([('scl', StandardScaler()),('pca', PCA(n_components=2)),('clf', LogisticRegression(random_state=1))]) 18 | pipe_lr.fit(X_train, y_train) 19 | print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test)) 20 | 21 | -------------------------------------------------------------------------------- /ch6-scikit-Kfold-CrossValidation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 3 | 4 | from sklearn.preprocessing import LabelEncoder 5 | X = df.loc[:, 2:].values 6 | y = df.loc[:, 1].values 7 | le = LabelEncoder() 8 | y = le.fit_transform(y) 9 | 10 | from sklearn.cross_validation import train_test_split 11 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.decomposition import PCA 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.pipeline import Pipeline 16 | pipe_lr = Pipeline([('scl', StandardScaler()),('pca', PCA(n_components=2)),('clf', LogisticRegression(random_state=1))]) 17 | #pipe_lr.fit(X_train, y_train) 18 | 19 | import numpy as np 20 | from sklearn.cross_validation import cross_val_score 21 | 
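# Annotation, not part of the original ch6-scikit-Kfold-CrossValidation.py: cross_val_score
# also accepts an n_jobs argument, so the ten fold evaluations below could be spread over
# all available CPU cores instead of one, e.g. (hedged sketch):
#   scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10, n_jobs=-1)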
# use 10 kfolds on 1 CPU 22 | scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10, n_jobs=1) 23 | print('CV accuracy scores: %s' % scores) 24 | print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 25 | -------------------------------------------------------------------------------- /ch6-validationCurve.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from sklearn.preprocessing import StandardScaler 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.pipeline import Pipeline 5 | import numpy as np 6 | 7 | import pandas as pd 8 | df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None) 9 | 10 | from sklearn.preprocessing import LabelEncoder 11 | X = df.loc[:, 2:].values 12 | y = df.loc[:, 1].values 13 | le = LabelEncoder() 14 | y = le.fit_transform(y) 15 | 16 | from sklearn.cross_validation import train_test_split 17 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 18 | 19 | pipe_lr = Pipeline([('scl', StandardScaler()),('clf', LogisticRegression(penalty='l2', random_state=0))]) 20 | 21 | 22 | from sklearn.learning_curve import validation_curve 23 | param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 24 | train_scores, test_scores = validation_curve( 25 | estimator=pipe_lr, 26 | X=X_train, 27 | y=y_train, 28 | param_name='clf__C', 29 | param_range=param_range, 30 | cv=10) 31 | 32 | train_mean = np.mean(train_scores, axis=1) 33 | train_std = np.std(train_scores, axis=1) 34 | test_mean = np.mean(test_scores, axis=1) 35 | test_std = np.std(test_scores, axis=1) 36 | plt.plot(param_range, train_mean, 37 | color='blue', marker='o', 38 | markersize=5, 39 | label='training accuracy') 40 | 41 | plt.fill_between(param_range, train_mean + train_std, 42 | train_mean - train_std, alpha=0.15, 43 | color='blue') 44 | 45 | plt.plot(param_range, test_mean, 46 | color='green', linestyle='--', 47 | marker='s', markersize=5, 48 | label='validation accuracy') 49 | 50 | plt.fill_between(param_range, 51 | test_mean + test_std, 52 | test_mean - test_std, 53 | alpha=0.15, color='green') 54 | 55 | plt.grid() 56 | plt.xscale('log') 57 | plt.legend(loc='lower right') 58 | plt.xlabel('Parameter C') 59 | plt.ylabel('Accuracy') 60 | plt.ylim([0.8, 1.0]) 61 | plt.show() 62 | 63 | -------------------------------------------------------------------------------- /ch7-AdaBoost.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 3 | df_wine.columns = ['Class label', 'Alcohol', 4 | 'Malic acid', 'Ash', 5 | 'Alcalinity of ash', 6 | 'Magnesium', 'Total phenols', 7 | 'Flavanoids', 'Nonflavanoid phenols', 8 | 'Proanthocyanins', 9 | 'Color intensity', 'Hue', 10 | 'OD280/OD315 of diluted wines', 11 | 'Proline'] 12 | 13 | df_wine = df_wine[df_wine['Class label'] != 1] 14 | y = df_wine['Class label'].values 15 | X = df_wine[['Alcohol', 'Hue']].values 16 | 17 | from sklearn.preprocessing import LabelEncoder 18 | from sklearn.cross_validation import train_test_split 19 | le = LabelEncoder() 20 | y = le.fit_transform(y) 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=1) 22 | 23 | 24 | from sklearn.ensemble import AdaBoostClassifier 25 | from sklearn.tree import DecisionTreeClassifier 26 
| from sklearn.metrics import accuracy_score 27 | tree = DecisionTreeClassifier(criterion='entropy', 28 | max_depth=None, 29 | random_state=0) 30 | 31 | ada = AdaBoostClassifier(base_estimator=tree, 32 | n_estimators=500, 33 | learning_rate=0.1, 34 | random_state=0) 35 | 36 | tree = tree.fit(X_train, y_train) 37 | y_train_pred = tree.predict(X_train) 38 | y_test_pred = tree.predict(X_test) 39 | tree_train = accuracy_score(y_train, y_train_pred) 40 | tree_test = accuracy_score(y_test, y_test_pred) 41 | print('Decision tree train/test accuracies %.3f/%.3f'% (tree_train, tree_test)) 42 | 43 | ada = ada.fit(X_train, y_train) 44 | y_train_pred = ada.predict(X_train) 45 | y_test_pred = ada.predict(X_test) 46 | ada_train = accuracy_score(y_train, y_train_pred) 47 | ada_test = accuracy_score(y_test, y_test_pred) 48 | print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test)) 49 | 50 | 51 | 52 | # Plot 53 | import numpy as np 54 | import matplotlib.pyplot as plt 55 | 56 | x_min = X_train[:, 0].min() - 1 57 | x_max = X_train[:, 0].max() + 1 58 | y_min = X_train[:, 1].min() - 1 59 | y_max = X_train[:, 1].max() + 1 60 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1)) 61 | f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3)) 62 | 63 | for idx, clf, tt in zip([0, 1], [tree, ada], ['Decision Tree', 'AdaBoost']): 64 | clf.fit(X_train, y_train) 65 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 66 | Z = Z.reshape(xx.shape) 67 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 68 | axarr[idx].scatter(X_train[y_train==0, 0], 69 | X_train[y_train==0, 1], 70 | c='blue', 71 | marker='^') 72 | axarr[idx].scatter(X_train[y_train==1, 0], 73 | X_train[y_train==1, 1], 74 | c='red', 75 | marker='o') 76 | axarr[idx].set_title(tt) 77 | axarr[0].set_ylabel('Alcohol', fontsize=12) 78 | 79 | 80 | plt.text(10.2, -1.2, 81 | s='Hue', 82 | ha='center', 83 | va='center', 84 | fontsize=12) 85 | plt.show() 86 | -------------------------------------------------------------------------------- /ch7-BaggingClassifiers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None) 3 | df_wine.columns = ['Class label', 'Alcohol', 4 | 'Malic acid', 'Ash', 5 | 'Alcalinity of ash', 6 | 'Magnesium', 'Total phenols', 7 | 'Flavanoids', 'Nonflavanoid phenols', 8 | 'Proanthocyanins', 9 | 'Color intensity', 'Hue', 10 | 'OD280/OD315 of diluted wines', 11 | 'Proline'] 12 | 13 | df_wine = df_wine[df_wine['Class label'] != 1] 14 | y = df_wine['Class label'].values 15 | X = df_wine[['Alcohol', 'Hue']].values 16 | 17 | from sklearn.preprocessing import LabelEncoder 18 | from sklearn.cross_validation import train_test_split 19 | le = LabelEncoder() 20 | y = le.fit_transform(y) 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=1) 22 | 23 | from sklearn.ensemble import BaggingClassifier 24 | from sklearn.tree import DecisionTreeClassifier 25 | tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1) 26 | bag = BaggingClassifier(base_estimator=tree, 27 | n_estimators=500, 28 | max_samples=1.0, 29 | max_features=1.0, 30 | bootstrap=True, 31 | bootstrap_features=False, 32 | n_jobs=1, 33 | random_state=1) 34 | 35 | from sklearn.metrics import accuracy_score 36 | tree = tree.fit(X_train, y_train) 37 | y_train_pred = tree.predict(X_train) 38 | y_test_pred = 
tree.predict(X_test) 39 | tree_train = accuracy_score(y_train, y_train_pred) 40 | tree_test = accuracy_score(y_test, y_test_pred) 41 | print('Decision tree train/test accuracies %.3f/%.3f'% (tree_train, tree_test)) 42 | 43 | bag = bag.fit(X_train, y_train) 44 | y_train_pred = bag.predict(X_train) 45 | y_test_pred = bag.predict(X_test) 46 | bag_train = accuracy_score(y_train, y_train_pred) 47 | bag_test = accuracy_score(y_test, y_test_pred) 48 | print('Bagging train/test accuracies %.3f/%.3f'% (bag_train, bag_test)) 49 | 50 | 51 | #Plot 52 | import numpy as np 53 | import matplotlib.pyplot as plt 54 | x_min = X_train[:, 0].min() - 1 55 | x_max = X_train[:, 0].max() + 1 56 | y_min = X_train[:, 1].min() - 1 57 | y_max = X_train[:, 1].max() + 1 58 | xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),np.arange(y_min, y_max, 0.1)) 59 | f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row',figsize=(8, 3)) 60 | 61 | for idx, clf, tt in zip([0, 1],[tree, bag],['Decision Tree', 'Bagging']): 62 | clf.fit(X_train, y_train) 63 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 64 | Z = Z.reshape(xx.shape) 65 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 66 | axarr[idx].scatter(X_train[y_train==0, 0], X_train[y_train==0, 1], c='blue', marker='^') 67 | axarr[idx].scatter(X_train[y_train==1, 0], X_train[y_train==1, 1], c='red', marker='o') 68 | axarr[idx].set_title(tt) 69 | 70 | axarr[0].set_ylabel('Alcohol', fontsize=12) 71 | plt.text(10.2, -1.2, s='Hue',ha='center', va='center', fontsize=12) 72 | plt.show() -------------------------------------------------------------------------------- /ch7-majorityVote-Classifier.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.base import ClassifierMixin 3 | from sklearn.preprocessing import LabelEncoder 4 | from sklearn.externals import six 5 | from sklearn.base import clone 6 | from sklearn.pipeline import _name_estimators 7 | import numpy as np 8 | import operator 9 | 10 | 11 | class MajorityVoteClassifier(BaseEstimator, 12 | ClassifierMixin): 13 | """ A majority vote ensemble classifier 14 | 15 | Parameters 16 | ---------- 17 | classifiers : array-like, shape = [n_classifiers] 18 | Different classifiers for the ensemble 19 | 20 | vote : str, {'classlabel', 'probability'} 21 | Default: 'classlabel' 22 | If 'classlabel' the prediction is based on 23 | the argmax of class labels. Else if 24 | 'probability', the argmax of the sum of 25 | probabilities is used to predict the class label 26 | (recommended for calibrated classifiers). 27 | 28 | weights : array-like, shape = [n_classifiers] 29 | Optional, default: None 30 | If a list of `int` or `float` values are 31 | provided, the classifiers are weighted by 32 | importance; Uses uniform weights if `weights=None`. 33 | 34 | """ 35 | def __init__(self, classifiers, 36 | vote='classlabel', weights=None): 37 | 38 | self.classifiers = classifiers 39 | self.named_classifiers = {key: value for 40 | key, value in 41 | _name_estimators(classifiers)} 42 | self.vote = vote 43 | self.weights = weights 44 | 45 | def fit(self, X, y): 46 | """ Fit classifiers. 47 | 48 | Parameters 49 | ---------- 50 | X : {array-like, sparse matrix}, 51 | shape = [n_samples, n_features] 52 | Matrix of training samples. 53 | 54 | y : array-like, shape = [n_samples] 55 | Vector of target class labels. 
56 | 57 | Returns 58 | ------- 59 | self : object 60 | 61 | """ 62 | # Use LabelEncoder to ensure class labels start 63 | # with 0, which is important for np.argmax 64 | # call in self.predict 65 | self.lablenc_ = LabelEncoder() 66 | self.lablenc_.fit(y) 67 | self.classes_ = self.lablenc_.classes_ 68 | self.classifiers_ = [] 69 | for clf in self.classifiers: 70 | fitted_clf = clone(clf).fit(X, 71 | self.lablenc_.transform(y)) 72 | self.classifiers_.append(fitted_clf) 73 | return self 74 | 75 | 76 | def predict(self, X): 77 | """ Predict class labels for X. 78 | 79 | Parameters 80 | ---------- 81 | X : {array-like, sparse matrix}, 82 | Shape = [n_samples, n_features] 83 | Matrix of training samples. 84 | 85 | Returns 86 | ---------- 87 | maj_vote : array-like, shape = [n_samples] 88 | Predicted class labels. 89 | 90 | """ 91 | if self.vote == 'probability': 92 | maj_vote = np.argmax(self.predict_proba(X), 93 | axis=1) 94 | else: # 'classlabel' vote 95 | 96 | # Collect results from clf.predict calls 97 | predictions = np.asarray([clf.predict(X) 98 | for clf in 99 | self.classifiers_]).T 100 | 101 | maj_vote = np.apply_along_axis( 102 | lambda x: 103 | np.argmax(np.bincount(x, 104 | weights=self.weights)), 105 | axis=1, 106 | arr=predictions) 107 | maj_vote = self.lablenc_.inverse_transform(maj_vote) 108 | return maj_vote 109 | 110 | 111 | def predict_proba(self, X): 112 | """ Predict class probabilities for X. 113 | 114 | Parameters 115 | ---------- 116 | X : {array-like, sparse matrix}, 117 | shape = [n_samples, n_features] 118 | Training vectors, where n_samples is 119 | the number of samples and 120 | n_features is the number of features. 121 | 122 | Returns 123 | ---------- 124 | avg_proba : array-like, 125 | shape = [n_samples, n_classes] 126 | Weighted average probability for 127 | each class per sample. 
128 | 129 | """ 130 | probas = np.asarray([clf.predict_proba(X) 131 | for clf in self.classifiers_]) 132 | avg_proba = np.average(probas, 133 | axis=0, weights=self.weights) 134 | return avg_proba 135 | 136 | 137 | def get_params(self, deep=True): 138 | """ Get classifier parameter names for GridSearch""" 139 | if not deep: 140 | return super(MajorityVoteClassifier, 141 | self).get_params(deep=False) 142 | else: 143 | out = self.named_classifiers.copy() 144 | for name, step in\ 145 | six.iteritems(self.named_classifiers): 146 | for key, value in six.iteritems( 147 | step.get_params(deep=True)): 148 | out['%s__%s' % (name, key)] = value 149 | return out 150 | 151 | 152 | # Main 153 | 154 | from sklearn import datasets 155 | from sklearn.cross_validation import train_test_split 156 | from sklearn.preprocessing import StandardScaler 157 | from sklearn.preprocessing import LabelEncoder 158 | iris = datasets.load_iris() 159 | X, y = iris.data[50:, [1, 2]], iris.target[50:] 160 | le = LabelEncoder() 161 | y = le.fit_transform(y) 162 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1) 163 | 164 | from sklearn.cross_validation import cross_val_score 165 | from sklearn.linear_model import LogisticRegression 166 | from sklearn.tree import DecisionTreeClassifier 167 | from sklearn.neighbors import KNeighborsClassifier 168 | from sklearn.pipeline import Pipeline 169 | 170 | import numpy as np 171 | clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=0) 172 | clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0) 173 | clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski') 174 | pipe1 = Pipeline([['sc', StandardScaler()],['clf', clf1]]) 175 | pipe3 = Pipeline([['sc', StandardScaler()],['clf', clf3]]) 176 | clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN'] 177 | 178 | print('10-fold cross validation:\n') 179 | 180 | for clf, label in zip([pipe1, clf2, pipe3], clf_labels): 181 | scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc') 182 | print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) 183 | 184 | mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3]) 185 | clf_labels += ['Majority Voting'] 186 | all_clf = [pipe1, clf2, pipe3, mv_clf] 187 | for clf, label in zip(all_clf, clf_labels): 188 | scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc') 189 | print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)) 190 | 191 | 192 | 193 | #plot 194 | from sklearn.metrics import roc_curve 195 | from sklearn.metrics import auc 196 | import matplotlib.pyplot as plt 197 | colors = ['black', 'orange', 'blue', 'green'] 198 | linestyles = [':', '--', '-.', '-'] 199 | for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles): 200 | # assuming the label of the positive class is 1 201 | y_pred = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1] 202 | fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred) 203 | roc_auc = auc(x=fpr, y=tpr) 204 | plt.plot(fpr, tpr, color=clr, linestyle=ls, label='%s (auc = %0.2f)' % (label, roc_auc)) 205 | 206 | plt.legend(loc='lower right') 207 | plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2) 208 | plt.xlim([-0.1, 1.1]) 209 | plt.ylim([-0.1, 1.1]) 210 | plt.grid() 211 | plt.xlabel('False Positive Rate') 212 | plt.ylabel('True Positive Rate') 213 | plt.show() 214 | 215 | 216 | 217 | #grid search for tuning params for 
classifier 218 | from sklearn.grid_search import GridSearchCV 219 | params = {'decisiontreeclassifier__max_depth': [1, 2],'pipeline-1__clf__C': [0.001, 0.1, 100.0]} 220 | grid = GridSearchCV(estimator=mv_clf, param_grid=params, cv=10, scoring='roc_auc') 221 | grid.fit(X_train, y_train) 222 | print('Best parameters: %s' % grid.best_params_) 223 | print('Accuracy: %.2f' % grid.best_score_) -------------------------------------------------------------------------------- /ch8-Online-Sentiment-Analysis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | from nltk.corpus import stopwords 4 | stop = stopwords.words('english') 5 | 6 | def tokenizer(text): 7 | text = re.sub('<[^>]*>', '', text) 8 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower()) 9 | text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '') 10 | tokenized = [w for w in text.split() if w not in stop] 11 | return tokenized 12 | 13 | 14 | def stream_docs(path): 15 | with open(path, 'r', encoding='utf-8') as csv: 16 | next(csv) # skip header 17 | for line in csv: 18 | text, label = line[:-3], int(line[-2]) 19 | yield text, label 20 | 21 | 22 | def get_minibatch(doc_stream, size): 23 | docs, y = [], [] 24 | try: 25 | for _ in range(size): 26 | text, label = next(doc_stream) 27 | docs.append(text) 28 | y.append(label) 29 | except StopIteration: 30 | return None, None 31 | 32 | return docs, y 33 | 34 | 35 | from sklearn.feature_extraction.text import HashingVectorizer 36 | from sklearn.linear_model import SGDClassifier 37 | 38 | vect = HashingVectorizer(decode_error='ignore', 39 | n_features=2**21, 40 | preprocessor=None, 41 | tokenizer=tokenizer) 42 | 43 | # regularized linear models with stochastic gradient descent (SGD) 44 | clf = SGDClassifier(loss='log', random_state=1, n_iter=1) 45 | doc_stream = stream_docs(path='./movie_data.csv') 46 | 47 | import pyprind 48 | pbar = pyprind.ProgBar(45) 49 | classes = np.array([0, 1]) 50 | for _ in range(45): 51 | X_train, y_train = get_minibatch(doc_stream, size=1000) 52 | if not X_train: 53 | break 54 | 55 | X_train = vect.transform(X_train) 56 | clf.partial_fit(X_train, y_train, classes=classes) 57 | pbar.update() 58 | 59 | 60 | X_test, y_test = get_minibatch(doc_stream, size=5000) 61 | X_test = vect.transform(X_test) 62 | print('Accuracy: %.3f' % clf.score(X_test, y_test)) 63 | 64 | clf = clf.partial_fit(X_test, y_test) 65 | -------------------------------------------------------------------------------- /ch8-Sentiment-Analysis.py: -------------------------------------------------------------------------------- 1 | import pyprind 2 | import pandas as pd 3 | import os 4 | pbar = pyprind.ProgBar(50000) 5 | labels = {'pos':1, 'neg':0} 6 | df = pd.DataFrame() 7 | for s in ('test', 'train'): 8 | for l in ('pos', 'neg'): 9 | path ='./aclImdb/%s/%s' % (s, l) 10 | for file in os.listdir(path): 11 | with open(os.path.join(path, file), 'r') as infile: 12 | txt = infile.read() 13 | 14 | df = df.append([[txt, labels[l]]], ignore_index=True) 15 | pbar.update() 16 | 17 | 18 | df.columns = ['review', 'sentiment'] 19 | 20 | import numpy as np 21 | np.random.seed(0) 22 | df = df.reindex(np.random.permutation(df.index)) 23 | df.to_csv('./movie_data.csv', index=False) 24 | df = pd.read_csv('./movie_data.csv') 25 | 26 | X_train = df.loc[:25000, 'review'].values 27 | y_train = df.loc[:25000, 'sentiment'].values 28 | X_test = df.loc[25000:, 'review'].values 29 | y_test = df.loc[25000:, 
'sentiment'].values 30 | 31 | 32 | from sklearn.grid_search import GridSearchCV 33 | from sklearn.pipeline import Pipeline 34 | from sklearn.linear_model import LogisticRegression 35 | from sklearn.feature_extraction.text import TfidfVectorizer 36 | 37 | import nltk 38 | nltk.download('stopwords') 39 | from nltk.corpus import stopwords 40 | stop = stopwords.words('english') 41 | 42 | from nltk.stem.porter import PorterStemmer 43 | porter = PorterStemmer() 44 | 45 | def tokenizer(text): 46 | return text.split() 47 | 48 | def tokenizer_porter(text): 49 | return [porter.stem(word) for word in text.split()] 50 | 51 | tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None) 52 | 53 | param_grid = [{'vect__ngram_range': [(1,1)], 54 | 'vect__stop_words': [stop, None], 55 | 'vect__tokenizer': [tokenizer, 56 | tokenizer_porter], 57 | 'clf__penalty': ['l1', 'l2'], 58 | 'clf__C': [1.0, 10.0, 100.0]}, 59 | {'vect__ngram_range': [(1,1)], 60 | 'vect__stop_words': [stop, None], 61 | 'vect__tokenizer': [tokenizer, 62 | tokenizer_porter], 63 | 'vect__use_idf':[False], 64 | 'vect__norm':[None], 65 | 'clf__penalty': ['l1', 'l2'], 66 | 'clf__C': [1.0, 10.0, 100.0]} 67 | ] 68 | 69 | lr_tfidf = Pipeline([('vect', tfidf), 70 | ('clf', 71 | LogisticRegression(random_state=0))]) 72 | 73 | gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, 74 | scoring='accuracy', 75 | cv=5, verbose=1, 76 | n_jobs=-1) 77 | 78 | gs_lr_tfidf.fit(X_train, y_train) 79 | 80 | print('Best parameter set: %s ' % gs_lr_tfidf.best_params_) 81 | print('CV Accuracy: %.3f'% gs_lr_tfidf.best_score_) 82 | 83 | clf = gs_lr_tfidf.best_estimator_ 84 | print('Test Accuracy: %.3f' % clf.score(X_test, y_test)) 85 | 86 | 87 | -------------------------------------------------------------------------------- /ch8-bagOfWords.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_extraction.text import CountVectorizer 3 | 4 | # divide text into 1-grams (use ngram_range=2,2 for 2-grams) 5 | count = CountVectorizer(ngram_range=(1,1)) 6 | 7 | docs = np.array([ 8 | 'The sun is shining', 9 | 'The weather is sweet', 10 | 'The sun is shining and the weather is sweet']) 11 | 12 | bag = count.fit_transform(docs) 13 | print(count.vocabulary_) 14 | print(bag.toarray()) 15 | 16 | import re 17 | def preprocessor(text): 18 | text = re.sub('<[^>]*>', '', text) 19 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text) 20 | text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '') 21 | return text 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /ch9-pickle-model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | from nltk.corpus import stopwords 4 | stop = stopwords.words('english') 5 | 6 | def tokenizer(text): 7 | text = re.sub('<[^>]*>', '', text) 8 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text.lower()) 9 | text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '') 10 | tokenized = [w for w in text.split() if w not in stop] 11 | return tokenized 12 | 13 | 14 | def stream_docs(path): 15 | with open(path, 'r', encoding='utf-8') as csv: 16 | next(csv) # skip header 17 | for line in csv: 18 | text, label = line[:-3], int(line[-2]) 19 | yield text, label 20 | 21 | 22 | def get_minibatch(doc_stream, size): 23 | docs, y = [], [] 24 | try: 25 | for _ in range(size): 26 | text, label = 
next(doc_stream) 27 | docs.append(text) 28 | y.append(label) 29 | except StopIteration: 30 | return None, None 31 | 32 | return docs, y 33 | 34 | 35 | from sklearn.feature_extraction.text import HashingVectorizer 36 | from sklearn.linear_model import SGDClassifier 37 | 38 | vect = HashingVectorizer(decode_error='ignore', 39 | n_features=2**21, 40 | preprocessor=None, 41 | tokenizer=tokenizer) 42 | 43 | # regularized linear models with stochastic gradient descent (SGD) 44 | clf = SGDClassifier(loss='log', random_state=1, n_iter=1) 45 | doc_stream = stream_docs(path='./movie_data.csv') 46 | 47 | import pyprind 48 | pbar = pyprind.ProgBar(45) 49 | classes = np.array([0, 1]) 50 | for _ in range(45): 51 | X_train, y_train = get_minibatch(doc_stream, size=1000) 52 | if not X_train: 53 | break 54 | 55 | X_train = vect.transform(X_train) 56 | clf.partial_fit(X_train, y_train, classes=classes) 57 | pbar.update() 58 | 59 | 60 | X_test, y_test = get_minibatch(doc_stream, size=5000) 61 | X_test = vect.transform(X_test) 62 | print('Accuracy: %.3f' % clf.score(X_test, y_test)) 63 | 64 | clf = clf.partial_fit(X_test, y_test) 65 | 66 | 67 | 68 | import pickle 69 | import os 70 | 71 | dest = os.path.join('movieclassifier', 'pkl_objects') 72 | if not os.path.exists(dest): 73 | os.makedirs(dest) 74 | 75 | pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'),'wb'), protocol=4) 76 | pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'),protocol=4) 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /movieclassifier/__pycache__/update.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/movieclassifier/__pycache__/update.cpython-35.pyc -------------------------------------------------------------------------------- /movieclassifier/__pycache__/vectorizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/movieclassifier/__pycache__/vectorizer.cpython-35.pyc -------------------------------------------------------------------------------- /movieclassifier/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | from wtforms import Form, TextAreaField, validators 3 | import pickle 4 | import sqlite3 5 | import os 6 | import numpy as np 7 | 8 | # import HashingVectorizer from local dir 9 | from vectorizer import vect 10 | 11 | ######## Preparing the Classifier 12 | cur_dir = os.path.dirname(__file__) 13 | clf = pickle.load(open(os.path.join(cur_dir, 14 | 'pkl_objects/classifier.pkl'), 'rb')) 15 | db = os.path.join(cur_dir, 'reviews.sqlite') 16 | 17 | def classify(document): 18 | label = {0: 'negative', 1: 'positive'} 19 | X = vect.transform([document]) 20 | y = clf.predict(X)[0] 21 | proba = clf.predict_proba(X).max() 22 | return label[y], proba 23 | 24 | def train(document, y): 25 | X = vect.transform([document]) 26 | clf.partial_fit(X, [y]) 27 | 28 | def sqlite_entry(path, document, y): 29 | conn = sqlite3.connect(path) 30 | c = conn.cursor() 31 | c.execute("INSERT INTO review_db (review, sentiment, date)"\ 32 | " VALUES (?, ?, DATETIME('now'))", (document, y)) 33 | conn.commit() 34 | conn.close() 35 | 36 | 37 | app = Flask(__name__) 38 | class 
ReviewForm(Form): 39 | moviereview = TextAreaField('', 40 | [validators.DataRequired(), 41 | validators.length(min=15)]) 42 | 43 | @app.route('/') 44 | def index(): 45 | form = ReviewForm(request.form) 46 | return render_template('reviewform.html', form=form) 47 | 48 | @app.route('/results', methods=['POST']) 49 | def results(): 50 | form = ReviewForm(request.form) 51 | if request.method == 'POST' and form.validate(): 52 | review = request.form['moviereview'] 53 | y, proba = classify(review) 54 | return render_template('results.html', 55 | content=review, 56 | prediction=y, 57 | probability=round(proba*100, 2)) 58 | return render_template('reviewform.html', form=form) 59 | 60 | @app.route('/thanks', methods=['POST']) 61 | def feedback(): 62 | feedback = request.form['feedback_button'] 63 | review = request.form['review'] 64 | prediction = request.form['prediction'] 65 | 66 | inv_label = {'negative': 0, 'positive': 1} 67 | y = inv_label[prediction] 68 | if feedback == 'Incorrect': 69 | y = int(not(y)) 70 | train(review, y) 71 | sqlite_entry(db, review, y) 72 | return render_template('thanks.html') 73 | 74 | if __name__ == '__main__': 75 | # Update classifier from db on startup 76 | from update import update_model 77 | clf = update_model(db_path="reviews.sqlite", model=clf, batch_size=10000) 78 | 79 | app.run(debug=True) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /movieclassifier/ch9-ex.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | import os 4 | from vectorizer import vect 5 | clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb')) 6 | 7 | import numpy as np 8 | label = {0:'negative', 1:'positive'} 9 | example = ['I love this movie'] 10 | X = vect.transform(example) 11 | print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(X)[0]], np.max(clf.predict_proba(X))*100)) 12 | 13 | 14 | 15 | import sqlite3 16 | import os 17 | os.unlink('reviews.sqlite') 18 | conn = sqlite3.connect('reviews.sqlite') 19 | c = conn.cursor() 20 | 21 | c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)') 22 | example1 = 'I love this movie' 23 | c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1)) 24 | 25 | example2 = 'I disliked this movie' 26 | c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0)) 27 | 28 | conn.commit() 29 | conn.close() 30 | 31 | 32 | conn = sqlite3.connect('reviews.sqlite') 33 | c = conn.cursor() 34 | c.execute("SELECT * FROM review_db WHERE date BETWEEN '2015-01-01 00:00:00' AND DATETIME('now')") 35 | results = c.fetchall() 36 | conn.close() 37 | print(results) 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /movieclassifier/pkl_objects/classifier.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/movieclassifier/pkl_objects/classifier.pkl -------------------------------------------------------------------------------- /movieclassifier/pkl_objects/stopwords.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/movieclassifier/pkl_objects/stopwords.pkl 
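The movieclassifier app above persists user feedback through sqlite_entry(); as a quick sanity check, the stored rows can be inspected directly. This is a minimal sketch, not part of the repository, and assumes reviews.sqlite with the review_db table from ch9-ex.py already exists in the working directory:

import sqlite3

# Connect to the feedback database written by app.py / ch9-ex.py
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
# Count all stored reviews and how many were labeled positive (sentiment = 1)
c.execute("SELECT COUNT(*), COALESCE(SUM(sentiment), 0) FROM review_db")
n_reviews, n_positive = c.fetchone()
conn.close()
print('%d stored reviews, %d labeled positive' % (n_reviews, n_positive))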
-------------------------------------------------------------------------------- /movieclassifier/reviews.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rupskygill/python-ML-book-Raschka/3e69c6f9ee8514888b45e8a882c25bafafd7f3d5/movieclassifier/reviews.sqlite -------------------------------------------------------------------------------- /movieclassifier/static/style.css: -------------------------------------------------------------------------------- 1 | body{ 2 | width:600px; 3 | } 4 | .button{ 5 | padding-top: 20px; 6 | } -------------------------------------------------------------------------------- /movieclassifier/templates/_formhelpers.html: -------------------------------------------------------------------------------- 1 | {% macro render_field(field) %} 2 |
{{ field.label }} 3 |
{{ field(**kwargs)|safe }} 4 | {% if field.errors %} 5 | 10 | {% endif %} 11 |
12 | 13 | {% endmacro %} -------------------------------------------------------------------------------- /movieclassifier/templates/results.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Movie Classification 5 | 6 | 7 | 8 | 9 |

Your movie review:

10 |
{{ content }}
11 | 12 |

Prediction:

13 |
This movie review is {{ prediction }} 14 | (probability: {{ probability }}%).
15 | 16 |
17 |
18 | 19 | 20 | 21 | 22 |
23 |
24 | 25 |
26 |
27 | 28 |
29 |
30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /movieclassifier/templates/reviewform.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Movie Classification 5 | 6 | 7 | 8 |

Please enter your movie review:

9 | 10 | {% from "_formhelpers.html" import render_field %} 11 | 12 |
13 |
14 | {{ render_field(form.moviereview, cols='30', rows='10') }} 15 |
16 |
17 | 18 |
19 |
20 | 21 | 22 | -------------------------------------------------------------------------------- /movieclassifier/templates/thanks.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Movie Classification 5 | 6 | 7 | 8 |

Thank you for your feedback!

9 |
10 |
11 | 12 |
13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /movieclassifier/update.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sqlite3 3 | import numpy as np 4 | import os 5 | 6 | # import HashingVectorizer from local dir 7 | from vectorizer import vect 8 | 9 | def update_model(db_path, model, batch_size=10000): 10 | 11 | conn = sqlite3.connect(db_path) 12 | c = conn.cursor() 13 | c.execute('SELECT * from review_db') 14 | 15 | results = c.fetchmany(batch_size) 16 | while results: 17 | data = np.array(results) 18 | X = data[:, 0] 19 | y = data[:, 1].astype(int) 20 | 21 | classes = np.array([0, 1]) 22 | X_train = vect.transform(X) 23 | model.partial_fit(X_train, y, classes=classes) 24 | results = c.fetchmany(batch_size) 25 | 26 | conn.close() 27 | return model 28 | 29 | -------------------------------------------------------------------------------- /movieclassifier/vectorizer.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import HashingVectorizer 2 | import re 3 | import os 4 | import pickle 5 | 6 | cur_dir = os.path.dirname(__file__) 7 | stop = pickle.load(open( 8 | os.path.join(cur_dir, 9 | 'pkl_objects', 10 | 'stopwords.pkl'), 'rb')) 11 | 12 | def tokenizer(text): 13 | text = re.sub('<[^>]*>', '', text) 14 | emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', 15 | text.lower()) 16 | text = re.sub('[\W]+', ' ', text.lower()) \ 17 | + ' '.join(emoticons).replace('-', '') 18 | tokenized = [w for w in text.split() if w not in stop] 19 | return tokenized 20 | 21 | vect = HashingVectorizer(decode_error='ignore', 22 | n_features=2**21, 23 | preprocessor=None, 24 | tokenizer=tokenizer) 25 | 26 | -------------------------------------------------------------------------------- /neuralnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.special import expit 3 | import sys 4 | 5 | 6 | class NeuralNetMLP(object): 7 | """ Feedforward neural network / Multi-layer perceptron classifier. 8 | 9 | Parameters 10 | ------------ 11 | n_output : int 12 | Number of output units, should be equal to the 13 | number of unique class labels. 14 | 15 | n_features : int 16 | Number of features (dimensions) in the target dataset. 17 | Should be equal to the number of columns in the X array. 18 | 19 | n_hidden : int (default: 30) 20 | Number of hidden units. 21 | 22 | l1 : float (default: 0.0) 23 | Lambda value for L1-regularization. 24 | No regularization if l1=0.0 (default) 25 | 26 | l2 : float (default: 0.0) 27 | Lambda value for L2-regularization. 28 | No regularization if l2=0.0 (default) 29 | 30 | epochs : int (default: 500) 31 | Number of passes over the training set. 32 | 33 | eta : float (default: 0.001) 34 | Learning rate. 35 | 36 | alpha : float (default: 0.0) 37 | Momentum constant. Factor multiplied with the 38 | gradient of the previous epoch t-1 to improve 39 | learning speed 40 | w(t) := w(t) - (grad(t) + alpha*grad(t-1)) 41 | 42 | decrease_const : float (default: 0.0) 43 | Decrease constant. Shrinks the learning rate 44 | after each epoch via eta / (1 + epoch*decrease_const) 45 | 46 | shuffle : bool (default: True) 47 | Shuffles training data every epoch if True to prevent circles. 48 | 49 | minibatches : int (default: 1) 50 | Divides training data into k minibatches for efficiency. 51 | Normal gradient descent learning if k=1 (default). 
52 | 53 | random_state : int (default: None) 54 | Set random state for shuffling and initializing the weights. 55 | 56 | Attributes 57 | ----------- 58 | cost_ : list 59 | Sum of squared errors after each epoch. 60 | 61 | """ 62 | def __init__(self, n_output, n_features, n_hidden=30, 63 | l1=0.0, l2=0.0, epochs=500, eta=0.001, 64 | alpha=0.0, decrease_const=0.0, shuffle=True, 65 | minibatches=1, random_state=None): 66 | 67 | np.random.seed(random_state) 68 | self.n_output = n_output 69 | self.n_features = n_features 70 | self.n_hidden = n_hidden 71 | self.w1, self.w2 = self._initialize_weights() 72 | self.l1 = l1 73 | self.l2 = l2 74 | self.epochs = epochs 75 | self.eta = eta 76 | self.alpha = alpha 77 | self.decrease_const = decrease_const 78 | self.shuffle = shuffle 79 | self.minibatches = minibatches 80 | 81 | def _encode_labels(self, y, k): 82 | """Encode labels into one-hot representation 83 | 84 | Parameters 85 | ------------ 86 | y : array, shape = [n_samples] 87 | Target values. 88 | 89 | Returns 90 | ----------- 91 | onehot : array, shape = (n_labels, n_samples) 92 | 93 | """ 94 | onehot = np.zeros((k, y.shape[0])) 95 | for idx, val in enumerate(y): 96 | onehot[val, idx] = 1.0 97 | return onehot 98 | 99 | def _initialize_weights(self): 100 | """Initialize weights with small random numbers.""" 101 | w1 = np.random.uniform(-1.0, 1.0, size=self.n_hidden*(self.n_features + 1)) 102 | w1 = w1.reshape(self.n_hidden, self.n_features + 1) 103 | w2 = np.random.uniform(-1.0, 1.0, size=self.n_output*(self.n_hidden + 1)) 104 | w2 = w2.reshape(self.n_output, self.n_hidden + 1) 105 | return w1, w2 106 | 107 | def _sigmoid(self, z): 108 | """Compute logistic function (sigmoid) 109 | 110 | Uses scipy.special.expit to avoid overflow 111 | error for very small input values z. 112 | 113 | """ 114 | # return 1.0 / (1.0 + np.exp(-z)) 115 | return expit(z) 116 | 117 | def _sigmoid_gradient(self, z): 118 | """Compute gradient of the logistic function""" 119 | sg = self._sigmoid(z) 120 | return sg * (1 - sg) 121 | 122 | def _add_bias_unit(self, X, how='column'): 123 | """Add bias unit (column or row of 1s) to array at index 0""" 124 | if how == 'column': 125 | X_new = np.ones((X.shape[0], X.shape[1]+1)) 126 | X_new[:, 1:] = X 127 | elif how == 'row': 128 | X_new = np.ones((X.shape[0]+1, X.shape[1])) 129 | X_new[1:, :] = X 130 | else: 131 | raise AttributeError('`how` must be `column` or `row`') 132 | return X_new 133 | 134 | def _feedforward(self, X, w1, w2): 135 | """Compute feedforward step 136 | 137 | Parameters 138 | ----------- 139 | X : array, shape = [n_samples, n_features] 140 | Input layer with original features. 141 | 142 | w1 : array, shape = [n_hidden_units, n_features] 143 | Weight matrix for input layer -> hidden layer. 144 | 145 | w2 : array, shape = [n_output_units, n_hidden_units] 146 | Weight matrix for hidden layer -> output layer. 147 | 148 | Returns 149 | ---------- 150 | a1 : array, shape = [n_samples, n_features+1] 151 | Input values with bias unit. 152 | 153 | z2 : array, shape = [n_hidden, n_samples] 154 | Net input of hidden layer. 155 | 156 | a2 : array, shape = [n_hidden+1, n_samples] 157 | Activation of hidden layer. 158 | 159 | z3 : array, shape = [n_output_units, n_samples] 160 | Net input of output layer. 161 | 162 | a3 : array, shape = [n_output_units, n_samples] 163 | Activation of output layer. 
164 | 165 | """ 166 | a1 = self._add_bias_unit(X, how='column') 167 | z2 = w1.dot(a1.T) 168 | a2 = self._sigmoid(z2) 169 | a2 = self._add_bias_unit(a2, how='row') 170 | z3 = w2.dot(a2) 171 | a3 = self._sigmoid(z3) 172 | return a1, z2, a2, z3, a3 173 | 174 | def _L2_reg(self, lambda_, w1, w2): 175 | """Compute L2-regularization cost""" 176 | return (lambda_/2.0) * (np.sum(w1[:, 1:] ** 2) + np.sum(w2[:, 1:] ** 2)) 177 | 178 | def _L1_reg(self, lambda_, w1, w2): 179 | """Compute L1-regularization cost""" 180 | return (lambda_/2.0) * (np.abs(w1[:, 1:]).sum() + np.abs(w2[:, 1:]).sum()) 181 | 182 | def _get_cost(self, y_enc, output, w1, w2): 183 | """Compute cost function. 184 | 185 | y_enc : array, shape = (n_labels, n_samples) 186 | one-hot encoded class labels. 187 | 188 | output : array, shape = [n_output_units, n_samples] 189 | Activation of the output layer (feedforward) 190 | 191 | w1 : array, shape = [n_hidden_units, n_features] 192 | Weight matrix for input layer -> hidden layer. 193 | 194 | w2 : array, shape = [n_output_units, n_hidden_units] 195 | Weight matrix for hidden layer -> output layer. 196 | 197 | Returns 198 | --------- 199 | cost : float 200 | Regularized cost. 201 | 202 | """ 203 | term1 = -y_enc * (np.log(output)) 204 | term2 = (1 - y_enc) * np.log(1 - output) 205 | cost = np.sum(term1 - term2) 206 | L1_term = self._L1_reg(self.l1, w1, w2) 207 | L2_term = self._L2_reg(self.l2, w1, w2) 208 | cost = cost + L1_term + L2_term 209 | return cost 210 | 211 | def _get_gradient(self, a1, a2, a3, z2, y_enc, w1, w2): 212 | """ Compute gradient step using backpropagation. 213 | 214 | Parameters 215 | ------------ 216 | a1 : array, shape = [n_samples, n_features+1] 217 | Input values with bias unit. 218 | 219 | a2 : array, shape = [n_hidden+1, n_samples] 220 | Activation of hidden layer. 221 | 222 | a3 : array, shape = [n_output_units, n_samples] 223 | Activation of output layer. 224 | 225 | z2 : array, shape = [n_hidden, n_samples] 226 | Net input of hidden layer. 227 | 228 | y_enc : array, shape = (n_labels, n_samples) 229 | one-hot encoded class labels. 230 | 231 | w1 : array, shape = [n_hidden_units, n_features] 232 | Weight matrix for input layer -> hidden layer. 233 | 234 | w2 : array, shape = [n_output_units, n_hidden_units] 235 | Weight matrix for hidden layer -> output layer. 236 | 237 | Returns 238 | --------- 239 | 240 | grad1 : array, shape = [n_hidden_units, n_features] 241 | Gradient of the weight matrix w1. 242 | 243 | grad2 : array, shape = [n_output_units, n_hidden_units] 244 | Gradient of the weight matrix w2. 245 | 246 | """ 247 | # backpropagation 248 | sigma3 = a3 - y_enc 249 | z2 = self._add_bias_unit(z2, how='row') 250 | sigma2 = w2.T.dot(sigma3) * self._sigmoid_gradient(z2) 251 | sigma2 = sigma2[1:, :] 252 | grad1 = sigma2.dot(a1) 253 | grad2 = sigma3.dot(a2.T) 254 | 255 | # regularize 256 | grad1[:, 1:] += (w1[:, 1:] * (self.l1 + self.l2)) 257 | grad2[:, 1:] += (w2[:, 1:] * (self.l1 + self.l2)) 258 | 259 | return grad1, grad2 260 | 261 | def predict(self, X): 262 | """Predict class labels 263 | 264 | Parameters 265 | ----------- 266 | X : array, shape = [n_samples, n_features] 267 | Input layer with original features. 268 | 269 | Returns: 270 | ---------- 271 | y_pred : array, shape = [n_samples] 272 | Predicted class labels. 
273 | 274 | """ 275 | if len(X.shape) != 2: 276 | raise AttributeError('X must be a [n_samples, n_features] array.\n' 277 | 'Use X[:,None] for 1-feature classification,' 278 | '\nor X[[i]] for 1-sample classification') 279 | 280 | a1, z2, a2, z3, a3 = self._feedforward(X, self.w1, self.w2) 281 | y_pred = np.argmax(z3, axis=0) 282 | return y_pred 283 | 284 | def fit(self, X, y, print_progress=False): 285 | """ Learn weights from training data. 286 | 287 | Parameters 288 | ----------- 289 | X : array, shape = [n_samples, n_features] 290 | Input layer with original features. 291 | 292 | y : array, shape = [n_samples] 293 | Target class labels. 294 | 295 | print_progress : bool (default: False) 296 | Prints progress as the number of epochs 297 | to stderr. 298 | 299 | Returns: 300 | ---------- 301 | self 302 | 303 | """ 304 | self.cost_ = [] 305 | X_data, y_data = X.copy(), y.copy() 306 | y_enc = self._encode_labels(y, self.n_output) 307 | 308 | delta_w1_prev = np.zeros(self.w1.shape) 309 | delta_w2_prev = np.zeros(self.w2.shape) 310 | 311 | for i in range(self.epochs): 312 | 313 | # adaptive learning rate 314 | self.eta /= (1 + self.decrease_const*i) 315 | 316 | if print_progress: 317 | sys.stderr.write('\rEpoch: %d/%d' % (i+1, self.epochs)) 318 | sys.stderr.flush() 319 | 320 | if self.shuffle: 321 | idx = np.random.permutation(y_data.shape[0]) 322 | X_data, y_enc = X_data[idx], y_enc[:, idx] 323 | 324 | mini = np.array_split(range(y_data.shape[0]), self.minibatches) 325 | for idx in mini: 326 | 327 | # feedforward 328 | a1, z2, a2, z3, a3 = self._feedforward(X_data[idx], self.w1, self.w2) 329 | cost = self._get_cost(y_enc=y_enc[:, idx], 330 | output=a3, 331 | w1=self.w1, 332 | w2=self.w2) 333 | self.cost_.append(cost) 334 | 335 | # compute gradient via backpropagation 336 | grad1, grad2 = self._get_gradient(a1=a1, a2=a2, 337 | a3=a3, z2=z2, 338 | y_enc=y_enc[:, idx], 339 | w1=self.w1, 340 | w2=self.w2) 341 | 342 | delta_w1, delta_w2 = self.eta * grad1, self.eta * grad2 343 | self.w1 -= (delta_w1 + (self.alpha * delta_w1_prev)) 344 | self.w2 -= (delta_w2 + (self.alpha * delta_w2_prev)) 345 | delta_w1_prev, delta_w2_prev = delta_w1, delta_w2 346 | 347 | return self 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | ### Improved Neural Net with back propogation 356 | 357 | class MLPGradientCheck(object): 358 | """ Feedforward neural network / Multi-layer perceptron classifier. 359 | 360 | Parameters 361 | ------------ 362 | n_output : int 363 | Number of output units, should be equal to the 364 | number of unique class labels. 365 | 366 | n_features : int 367 | Number of features (dimensions) in the target dataset. 368 | Should be equal to the number of columns in the X array. 369 | 370 | n_hidden : int (default: 30) 371 | Number of hidden units. 372 | 373 | l1 : float (default: 0.0) 374 | Lambda value for L1-regularization. 375 | No regularization if l1=0.0 (default) 376 | 377 | l2 : float (default: 0.0) 378 | Lambda value for L2-regularization. 379 | No regularization if l2=0.0 (default) 380 | 381 | epochs : int (default: 500) 382 | Number of passes over the training set. 383 | 384 | eta : float (default: 0.001) 385 | Learning rate. 386 | 387 | alpha : float (default: 0.0) 388 | Momentum constant. Factor multiplied with the 389 | gradient of the previous epoch t-1 to improve 390 | learning speed 391 | w(t) := w(t) - (grad(t) + alpha*grad(t-1)) 392 | 393 | decrease_const : float (default: 0.0) 394 | Decrease constant. 
Shrinks the learning rate 395 | after each epoch via eta / (1 + epoch*decrease_const) 396 | 397 | shuffle : bool (default: False) 398 | Shuffles training data every epoch if True to prevent circles. 399 | 400 | minibatches : int (default: 1) 401 | Divides training data into k minibatches for efficiency. 402 | Normal gradient descent learning if k=1 (default). 403 | 404 | random_state : int (default: None) 405 | Set random state for shuffling and initializing the weights. 406 | 407 | Attributes 408 | ----------- 409 | cost_ : list 410 | Sum of squared errors after each epoch. 411 | 412 | """ 413 | def __init__(self, n_output, n_features, n_hidden=30, 414 | l1=0.0, l2=0.0, epochs=500, eta=0.001, 415 | alpha=0.0, decrease_const=0.0, shuffle=True, 416 | minibatches=1, random_state=None): 417 | 418 | np.random.seed(random_state) 419 | self.n_output = n_output 420 | self.n_features = n_features 421 | self.n_hidden = n_hidden 422 | self.w1, self.w2 = self._initialize_weights() 423 | self.l1 = l1 424 | self.l2 = l2 425 | self.epochs = epochs 426 | self.eta = eta 427 | self.alpha = alpha 428 | self.decrease_const = decrease_const 429 | self.shuffle = shuffle 430 | self.minibatches = minibatches 431 | 432 | def _encode_labels(self, y, k): 433 | """Encode labels into one-hot representation 434 | 435 | Parameters 436 | ------------ 437 | y : array, shape = [n_samples] 438 | Target values. 439 | 440 | Returns 441 | ----------- 442 | onehot : array, shape = (n_labels, n_samples) 443 | 444 | """ 445 | onehot = np.zeros((k, y.shape[0])) 446 | for idx, val in enumerate(y): 447 | onehot[val, idx] = 1.0 448 | return onehot 449 | 450 | def _initialize_weights(self): 451 | """Initialize weights with small random numbers.""" 452 | w1 = np.random.uniform(-1.0, 1.0, size=self.n_hidden*(self.n_features + 1)) 453 | w1 = w1.reshape(self.n_hidden, self.n_features + 1) 454 | w2 = np.random.uniform(-1.0, 1.0, size=self.n_output*(self.n_hidden + 1)) 455 | w2 = w2.reshape(self.n_output, self.n_hidden + 1) 456 | return w1, w2 457 | 458 | def _sigmoid(self, z): 459 | """Compute logistic function (sigmoid) 460 | 461 | Uses scipy.special.expit to avoid overflow 462 | error for very small input values z. 463 | 464 | """ 465 | # return 1.0 / (1.0 + np.exp(-z)) 466 | return expit(z) 467 | 468 | def _sigmoid_gradient(self, z): 469 | """Compute gradient of the logistic function""" 470 | sg = self._sigmoid(z) 471 | return sg * (1 - sg) 472 | 473 | def _add_bias_unit(self, X, how='column'): 474 | """Add bias unit (column or row of 1s) to array at index 0""" 475 | if how == 'column': 476 | X_new = np.ones((X.shape[0], X.shape[1]+1)) 477 | X_new[:, 1:] = X 478 | elif how == 'row': 479 | X_new = np.ones((X.shape[0]+1, X.shape[1])) 480 | X_new[1:, :] = X 481 | else: 482 | raise AttributeError('`how` must be `column` or `row`') 483 | return X_new 484 | 485 | def _feedforward(self, X, w1, w2): 486 | """Compute feedforward step 487 | 488 | Parameters 489 | ----------- 490 | X : array, shape = [n_samples, n_features] 491 | Input layer with original features. 492 | 493 | w1 : array, shape = [n_hidden_units, n_features] 494 | Weight matrix for input layer -> hidden layer. 495 | 496 | w2 : array, shape = [n_output_units, n_hidden_units] 497 | Weight matrix for hidden layer -> output layer. 498 | 499 | Returns 500 | ---------- 501 | a1 : array, shape = [n_samples, n_features+1] 502 | Input values with bias unit. 503 | 504 | z2 : array, shape = [n_hidden, n_samples] 505 | Net input of hidden layer. 
506 | 507 | a2 : array, shape = [n_hidden+1, n_samples] 508 | Activation of hidden layer. 509 | 510 | z3 : array, shape = [n_output_units, n_samples] 511 | Net input of output layer. 512 | 513 | a3 : array, shape = [n_output_units, n_samples] 514 | Activation of output layer. 515 | 516 | """ 517 | a1 = self._add_bias_unit(X, how='column') 518 | z2 = w1.dot(a1.T) 519 | a2 = self._sigmoid(z2) 520 | a2 = self._add_bias_unit(a2, how='row') 521 | z3 = w2.dot(a2) 522 | a3 = self._sigmoid(z3) 523 | return a1, z2, a2, z3, a3 524 | 525 | def _L2_reg(self, lambda_, w1, w2): 526 | """Compute L2-regularization cost""" 527 | return (lambda_/2.0) * (np.sum(w1[:, 1:] ** 2) + np.sum(w2[:, 1:] ** 2)) 528 | 529 | def _L1_reg(self, lambda_, w1, w2): 530 | """Compute L1-regularization cost""" 531 | return (lambda_/2.0) * (np.abs(w1[:, 1:]).sum() + np.abs(w2[:, 1:]).sum()) 532 | 533 | def _get_cost(self, y_enc, output, w1, w2): 534 | """Compute cost function. 535 | 536 | y_enc : array, shape = (n_labels, n_samples) 537 | one-hot encoded class labels. 538 | 539 | output : array, shape = [n_output_units, n_samples] 540 | Activation of the output layer (feedforward) 541 | 542 | w1 : array, shape = [n_hidden_units, n_features] 543 | Weight matrix for input layer -> hidden layer. 544 | 545 | w2 : array, shape = [n_output_units, n_hidden_units] 546 | Weight matrix for hidden layer -> output layer. 547 | 548 | Returns 549 | --------- 550 | cost : float 551 | Regularized cost. 552 | 553 | """ 554 | term1 = -y_enc * (np.log(output)) 555 | term2 = (1 - y_enc) * np.log(1 - output) 556 | cost = np.sum(term1 - term2) 557 | L1_term = self._L1_reg(self.l1, w1, w2) 558 | L2_term = self._L2_reg(self.l2, w1, w2) 559 | cost = cost + L1_term + L2_term 560 | return cost 561 | 562 | def _get_gradient(self, a1, a2, a3, z2, y_enc, w1, w2): 563 | """ Compute gradient step using backpropagation. 564 | 565 | Parameters 566 | ------------ 567 | a1 : array, shape = [n_samples, n_features+1] 568 | Input values with bias unit. 569 | 570 | a2 : array, shape = [n_hidden+1, n_samples] 571 | Activation of hidden layer. 572 | 573 | a3 : array, shape = [n_output_units, n_samples] 574 | Activation of output layer. 575 | 576 | z2 : array, shape = [n_hidden, n_samples] 577 | Net input of hidden layer. 578 | 579 | y_enc : array, shape = (n_labels, n_samples) 580 | one-hot encoded class labels. 581 | 582 | w1 : array, shape = [n_hidden_units, n_features] 583 | Weight matrix for input layer -> hidden layer. 584 | 585 | w2 : array, shape = [n_output_units, n_hidden_units] 586 | Weight matrix for hidden layer -> output layer. 587 | 588 | Returns 589 | --------- 590 | 591 | grad1 : array, shape = [n_hidden_units, n_features] 592 | Gradient of the weight matrix w1. 593 | 594 | grad2 : array, shape = [n_output_units, n_hidden_units] 595 | Gradient of the weight matrix w2. 
    def _get_gradient(self, a1, a2, a3, z2, y_enc, w1, w2):
        """ Compute gradient step using backpropagation.

        Parameters
        ------------
        a1 : array, shape = [n_samples, n_features+1]
            Input values with bias unit.

        a2 : array, shape = [n_hidden+1, n_samples]
            Activation of hidden layer.

        a3 : array, shape = [n_output_units, n_samples]
            Activation of output layer.

        z2 : array, shape = [n_hidden, n_samples]
            Net input of hidden layer.

        y_enc : array, shape = (n_labels, n_samples)
            one-hot encoded class labels.

        w1 : array, shape = [n_hidden_units, n_features+1]
            Weight matrix for input layer -> hidden layer.

        w2 : array, shape = [n_output_units, n_hidden_units+1]
            Weight matrix for hidden layer -> output layer.

        Returns
        ---------
        grad1 : array, shape = [n_hidden_units, n_features+1]
            Gradient of the weight matrix w1.

        grad2 : array, shape = [n_output_units, n_hidden_units+1]
            Gradient of the weight matrix w2.

        """
        # backpropagation
        sigma3 = a3 - y_enc
        z2 = self._add_bias_unit(z2, how='row')
        sigma2 = w2.T.dot(sigma3) * self._sigmoid_gradient(z2)
        sigma2 = sigma2[1:, :]
        grad1 = sigma2.dot(a1)
        grad2 = sigma3.dot(a2.T)

        # regularize (bias columns are not regularized)
        grad1[:, 1:] += (w1[:, 1:] * (self.l1 + self.l2))
        grad2[:, 1:] += (w2[:, 1:] * (self.l1 + self.l2))

        return grad1, grad2

    def _gradient_checking(self, X, y_enc, w1, w2, epsilon, grad1, grad2):
        """ Apply gradient checking (for debugging only)

        Returns
        ---------
        relative_error : float
            Relative error between the numerically
            approximated gradients and the backpropagated gradients.

        """
        num_grad1 = np.zeros(np.shape(w1))
        epsilon_ary1 = np.zeros(np.shape(w1))
        for i in range(w1.shape[0]):
            for j in range(w1.shape[1]):
                epsilon_ary1[i, j] = epsilon
                a1, z2, a2, z3, a3 = self._feedforward(X, w1 - epsilon_ary1, w2)
                cost1 = self._get_cost(y_enc, a3, w1 - epsilon_ary1, w2)
                a1, z2, a2, z3, a3 = self._feedforward(X, w1 + epsilon_ary1, w2)
                cost2 = self._get_cost(y_enc, a3, w1 + epsilon_ary1, w2)
                num_grad1[i, j] = (cost2 - cost1) / (2 * epsilon)
                epsilon_ary1[i, j] = 0

        num_grad2 = np.zeros(np.shape(w2))
        epsilon_ary2 = np.zeros(np.shape(w2))
        for i in range(w2.shape[0]):
            for j in range(w2.shape[1]):
                epsilon_ary2[i, j] = epsilon
                a1, z2, a2, z3, a3 = self._feedforward(X, w1, w2 - epsilon_ary2)
                cost1 = self._get_cost(y_enc, a3, w1, w2 - epsilon_ary2)
                a1, z2, a2, z3, a3 = self._feedforward(X, w1, w2 + epsilon_ary2)
                cost2 = self._get_cost(y_enc, a3, w1, w2 + epsilon_ary2)
                num_grad2[i, j] = (cost2 - cost1) / (2 * epsilon)
                epsilon_ary2[i, j] = 0

        num_grad = np.hstack((num_grad1.flatten(), num_grad2.flatten()))
        grad = np.hstack((grad1.flatten(), grad2.flatten()))
        norm1 = np.linalg.norm(num_grad - grad)
        norm2 = np.linalg.norm(num_grad)
        norm3 = np.linalg.norm(grad)
        relative_error = norm1 / (norm2 + norm3)
        return relative_error

    def predict(self, X):
        """Predict class labels

        Parameters
        -----------
        X : array, shape = [n_samples, n_features]
            Input layer with original features.

        Returns:
        ----------
        y_pred : array, shape = [n_samples]
            Predicted class labels.

        """
        if len(X.shape) != 2:
            raise AttributeError('X must be a [n_samples, n_features] array.\n'
                                 'Use X[:,None] for 1-feature classification,'
                                 '\nor X[[i]] for 1-sample classification')

        a1, z2, a2, z3, a3 = self._feedforward(X, self.w1, self.w2)
        # sigmoid is monotonic, so argmax over the net input z3 yields the
        # same class as argmax over the activation a3
        y_pred = np.argmax(z3, axis=0)
        return y_pred
    def fit(self, X, y, print_progress=False):
        """ Learn weights from training data.

        Parameters
        -----------
        X : array, shape = [n_samples, n_features]
            Input layer with original features.

        y : array, shape = [n_samples]
            Target class labels.

        print_progress : bool (default: False)
            Prints progress as the number of epochs
            to stderr.

        Returns:
        ----------
        self

        """
        self.cost_ = []
        X_data, y_data = X.copy(), y.copy()
        y_enc = self._encode_labels(y, self.n_output)

        delta_w1_prev = np.zeros(self.w1.shape)
        delta_w2_prev = np.zeros(self.w2.shape)

        for i in range(self.epochs):

            # adaptive learning rate
            self.eta /= (1 + self.decrease_const*i)

            if print_progress:
                sys.stderr.write('\rEpoch: %d/%d' % (i+1, self.epochs))
                sys.stderr.flush()

            if self.shuffle:
                # permute samples and the matching one-hot columns together
                idx = np.random.permutation(y_data.shape[0])
                X_data, y_enc = X_data[idx], y_enc[:, idx]

            mini = np.array_split(range(y_data.shape[0]), self.minibatches)
            for idx in mini:

                # feedforward
                a1, z2, a2, z3, a3 = self._feedforward(X_data[idx],
                                                       self.w1, self.w2)
                cost = self._get_cost(y_enc=y_enc[:, idx],
                                      output=a3,
                                      w1=self.w1,
                                      w2=self.w2)
                self.cost_.append(cost)

                # compute gradient via backpropagation
                grad1, grad2 = self._get_gradient(a1=a1, a2=a2,
                                                  a3=a3, z2=z2,
                                                  y_enc=y_enc[:, idx],
                                                  w1=self.w1,
                                                  w2=self.w2)

                ## start gradient checking (debugging only; very slow)
                grad_diff = self._gradient_checking(X=X_data[idx], y_enc=y_enc[:, idx],
                                                    w1=self.w1, w2=self.w2,
                                                    epsilon=1e-5,
                                                    grad1=grad1, grad2=grad2)

                if grad_diff <= 1e-7:
                    print('Ok: %s' % grad_diff)
                elif grad_diff <= 1e-4:
                    print('Warning: %s' % grad_diff)
                else:
                    print('PROBLEM: %s' % grad_diff)

                # update weights; [alpha * delta_w_prev] for momentum learning
                delta_w1, delta_w2 = self.eta * grad1, self.eta * grad2
                self.w1 -= (delta_w1 + (self.alpha * delta_w1_prev))
                self.w2 -= (delta_w2 + (self.alpha * delta_w2_prev))
                delta_w1_prev, delta_w2_prev = delta_w1, delta_w2

        return self


--------------------------------------------------------------------------------
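A minimal usage sketch (added for illustration; not part of the repository). It assumes the class defined above is named NeuralNetMLP, as in the book, that the file is importable as neuralnet.py, and that the module's own imports (numpy, scipy.special.expit, sys) sit at the top of the file. Tiny random data keeps the built-in gradient checking in fit() fast; the repo's ch12-NeuralNet-MLP.py trains the same class on MNIST.

import numpy as np
from neuralnet import NeuralNetMLP  # assumed module and class name

rng = np.random.RandomState(1)
X = rng.rand(30, 4)             # 30 samples, 4 features (toy data)
y = rng.randint(0, 3, size=30)  # 3 class labels: 0, 1, 2

nn = NeuralNetMLP(n_output=3, n_features=X.shape[1], n_hidden=5,
                  l1=0.0, l2=0.1, epochs=5, eta=0.01,
                  alpha=0.001, decrease_const=0.0,
                  shuffle=True, minibatches=1, random_state=1)
nn.fit(X, y, print_progress=True)

y_pred = nn.predict(X)
print('Training accuracy: %.2f%%' % (100.0 * np.mean(y == y_pred)))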