├── tree.png
├── tensor_flow_graph.png
├── 15-01-01 459_Mont_Lyman.jpg
├── run_batch.sc
├── q9_tensorflow_gpu_test.py
├── license.txt
├── README.md
├── q7_theano_gpu_test.py
├── run_batch.bat
├── q8_tika.py
├── p75_xor_dataset.py
├── p73_support_vector_machine.py
├── q1_database_statistics.py
├── p94_knearest_neighbors.py
├── p124_random_forest_feature_importance.py
├── p186_grid_search.py
├── p78_support_vector_machine_gamma.py
├── p190_confusion_matrix.py
├── q0_simple_e13b_display.py
├── p124_random_forest.py
├── p51_standard_scalar.py
├── p330_dendrogram.py
├── n0_network.py
├── p411_keras.py
├── p322_silhouette_plots.py
├── p193_model_precision_recall.py
├── q2_tensorflow_mnist.py
├── p110_scaling_features.py
├── q6_tensorflow_residual3x4.py
├── p229_adaboost.py
├── p189_nested_cross_validation.py
├── o4_image_to_image.py
├── p86_decision_tree.py
├── q5_tensorflow_residual.py
├── p221_bagging_bootstrap_samples.py
├── p115_l1_l2_regularization.py
├── p25_perceptron.py
├── p119_squential_backward_selection.py
├── p181_learning_curves.py
├── p62_logistic_regression.py
├── p36_adaline_gd.py
├── p44_adaline_sgd.py
├── q3_removing_affine_distortion.py
├── p314_k_means.py
├── p194_receiver_operating_characteristic.py
├── o1_top_secret_cnn.py
├── p177_k_fold_cross_validation.py
├── n1_residual3x4.py
├── o3_top_secret_python_box.py
└── p131_principal_component_analysis.py

--------------------------------------------------------------------------------
/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/tree.png

--------------------------------------------------------------------------------
/tensor_flow_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/tensor_flow_graph.png

--------------------------------------------------------------------------------
/15-01-01 459_Mont_Lyman.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/15-01-01 459_Mont_Lyman.jpg

--------------------------------------------------------------------------------
/run_batch.sc:
--------------------------------------------------------------------------------
#!/bin/bash

if [ ! -d "/tmp/plots" ]; then
    mkdir /tmp/plots
fi
echo "" > /tmp/plots/run_batch.txt
for i in $(ls -1v ./[o-q]*.py ); do
    echo "" |& tee -a /tmp/plots/run_batch.txt
    echo "##############################################################" |& tee -a /tmp/plots/run_batch.txt
    echo "$i ###############################" |& tee -a /tmp/plots/run_batch.txt
    echo "##############################################################" |& tee -a /tmp/plots/run_batch.txt
    echo "" |& tee -a /tmp/plots/run_batch.txt
    python3 $i |& tee -a /tmp/plots/run_batch.txt
done

--------------------------------------------------------------------------------
/q9_tensorflow_gpu_test.py:
--------------------------------------------------------------------------------
# Creates a graph.
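# (GPU smoke test: traces a small matmul as a tf.function and runs it many
# times; with a CUDA-enabled TensorFlow build the kernels should be placed
# on the GPU.)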
import tensorflow as tf
#from tensorflow.compat import v1 as tf

@tf.function
def d(a, b):
    return tf.matmul(a, b)

a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')

# Run the op repeatedly.
for i in range(100000):
    d(a, b)
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Richard Ricker Lyman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PythonMachineLearningExamples

These are Python programs mostly taken from the book "Python Machine Learning"
by Sebastian Raschka.

See the original programs from the book at:

https://github.com/rasbt/python-machine-learning-book

These programs were intended to prove out the data set of character images
found in fonts.zip, which is downloaded automatically.

For a detailed explanation of each program, read "Python Machine Learning".

Because a few E13B characters can be classified using only two features, most
of the example programs that use the Iris dataset can also be used with E13B.

run_batch.sc is a bash script that runs all of the programs in the directory.

The script file will create many files in the folder /tmp/plots.
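For example, assuming a bash shell with the dependencies below installed:

    bash run_batch.sc     # runs every o*/p*/q* example in sequence
    ls /tmp/plots         # plots plus the combined run_batch.txt log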
Versions used:

Python 3.8
Anaconda3
Linux or Windows
cuda 11.2.1
h5py 2.10.0
Keras 2.4.3
Lasagne 0.1
matplotlib 3.3.2
numpy 1.19.2
pytesseract 0.3.7
sklearn 0.0
tensorflow 2.4.1
tesseract 4.1.1
Theano 1.0.5
tika 1.24

--------------------------------------------------------------------------------
/q7_theano_gpu_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' short test from the Theano website to see if Theano is working with CUDA

.theanorc contains

[global]
floatX = float32
device = gpu0

@author: richard lyman
'''
import os
print (os.get_exec_path())
p=os.getenv('PATH')
print("getenv('PATH')={}".format(p))
p=os.getenv('LD_LIBRARY_PATH')
print("getenv('LD_LIBRARY_PATH')={}".format(p))
p=os.getenv('CUDA_HOME')
print("getenv('CUDA_HOME')={}".format(p))
p=os.getenv('PYTHONPATH')
print("getenv('PYTHONPATH')={}".format(p))

#os.environ['PATH'] = p
# print(os.getenv('PATH'))
from theano import function, config, shared, sandbox
import theano.tensor as T
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()
print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print('Used the cpu')
else:
    print('Used the gpu')

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/run_batch.bat:
--------------------------------------------------------------------------------
mkdir plots
python p110_scaling_features.py > ./plots/batch.txt
python p115_l1_l2_regularization.py >> ./plots/batch.txt
python p119_squential_backward_selection.py >> ./plots/batch.txt
python p124_random_forest.py >> ./plots/batch.txt
python p124_random_forest_feature_importance.py >> ./plots/batch.txt
python p131_principal_component_analysis.py >> ./plots/batch.txt
python p141_linear_descriminant_analsys.py >> ./plots/batch.txt
python p154_pca_nonlinear_mapings.py >> ./plots/batch.txt
python p177_k_fold_cross_validation.py >> ./plots/batch.txt
python p181_learning_curves.py >> ./plots/batch.txt
python p186_grid_search.py >> ./plots/batch.txt
python p189_nested_cross_validation.py >> ./plots/batch.txt
python p190_confusion_matrix.py >> ./plots/batch.txt
python p193_model_precision_recall.py >> ./plots/batch.txt
python p194_receiver_operating_characteristic.py >> ./plots/batch.txt
python p206_majority_vote_classifier.py >> ./plots/batch.txt
python p221_bagging_bootstrap_samples.py >> ./plots/batch.txt
python p229_adaboost.py >> ./plots/batch.txt
python p25_perceptron.py >> ./plots/batch.txt
python p314_k_means.py >> ./plots/batch.txt
python p322_silhouette_plots.py >> ./plots/batch.txt
python p330_dendrogram.py >> ./plots/batch.txt
python p356_neural_net.py >> ./plots/batch.txt
python p36_adaline_gd.py >> ./plots/batch.txt
python p411_keras.py >> ./plots/batch.txt
python p44_adaline_sgd.py >> ./plots/batch.txt
python p51_standard_scalar.py >> ./plots/batch.txt
python p62_logistic_regression.py >> ./plots/batch.txt
python p73_support_vector_machine.py >> ./plots/batch.txt
python p75_xor_dataset.py >> ./plots/batch.txt
python p78_support_vector_machine_gamma.py >> ./plots/batch.txt
python p86_decision_tree.py >> ./plots/batch.txt
python p94_knearest_neighbors.py >> ./plots/batch.txt
python q0_simple_e13b_display.py >> ./plots/batch.txt
python q1_database_statistics.py >> ./plots/batch.txt
python q2_tensorflow_mnist.py >> ./plots/batch.txt
python q2_Theano_mlp.py >> ./plots/batch.txt
python q3_removing_affine_distortion.py

--------------------------------------------------------------------------------
/q8_tika.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''experiment with apache tika

@author: richard lyman
'''

import pytesseract
import tika
from tika import translate, detector, language

filename = '15-01-01 459_Mont_Lyman.jpg'
filename2 = 'img20150901_15233271bw.jpg'

from PIL import Image

rawText = pytesseract.image_to_string(Image.open(filename2), lang="rus")
print (rawText)
lines = rawText.split('\n')

import os
#os.putenv( 'TIKA_VERSION','default') # set to a version string, e.g. 1.12, or 'default' for the current Tika version.
#os.putenv( 'TIKA_SERVER_JAR','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') # set to the full URL of the remote Tika server jar to download and cache.
os.putenv( 'TIKA_SERVER_ENDPOINT','http://localhost:9998') # set to the host (local or remote) for the running Tika server jar.
#os.putenv( 'TIKA_SERVER_ENDPOINT','http://localhost:9998/language/string')
#os.putenv( 'TIKA_CLIENT_ONLY','True') # if set to True, TIKA_SERVER_JAR is ignored and Tika is treated like a REST client using TIKA_SERVER_ENDPOINT.
#os.putenv( 'TIKA_TRANSLATOR','org/apache/tika/language/translate/') # set to the fully qualified class name (defaults to Lingo24) of the Tika translator implementation.
#os.putenv( 'TIKA_SERVER_CLASSPATH','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') # set to a string (':'-delimited for each additional path) to prepend to the Tika server jar path.
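# Note: TIKA_SERVER_ENDPOINT above assumes a Tika server is already listening
# on localhost:9998; one way to start one (jar path is illustrative):
#   java -jar tika-server-1.24.jar --port 9998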
#os.putenv('TESSDATA_PREFIX','/usr/share/tesseract-ocr/4.00/tessdata/')
tika.initVM()
from tika import parser
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])
global Verbose
Verbose=True

result=translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print (result)
result = translate.from_buffer("comme çi comme ça",'fr','en')
print (result)
result = language.from_buffer("comme çi comme ça")
print (result)
for line in lines:
    if len(line)>0:
        result=translate.from_buffer(line, 'ru','en')
        print(result)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p75_xor_dataset.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' xor_dataset.py shows how non-linearly separable datasets can use a
non-linear combination of the original features to project the features
onto a higher-dimensional space where they become linearly separable.

A non-linearly separable dataset consisting of XOR values is created.
It is fitted to a Support Vector Machine using the radial basis
function (rbf) kernel.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
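
A sketch of the projection idea (using the X_xor array created below):
adding the product x1*x2 as a third feature makes the XOR data linearly
separable, because the product is negative exactly for the points whose
two coordinates have opposite signs:

    X3 = np.hstack((X_xor, (X_xor[:, 0] * X_xor[:, 1]).reshape(-1, 1)))
    # a plane at third-coordinate == 0 now separates the two classes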
@author: richard lyman
'''
import numpy as np

import ocr_utils

from sklearn.svm import SVC

np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

ocr_utils.scatter_plot(X=X_xor,
                       y=y_xor,
                       title='xor',
                       xlim=(-3,3),
                       ylim=(-3,3))


svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)
ocr_utils.plot_decision_regions(X=X_xor, y=y_xor,
                                classifier=svm, title='support vector machine rbf xor')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p73_support_vector_machine.py:
--------------------------------------------------------------------------------
'''support_vector_machine.py illustrates a linear support vector machine.
The SVM attempts to maximize the margin between linearly separable
feature sets.

Column sums are read in from the E13B character set and fitted to
an SVM. The decision regions are plotted.

Created on Jun 30, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
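
For the linear-kernel SVM fitted below, the width of that margin can be
read off the learned weight vector w (svm.coef_[0]) as 2/||w||:

    w = svm.coef_[0]
    margin = 2.0 / np.sqrt(np.sum(w ** 2))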
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)


ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                labels = labels,
                                title='support_vector_machine_linear')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/q1_database_statistics.py:
--------------------------------------------------------------------------------
'''
dumps out some statistics for the fonts.zip database and
makes plots, saved in files, of the characters for each fontVariant found
in the database

Created on Jul 25, 2016
@author: richard
'''
import ocr_utils
import numpy as np

df1 = ocr_utils.get_list(input_filters_dict = {'font':()})
unique_fonts=[]
unique_fontVariants=[]
unique_m_labels=[]
unique_strengths=[]
unique_italics=[]
unique_orientations=[]

#############################################################################
# read and show the character images for each font variant
# output only the character label and the image

for font in df1:
    df2 = ocr_utils.get_list(input_filters_dict = {'font':font,'fontVariant':(), 'm_label':(),'strength':(),'italic':(),'orientation':()})
    unique_fonts = np.unique( np.append(unique_fonts, df2['font']))
    u1 = np.unique(df2['fontVariant'])
    unique_fontVariants = np.unique(np.append(unique_fontVariants, u1))
    u2 = np.unique(df2['m_label'])
    unique_m_labels = np.unique(np.append(unique_m_labels,u2))
    u3 = np.unique(df2['strength'])
    unique_strengths = np.unique(np.append(unique_strengths,u3))
    u4 = np.unique(df2['italic'])
    unique_italics = np.unique(np.append(unique_italics,u4))
    u5 = np.unique( df2['orientation'])
    unique_orientations = np.unique(np.append(unique_orientations,u5))
    print ('\n{}, fontVariants={}, labels = {}, strengths = {}, italics = {}, orientations = {}\n'.format(font[0], len(u1),
           len(u2), len(u3), len(u4), len(u5)))
    for fontVariant in u1:
        fd = {'font': font, 'fontVariant': fontVariant}
        ds = ocr_utils.read_data(input_filters_dict=fd, output_feature_list=['m_label','image'] , dtype=np.int32)
        y,X = ds.train.features
        X2D = np.reshape(X, (X.shape[0], ds.train.num_rows, ds.train.num_columns ))
        title = '{}-{}'.format(font[0],fontVariant)
        ocr_utils.show_examples(X2D, y, title=title)

print ('unique fonts={}, fontVariants={}, labels = {}, strengths = {}, italics = {}, orientations = {}'.format(len(unique_fonts), len(unique_fontVariants),
       len(unique_m_labels), len(unique_strengths),
       len(unique_italics), len(unique_orientations)))


print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p94_knearest_neighbors.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' knearest_neighbors.py
The k-nearest-neighbor classifier memorizes the training set. When the
class label of a new sample is to be predicted, the distance, typically
the Euclidean distance, to some number, say 5, of the nearest memorized
points is found. The class label of the new point is that of the
majority of its nearest neighbors.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
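
With metric='minkowski' and p=2, as used below, that distance is the
Euclidean distance, the p=2 case of

    d(x, z) = (sum_i |x_i - z_i|**p) ** (1/p)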
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)


sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=knn,
                                labels=labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='k_nearest_neighbors')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p124_random_forest_feature_importance.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' random_forest_feature_importance.py

Using a random forest (which constructs a strong learner from weak
learners), the importance of each feature is evaluated by measuring the
impurity decrease it produces, averaged over each of 10000 trees.


Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
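
The importances reported by scikit-learn are normalized to sum to one. A
common follow-up (not done here) is to keep only the features above some
threshold, e.g. with sklearn.feature_selection.SelectFromModel:

    from sklearn.feature_selection import SelectFromModel
    sfm = SelectFromModel(forest, threshold=0.05, prefit=True)
    X_selected = sfm.transform(X_train)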
@author: richard lyman
'''
import numpy as np
import ocr_utils
import matplotlib.pyplot as plt


y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, test_size=0.3,random_state=0)


from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=10000,
                                random_state=0,
                                n_jobs=-1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            indices[f],
                            importances[indices[f]]))

title = 'Feature Importances from Random Forest'
plt.title(title)
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')

plt.xticks(range(X_train.shape[1]),
           indices, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.xlabel('column sums')
plt.tight_layout()
ocr_utils.show_figures(plt,title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p186_grid_search.py:
--------------------------------------------------------------------------------
'''
Created on Jul 8, 2016 grid_search.py
Grid search does a brute-force train and test of sample data, trying
every combination in a grid of parameters.

The SVM attempts to maximize the margin between linearly separable
feature sets.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
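
The cost grows multiplicatively: with the eight-value param_range below,
the grid holds 8 linear candidates plus 8*8 rbf candidates, each fit cv=10
times, i.e. (8 + 64) * 10 = 720 fits.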
@author: richard lyman
'''
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

if __name__ == '__main__':

    y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0)

    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    param_grid = [{'clf__C': param_range,
                   'clf__kernel': ['linear']},
                  {'clf__C': param_range,
                   'clf__gamma': param_range,
                   'clf__kernel': ['rbf']}]

    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_))
    print('Support Vector Machine Grid Search best params: {}'.format(sorted(gs.best_params_.items())))

    clf = gs.best_estimator_
    clf.fit(X_train, y_train)
    print('Support Vector Machine Test accuracy: %.3f' % clf.score(X_test, y_test))

    print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p78_support_vector_machine_gamma.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' support_vector_machine_gamma.py illustrates changing the gamma parameter
for an SVM. This is a cut-off parameter for the Gaussian sphere; a
higher value tightens the decision boundary around the samples.

Run the SVM with two values of gamma and plot the decision regions.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
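
The rbf kernel used below is

    K(x, z) = exp(-gamma * ||x - z||**2)

so gamma acts as an inverse influence radius: the larger it is, the faster
a training sample's similarity falls off with distance.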
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))


svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                labels = labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='SVM with gamma 0.2')

svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                labels = labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='SVM with gamma 100')


print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p190_confusion_matrix.py:
--------------------------------------------------------------------------------
''' confusion_matrix.py

A confusion matrix shows a map of true positives, false positives, false
negatives, and true negatives for a decision.

Some decisions require a biased output where we may want to reduce
the number of false positives, for instance. This is especially true
in medical diagnosis.


Created on Jul 8, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
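
For this binary case scikit-learn lays the matrix out with true labels as
rows and predicted labels as columns:

    [[TN, FP],
     [FN, TP]]

so, for example, the false positive rate is FP / (FP + TN).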
@author: richard lyman
'''

import matplotlib.pyplot as plt
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0)

pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1))])

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('predicted label')
plt.ylabel('true label')
title='c5_confusion_matrix'
plt.title(title)
plt.tight_layout()
ocr_utils.show_figures(plt,title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/q0_simple_e13b_display.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''simple_e13b_display.py displays a plot of the characters in the E13B font.

See the explanation of the E13B character set in ocr_utils.load_E13B.

Created on Jun 20, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
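
The "column sum" features used throughout these examples are just the
vertical pixel sums of the 20x20 character image; for column c:

    column_sum[c] = image2D[:, c].sum()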
@author: richard lyman

'''
import ocr_utils
import numpy as np

#############################################################################
# read images and scatter plot

# retrieve 256 sets of target numbers and column sums
# y: the ascii characters 48 and 49 ('0', '1')
# X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17
ascii_characters_to_train = (48,49)
columnsXY = (9,17)
y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train=ascii_characters_to_train , columns=columnsXY,nChars=256)

# put the ASCII equivalent of the unique characters in y into the legend of the plot
legend=[]
for ys in np.unique(y):
    legend.append('{} \'{}\''.format(ys, chr(ys)))

ocr_utils.scatter_plot(X=X,
                       y=y,
                       legend_entries=legend,
                       axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))],
                       title='E13B sum of columns')

#############################################################################
# read and show character images for '0' and '1'
# select the digits in columnsXY in the E13B font

fd = {'m_label': ascii_characters_to_train, 'font': 'E13B'}

# output only the character label and the image
fl = ['m_label','image']

# read the complete image (20x20) = 400 pixels for each character
ds = ocr_utils.read_data(input_filters_dict=fd, output_feature_list=fl, dtype=np.int32)
y,X = ds.train.features

# change to a 2D shape
X=np.reshape(X,(X.shape[0],ds.train.num_rows, ds.train.num_columns))
ocr_utils.montage(X,title='some E13B Characters')



print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p124_random_forest.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' random_forest.py
The random forest uses weak learners to build a strong learner.

A random subset of samples is drawn, and then at each node a decision
tree is grown from a smaller subset of those bootstrap samples.

This is repeated a number of times, and the decision trees are then
combined via majority vote.

1) run the random forest on the E13B data
2) plot the decision regions


Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

if __name__ == '__main__':

    y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)


    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    X_combined_std = np.vstack((X_train_std, X_test_std))
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=10,
                                    random_state=1,
                                    n_jobs=2)
    forest.fit(X_train, y_train)

    ocr_utils.plot_decision_regions(X=X_combined,
                                    y=y_combined,
                                    classifier=forest,
                                    labels=labels,
                                    test_idx=range(len(X_train),len(X_combined)),
                                    title='random_forest')

    print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p51_standard_scalar.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''standard_scalar.py illustrates feature scaling with the sklearn tools.
1) Column sums from the E13B dataset are read in as features.
2) The features are scaled with the sklearn StandardScaler.
3) The scaled features are then fitted to a Perceptron and the decision
   regions are plotted.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
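
StandardScaler standardizes each feature column to zero mean and unit
variance using the training-set statistics:

    x_std = (x - mean_train) / std_train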
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split


#############################################################################
# read images and scatter plot

# retrieve 500 sets of target numbers and column sums
# y: the ascii characters 48 and 51 ('0', '3')
# X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17

y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17),nChars=500, random_state=0)

print('Class labels:', np.unique(y))


#############################################################################
# standardize the features

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())

from sklearn.metrics import accuracy_score

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

ocr_utils.plot_decision_regions(X_combined_std, y_combined, ppn,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                labels=labels,
                                title='perceptron_scikit')



print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p330_dendrogram.py:
--------------------------------------------------------------------------------
'''
Created on Jul 21, 2016

A dendrogram is a diagram that shows the cluster composition of a dataset
by showing clusters as levels in a tree. At each node, a cluster is split
into subclusters. This is somewhat like the separation of the data by a
Decision Tree.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''


import ocr_utils
import matplotlib.pyplot as plt

##############################################
# separate the original images by cluster
# print(km.cluster_centers_.shape)

n=300

variables = ['X', 'Y', 'Z']
labels = ['ID_0','ID_1','ID_2','ID_3','ID_4']

chars_to_train = range(48,51)
columnsXY=(9,17)
column_str = 'column_sum{}'.format(list(columnsXY))

input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}

# output the character label and the image and column sums
output_feature_list = ['m_label','image', column_str]

# read the complete image (20x20) = 400 pixels for each character
ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
                         output_feature_list=output_feature_list,
                         random_state=0)

y = ds.train.features[0][:n]
X_image = ds.train.features[1][:n]
X = ds.train.features[2][:n]

from scipy.spatial.distance import pdist

row_dist = pdist(X, metric='euclidean')
print(row_dist)

from scipy.cluster.hierarchy import linkage

# method 1: using a condensed distance matrix
row_clusters = linkage(row_dist, method='complete', metric='euclidean')

print (row_clusters)

# method 2: using the raw data
row_clusters = linkage(X, method='complete', metric='euclidean')

print()
print (row_clusters)

from scipy.cluster.hierarchy import dendrogram

# make dendrogram black (part 1/2)
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters, p=12, truncate_mode = 'lastp')

plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('./figures/dendrogram.png', dpi=300,
#            bbox_inches='tight')
title = "Dendrogram"
plt.title(title)
ocr_utils.show_figures(plt, title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/n0_network.py:
--------------------------------------------------------------------------------
#import tensorflow as tf
from tensorflow.compat import v1 as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
from collections import namedtuple
import datetime
import ocr_utils

class base_network(object):
    ''' definition of the network
    '''


    def fit(self, truthed_data, nEpochs=5000):

        perfect_count=10
        for i in range(nEpochs):

            batch = truthed_data.next_batch(100)
            # assign feature data to each placeholder
            # the batch list is returned in the same order as the features requested
            feed = {self._keep_prob: 0.5}
            for j in range(truthed_data.num_features):
                feed[self._ph[j]] = batch[j]

            if i%100 == 0:

                feed[self._keep_prob] = 1.0
                result = self._sess.run([self._merged, self._accuracy ], feed_dict=feed)
                summary_str = result[0]

                self._writer.add_summary(summary_str, i)
                train_accuracy = result[1]
                # stop early after ten consecutive (near-)perfect evaluations
                if train_accuracy <= (1.0 - 1e-5 ):
                    perfect_count=10
                else:
                    perfect_count -= 1
                    if perfect_count==0:
                        break

                print ("step %d, training accuracy %g"%(i, train_accuracy),flush=True)
            self._sess.run(self._train_step,feed_dict=feed)


    def test(self, truthed_data, title = ''):

        # collect the images that were misclassified
        error_images = np.empty((0,self._nRows,self._nCols))

        test_accuracy=0
        m=0

        for i in range(int(len(truthed_data.features[0])/100)):

            batch = truthed_data.next_batch(100)
            # assign feature data to each placeholder
            # the batch list is returned in the same order as the features requested
            feed = {self._keep_prob: 1.0}
            for j in range(truthed_data.num_features):
                feed[self._ph[j]] = batch[j]

            result = self._sess.run([self._accuracy, self._x_image, self._correct_prediction], feed_dict=feed)

            test_accuracy += result[0]
            error_images = np.append(error_images, result[1][:,:,:,0][result[2]==False],axis=0)
            m += 1
        try:
            print ("test accuracy {} for : {}".format(test_accuracy/m, title),flush=True)
            ocr_utils.montage(error_images,title='TensorFlow {} Error Images'.format(title))
        except:
            if m==0:
                print ("test accuracy 1",flush=True)
            else:
                print ("test accuracy {}".format(test_accuracy/m),flush=True)
                ocr_utils.montage(error_images,title='TensorFlow Error Images')


    def predict(self, truthed_features):
        feed={self._keep_prob: 1.0}
        # assign feature data to each placeholder
        for j in range(1,truthed_features.num_features):
            feed[self._ph[j]] = truthed_features.features[j]
        result = self._sess.run([self._prediction], feed_dict=feed)

        return result[0]

--------------------------------------------------------------------------------
/p411_keras.py:
--------------------------------------------------------------------------------
'''
Created on Jul 22, 2016

keras.py implements a neural network with the Keras framework.

This example trains using the Handprint images.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''

import ocr_utils
import numpy as np


def do_keras(X_train, X_test, y_train_ohe, y_train, y_test):
    print('First 3 labels: ', y_train[:3])

    print('\nFirst 3 labels (one-hot):\n', y_train_ohe[:3])

    from keras.models import Sequential
    from keras.layers.core import Dense
    from keras.optimizers import SGD

    np.random.seed(1)

    model = Sequential()
    model.add(Dense(input_dim=X_train.shape[1],
                    units=50,
                    activation='tanh'))

    model.add(Dense(input_dim=50,
                    units=50,
                    activation='tanh'))

    model.add(Dense(input_dim=50,
                    units=y_train_ohe.shape[1],
                    activation='softmax'))

    sgd = SGD(lr=0.001, decay=1e-7, momentum=.9)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    model.fit(X_train, y_train_ohe,
              epochs=50,
              batch_size=300,
              verbose=2,
              validation_split=0.1
              )
    y_train_pred = model.predict_classes(X_train, verbose=0)
    print('First 3 predictions: ', y_train_pred[:3])
    train_acc = np.sum(y_train == y_train_pred, axis=0) / X_train.shape[0]
    print('Training accuracy: %.2f%%' % (train_acc * 100))


    y_test_pred = model.predict_classes(X_test, verbose=0)
    test_acc = np.sum(y_test == y_test_pred, axis=0) / X_test.shape[0]
    print('Test accuracy: %.2f%%' % (test_acc * 100))

input_filters_dict = {'font': ('HANDPRINT',)}
output_feature_list = ['m_label_one_hot','image','m_label']
ds = ocr_utils.read_data(input_filters_dict = input_filters_dict,
                         output_feature_list=output_feature_list,
                         engine_type='keras',
                         test_size = .1,
                         dtype=np.float32,
                         random_state=0)

X_train = ds.train.features[1]
X_test = ds.test.features[1]
y_train_ohe = ds.train.features[0]
y_train = ds.train.features[2]-48
y_test = ds.test.features[2]-48
do_keras(X_train, X_test, y_train_ohe, y_train, y_test)


print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p322_silhouette_plots.py:
--------------------------------------------------------------------------------
'''
Created on Jul 18, 2016
silhouette.py

A silhouette plot shows how tightly the samples are bound to the single
centroid selected for them by k-means and how well they are separated
from the other clusters.

Typically the cohesion and dissimilarity coefficients that make up the
silhouette are calculated using the Euclidean distance.

This program draws a silhouette plot using a small number of clusters.
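
For each sample the silhouette coefficient combines cohesion a (the mean
intra-cluster distance) and separation b (the mean distance to the samples
of the nearest other cluster):

    s = (b - a) / max(a, b)        # so -1 <= s <= 1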
13 | 14 | from Python Machine Learning by Sebastian Raschka under the following license 15 | 16 | The MIT License (MIT) 17 | 18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 37 | 38 | @author: richard lyman 39 | ''' 40 | 41 | import numpy as np 42 | import ocr_utils 43 | import matplotlib.pyplot as plt 44 | n=1000 45 | 46 | chars_to_train = (48,50) 47 | columnsXY=(9,17) 48 | column_str = 'column_sum{}'.format(list(columnsXY)) 49 | skewRange = np.linspace(-0.5,0.5,81) 50 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 51 | 52 | # output the character label and the image and column sums 53 | output_feature_list = ['m_label','image',column_str] 54 | 55 | # read the complete image (20x20) = 400 pixels for each character 56 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 57 | output_feature_list=output_feature_list, 58 | random_state=0) 59 | 60 | y = ds.train.features[0][:n] 61 | X_image = ds.train.features[1][:n] 62 | X = ds.train.features[2][:n] 63 | 64 | from matplotlib import cm 65 | from sklearn.metrics import silhouette_samples 66 | from sklearn.cluster import KMeans 67 | 68 | km = KMeans(n_clusters=2, 69 | init='k-means++', 70 | n_init=10, 71 | max_iter=300, 72 | tol=1e-04, 73 | random_state=0) 74 | y_km = km.fit_predict(X) 75 | 76 | cluster_labels = np.unique(y_km) 77 | n_clusters = cluster_labels.shape[0] 78 | silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') 79 | y_ax_lower, y_ax_upper = 0, 0 80 | yticks = [] 81 | for i, c in enumerate(cluster_labels): 82 | c_silhouette_vals = silhouette_vals[y_km == c] 83 | c_silhouette_vals.sort() 84 | y_ax_upper += len(c_silhouette_vals) 85 | color = cm.jet(i / n_clusters) 86 | plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, 87 | edgecolor='none', color=color) 88 | 89 | yticks.append((y_ax_lower + y_ax_upper) / 2) 90 | y_ax_lower += len(c_silhouette_vals) 91 | 92 | silhouette_avg = np.mean(silhouette_vals) 93 | plt.axvline(silhouette_avg, color="red", linestyle="--") 94 | 95 | plt.yticks(yticks, cluster_labels + 1) 96 | plt.ylabel('Cluster') 97 | plt.xlabel('Silhouette coefficient') 98 | title = 'Silhouettes' 99 | plt.title(title) 100 | plt.tight_layout() 101 | ocr_utils.show_figures(plt, title) 102 | print ('\n########################### No Errors ####################################') 103 | 
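# Editor's addition (a minimal sketch, not part of the original example): the mean
# silhouette computed by sklearn's silhouette_score can be used to compare candidate
# cluster counts on the same feature matrix X used above.
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    labels_k = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0).fit_predict(X)
    print('k={} mean silhouette={:.3f}'.format(k, silhouette_score(X, labels_k, metric='euclidean')))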
-------------------------------------------------------------------------------- /p193_model_precision_recall.py: -------------------------------------------------------------------------------- 1 | '''model_precision_recall.py 2 | 3 | Precision and recall are measures of true positives. 4 | Precision is also called positive predictive value. 5 | Precision is "how useful the search results are". 6 | 7 | Recall is also called sensitivity. 8 | Recall is "how complete the results are". 9 | 10 | Combining them gives the F1 score, 11 | the harmonic mean of Precision and Recall (see the reference note at the end of this docstring). 12 | 13 | Given a couple of E13B characters, compute the precision and recall values. 14 | Make a scorer using the F1 measure as the score and use 15 | grid search to find the parameters that give the highest F1 measure. 16 | 17 | 18 | 19 | Created on Jul 9, 2016 20 | 21 | from Python Machine Learning by Sebastian Raschka under the following license 22 | 23 | The MIT License (MIT) 24 | 25 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 26 | 27 | Permission is hereby granted, free of charge, to any person obtaining a copy 28 | of this software and associated documentation files (the "Software"), to deal 29 | in the Software without restriction, including without limitation the rights 30 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 31 | copies of the Software, and to permit persons to whom the Software is 32 | furnished to do so, subject to the following conditions: 33 | 34 | The above copyright notice and this permission notice shall be included in all 35 | copies or substantial portions of the Software. 36 | 37 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 39 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 40 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 41 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 42 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 43 | SOFTWARE.
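(Editor's reference note, added: with TP, FP and FN the true-positive, false-positive and false-negative counts,

    Precision = TP / (TP + FP)
    Recall    = TP / (TP + FN)
    F1        = 2 * Precision * Recall / (Precision + Recall)

so F1 is high only when both precision and recall are high.)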
44 | 45 | @author: richard lyman 46 | ''' 47 | 48 | import ocr_utils 49 | from sklearn.preprocessing import StandardScaler 50 | from sklearn.pipeline import Pipeline 51 | from sklearn.svm import SVC 52 | from sklearn.model_selection import GridSearchCV 53 | from sklearn.metrics import make_scorer,precision_score, recall_score, f1_score 54 | from sklearn.model_selection import train_test_split 55 | 56 | if __name__ == '__main__': 57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) 58 | 59 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 60 | 61 | pipe_svc = Pipeline([('scl', StandardScaler()), 62 | ('clf', SVC(random_state=1))]) 63 | 64 | param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] 65 | 66 | param_grid = [{'clf__C': param_range, 67 | 'clf__kernel': ['linear']}, 68 | {'clf__C': param_range, 69 | 'clf__gamma': param_range, 70 | 'clf__kernel': ['rbf']}] 71 | pipe_svc.fit(X_train, y_train) 72 | y_pred = pipe_svc.predict(X_test) 73 | 74 | pos_label=y_train[0] 75 | print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 76 | print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 77 | print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 78 | 79 | scorer = make_scorer(f1_score, pos_label=pos_label) 80 | 81 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 82 | 83 | param_grid = [{'clf__C': c_gamma_range, 84 | 'clf__kernel': ['linear']}, 85 | {'clf__C': c_gamma_range, 86 | 'clf__gamma': c_gamma_range, 87 | 'clf__kernel': ['rbf'],}] 88 | 89 | gs = GridSearchCV(estimator=pipe_svc, 90 | param_grid=param_grid, 91 | scoring=scorer, 92 | cv=10, 93 | n_jobs=-1) 94 | gs = gs.fit(X_train, y_train) 95 | print('\nGrid Search f1 scoring best score: {}'.format(gs.best_score_)) 96 | print('Grid Search f1 scoring best params: {}'.format(sorted(gs.best_params_.items()))) 97 | print ('\n########################### No Errors ####################################') 98 | -------------------------------------------------------------------------------- /q2_tensorflow_mnist.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | The tutorial version of the program is modified in order to send some 22 | features directly to the fully connected layer, thus bypassing the 23 | convolution layer. 24 | 25 | Images go through convolution. Everything else bypasses. 
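(Editor's note, a sketch of the idea rather than the network's actual code: after the convolution stack is flattened, the bypassed scalar features are appended before the fully connected layer, conceptually

    fc_input = tf.concat([conv_out_flat, italic, aspect_ratio, upper_case], axis=1)

where the tensor names are illustrative and follow output_feature_list below.)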
26 | 27 | see tensor_flow_graph.png 28 | """# ============================================================================== 29 | 30 | import ocr_utils 31 | import datetime 32 | from collections import namedtuple 33 | import numpy as np 34 | import pandas as pd 35 | import n1_2cnv1fc as nnetwork 36 | from tensorflow.compat import v1 as tf 37 | #import tf 38 | dtype = np.float32 39 | 40 | if True: 41 | # single font train 42 | 43 | # examples 44 | # select only images from the 'OCRA' font 45 | # input_filters_dict = {'font': ('OCRA',)} 46 | 47 | # select only images from 'HANDPRINT' font 48 | #input_filters_dict = {'font': ('HANDPRINT',)} 49 | 50 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned' fontVariant 51 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 52 | 53 | # select everything; all fonts, font variants, etc. 54 | #input_filters_dict = {} 55 | 56 | # select the digits 0 through 9 in the E13B font 57 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 58 | 59 | # select the digits 0 and 2 in the E13B font 60 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 61 | 62 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 63 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 64 | 65 | # output only the character label and the image 66 | # output_feature_list = ['m_label_one_hot','image'] 67 | 68 | # identify the font given the input images 69 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 70 | 71 | # train the digits 0-9 for all fonts 72 | input_filters_dict = {'m_label': range(48,58)} 73 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 74 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 75 | output_feature_list=output_feature_list, 76 | test_size = .1, 77 | engine_type='tensorflow',dtype=dtype) 78 | nn = nnetwork.network(ds.train) 79 | nn.fit( ds.train, nEpochs=5000) 80 | nn.test(ds.test) 81 | 82 | else: 83 | # loop through all the fonts and train individually 84 | 85 | # pick up the entire list of fonts and font variants. Train each one. 86 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 87 | 88 | import pprint as pprint 89 | pp = pprint.PrettyPrinter(indent=4) 90 | pp.pprint(df1) 91 | 92 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 93 | 94 | # Change nEpochs to 5000 for better results. NOTE (editor): train_a_font is a legacy helper that is not defined in this file; see the inline read_data/fit/test loop in q5_tensorflow_residual.py for the equivalent code 95 | for l in df1: 96 | input_filters_dict= {'font': (l[0],)} 97 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 98 | 99 | 100 | print ('\n########################### No Errors ####################################') 101 | 102 | -------------------------------------------------------------------------------- /p110_scaling_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' scaling_features.py 3 | 4 | Investigate normalization versus standardization 5 | 6 | The features in the ocr_utils are already normalized. That is, each image 7 | has been stretched to go from pure black to pure white. The values in the 8 | .csv file are 0 to 255. ocr_utils.py changes these values to be in the range 9 | 0.0 to 1.0 10 | 11 | 1) print out a sampling of the normalized values. 12 | 2) standardize the values and print them out (reference formulas below).
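(Editor's reference note, added: min-max normalization maps a feature value x to

    x_norm = (x - x_min) / (x_max - x_min)

while standardization maps it to

    x_std = (x - mu) / sigma

with mu and sigma the mean and standard deviation of the feature, computed on the training set only and then reused on the test set, exactly as the fit_transform()/transform() pairs below do.)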
13 | 14 | Created on Jun 23, 2016 15 | 16 | from Python Machine Learning by Sebastian Raschka under the following license 17 | 18 | The MIT License (MIT) 19 | 20 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 21 | 22 | Permission is hereby granted, free of charge, to any person obtaining a copy 23 | of this software and associated documentation files (the "Software"), to deal 24 | in the Software without restriction, including without limitation the rights 25 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 26 | copies of the Software, and to permit persons to whom the Software is 27 | furnished to do so, subject to the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included in all 30 | copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 35 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 36 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 37 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38 | SOFTWARE. 39 | 40 | @author: richard lyman 41 | ''' 42 | import numpy as np 43 | import ocr_utils 44 | from sklearn.neighbors import KNeighborsClassifier 45 | 46 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0) 47 | 48 | # put the ASCII equivalent of the unique characters in y into the legend of the plot 49 | legend=[] 50 | for ys in np.unique(y_train): 51 | legend.append('{} \'{}\''.format(ys, chr(ys))) 52 | 53 | X_combined = np.vstack((X_train, X_test)) 54 | y_combined = np.hstack((y_train, y_test)) 55 | 56 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 57 | knn.fit(X_train, y_train) 58 | 59 | 60 | ocr_utils.plot_decision_regions(X=X_combined, 61 | y=y_combined, 62 | classifier=knn, 63 | labels=labels, 64 | test_idx=range(len(X_test),len(X_combined)), 65 | title='k_nearest_neighbors no scaling') 66 | 67 | from sklearn.preprocessing import MinMaxScaler 68 | 69 | mms = MinMaxScaler() 70 | X_train_norm = mms.fit_transform(X_train) 71 | X_test_norm = mms.transform(X_test) 72 | X_combined_norm = np.vstack((X_train_norm, X_test_norm)) 73 | 74 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 75 | knn.fit(X_train_norm, y_train) 76 | 77 | 78 | ocr_utils.plot_decision_regions(X=X_combined_norm, 79 | y=y_combined, 80 | classifier=knn, 81 | labels=labels, 82 | test_idx=range(len(X_test_norm),len(X_combined_norm)), 83 | title='k_nearest_neighbors MinMaxScaller') 84 | 85 | from sklearn.preprocessing import StandardScaler 86 | stdsc = StandardScaler() 87 | X_train_std = stdsc.fit_transform(X_train) 88 | X_test_std = stdsc.transform(X_test) 89 | 90 | sc = StandardScaler() 91 | sc.fit(X_train) 92 | X_train_std = sc.transform(X_train) 93 | X_test_std = sc.transform(X_test) 94 | X_combined_std = np.vstack((X_train_std, X_test_std)) 95 | 96 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 97 | knn.fit(X_train_std, y_train) 98 | 99 | ocr_utils.plot_decision_regions(X=X_combined_std, 100 | y=y_combined, 101 | classifier=knn, 102 | labels=labels, 103 | test_idx=range(len(X_test_std),len(X_combined_std)), 104 | title='k_nearest_neighbors 
Standard Normalized') 105 | 106 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q6_tensorflow_residual3x4.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | This graph has multiple sections 3 layers each, 400 100 400 followed 22 | by a fully connected layer. 23 | 24 | see tensor_flow_graph.png 25 | """# ============================================================================== 26 | import ocr_utils 27 | import datetime 28 | from collections import namedtuple 29 | import numpy as np 30 | import pandas as pd 31 | import n1_residual3x4 as nnetwork 32 | from tensorflow.compat import v1 as tf 33 | dtype = np.float32 34 | #with tf.device('/GPU:0'): 35 | #with tf.device('/cpu:0'): 36 | 37 | 38 | if True: 39 | # single font train 40 | 41 | # examples 42 | # select only images from 'OCRB' scanned font 43 | # input_filters_dict = {'font': ('OCRA',)} 44 | 45 | # select only images from 'HANDPRINT' font 46 | #input_filters_dict = {'font': ('HANDPRINT',)} 47 | 48 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 49 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 50 | 51 | # select everything; all fonts , font variants, etc. 
52 | # input_filters_dict = {} 53 | 54 | # select the digits 0 through 9 in the E13B font 55 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 56 | 57 | # select the digits 0 and 2 in the E13B font 58 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 59 | 60 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 61 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 62 | 63 | # output only the character label and the image 64 | # output_feature_list = ['m_label_one_hot','image'] 65 | 66 | # identify the font given the input images 67 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 68 | 69 | # train the digits 0-9 for all fonts 70 | #input_filters_dict = {'m_label': range(48,58)} 71 | input_filters_dict = {'font':'ARIAL','m_label': list(range(48,58))+list(range(65,91))+list(range(97,123))} 72 | #input_filters_dict = {} 73 | output_feature_list = ['m_label_one_hot','image'] 74 | 75 | """# ============================================================================== 76 | 77 | Train and Evaluate the Model 78 | 79 | """# ============================================================================== 80 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 81 | output_feature_list=output_feature_list, 82 | test_size = .1, 83 | engine_type='tensorflow',dtype=dtype) 84 | nn = nnetwork.network(ds.train) 85 | nn.fit( ds.train, nEpochs=5000) 86 | nn.test(ds.test) 87 | 88 | # train_a_font(input_filters_dict, output_feature_list, nEpochs = 50000) 89 | 90 | else: 91 | # loop through all the fonts and train individually 92 | 93 | # pick up the entire list of fonts and font variants. Train each one. 94 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 95 | 96 | import pprint as pprint 97 | pp = pprint.PrettyPrinter(indent=4) 98 | pp.pprint(df1) 99 | 100 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 101 | 102 | # Change nEpochs to 5000 for better results. NOTE (editor): train_a_font is a legacy helper that is not defined in this file; see the inline read_data/fit/test loop in q5_tensorflow_residual.py for the equivalent code 103 | for l in df1: 104 | input_filters_dict= {'font': (l[0],)} 105 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 106 | 107 | 108 | print ('\n########################### No Errors ####################################') 109 | 110 | -------------------------------------------------------------------------------- /p229_adaboost.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 10, 2016 3 | adaboost.py 4 | 5 | AdaBoost builds a classifier from weak learners such as decision tree stumps: 6 | it draws training set samples without replacement, trains a stump, finds the 7 | samples that are misclassified, adds another stump trained to handle 8 | those difficult samples, and updates the weights applied to each stump 9 | when computing the final prediction. 10 | 11 | It increasingly emphasizes the weights of misclassified (outlier) samples until 12 | the result is a sequence of weights and decision trees that handles those samples.
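(Editor's reference note, added; this is the classic two-class discrete AdaBoost update, which sklearn's SAMME algorithm generalizes: a stump with weighted error eps receives the coefficient

    alpha = 0.5 * ln((1 - eps) / eps)

and each sample weight is multiplied by exp(-alpha * y_i * h(x_i)) and then renormalized, so misclassified samples gain weight in the next round.)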
13 | 14 | from Python Machine Learning by Sebastian Raschka under the following license 15 | 16 | The MIT License (MIT) 17 | 18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 37 | 38 | @author: richard lyman 39 | ''' 40 | 41 | from sklearn.ensemble import AdaBoostClassifier 42 | 43 | import ocr_utils 44 | from sklearn.model_selection import train_test_split 45 | import numpy as np 46 | import matplotlib.pyplot as plt 47 | from sklearn.preprocessing import LabelEncoder 48 | 49 | charsToTrain=(48,51) 50 | nChars = 1000 51 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=(9,17), nChars=nChars) 52 | 53 | le = LabelEncoder() 54 | y = le.fit_transform(y) 55 | 56 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.40,random_state=1) 57 | 58 | 59 | from sklearn.tree import DecisionTreeClassifier 60 | from sklearn.metrics import accuracy_score 61 | tree = DecisionTreeClassifier(criterion='entropy', 62 | max_depth=1) 63 | 64 | ada = AdaBoostClassifier(base_estimator=tree, 65 | n_estimators=500, 66 | learning_rate=0.1, 67 | random_state=0) 68 | 69 | tree = tree.fit(X_train, y_train) 70 | y_train_pred = tree.predict(X_train) 71 | y_test_pred = tree.predict(X_test) 72 | 73 | tree_train = accuracy_score(y_train, y_train_pred) 74 | tree_test = accuracy_score(y_test, y_test_pred) 75 | print('Decision tree train/test accuracies %.3f/%.3f' 76 | % (tree_train, tree_test)) 77 | 78 | ada = ada.fit(X_train, y_train) 79 | y_train_pred = ada.predict(X_train) 80 | y_test_pred = ada.predict(X_test) 81 | 82 | ada_train = accuracy_score(y_train, y_train_pred) 83 | ada_test = accuracy_score(y_test, y_test_pred) 84 | print('AdaBoost train/test accuracies %.3f/%.3f' 85 | % (ada_train, ada_test)) 86 | 87 | 88 | 89 | x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1 90 | y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1 91 | xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/100), 92 | np.arange(y_min, y_max, (y_max-y_min)/100)) 93 | 94 | f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3)) 95 | 96 | title='AdaBoost' 97 | for idx, clf, tt in zip([0, 1], 98 | [tree, ada], 99 | ['Decision Tree', title]): 100 | clf.fit(X_train, y_train) 101 | 102 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 103 | Z = Z.reshape(xx.shape) 104 | 105 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 106 | 
axarr[idx].scatter(X_train[y_train==0, 0], 107 | X_train[y_train==0, 1], 108 | c='blue', marker='^') 109 | axarr[idx].scatter(X_train[y_train==1, 0], 110 | X_train[y_train==1, 1], 111 | c='red', marker='o') 112 | axarr[idx].set_title(tt) 113 | axarr[idx].set_ylabel(labels[0], fontsize=12) 114 | axarr[idx].set_xlabel(labels[1], fontsize=12) 115 | 116 | plt.tight_layout() 117 | 118 | ocr_utils.show_figures(plt, title) 119 | 120 | print ('\n########################### No Errors ####################################') 121 | -------------------------------------------------------------------------------- /p189_nested_cross_validation.py: -------------------------------------------------------------------------------- 1 | ''' nested_cross_validation.py 2 | Nested Cross Validation is a method for tuning model parameters while minimizing bias. 3 | 4 | There is an outer k-fold cross validation loop and an inner k-fold cross 5 | validation loop. 6 | 7 | The outer loop splits the data into a number, such as 10, of different training and 8 | test sets without replacement, so each sample ends up being used as a 9 | test sample exactly once. 10 | 11 | The inner fold uses the training portion of the outer fold, and does a 12 | Grid Search to select a classification model, such as a 'linear' versus an 'rbf' 13 | SVM kernel, or a Decision Tree versus an SVM. 14 | 15 | If the model is stable, then the inner loops should all choose the same 16 | classifier type. 17 | 18 | After selecting the classifier, the outer folds are used to estimate its 19 | performance via k-fold cross validation. 20 | 21 | This program nests sklearn's GridSearchCV (a 5-fold inner grid search) inside 22 | cross_val_score (a 5-fold outer loop) to tune and evaluate the parameters (see the note at the end of this docstring). 23 | 24 | 25 | Created on Jul 8, 2016 26 | 27 | from Python Machine Learning by Sebastian Raschka under the following license 28 | 29 | The MIT License (MIT) 30 | 31 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 32 | 33 | Permission is hereby granted, free of charge, to any person obtaining a copy 34 | of this software and associated documentation files (the "Software"), to deal 35 | in the Software without restriction, including without limitation the rights 36 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 37 | copies of the Software, and to permit persons to whom the Software is 38 | furnished to do so, subject to the following conditions: 39 | 40 | The above copyright notice and this permission notice shall be included in all 41 | copies or substantial portions of the Software. 42 | 43 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 44 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 45 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 46 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 47 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 48 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 49 | SOFTWARE.
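(Editor's note, added: in the code below, GridSearchCV(cv=5) is the inner loop and cross_val_score(..., cv=5) is the outer loop; each outer training fold is re-split five ways to pick hyperparameters, so every outer test fold scores a model that was tuned without ever seeing that fold.)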
50 | 51 | @author: richard lyman 52 | ''' 53 | 54 | import numpy as np 55 | import ocr_utils 56 | from sklearn.preprocessing import StandardScaler 57 | from sklearn.pipeline import Pipeline 58 | from sklearn.model_selection import cross_val_score 59 | from sklearn.model_selection import GridSearchCV 60 | from sklearn.svm import SVC 61 | if __name__ == '__main__': 62 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0) 63 | 64 | 65 | pipe_svc = Pipeline([('scl', StandardScaler()), 66 | ('clf', SVC(random_state=1))]) 67 | 68 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 69 | 70 | param_grid = [{'clf__C': c_gamma_range, 71 | 'clf__kernel': ['linear']}, 72 | {'clf__C': c_gamma_range, 73 | 'clf__gamma': c_gamma_range, 74 | 'clf__kernel': ['rbf'],}] 75 | 76 | gs = GridSearchCV(estimator=pipe_svc, 77 | param_grid=param_grid, 78 | scoring='accuracy', 79 | cv=5, 80 | n_jobs=-1) 81 | 82 | 83 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 84 | print('\nSupport Vector Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 85 | 86 | gs = gs.fit(X_train, y_train) 87 | print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_)) 88 | print('Support Vector Machine Grid Search best params: {}'.format(sorted(gs.best_params_.items()))) 89 | from sklearn.tree import DecisionTreeClassifier 90 | gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0), 91 | param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}], 92 | scoring='accuracy', 93 | cv=5) 94 | 95 | 96 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 97 | print('Decision Tree Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 98 | 99 | gs = gs.fit(X_train, y_train) 100 | print('Decision Tree Grid Search best score: {}'.format(gs.best_score_)) 101 | print('Decision Tree Grid Search best params: {}'.format(gs.best_params_)) 102 | 103 | print ('\n########################### No Errors ####################################') 104 | -------------------------------------------------------------------------------- /o4_image_to_image.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | This graph has multiple sections 3 layers each, 400 100 400 followed 22 | by a fully connected layer. 
23 | 24 | see tensor_flow_graph.png 25 | """# ============================================================================== 26 | import ocr_utils 27 | import datetime 28 | from collections import namedtuple 29 | import numpy as np 30 | import pandas as pd 31 | import n1_image_to_image as nnetwork 32 | #import n1_residual3x4 as nnetwork 33 | from tensorflow.compat import v1 as tf 34 | dtype = np.float32 35 | #with tf.device('/GPU:0'): 36 | #with tf.device('/cpu:0'): 37 | 38 | 39 | if True: 40 | # single font train 41 | 42 | # examples 43 | # select only images from 'OCRB' scanned font 44 | # input_filters_dict = {'font': ('OCRA',)} 45 | 46 | # select only images from 'HANDPRINT' font 47 | #input_filters_dict = {'font': ('HANDPRINT',)} 48 | 49 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 50 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 51 | 52 | # select everything; all fonts , font variants, etc. 53 | # input_filters_dict = {} 54 | 55 | # select the digits 0 through 9 in the E13B font 56 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 57 | 58 | # select the digits 0 and 2in the E13B font 59 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 60 | 61 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 62 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 63 | 64 | # output only the character label and the image 65 | # output_feature_list = ['m_label_one_hot','image'] 66 | 67 | # identify the font given the input images 68 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 69 | 70 | # train the digits 0-9 for all fonts 71 | input_filters_dict = {'m_label': [43]+list(range(48,58)),'italic':0,'strength':.4} 72 | #input_filters_dict = {'font':'BANKGOTHIC','m_label': list(range(48,58)),'italic':0,'strength':.7} 73 | #input_filters_dict = {} 74 | output_feature_list = ['low_pass_image','image'] 75 | 76 | """# ============================================================================== 77 | 78 | Train and Evaluate the Model 79 | 80 | """# ============================================================================== 81 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 82 | output_feature_list=output_feature_list, 83 | test_size = .2, 84 | engine_type='tensorflow',dtype=dtype) 85 | nn = nnetwork.network(ds.train) 86 | nn.fit_entropy( ds.train, nEpochs=5000) 87 | nn.test2(ds.test) 88 | 89 | # train_a_font(input_filters_dict, output_feature_list, nEpochs = 50000) 90 | 91 | else: 92 | # loop through all the fonts and train individually 93 | 94 | # pick up the entire list of fonts and font variants. Train each one. 
95 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 96 | 97 | import pprint as pprint 98 | pp = pprint.PrettyPrinter(indent=4) 99 | pp.pprint(df1) 100 | 101 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 102 | 103 | # Change nEpochs to 5000 for better results 104 | for l in df1: 105 | input_filters_dict= {'font': (l[0],)} 106 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 5000) 107 | 108 | 109 | print ('\n########################### No Errors ####################################') 110 | 111 | -------------------------------------------------------------------------------- /p86_decision_tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' decision_tree.py shows three impurity measures Gini, entropy, and 3 | misclassification error when used with a decision tree classifier. 4 | 5 | These measures are used to estimate the information gain at each split. 6 | 7 | 1) plot the 3 kinds of impurity measure 8 | 2) run the decision tree on the e13b data and plot the decision regions 9 | 10 | To create a drawing of the tree run: 11 | dot -Tpng tree.dot -o tree.png 12 | 13 | Created on Jun 23, 2016 14 | 15 | from Python Machine Learning by Sebastian Raschka under the following license 16 | 17 | The MIT License (MIT) 18 | 19 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining a copy 22 | of this software and associated documentation files (the "Software"), to deal 23 | in the Software without restriction, including without limitation the rights 24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 25 | copies of the Software, and to permit persons to whom the Software is 26 | furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all 29 | copies or substantial portions of the Software. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 | SOFTWARE. 
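(Editor's reference note, added; binary case with p = p(i=1), matching the gini(), entropy() and error() helpers below:

    Gini(p)    = p*(1-p) + (1-p)*p = 2p(1-p)
    Entropy(p) = -p*log2(p) - (1-p)*log2(1-p)
    Error(p)   = 1 - max(p, 1-p)

Entropy peaks at 1.0 and Gini at 0.5 when p = 0.5, which is why the plot scales entropy by 0.5 to compare the curve shapes.)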
38 | 39 | @author: richard lyman 40 | ''' 41 | import numpy as np 42 | import ocr_utils 43 | import matplotlib.pyplot as plt 44 | from sklearn.preprocessing import StandardScaler 45 | 46 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0) 47 | 48 | def gini(p): 49 | return (p)*(1 - (p)) + (1-p)*(1 - (1-p)) 50 | 51 | def entropy(p): 52 | return - p*np.log2(p) - (1 - p)*np.log2((1 - p)) 53 | 54 | def error(p): 55 | return 1 - np.max([p, 1 - p]) 56 | 57 | x = np.arange(0.0, 1.0, 0.01) 58 | 59 | ent = [entropy(p) if p != 0 else None for p in x] 60 | sc_ent = [e*0.5 if e else None for e in ent] 61 | err = [error(i) for i in x] 62 | 63 | 64 | fig = plt.figure() 65 | 66 | ax = plt.subplot(111) 67 | for i, lab, ls, c, in zip([ent, sc_ent, gini(x), err], 68 | ['Entropy', 'Entropy (scaled)', 69 | 'Gini Impurity', 'Misclassification Error'], 70 | ['-', '-', '--', '-.'], 71 | ['black', 'lightgray', 'red', 'green', 'cyan']): 72 | line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c) 73 | 74 | ax.legend(loc='upper center', ncol=2, fancybox=True, shadow=False) 75 | 76 | ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--') 77 | ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--') 78 | plt.ylim([0, 1.2]) 79 | plt.xlabel('p(i=1)') 80 | plt.ylabel('Impurity Index') 81 | plt.tight_layout() 82 | title='impurity' 83 | plt.title(title) 84 | 85 | ocr_utils.show_figures(plt,title=title) 86 | 87 | sc = StandardScaler() 88 | sc.fit(X_train) 89 | X_train_std = sc.transform(X_train) 90 | X_test_std = sc.transform(X_test) 91 | X_combined_std = np.vstack((X_train_std, X_test_std)) 92 | y_combined = np.hstack((y_train, y_test)) 93 | 94 | from sklearn.tree import DecisionTreeClassifier 95 | 96 | tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0) 97 | tree.fit(X_train, y_train) 98 | 99 | X_combined = np.vstack((X_train, X_test)) 100 | y_combined = np.hstack((y_train, y_test)) 101 | ocr_utils.plot_decision_regions(X=X_combined, 102 | y=y_combined, 103 | classifier=tree, 104 | test_idx=range(len(X_test),len(X_combined)), 105 | labels=labels, 106 | title='decision tree entropy') 107 | 108 | 109 | from sklearn.ensemble import RandomForestClassifier 110 | 111 | forest = RandomForestClassifier(criterion='entropy', 112 | n_estimators=10, 113 | random_state=1, 114 | n_jobs=2) 115 | forest.fit(X_train, y_train) 116 | 117 | ocr_utils.plot_decision_regions(X=X_combined, 118 | y=y_combined, 119 | classifier=forest, 120 | labels=labels, 121 | test_idx=range(len(X_test_std),len(X_combined_std)), 122 | title='random_forest') 123 | 124 | 125 | 126 | 127 | 128 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q5_tensorflow_residual.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | The tutorial version of the program is modified in order to send some 22 | features directly to the fully connected layer, thus bypassing the 23 | convolution layer. 24 | 25 | It has TWO convolution layers and THREE fully connected (2048 node) layers 26 | 27 | Images go through convolution. Everything else bypasses. 28 | 29 | see tensor_flow_graph.png 30 | """# ============================================================================== 31 | import ocr_utils 32 | import datetime 33 | from collections import namedtuple 34 | import numpy as np 35 | import pandas as pd 36 | import n1_2cnv1fc as nnetwork 37 | 38 | # import tensorflow as tf 39 | 40 | dtype = np.float32 41 | #with tf.device('/GPU:0'): 42 | #with tf.device('/cpu:0'): 43 | 44 | 45 | if False: 46 | # single font train 47 | 48 | # examples 49 | # select only images from 'OCRB' scanned font 50 | # input_filters_dict = {'font': ('OCRA',)} 51 | 52 | # select only images from 'HANDPRINT' font 53 | #input_filters_dict = {'font': ('HANDPRINT',)} 54 | 55 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 56 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 57 | 58 | # select everything; all fonts , font variants, etc. 59 | # input_filters_dict = {} 60 | 61 | # select the digits 0 through 9 in the E13B font 62 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 63 | 64 | # select the digits 0 and 2in the E13B font 65 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 66 | 67 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 68 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 69 | 70 | # output only the character label and the image 71 | # output_feature_list = ['m_label_one_hot','image'] 72 | 73 | # identify the font given the input images 74 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 75 | 76 | # train the digits 0-9 for all fonts 77 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))+list(range(97,123)),'fontVariant':'scanned'} 78 | #input_filters_dict = {} 79 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 80 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 81 | output_feature_list=output_feature_list, 82 | test_size = .1, 83 | engine_type='tensorflow',dtype=dtype) 84 | nn = nnetwork.network( ds.train) 85 | nn.fit( ds.train, nEpochs=5000) 86 | nn.test(ds.test) 87 | 88 | else: 89 | # loop through all the fonts and train individually 90 | 91 | # pick up the entire list of fonts and font variants. Train each one. 
92 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 93 | 94 | import pprint as pprint 95 | pp = pprint.PrettyPrinter(indent=4) 96 | pp.pprint(df1) 97 | 98 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 99 | 100 | # Change nEpochs to 5000 for better results 101 | for l in df1: 102 | #input_filters_dict= {'font': (l[0],)} 103 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))+list(range(97,123)),'font': (l[0],)} 104 | #train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 105 | 106 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 107 | output_feature_list=output_feature_list, 108 | test_size = .1, 109 | engine_type='tensorflow',dtype=dtype) 110 | 111 | nn = nnetwork.network(ds.train) 112 | nn.fit( ds.train, nEpochs=5000) 113 | nn.test(ds.test, title = l[0] ) 114 | nn.reset_graph() 115 | 116 | 117 | 118 | 119 | print ('\n########################### No Errors ####################################') 120 | 121 | -------------------------------------------------------------------------------- /p221_bagging_bootstrap_samples.py: -------------------------------------------------------------------------------- 1 | '''bagging_bootstrap_samples.py 2 | 3 | Bagging draws samples with replacement in order to train classifiers that are 4 | then combined by majority voting (see the note at the end of this docstring). 5 | 6 | 7 | Created on Jul 10, 2016 8 | 9 | from Python Machine Learning by Sebastian Raschka under the following license 10 | 11 | The MIT License (MIT) 12 | 13 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE.
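(Editor's reference note, added: a bootstrap sample of size n drawn with replacement omits any given training sample with probability (1 - 1/n)^n, which approaches 1/e ≈ 0.368 for large n, so each of the 500 trees below sees roughly 63% of the distinct training samples.)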
32 | 33 | @author: richard lyman 34 | ''' 35 | 36 | from sklearn.preprocessing import LabelEncoder 37 | import ocr_utils 38 | from sklearn.model_selection import train_test_split 39 | import numpy as np 40 | import matplotlib.pyplot as plt 41 | if __name__ == '__main__': 42 | 43 | charsToTrain=(48,51) 44 | nChars = 1000 45 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=(9,17), nChars=nChars) 46 | 47 | le = LabelEncoder() 48 | y = le.fit_transform(y) 49 | 50 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.40,random_state=1) 51 | 52 | from sklearn.ensemble import BaggingClassifier 53 | from sklearn.tree import DecisionTreeClassifier 54 | 55 | tree = DecisionTreeClassifier(criterion='entropy', 56 | max_depth=None) 57 | 58 | bag = BaggingClassifier(base_estimator=tree, 59 | n_estimators=500, 60 | max_samples=1.0, 61 | max_features=1.0, 62 | bootstrap=True, 63 | bootstrap_features=False, 64 | n_jobs=-1, 65 | random_state=1) 66 | 67 | from sklearn.metrics import accuracy_score 68 | 69 | tree = tree.fit(X_train, y_train) 70 | y_train_pred = tree.predict(X_train) 71 | y_test_pred = tree.predict(X_test) 72 | 73 | tree_train = accuracy_score(y_train, y_train_pred) 74 | tree_test = accuracy_score(y_test, y_test_pred) 75 | print('Decision tree train/test accuracies %.3f/%.3f' 76 | % (tree_train, tree_test)) 77 | 78 | bag = bag.fit(X_train, y_train) 79 | y_train_pred = bag.predict(X_train) 80 | y_test_pred = bag.predict(X_test) 81 | 82 | bag_train = accuracy_score(y_train, y_train_pred) 83 | bag_test = accuracy_score(y_test, y_test_pred) 84 | print('Bagging train/test accuracies %.3f/%.3f' 85 | % (bag_train, bag_test)) 86 | 87 | x_min = X_train[:, 0].min() - 1 88 | x_max = X_train[:, 0].max() + 1 89 | y_min = X_train[:, 1].min() - 1 90 | y_max = X_train[:, 1].max() + 1 91 | 92 | xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/100), 93 | np.arange(y_min, y_max, (y_max-y_min)/100)) 94 | 95 | f, axarr = plt.subplots(nrows=1, ncols=2, 96 | sharex='col', 97 | sharey='row', 98 | figsize=(8, 3)) 99 | 100 | 101 | for idx, clf, tt in zip([0, 1], 102 | [tree, bag], 103 | ['Decision Tree', 'Bagging']): 104 | clf.fit(X_train, y_train) 105 | 106 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 107 | Z = Z.reshape(xx.shape) 108 | 109 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 110 | axarr[idx].scatter(X_train[y_train==0, 0], 111 | X_train[y_train==0, 1], 112 | c='blue', marker='^') 113 | 114 | axarr[idx].scatter(X_train[y_train==1, 0], 115 | X_train[y_train==1, 1], 116 | c='red', marker='o') 117 | 118 | axarr[idx].set_title(tt) 119 | axarr[idx].set_ylabel(labels[0], fontsize=12) 120 | axarr[idx].set_xlabel(labels[1], fontsize=12) 121 | # plt.text(10.2, -1.2, 122 | # s='Hue', 123 | # ha='center', va='center', fontsize=12) 124 | 125 | plt.tight_layout() 126 | title='Bagging' 127 | #plt.savefig('./figures/bagging_region.png', 128 | # dpi=300, 129 | # bbox_inches='tight') 130 | ocr_utils.show_figures(plt, title) 131 | 132 | print ('\n########################### No Errors ####################################') 133 | -------------------------------------------------------------------------------- /p115_l1_l2_regularization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' l1_l2_regularization.py 3 | 4 | Show the effects of l1 versus l2 regularization.
5 | l1 introduces a weight penalty equal to the sum of the absolute weights 6 | times a given factor, lambda. 7 | l1 tends to drive a number of weights to zero and thus yields a 8 | sparse weight matrix. 9 | 10 | l2 introduces a weight penalty equal to the sum of squares of the 11 | weights times lambda. 12 | l2 tends to reduce the size of the weights but does not drive 13 | them to 0. 14 | 15 | 16 | 1) get the data for all column sums in the E13B database 17 | 2) run logistic regression with both l1 and l2 regularization, printing 18 | out the accuracies and a sampling of the coefficients. 19 | Show how the weights respond to the regularization strength (see the reference note at the end of this docstring). 20 | 21 | 22 | 23 | Created on Jun 23, 2016 24 | 25 | from Python Machine Learning by Sebastian Raschka under the following license 26 | 27 | The MIT License (MIT) 28 | 29 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 30 | 31 | Permission is hereby granted, free of charge, to any person obtaining a copy 32 | of this software and associated documentation files (the "Software"), to deal 33 | in the Software without restriction, including without limitation the rights 34 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 35 | copies of the Software, and to permit persons to whom the Software is 36 | furnished to do so, subject to the following conditions: 37 | 38 | The above copyright notice and this permission notice shall be included in all 39 | copies or substantial portions of the Software. 40 | 41 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 44 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 45 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 46 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 47 | SOFTWARE.
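(Editor's reference note, added: the penalized costs have the form

    J(w) + lambda * sum_j |w_j|       for l1
    J(w) + lambda * sum_j w_j**2      for l2

and scikit-learn's LogisticRegression exposes the strength as C = 1/lambda, so the C=0.1 used below is fairly strong regularization and the 10**c sweep in weight_graph() progressively relaxes it.)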
48 | 49 | @author: richard lyman 50 | ''' 51 | import numpy as np 52 | import ocr_utils 53 | 54 | columnsXY = range(0,20) 55 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=columnsXY , test_size=0.3, nChars=1000, random_state=0) 56 | 57 | from sklearn.preprocessing import StandardScaler 58 | stdsc = StandardScaler() 59 | X_train_std = stdsc.fit_transform(X_train) 60 | X_test_std = stdsc.transform(X_test) 61 | 62 | sc = StandardScaler() 63 | sc.fit(X_train) 64 | X_train_std = sc.transform(X_train) 65 | X_test_std = sc.transform(X_test) 66 | X_combined_std = np.vstack((X_train_std, X_test_std)) 67 | y_combined = np.hstack((y_train, y_test)) 68 | 69 | 70 | from sklearn.linear_model import LogisticRegression 71 | 72 | lr = LogisticRegression(penalty='l1', C=0.1, random_state=0, solver='liblinear',multi_class='auto') 73 | lr.fit(X_train_std, y_train) 74 | print('Training accuracy-l1 regularization:', lr.score(X_train_std, y_train)) 75 | print('Test accuracy-l1 regularization:', lr.score(X_test_std, y_test)) 76 | print('lr.intercept_ L1 regularization') 77 | print('\t{}'.format(lr.intercept_)) 78 | print('lr.coef_ L1 regularization') 79 | print('\t{}'.format(lr.coef_)) 80 | 81 | 82 | lr = LogisticRegression(penalty='l2', C=0.1, random_state=0, solver='liblinear',multi_class='auto') 83 | lr.fit(X_train_std, y_train) 84 | print('Training accuracy-l2 regularization:', lr.score(X_train_std, y_train)) 85 | print('Test accuracy-l2 regularization:', lr.score(X_test_std, y_test)) 86 | print('lr.intercept L2 regularization') 87 | print('\t{}'.format(lr.intercept_)) 88 | print('lr.coef_ L2 regularization') 89 | print('\t{}'.format(lr.coef_)) 90 | 91 | import matplotlib.pyplot as plt 92 | 93 | fig = plt.figure() 94 | ax = plt.subplot(111) 95 | 96 | colors = ['blue', 'green', 'red', 'cyan', 97 | 'magenta', 'yellow', 'black', 98 | 'pink', 'lightgreen', 'lightblue', 99 | 'gray', 'indigo', 'orange'] 100 | 101 | def weight_graph(regularization = 'l1'): 102 | weights, params = [], [] 103 | for c in np.arange(0, 6): 104 | lr = LogisticRegression(penalty=regularization, C=10**c, random_state=0, solver='liblinear',multi_class='auto') 105 | lr.fit(X_train_std, y_train) 106 | weights.append(lr.coef_[1]) 107 | params.append(10**c) 108 | 109 | weights = np.array(weights) 110 | 111 | for column, color in zip(range(weights.shape[1]), colors): 112 | plt.plot(params, weights[:, column], 113 | label=columnsXY[column+1], 114 | color=color) 115 | 116 | 117 | plt.axhline(0, color='black', linestyle='--', linewidth=3) 118 | plt.xlim([10**(-5), 10**5]) 119 | plt.ylabel('weight coefficient') 120 | plt.xlabel('C') 121 | plt.xscale('log') 122 | title = 'regularization {}'.format(regularization) 123 | plt.title(title) 124 | plt.legend(loc='upper left') 125 | ax.legend(loc='upper center', 126 | bbox_to_anchor=(1.38, 1.03), 127 | ncol=1, fancybox=True) 128 | ocr_utils.show_figures(plt,title + ' path') 129 | 130 | weight_graph(regularization = 'l1') 131 | weight_graph(regularization = 'l2') 132 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /p25_perceptron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | '''perceptron is a Python implementation of the Rosenblatt perceptron which 3 | uses a non-differentiatable unit step function as the activation function. 
4 | 5 | The target classes such as the characters '0' and '1' are changed to -1 and 1 6 | The difference between target value and the predicted target value multiplied 7 | by a small 'eta' value is an update value used for adjusting the weights. 8 | 9 | The weights are adjusted by the update value times the image. 10 | 11 | This eventually converges the weights to value that can provide a good 12 | prediction of new images. 13 | 14 | The misclassification versus Epochs and the resulting decision regions 15 | are plotted. 16 | 17 | Created on Jun 20, 2016 18 | 19 | from Python Machine Learning by Sebastian Raschka under the following license 20 | 21 | The MIT License (MIT) 22 | 23 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 24 | 25 | Permission is hereby granted, free of charge, to any person obtaining a copy 26 | of this software and associated documentation files (the "Software"), to deal 27 | in the Software without restriction, including without limitation the rights 28 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 29 | copies of the Software, and to permit persons to whom the Software is 30 | furnished to do so, subject to the following conditions: 31 | 32 | The above copyright notice and this permission notice shall be included in all 33 | copies or substantial portions of the Software. 34 | 35 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 36 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 37 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 38 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 39 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 40 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 | SOFTWARE. 42 | 43 | @author: richard lyman 44 | ''' 45 | import ocr_utils 46 | import numpy as np 47 | import matplotlib.pyplot as plt 48 | 49 | ############################################################################# 50 | # read features and scatter plot 51 | 52 | # retrieve 500 sets of target numbers and column sums 53 | # y: the ascii characters 48 and 49 ('0', '1') 54 | # X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17 55 | ascii_characters_to_train=(48,49) 56 | columnsXY = (9,17) 57 | nchars=500 58 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=nchars) 59 | 60 | 61 | ############################################################################# 62 | # Perceptron implementation from Python Machine Learning 63 | class Perceptron(object): 64 | """Perceptron classifier. 65 | 66 | Parameters 67 | ------------ 68 | eta : float 69 | Learning rate (between 0.0 and 1.0) 70 | n_iter : int 71 | Passes over the training dataset. 72 | 73 | Attributes 74 | ----------- 75 | w_ : 1d-array 76 | Weights after fitting. 77 | errors_ : list 78 | Number of misclassifications in every epoch. 79 | 80 | """ 81 | def __init__(self, eta=0.01, n_iter=10): 82 | self.eta = eta 83 | self.n_iter = n_iter 84 | 85 | def fit(self, X, y): 86 | """Fit training data. 87 | 88 | Parameters 89 | ---------- 90 | X : {array-like}, shape = [n_samples, n_features] 91 | Training vectors, where n_samples is the number of samples and 92 | n_features is the number of features. 93 | y : array-like, shape = [n_samples] 94 | Target values. 
95 | 
96 |         Returns
97 |         -------
98 |         self : object
99 | 
100 |         """
101 |         self.w_ = np.zeros(1 + X.shape[1])
102 |         self.errors_ = []
103 | 
104 |         for _ in range(self.n_iter):
105 |             errors = 0
106 |             for xi, target in zip(X, y):
107 |                 update = self.eta * (target - self.predict(xi))
108 |                 self.w_[1:] += update * xi
109 |                 self.w_[0] += update
110 |                 errors += int(update != 0.0)
111 |             self.errors_.append(errors)
112 |         return self
113 | 
114 |     def net_input(self, X):
115 |         """Calculate net input"""
116 |         return np.dot(X, self.w_[1:]) + self.w_[0]
117 | 
118 |     def predict(self, X):
119 |         """Return class label after unit step"""
120 |         return np.where(self.net_input(X) >= 0.0, 1, -1)
121 | 
122 | #############################################################################
123 | # convert targets ('0','1') to -1,+1
124 | # fit (train) the Perceptron
125 | # plot the misclassifications versus Epochs
126 | # plot the decision regions
127 | 
128 | y = np.where(y == ascii_characters_to_train[0], -1, 1)
129 | ppn = Perceptron(eta=0.1, n_iter=10)
130 | ppn.fit(X, y)
131 | 
132 | title = 'Simple Perceptron'
133 | plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
134 | plt.xlabel('Epochs')
135 | plt.ylabel('Number of misclassifications')
136 | plt.title(title)
137 | plt.tight_layout()
138 | ocr_utils.show_figures(plt, title)
139 | 
140 | ocr_utils.plot_decision_regions(X=X,
141 |                                 y=y,
142 |                                 classifier=ppn,
143 |                                 labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))],
144 |                                 title="Perceptron Decision Regions")
145 | 
146 | 
147 | 
148 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p119_squential_backward_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | ''' sequential backward selection
3 | 
4 | In order to reduce the complexity of the model, the number of features
5 | can be reduced by Sequential Backward Selection
6 | 
7 | The E13B dataset has 20 column sums, one for each column in the original
8 | images. Only a few of these would be needed to produce a good
9 | fit.
10 | 
11 | The SBS algorithm removes features by repeatedly running a fit of the data,
12 | selecting the feature for removal that makes the least difference to the
13 | accuracy of the fit.
14 | 
15 | 
16 | Created on Jun 23, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 40 | SOFTWARE. 41 | 42 | @author: richard lyman 43 | ''' 44 | import numpy as np 45 | import ocr_utils 46 | import matplotlib.pyplot as plt 47 | 48 | 49 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, random_state=0) 50 | 51 | 52 | from sklearn.model_selection import train_test_split 53 | 54 | X_train, X_test, y_train, y_test = train_test_split( 55 | X, y, test_size=0.3, random_state=0) 56 | 57 | from sklearn.preprocessing import StandardScaler 58 | stdsc = StandardScaler() 59 | X_train_std = stdsc.fit_transform(X_train) 60 | X_test_std = stdsc.transform(X_test) 61 | 62 | sc = StandardScaler() 63 | sc.fit(X_train) 64 | X_train_std = sc.transform(X_train) 65 | X_test_std = sc.transform(X_test) 66 | X_combined_std = np.vstack((X_train_std, X_test_std)) 67 | y_combined = np.hstack((y_train, y_test)) 68 | 69 | from sklearn.base import clone 70 | from itertools import combinations 71 | 72 | from sklearn.metrics import accuracy_score 73 | 74 | class SBS(): 75 | def __init__(self, estimator, k_features, 76 | scoring=accuracy_score, 77 | test_size=0.25, random_state=1): 78 | self.scoring = scoring 79 | self.estimator = clone(estimator) 80 | self.k_features = k_features 81 | self.test_size = test_size 82 | self.random_state = random_state 83 | 84 | def fit(self, X, y): 85 | X_train, X_test, y_train, y_test = \ 86 | train_test_split(X, y, test_size=self.test_size, 87 | random_state=self.random_state) 88 | dim = X_train.shape[1] 89 | self.indices_ = tuple(range(dim)) 90 | self.subsets_ = [self.indices_] 91 | score = self._calc_score(X_train, y_train, 92 | X_test, y_test, self.indices_) 93 | self.scores_ = [score] 94 | while dim > self.k_features: 95 | scores = [] 96 | subsets = [] 97 | for p in combinations(self.indices_, r=dim-1): 98 | score = self._calc_score(X_train, y_train, 99 | X_test, y_test, p) 100 | scores.append(score) 101 | subsets.append(p) 102 | best = np.argmax(scores) 103 | self.indices_ = subsets[best] 104 | self.subsets_.append(self.indices_) 105 | dim -= 1 106 | self.scores_.append(scores[best]) 107 | self.k_score_ = self.scores_[-1] 108 | return self 109 | def transform(self, X): 110 | return X[:, self.indices_] 111 | 112 | def _calc_score(self, X_train, y_train, 113 | X_test, y_test, indices): 114 | self.estimator.fit(X_train[:, indices], y_train) 115 | y_pred = self.estimator.predict(X_test[:, indices]) 116 | score = self.scoring(y_test, y_pred) 117 | return score 118 | 119 | from sklearn.neighbors import KNeighborsClassifier 120 | 121 | 122 | knn = KNeighborsClassifier(n_neighbors=2) 123 | 124 | # selecting features 125 | sbs = SBS(knn, k_features=1) 126 | sbs.fit(X_train_std, y_train) 127 | 128 | # plotting performance of feature subsets 129 | k_feat = [len(k) for k in sbs.subsets_] 130 | 131 | title='Sequential Backward Selection' 132 | plt.plot(k_feat, sbs.scores_, marker='o') 133 | plt.ylim([0.7, 1.1]) 134 | plt.ylabel('Accuracy') 135 | plt.xlabel('Number of features') 136 | plt.grid() 137 | plt.title(title) 138 | plt.tight_layout() 139 | ocr_utils.show_figures(plt,title) 140 | 141 | best=10 142 | k5 = list(sbs.subsets_[best]) 143 | print('The best {} column_sums'.format(best)) 144 | for s in k5: 145 | print(labels[s]) 146 | print() 147 | 148 | 149 
| knn.fit(X_train_std, y_train)
150 | print('Training accuracy using all features:', knn.score(X_train_std, y_train))
151 | print('Test accuracy using all features:', knn.score(X_test_std, y_test))
152 | 
153 | 
154 | knn.fit(X_train_std[:, k5], y_train)
155 | print('Training accuracy using {} features:'.format(best), knn.score(X_train_std[:, k5], y_train))
156 | print('Test accuracy using {} features:'.format(best), knn.score(X_test_std[:, k5], y_test))
157 | 
158 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p181_learning_curves.py:
--------------------------------------------------------------------------------
1 | '''
2 | learning_curves.py
3 | 
4 | The learning curve shows the training and test accuracies versus the number
5 | of training samples. This can be used to determine if there are enough
6 | training samples to sufficiently fit the training data. If the curves
7 | flatten out then there are enough samples. If there is a big difference
8 | between the training and test accuracies then underfitting or overfitting
9 | can be diagnosed.
10 | 
11 | The validation curve can be used to test a range of an estimator
12 | parameter, such as the inverse regularization parameter C, to see how the
13 | training and validation accuracies vary. This can be used to pick a value
14 | of C that reduces overfitting and underfitting.
15 | 
16 | Created on Jul 5, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 | SOFTWARE.
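A hedged sketch of the array shapes involved (matching the call below, which
uses ten training-set sizes and cv=10):

    train_sizes, train_scores, test_scores = learning_curve(...)
    # train_scores has shape (n_sizes, n_folds) = (10, 10); averaging over
    # axis=1, as done below, yields one mean accuracy per training-set size
    train_mean = np.mean(train_scores, axis=1)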
41 | 
42 | @author: richard lyman
43 | '''
44 | import matplotlib.pyplot as plt
45 | from sklearn.model_selection import learning_curve
46 | import numpy as np
47 | import ocr_utils
48 | from sklearn.preprocessing import StandardScaler
49 | from sklearn.linear_model import LogisticRegression
50 | from sklearn.pipeline import Pipeline
51 | 
52 | if __name__ == '__main__':
53 | 
54 | 
55 |     y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0)
56 | 
57 |     pipe_lr = Pipeline([('scl', StandardScaler()),
58 |                         ('clf', LogisticRegression(penalty='l2', random_state=0, solver='lbfgs'))])
59 | 
60 |     train_sizes, train_scores, test_scores =\
61 |         learning_curve(estimator=pipe_lr,
62 |                        X=X_train,
63 |                        y=y_train,
64 |                        train_sizes=np.linspace(0.1, 1.0, 10),
65 |                        cv=10,
66 |                        n_jobs=8)
67 | 
68 |     train_mean = np.mean(train_scores, axis=1)
69 |     train_std = np.std(train_scores, axis=1)
70 |     test_mean = np.mean(test_scores, axis=1)
71 |     test_std = np.std(test_scores, axis=1)
72 | 
73 |     plt.plot(train_sizes, train_mean,
74 |              color='blue', marker='o',
75 |              markersize=5, label='training accuracy')
76 | 
77 |     plt.fill_between(train_sizes,
78 |                      train_mean + train_std,
79 |                      train_mean - train_std,
80 |                      alpha=0.15, color='blue')
81 | 
82 |     plt.plot(train_sizes, test_mean,
83 |              color='green', linestyle='--',
84 |              marker='s', markersize=5,
85 |              label='validation accuracy')
86 | 
87 |     plt.fill_between(train_sizes,
88 |                      test_mean + test_std,
89 |                      test_mean - test_std,
90 |                      alpha=0.15, color='green')
91 | 
92 |     plt.grid()
93 |     plt.xlabel('Number of training samples')
94 |     plt.ylabel('Accuracy')
95 |     plt.legend(loc='lower right')
96 |     plt.ylim([0.8, 1.0])
97 |     title='learning_curve'
98 |     plt.title(title)
99 |     plt.tight_layout()
100 |     ocr_utils.show_figures(plt,title)
101 | 
102 |     from sklearn.model_selection import validation_curve
103 | 
104 |     param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
105 |     train_scores, test_scores = validation_curve(
106 |                     estimator=pipe_lr,
107 |                     X=X_train,
108 |                     y=y_train,
109 |                     param_name='clf__C',
110 |                     param_range=param_range,
111 |                     cv=10,
112 |                     n_jobs=8)
113 | 
114 |     train_mean = np.mean(train_scores, axis=1)
115 |     train_std = np.std(train_scores, axis=1)
116 |     test_mean = np.mean(test_scores, axis=1)
117 |     test_std = np.std(test_scores, axis=1)
118 | 
119 |     plt.plot(param_range, train_mean,
120 |              color='blue', marker='o',
121 |              markersize=5, label='training accuracy')
122 | 
123 |     plt.fill_between(param_range, train_mean + train_std,
124 |                      train_mean - train_std, alpha=0.15,
125 |                      color='blue')
126 | 
127 |     plt.plot(param_range, test_mean,
128 |              color='green', linestyle='--',
129 |              marker='s', markersize=5,
130 |              label='validation accuracy')
131 | 
132 |     plt.fill_between(param_range,
133 |                      test_mean + test_std,
134 |                      test_mean - test_std,
135 |                      alpha=0.15, color='green')
136 | 
137 |     plt.grid()
138 |     plt.xscale('log')
139 |     plt.legend(loc='lower right')
140 |     plt.xlabel('Parameter C')
141 |     plt.ylabel('Accuracy')
142 |     plt.ylim([0.8, 1.0])
143 |     title='validation_curve'
144 |     plt.title(title)
145 |     plt.tight_layout()
146 |     ocr_utils.show_figures(plt,title)
147 | 
148 |     print ('\n########################### No Errors ####################################')
149 | 
150 | 
--------------------------------------------------------------------------------
/p62_logistic_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | ''' logistic_regression.py replaces the activation function by
the
3 | logistic function, also known as the sigmoid function.
4 | The logistic function starts with the racetrack odds ratio p/(1-p).
5 | The logit function is the log of this ratio.
6 | Inverting the logit function and solving for p yields the logistic function.
7 | The output predicts the probability of an input sample belonging to
8 | a class label.
9 | This gives an estimate of the probability that an input set of features
10 | belongs to a target class.
11 | Works best when classes are linearly separable.
12 | For multiclass problems, when there are more than 2 classes, it uses One versus Rest.
13 | The effect of the regularization parameter in the logistic regression is
14 | shown.
15 | 
16 | 1) Plot the logistic (sigmoid) function
17 | 2) Create the cost function to be minimized by using the negative of the
18 | log likelihood function.
19 | 3) Plot the two curves that make up the cost function, one for a target y
20 | that equals 1 and one for a target y that equals 0.
21 | Use the sklearn package to fit the input data to a single perceptron using
22 | the logistic function for an activation function.
23 | 4) Plot the decision regions for 3 target classes from the E13B training set
24 | 5) Plot the weight coefficients using two different regularization values
25 | 
26 | Created on Jun 23, 2016
27 | 
28 | from Python Machine Learning by Sebastian Raschka under the following license
29 | 
30 | The MIT License (MIT)
31 | 
32 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
33 | 
34 | Permission is hereby granted, free of charge, to any person obtaining a copy
35 | of this software and associated documentation files (the "Software"), to deal
36 | in the Software without restriction, including without limitation the rights
37 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38 | copies of the Software, and to permit persons to whom the Software is
39 | furnished to do so, subject to the following conditions:
40 | 
41 | The above copyright notice and this permission notice shall be included in all
42 | copies or substantial portions of the Software.
43 | 
44 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
50 | SOFTWARE.
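As a short worked check (consistent with the sigmoid defined below):
logit(p) = log(p / (1 - p)), and inverting it for p gives the logistic
function phi(z) = 1 / (1 + exp(-z)), so phi(0) = 0.5, i.e. even odds.

    import numpy as np
    phi_0 = 1.0 / (1.0 + np.exp(-0.0))   # 0.5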
51 | 52 | @author: richard lyman 53 | ''' 54 | import numpy as np 55 | import ocr_utils 56 | import matplotlib.pyplot as plt 57 | from sklearn.preprocessing import StandardScaler 58 | from sklearn.linear_model import LogisticRegression 59 | from sklearn.model_selection import train_test_split 60 | 61 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17),nChars=500) 62 | 63 | def sigmoid(z): 64 | return 1.0 / (1.0 + np.exp(-z)) 65 | 66 | z = np.arange(-7, 7, 0.1) 67 | phi_z = sigmoid(z) 68 | title='sigmoid' 69 | plt.plot(z, phi_z) 70 | plt.axvline(0.0, color='k') 71 | plt.axhspan(0.0, 1.0, facecolor='1.0', alpha=1.0, ls='dotted') 72 | plt.axhline(y=0.5, ls='dotted', color='k') 73 | plt.yticks([0.0, 0.5, 1.0]) 74 | plt.ylim(-0.1, 1.1) 75 | plt.xlabel('z') 76 | plt.ylabel('$\phi (z)$') 77 | plt.title(title) 78 | ocr_utils.show_figures(plt,title=title) 79 | 80 | def cost_1(z): 81 | return - np.log(sigmoid(z)) 82 | 83 | def cost_0(z): 84 | return - np.log(1 - sigmoid(z)) 85 | 86 | z = np.arange(-10, 10, 0.1) 87 | phi_z = sigmoid(z) 88 | 89 | c1 = [cost_1(x) for x in z] 90 | plt.plot(phi_z, c1, label='J(w) if y=1') 91 | 92 | c0 = [cost_0(x) for x in z] 93 | plt.plot(phi_z, c0, linestyle='--', label='J(w) if y=0') 94 | title='log cost' 95 | plt.ylim(0.0, 5.1) 96 | plt.xlim([0, 1]) 97 | plt.xlabel('$\phi$(z)') 98 | plt.ylabel('J(w)') 99 | plt.legend(loc='best') 100 | plt.title(title) 101 | plt.tight_layout() 102 | ocr_utils.show_figures(plt,title=title) 103 | 104 | X_train, X_test, y_train, y_test = train_test_split( 105 | X, y, test_size=0.3, random_state=0) 106 | 107 | sc = StandardScaler() 108 | sc.fit(X_train) 109 | X_train_std = sc.transform(X_train) 110 | X_test_std = sc.transform(X_test) 111 | lr = LogisticRegression(C=1000.0, random_state=0, solver='lbfgs',multi_class='auto') 112 | lr.fit(X_train_std, y_train) 113 | X_combined_std = np.vstack((X_train_std, X_test_std)) 114 | y_combined = np.hstack((y_train, y_test)) 115 | ocr_utils.plot_decision_regions( 116 | X=X_combined_std, 117 | y=y_combined, 118 | classifier=lr, 119 | labels = labels, 120 | test_idx=range(len(X_train_std),len(X_combined_std)), 121 | title='logistic_regression') 122 | 123 | 124 | weights, params = [], [] 125 | for c in np.arange(0, 5): 126 | lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') 127 | lr.fit(X_train_std, y_train) 128 | weights.append(lr.coef_[0]) 129 | params.append(10**c) 130 | 131 | 132 | title = 'regression_path' 133 | weights, params = [], [] 134 | for c in np.arange(0, 5): 135 | lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') 136 | lr.fit(X_train_std, y_train) 137 | weights.append(lr.coef_[1]) 138 | params.append(10**c) 139 | 140 | weights = np.array(weights) 141 | plt.plot(params, weights[:, 0], 142 | label=labels[0]) 143 | plt.plot(params, weights[:, 1], linestyle='--', 144 | label=labels[1]) 145 | plt.ylabel('weight coefficient') 146 | plt.xlabel('C') 147 | plt.legend(loc='upper left') 148 | plt.xscale('log') 149 | plt.title(title) 150 | ocr_utils.show_figures(plt,title=title) 151 | 152 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /p36_adaline_gd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' adaline_gd replaces the activation function by the identity 3 | function, and adds a differentiable cost 
function, such as a
4 | Sum of Squared Errors (SSE)
5 | 
6 | The gradient of the errors is used to adjust the weights.
7 | 
8 | The learning rate is plotted for several different 'eta' multipliers
9 | of the gradient. Also, the effect on the learning rate of standardizing
10 | the features before training is shown.
11 | 
12 | The decision regions are plotted.
13 | 
14 | The cost function error versus Epochs is plotted.
15 | 
16 | Created on Jun 20, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 | SOFTWARE.
41 | 
42 | @author: richard lyman
43 | '''
44 | import ocr_utils
45 | import numpy as np
46 | import matplotlib.pyplot as plt
47 | 
48 | #############################################################################
49 | # read features
50 | 
51 | # retrieve 120 sets of target numbers and column sums
52 | # y: the target is ascii characters 48 and 49 ('0', '1')
53 | # X: the features to fit is the sum of the vertical pixels in the rows in
54 | # horizontal columns 9 and 17
55 | 
56 | 
57 | ascii_characters_to_train=(48,49)
58 | columnsXY = (9,17)
59 | nchars=500
60 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=120)
61 | 
62 | y = np.where(y==ascii_characters_to_train[1],-1,1)
63 | 
64 | #############################################################################
65 | # Adaline implementation from Python Machine Learning
66 | class AdalineGD(object):
67 |     """ADAptive LInear NEuron classifier.
68 | 
69 |     Parameters
70 |     ------------
71 |     eta : float
72 |         Learning rate (between 0.0 and 1.0)
73 |     n_iter : int
74 |         Passes over the training dataset.
75 | 
76 |     Attributes
77 |     -----------
78 |     w_ : 1d-array
79 |         Weights after fitting.
80 |     errors_ : list
81 |         Number of misclassifications in every epoch.
82 | 
83 |     """
84 |     def __init__(self, eta=0.01, n_iter=50):
85 |         self.eta = eta
86 |         self.n_iter = n_iter
87 | 
88 |     def fit(self, X, y):
89 |         """ Fit training data.
90 | 
91 |         Parameters
92 |         ----------
93 |         X : {array-like}, shape = [n_samples, n_features]
94 |             Training vectors, where n_samples is the number of samples and
95 |             n_features is the number of features.
96 |         y : array-like, shape = [n_samples]
97 |             Target values.
98 | 
99 |         Returns
100 |         -------
101 |         self : object
102 | 
103 |         """
104 |         self.w_ = np.zeros(1 + X.shape[1])
105 |         self.cost_ = []
106 | 
107 |         for i in range(self.n_iter):
108 |             output = self.net_input(X)
109 |             errors = (y - output)
110 |             self.w_[1:] += self.eta * X.T.dot(errors)
111 |             self.w_[0] += self.eta * errors.sum()
112 |             cost = (errors**2).sum() / 2.0
113 |             self.cost_.append(cost)
114 |         return self
115 | 
116 |     def net_input(self, X):
117 |         """Calculate net input"""
118 |         return np.dot(X, self.w_[1:]) + self.w_[0]
119 | 
120 |     def activation(self, X):
121 |         """Compute linear activation"""
122 |         return self.net_input(X)
123 | 
124 |     def predict(self, X):
125 |         """Return class label after unit step"""
126 | 
127 |         return np.where(self.activation(X) >= 0.0, 1, -1)
128 | title = 'Gradient Descent Learning rate 0.01'
129 | 
130 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
131 | ada1 = AdalineGD(n_iter=10, eta=0.01).fit(X, y)
132 | ax[0].plot(range(1, len(ada1.cost_) + 1), np.log10(ada1.cost_), marker='o')
133 | ax[0].set_xlabel('Epochs')
134 | ax[0].set_ylabel('log(Sum-squared-error)')
135 | ax[0].set_title('Adaline - Learning rate 0.01')
136 | ada2 = AdalineGD(n_iter=10, eta=0.0001).fit(X, y)
137 | 
138 | ax[1].plot(range(1, len(ada2.cost_) + 1), ada2.cost_, marker='o')
139 | ax[1].set_xlabel('Epochs')
140 | ax[1].set_ylabel('Sum-squared-error')
141 | ax[1].set_title('Adaline - Learning rate 0.0001')
142 | ocr_utils.show_figures(plt, title)
143 | 
144 | 
145 | 
146 | #
147 | # plt.plot(range(1,len(ada1.cost_)+1), np.log10(ada1.cost_), marker='o',label = title)
148 | # plt.title(title)
149 | # ocr_utils.show_figures(plt, title)
150 | #
151 | # ada2 = AdalineGD(n_iter=15, eta=0.0001).fit(X, y)
152 | # title = 'Gradient Descent Learning rate 0.0001'
153 | # plt.plot(range(1,len(ada2.cost_)+1), np.log10(ada2.cost_) ,marker='x',label = title)
154 | # plt.title(title)
155 | # ocr_utils.show_figures(plt, title)
156 | # standardize features
157 | X_std = np.copy(X)
158 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
159 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
160 | 
161 | ada = AdalineGD(n_iter=15, eta=0.01)
162 | ada.fit(X_std, y)
163 | ocr_utils.plot_decision_regions(X=X_std,
164 |                                 y=y,
165 |                                 classifier=ada,
166 |                                 labels= labels,
167 |                                 title='Adaline - Gradient Descent standardized rate 0.01')
168 | 
169 | title = 'Standardized Gradient Descent Learning rate 0.01'
170 | plt.plot(range(1,len(ada.cost_)+1), np.log10(ada.cost_) ,marker='x',label = title)
171 | plt.title(title)
172 | ocr_utils.show_figures(plt, title)
173 | 
174 | plt.plot(range(1,len(ada.cost_)+1), np.log10(ada.cost_), marker='v', label='standardized rate 0.01')
175 | plt.xlabel('Epochs')
176 | plt.ylabel('log(Sum-squared-error)')
177 | plt.legend(loc='lower left')
178 | plt.title('Adaline - Gradient Descent')
179 | plt.tight_layout()
180 | ocr_utils.show_figures(plt, 'Adaline - Gradient Descent')
181 | 
182 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p44_adaline_sgd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | '''adaline_sgd.py illustrates Stochastic Gradient Descent.
3 | 
4 | First, the weights are updated after each training sample instead of
5 | calculating the error for the entire batch. This causes the weights to
6 | converge faster than the batch method.
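(A sketch of the per-sample rule, matching the _update_weights() method
below: for one sample xi with target t,

    error = t - net_input(xi)
    w[1:] += eta * xi * error
    w[0]  += eta * error

so every sample nudges the weights immediately.)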
7 | 
8 | Second, the samples can be shuffled to avoid bias based on the order of samples in
9 | the training set.
10 | 
11 | The decision regions and the speed of convergence are plotted.
12 | 
13 | Created on Jun 22, 2016
14 | 
15 | from Python Machine Learning by Sebastian Raschka under the following license
16 | 
17 | The MIT License (MIT)
18 | 
19 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
20 | 
21 | Permission is hereby granted, free of charge, to any person obtaining a copy
22 | of this software and associated documentation files (the "Software"), to deal
23 | in the Software without restriction, including without limitation the rights
24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25 | copies of the Software, and to permit persons to whom the Software is
26 | furnished to do so, subject to the following conditions:
27 | 
28 | The above copyright notice and this permission notice shall be included in all
29 | copies or substantial portions of the Software.
30 | 
31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 | SOFTWARE.
38 | 
39 | @author: richard lyman
40 | '''
41 | import ocr_utils
42 | import numpy as np
43 | from numpy.random import seed
44 | import matplotlib.pyplot as plt
45 | 
46 | 
47 | #############################################################################
48 | # read images and scatter plot
49 | 
50 | # retrieve 120 sets of target numbers and column sums
51 | # y: the ascii characters 48 and 51 ('0', '3')
52 | # X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17
53 | 
54 | ascii_characters_to_train=(48,51)
55 | columnsXY = (9,17)
56 | nchars=500
57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=120)
58 | y = np.where(y==ascii_characters_to_train[1],-1,1)
59 | 
60 | #############################################################################
61 | # AdalineSGD from Python Machine Learning
62 | class AdalineSGD(object):
63 |     """ADAptive LInear NEuron classifier.
64 | 
65 |     Parameters
66 |     ------------
67 |     eta : float
68 |         Learning rate (between 0.0 and 1.0)
69 |     n_iter : int
70 |         Passes over the training dataset.
71 | 
72 |     Attributes
73 |     -----------
74 |     w_ : 1d-array
75 |         Weights after fitting.
76 |     errors_ : list
77 |         Number of misclassifications in every epoch.
78 |     shuffle : bool (default: True)
79 |         Shuffles training data every epoch if True to prevent cycles.
80 |     random_state : int (default: None)
81 |         Set random state for shuffling and initializing the weights.
82 | 
83 |     """
84 |     def __init__(self, eta=0.01, n_iter=10, shuffle=True, random_state=None):
85 |         self.eta = eta
86 |         self.n_iter = n_iter
87 |         self.w_initialized = False
88 |         self.shuffle = shuffle
89 |         if random_state:
90 |             seed(random_state)
91 | 
92 |     def fit(self, X, y):
93 |         """ Fit training data.
94 | 
95 |         Parameters
96 |         ----------
97 |         X : {array-like}, shape = [n_samples, n_features]
98 |             Training vectors, where n_samples is the number of samples and
99 |             n_features is the number of features.
100 | y : array-like, shape = [n_samples] 101 | Target values. 102 | 103 | Returns 104 | ------- 105 | self : object 106 | 107 | """ 108 | self._initialize_weights(X.shape[1]) 109 | self.cost_ = [] 110 | for i in range(self.n_iter): 111 | if self.shuffle: 112 | X, y = self._shuffle(X, y) 113 | cost = [] 114 | for xi, target in zip(X, y): 115 | cost.append(self._update_weights(xi, target)) 116 | avg_cost = sum(cost)/len(y) 117 | self.cost_.append(avg_cost) 118 | return self 119 | 120 | def partial_fit(self, X, y): 121 | """Fit training data without reinitializing the weights""" 122 | if not self.w_initialized: 123 | self._initialize_weights(X.shape[1]) 124 | if y.ravel().shape[0] > 1: 125 | for xi, target in zip(X, y): 126 | self._update_weights(xi, target) 127 | else: 128 | self._update_weights(X, y) 129 | return self 130 | 131 | def _shuffle(self, X, y): 132 | """Shuffle training data""" 133 | r = np.random.permutation(len(y)) 134 | return X[r], y[r] 135 | 136 | def _initialize_weights(self, m): 137 | """Initialize weights to zeros""" 138 | self.w_ = np.zeros(1 + m) 139 | self.w_initialized = True 140 | 141 | def _update_weights(self, xi, target): 142 | """Apply Adaline learning rule to update the weights""" 143 | output = self.net_input(xi) 144 | error = (target - output) 145 | self.w_[1:] += self.eta * xi.dot(error) 146 | self.w_[0] += self.eta * error 147 | cost = 0.5 * error**2 148 | return cost 149 | 150 | def net_input(self, X): 151 | """Calculate net input""" 152 | return np.dot(X, self.w_[1:]) + self.w_[0] 153 | 154 | def activation(self, X): 155 | """Compute linear activation""" 156 | return self.net_input(X) 157 | 158 | def predict(self, X): 159 | """Return class label after unit step""" 160 | return np.where(self.activation(X) >= 0.0, 1, -1) 161 | ############################################################################# 162 | # standardize features,fit, and plot 163 | X_std = np.copy(X) 164 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std() 165 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std() 166 | ada = AdalineSGD(n_iter=15, eta=0.01, random_state = 1) 167 | ada.fit(X_std, y) 168 | 169 | ocr_utils.plot_decision_regions(X=X_std, 170 | y=y, 171 | classifier=ada, 172 | title='Adaline - Stochastic Gradient Descent', 173 | labels=labels) 174 | 175 | title='Adaline - Stochastic Gradient Descent' 176 | plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o') 177 | plt.xlabel('Epochs') 178 | plt.ylabel('Average Cost') 179 | plt.title(title) 180 | plt.tight_layout() 181 | ocr_utils.show_figures(plt, title) 182 | 183 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q3_removing_affine_distortion.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 23, 2016 3 | 4 | @author: richard 5 | ''' 6 | ''' 7 | Created on Jul 12, 2016 8 | This program shows how Principal Component Analysis removes affine 9 | transformation distortions. 10 | 11 | Parallel lines in an image remain parallel after an affine transformation. 12 | For instance, if an image is rotated or sheared, lines remain parallel. 13 | 14 | PCA and LDA can remove affine transformations. This is shown by making 3 shapes 15 | and then making a number of shear versions of the shapes. 
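(In homogeneous coordinates, the shear used by the shear() helper below maps
(x, y, 1) through

    [[1, s, 0],
     [0, 1, 0],
     [0, 0, 1]]

so x becomes x + s*y while y is unchanged; parallel lines stay parallel.)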
Running
16 | Principal Component Analysis reduces the number of features necessary to
17 | recognize the shapes with 100% accuracy during Logistic Regression,
18 | from 400 (20 columns by 20 rows) down to 2.
19 | 
20 | We make three images and then make about 80 copies of each image created by
21 | shearing the original image.
22 | 
23 | Since there is very little noise introduced by the shearing, almost all of
24 | the explained variance is due to the shearing. PCA finds eigenvectors
25 | that line up with the shearing.
26 | 
27 | 1) For a couple of shapes, make sheared versions.
28 | 2) train and print accuracies without PCA
29 | 3) repeat, but use PCA first before training.
30 | 4) observe the improvement
31 | 
32 | Do the same thing for Linear Discriminant Analysis
33 | 
34 | @author: richard
35 | '''
36 | 
37 | import numpy as np
38 | import ocr_utils
39 | from sklearn.metrics import accuracy_score
40 | white_space = 5
41 | 
42 | #########################################################################
43 | # make 3 basic images with about 80 sheared clones each
44 | 
45 | plus = np.zeros((20,20))
46 | box = np.zeros((20,20))
47 | vee = np.zeros((20,20))
48 | 
49 | plus[range(white_space, 20-white_space),9:10] = 1.0
50 | plus[9:10,range(white_space, 20-white_space)] = 1.0
51 | 
52 | box[white_space,range(white_space, 20-white_space)] = 1.0 #top
53 | box[20-white_space, range(white_space, 20 -white_space)] = 1.0 #bottom
54 | box[range(white_space, 20-white_space), white_space] = 1.0 # left
55 | box[range(white_space, 20-white_space), 20 - white_space] = 1.0 #right
56 | 
57 | for i in range(20):
58 |     vee[i,19-int(i/2)] = 1.0
59 |     vee[i,int(i/2)] = 1.0
60 | 
61 | # make some skewed versions of the shapes
62 | import skimage.transform as tf
63 | 
64 | def shear(X, skew):
65 |     rows = X.shape[0]
66 |     cols = X.shape[1]
67 |     ratioY = skew*cols/rows
68 |     matrix = np.array( [[1, ratioY, 0] ,[0, 1, 0] ,[0, 0, 1 ]])
69 |     tp=tf.ProjectiveTransform(matrix=matrix)
70 |     f = tf.warp(X, tp)
71 |     return f
72 | 
73 | # make some skewed versions of the shapes
74 | skewRange = np.linspace(-0.5,0.5,81)
75 | images = np.empty((3*len(skewRange),20,20))
76 | ys = np.empty((3*len(skewRange)))
77 | # make sheared versions of shapes
78 | for i,skew in enumerate(skewRange):
79 |     images[3*i] = shear(plus,skew)
80 |     images[3*i+1] = shear(box,skew)
81 |     images[3*i+2] = shear(vee,skew)
82 |     ys[3*i] = 0
83 |     ys[3*i+1] = 1
84 |     ys[3*i+2] = 2
85 | 
86 | title='skewed versions of shapes'
87 | ocr_utils.montage(images,title=title)
88 | 
89 | num_image=images.shape[0]
90 | images_reshaped = np.reshape(images,(num_image, 20*20))
91 | 
92 | #########################################################################
93 | # run a Logistic Regression on the raw features with 20 rows, 20 columns
94 | 
95 | from sklearn.linear_model import LogisticRegression
96 | from sklearn.model_selection import train_test_split
97 | 
98 | X_train , X_test, y_train, y_test = train_test_split(images_reshaped, ys, test_size=0.3, random_state=0)
99 | 
100 | lr = LogisticRegression()
101 | lr.fit(X_train, y_train)
102 | y_train_pred = lr.predict(X_train)
103 | y_test_pred = lr.predict(X_test)
104 | 
105 | print('\nTrain Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_train, y_train_pred), lr.coef_.shape))
106 | print('Test Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_test, y_test_pred), lr.coef_.shape))
107 | 
108 | #########################################################################
109 | # run Principal Component Analysis first, then
Logistic Regression
110 | 
111 | from sklearn.decomposition import PCA
112 | n_components = 2
113 | pca = PCA(n_components=n_components)
114 | X_train_pca = pca.fit_transform(X_train)
115 | X_test_pca = pca.transform(X_test)
116 | 
117 | print('\nPCA components = {}'.format(pca.components_.shape))
118 | 
119 | lr = LogisticRegression()
120 | logistic_fitted = lr.fit(X_train_pca, y_train)
121 | 
122 | y_train_pred = logistic_fitted.predict(X_train_pca)
123 | y_test_pred = logistic_fitted.predict(X_test_pca)
124 | 
125 | print('\nPCA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components,lr.coef_.shape))
126 | print('PCA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,lr.coef_.shape))
127 | 
128 | X_errors_image = X_test[y_test!=y_test_pred]
129 | y_errors = y_test[y_test!=y_test_pred]
130 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
131 | 
132 | # change to a 2D shape
133 | X_errors2D=np.reshape(X_errors_image, (X_errors_image.shape[0], 20, 20))
134 | ocr_utils.montage(X_errors2D,title='PCA Error Images, components={}'.format (n_components))
135 | 
136 | X_combined = np.vstack((X_train_pca, X_test_pca))
137 | y_combined = np.hstack((y_train, y_test))
138 | 
139 | ocr_utils.plot_decision_regions(
140 |     X=X_combined,
141 |     y=y_combined,
142 |     classifier=lr,
143 |     labels = ['PC1','PC2'] ,
144 |     title='logistic_regression after 2 component PCA')
145 | 
146 | #########################################################################
147 | # run Linear Discriminant Analysis first then Logistic Regression
148 | 
149 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
150 | n_components = 2
151 | lda = LDA(n_components=n_components)
152 | 
153 | X_train_lda = lda.fit_transform(X_train, y_train)
154 | X_test_lda = lda.transform(X_test)
155 | print('\nLDA components = {}'.format(lda.scalings_.shape))
156 | lr = LogisticRegression()
157 | logistic_fitted = lr.fit(X_train_lda, y_train)
158 | 
159 | y_train_pred = logistic_fitted.predict(X_train_lda)
160 | y_test_pred = logistic_fitted.predict(X_test_lda)
161 | 
162 | print('\nLDA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),lda.n_components,lr.coef_.shape))
163 | print('LDA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),lda.n_components,lr.coef_.shape))
164 | 
165 | X_errors_image = X_test[y_test!=y_test_pred]
166 | 
167 | # change to a 2D shape
168 | X_errors2D=np.reshape(X_errors_image, (X_errors_image.shape[0], 20, 20))
169 | ocr_utils.montage(X_errors2D,title='LDA Error Images, components={}'.format (n_components))
170 | 
171 | X_combined = np.vstack((X_train_lda, X_test_lda))
172 | y_combined = np.hstack((y_train, y_test))
173 | 
174 | ocr_utils.plot_decision_regions(
175 |     X=X_combined,
176 |     y=y_combined,
177 |     classifier=lr,
178 |     labels = ['LDA1','LDA2'] ,
179 |     title='logistic_regression after 2 component LDA')
180 | 
181 | print ('\n########################### No Errors ####################################')
182 | 
--------------------------------------------------------------------------------
/p314_k_means.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 18, 2016
3 | k_means.py
4 | 
5 | K-means is an algorithm for finding clusters of similar data without
6 | supervision.
We give it the number of clusters we are looking for and
7 | it lumps the samples together. It does this by finding a centroid, assigning
8 | the closest samples to the centroid, recomputing the centroid from
9 | the mean of the assigned samples, and so on.
10 | 
11 | It basically uses a Euclidean distance to evaluate whether a sample
12 | belongs to a centroid. So it is good for spherical data, but has
13 | trouble with non-spherical data.
14 | 
15 | Unfortunately, the data in the ocr_utils is not spherical so we
16 | get some odd results.
17 | 
18 | For this program
19 | input a bunch of samples from ocr_utils,
20 | run K means on them and
21 | display the results.
22 | 
23 | Repeat this for k-means++, which places the initial centroids far away from
24 | each other.
25 | 
26 | Run an 'elbow plot' that uses the inertia values from each cluster
27 | versus the number of clusters. It shows how many clusters we need
28 | to get the inertia distortion values to stabilize.
29 | 
30 | Make some montage plots of images so that we can see what images are
31 | in the clusters.
32 | 
33 | from Python Machine Learning by Sebastian Raschka under the following license
34 | 
35 | The MIT License (MIT)
36 | 
37 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
38 | 
39 | Permission is hereby granted, free of charge, to any person obtaining a copy
40 | of this software and associated documentation files (the "Software"), to deal
41 | in the Software without restriction, including without limitation the rights
42 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43 | copies of the Software, and to permit persons to whom the Software is
44 | furnished to do so, subject to the following conditions:
45 | 
46 | The above copyright notice and this permission notice shall be included in all
47 | copies or substantial portions of the Software.
48 | 
49 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55 | SOFTWARE.
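A minimal sketch of one k-means iteration (the names here are illustrative,
not from ocr_utils):

    import numpy as np
    # X: (n_samples, n_features); C: (k, n_features) current centroids
    d = np.linalg.norm(X[:, None, :] - C[None, :, :], axis=2)
    assign = d.argmin(axis=1)              # nearest centroid per sample
    C = np.array([X[assign == j].mean(axis=0) for j in range(len(C))])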
56 | 57 | @author: richard lyman 58 | ''' 59 | import numpy as np 60 | import ocr_utils 61 | import matplotlib.pyplot as plt 62 | n=200 63 | 64 | chars_to_train = range(48,51) 65 | columnsXY=(9,17) 66 | column_str = 'column_sum{}'.format(list(columnsXY)) 67 | skewRange = np.linspace(-0.5,0.5,81) 68 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 69 | 70 | # output the character label and the image and column sums 71 | output_feature_list = ['m_label','image',column_str] 72 | 73 | # read the complete image (20x20) = 400 pixels for each character 74 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 75 | output_feature_list=output_feature_list, 76 | random_state=0) 77 | 78 | y = ds.train.features[0][:n] 79 | X_image = ds.train.features[1][:n] 80 | X = ds.train.features[2][:n] 81 | 82 | # put the ASCII equivalent of the unique characters in y into the legend of the plot 83 | legend=[] 84 | for ys in np.unique(y): 85 | legend.append('{} \'{}\''.format(ys, chr(ys))) 86 | 87 | ocr_utils.scatter_plot(X=X, 88 | y=y, 89 | legend_entries=legend, 90 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 91 | title='k-means cluster E13B sum of columns') 92 | 93 | from sklearn.cluster import KMeans 94 | km = KMeans(n_clusters=3,init='random',n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1) 95 | y_km = km.fit_predict(X) 96 | 97 | legend=[] 98 | for ys in np.unique(y_km): 99 | legend.append('{}\''.format(ys)) 100 | 101 | plt.scatter(km.cluster_centers_[:,0], 102 | km.cluster_centers_[:,1], 103 | s=250, 104 | marker='*', 105 | c='red', 106 | label='centroids') 107 | 108 | ocr_utils.scatter_plot(X=X, 109 | y=y_km, 110 | legend_entries=legend, 111 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 112 | title='column sums k means centroids') 113 | 114 | km = KMeans(n_clusters=3,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1) 115 | y_km = km.fit_predict(X) 116 | 117 | legend=[] 118 | for ys in np.unique(y_km): 119 | legend.append('{}\''.format(ys)) 120 | 121 | plt.scatter(km.cluster_centers_[:,0], 122 | km.cluster_centers_[:,1], 123 | s=250, 124 | marker='*', 125 | c='red', 126 | label='k++') 127 | 128 | ocr_utils.scatter_plot(X=X, 129 | y=y_km, 130 | legend_entries='', 131 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 132 | title='column sums k++ means centroids') 133 | 134 | 135 | for i in range(0,km.cluster_centers_.shape[0]): 136 | image_index2 = np.argwhere(y_km == i) 137 | x2d = X_image[image_index2].reshape((image_index2.shape[0],ds.train.num_rows, ds.train.num_columns)) 138 | ocr_utils.montage(x2d,title='k++ cluster {}'.format(i)) 139 | 140 | ############################################## 141 | # separate the original images by cluster 142 | # print(km.cluster_centers_.shape) 143 | 144 | n=30000 145 | 146 | chars_to_train = range(48,58) 147 | columnsXY=range(0,20) 148 | column_str = 'column_sum{}'.format(list(columnsXY)) 149 | skewRange = np.linspace(-0.5,0.5,81) 150 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 151 | 152 | # output the character label and the image and column sums 153 | output_feature_list = ['m_label','image'] 154 | 155 | # read the complete image (20x20) = 400 pixels for each character 156 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 157 | output_feature_list=output_feature_list, 158 | random_state=0) 159 | 160 | y = ds.train.features[0][:n] 161 | X_image = 
ds.train.features[1][:n]
162 | # X = ds.train.features[2][:n]
163 | 
164 | distortions=[]
165 | for i in range(1,30):
166 |     km = KMeans(n_clusters=i,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1)
167 |     y_km = km.fit_predict(X_image)
168 |     distortions.append(km.inertia_)
169 | 
170 | plt.plot(range(1,30), distortions, marker='o')
171 | plt.xlabel('Number of clusters')
172 | plt.ylabel('Distortion')
173 | title = '2D image elbow distortion'
174 | plt.title(title)
175 | ocr_utils.show_figures(plt, title)
176 | 
177 | 
178 | km = KMeans(n_clusters=8,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1)
179 | y_km = km.fit_predict(X_image)
180 | 
181 | nClusters = km.cluster_centers_.shape[0]
182 | x2d = []
183 | sz = np.zeros((nClusters))
184 | 
185 | for i in range(0,nClusters):
186 |     image_index2 = np.argwhere(y_km == i)
187 |     x2d.append( X_image[image_index2].reshape((image_index2.shape[0],ds.train.num_rows, ds.train.num_columns)))
188 |     print (i,x2d[i].shape[0])
189 |     sz[i] = image_index2.shape[0]
190 | 
191 | args= np.argsort(sz)[::-1]
192 | print(sz[args])
193 | print(args)
194 | for i in range(0,nClusters):
195 |     ocr_utils.montage(x2d[args[i]],title='2D image cluster {}'.format(i))
196 | 
197 | 
198 | print ('\n########################### No Errors ####################################')
199 | 
200 | 
--------------------------------------------------------------------------------
/p194_receiver_operating_characteristic.py:
--------------------------------------------------------------------------------
1 | ''' receiver_operating_characteristic.py
2 | 
3 | A receiver operating characteristic plot is a plot of the true positive rate
4 | against the false positive rate for a dataset with binary outcomes.
5 | 
6 | A threshold for determining whether a sample is positive or negative is
7 | the independent variable that is varied to produce the values in the graph.
8 | 
9 | The AUC, Area Under the Curve, can be calculated. The closer to 1.0, the
10 | better the classification.
11 | 
12 | Created on Jul 9, 2016
13 | 
14 | from Python Machine Learning by Sebastian Raschka under the following license
15 | 
16 | The MIT License (MIT)
17 | 
18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
19 | 
20 | Permission is hereby granted, free of charge, to any person obtaining a copy
21 | of this software and associated documentation files (the "Software"), to deal
22 | in the Software without restriction, including without limitation the rights
23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24 | copies of the Software, and to permit persons to whom the Software is
25 | furnished to do so, subject to the following conditions:
26 | 
27 | The above copyright notice and this permission notice shall be included in all
28 | copies or substantial portions of the Software.
29 | 
30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36 | SOFTWARE.
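For reference, the two rates plotted are TPR = TP / (TP + FN) and
FPR = FP / (FP + TN), with one point per threshold. A toy sketch using
scikit-learn (the labels and scores below are illustrative):

    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8])
    print(auc(fpr, tpr))   # 0.75 for this toy score set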
37 | 38 | @author: richard lyman 39 | ''' 40 | # from sklearn.metrics import make_scorer,roc_curve, auc 41 | from scipy import interp 42 | import matplotlib.pyplot as plt 43 | import numpy as np 44 | import ocr_utils 45 | from sklearn.preprocessing import StandardScaler 46 | from sklearn.linear_model import LogisticRegression 47 | from sklearn.pipeline import Pipeline 48 | from sklearn.model_selection import StratifiedKFold 49 | from sklearn.decomposition import PCA 50 | from sklearn.model_selection import train_test_split 51 | from sklearn.metrics import make_scorer,precision_score,roc_curve, auc 52 | from sklearn.metrics import roc_auc_score, accuracy_score 53 | from sklearn.model_selection import cross_val_score 54 | 55 | if __name__ == '__main__': 56 | 57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) 58 | from sklearn.preprocessing import LabelEncoder 59 | 60 | # the ROC is for data with a binary outcome. Change the ASCII characters to 0,1 61 | le = LabelEncoder() 62 | y = le.fit_transform(y) 63 | le.transform((48,51)) 64 | 65 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 66 | 67 | pipe_lr = Pipeline([('scl', StandardScaler()), 68 | ('pca', PCA(n_components=2)), 69 | ('clf', LogisticRegression(penalty='l2',random_state=0,C=100.0, solver='lbfgs'))]) 70 | 71 | # X_train2 = X_train[:, [4, 14]] 72 | X_train2 = X_train 73 | 74 | kfold = StratifiedKFold(n_splits=3, random_state=1) 75 | 76 | # scores = [] 77 | # for train_index, test_index in kfold.split(X_train, y_train): 78 | # pipe_lr.fit(X_train[train_index], y_train[train_index]) 79 | # score = pipe_lr.score(X_train[test_index], y_train[test_index]) 80 | # scores.append(score) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | # cv = StratifiedKFold(y_train,n_folds=3,random_state=1) 89 | fig = plt.figure(figsize=(7, 5)) 90 | 91 | mean_tpr = 0.0 92 | mean_fpr = np.linspace(0, 1, 100) 93 | all_tpr = [] 94 | i=0 95 | for train_index, test_index in kfold.split(X_train, y_train): 96 | probas = pipe_lr.fit(X_train2[train_index], 97 | y_train[train_index]).predict_proba(X_train2[test_index]) 98 | 99 | fpr, tpr, thresholds = roc_curve(y_train[test_index], 100 | probas[:, 1], 101 | pos_label=1) 102 | mean_tpr += interp(mean_fpr, fpr, tpr) 103 | mean_tpr[0] = 0.0 104 | roc_auc = auc(fpr, tpr) 105 | i=i+1 106 | plt.plot(fpr, 107 | tpr, 108 | lw=1, 109 | label='ROC fold %d (area = %0.2f)' 110 | % (i, roc_auc)) 111 | 112 | plt.plot([0, 1], 113 | [0, 1], 114 | linestyle='--', 115 | color=(0.6, 0.6, 0.6), 116 | label='random guessing') 117 | 118 | mean_tpr /= kfold.get_n_splits(X_train) 119 | mean_tpr[-1] = 1.0 120 | mean_auc = auc(mean_fpr, mean_tpr) 121 | plt.plot(mean_fpr, mean_tpr, 'k--', 122 | label='mean ROC (area = %0.2f)' % mean_auc, lw=2) 123 | plt.plot([0, 0, 1], 124 | [0, 1, 1], 125 | lw=2, 126 | linestyle=':', 127 | color='black', 128 | label='perfect performance') 129 | 130 | plt.xlim([-0.05, 1.05]) 131 | plt.ylim([-0.05, 1.05]) 132 | plt.xlabel('false positive rate') 133 | plt.ylabel('true positive rate') 134 | title='Receiver Operator Characteristic' 135 | plt.title(title) 136 | plt.legend(loc="lower right") 137 | plt.tight_layout() 138 | ocr_utils.show_figures(plt,title) 139 | 140 | 141 | pipe_lr = pipe_lr.fit(X_train2, y_train) 142 | # y_pred2 = pipe_lr.predict(X_test[:, [4, 14]]) 143 | y_pred2 = pipe_lr.predict(X_test) 144 | 145 | print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred2)) 146 | print('Accuracy: %.3f' % 
accuracy_score(y_true=y_test, y_pred=y_pred2)) 147 | #=================================================================================================================================================== 148 | #illustrates how to make a scorer using the precision evaluation value 149 | # for more than 2 classes for GridSearch 150 | # i.e. applies a binary scoring technique to multiclasses 151 | pos_label=range(48,58) 152 | # pre_scorer = make_scorer(score_func=precision_score, 153 | # pos_label=pos_label, 154 | # greater_is_better=True, 155 | # average='micro') 156 | 157 | from sklearn.svm import SVC 158 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = pos_label , nChars=4000, columns=(9,17), random_state=0) 159 | pipe_svc = Pipeline([('scl', StandardScaler()), 160 | ('clf', SVC(random_state=1))]) 161 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 162 | 163 | param_grid = [{'clf__C': c_gamma_range, 164 | 'clf__kernel': ['linear']}, 165 | {'clf__C': c_gamma_range, 166 | 'clf__gamma': c_gamma_range, 167 | 'clf__kernel': ['rbf'],}] 168 | from sklearn.model_selection import GridSearchCV 169 | gs = GridSearchCV(estimator=pipe_svc, 170 | param_grid=param_grid, 171 | scoring='accuracy', 172 | cv=5, 173 | n_jobs=-1) 174 | 175 | 176 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 177 | print('\nSupport Vector Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 178 | 179 | gs = gs.fit(X_train, y_train) 180 | print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_)) 181 | print('Support Vector Machine Grid Search best params: {}\n'.format(gs.best_params_)) 182 | 183 | print ('\n########################### No Errors ####################################') 184 | 185 | -------------------------------------------------------------------------------- /o1_top_secret_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """# ========================================================================== 5 | 6 | # Copyright 2015 Google Inc. All Rights Reserved. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | # ============================================================================== 20 | 21 | 22 | encode a secret message in the angle of rotation of characters 23 | 24 | Train a neural network on rotated versions of characters with the output of 25 | the network being the angle of rotation. 26 | 27 | Thus, given a rotated character, the neural network will yield a value 28 | that is the amount of rotation of the character. 29 | 30 | Encode a test set by applying a secret message with one bit for each character. 31 | Decode the secret message by running the rotated characters through the 32 | neural network, yielding the pattern of bits. 
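A rough sketch of the mapping (one bit per character, matching the two-entry
skewRange defined below; the names here are illustrative):

    skews = [-0.2, 0.2]    # the two trained angles
    bit = 1                # next bit of the secret message
    angle = skews[bit]     # shear the glyph by this amount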
33 | 34 | 35 | @author: richard lyman 36 | 37 | """# ============================================================================== 38 | import ocr_utils 39 | 40 | 41 | import numpy as np 42 | from PIL import Image, ImageDraw 43 | import io 44 | #import n1_2cnv1fc as nnetwork 45 | #import n1_residual3x4 as nnetwork 46 | import n1_2cnv2fc as nnetwork 47 | import skimage.transform as af 48 | from bitarray import bitarray 49 | 50 | 51 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 52 | output_feature_list = ['orientation_one_hot','image'] 53 | dtype = np.float32 54 | 55 | skewRange = np.linspace(-0.2,0.2,2) 56 | 57 | ''' 58 | pick up the base character 59 | 60 | make a training set by rotating them through n angles 61 | 62 | train 63 | 64 | pick up the base characters 65 | encode the secret message n bits at a time into the characters 66 | this is the testing set 67 | 68 | test secret message yielding a vector of rotations 69 | 70 | convert the rotation back into bits 71 | 72 | assemble the bits into the secret message. 73 | ''' 74 | 75 | 76 | # pick up the base characters from training_image_file 77 | # produce some sheared versions 78 | # make into a training set 79 | # place in a ocr_utils TruthedCharacters class so we can use the 80 | # one hot and batch functions 81 | 82 | character_size = 100 83 | white_space=8 84 | 85 | image_file= '15-01-01 459_Mont_Lyman' 86 | image_file_jpg = image_file+'.jpg' 87 | 88 | df,t1 = ocr_utils.file_to_df(image_file,character_size,title='Characters to Train',white_space=white_space) 89 | 90 | shp = t1.shape 91 | totalN = len(skewRange)*shp[0] 92 | 93 | images=[] 94 | originalH=[] 95 | originalW=[] 96 | tops=[] 97 | lefts=[] 98 | orientation=[] 99 | recognized_label =[] 100 | 101 | 102 | 103 | for j in range(shp[0]): 104 | for i,skew in enumerate(skewRange): 105 | k = i+j*len(skewRange) 106 | 107 | images.append(ocr_utils.shear(t1[j],skew)) 108 | originalH.append(df['originalH'][j]) 109 | tops.append(df['m_top'][j]) 110 | originalW.append(df['originalW'][j]) 111 | lefts.append(df['m_left'][j]) 112 | 113 | orientation.append(skew) 114 | recognized_label.append( df['m_label'][j]) 115 | images=np.array(images) 116 | ocr_utils.montage(images, title='Base Characters Skewed') 117 | 118 | images = np.reshape(images,(images.shape[0],images.shape[1]*images.shape[2])) 119 | df = ocr_utils.make_df(images, character_size, character_size, originalH, originalW, tops, lefts, orientation, recognized_label ) 120 | #df = ocr_utils.make_df(images, character_size, character_size, bottoms, rights, tops, lefts, orientation, recognized_label ) 121 | 122 | 123 | # input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 124 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 125 | output_feature_list = ['orientation_one_hot','image'] 126 | ds = ocr_utils.read_df(df,input_filters_dict = input_filters_dict, 127 | output_feature_list=output_feature_list, 128 | test_size = 0, 129 | engine_type='tensorflow', 130 | dtype=dtype) 131 | 132 | nn = nnetwork.network(ds.train) 133 | """# ============================================================================== 134 | 135 | Train and Evaluate the Model 136 | 137 | """# ============================================================================== 138 | 139 | nn.fit( ds.train , nEpochs=5000) 140 | 141 | ####################################################################################### 142 | 143 | # now that the font is trained, pick up some text and encode a message 144 | 
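# A sketch of the round trip performed below (added annotation, not original
# code): secret_message --(bitarray)--> bits --(one bit per glyph)--> skew
# index --(ocr_utils.shear)--> distorted glyph; decoding runs the glyphs back
# through the trained network to recover the skew indices, hence the bits.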
144 | image_file = '15-01-01 459_Mont_Lyman'
145 | image_file_jpg = image_file+'.jpg'
146 | df,t1 = ocr_utils.file_to_df(image_file,character_size, title = 'unencrypted file',white_space=white_space)
147 | 
148 | 
149 | secret_message = "top secret"
150 | a = bitarray()
151 | a.frombytes(secret_message.encode('utf_8'))
152 | 
153 | index = 0
154 | encoded_skews = []
155 | def convert_to_shear(a):
156 |     index = 0
157 |     while True:
158 |         if index < len(a):
159 |             bits = a[index:index+1].to01()
160 |             index += 1
161 |             #c = int(bits,2)
162 |             c = int(bits)
163 |             yield c
164 |         else:
165 |             yield -1
166 | 
167 | gen = convert_to_shear(a)
168 | 
169 | im = Image.open(image_file_jpg)
170 | img2 = Image.new('L',(im.width,im.height),color=255)   # PIL sizes are (width, height)
171 | img3 = Image.new('L',(im.width,im.height),color=255)
172 | draw = ImageDraw.Draw(img3)
173 | for i in range(t1.shape[0]):
174 |     left = int(df['m_left'][i])
175 |     right = left + int(df['originalW'][i])
176 |     top = int(df['m_top'][i])
177 |     bottom = top + int(df['originalH'][i])
178 |     skew_index = next(gen)
179 |     #print ('i={}, skew_index={}, left={}, top={}, right={}, bottom={}'.format(i,skew_index, left,top,right,bottom))
180 |     encoded_skews.append(skew_index)
181 |     if skew_index >= 0:
182 |         t1[i] = ocr_utils.shear(t1[i], skewRange[skew_index])
183 |     im_clip = Image.fromarray(256.0-t1[i]*256.0)
184 |     img2.paste(im_clip, box= (left , top))
185 |     img3.paste(im_clip, box= (left , top))
186 | 
187 | 
188 |     draw.rectangle((left,top,right+2*white_space,bottom+2*white_space), outline=0)
189 | 
190 | gen.close()
191 | 
192 | #######################################################################################
193 | image_file = '/tmp/plots/01_encrypted_file'
194 | image_file_jpg = image_file+'.jpg'
195 | img2.save(image_file_jpg)
196 | 
197 | 
198 | image_file3 = '/tmp/plots/01_03_encrypted_file_with_box'
199 | image_file3_jpg = image_file3+'.jpg'
200 | img3.save(image_file3_jpg)
201 | 
202 | ''' test the new encrypted file
203 | '''
204 | df,t1 = ocr_utils.file_to_df(image_file,character_size, title = 'Encrypted File',white_space=white_space)
205 | 
206 | ds = ocr_utils.read_df(df,input_filters_dict = input_filters_dict,
207 |                        output_feature_list=output_feature_list,
208 |                        test_size = 1,
209 |                        engine_type='tensorflow',
210 |                        dtype=dtype)
211 | 
212 | results = nn.predict(ds.test)
213 | correct_characters = []
214 | incorrect_characters = []
215 | for i,x in enumerate(df['m_label']):
216 |     try:
217 |         print('index={}, original character={}, result= {}, skew={}'.format(i, chr(int(x)),results[i], encoded_skews[i]) )
218 |         if encoded_skews[i] >= 0:
219 |             if results[i] == encoded_skews[i]:
220 |                 correct_characters.append(chr(int(x)))
221 |             else:
222 |                 incorrect_characters.append(chr(int(x)))
223 |     except IndexError:
224 |         print ('index out of bounds={}'.format(i))
225 | print ('correct characters={}'.format(correct_characters))
226 | print ('incorrect characters={}'.format(incorrect_characters))
227 | 
228 | print ('\n########################### No Errors ####################################')
229 | 
230 | 
--------------------------------------------------------------------------------
/p177_k_fold_cross_validation.py:
--------------------------------------------------------------------------------
1 | '''p177_k_fold_cross_validation.py
2 | k fold cross validation splits the training set into n parts and holds out
3 | a different 1/n as the validation set on each iteration. It is good for
4 | tuning parameters because every sample gets used for validation exactly once,
5 | which reduces the variance of the estimate of model performance.
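Example (an editor's sketch, assuming a feature array X and a label vector y):

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    for train_idx, val_idx in skf.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        print(model.score(X[val_idx], y[val_idx]))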
6 | 
7 | Using a pipeline automates the steps by chaining together, in one estimator,
8 | 1) scaling,
9 | 2) Principal Component Analysis, and
10 | 3) training
11 | 
12 | StratifiedKFold returns lists of the indices of the X samples and y
13 | target samples to be used for each fold
14 | 
15 | cross_val_score returns an accuracy score for the predictor
16 | from each fold.
17 | 
18 | Created on Jul 5, 2016
19 | 
20 | from Python Machine Learning by Sebastian Raschka under the following license
21 | 
22 | The MIT License (MIT)
23 | 
24 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
25 | 
26 | Permission is hereby granted, free of charge, to any person obtaining a copy
27 | of this software and associated documentation files (the "Software"), to deal
28 | in the Software without restriction, including without limitation the rights
29 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30 | copies of the Software, and to permit persons to whom the Software is
31 | furnished to do so, subject to the following conditions:
32 | 
33 | The above copyright notice and this permission notice shall be included in all
34 | copies or substantial portions of the Software.
35 | 
36 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 | SOFTWARE.
43 | 
44 | @author: richard lyman
45 | '''
46 | 
47 | 
48 | import ocr_utils
49 | import matplotlib.pyplot as plt
50 | import numpy as np
51 | from sklearn.model_selection import StratifiedKFold
52 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
53 | 
54 | if __name__ == '__main__':
55 |     #charsToTrain=range(48,58)
56 |     chars_to_train = range(48,58)
57 |     n_classes = len(chars_to_train)
58 | 
59 |     num_chars = 3000 # limit the number to speed up the calculation
60 | 
61 |     input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}
62 | 
63 |     # output the character label and the image and column sums
64 |     output_feature_list = ['m_label','image']
65 | 
66 |     # read the complete image (20x20) = 400 pixels for each character
67 |     ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
68 |                              output_feature_list=output_feature_list,
69 |                              random_state=0)
70 | 
71 |     y_train = ds.train.features[0][:num_chars]
72 |     X_train = ds.train.features[1][:num_chars]
73 | 
74 |     # y_test = ds.test.features[0]-48
75 |     # X_test = ds.test.features[1]
76 |     # y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=range(0,20), nChars=1000, test_size=0.3,random_state=0)
77 | 
78 |     from sklearn.linear_model import LogisticRegression
79 |     from sklearn.model_selection import train_test_split
80 | 
81 |     X_train , X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
82 | 
83 |     from sklearn.preprocessing import StandardScaler
84 |     #
85 |     # sc = StandardScaler()
86 |     # X_train_std = sc.fit_transform(X_train)
87 |     # X_test_std = sc.transform(X_test)
88 | 
89 |     # X_train, X_test, y_train, y_test = \
90 |     #     train_test_split(X, y, test_size=0.20, random_state=1)
91 | 
92 |     from sklearn.decomposition import PCA
93 | 
94 |     from sklearn.pipeline import Pipeline
95 | 
96 |     num_planes = range(2,12)
97 | 
98 |     pca_scores = []
99 |     pca_std_dev = []
100 |     for num_PCA in num_planes:
101 |         print ('number of Principal Components = {}'.format(num_PCA))
102 |         pipe_lr = Pipeline([('scl', StandardScaler()),
103 |                             ('pca', PCA(n_components=num_PCA, svd_solver='full')),
104 |                             ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
105 | 
106 |         pipe_lr.fit(X_train, y_train)
107 |         print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
108 | 
109 |         kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # shuffle must be True when random_state is set
110 | 
111 |         scores = []
112 |         for train_index, test_index in kfold.split(X_train, y_train):
113 |             pipe_lr.fit(X_train[train_index], y_train[train_index])
114 |             score = pipe_lr.score(X_train[test_index], y_train[test_index])
115 |             scores.append(score)
116 |             #print ('train {} samples: {}'.format(len(train), train))
117 |             #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score))
118 | 
119 |         print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
120 |         from sklearn.model_selection import cross_val_score
121 | 
122 |         scores = cross_val_score(estimator=pipe_lr,
123 |                                  X=X_train,
124 |                                  y=y_train,
125 |                                  cv=10,
126 |                                  n_jobs=-1)
127 |         print('CV accuracy scores: %s' % scores)
128 |         print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
129 |         pca_scores.append(np.mean(scores))
130 |         pca_std_dev.append(np.std(scores))
131 | 
132 |     plt.plot(num_planes, pca_scores, marker='o')
133 |     plt.ylabel('Accuracy')
134 |     plt.xlabel('number of Principal Components')
135 |     title = 'Accuracy versus number of Principal Components'
136 |     plt.title(title)
137 |     plt.tight_layout()
138 |     ocr_utils.show_figures(plt, title)
139 | 
140 |     plt.plot(num_planes, pca_std_dev, marker='o')
141 |     plt.ylabel('Standard Deviation')
142 |     plt.xlabel('number of Principal Components')
143 |     title = 'Standard Deviation versus number of Principal Components'
144 |     plt.title(title)
145 |     plt.tight_layout()
146 |     ocr_utils.show_figures(plt, title)
147 | 
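    # Editor's aside (a sketch, not in the original): what the Pipeline buys us
    # per fold. fit() runs fit_transform through 'scl' and 'pca' and then fit on
    # 'clf'; score()/predict() run transform only. The scaler and PCA are thus
    # learned exclusively from each fold's training split, so nothing leaks in
    # from the validation split. pipe_demo is a hypothetical name.
    pipe_demo = Pipeline([('scl', StandardScaler()),
                          ('pca', PCA(n_components=2, svd_solver='full')),
                          ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
    # pipe_demo.fit(X_train, y_train); pipe_demo.score(X_test, y_test)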
148 |     pca_scores = []
149 |     pca_std_dev = []
150 |     for num_LDA in num_planes:
151 |         print ('number of Linear Discriminants = {}'.format(num_LDA))
152 |         pipe_lr = Pipeline([('scl', StandardScaler()),
153 |                             ('lda', LDA(n_components=min(num_LDA,n_classes-1), solver='eigen')),
154 |                             ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
155 | 
156 |         kys = pipe_lr.get_params().keys()
157 |         print(kys)
158 |         # pipe_lr.set_params(lda__solver='eigen',clf__solver='liblinear',clf__multi_class='auto')
159 |         pipe_lr.fit(X_train, y_train)
160 |         print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
161 | 
162 |         kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # shuffle must be True when random_state is set
163 | 
164 |         scores = []
165 |         for train_index, test_index in kfold.split(X_train, y_train):
166 |             pipe_lr.fit(X_train[train_index], y_train[train_index])
167 |             score = pipe_lr.score(X_train[test_index], y_train[test_index])
168 |             scores.append(score)
169 |             #print ('train {} samples: {}'.format(len(train), train))
170 |             #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score))
171 | 
172 |         print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
173 | 
174 | 
175 |         scores = cross_val_score(estimator=pipe_lr,
176 |                                  X=X_train,
177 |                                  y=y_train,
178 |                                  cv=10,
179 |                                  n_jobs=-1)
180 |         print('CV accuracy scores: %s' % scores)
181 |         print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
182 |         pca_scores.append(np.mean(scores))
183 |         pca_std_dev.append(np.std(scores))
184 | 
185 |     plt.plot(num_planes, pca_scores, marker='o')
186 |     plt.ylabel('Accuracy')
187 |     plt.xlabel('number of Linear Discriminants')
188 |     title = 'Accuracy versus number of Linear Discriminants'
189 |     plt.title(title)
190 |     plt.tight_layout()
191 |     ocr_utils.show_figures(plt, title)
192 | 
193 |     plt.plot(num_planes, pca_std_dev, marker='o')
194 |     plt.ylabel('Standard Deviation')
195 |     plt.xlabel('number of Linear Discriminants')
196 |     title = 'Standard Deviation versus number of Linear Discriminants'
197 |     plt.title(title)
198 |     plt.tight_layout()
199 |     ocr_utils.show_figures(plt, title)
200 | 
201 |     print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/n1_residual3x4.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | takes an image input and trains it to make an image output
4 | 
5 | funnels down to a 'key' and then goes back up to image
6 | 
7 | 
8 | 
9 | '''
10 | from tensorflow.compat import v1 as tf; tf.disable_v2_behavior()  # tf1-style placeholders and sessions need graph mode under TensorFlow 2
11 | import numpy as np
12 | from collections import namedtuple
13 | import datetime
14 | import ocr_utils
15 | from n0_network import base_network as b_network
16 | 
17 | class network(b_network):
18 |     ''' definition of the network
19 |     '''
20 |     def __init__(self, truthed_features, dtype=np.float32):
21 |         self._sess = tf.InteractiveSession()
22 | 
23 |         lst = []
24 |         extra_features_width = 0 # width of extra features
25 | 
26 |         """# ==============================================================================
27 | 
28 |         Placeholders
29 | 
30 |         Compute the size of various layers
31 | 
32 |         Create a tensorflow Placeholder for each feature of data returned from the
33 |         dataset
34 | 
35 |         """# ==============================================================================
36 | 
37 |         for i,nm in enumerate(truthed_features.feature_names):
38 | 
39 |             # features[0] is always the target. For instance it may be m_label_one_hot.
40 |             # The second, features[1], is the 'image' that is passed to the convolution layers.
41 |             # Any additional features bypass the convolution layers and go directly
42 |             # into the fully connected layer.
43 | 
44 |             # The width of the extra features is calculated in order to allocate
45 |             # the correct widths of weights and inputs.
46 |             # names are assigned to make them look pretty on the tensorboard graph.
47 | 
48 |             if i == 0:
49 |                 nm = 'y_'+nm
50 |             else:
51 |                 nm = 'x_'+nm
52 |             if i>1:
53 |                 extra_features_width += truthed_features.feature_width[i]
54 |             lst.append(tf.placeholder(dtype, shape=[None, truthed_features.feature_width[i]], name=nm))
55 | 
56 |         # ph is a named tuple with key names like 'image' and 'm_label', and values
57 |         # that are tensors. The display names on the Chrome tensorboard graph are
58 |         # 'y_m_label', 'x_image', 'x_upper_case', etc.
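        # Editor's aside (illustrative only; the _PHDemo names are made up): the
        # namedtuple built just below lets the placeholders be read by name or by
        # position, which is how the rest of this class uses them
        # (self._ph.image and self._ph[0]).
        _PHDemo = namedtuple('PHDemo', ['m_label_one_hot', 'image'])
        _demo = _PHDemo('target_tensor', 'image_tensor')
        assert _demo.image == _demo[1] and _demo[0] == _demo.m_label_one_hot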
59 | 
60 | 
61 |         Place_Holders = namedtuple('Place_Holders', truthed_features.feature_names)
62 |         self._ph = Place_Holders(*lst) # unpack placeholders into named Tuple
63 |         self._keep_prob = tf.placeholder(dtype,name='keep_prob')
64 |         self._nRows = truthed_features.num_rows #image height
65 |         self._nCols = truthed_features.num_columns #image width
66 |         nSections = 10
67 | 
68 |         in_out_width = self._nRows*self._nCols
69 |         internal_width = int(in_out_width/4)
70 |         w = list(range(nSections*3))
71 |         b = list(range(nSections*3))
72 |         h = list(range(nSections*3+1))
73 |         nFc1 = 2048 # size of fully connected layer
74 | 
75 |         nTarget = truthed_features.feature_width[0] # the number of one_hot features in the target, 'm_label'
76 | 
77 |         """# ==============================================================================
78 | 
79 |         Build a Multilayer Convolutional Network
80 | 
81 |         Weight Initialization
82 | 
83 |         """# ==============================================================================
84 | 
85 |         def weight_variable(shape, dtype):
86 |             initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
87 |             return tf.Variable(initial)
88 | 
89 |         def bias_variable(shape, dtype):
90 |             initial = tf.constant(0, shape=shape, dtype=dtype)
91 |             return tf.Variable(initial)
92 | 
93 |         def shapeOuts(n):
94 |             print ('n={}, hin={}, w={}, b={}, hout={}\n'.format(n, h[n].shape, w[n].shape, b[n].shape, h[n+1].shape))
95 | 
96 |         def section(n):
97 |             with tf.name_scope('section_'+str(n)+'_0') as scope:
98 |                 w[n] = weight_variable([in_out_width, internal_width],dtype)
99 |                 b[n] = bias_variable([internal_width],dtype)
100 |                 h[n+1] = tf.nn.relu(tf.matmul(h[n], w[n]) + b[n])
101 |                 shapeOuts(n)
102 | 
103 |             with tf.name_scope('section_'+str(n)+'_1') as scope:
104 |                 w[n+1] = weight_variable([internal_width, internal_width],dtype)
105 |                 b[n+1] = bias_variable([internal_width],dtype)
106 | 
107 |                 h[n+2] = tf.nn.relu(tf.matmul(h[n+1], w[n+1]) + b[n+1])
108 |                 shapeOuts(n+1)
109 | 
110 |             with tf.name_scope('section_'+str(n)+'_2') as scope:
111 |                 w[n+2] = weight_variable([internal_width, in_out_width],dtype)
112 |                 b[n+2] = bias_variable([in_out_width],dtype)
113 |                 z = tf.nn.relu(tf.matmul(h[n+2], w[n+2]) + b[n+2])
114 |                 h[n+3] = tf.add(z ,h[n]) #n+3 is the skip-connection output
115 | 
116 |                 print('z shape ={}'.format(z.shape))
117 |                 shapeOuts(n+2)
118 |             return
119 | 
120 |         def computeSize(s,tens):
121 |             sumC = 1
122 |             tShape = tens.get_shape()
123 |             nDims = len(tShape)
124 |             for i in range(nDims):
125 |                 sumC *= tShape[i]
126 |             print ('\t{}\t{}'.format(s,sumC),flush=True)
127 |             return sumC
128 | 
129 |         """# ==============================================================================
130 |         Build sectional network
131 | 
132 |         """# ==============================================================================
133 |         h[0] = self._ph[1]
134 |         for i in range(nSections):
135 |             section(3*i)
136 | 
137 |         """# ==============================================================================
138 |         Dropout
139 | 
140 |         """# ==============================================================================
141 |         self._keep_prob = tf.placeholder(dtype,name='keep_prob')  # note: re-creates the placeholder defined above; this later one is the one dropout uses
142 | 
143 |         with tf.name_scope("drop") as scope:
144 |             h_fc2_drop = tf.nn.dropout(h[nSections*3], self._keep_prob)
145 | 
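        # Editor's aside (a numpy miniature of the section() blocks above; the
        # _res_demo names are hypothetical). Each section maps the activation
        # from in_out_width down to internal_width and back, then adds the block
        # input -- the residual "skip" connection of h[n+3] = z + h[n].
        def _res_demo(h0, w1, w2, w3):
            h1 = np.maximum(0.0, h0 @ w1)   # relu(h0 w1): down to internal width
            h2 = np.maximum(0.0, h1 @ w2)   # relu(h1 w2): internal width again
            z = np.maximum(0.0, h2 @ w3)    # relu(h2 w3): back to in/out width
            return z + h0                   # skip connection preserves the shape
        _rng = np.random.RandomState(0)
        _h0 = _rng.rand(1, 8)
        assert _res_demo(_h0, _rng.rand(8, 2), _rng.rand(2, 2), _rng.rand(2, 8)).shape == _h0.shape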
146 |         """# ==============================================================================
147 | 
148 |         Readout Layer
149 | 
150 |         """# ==============================================================================
151 |         with tf.name_scope("softmax") as scope:
152 |             w_fc3 = weight_variable([in_out_width, nTarget],dtype)
153 |             b_fc3 = bias_variable([nTarget],dtype)
154 |             y_conv = tf.nn.softmax(tf.matmul(h_fc2_drop, w_fc3) + b_fc3)
155 | 
156 |         print ('network size:',flush=True)
157 |         total = 0
158 |         for i in range(nSections*3):
159 |             total = total + computeSize("w{}".format(i),w[i])
160 |         total = total + computeSize ("b_fc3",b_fc3) + \
161 |             computeSize ("w_fc3",w_fc3)
162 | 
163 |         print('\ttotal\t{}'.format(total),flush=True)
164 | 
165 | 
166 |         with tf.name_scope("reshape_x_image") as scope:
167 |             self._x_image = tf.reshape(self._ph.image, [-1,self._nCols,self._nRows,1])
168 | 
169 |         with tf.name_scope("xent") as scope:
170 |             # 1e-8 added to eliminate the crash of training when taking log of 0
171 |             cross_entropy = -tf.reduce_sum(self._ph[0]*tf.log(y_conv+1e-8))
172 |             ce_summ = tf.summary.scalar("cross entropy", cross_entropy)
173 | 
174 |         with tf.name_scope("train") as scope:
175 |             self._train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
176 | 
177 |         with tf.name_scope("test") as scope:
178 |             self._correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(self._ph[0],1))
179 |             self._prediction = tf.argmax(y_conv,1)
180 | 
181 |         self._accuracy = tf.reduce_mean(tf.cast(self._correct_prediction, dtype))
182 |         accuracy_summary = tf.summary.scalar("accuracy", self._accuracy)
183 | 
184 |         """# ==============================================================================
185 | 
186 |         Start TensorFlow Interactive Session
187 | 
188 |         """# ==============================================================================
189 | 
190 |         self._sess.run(tf.initialize_all_variables())
191 |         self._merged = tf.summary.merge_all()
192 |         tm = ""
193 |         tp = datetime.datetime.now().timetuple()
194 |         for i in range(4):
195 |             tm += str(tp[i])+'-'
196 |         tm += str(tp[4])
197 | 
198 |         # To see the results in Chrome,
199 |         # Run the following in terminal to activate server.
200 |         # tensorboard --logdir '/tmp/ds_logs/'
201 |         # See results on localhost:6006
202 | 
203 |         self._writer = tf.summary.FileWriter("/tmp/ds_logs/"+ tm, self._sess.graph)
204 | 
205 |     def computeSize(s,tens):
206 |         sumC = 1
207 |         tShape = tens.get_shape()
208 |         nDims = len(tShape)
209 |         for i in range(nDims):
210 |             sumC *= tShape[i].value
211 |         print ('\t{}\t{}'.format(s,sumC),flush=True)
212 |         return sumC
213 | 
214 | 
215 | 
216 |     def __exit__(self, exc_type, exc_value, traceback):
217 |         tf.reset_default_graph() # only necessary when iterating through fonts
218 |         self._sess.close()
219 | 
220 | 
221 |     def reset_graph(self):
222 |         tf.reset_default_graph() # only necessary when iterating through fonts
223 |         self._sess.close()
224 | 
225 |     #
226 |     # def encode(self):
227 |     #
228 |     #     return key
229 |     #
230 |     # def decode(self, key):
--------------------------------------------------------------------------------
/o3_top_secret_python_box.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 23, 2016
3 | 
4 | Created on Jul 12, 2016
5 | This program shows how Principal Component Analysis removes affine
6 | transformation distortions.
7 | 
8 | Parallel lines in an image remain parallel after an affine transformation.
9 | For instance, if an image is rotated or sheared, lines remain parallel.
10 | 
11 | PCA and LDA can remove affine transformations. This is shown by making 3 shapes
12 | and then making a number of sheared versions of the shapes.  Running
13 | Principal Component Analysis reduces the number of features necessary to
14 | recognize the shapes during Logistic Regression with 100% accuracy,
15 | down to 2 from 400 (20 columns by 20 rows).
16 | 
17 | We make three images and then make about 80 copies of each image created by
18 | shearing the original image.
19 | 
20 | Since there is very little noise introduced by the shearing, almost all of
21 | the explained variance is due to the shearing. PCA finds eigenvectors
22 | that line up with the shearing.
23 | 
24 | 1) For a couple of shapes, make sheared versions.
25 | 2) train and print accuracies without PCA
26 | 3) repeat, but use PCA first before training.
27 | 4) observe the improvement
28 | 
29 | Do the same thing for Linear Discriminant Analysis
30 | encode a secret message in the angle of rotation of characters
31 | 
32 | Train a neural network on rotated versions of characters with the output of
33 | the network being the angle of rotation.
34 | 
35 | Thus, given a rotated character, the neural network will yield a value
36 | that is the amount of rotation of the character.
37 | 
38 | Encode a test set by applying a secret message with one bit for each character.
39 | Decode the secret message by running the rotated characters through the
40 | neural network, yielding the pattern of bits.
41 | 
42 | pick up the base character
43 | 
44 | make a training set by rotating them through n angles
45 | 
46 | train
47 | 
48 | pick up the base characters
49 | encode the secret message n bits at a time into the characters
50 | this is the testing set
51 | 
52 | test secret message yielding a vector of rotations
53 | 
54 | convert the rotation back into bits
55 | 
56 | assemble the bits into the secret message.
57 | 
58 | 
59 | @author: richard lyman
60 | 
61 | 
62 | '''# ==============================================================================
63 | 
64 | import ocr_utils
65 | 
66 | 
67 | import numpy as np
68 | from PIL import Image, ImageDraw
69 | import io
70 | from sklearn.metrics import accuracy_score
71 | from sklearn.decomposition import PCA
72 | from sklearn.metrics import accuracy_score
73 | #from sklearn.model_selection
74 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
75 | from sklearn.linear_model import LogisticRegression
76 | from sklearn.model_selection import train_test_split
77 | 
78 | # input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))}
79 | # output_feature_list = ['orientation_one_hot','image']
80 | dtype = np.float32
81 | character_size = 100
82 | white_space = 10
83 | skewRange = np.linspace(-0.2,0.2,4)
84 | 
85 | class c_box(object):
86 |     def __init__(self, top, left, right, bottom):
87 |         self._top = top
88 |         self._left = left
89 |         self._right = right
90 |         self._bottom = bottom
91 | 
92 | 
93 | def find_min_max(sums):
94 |     case = 0
95 |     mins = []
96 |     maxes = []
97 |     for i,sum in enumerate(sums):
98 |         '''
99 |         case 0, going through area between characters
100 |             if sum == 0 stay in case 0
101 |             if sum != 0 set the top to i and switch to case 1
102 |         case 1, going through a character
103 |             if sum == 0 set the bottom to i and drop to case 0,
104 |                 also append the box to the list using
105 |                 left = 0, and right = the width of the image
106 |             if sum != 0 then continue in case 1
107 |         '''
108 | 
109 |         if case == 0 :
110 |             if sum != 0 :
111 |                 case = 1
112 |                 min = i
113 |         else:
114 |             if sum == 0 :
115 |                 case = 0
116 |                 max = i
117 |                 mins.append(min)
118 |                 maxes.append(max)
119 |     return mins, maxes
120 | 
121 | 
122 | 
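# Editor's aside (a quick check of find_min_max on a toy ink profile; the
# _demo names are hypothetical). Non-zero runs in the row/column sums mark
# character bands; each run yields its first index and the index just past it.
# Note that a run still open at the end of the profile is not emitted.
_demo_mins, _demo_maxes = find_min_max([0, 0, 5, 7, 0, 0, 3, 0])
assert (_demo_mins, _demo_maxes) == ([2, 6], [4, 7])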
123 | # pick up the base characters from training_image_file
124 | # produce some sheared versions
125 | # make into a training set
126 | # place in an ocr_utils TruthedCharacters class so we can use the
127 | # one hot and batch functions
128 | 
129 | im = Image.open('15-01-01 459_Mont_Lyman.jpg')  # the repo ships this scan as a .jpg
130 | #im = Image.open('CourierFont.png')
131 | im = im.convert(mode='L')
132 | data = 255-np.asarray( im, dtype="int32" )
133 | sums = np.sum(data,axis=1)
134 | mins, maxes = find_min_max(sums)
135 | boxes = []
136 | for top,bottom in zip(mins,maxes):
137 |     line = data[top:bottom]
138 |     line_sums = np.sum(line,axis=0)
139 |     lefts,rights = find_min_max(line_sums)
140 |     for left,right in zip(lefts,rights):
141 |         boxes.append(c_box(top,left,right,bottom))
142 | 
143 | images = []
144 | orientation = []
145 | recognized_label = []
146 | for box in boxes:
147 | 
148 |     img2 = Image.new('L',(character_size,character_size),color=255)
149 | 
150 |     img = im.crop(box=(box._left, box._top, box._right, box._bottom))
151 |     img2.paste(img,box=(white_space,white_space))
152 | 
153 |     imgByteArr = img2.tobytes()
154 |     lst = list(imgByteArr)
155 |     image = np.array(lst)/255.0
156 |     image = 1.0 - image
157 |     images.append(image)
158 | 
159 | height = im.height
160 | width = im.width
161 | 
162 | t1 = np.array(images)
163 | t1 = np.reshape(t1,(t1.shape[0],character_size,character_size))
164 | ocr_utils.montage(t1, title='characters from file')
165 | 
166 | shp = t1.shape
167 | totalN = len(skewRange)*shp[0]
168 | images = []
169 | import skimage.transform as af
170 | 
171 | for j in range(shp[0]):
172 |     for i,skew in enumerate(skewRange):
173 |         images.append(ocr_utils.shear(t1[j],skew))
174 |         orientation.append(skew)
175 | 
176 | images = np.array(images)
177 | ocr_utils.montage(images, title='characters being trained')
178 | images = np.reshape(images,(len(images),character_size*character_size))
179 | ys = ocr_utils.convert_to_unique(orientation)
180 | 
181 | 
182 | X_train , X_test, y_train, y_test = train_test_split(images, ys, test_size=0.3, random_state=0)
183 | print (y_test.shape)
184 | 
185 | lr = LogisticRegression()
186 | lr.fit(X_train, y_train)
187 | y_train_pred = lr.predict(X_train)
188 | y_test_pred = lr.predict(X_test)
189 | 
190 | print('\nTrain Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_train, y_train_pred), lr.coef_.shape))
191 | print('Test Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_test, y_test_pred), lr.coef_.shape))
192 | 
193 | #########################################################################
194 | # run Principal Component Analysis first, then Logistic Regression
195 | 
196 | n_components = 2
197 | pca = PCA(n_components=n_components)
198 | 
199 | X_train_pca = pca.fit_transform(X_train)
200 | X_test_pca = pca.transform(X_test)
201 | 
202 | print('\nPCA components = {}'.format(pca.components_.shape))
203 | 
204 | lr = LogisticRegression()
205 | logistic_fitted = lr.fit(X_train_pca, y_train)
206 | 
207 | y_train_pred = logistic_fitted.predict(X_train_pca)
208 | y_test_pred = logistic_fitted.predict(X_test_pca)
209 | 
210 | print('\nPCA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components,lr.coef_.shape))
211 | print('PCA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,lr.coef_.shape))
212 | 
213 | X_errors_image = X_test[y_test!=y_test_pred]
214 | y_errors = y_test[y_test!=y_test_pred]
215 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
216 | 
217 | # change to a 2D shape
218 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], character_size, character_size))
219 | ocr_utils.montage(X_errors2D,title='PCA Error Images, components={}'.format(n_components))
220 | 
221 | X_combined = np.vstack((X_train_pca, X_test_pca))
222 | y_combined = np.hstack((y_train, y_test))
223 | 
224 | ocr_utils.plot_decision_regions(
225 |     X=X_combined,
226 |     y=y_combined,
227 |     classifier=lr,
228 |     labels = ['PC1','PC2'] ,
229 |     title='logistic_regression after 2 component PCA')
230 | 
231 | 
232 | #########################################################################
233 | # run Linear Discriminant Analysis first, then Logistic Regression
234 | 
235 | 
236 | n_components = 2
237 | lda = LinearDiscriminantAnalysis(n_components=n_components)
238 | 
239 | X_train_lda = lda.fit_transform(X_train, y_train)
240 | X_test_lda = lda.transform(X_test)
241 | 
242 | print('\nLDA components = {}'.format(lda.n_components))
243 | lr = LogisticRegression()
244 | logistic_fitted = lr.fit(X_train_lda, y_train)
245 | 
246 | y_train_pred = logistic_fitted.predict(X_train_lda)
247 | y_test_pred = logistic_fitted.predict(X_test_lda)
248 | 
249 | print('\nLDA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),lda.n_components,lr.coef_.shape))
250 | print('LDA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),lda.n_components,lr.coef_.shape))
251 | 
252 | X_errors_image = X_test[y_test!=y_test_pred]
253 | 
254 | # change to a 2D shape
255 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], character_size, character_size))
256 | ocr_utils.montage(X_errors2D,title='LDA Error Images, components={}'.format(n_components))
257 | 
258 | X_combined = np.vstack((X_train_lda, X_test_lda))
259 | y_combined = np.hstack((y_train, y_test))
260 | if X_combined.shape[1] > 1:
261 |     ocr_utils.plot_decision_regions(
262 |         X=X_combined,
263 |         y=y_combined,
264 |         classifier=lr,
265 |         labels = ['LDA1','LDA2'] ,
266 |         title='logistic_regression after 2 component LDA')
267 | print ('\n########################### No Errors ####################################')
268 | 
--------------------------------------------------------------------------------
/p131_principal_component_analysis.py:
--------------------------------------------------------------------------------
1 | ''' p131_principal_component_analysis.py
2 | Principal Component Analysis reduces the dimensionality of the feature set by
3 | projecting it onto the components with the largest explained variance. It does this by
4 | 1) computing a covariance matrix for the features
5 | 2) finding the eigenvectors and eigenvalues of the matrix (the principal components)
6 | 3) computing the explained variance for the components and sorting them
7 | 
8 | Always standardize first, because PCA is sensitive to scaling
9 | 
10 | 
11 | Created on Jul 2, 2016
12 | 
13 | from Python Machine Learning by Sebastian Raschka under the following license
14 | 
15 | The MIT License (MIT)
16 | 
17 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
18 | 
19 | Permission is hereby granted, free of charge, to any person obtaining a copy
20 | of this software and associated documentation files (the "Software"), to deal
21 | in the Software without restriction, including without limitation the rights
22 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 | copies of the Software, and to permit persons to whom the Software is
24 | furnished to do so, subject to the following conditions:
25 | 
26 | The above copyright notice and this permission notice shall be included in all
27 | copies or substantial portions of the Software.
28 | 
29 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35 | SOFTWARE.
36 | 
37 | @author: richard lyman
38 | '''
39 | n_components = 10 # number of PCA components to use for the final accuracy
40 | 
41 | import numpy as np
42 | import ocr_utils
43 | import matplotlib.pyplot as plt
44 | from sklearn.metrics import accuracy_score
45 | from sklearn.linear_model import LogisticRegression
46 | from sklearn.decomposition import PCA
47 | from sklearn.decomposition import KernelPCA
48 | 
49 | 
50 | chars_to_train = range(48,58)
51 | columnsXY = range(0,20)
52 | column_str = 'column_sum{}'.format(list(columnsXY))
53 | 
54 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}
55 | 
56 | # output the character label and the image and column sums
57 | output_feature_list = ['m_label','image',column_str]
58 | 
59 | # read the complete image (20x20) = 400 pixels for each character
60 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
61 |                          output_feature_list=output_feature_list,
62 |                          test_size=.2,
63 |                          random_state=0)
64 | windows_limit = 5000 # uses too much memory for my 32 bit windows computer so limit the sample size
65 | y_train = ds.train.features[0][:windows_limit]
66 | X_train_image = ds.train.features[1][:windows_limit]
67 | X_train = ds.train.features[2][:windows_limit]
68 | 
69 | y_test = ds.test.features[0]
70 | X_test_image = ds.test.features[1]
71 | X_test = ds.test.features[2]
72 | 
73 | 
74 | cov_mat = np.cov(X_train_image.T)
75 | eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
76 | 
77 | print('\nEigenvalues \n%s' % eigen_vals[:2*n_components])
78 | 
79 | tot = sum(eigen_vals)
80 | var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
81 | cum_var_exp = np.cumsum(var_exp)
82 | var_exp = var_exp[:2*n_components]
83 | cum_var_exp = cum_var_exp[:2*n_components]
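# Editor's aside (a toy spectrum, illustrative only): the explained variance
# ratio is each eigenvalue over the eigenvalue total, largest first, and the
# cumulative sum shows how much variance the first k components retain.
_demo_eigs = [4.0, 3.0, 2.0, 1.0]
_demo_ratios = [e / sum(_demo_eigs) for e in sorted(_demo_eigs, reverse=True)]
assert _demo_ratios == [0.4, 0.3, 0.2, 0.1]
assert np.allclose(np.cumsum(_demo_ratios), [0.4, 0.7, 0.9, 1.0])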
84 | title = 'explained variance'
85 | plt.bar(range(1, len(var_exp)+1), var_exp, alpha=0.5, align='center', label='individual explained variance')
86 | plt.step(range(1, len(cum_var_exp)+1), cum_var_exp, where='mid',
87 |          label='cumulative explained variance')
88 | plt.ylabel('Explained variance ratio')
89 | plt.xlabel('Principal components')
90 | plt.legend(loc='best')
91 | plt.tight_layout()
92 | plt.title(title)
93 | ocr_utils.show_figures(plt,title)
94 | 
95 | # Make a list of (eigenvalue, eigenvector) tuples
96 | eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]
97 | 
98 | # Sort the (eigenvalue, eigenvector) tuples from high to low
99 | eigen_pairs.sort(key=lambda k: k[0], reverse=True)  # sort on the eigenvalue only; comparing the array half of the tuple would fail
100 | 
101 | # The eigenpairs with the highest explained variance
102 | w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
103 |                eigen_pairs[1][1][:, np.newaxis]))
104 | print('Matrix W:\n', w[:2*n_components,:])
105 | 
106 | X_train_pca = X_train_image.dot(w)
107 | print ('projection of first dataset sample on first 2 eigenvectors {}'.format(X_train_image[0].dot(w)))
108 | 
109 | markers = ('o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd')
110 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan','orange','green','brown','lightblue','pink')
111 | 
112 | for l, c, m in zip(np.unique(y_train), colors, markers):
113 |     plt.scatter(X_train_pca[y_train==l, 0],
114 |                 X_train_pca[y_train==l, 1],
115 |                 c=c, label=l, marker=m)
116 | 
117 | plt.xlabel('PC 1')
118 | plt.ylabel('PC 2')
119 | plt.legend(loc='lower left')
120 | plt.tight_layout()
121 | title = 'features mapped to two principal components'
122 | plt.title(title)
123 | ocr_utils.show_figures(plt,title)
124 | 
125 | ########################################################################################
126 | 
127 | 
128 | pca = PCA(n_components=2)
129 | 
130 | X_train_pca = pca.fit_transform(X_train_image)
131 | X_test_pca = pca.transform(X_test_image)
132 | 
133 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
134 | logistic_fitted = lr.fit(X_train_pca, y_train)
135 | 
136 | print('\nPCA Train Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_train, logistic_fitted.predict(X_train_pca)),pca.n_components))
137 | print('PCA Test Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_test, logistic_fitted.predict(X_test_pca)),pca.n_components))
138 | 
139 | title = 'train pc1 versus pc2'
140 | ocr_utils.plot_decision_regions(X=X_train_pca, y=y_train, classifier=lr, labels=['pc1','pc2'], title=title)
141 | 
142 | title = 'test pc1 versus pc2'
143 | ocr_utils.plot_decision_regions(X=X_test_pca, y=y_test, classifier=lr, labels=['pc1','pc2'], title=title)
144 | X_train_pca = pca.fit_transform(X_train_image)
145 | X_test_pca = pca.transform(X_test_image)
146 | 
147 | ########################################################################################
148 | pca = PCA(n_components=n_components)
149 | X_train_pca = pca.fit_transform(X_train_image)
150 | X_test_pca = pca.transform(X_test_image)
151 | 
152 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
153 | logistic_fitted = lr.fit(X_train_pca, y_train)
154 | 
155 | y_train_pred = logistic_fitted.predict(X_train_pca)
156 | y_test_pred = logistic_fitted.predict(X_test_pca)
157 | 
158 | print('\nPCA Train Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components))
159 | print('PCA Test Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components))
160 | 
161 | X_errors_image = X_test_image[y_test!=y_test_pred]
162 | y_errors = y_test[y_test!=y_test_pred]
163 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
164 | 
165 | X_orig = X_train_image[:500]
166 | title = 'originals'
167 | X2D = np.reshape(X_orig, (X_orig.shape[0], ds.train.num_rows, ds.train.num_columns))
168 | ocr_utils.montage(X2D,title=title)
169 | 
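# Editor's aside (a sketch, not in the original): a direct way to quantify what
# the n_components reconstruction below throws away is the mean squared
# reconstruction error of transform followed by inverse_transform.
_X_rec = pca.inverse_transform(pca.transform(X_train_image))
_rec_mse = np.mean((X_train_image - _X_rec) ** 2)
print('mean squared PCA reconstruction error: {:4.6f}'.format(_rec_mse))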
170 | X_orig = X_train_pca[:500]
171 | title = 'inverse original'
172 | X_inverse = pca.inverse_transform(X_orig)
173 | X2D = np.reshape(X_inverse, (X_inverse.shape[0], ds.train.num_rows, ds.train.num_columns))
174 | X2D = X2D - np.min(X2D)
175 | ocr_utils.montage(X2D,title=title)
176 | 
177 | # change to a 2D shape
178 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], ds.train.num_rows, ds.train.num_columns))
179 | ocr_utils.montage(X_errors2D,title='PCA Error Characters, components={}'.format(n_components))
180 | 
181 | title = 'inverse transform errors'
182 | X_inverse = pca.inverse_transform(X_errors_pca)
183 | X2D = np.reshape(X_inverse, (X_inverse.shape[0], ds.train.num_rows, ds.train.num_columns))
184 | X2D = X2D - np.min(X2D)
185 | ocr_utils.montage(X2D,title=title)
186 | 
187 | ########################################################################################
188 | kernel = 'rbf' # really slow
189 | pca = KernelPCA(n_components=2, kernel=kernel, gamma=15)
190 | 
191 | X_train_pca = pca.fit_transform(X_train_image)
192 | X_test_pca = pca.transform(X_test_image)
193 | 
194 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
195 | logistic_fitted = lr.fit(X_train_pca, y_train)
196 | y_train_pred = logistic_fitted.predict(X_train_pca)
197 | y_test_pred = logistic_fitted.predict(X_test_pca)
198 | 
199 | print('\nKernel PCA Train Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_train, y_train_pred), pca.n_components,kernel))
200 | print('Kernel PCA Test Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,kernel))
201 | 
202 | title = 'train kernel {} pc1 versus pc2'.format(kernel)
203 | ocr_utils.plot_decision_regions(X=X_train_pca, y=y_train, classifier=lr, labels=['pc1','pc2'], title=title)
204 | 
205 | title = 'test kernel {} pc1 versus pc2'.format(kernel)
206 | ocr_utils.plot_decision_regions(X=X_test_pca, y=y_test, classifier=lr, labels=['pc1','pc2'], title=title)
207 | 
208 | 
209 | 
210 | 
211 | ########################################################################################
212 | # too slow on my computer
213 | 
214 | # pca = KernelPCA(n_components=n_components,kernel=kernel, gamma = 15)
215 | #
216 | # X_train_pca = pca.fit_transform(X_train_image)
217 | # X_test_pca = pca.transform(X_test_image)
218 | #
219 | # print ('n_components={}'.format(pca.n_components))
220 | #
221 | # lr = LogisticRegression()
222 | # logistic_fitted = lr.fit(X_train_pca, y_train)
223 | #
224 | # y_pred = logistic_fitted.predict(X_test_pca)
225 | # print('\nKernelPCA Train Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_train, logistic_fitted.predict(X_train_pca)), pca.n_components, kernel))
226 | # print('KernelPCA Test Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_test, y_pred), pca.n_components, kernel))
227 | #
228 | # X_errors_image = X_test_image[y_test!=y_pred]
229 | # y_errors = y_test[y_test!=y_pred]
230 | #
231 | # error_images = X_errors_image.shape[0]
232 | #
233 | # # change to a 2D shape
234 | # X_errors2D=np.reshape(X_errors_image, (error_images, ds.train.num_rows, ds.train.num_columns))
235 | # ocr_utils.montage(X_errors2D,title='Kernel {} KernelPCA Error Characters, components={}'.format(kernel,n_components))
236 | 
237 | print ('\n########################### No Errors ####################################')
238 | 
239 | 
--------------------------------------------------------------------------------
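# Editor's appendix (a hedged sketch, not a file from this repository): if the
# full-size rbf KernelPCA above is too slow, one workaround is to fit the
# kernel map on a random subsample and then transform the full set. All names
# below are made up for illustration.
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X_demo = rng.rand(500, 400)                          # stand-in for 20x20 image rows
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
kpca.fit(X_demo[rng.choice(len(X_demo), 100, replace=False)])  # fit on a subsample
X_demo_kpca = kpca.transform(X_demo)                 # project everything
print(X_demo_kpca.shape)                             # (500, 2)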