├── tree.png
├── tensor_flow_graph.png
├── 15-01-01 459_Mont_Lyman.jpg
├── run_batch.sc
├── q9_tensorflow_gpu_test.py
├── license.txt
├── README.md
├── q7_theano_gpu_test.py
├── run_batch.bat
├── q8_tika.py
├── p75_xor_dataset.py
├── p73_support_vector_machine.py
├── q1_database_statistics.py
├── p94_knearest_neighbors.py
├── p124_random_forest_feature_importance.py
├── p186_grid_search.py
├── p78_support_vector_machine_gamma.py
├── p190_confusion_matrix.py
├── q0_simple_e13b_display.py
├── p124_random_forest.py
├── p51_standard_scalar.py
├── p330_dendrogram.py
├── n0_network.py
├── p411_keras.py
├── p322_silhouette_plots.py
├── p193_model_precision_recall.py
├── q2_tensorflow_mnist.py
├── p110_scaling_features.py
├── q6_tensorflow_residual3x4.py
├── p229_adaboost.py
├── p189_nested_cross_validation.py
├── o4_image_to_image.py
├── p86_decision_tree.py
├── q5_tensorflow_residual.py
├── p221_bagging_bootstrap_samples.py
├── p115_l1_l2_regularization.py
├── p25_perceptron.py
├── p119_squential_backward_selection.py
├── p181_learning_curves.py
├── p62_logistic_regression.py
├── p36_adaline_gd.py
├── p44_adaline_sgd.py
├── q3_removing_affine_distortion.py
├── p314_k_means.py
├── p194_receiver_operating_characteristic.py
├── o1_top_secret_cnn.py
├── p177_k_fold_cross_validation.py
├── n1_residual3x4.py
├── o3_top_secret_python_box.py
└── p131_principal_component_analysis.py

--------------------------------------------------------------------------------
/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/tree.png

--------------------------------------------------------------------------------
/tensor_flow_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/tensor_flow_graph.png

--------------------------------------------------------------------------------
/15-01-01 459_Mont_Lyman.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rrlyman/PythonMachineLearningExamples/HEAD/15-01-01 459_Mont_Lyman.jpg

--------------------------------------------------------------------------------
/run_batch.sc:
--------------------------------------------------------------------------------
#!/bin/bash

if [ ! -d "/tmp/plots" ]; then
    mkdir /tmp/plots
fi
echo "" > /tmp/plots/run_batch.txt
for i in $(ls -1v ./[o-q]*.py ); do
    echo "" |& tee -a /tmp/plots/run_batch.txt
    echo "##############################################################" |& tee -a /tmp/plots/run_batch.txt
    echo "$i ###############################" |& tee -a /tmp/plots/run_batch.txt
    echo "##############################################################" |& tee -a /tmp/plots/run_batch.txt
    echo "" |& tee -a /tmp/plots/run_batch.txt
    python3 $i |& tee -a /tmp/plots/run_batch.txt
done

--------------------------------------------------------------------------------
/q9_tensorflow_gpu_test.py:
--------------------------------------------------------------------------------
# Creates a graph.
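# (GPU smoke test: traces a small matmul as a tf.function and runs it many
# times; with a CUDA-enabled TensorFlow build the kernels should be placed
# on the GPU.)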
import tensorflow as tf
#from tensorflow.compat import v1 as tf

@tf.function
def d(a, b):
    return tf.matmul(a, b)

a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')

# Run the op repeatedly.
for i in range(100000):
    d(a, b)
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Richard Ricker Lyman

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PythonMachineLearningExamples

These are Python programs mostly taken from the book "Python Machine Learning"
by Sebastian Raschka.

See the original programs from the book at:

https://github.com/rasbt/python-machine-learning-book

These programs were intended to prove out the data set of character images
found in fonts.zip, which is downloaded automatically.

For a detailed explanation of each program, read "Python Machine Learning".

Because a few E13B characters can be classified using only two features, most
of the example programs that use the Iris dataset can also be used with E13B.

run_batch.sc is a bash script that runs all of the programs in the directory.

The script file will create many files in the folder /tmp/plots.
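For example, assuming a bash shell with the dependencies below installed:

    bash run_batch.sc     # runs every o*/p*/q* example in sequence
    ls /tmp/plots         # plots plus the combined run_batch.txt log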
Versions used:

Python 3.8
Anaconda3
Linux or Windows
cuda 11.2.1
h5py 2.10.0
Keras 2.4.3
Lasagne 0.1
matplotlib 3.3.2
numpy 1.19.2
pytesseract 0.3.7
sklearn 0.0
tensorflow 2.4.1
tesseract 4.1.1
Theano 1.0.5
tika 1.24

--------------------------------------------------------------------------------
/q7_theano_gpu_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' short test from the Theano website to see if Theano is working with CUDA

.theanorc contains

[global]
floatX = float32
device = gpu0

@author: richard lyman
'''
import os
print (os.get_exec_path())
p=os.getenv('PATH')
print("getenv('PATH')={}".format(p))
p=os.getenv('LD_LIBRARY_PATH')
print("getenv('LD_LIBRARY_PATH')={}".format(p))
p=os.getenv('CUDA_HOME')
print("getenv('CUDA_HOME')={}".format(p))
p=os.getenv('PYTHONPATH')
print("getenv('PYTHONPATH')={}".format(p))

#os.environ['PATH'] = p
# print(os.getenv('PATH'))
from theano import function, config, shared, sandbox
import theano.tensor as T
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], T.exp(x))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()
print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print('Used the cpu')
else:
    print('Used the gpu')

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/run_batch.bat:
--------------------------------------------------------------------------------
mkdir plots
python p110_scaling_features.py > ./plots/batch.txt
python p115_l1_l2_regularization.py >> ./plots/batch.txt
python p119_squential_backward_selection.py >> ./plots/batch.txt
python p124_random_forest.py >> ./plots/batch.txt
python p124_random_forest_feature_importance.py >> ./plots/batch.txt
python p131_principal_component_analysis.py >> ./plots/batch.txt
python p141_linear_descriminant_analsys.py >> ./plots/batch.txt
python p154_pca_nonlinear_mapings.py >> ./plots/batch.txt
python p177_k_fold_cross_validation.py >> ./plots/batch.txt
python p181_learning_curves.py >> ./plots/batch.txt
python p186_grid_search.py >> ./plots/batch.txt
python p189_nested_cross_validation.py >> ./plots/batch.txt
python p190_confusion_matrix.py >> ./plots/batch.txt
python p193_model_precision_recall.py >> ./plots/batch.txt
python p194_receiver_operating_characteristic.py >> ./plots/batch.txt
python p206_majority_vote_classifier.py >> ./plots/batch.txt
python p221_bagging_bootstrap_samples.py >> ./plots/batch.txt
python p229_adaboost.py >> ./plots/batch.txt
python p25_perceptron.py >> ./plots/batch.txt
python p314_k_means.py >> ./plots/batch.txt
python p322_silhouette_plots.py >> ./plots/batch.txt
python p330_dendrogram.py >> ./plots/batch.txt
python p356_neural_net.py >> ./plots/batch.txt
python p36_adaline_gd.py >> ./plots/batch.txt
python p411_keras.py >> ./plots/batch.txt
python p44_adaline_sgd.py >> ./plots/batch.txt
python p51_standard_scalar.py >> ./plots/batch.txt
python p62_logistic_regression.py >> ./plots/batch.txt
python p73_support_vector_machine.py >> ./plots/batch.txt
python p75_xor_dataset.py >> ./plots/batch.txt
python p78_support_vector_machine_gamma.py >> ./plots/batch.txt
python p86_decision_tree.py >> ./plots/batch.txt
python p94_knearest_neighbors.py >> ./plots/batch.txt
python q0_simple_e13b_display.py >> ./plots/batch.txt
python q1_database_statistics.py >> ./plots/batch.txt
python q2_tensorflow_mnist.py >> ./plots/batch.txt
python q2_Theano_mlp.py >> ./plots/batch.txt
python q3_removing_affine_distortion.py

--------------------------------------------------------------------------------
/q8_tika.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''experiment with apache tika

@author: richard lyman
'''

import pytesseract
import tika
from tika import translate, detector, language

filename = '15-01-01 459_Mont_Lyman.jpg'
filename2 = 'img20150901_15233271bw.jpg'

from PIL import Image

rawText = pytesseract.image_to_string(Image.open(filename2), lang="rus")
print (rawText)
lines = rawText.split('\n')

import os
#os.putenv( 'TIKA_VERSION','default') # set to a version string, e.g. 1.12, or 'default' for the current Tika version.
#os.putenv( 'TIKA_SERVER_JAR','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') # set to the full URL of the remote Tika server jar to download and cache.
os.putenv( 'TIKA_SERVER_ENDPOINT','http://localhost:9998') # set to the host (local or remote) for the running Tika server jar.
#os.putenv( 'TIKA_SERVER_ENDPOINT','http://localhost:9998/language/string')
#os.putenv( 'TIKA_CLIENT_ONLY','True') # if set to True, TIKA_SERVER_JAR is ignored and Tika is treated like a REST client using TIKA_SERVER_ENDPOINT.
#os.putenv( 'TIKA_TRANSLATOR','org/apache/tika/language/translate/') # set to the fully qualified class name (defaults to Lingo24) of the Tika translator implementation.
#os.putenv( 'TIKA_SERVER_CLASSPATH','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') # set to a string (':'-delimited for each additional path) to prepend to the Tika server jar path.
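# Note: TIKA_SERVER_ENDPOINT above assumes a Tika server is already listening
# on localhost:9998; one way to start one (jar path is illustrative):
#   java -jar tika-server-1.24.jar --port 9998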
#os.putenv('TESSDATA_PREFIX','/usr/share/tesseract-ocr/4.00/tessdata/')
tika.initVM()
from tika import parser
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])
global Verbose
Verbose=True

result=translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print (result)
result = translate.from_buffer("comme çi comme ça",'fr','en')
print (result)
result = language.from_buffer("comme çi comme ça")
print (result)
for line in lines:
    if len(line)>0:
        result=translate.from_buffer(line, 'ru','en')
        print(result)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p75_xor_dataset.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' xor_dataset.py shows how non-linearly separable datasets can use a
non-linear combination of the original features to project the features
onto a higher-dimensional space where they become linearly separable.

A non-linearly separable dataset consisting of XOR values is created.
It is fitted to a Support Vector Machine using the radial basis
function (rbf) kernel.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
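
A sketch of the projection idea (using the X_xor array created below):
adding the product x1*x2 as a third feature makes the XOR data linearly
separable, because the product is negative exactly for the points whose
two coordinates have opposite signs:

    X3 = np.hstack((X_xor, (X_xor[:, 0] * X_xor[:, 1]).reshape(-1, 1)))
    # a plane at third-coordinate == 0 now separates the two classes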
@author: richard lyman
'''
import numpy as np

import ocr_utils

from sklearn.svm import SVC

np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)

ocr_utils.scatter_plot(X=X_xor,
                       y=y_xor,
                       title='xor',
                       xlim=(-3,3),
                       ylim=(-3,3))


svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)
ocr_utils.plot_decision_regions(X=X_xor, y=y_xor,
                                classifier=svm, title='support vector machine rbf xor')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p73_support_vector_machine.py:
--------------------------------------------------------------------------------
'''support_vector_machine.py illustrates a linear support vector machine.
The SVM attempts to maximize the margin between linearly separable
feature sets.

Column sums are read in from the E13B character set and fitted to
an SVM. The decision regions are plotted.

Created on Jun 30, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
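
For the linear-kernel SVM fitted below, the width of that margin can be
read off the learned weight vector w (svm.coef_[0]) as 2/||w||:

    w = svm.coef_[0]
    margin = 2.0 / np.sqrt(np.sum(w ** 2))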
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

from sklearn.svm import SVC

svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)


ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                labels = labels,
                                title='support_vector_machine_linear')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/q1_database_statistics.py:
--------------------------------------------------------------------------------
'''
dumps out some statistics for the fonts.zip database and
makes plots, saved in files, of the characters for each fontVariant found
in the database

Created on Jul 25, 2016
@author: richard
'''
import ocr_utils
import numpy as np

df1 = ocr_utils.get_list(input_filters_dict = {'font':()})
unique_fonts=[]
unique_fontVariants=[]
unique_m_labels=[]
unique_strengths=[]
unique_italics=[]
unique_orientations=[]

#############################################################################
# read and show the character images for each font variant
# output only the character label and the image

for font in df1:
    df2 = ocr_utils.get_list(input_filters_dict = {'font':font,'fontVariant':(), 'm_label':(),'strength':(),'italic':(),'orientation':()})
    unique_fonts = np.unique( np.append(unique_fonts, df2['font']))
    u1 = np.unique(df2['fontVariant'])
    unique_fontVariants = np.unique(np.append(unique_fontVariants, u1))
    u2 = np.unique(df2['m_label'])
    unique_m_labels = np.unique(np.append(unique_m_labels,u2))
    u3 = np.unique(df2['strength'])
    unique_strengths = np.unique(np.append(unique_strengths,u3))
    u4 = np.unique(df2['italic'])
    unique_italics = np.unique(np.append(unique_italics,u4))
    u5 = np.unique( df2['orientation'])
    unique_orientations = np.unique(np.append(unique_orientations,u5))
    print ('\n{}, fontVariants={}, labels = {}, strengths = {}, italics = {}, orientations = {}\n'.format(font[0], len(u1),
           len(u2), len(u3), len(u4), len(u5)))
    for fontVariant in u1:
        fd = {'font': font, 'fontVariant': fontVariant}
        ds = ocr_utils.read_data(input_filters_dict=fd, output_feature_list=['m_label','image'] , dtype=np.int32)
        y,X = ds.train.features
        X2D = np.reshape(X, (X.shape[0], ds.train.num_rows, ds.train.num_columns ))
        title = '{}-{}'.format(font[0],fontVariant)
        ocr_utils.show_examples(X2D, y, title=title)

print ('unique fonts={}, fontVariants={}, labels = {}, strengths = {}, italics = {}, orientations = {}'.format(len(unique_fonts), len(unique_fontVariants),
       len(unique_m_labels), len(unique_strengths),
       len(unique_italics), len(unique_orientations)))


print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p94_knearest_neighbors.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' knearest_neighbors.py
The k-nearest-neighbor classifier memorizes the training set. When the
class label of a new sample is to be predicted, the distance, typically
the Euclidean distance, to some number, say 5, of the nearest memorized
points is found. The class label of the new point is that of the
majority of its nearest neighbors.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
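
With metric='minkowski' and p=2, as used below, that distance is the
Euclidean distance, the p=2 case of

    d(x, z) = (sum_i |x_i - z_i|**p) ** (1/p)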
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)


sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=knn,
                                labels=labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='k_nearest_neighbors')
print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p124_random_forest_feature_importance.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' random_forest_feature_importance.py

Using a random forest (which constructs a strong learner from weak
learners), the importance of each feature is evaluated by measuring the
impurity decrease it produces, averaged over each of 10000 trees.


Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
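
The importances reported by scikit-learn are normalized to sum to one. A
common follow-up (not done here) is to keep only the features above some
threshold, e.g. with sklearn.feature_selection.SelectFromModel:

    from sklearn.feature_selection import SelectFromModel
    sfm = SelectFromModel(forest, threshold=0.05, prefit=True)
    X_selected = sfm.transform(X_train)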
@author: richard lyman
'''
import numpy as np
import ocr_utils
import matplotlib.pyplot as plt


y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, test_size=0.3,random_state=0)


from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=10000,
                                random_state=0,
                                n_jobs=-1)

forest.fit(X_train, y_train)
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            indices[f],
                            importances[indices[f]]))

title = 'Feature Importances from Random Forest'
plt.title(title)
plt.bar(range(X_train.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')

plt.xticks(range(X_train.shape[1]),
           indices, rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.xlabel('column sums')
plt.tight_layout()
ocr_utils.show_figures(plt,title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p186_grid_search.py:
--------------------------------------------------------------------------------
'''
Created on Jul 8, 2016 grid_search.py
Grid search does a brute-force train and test of sample data, trying
every combination in a grid of parameters.

The SVM attempts to maximize the margin between linearly separable
feature sets.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
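
The cost grows multiplicatively: with the eight-value param_range below,
the grid holds 8 linear candidates plus 8*8 rbf candidates, each fit cv=10
times, i.e. (8 + 64) * 10 = 720 fits.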
@author: richard lyman
'''
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

if __name__ == '__main__':

    y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0)

    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC(random_state=1))])

    param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

    param_grid = [{'clf__C': param_range,
                   'clf__kernel': ['linear']},
                  {'clf__C': param_range,
                   'clf__gamma': param_range,
                   'clf__kernel': ['rbf']}]

    gs = GridSearchCV(estimator=pipe_svc,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=10,
                      n_jobs=-1)
    gs = gs.fit(X_train, y_train)

    print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_))
    print('Support Vector Machine Grid Search best params: {}'.format(sorted(gs.best_params_.items())))

    clf = gs.best_estimator_
    clf.fit(X_train, y_train)
    print('Support Vector Machine Test accuracy: %.3f' % clf.score(X_test, y_test))

    print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p78_support_vector_machine_gamma.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' support_vector_machine_gamma.py illustrates changing the gamma parameter
for an SVM. This is a cut-off parameter for the Gaussian sphere; a
higher value tightens the decision boundary around the samples.

Run the SVM with two values of gamma and plot the decision regions.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
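
The rbf kernel used below is

    K(x, z) = exp(-gamma * ||x - z||**2)

so gamma acts as an inverse influence radius: the larger it is, the faster
a training sample's similarity falls off with distance.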
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))


svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                labels = labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='SVM with gamma 0.2')

svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)

ocr_utils.plot_decision_regions(X=X_combined_std,
                                y=y_combined,
                                classifier=svm,
                                labels = labels,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                title='SVM with gamma 100')


print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p190_confusion_matrix.py:
--------------------------------------------------------------------------------
''' confusion_matrix.py

A confusion matrix shows a map of true positives, false positives, false
negatives, and true negatives for a decision.

Some decisions require a biased output where we may want to reduce
the number of false positives, for instance. This is especially true
in medical diagnosis.


Created on Jul 8, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
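
For this binary case scikit-learn lays the matrix out with true labels as
rows and predicted labels as columns:

    [[TN, FP],
     [FN, TP]]

so, for example, the false positive rate is FP / (FP + TN).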
@author: richard lyman
'''

import matplotlib.pyplot as plt
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0)

pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1))])

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('predicted label')
plt.ylabel('true label')
title='c5_confusion_matrix'
plt.title(title)
plt.tight_layout()
ocr_utils.show_figures(plt,title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/q0_simple_e13b_display.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''simple_e13b_display.py displays a plot of the characters in the E13B font.

See the explanation of the E13B character set in ocr_utils.load_E13B.

Created on Jun 20, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
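
The "column sum" features used throughout these examples are just the
vertical pixel sums of the 20x20 character image; for column c:

    column_sum[c] = image2D[:, c].sum()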
@author: richard lyman

'''
import ocr_utils
import numpy as np

#############################################################################
# read images and scatter plot

# retrieve 256 sets of target numbers and column sums
# y: the ascii characters 48 and 49 ('0', '1')
# X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17
ascii_characters_to_train = (48,49)
columnsXY = (9,17)
y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train=ascii_characters_to_train , columns=columnsXY,nChars=256)

# put the ASCII equivalent of the unique characters in y into the legend of the plot
legend=[]
for ys in np.unique(y):
    legend.append('{} \'{}\''.format(ys, chr(ys)))

ocr_utils.scatter_plot(X=X,
                       y=y,
                       legend_entries=legend,
                       axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))],
                       title='E13B sum of columns')

#############################################################################
# read and show character images for '0' and '1'
# select the digits in columnsXY in the E13B font

fd = {'m_label': ascii_characters_to_train, 'font': 'E13B'}

# output only the character label and the image
fl = ['m_label','image']

# read the complete image (20x20) = 400 pixels for each character
ds = ocr_utils.read_data(input_filters_dict=fd, output_feature_list=fl, dtype=np.int32)
y,X = ds.train.features

# change to a 2D shape
X=np.reshape(X,(X.shape[0],ds.train.num_rows, ds.train.num_columns))
ocr_utils.montage(X,title='some E13B Characters')



print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p124_random_forest.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
''' random_forest.py
The random forest uses weak learners to build a strong learner.

A random subset of samples is drawn, and then at each node a decision
tree is grown from a smaller subset of those bootstrap samples.

This is repeated a number of times, and the decision trees are then
combined via majority vote.

1) run the random forest on the E13B data
2) plot the decision regions


Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler

if __name__ == '__main__':

    y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0)


    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    X_combined_std = np.vstack((X_train_std, X_test_std))
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=10,
                                    random_state=1,
                                    n_jobs=2)
    forest.fit(X_train, y_train)

    ocr_utils.plot_decision_regions(X=X_combined,
                                    y=y_combined,
                                    classifier=forest,
                                    labels=labels,
                                    test_idx=range(len(X_train),len(X_combined)),
                                    title='random_forest')

    print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p51_standard_scalar.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
'''standard_scalar.py illustrates feature scaling with the sklearn tools.
1) Column sums from the E13B dataset are read in as features.
2) The features are scaled with the sklearn StandardScaler.
3) The scaled features are then fitted to a Perceptron and the decision
   regions are plotted.

Created on Jun 23, 2016

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
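
StandardScaler standardizes each feature column to zero mean and unit
variance using the training-set statistics:

    x_std = (x - mean_train) / std_train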
@author: richard lyman
'''
import numpy as np
import ocr_utils
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split


#############################################################################
# read images and scatter plot

# retrieve 500 sets of target numbers and column sums
# y: the ascii characters 48 and 51 ('0', '3')
# X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17

y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17),nChars=500, random_state=0)

print('Class labels:', np.unique(y))


#############################################################################
# standardize the features

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)
ppn.fit(X_train_std, y_train)

y_pred = ppn.predict(X_test_std)
print('Misclassified samples: %d' % (y_test != y_pred).sum())

from sklearn.metrics import accuracy_score

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

ocr_utils.plot_decision_regions(X_combined_std, y_combined, ppn,
                                test_idx=range(len(X_train_std),len(X_combined_std)),
                                labels=labels,
                                title='perceptron_scikit')



print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p330_dendrogram.py:
--------------------------------------------------------------------------------
'''
Created on Jul 21, 2016

A dendrogram is a diagram that shows the cluster composition of a dataset
by showing clusters as levels in a tree. At each node, a cluster is split
into subclusters. This is somewhat like the separation of the data by a
Decision Tree.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''


import ocr_utils
import matplotlib.pyplot as plt

##############################################
# separate the original images by cluster
# print(km.cluster_centers_.shape)

n=300

variables = ['X', 'Y', 'Z']
labels = ['ID_0','ID_1','ID_2','ID_3','ID_4']

chars_to_train = range(48,51)
columnsXY=(9,17)
column_str = 'column_sum{}'.format(list(columnsXY))

input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}

# output the character label and the image and column sums
output_feature_list = ['m_label','image', column_str]

# read the complete image (20x20) = 400 pixels for each character
ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
                         output_feature_list=output_feature_list,
                         random_state=0)

y = ds.train.features[0][:n]
X_image = ds.train.features[1][:n]
X = ds.train.features[2][:n]

from scipy.spatial.distance import pdist

row_dist = pdist(X, metric='euclidean')
print(row_dist)

from scipy.cluster.hierarchy import linkage

# method 1: using a condensed distance matrix
row_clusters = linkage(row_dist, method='complete', metric='euclidean')

print (row_clusters)

# method 2: using the raw data
row_clusters = linkage(X, method='complete', metric='euclidean')

print()
print (row_clusters)

from scipy.cluster.hierarchy import dendrogram

# make dendrogram black (part 1/2)
# from scipy.cluster.hierarchy import set_link_color_palette
# set_link_color_palette(['black'])

row_dendr = dendrogram(row_clusters, p=12, truncate_mode = 'lastp')

plt.tight_layout()
plt.ylabel('Euclidean distance')
#plt.savefig('./figures/dendrogram.png', dpi=300,
#            bbox_inches='tight')
title = "Dendrogram"
plt.title(title)
ocr_utils.show_figures(plt, title)

print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/n0_network.py:
--------------------------------------------------------------------------------
#import tensorflow as tf
from tensorflow.compat import v1 as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
from collections import namedtuple
import datetime
import ocr_utils

class base_network(object):
    ''' definition of the network
    '''


    def fit(self, truthed_data, nEpochs=5000):

        perfect_count=10
        for i in range(nEpochs):

            batch = truthed_data.next_batch(100)
            # assign feature data to each placeholder
            # the batch list is returned in the same order as the features requested
            feed = {self._keep_prob: 0.5}
            for j in range(truthed_data.num_features):
                feed[self._ph[j]] = batch[j]

            if i%100 == 0:

                feed[self._keep_prob] = 1.0
                result = self._sess.run([self._merged, self._accuracy ], feed_dict=feed)
                summary_str = result[0]

                self._writer.add_summary(summary_str, i)
                train_accuracy = result[1]
                # stop early after ten consecutive (near-)perfect evaluations
                if train_accuracy <= (1.0 - 1e-5 ):
                    perfect_count=10
                else:
                    perfect_count -= 1
                    if perfect_count==0:
                        break

                print ("step %d, training accuracy %g"%(i, train_accuracy),flush=True)
            self._sess.run(self._train_step,feed_dict=feed)


    def test(self, truthed_data, title = ''):

        # collect the images that were misclassified
        error_images = np.empty((0,self._nRows,self._nCols))

        test_accuracy=0
        m=0

        for i in range(int(len(truthed_data.features[0])/100)):

            batch = truthed_data.next_batch(100)
            # assign feature data to each placeholder
            # the batch list is returned in the same order as the features requested
            feed = {self._keep_prob: 1.0}
            for j in range(truthed_data.num_features):
                feed[self._ph[j]] = batch[j]

            result = self._sess.run([self._accuracy, self._x_image, self._correct_prediction], feed_dict=feed)

            test_accuracy += result[0]
            error_images = np.append(error_images, result[1][:,:,:,0][result[2]==False],axis=0)
            m += 1
        try:
            print ("test accuracy {} for : {}".format(test_accuracy/m, title),flush=True)
            ocr_utils.montage(error_images,title='TensorFlow {} Error Images'.format(title))
        except:
            if m==0:
                print ("test accuracy 1",flush=True)
            else:
                print ("test accuracy {}".format(test_accuracy/m),flush=True)
                ocr_utils.montage(error_images,title='TensorFlow Error Images')


    def predict(self, truthed_features):
        feed={self._keep_prob: 1.0}
        # assign feature data to each placeholder
        for j in range(1,truthed_features.num_features):
            feed[self._ph[j]] = truthed_features.features[j]
        result = self._sess.run([self._prediction], feed_dict=feed)

        return result[0]

--------------------------------------------------------------------------------
/p411_keras.py:
--------------------------------------------------------------------------------
'''
Created on Jul 22, 2016

keras.py implements a neural network with the Keras framework.

This example trains using the Handprint images.

from Python Machine Learning by Sebastian Raschka under the following license

The MIT License (MIT)

Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@author: richard lyman
'''

import ocr_utils
import numpy as np


def do_keras(X_train, X_test, y_train_ohe, y_train, y_test):
    print('First 3 labels: ', y_train[:3])

    print('\nFirst 3 labels (one-hot):\n', y_train_ohe[:3])

    from keras.models import Sequential
    from keras.layers.core import Dense
    from keras.optimizers import SGD

    np.random.seed(1)

    model = Sequential()
    model.add(Dense(input_dim=X_train.shape[1],
                    units=50,
                    activation='tanh'))

    model.add(Dense(input_dim=50,
                    units=50,
                    activation='tanh'))

    model.add(Dense(input_dim=50,
                    units=y_train_ohe.shape[1],
                    activation='softmax'))

    sgd = SGD(lr=0.001, decay=1e-7, momentum=.9)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=["accuracy"])

    model.fit(X_train, y_train_ohe,
              epochs=50,
              batch_size=300,
              verbose=2,
              validation_split=0.1
              )
    y_train_pred = model.predict_classes(X_train, verbose=0)
    print('First 3 predictions: ', y_train_pred[:3])
    train_acc = np.sum(y_train == y_train_pred, axis=0) / X_train.shape[0]
    print('Training accuracy: %.2f%%' % (train_acc * 100))


    y_test_pred = model.predict_classes(X_test, verbose=0)
    test_acc = np.sum(y_test == y_test_pred, axis=0) / X_test.shape[0]
    print('Test accuracy: %.2f%%' % (test_acc * 100))

input_filters_dict = {'font': ('HANDPRINT',)}
output_feature_list = ['m_label_one_hot','image','m_label']
ds = ocr_utils.read_data(input_filters_dict = input_filters_dict,
                         output_feature_list=output_feature_list,
                         engine_type='keras',
                         test_size = .1,
                         dtype=np.float32,
                         random_state=0)

X_train = ds.train.features[1]
X_test = ds.test.features[1]
y_train_ohe = ds.train.features[0]
y_train = ds.train.features[2]-48
y_test = ds.test.features[2]-48
do_keras(X_train, X_test, y_train_ohe, y_train, y_test)


print ('\n########################### No Errors ####################################')

--------------------------------------------------------------------------------
/p322_silhouette_plots.py:
--------------------------------------------------------------------------------
'''
Created on Jul 18, 2016
silhouette.py

A silhouette plot shows how tightly the samples are bound to the single
centroid selected for them by k-means and how well they are separated
from the other clusters.

Typically the cohesion and dissimilarity coefficients that make up the
silhouette are calculated using the Euclidean distance.

This program draws a silhouette plot using a small number of clusters.
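
For each sample the silhouette coefficient combines cohesion a (the mean
intra-cluster distance) and separation b (the mean distance to the samples
of the nearest other cluster):

    s = (b - a) / max(a, b)        # so -1 <= s <= 1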
13 | 14 | from Python Machine Learning by Sebastian Raschka under the following license 15 | 16 | The MIT License (MIT) 17 | 18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 37 | 38 | @author: richard lyman 39 | ''' 40 | 41 | import numpy as np 42 | import ocr_utils 43 | import matplotlib.pyplot as plt 44 | n=1000 45 | 46 | chars_to_train = (48,50) 47 | columnsXY=(9,17) 48 | column_str = 'column_sum{}'.format(list(columnsXY)) 49 | skewRange = np.linspace(-0.5,0.5,81) 50 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 51 | 52 | # output the character label and the image and column sums 53 | output_feature_list = ['m_label','image',column_str] 54 | 55 | # read the complete image (20x20) = 400 pixels for each character 56 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 57 | output_feature_list=output_feature_list, 58 | random_state=0) 59 | 60 | y = ds.train.features[0][:n] 61 | X_image = ds.train.features[1][:n] 62 | X = ds.train.features[2][:n] 63 | 64 | from matplotlib import cm 65 | from sklearn.metrics import silhouette_samples 66 | from sklearn.cluster import KMeans 67 | 68 | km = KMeans(n_clusters=2, 69 | init='k-means++', 70 | n_init=10, 71 | max_iter=300, 72 | tol=1e-04, 73 | random_state=0) 74 | y_km = km.fit_predict(X) 75 | 76 | cluster_labels = np.unique(y_km) 77 | n_clusters = cluster_labels.shape[0] 78 | silhouette_vals = silhouette_samples(X, y_km, metric='euclidean') 79 | y_ax_lower, y_ax_upper = 0, 0 80 | yticks = [] 81 | for i, c in enumerate(cluster_labels): 82 | c_silhouette_vals = silhouette_vals[y_km == c] 83 | c_silhouette_vals.sort() 84 | y_ax_upper += len(c_silhouette_vals) 85 | color = cm.jet(i / n_clusters) 86 | plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0, 87 | edgecolor='none', color=color) 88 | 89 | yticks.append((y_ax_lower + y_ax_upper) / 2) 90 | y_ax_lower += len(c_silhouette_vals) 91 | 92 | silhouette_avg = np.mean(silhouette_vals) 93 | plt.axvline(silhouette_avg, color="red", linestyle="--") 94 | 95 | plt.yticks(yticks, cluster_labels + 1) 96 | plt.ylabel('Cluster') 97 | plt.xlabel('Silhouette coefficient') 98 | title = 'Silhouettes' 99 | plt.title(title) 100 | plt.tight_layout() 101 | ocr_utils.show_figures(plt, title) 102 | print ('\n########################### No Errors ####################################') 103 | 
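# Editor's addition (a minimal sketch, not part of the original example): the mean
# silhouette computed by sklearn's silhouette_score can be used to compare candidate
# cluster counts on the same feature matrix X used above.
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    labels_k = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0).fit_predict(X)
    print('k={} mean silhouette={:.3f}'.format(k, silhouette_score(X, labels_k, metric='euclidean')))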
-------------------------------------------------------------------------------- /p193_model_precision_recall.py: -------------------------------------------------------------------------------- 1 | '''model_precision_recall.py 2 | 3 | Precision and recall are measures of true positives. 4 | Precision is also called positive predictive value. 5 | Precision is "how useful the search results are". 6 | 7 | Recall is also called sensitivity. 8 | Recall is "how complete the results are". 9 | 10 | Combining them gives the F1 score, 11 | the harmonic mean of Precision and Recall (see the reference note at the end of this docstring). 12 | 13 | Given a couple of E13B characters, compute the precision and recall values. 14 | Make a scorer using the F1 measure as the score and use 15 | grid search to find the parameters that give the highest F1 measure. 16 | 17 | 18 | 19 | Created on Jul 9, 2016 20 | 21 | from Python Machine Learning by Sebastian Raschka under the following license 22 | 23 | The MIT License (MIT) 24 | 25 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 26 | 27 | Permission is hereby granted, free of charge, to any person obtaining a copy 28 | of this software and associated documentation files (the "Software"), to deal 29 | in the Software without restriction, including without limitation the rights 30 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 31 | copies of the Software, and to permit persons to whom the Software is 32 | furnished to do so, subject to the following conditions: 33 | 34 | The above copyright notice and this permission notice shall be included in all 35 | copies or substantial portions of the Software. 36 | 37 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 38 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 39 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 40 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 41 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 42 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 43 | SOFTWARE.
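(Editor's reference note, added: with TP, FP and FN the true-positive, false-positive and false-negative counts,

    Precision = TP / (TP + FP)
    Recall    = TP / (TP + FN)
    F1        = 2 * Precision * Recall / (Precision + Recall)

so F1 is high only when both precision and recall are high.)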
44 | 45 | @author: richard lyman 46 | ''' 47 | 48 | import ocr_utils 49 | from sklearn.preprocessing import StandardScaler 50 | from sklearn.pipeline import Pipeline 51 | from sklearn.svm import SVC 52 | from sklearn.model_selection import GridSearchCV 53 | from sklearn.metrics import make_scorer,precision_score, recall_score, f1_score 54 | from sklearn.model_selection import train_test_split 55 | 56 | if __name__ == '__main__': 57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) 58 | 59 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 60 | 61 | pipe_svc = Pipeline([('scl', StandardScaler()), 62 | ('clf', SVC(random_state=1))]) 63 | 64 | param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] 65 | 66 | param_grid = [{'clf__C': param_range, 67 | 'clf__kernel': ['linear']}, 68 | {'clf__C': param_range, 69 | 'clf__gamma': param_range, 70 | 'clf__kernel': ['rbf']}] 71 | pipe_svc.fit(X_train, y_train) 72 | y_pred = pipe_svc.predict(X_test) 73 | 74 | pos_label=y_train[0] 75 | print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 76 | print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 77 | print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred, pos_label=pos_label)) 78 | 79 | scorer = make_scorer(f1_score, pos_label=pos_label) 80 | 81 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 82 | 83 | param_grid = [{'clf__C': c_gamma_range, 84 | 'clf__kernel': ['linear']}, 85 | {'clf__C': c_gamma_range, 86 | 'clf__gamma': c_gamma_range, 87 | 'clf__kernel': ['rbf'],}] 88 | 89 | gs = GridSearchCV(estimator=pipe_svc, 90 | param_grid=param_grid, 91 | scoring=scorer, 92 | cv=10, 93 | n_jobs=-1) 94 | gs = gs.fit(X_train, y_train) 95 | print('\nGrid Search f1 scoring best score: {}'.format(gs.best_score_)) 96 | print('Grid Search f1 scoring best params: {}'.format(sorted(gs.best_params_.items()))) 97 | print ('\n########################### No Errors ####################################') 98 | -------------------------------------------------------------------------------- /q2_tensorflow_mnist.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | The tutorial version of the program is modified in order to send some 22 | features directly to the fully connected layer, thus bypassing the 23 | convolution layer. 24 | 25 | Images go through convolution. Everything else bypasses. 
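(Editor's note, a sketch of the idea rather than the network's actual code: after the convolution stack is flattened, the bypassed scalar features are appended before the fully connected layer, conceptually

    fc_input = tf.concat([conv_out_flat, italic, aspect_ratio, upper_case], axis=1)

where the tensor names are illustrative and follow output_feature_list below.)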
26 | 27 | see tensor_flow_graph.png 28 | """# ============================================================================== 29 | 30 | import ocr_utils 31 | import datetime 32 | from collections import namedtuple 33 | import numpy as np 34 | import pandas as pd 35 | import n1_2cnv1fc as nnetwork 36 | from tensorflow.compat import v1 as tf 37 | #import tf 38 | dtype = np.float32 39 | 40 | if True: 41 | # single font train 42 | 43 | # examples 44 | # select only images from the 'OCRA' font 45 | # input_filters_dict = {'font': ('OCRA',)} 46 | 47 | # select only images from 'HANDPRINT' font 48 | #input_filters_dict = {'font': ('HANDPRINT',)} 49 | 50 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned' fontVariant 51 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 52 | 53 | # select everything; all fonts, font variants, etc. 54 | #input_filters_dict = {} 55 | 56 | # select the digits 0 through 9 in the E13B font 57 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 58 | 59 | # select the digits 0 and 2 in the E13B font 60 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 61 | 62 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 63 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 64 | 65 | # output only the character label and the image 66 | # output_feature_list = ['m_label_one_hot','image'] 67 | 68 | # identify the font given the input images 69 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 70 | 71 | # train the digits 0-9 for all fonts 72 | input_filters_dict = {'m_label': range(48,58)} 73 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 74 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 75 | output_feature_list=output_feature_list, 76 | test_size = .1, 77 | engine_type='tensorflow',dtype=dtype) 78 | nn = nnetwork.network(ds.train) 79 | nn.fit( ds.train, nEpochs=5000) 80 | nn.test(ds.test) 81 | 82 | else: 83 | # loop through all the fonts and train individually 84 | 85 | # pick up the entire list of fonts and font variants. Train each one. 86 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 87 | 88 | import pprint as pprint 89 | pp = pprint.PrettyPrinter(indent=4) 90 | pp.pprint(df1) 91 | 92 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 93 | 94 | # Change nEpochs to 5000 for better results. NOTE (editor): train_a_font is a legacy helper that is not defined in this file; see the inline read_data/fit/test loop in q5_tensorflow_residual.py for the equivalent code 95 | for l in df1: 96 | input_filters_dict= {'font': (l[0],)} 97 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 98 | 99 | 100 | print ('\n########################### No Errors ####################################') 101 | 102 | -------------------------------------------------------------------------------- /p110_scaling_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' scaling_features.py 3 | 4 | Investigate normalization versus standardization 5 | 6 | The features in the ocr_utils are already normalized. That is, each image 7 | has been stretched to go from pure black to pure white. The values in the 8 | .csv file are 0 to 255. ocr_utils.py changes these values to be in the range 9 | 0.0 to 1.0 10 | 11 | 1) print out a sampling of the normalized values. 12 | 2) standardize the values and print them out (reference formulas below).
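(Editor's reference note, added: min-max normalization maps a feature value x to

    x_norm = (x - x_min) / (x_max - x_min)

while standardization maps it to

    x_std = (x - mu) / sigma

with mu and sigma the mean and standard deviation of the feature, computed on the training set only and then reused on the test set, exactly as the fit_transform()/transform() pairs below do.)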
13 | 14 | Created on Jun 23, 2016 15 | 16 | from Python Machine Learning by Sebastian Raschka under the following license 17 | 18 | The MIT License (MIT) 19 | 20 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 21 | 22 | Permission is hereby granted, free of charge, to any person obtaining a copy 23 | of this software and associated documentation files (the "Software"), to deal 24 | in the Software without restriction, including without limitation the rights 25 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 26 | copies of the Software, and to permit persons to whom the Software is 27 | furnished to do so, subject to the following conditions: 28 | 29 | The above copyright notice and this permission notice shall be included in all 30 | copies or substantial portions of the Software. 31 | 32 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 33 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 34 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 35 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 36 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 37 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 38 | SOFTWARE. 39 | 40 | @author: richard lyman 41 | ''' 42 | import numpy as np 43 | import ocr_utils 44 | from sklearn.neighbors import KNeighborsClassifier 45 | 46 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0) 47 | 48 | # put the ASCII equivalent of the unique characters in y into the legend of the plot 49 | legend=[] 50 | for ys in np.unique(y_train): 51 | legend.append('{} \'{}\''.format(ys, chr(ys))) 52 | 53 | X_combined = np.vstack((X_train, X_test)) 54 | y_combined = np.hstack((y_train, y_test)) 55 | 56 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 57 | knn.fit(X_train, y_train) 58 | 59 | 60 | ocr_utils.plot_decision_regions(X=X_combined, 61 | y=y_combined, 62 | classifier=knn, 63 | labels=labels, 64 | test_idx=range(len(X_test),len(X_combined)), 65 | title='k_nearest_neighbors no scaling') 66 | 67 | from sklearn.preprocessing import MinMaxScaler 68 | 69 | mms = MinMaxScaler() 70 | X_train_norm = mms.fit_transform(X_train) 71 | X_test_norm = mms.transform(X_test) 72 | X_combined_norm = np.vstack((X_train_norm, X_test_norm)) 73 | 74 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 75 | knn.fit(X_train_norm, y_train) 76 | 77 | 78 | ocr_utils.plot_decision_regions(X=X_combined_norm, 79 | y=y_combined, 80 | classifier=knn, 81 | labels=labels, 82 | test_idx=range(len(X_test_norm),len(X_combined_norm)), 83 | title='k_nearest_neighbors MinMaxScaller') 84 | 85 | from sklearn.preprocessing import StandardScaler 86 | stdsc = StandardScaler() 87 | X_train_std = stdsc.fit_transform(X_train) 88 | X_test_std = stdsc.transform(X_test) 89 | 90 | sc = StandardScaler() 91 | sc.fit(X_train) 92 | X_train_std = sc.transform(X_train) 93 | X_test_std = sc.transform(X_test) 94 | X_combined_std = np.vstack((X_train_std, X_test_std)) 95 | 96 | knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski') 97 | knn.fit(X_train_std, y_train) 98 | 99 | ocr_utils.plot_decision_regions(X=X_combined_std, 100 | y=y_combined, 101 | classifier=knn, 102 | labels=labels, 103 | test_idx=range(len(X_test_std),len(X_combined_std)), 104 | title='k_nearest_neighbors 
Standard Normalized') 105 | 106 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q6_tensorflow_residual3x4.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | This graph has multiple sections 3 layers each, 400 100 400 followed 22 | by a fully connected layer. 23 | 24 | see tensor_flow_graph.png 25 | """# ============================================================================== 26 | import ocr_utils 27 | import datetime 28 | from collections import namedtuple 29 | import numpy as np 30 | import pandas as pd 31 | import n1_residual3x4 as nnetwork 32 | from tensorflow.compat import v1 as tf 33 | dtype = np.float32 34 | #with tf.device('/GPU:0'): 35 | #with tf.device('/cpu:0'): 36 | 37 | 38 | if True: 39 | # single font train 40 | 41 | # examples 42 | # select only images from 'OCRB' scanned font 43 | # input_filters_dict = {'font': ('OCRA',)} 44 | 45 | # select only images from 'HANDPRINT' font 46 | #input_filters_dict = {'font': ('HANDPRINT',)} 47 | 48 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 49 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 50 | 51 | # select everything; all fonts , font variants, etc. 
52 | # input_filters_dict = {} 53 | 54 | # select the digits 0 through 9 in the E13B font 55 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 56 | 57 | # select the digits 0 and 2 in the E13B font 58 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 59 | 60 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 61 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 62 | 63 | # output only the character label and the image 64 | # output_feature_list = ['m_label_one_hot','image'] 65 | 66 | # identify the font given the input images 67 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 68 | 69 | # train the digits 0-9 for all fonts 70 | #input_filters_dict = {'m_label': range(48,58)} 71 | input_filters_dict = {'font':'ARIAL','m_label': list(range(48,58))+list(range(65,91))+list(range(97,123))} 72 | #input_filters_dict = {} 73 | output_feature_list = ['m_label_one_hot','image'] 74 | 75 | """# ============================================================================== 76 | 77 | Train and Evaluate the Model 78 | 79 | """# ============================================================================== 80 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 81 | output_feature_list=output_feature_list, 82 | test_size = .1, 83 | engine_type='tensorflow',dtype=dtype) 84 | nn = nnetwork.network(ds.train) 85 | nn.fit( ds.train, nEpochs=5000) 86 | nn.test(ds.test) 87 | 88 | # train_a_font(input_filters_dict, output_feature_list, nEpochs = 50000) 89 | 90 | else: 91 | # loop through all the fonts and train individually 92 | 93 | # pick up the entire list of fonts and font variants. Train each one. 94 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 95 | 96 | import pprint as pprint 97 | pp = pprint.PrettyPrinter(indent=4) 98 | pp.pprint(df1) 99 | 100 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 101 | 102 | # Change nEpochs to 5000 for better results. NOTE (editor): train_a_font is a legacy helper that is not defined in this file; see the inline read_data/fit/test loop in q5_tensorflow_residual.py for the equivalent code 103 | for l in df1: 104 | input_filters_dict= {'font': (l[0],)} 105 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 106 | 107 | 108 | print ('\n########################### No Errors ####################################') 109 | 110 | -------------------------------------------------------------------------------- /p229_adaboost.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 10, 2016 3 | adaboost.py 4 | 5 | AdaBoost builds a classifier from weak learners such as decision tree stumps: 6 | it draws training set samples without replacement, trains a stump, finds the 7 | samples that are misclassified, adds another stump trained to handle 8 | those difficult samples, and updates the weights applied to each stump 9 | when computing the final prediction. 10 | 11 | It increasingly emphasizes the weights of misclassified (outlier) samples until 12 | the result is a sequence of weights and decision trees that handles those samples.
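(Editor's reference note, added; this is the classic two-class discrete AdaBoost update, which sklearn's SAMME algorithm generalizes: a stump with weighted error eps receives the coefficient

    alpha = 0.5 * ln((1 - eps) / eps)

and each sample weight is multiplied by exp(-alpha * y_i * h(x_i)) and then renormalized, so misclassified samples gain weight in the next round.)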
13 | 14 | from Python Machine Learning by Sebastian Raschka under the following license 15 | 16 | The MIT License (MIT) 17 | 18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy 21 | of this software and associated documentation files (the "Software"), to deal 22 | in the Software without restriction, including without limitation the rights 23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 | copies of the Software, and to permit persons to whom the Software is 25 | furnished to do so, subject to the following conditions: 26 | 27 | The above copyright notice and this permission notice shall be included in all 28 | copies or substantial portions of the Software. 29 | 30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36 | SOFTWARE. 37 | 38 | @author: richard lyman 39 | ''' 40 | 41 | from sklearn.ensemble import AdaBoostClassifier 42 | 43 | import ocr_utils 44 | from sklearn.model_selection import train_test_split 45 | import numpy as np 46 | import matplotlib.pyplot as plt 47 | from sklearn.preprocessing import LabelEncoder 48 | 49 | charsToTrain=(48,51) 50 | nChars = 1000 51 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=(9,17), nChars=nChars) 52 | 53 | le = LabelEncoder() 54 | y = le.fit_transform(y) 55 | 56 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.40,random_state=1) 57 | 58 | 59 | from sklearn.tree import DecisionTreeClassifier 60 | from sklearn.metrics import accuracy_score 61 | tree = DecisionTreeClassifier(criterion='entropy', 62 | max_depth=1) 63 | 64 | ada = AdaBoostClassifier(base_estimator=tree, 65 | n_estimators=500, 66 | learning_rate=0.1, 67 | random_state=0) 68 | 69 | tree = tree.fit(X_train, y_train) 70 | y_train_pred = tree.predict(X_train) 71 | y_test_pred = tree.predict(X_test) 72 | 73 | tree_train = accuracy_score(y_train, y_train_pred) 74 | tree_test = accuracy_score(y_test, y_test_pred) 75 | print('Decision tree train/test accuracies %.3f/%.3f' 76 | % (tree_train, tree_test)) 77 | 78 | ada = ada.fit(X_train, y_train) 79 | y_train_pred = ada.predict(X_train) 80 | y_test_pred = ada.predict(X_test) 81 | 82 | ada_train = accuracy_score(y_train, y_train_pred) 83 | ada_test = accuracy_score(y_test, y_test_pred) 84 | print('AdaBoost train/test accuracies %.3f/%.3f' 85 | % (ada_train, ada_test)) 86 | 87 | 88 | 89 | x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1 90 | y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1 91 | xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/100), 92 | np.arange(y_min, y_max, (y_max-y_min)/100)) 93 | 94 | f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3)) 95 | 96 | title='AdaBoost' 97 | for idx, clf, tt in zip([0, 1], 98 | [tree, ada], 99 | ['Decision Tree', title]): 100 | clf.fit(X_train, y_train) 101 | 102 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 103 | Z = Z.reshape(xx.shape) 104 | 105 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 106 | 
axarr[idx].scatter(X_train[y_train==0, 0], 107 | X_train[y_train==0, 1], 108 | c='blue', marker='^') 109 | axarr[idx].scatter(X_train[y_train==1, 0], 110 | X_train[y_train==1, 1], 111 | c='red', marker='o') 112 | axarr[idx].set_title(tt) 113 | axarr[idx].set_ylabel(labels[0], fontsize=12) 114 | axarr[idx].set_xlabel(labels[1], fontsize=12) 115 | 116 | plt.tight_layout() 117 | 118 | ocr_utils.show_figures(plt, title) 119 | 120 | print ('\n########################### No Errors ####################################') 121 | -------------------------------------------------------------------------------- /p189_nested_cross_validation.py: -------------------------------------------------------------------------------- 1 | ''' nested_cross_validation.py 2 | Nested Cross Validation is a method for tuning model parameters while minimizing bias. 3 | 4 | There is an outer k-fold cross validation loop and an inner k-fold cross 5 | validation loop. 6 | 7 | The outer loop splits the data into a number, such as 10, of different training and 8 | test sets without replacement, so each sample ends up being used as a 9 | test sample exactly once. 10 | 11 | The inner fold uses the training portion of the outer fold, and does a 12 | Grid Search to select a classification model, such as a 'linear' versus an 'rbf' 13 | SVM kernel, or a Decision Tree versus an SVM. 14 | 15 | If the model is stable, then the inner loops should all choose the same 16 | classifier type. 17 | 18 | After selecting the classifier, the outer folds are used to estimate its 19 | performance via k-fold cross validation. 20 | 21 | This program nests sklearn's GridSearchCV (a 5-fold inner grid search) inside 22 | cross_val_score (a 5-fold outer loop) to tune and evaluate the parameters (see the note at the end of this docstring). 23 | 24 | 25 | Created on Jul 8, 2016 26 | 27 | from Python Machine Learning by Sebastian Raschka under the following license 28 | 29 | The MIT License (MIT) 30 | 31 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 32 | 33 | Permission is hereby granted, free of charge, to any person obtaining a copy 34 | of this software and associated documentation files (the "Software"), to deal 35 | in the Software without restriction, including without limitation the rights 36 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 37 | copies of the Software, and to permit persons to whom the Software is 38 | furnished to do so, subject to the following conditions: 39 | 40 | The above copyright notice and this permission notice shall be included in all 41 | copies or substantial portions of the Software. 42 | 43 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 44 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 45 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 46 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 47 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 48 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 49 | SOFTWARE.
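(Editor's note, added: in the code below, GridSearchCV(cv=5) is the inner loop and cross_val_score(..., cv=5) is the outer loop; each outer training fold is re-split five ways to pick hyperparameters, so every outer test fold scores a model that was tuned without ever seeing that fold.)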
50 | 51 | @author: richard lyman 52 | ''' 53 | 54 | import numpy as np 55 | import ocr_utils 56 | from sklearn.preprocessing import StandardScaler 57 | from sklearn.pipeline import Pipeline 58 | from sklearn.model_selection import cross_val_score 59 | from sklearn.model_selection import GridSearchCV 60 | from sklearn.svm import SVC 61 | if __name__ == '__main__': 62 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0) 63 | 64 | 65 | pipe_svc = Pipeline([('scl', StandardScaler()), 66 | ('clf', SVC(random_state=1))]) 67 | 68 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 69 | 70 | param_grid = [{'clf__C': c_gamma_range, 71 | 'clf__kernel': ['linear']}, 72 | {'clf__C': c_gamma_range, 73 | 'clf__gamma': c_gamma_range, 74 | 'clf__kernel': ['rbf'],}] 75 | 76 | gs = GridSearchCV(estimator=pipe_svc, 77 | param_grid=param_grid, 78 | scoring='accuracy', 79 | cv=5, 80 | n_jobs=-1) 81 | 82 | 83 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 84 | print('\nSupport Vector Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 85 | 86 | gs = gs.fit(X_train, y_train) 87 | print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_)) 88 | print('Support Vector Machine Grid Search best params: {}'.format(sorted(gs.best_params_.items()))) 89 | from sklearn.tree import DecisionTreeClassifier 90 | gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0), 91 | param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}], 92 | scoring='accuracy', 93 | cv=5) 94 | 95 | 96 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 97 | print('Decision Tree Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 98 | 99 | gs = gs.fit(X_train, y_train) 100 | print('Decision Tree Grid Search best score: {}'.format(gs.best_score_)) 101 | print('Decision Tree Grid Search best params: {}'.format(gs.best_params_)) 102 | 103 | print ('\n########################### No Errors ####################################') 104 | -------------------------------------------------------------------------------- /o4_image_to_image.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | This graph has multiple sections 3 layers each, 400 100 400 followed 22 | by a fully connected layer. 
23 | 24 | see tensor_flow_graph.png 25 | """# ============================================================================== 26 | import ocr_utils 27 | import datetime 28 | from collections import namedtuple 29 | import numpy as np 30 | import pandas as pd 31 | import n1_image_to_image as nnetwork 32 | #import n1_residual3x4 as nnetwork 33 | from tensorflow.compat import v1 as tf 34 | dtype = np.float32 35 | #with tf.device('/GPU:0'): 36 | #with tf.device('/cpu:0'): 37 | 38 | 39 | if True: 40 | # single font train 41 | 42 | # examples 43 | # select only images from 'OCRB' scanned font 44 | # input_filters_dict = {'font': ('OCRA',)} 45 | 46 | # select only images from 'HANDPRINT' font 47 | #input_filters_dict = {'font': ('HANDPRINT',)} 48 | 49 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 50 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 51 | 52 | # select everything; all fonts , font variants, etc. 53 | # input_filters_dict = {} 54 | 55 | # select the digits 0 through 9 in the E13B font 56 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 57 | 58 | # select the digits 0 and 2in the E13B font 59 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 60 | 61 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 62 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 63 | 64 | # output only the character label and the image 65 | # output_feature_list = ['m_label_one_hot','image'] 66 | 67 | # identify the font given the input images 68 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 69 | 70 | # train the digits 0-9 for all fonts 71 | input_filters_dict = {'m_label': [43]+list(range(48,58)),'italic':0,'strength':.4} 72 | #input_filters_dict = {'font':'BANKGOTHIC','m_label': list(range(48,58)),'italic':0,'strength':.7} 73 | #input_filters_dict = {} 74 | output_feature_list = ['low_pass_image','image'] 75 | 76 | """# ============================================================================== 77 | 78 | Train and Evaluate the Model 79 | 80 | """# ============================================================================== 81 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 82 | output_feature_list=output_feature_list, 83 | test_size = .2, 84 | engine_type='tensorflow',dtype=dtype) 85 | nn = nnetwork.network(ds.train) 86 | nn.fit_entropy( ds.train, nEpochs=5000) 87 | nn.test2(ds.test) 88 | 89 | # train_a_font(input_filters_dict, output_feature_list, nEpochs = 50000) 90 | 91 | else: 92 | # loop through all the fonts and train individually 93 | 94 | # pick up the entire list of fonts and font variants. Train each one. 
95 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 96 | 97 | import pprint as pprint 98 | pp = pprint.PrettyPrinter(indent=4) 99 | pp.pprint(df1) 100 | 101 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 102 | 103 | # Change nEpochs to 5000 for better results 104 | for l in df1: 105 | input_filters_dict= {'font': (l[0],)} 106 | train_a_font(input_filters_dict,output_feature_list, nEpochs = 5000) 107 | 108 | 109 | print ('\n########################### No Errors ####################################') 110 | 111 | -------------------------------------------------------------------------------- /p86_decision_tree.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' decision_tree.py shows three impurity measures Gini, entropy, and 3 | misclassification error when used with a decision tree classifier. 4 | 5 | These measures are used to estimate the information gain at each split. 6 | 7 | 1) plot the 3 kinds of impurity measure 8 | 2) run the decision tree on the e13b data and plot the decision regions 9 | 10 | To create a drawing of the tree run: 11 | dot -Tpng tree.dot -o tree.png 12 | 13 | Created on Jun 23, 2016 14 | 15 | from Python Machine Learning by Sebastian Raschka under the following license 16 | 17 | The MIT License (MIT) 18 | 19 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 20 | 21 | Permission is hereby granted, free of charge, to any person obtaining a copy 22 | of this software and associated documentation files (the "Software"), to deal 23 | in the Software without restriction, including without limitation the rights 24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 25 | copies of the Software, and to permit persons to whom the Software is 26 | furnished to do so, subject to the following conditions: 27 | 28 | The above copyright notice and this permission notice shall be included in all 29 | copies or substantial portions of the Software. 30 | 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 37 | SOFTWARE. 
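(Editor's reference note, added; binary case with p = p(i=1), matching the gini(), entropy() and error() helpers below:

    Gini(p)    = p*(1-p) + (1-p)*p = 2p(1-p)
    Entropy(p) = -p*log2(p) - (1-p)*log2(1-p)
    Error(p)   = 1 - max(p, 1-p)

Entropy peaks at 1.0 and Gini at 0.5 when p = 0.5, which is why the plot scales entropy by 0.5 to compare the curve shapes.)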
38 | 39 | @author: richard lyman 40 | ''' 41 | import numpy as np 42 | import ocr_utils 43 | import matplotlib.pyplot as plt 44 | from sklearn.preprocessing import StandardScaler 45 | 46 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17), test_size=0.3, nChars=300, random_state=0) 47 | 48 | def gini(p): 49 | return (p)*(1 - (p)) + (1-p)*(1 - (1-p)) 50 | 51 | def entropy(p): 52 | return - p*np.log2(p) - (1 - p)*np.log2((1 - p)) 53 | 54 | def error(p): 55 | return 1 - np.max([p, 1 - p]) 56 | 57 | x = np.arange(0.0, 1.0, 0.01) 58 | 59 | ent = [entropy(p) if p != 0 else None for p in x] 60 | sc_ent = [e*0.5 if e else None for e in ent] 61 | err = [error(i) for i in x] 62 | 63 | 64 | fig = plt.figure() 65 | 66 | ax = plt.subplot(111) 67 | for i, lab, ls, c, in zip([ent, sc_ent, gini(x), err], 68 | ['Entropy', 'Entropy (scaled)', 69 | 'Gini Impurity', 'Misclassification Error'], 70 | ['-', '-', '--', '-.'], 71 | ['black', 'lightgray', 'red', 'green', 'cyan']): 72 | line = ax.plot(x, i, label=lab, linestyle=ls, lw=2, color=c) 73 | 74 | ax.legend(loc='upper center', ncol=2, fancybox=True, shadow=False) 75 | 76 | ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--') 77 | ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--') 78 | plt.ylim([0, 1.2]) 79 | plt.xlabel('p(i=1)') 80 | plt.ylabel('Impurity Index') 81 | plt.tight_layout() 82 | title='impurity' 83 | plt.title(title) 84 | 85 | ocr_utils.show_figures(plt,title=title) 86 | 87 | sc = StandardScaler() 88 | sc.fit(X_train) 89 | X_train_std = sc.transform(X_train) 90 | X_test_std = sc.transform(X_test) 91 | X_combined_std = np.vstack((X_train_std, X_test_std)) 92 | y_combined = np.hstack((y_train, y_test)) 93 | 94 | from sklearn.tree import DecisionTreeClassifier 95 | 96 | tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0) 97 | tree.fit(X_train, y_train) 98 | 99 | X_combined = np.vstack((X_train, X_test)) 100 | y_combined = np.hstack((y_train, y_test)) 101 | ocr_utils.plot_decision_regions(X=X_combined, 102 | y=y_combined, 103 | classifier=tree, 104 | test_idx=range(len(X_test),len(X_combined)), 105 | labels=labels, 106 | title='decision tree entropy') 107 | 108 | 109 | from sklearn.ensemble import RandomForestClassifier 110 | 111 | forest = RandomForestClassifier(criterion='entropy', 112 | n_estimators=10, 113 | random_state=1, 114 | n_jobs=2) 115 | forest.fit(X_train, y_train) 116 | 117 | ocr_utils.plot_decision_regions(X=X_combined, 118 | y=y_combined, 119 | classifier=forest, 120 | labels=labels, 121 | test_idx=range(len(X_test_std),len(X_combined_std)), 122 | title='random_forest') 123 | 124 | 125 | 126 | 127 | 128 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q5_tensorflow_residual.py: -------------------------------------------------------------------------------- 1 | """# ========================================================================== 2 | 3 | # Copyright 2015 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | 18 | This sample program is a modified version of the Google mnist convolutional 19 | network tutorial example. See the mnist tutorial in www.tensorflow.org 20 | 21 | The tutorial version of the program is modified in order to send some 22 | features directly to the fully connected layer, thus bypassing the 23 | convolution layer. 24 | 25 | It has TWO convolution layers and THREE fully connected (2048 node) layers 26 | 27 | Images go through convolution. Everything else bypasses. 28 | 29 | see tensor_flow_graph.png 30 | """# ============================================================================== 31 | import ocr_utils 32 | import datetime 33 | from collections import namedtuple 34 | import numpy as np 35 | import pandas as pd 36 | import n1_2cnv1fc as nnetwork 37 | 38 | # import tensorflow as tf 39 | 40 | dtype = np.float32 41 | #with tf.device('/GPU:0'): 42 | #with tf.device('/cpu:0'): 43 | 44 | 45 | if False: 46 | # single font train 47 | 48 | # examples 49 | # select only images from 'OCRB' scanned font 50 | # input_filters_dict = {'font': ('OCRA',)} 51 | 52 | # select only images from 'HANDPRINT' font 53 | #input_filters_dict = {'font': ('HANDPRINT',)} 54 | 55 | # select only images from 'OCRA' and 'OCRB' fonts with the 'scanned" fontVariant 56 | # input_filters_dict = {'font': ('OCRA','OCRB'), 'fontVariant':('scanned',)} 57 | 58 | # select everything; all fonts , font variants, etc. 59 | # input_filters_dict = {} 60 | 61 | # select the digits 0 through 9 in the E13B font 62 | # input_filters_dict = {'m_label': range(48,58), 'font': 'E13B'} 63 | 64 | # select the digits 0 and 2in the E13B font 65 | # input_filters_dict = {'m_label': (48,50), 'font': 'E13B'} 66 | 67 | # output the character label, image, italic flag, aspect_ratio and upper_case flag 68 | # output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 69 | 70 | # output only the character label and the image 71 | # output_feature_list = ['m_label_one_hot','image'] 72 | 73 | # identify the font given the input images 74 | #output_feature_list = ['font_one_hot','image','italic','aspect_ratio','upper_case'] 75 | 76 | # train the digits 0-9 for all fonts 77 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))+list(range(97,123)),'fontVariant':'scanned'} 78 | #input_filters_dict = {} 79 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case'] 80 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 81 | output_feature_list=output_feature_list, 82 | test_size = .1, 83 | engine_type='tensorflow',dtype=dtype) 84 | nn = nnetwork.network( ds.train) 85 | nn.fit( ds.train, nEpochs=5000) 86 | nn.test(ds.test) 87 | 88 | else: 89 | # loop through all the fonts and train individually 90 | 91 | # pick up the entire list of fonts and font variants. Train each one. 
92 | df1 = ocr_utils.get_list(input_filters_dict={'font': ()}) 93 | 94 | import pprint as pprint 95 | pp = pprint.PrettyPrinter(indent=4) 96 | pp.pprint(df1) 97 | 98 | output_feature_list = ['m_label_one_hot','image','italic','aspect_ratio','upper_case','font_one_hot'] 99 | 100 | # Change nEpochs to 5000 for better results 101 | for l in df1: 102 | #input_filters_dict= {'font': (l[0],)} 103 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))+list(range(97,123)),'font': (l[0],)} 104 | #train_a_font(input_filters_dict,output_feature_list, nEpochs = 500) 105 | 106 | ds = ocr_utils.read_data(input_filters_dict = input_filters_dict, 107 | output_feature_list=output_feature_list, 108 | test_size = .1, 109 | engine_type='tensorflow',dtype=dtype) 110 | 111 | nn = nnetwork.network(ds.train) 112 | nn.fit( ds.train, nEpochs=5000) 113 | nn.test(ds.test, title = l[0] ) 114 | nn.reset_graph() 115 | 116 | 117 | 118 | 119 | print ('\n########################### No Errors ####################################') 120 | 121 | -------------------------------------------------------------------------------- /p221_bagging_bootstrap_samples.py: -------------------------------------------------------------------------------- 1 | '''bagging_bootstrap_samples.py 2 | 3 | Bagging draws samples with replacement in order to train classifiers that are 4 | then combined by majority voting (see the note at the end of this docstring). 5 | 6 | 7 | Created on Jul 10, 2016 8 | 9 | from Python Machine Learning by Sebastian Raschka under the following license 10 | 11 | The MIT License (MIT) 12 | 13 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 14 | 15 | Permission is hereby granted, free of charge, to any person obtaining a copy 16 | of this software and associated documentation files (the "Software"), to deal 17 | in the Software without restriction, including without limitation the rights 18 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 19 | copies of the Software, and to permit persons to whom the Software is 20 | furnished to do so, subject to the following conditions: 21 | 22 | The above copyright notice and this permission notice shall be included in all 23 | copies or substantial portions of the Software. 24 | 25 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 26 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 27 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 28 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 29 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 30 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 | SOFTWARE.
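(Editor's reference note, added: a bootstrap sample of size n drawn with replacement omits any given training sample with probability (1 - 1/n)^n, which approaches 1/e ≈ 0.368 for large n, so each of the 500 trees below sees roughly 63% of the distinct training samples.)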
32 | 33 | @author: richard lyman 34 | ''' 35 | 36 | from sklearn.preprocessing import LabelEncoder 37 | import ocr_utils 38 | from sklearn.model_selection import train_test_split 39 | import numpy as np 40 | import matplotlib.pyplot as plt 41 | if __name__ == '__main__': 42 | 43 | charsToTrain=(48,51) 44 | nChars = 1000 45 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=(9,17), nChars=nChars) 46 | 47 | le = LabelEncoder() 48 | y = le.fit_transform(y) 49 | 50 | X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.40,random_state=1) 51 | 52 | from sklearn.ensemble import BaggingClassifier 53 | from sklearn.tree import DecisionTreeClassifier 54 | 55 | tree = DecisionTreeClassifier(criterion='entropy', 56 | max_depth=None) 57 | 58 | bag = BaggingClassifier(base_estimator=tree, 59 | n_estimators=500, 60 | max_samples=1.0, 61 | max_features=1.0, 62 | bootstrap=True, 63 | bootstrap_features=False, 64 | n_jobs=-1, 65 | random_state=1) 66 | 67 | from sklearn.metrics import accuracy_score 68 | 69 | tree = tree.fit(X_train, y_train) 70 | y_train_pred = tree.predict(X_train) 71 | y_test_pred = tree.predict(X_test) 72 | 73 | tree_train = accuracy_score(y_train, y_train_pred) 74 | tree_test = accuracy_score(y_test, y_test_pred) 75 | print('Decision tree train/test accuracies %.3f/%.3f' 76 | % (tree_train, tree_test)) 77 | 78 | bag = bag.fit(X_train, y_train) 79 | y_train_pred = bag.predict(X_train) 80 | y_test_pred = bag.predict(X_test) 81 | 82 | bag_train = accuracy_score(y_train, y_train_pred) 83 | bag_test = accuracy_score(y_test, y_test_pred) 84 | print('Bagging train/test accuracies %.3f/%.3f' 85 | % (bag_train, bag_test)) 86 | 87 | x_min = X_train[:, 0].min() - 1 88 | x_max = X_train[:, 0].max() + 1 89 | y_min = X_train[:, 1].min() - 1 90 | y_max = X_train[:, 1].max() + 1 91 | 92 | xx, yy = np.meshgrid(np.arange(x_min, x_max, (x_max-x_min)/100), 93 | np.arange(y_min, y_max, (y_max-y_min)/100)) 94 | 95 | f, axarr = plt.subplots(nrows=1, ncols=2, 96 | sharex='col', 97 | sharey='row', 98 | figsize=(8, 3)) 99 | 100 | 101 | for idx, clf, tt in zip([0, 1], 102 | [tree, bag], 103 | ['Decision Tree', 'Bagging']): 104 | clf.fit(X_train, y_train) 105 | 106 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 107 | Z = Z.reshape(xx.shape) 108 | 109 | axarr[idx].contourf(xx, yy, Z, alpha=0.3) 110 | axarr[idx].scatter(X_train[y_train==0, 0], 111 | X_train[y_train==0, 1], 112 | c='blue', marker='^') 113 | 114 | axarr[idx].scatter(X_train[y_train==1, 0], 115 | X_train[y_train==1, 1], 116 | c='red', marker='o') 117 | 118 | axarr[idx].set_title(tt) 119 | axarr[idx].set_ylabel(labels[0], fontsize=12) 120 | axarr[idx].set_xlabel(labels[1], fontsize=12) 121 | # plt.text(10.2, -1.2, 122 | # s='Hue', 123 | # ha='center', va='center', fontsize=12) 124 | 125 | plt.tight_layout() 126 | title='Bagging' 127 | #plt.savefig('./figures/bagging_region.png', 128 | # dpi=300, 129 | # bbox_inches='tight') 130 | ocr_utils.show_figures(plt, title) 131 | 132 | print ('\n########################### No Errors ####################################') 133 | -------------------------------------------------------------------------------- /p115_l1_l2_regularization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' l1_l2_regularization.py 3 | 4 | Show the effects of l1 versus l2 regularization.
5 | l1 introduces a weight penalty equal to the sum of the absolute weights 6 | times a given factor, lambda. 7 | l1 tends to drive a number of weights to zero and thus yields a 8 | sparse weight matrix. 9 | 10 | l2 introduces a weight penalty equal to the sum of squares of the 11 | weights times lambda. 12 | l2 tends to reduce the size of the weights but does not drive 13 | them to 0. 14 | 15 | 16 | 1) get the data for all column sums in the E13B database 17 | 2) run logistic regression with both l1 and l2 regularization, printing 18 | out the accuracies and a sampling of the coefficients. 19 | Show how the weights respond to the regularization strength (see the reference note at the end of this docstring). 20 | 21 | 22 | 23 | Created on Jun 23, 2016 24 | 25 | from Python Machine Learning by Sebastian Raschka under the following license 26 | 27 | The MIT License (MIT) 28 | 29 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 30 | 31 | Permission is hereby granted, free of charge, to any person obtaining a copy 32 | of this software and associated documentation files (the "Software"), to deal 33 | in the Software without restriction, including without limitation the rights 34 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 35 | copies of the Software, and to permit persons to whom the Software is 36 | furnished to do so, subject to the following conditions: 37 | 38 | The above copyright notice and this permission notice shall be included in all 39 | copies or substantial portions of the Software. 40 | 41 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 42 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 43 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 44 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 45 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 46 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 47 | SOFTWARE.
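(Editor's reference note, added: the penalized costs have the form

    J(w) + lambda * sum_j |w_j|       for l1
    J(w) + lambda * sum_j w_j**2      for l2

and scikit-learn's LogisticRegression exposes the strength as C = 1/lambda, so the C=0.1 used below is fairly strong regularization and the 10**c sweep in weight_graph() progressively relaxes it.)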
48 | 49 | @author: richard lyman 50 | ''' 51 | import numpy as np 52 | import ocr_utils 53 | 54 | columnsXY = range(0,20) 55 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=columnsXY , test_size=0.3, nChars=1000, random_state=0) 56 | 57 | from sklearn.preprocessing import StandardScaler 58 | stdsc = StandardScaler() 59 | X_train_std = stdsc.fit_transform(X_train) 60 | X_test_std = stdsc.transform(X_test) 61 | 62 | sc = StandardScaler() 63 | sc.fit(X_train) 64 | X_train_std = sc.transform(X_train) 65 | X_test_std = sc.transform(X_test) 66 | X_combined_std = np.vstack((X_train_std, X_test_std)) 67 | y_combined = np.hstack((y_train, y_test)) 68 | 69 | 70 | from sklearn.linear_model import LogisticRegression 71 | 72 | lr = LogisticRegression(penalty='l1', C=0.1, random_state=0, solver='liblinear',multi_class='auto') 73 | lr.fit(X_train_std, y_train) 74 | print('Training accuracy-l1 regularization:', lr.score(X_train_std, y_train)) 75 | print('Test accuracy-l1 regularization:', lr.score(X_test_std, y_test)) 76 | print('lr.intercept_ L1 regularization') 77 | print('\t{}'.format(lr.intercept_)) 78 | print('lr.coef_ L1 regularization') 79 | print('\t{}'.format(lr.coef_)) 80 | 81 | 82 | lr = LogisticRegression(penalty='l2', C=0.1, random_state=0, solver='liblinear',multi_class='auto') 83 | lr.fit(X_train_std, y_train) 84 | print('Training accuracy-l2 regularization:', lr.score(X_train_std, y_train)) 85 | print('Test accuracy-l2 regularization:', lr.score(X_test_std, y_test)) 86 | print('lr.intercept L2 regularization') 87 | print('\t{}'.format(lr.intercept_)) 88 | print('lr.coef_ L2 regularization') 89 | print('\t{}'.format(lr.coef_)) 90 | 91 | import matplotlib.pyplot as plt 92 | 93 | fig = plt.figure() 94 | ax = plt.subplot(111) 95 | 96 | colors = ['blue', 'green', 'red', 'cyan', 97 | 'magenta', 'yellow', 'black', 98 | 'pink', 'lightgreen', 'lightblue', 99 | 'gray', 'indigo', 'orange'] 100 | 101 | def weight_graph(regularization = 'l1'): 102 | weights, params = [], [] 103 | for c in np.arange(0, 6): 104 | lr = LogisticRegression(penalty=regularization, C=10**c, random_state=0, solver='liblinear',multi_class='auto') 105 | lr.fit(X_train_std, y_train) 106 | weights.append(lr.coef_[1]) 107 | params.append(10**c) 108 | 109 | weights = np.array(weights) 110 | 111 | for column, color in zip(range(weights.shape[1]), colors): 112 | plt.plot(params, weights[:, column], 113 | label=columnsXY[column+1], 114 | color=color) 115 | 116 | 117 | plt.axhline(0, color='black', linestyle='--', linewidth=3) 118 | plt.xlim([10**(-5), 10**5]) 119 | plt.ylabel('weight coefficient') 120 | plt.xlabel('C') 121 | plt.xscale('log') 122 | title = 'regularization {}'.format(regularization) 123 | plt.title(title) 124 | plt.legend(loc='upper left') 125 | ax.legend(loc='upper center', 126 | bbox_to_anchor=(1.38, 1.03), 127 | ncol=1, fancybox=True) 128 | ocr_utils.show_figures(plt,title + ' path') 129 | 130 | weight_graph(regularization = 'l1') 131 | weight_graph(regularization = 'l2') 132 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /p25_perceptron.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | '''perceptron is a Python implementation of the Rosenblatt perceptron which 3 | uses a non-differentiatable unit step function as the activation function. 
4 | 5 | The target classes such as the characters '0' and '1' are changed to -1 and 1 6 | The difference between target value and the predicted target value multiplied 7 | by a small 'eta' value is an update value used for adjusting the weights. 8 | 9 | The weights are adjusted by the update value times the image. 10 | 11 | This eventually converges the weights to value that can provide a good 12 | prediction of new images. 13 | 14 | The misclassification versus Epochs and the resulting decision regions 15 | are plotted. 16 | 17 | Created on Jun 20, 2016 18 | 19 | from Python Machine Learning by Sebastian Raschka under the following license 20 | 21 | The MIT License (MIT) 22 | 23 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com) 24 | 25 | Permission is hereby granted, free of charge, to any person obtaining a copy 26 | of this software and associated documentation files (the "Software"), to deal 27 | in the Software without restriction, including without limitation the rights 28 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 29 | copies of the Software, and to permit persons to whom the Software is 30 | furnished to do so, subject to the following conditions: 31 | 32 | The above copyright notice and this permission notice shall be included in all 33 | copies or substantial portions of the Software. 34 | 35 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 36 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 37 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 38 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 39 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 40 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 | SOFTWARE. 42 | 43 | @author: richard lyman 44 | ''' 45 | import ocr_utils 46 | import numpy as np 47 | import matplotlib.pyplot as plt 48 | 49 | ############################################################################# 50 | # read features and scatter plot 51 | 52 | # retrieve 500 sets of target numbers and column sums 53 | # y: the ascii characters 48 and 49 ('0', '1') 54 | # X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17 55 | ascii_characters_to_train=(48,49) 56 | columnsXY = (9,17) 57 | nchars=500 58 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=nchars) 59 | 60 | 61 | ############################################################################# 62 | # Perceptron implementation from Python Machine Learning 63 | class Perceptron(object): 64 | """Perceptron classifier. 65 | 66 | Parameters 67 | ------------ 68 | eta : float 69 | Learning rate (between 0.0 and 1.0) 70 | n_iter : int 71 | Passes over the training dataset. 72 | 73 | Attributes 74 | ----------- 75 | w_ : 1d-array 76 | Weights after fitting. 77 | errors_ : list 78 | Number of misclassifications in every epoch. 79 | 80 | """ 81 | def __init__(self, eta=0.01, n_iter=10): 82 | self.eta = eta 83 | self.n_iter = n_iter 84 | 85 | def fit(self, X, y): 86 | """Fit training data. 87 | 88 | Parameters 89 | ---------- 90 | X : {array-like}, shape = [n_samples, n_features] 91 | Training vectors, where n_samples is the number of samples and 92 | n_features is the number of features. 93 | y : array-like, shape = [n_samples] 94 | Target values. 
95 | 
96 |         Returns
97 |         -------
98 |         self : object
99 | 
100 |         """
101 |         self.w_ = np.zeros(1 + X.shape[1])
102 |         self.errors_ = []
103 | 
104 |         for _ in range(self.n_iter):
105 |             errors = 0
106 |             for xi, target in zip(X, y):
107 |                 update = self.eta * (target - self.predict(xi))
108 |                 self.w_[1:] += update * xi
109 |                 self.w_[0] += update
110 |                 errors += int(update != 0.0)
111 |             self.errors_.append(errors)
112 |         return self
113 | 
114 |     def net_input(self, X):
115 |         """Calculate net input"""
116 |         return np.dot(X, self.w_[1:]) + self.w_[0]
117 | 
118 |     def predict(self, X):
119 |         """Return class label after unit step"""
120 |         return np.where(self.net_input(X) >= 0.0, 1, -1)
121 | 
122 | #############################################################################
123 | # convert targets ('0','1') to -1,+1
124 | # fit (train) the Perceptron
125 | # plot the misclassifications versus Epochs
126 | # plot the decision regions
127 | 
128 | y = np.where(y == ascii_characters_to_train[0], -1, 1)
129 | ppn = Perceptron(eta=0.1, n_iter=10)
130 | ppn.fit(X, y)
131 | 
132 | title = 'Simple Perceptron'
133 | plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
134 | plt.xlabel('Epochs')
135 | plt.ylabel('Number of misclassifications')
136 | plt.title(title)
137 | plt.tight_layout()
138 | ocr_utils.show_figures(plt, title)
139 | 
140 | ocr_utils.plot_decision_regions(X=X,
141 |                                 y=y,
142 |                                 classifier=ppn,
143 |                                 labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))],
144 |                                 title="Perceptron Decision Regions")
145 | 
146 | 
147 | 
148 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p119_squential_backward_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | ''' sequential backward selection
3 | 
4 | In order to reduce the complexity of the model, the number of features
5 | can be reduced by Sequential Backward Selection
6 | 
7 | The E13B dataset has 20 column sums, one for each column in the original
8 | images. Only a few of these would be needed to produce a good
9 | fit.
10 | 
11 | The SBS algorithm removes features by repeatedly running a fit of the data,
12 | selecting the feature for removal that makes the least difference to the
13 | accuracy of the fit.
14 | 
15 | 
16 | Created on Jun 23, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 40 | SOFTWARE. 41 | 42 | @author: richard lyman 43 | ''' 44 | import numpy as np 45 | import ocr_utils 46 | import matplotlib.pyplot as plt 47 | 48 | 49 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, random_state=0) 50 | 51 | 52 | from sklearn.model_selection import train_test_split 53 | 54 | X_train, X_test, y_train, y_test = train_test_split( 55 | X, y, test_size=0.3, random_state=0) 56 | 57 | from sklearn.preprocessing import StandardScaler 58 | stdsc = StandardScaler() 59 | X_train_std = stdsc.fit_transform(X_train) 60 | X_test_std = stdsc.transform(X_test) 61 | 62 | sc = StandardScaler() 63 | sc.fit(X_train) 64 | X_train_std = sc.transform(X_train) 65 | X_test_std = sc.transform(X_test) 66 | X_combined_std = np.vstack((X_train_std, X_test_std)) 67 | y_combined = np.hstack((y_train, y_test)) 68 | 69 | from sklearn.base import clone 70 | from itertools import combinations 71 | 72 | from sklearn.metrics import accuracy_score 73 | 74 | class SBS(): 75 | def __init__(self, estimator, k_features, 76 | scoring=accuracy_score, 77 | test_size=0.25, random_state=1): 78 | self.scoring = scoring 79 | self.estimator = clone(estimator) 80 | self.k_features = k_features 81 | self.test_size = test_size 82 | self.random_state = random_state 83 | 84 | def fit(self, X, y): 85 | X_train, X_test, y_train, y_test = \ 86 | train_test_split(X, y, test_size=self.test_size, 87 | random_state=self.random_state) 88 | dim = X_train.shape[1] 89 | self.indices_ = tuple(range(dim)) 90 | self.subsets_ = [self.indices_] 91 | score = self._calc_score(X_train, y_train, 92 | X_test, y_test, self.indices_) 93 | self.scores_ = [score] 94 | while dim > self.k_features: 95 | scores = [] 96 | subsets = [] 97 | for p in combinations(self.indices_, r=dim-1): 98 | score = self._calc_score(X_train, y_train, 99 | X_test, y_test, p) 100 | scores.append(score) 101 | subsets.append(p) 102 | best = np.argmax(scores) 103 | self.indices_ = subsets[best] 104 | self.subsets_.append(self.indices_) 105 | dim -= 1 106 | self.scores_.append(scores[best]) 107 | self.k_score_ = self.scores_[-1] 108 | return self 109 | def transform(self, X): 110 | return X[:, self.indices_] 111 | 112 | def _calc_score(self, X_train, y_train, 113 | X_test, y_test, indices): 114 | self.estimator.fit(X_train[:, indices], y_train) 115 | y_pred = self.estimator.predict(X_test[:, indices]) 116 | score = self.scoring(y_test, y_pred) 117 | return score 118 | 119 | from sklearn.neighbors import KNeighborsClassifier 120 | 121 | 122 | knn = KNeighborsClassifier(n_neighbors=2) 123 | 124 | # selecting features 125 | sbs = SBS(knn, k_features=1) 126 | sbs.fit(X_train_std, y_train) 127 | 128 | # plotting performance of feature subsets 129 | k_feat = [len(k) for k in sbs.subsets_] 130 | 131 | title='Sequential Backward Selection' 132 | plt.plot(k_feat, sbs.scores_, marker='o') 133 | plt.ylim([0.7, 1.1]) 134 | plt.ylabel('Accuracy') 135 | plt.xlabel('Number of features') 136 | plt.grid() 137 | plt.title(title) 138 | plt.tight_layout() 139 | ocr_utils.show_figures(plt,title) 140 | 141 | best=10 142 | k5 = list(sbs.subsets_[best]) 143 | print('The best {} column_sums'.format(best)) 144 | for s in k5: 145 | print(labels[s]) 146 | print() 147 | 148 | 149 
| knn.fit(X_train_std, y_train)
150 | print('Training accuracy using all features:', knn.score(X_train_std, y_train))
151 | print('Test accuracy using all features:', knn.score(X_test_std, y_test))
152 | 
153 | 
154 | knn.fit(X_train_std[:, k5], y_train)
155 | print('Training accuracy using {} features:'.format(best), knn.score(X_train_std[:, k5], y_train))
156 | print('Test accuracy using {} features:'.format(best), knn.score(X_test_std[:, k5], y_test))
157 | 
158 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p181_learning_curves.py:
--------------------------------------------------------------------------------
1 | '''
2 | learning_curves.py
3 | 
4 | The learning curve shows the training and test accuracies versus the number
5 | of training samples. This can be used to determine if there are enough
6 | training samples to sufficiently fit the training data. If the curves
7 | flatten out then there are enough samples. If there is a big difference
8 | between the training and test accuracies then underfitting or overfitting
9 | can be diagnosed.
10 | 
11 | The validation curve can be used to test a range of an estimator
12 | parameter, such as the inverse regularization parameter C, to see how the
13 | training and validation accuracies vary. This can be used to pick a value
14 | of C that reduces overfitting and underfitting.
15 | 
16 | Created on Jul 5, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 | SOFTWARE.
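A hedged sketch of the array shapes involved (matching the call below, which
uses ten training-set sizes and cv=10):

    train_sizes, train_scores, test_scores = learning_curve(...)
    # train_scores has shape (n_sizes, n_folds) = (10, 10); averaging over
    # axis=1, as done below, yields one mean accuracy per training-set size
    train_mean = np.mean(train_scores, axis=1)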
41 | 
42 | @author: richard lyman
43 | '''
44 | import matplotlib.pyplot as plt
45 | from sklearn.model_selection import learning_curve
46 | import numpy as np
47 | import ocr_utils
48 | from sklearn.preprocessing import StandardScaler
49 | from sklearn.linear_model import LogisticRegression
50 | from sklearn.pipeline import Pipeline
51 | 
52 | if __name__ == '__main__':
53 | 
54 | 
55 |     y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0)
56 | 
57 |     pipe_lr = Pipeline([('scl', StandardScaler()),
58 |                         ('clf', LogisticRegression(penalty='l2', random_state=0, solver='lbfgs'))])
59 | 
60 |     train_sizes, train_scores, test_scores =\
61 |         learning_curve(estimator=pipe_lr,
62 |                        X=X_train,
63 |                        y=y_train,
64 |                        train_sizes=np.linspace(0.1, 1.0, 10),
65 |                        cv=10,
66 |                        n_jobs=8)
67 | 
68 |     train_mean = np.mean(train_scores, axis=1)
69 |     train_std = np.std(train_scores, axis=1)
70 |     test_mean = np.mean(test_scores, axis=1)
71 |     test_std = np.std(test_scores, axis=1)
72 | 
73 |     plt.plot(train_sizes, train_mean,
74 |              color='blue', marker='o',
75 |              markersize=5, label='training accuracy')
76 | 
77 |     plt.fill_between(train_sizes,
78 |                      train_mean + train_std,
79 |                      train_mean - train_std,
80 |                      alpha=0.15, color='blue')
81 | 
82 |     plt.plot(train_sizes, test_mean,
83 |              color='green', linestyle='--',
84 |              marker='s', markersize=5,
85 |              label='validation accuracy')
86 | 
87 |     plt.fill_between(train_sizes,
88 |                      test_mean + test_std,
89 |                      test_mean - test_std,
90 |                      alpha=0.15, color='green')
91 | 
92 |     plt.grid()
93 |     plt.xlabel('Number of training samples')
94 |     plt.ylabel('Accuracy')
95 |     plt.legend(loc='lower right')
96 |     plt.ylim([0.8, 1.0])
97 |     title='learning_curve'
98 |     plt.title(title)
99 |     plt.tight_layout()
100 |     ocr_utils.show_figures(plt,title)
101 | 
102 |     from sklearn.model_selection import validation_curve
103 | 
104 |     param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
105 |     train_scores, test_scores = validation_curve(
106 |                     estimator=pipe_lr,
107 |                     X=X_train,
108 |                     y=y_train,
109 |                     param_name='clf__C',
110 |                     param_range=param_range,
111 |                     cv=10,
112 |                     n_jobs=8)
113 | 
114 |     train_mean = np.mean(train_scores, axis=1)
115 |     train_std = np.std(train_scores, axis=1)
116 |     test_mean = np.mean(test_scores, axis=1)
117 |     test_std = np.std(test_scores, axis=1)
118 | 
119 |     plt.plot(param_range, train_mean,
120 |              color='blue', marker='o',
121 |              markersize=5, label='training accuracy')
122 | 
123 |     plt.fill_between(param_range, train_mean + train_std,
124 |                      train_mean - train_std, alpha=0.15,
125 |                      color='blue')
126 | 
127 |     plt.plot(param_range, test_mean,
128 |              color='green', linestyle='--',
129 |              marker='s', markersize=5,
130 |              label='validation accuracy')
131 | 
132 |     plt.fill_between(param_range,
133 |                      test_mean + test_std,
134 |                      test_mean - test_std,
135 |                      alpha=0.15, color='green')
136 | 
137 |     plt.grid()
138 |     plt.xscale('log')
139 |     plt.legend(loc='lower right')
140 |     plt.xlabel('Parameter C')
141 |     plt.ylabel('Accuracy')
142 |     plt.ylim([0.8, 1.0])
143 |     title='validation_curve'
144 |     plt.title(title)
145 |     plt.tight_layout()
146 |     ocr_utils.show_figures(plt,title)
147 | 
148 |     print ('\n########################### No Errors ####################################')
149 | 
150 | 
--------------------------------------------------------------------------------
/p62_logistic_regression.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | ''' logistic_regression.py replaces the activation function by
the
3 | logistic function, also known as the sigmoid function.
4 | The logistic function starts with the racetrack odds ratio p/(1-p).
5 | The logit function is the log of this ratio.
6 | Inverting the logit function and solving for p yields the logistic function.
7 | The output predicts the probability of an input sample belonging to
8 | a class label.
9 | This gives an estimate of the probability that an input set of features
10 | belongs to a target class.
11 | Works best when classes are linearly separable.
12 | For multiclass problems, when there are more than 2 classes, it uses One versus Rest.
13 | The effect of the regularization parameter in the logistic regression is
14 | shown.
15 | 
16 | 1) Plot the logistic (sigmoid) function
17 | 2) Create the cost function to be minimized by using the negative of the
18 | log likelihood function.
19 | 3) Plot the two curves that make up the cost function, one for a target y
20 | that equals 1 and one for a target y that equals 0.
21 | Use the sklearn package to fit the input data to a single perceptron using
22 | the logistic function for an activation function.
23 | 4) Plot the decision regions for 3 target classes from the E13B training set
24 | 5) Plot the weight coefficients using two different regularization values
25 | 
26 | Created on Jun 23, 2016
27 | 
28 | from Python Machine Learning by Sebastian Raschka under the following license
29 | 
30 | The MIT License (MIT)
31 | 
32 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
33 | 
34 | Permission is hereby granted, free of charge, to any person obtaining a copy
35 | of this software and associated documentation files (the "Software"), to deal
36 | in the Software without restriction, including without limitation the rights
37 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38 | copies of the Software, and to permit persons to whom the Software is
39 | furnished to do so, subject to the following conditions:
40 | 
41 | The above copyright notice and this permission notice shall be included in all
42 | copies or substantial portions of the Software.
43 | 
44 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
50 | SOFTWARE.
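As a short worked check (consistent with the sigmoid defined below):
logit(p) = log(p / (1 - p)), and inverting it for p gives the logistic
function phi(z) = 1 / (1 + exp(-z)), so phi(0) = 0.5, i.e. even odds.

    import numpy as np
    phi_0 = 1.0 / (1.0 + np.exp(-0.0))   # 0.5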
51 | 52 | @author: richard lyman 53 | ''' 54 | import numpy as np 55 | import ocr_utils 56 | import matplotlib.pyplot as plt 57 | from sklearn.preprocessing import StandardScaler 58 | from sklearn.linear_model import LogisticRegression 59 | from sklearn.model_selection import train_test_split 60 | 61 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17),nChars=500) 62 | 63 | def sigmoid(z): 64 | return 1.0 / (1.0 + np.exp(-z)) 65 | 66 | z = np.arange(-7, 7, 0.1) 67 | phi_z = sigmoid(z) 68 | title='sigmoid' 69 | plt.plot(z, phi_z) 70 | plt.axvline(0.0, color='k') 71 | plt.axhspan(0.0, 1.0, facecolor='1.0', alpha=1.0, ls='dotted') 72 | plt.axhline(y=0.5, ls='dotted', color='k') 73 | plt.yticks([0.0, 0.5, 1.0]) 74 | plt.ylim(-0.1, 1.1) 75 | plt.xlabel('z') 76 | plt.ylabel('$\phi (z)$') 77 | plt.title(title) 78 | ocr_utils.show_figures(plt,title=title) 79 | 80 | def cost_1(z): 81 | return - np.log(sigmoid(z)) 82 | 83 | def cost_0(z): 84 | return - np.log(1 - sigmoid(z)) 85 | 86 | z = np.arange(-10, 10, 0.1) 87 | phi_z = sigmoid(z) 88 | 89 | c1 = [cost_1(x) for x in z] 90 | plt.plot(phi_z, c1, label='J(w) if y=1') 91 | 92 | c0 = [cost_0(x) for x in z] 93 | plt.plot(phi_z, c0, linestyle='--', label='J(w) if y=0') 94 | title='log cost' 95 | plt.ylim(0.0, 5.1) 96 | plt.xlim([0, 1]) 97 | plt.xlabel('$\phi$(z)') 98 | plt.ylabel('J(w)') 99 | plt.legend(loc='best') 100 | plt.title(title) 101 | plt.tight_layout() 102 | ocr_utils.show_figures(plt,title=title) 103 | 104 | X_train, X_test, y_train, y_test = train_test_split( 105 | X, y, test_size=0.3, random_state=0) 106 | 107 | sc = StandardScaler() 108 | sc.fit(X_train) 109 | X_train_std = sc.transform(X_train) 110 | X_test_std = sc.transform(X_test) 111 | lr = LogisticRegression(C=1000.0, random_state=0, solver='lbfgs',multi_class='auto') 112 | lr.fit(X_train_std, y_train) 113 | X_combined_std = np.vstack((X_train_std, X_test_std)) 114 | y_combined = np.hstack((y_train, y_test)) 115 | ocr_utils.plot_decision_regions( 116 | X=X_combined_std, 117 | y=y_combined, 118 | classifier=lr, 119 | labels = labels, 120 | test_idx=range(len(X_train_std),len(X_combined_std)), 121 | title='logistic_regression') 122 | 123 | 124 | weights, params = [], [] 125 | for c in np.arange(0, 5): 126 | lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') 127 | lr.fit(X_train_std, y_train) 128 | weights.append(lr.coef_[0]) 129 | params.append(10**c) 130 | 131 | 132 | title = 'regression_path' 133 | weights, params = [], [] 134 | for c in np.arange(0, 5): 135 | lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') 136 | lr.fit(X_train_std, y_train) 137 | weights.append(lr.coef_[1]) 138 | params.append(10**c) 139 | 140 | weights = np.array(weights) 141 | plt.plot(params, weights[:, 0], 142 | label=labels[0]) 143 | plt.plot(params, weights[:, 1], linestyle='--', 144 | label=labels[1]) 145 | plt.ylabel('weight coefficient') 146 | plt.xlabel('C') 147 | plt.legend(loc='upper left') 148 | plt.xscale('log') 149 | plt.title(title) 150 | ocr_utils.show_figures(plt,title=title) 151 | 152 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /p36_adaline_gd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' adaline_gd replaces the activation function by the identity 3 | function, and adds a differentiable cost 
function, such as a
4 | Sum of Squared Errors (SSE)
5 | 
6 | The gradient of the errors is used to adjust the weights.
7 | 
8 | The learning rate is plotted for several different 'eta' multipliers
9 | of the gradient. Also, the effect on the learning rate of standardizing
10 | the features before training is shown.
11 | 
12 | The decision regions are plotted.
13 | 
14 | The cost function error versus Epochs is plotted.
15 | 
16 | Created on Jun 20, 2016
17 | 
18 | from Python Machine Learning by Sebastian Raschka under the following license
19 | 
20 | The MIT License (MIT)
21 | 
22 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
23 | 
24 | Permission is hereby granted, free of charge, to any person obtaining a copy
25 | of this software and associated documentation files (the "Software"), to deal
26 | in the Software without restriction, including without limitation the rights
27 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
28 | copies of the Software, and to permit persons to whom the Software is
29 | furnished to do so, subject to the following conditions:
30 | 
31 | The above copyright notice and this permission notice shall be included in all
32 | copies or substantial portions of the Software.
33 | 
34 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
35 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
36 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
37 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
38 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
39 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
40 | SOFTWARE.
41 | 
42 | @author: richard lyman
43 | '''
44 | import ocr_utils
45 | import numpy as np
46 | import matplotlib.pyplot as plt
47 | 
48 | #############################################################################
49 | # read features
50 | 
51 | # retrieve 120 sets of target numbers and column sums
52 | # y: the target is ascii characters 48 and 49 ('0', '1')
53 | # X: the features to fit is the sum of the vertical pixels in the rows in
54 | # horizontal columns 9 and 17
55 | 
56 | 
57 | ascii_characters_to_train=(48,49)
58 | columnsXY = (9,17)
59 | nchars=500
60 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=120)
61 | 
62 | y = np.where(y==ascii_characters_to_train[1],-1,1)
63 | 
64 | #############################################################################
65 | # Adaline implementation from Python Machine Learning
66 | class AdalineGD(object):
67 |     """ADAptive LInear NEuron classifier.
68 | 
69 |     Parameters
70 |     ------------
71 |     eta : float
72 |         Learning rate (between 0.0 and 1.0)
73 |     n_iter : int
74 |         Passes over the training dataset.
75 | 
76 |     Attributes
77 |     -----------
78 |     w_ : 1d-array
79 |         Weights after fitting.
80 |     errors_ : list
81 |         Number of misclassifications in every epoch.
82 | 
83 |     """
84 |     def __init__(self, eta=0.01, n_iter=50):
85 |         self.eta = eta
86 |         self.n_iter = n_iter
87 | 
88 |     def fit(self, X, y):
89 |         """ Fit training data.
90 | 
91 |         Parameters
92 |         ----------
93 |         X : {array-like}, shape = [n_samples, n_features]
94 |             Training vectors, where n_samples is the number of samples and
95 |             n_features is the number of features.
96 |         y : array-like, shape = [n_samples]
97 |             Target values.
98 | 
99 |         Returns
100 |         -------
101 |         self : object
102 | 
103 |         """
104 |         self.w_ = np.zeros(1 + X.shape[1])
105 |         self.cost_ = []
106 | 
107 |         for i in range(self.n_iter):
108 |             output = self.net_input(X)
109 |             errors = (y - output)
110 |             self.w_[1:] += self.eta * X.T.dot(errors)
111 |             self.w_[0] += self.eta * errors.sum()
112 |             cost = (errors**2).sum() / 2.0
113 |             self.cost_.append(cost)
114 |         return self
115 | 
116 |     def net_input(self, X):
117 |         """Calculate net input"""
118 |         return np.dot(X, self.w_[1:]) + self.w_[0]
119 | 
120 |     def activation(self, X):
121 |         """Compute linear activation"""
122 |         return self.net_input(X)
123 | 
124 |     def predict(self, X):
125 |         """Return class label after unit step"""
126 | 
127 |         return np.where(self.activation(X) >= 0.0, 1, -1)
128 | title = 'Gradient Descent Learning rate 0.01'
129 | 
130 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
131 | ada1 = AdalineGD(n_iter=10, eta=0.01).fit(X, y)
132 | ax[0].plot(range(1, len(ada1.cost_) + 1), np.log10(ada1.cost_), marker='o')
133 | ax[0].set_xlabel('Epochs')
134 | ax[0].set_ylabel('log(Sum-squared-error)')
135 | ax[0].set_title('Adaline - Learning rate 0.01')
136 | ada2 = AdalineGD(n_iter=10, eta=0.0001).fit(X, y)
137 | 
138 | ax[1].plot(range(1, len(ada2.cost_) + 1), ada2.cost_, marker='o')
139 | ax[1].set_xlabel('Epochs')
140 | ax[1].set_ylabel('Sum-squared-error')
141 | ax[1].set_title('Adaline - Learning rate 0.0001')
142 | ocr_utils.show_figures(plt, title)
143 | 
144 | 
145 | 
146 | #
147 | # plt.plot(range(1,len(ada1.cost_)+1), np.log10(ada1.cost_), marker='o',label = title)
148 | # plt.title(title)
149 | # ocr_utils.show_figures(plt, title)
150 | #
151 | # ada2 = AdalineGD(n_iter=15, eta=0.0001).fit(X, y)
152 | # title = 'Gradient Descent Learning rate 0.0001'
153 | # plt.plot(range(1,len(ada2.cost_)+1), np.log10(ada2.cost_) ,marker='x',label = title)
154 | # plt.title(title)
155 | # ocr_utils.show_figures(plt, title)
156 | # standardize features
157 | X_std = np.copy(X)
158 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std()
159 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std()
160 | 
161 | ada = AdalineGD(n_iter=15, eta=0.01)
162 | ada.fit(X_std, y)
163 | ocr_utils.plot_decision_regions(X=X_std,
164 |                                 y=y,
165 |                                 classifier=ada,
166 |                                 labels= labels,
167 |                                 title='Adaline - Gradient Descent standardized rate 0.01')
168 | 
169 | title = 'Standardized Gradient Descent Learning rate 0.01'
170 | plt.plot(range(1,len(ada.cost_)+1), np.log10(ada.cost_) ,marker='x',label = title)
171 | plt.title(title)
172 | ocr_utils.show_figures(plt, title)
173 | 
174 | plt.plot(range(1,len(ada.cost_)+1), np.log10(ada.cost_), marker='v', label='standardized rate 0.01')
175 | plt.xlabel('Epochs')
176 | plt.ylabel('log(Sum-squared-error)')
177 | plt.legend(loc='lower left')
178 | plt.title('Adaline - Gradient Descent')
179 | plt.tight_layout()
180 | ocr_utils.show_figures(plt, 'Adaline - Gradient Descent')
181 | 
182 | print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/p44_adaline_sgd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | '''adaline_sgd.py illustrates Stochastic Gradient Descent.
3 | 
4 | First, the weights are updated after each training sample instead of
5 | calculating the error for the entire batch. This causes the weights to
6 | converge faster than the batch method.
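(A sketch of the per-sample rule, matching the _update_weights() method
below: for one sample xi with target t,

    error = t - net_input(xi)
    w[1:] += eta * xi * error
    w[0]  += eta * error

so every sample nudges the weights immediately.)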
7 | 
8 | Second, the samples can be shuffled to avoid bias based on the order of samples in
9 | the training set.
10 | 
11 | The decision regions and the speed of convergence are plotted.
12 | 
13 | Created on Jun 22, 2016
14 | 
15 | from Python Machine Learning by Sebastian Raschka under the following license
16 | 
17 | The MIT License (MIT)
18 | 
19 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
20 | 
21 | Permission is hereby granted, free of charge, to any person obtaining a copy
22 | of this software and associated documentation files (the "Software"), to deal
23 | in the Software without restriction, including without limitation the rights
24 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
25 | copies of the Software, and to permit persons to whom the Software is
26 | furnished to do so, subject to the following conditions:
27 | 
28 | The above copyright notice and this permission notice shall be included in all
29 | copies or substantial portions of the Software.
30 | 
31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
37 | SOFTWARE.
38 | 
39 | @author: richard lyman
40 | '''
41 | import ocr_utils
42 | import numpy as np
43 | from numpy.random import seed
44 | import matplotlib.pyplot as plt
45 | 
46 | 
47 | #############################################################################
48 | # read images and scatter plot
49 | 
50 | # retrieve 120 sets of target numbers and column sums
51 | # y: the ascii characters 48 and 51 ('0', '3')
52 | # X: the sum of the vertical pixels in the rows in horizontal columns 9 and 17
53 | 
54 | ascii_characters_to_train=(48,51)
55 | columnsXY = (9,17)
56 | nchars=500
57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = ascii_characters_to_train , columns=columnsXY,nChars=120)
58 | y = np.where(y==ascii_characters_to_train[1],-1,1)
59 | 
60 | #############################################################################
61 | # AdalineSGD from Python Machine Learning
62 | class AdalineSGD(object):
63 |     """ADAptive LInear NEuron classifier.
64 | 
65 |     Parameters
66 |     ------------
67 |     eta : float
68 |         Learning rate (between 0.0 and 1.0)
69 |     n_iter : int
70 |         Passes over the training dataset.
71 | 
72 |     Attributes
73 |     -----------
74 |     w_ : 1d-array
75 |         Weights after fitting.
76 |     errors_ : list
77 |         Number of misclassifications in every epoch.
78 |     shuffle : bool (default: True)
79 |         Shuffles training data every epoch if True to prevent cycles.
80 |     random_state : int (default: None)
81 |         Set random state for shuffling and initializing the weights.
82 | 
83 |     """
84 |     def __init__(self, eta=0.01, n_iter=10, shuffle=True, random_state=None):
85 |         self.eta = eta
86 |         self.n_iter = n_iter
87 |         self.w_initialized = False
88 |         self.shuffle = shuffle
89 |         if random_state:
90 |             seed(random_state)
91 | 
92 |     def fit(self, X, y):
93 |         """ Fit training data.
94 | 
95 |         Parameters
96 |         ----------
97 |         X : {array-like}, shape = [n_samples, n_features]
98 |             Training vectors, where n_samples is the number of samples and
99 |             n_features is the number of features.
100 | y : array-like, shape = [n_samples] 101 | Target values. 102 | 103 | Returns 104 | ------- 105 | self : object 106 | 107 | """ 108 | self._initialize_weights(X.shape[1]) 109 | self.cost_ = [] 110 | for i in range(self.n_iter): 111 | if self.shuffle: 112 | X, y = self._shuffle(X, y) 113 | cost = [] 114 | for xi, target in zip(X, y): 115 | cost.append(self._update_weights(xi, target)) 116 | avg_cost = sum(cost)/len(y) 117 | self.cost_.append(avg_cost) 118 | return self 119 | 120 | def partial_fit(self, X, y): 121 | """Fit training data without reinitializing the weights""" 122 | if not self.w_initialized: 123 | self._initialize_weights(X.shape[1]) 124 | if y.ravel().shape[0] > 1: 125 | for xi, target in zip(X, y): 126 | self._update_weights(xi, target) 127 | else: 128 | self._update_weights(X, y) 129 | return self 130 | 131 | def _shuffle(self, X, y): 132 | """Shuffle training data""" 133 | r = np.random.permutation(len(y)) 134 | return X[r], y[r] 135 | 136 | def _initialize_weights(self, m): 137 | """Initialize weights to zeros""" 138 | self.w_ = np.zeros(1 + m) 139 | self.w_initialized = True 140 | 141 | def _update_weights(self, xi, target): 142 | """Apply Adaline learning rule to update the weights""" 143 | output = self.net_input(xi) 144 | error = (target - output) 145 | self.w_[1:] += self.eta * xi.dot(error) 146 | self.w_[0] += self.eta * error 147 | cost = 0.5 * error**2 148 | return cost 149 | 150 | def net_input(self, X): 151 | """Calculate net input""" 152 | return np.dot(X, self.w_[1:]) + self.w_[0] 153 | 154 | def activation(self, X): 155 | """Compute linear activation""" 156 | return self.net_input(X) 157 | 158 | def predict(self, X): 159 | """Return class label after unit step""" 160 | return np.where(self.activation(X) >= 0.0, 1, -1) 161 | ############################################################################# 162 | # standardize features,fit, and plot 163 | X_std = np.copy(X) 164 | X_std[:,0] = (X[:,0] - X[:,0].mean()) / X[:,0].std() 165 | X_std[:,1] = (X[:,1] - X[:,1].mean()) / X[:,1].std() 166 | ada = AdalineSGD(n_iter=15, eta=0.01, random_state = 1) 167 | ada.fit(X_std, y) 168 | 169 | ocr_utils.plot_decision_regions(X=X_std, 170 | y=y, 171 | classifier=ada, 172 | title='Adaline - Stochastic Gradient Descent', 173 | labels=labels) 174 | 175 | title='Adaline - Stochastic Gradient Descent' 176 | plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o') 177 | plt.xlabel('Epochs') 178 | plt.ylabel('Average Cost') 179 | plt.title(title) 180 | plt.tight_layout() 181 | ocr_utils.show_figures(plt, title) 182 | 183 | print ('\n########################### No Errors ####################################') -------------------------------------------------------------------------------- /q3_removing_affine_distortion.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 23, 2016 3 | 4 | @author: richard 5 | ''' 6 | ''' 7 | Created on Jul 12, 2016 8 | This program shows how Principal Component Analysis removes affine 9 | transformation distortions. 10 | 11 | Parallel lines in an image remain parallel after an affine transformation. 12 | For instance, if an image is rotated or sheared, lines remain parallel. 13 | 14 | PCA and LDA can remove affine transformations. This is shown by making 3 shapes 15 | and then making a number of shear versions of the shapes. 
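(In homogeneous coordinates, the shear used by the shear() helper below maps
(x, y, 1) through

    [[1, s, 0],
     [0, 1, 0],
     [0, 0, 1]]

so x becomes x + s*y while y is unchanged; parallel lines stay parallel.)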
Running
16 | Principal Component Analysis reduces the number of features necessary to
17 | recognize the shapes with 100% accuracy during Logistic Regression,
18 | from 400 (20 columns by 20 rows) down to 2.
19 | 
20 | We make three images and then make about 80 copies of each image created by
21 | shearing the original image.
22 | 
23 | Since there is very little noise introduced by the shearing, almost all of
24 | the explained variance is due to the shearing. PCA finds eigenvectors
25 | that line up with the shearing.
26 | 
27 | 1) For a couple of shapes, make sheared versions.
28 | 2) train and print accuracies without PCA
29 | 3) repeat, but use PCA first before training.
30 | 4) observe the improvement
31 | 
32 | Do the same thing for Linear Discriminant Analysis
33 | 
34 | @author: richard
35 | '''
36 | 
37 | import numpy as np
38 | import ocr_utils
39 | from sklearn.metrics import accuracy_score
40 | white_space = 5
41 | 
42 | #########################################################################
43 | # make 3 basic images with about 80 sheared clones each
44 | 
45 | plus = np.zeros((20,20))
46 | box = np.zeros((20,20))
47 | vee = np.zeros((20,20))
48 | 
49 | plus[range(white_space, 20-white_space),9:10] = 1.0
50 | plus[9:10,range(white_space, 20-white_space)] = 1.0
51 | 
52 | box[white_space,range(white_space, 20-white_space)] = 1.0 #top
53 | box[20-white_space, range(white_space, 20 -white_space)] = 1.0 #bottom
54 | box[range(white_space, 20-white_space), white_space] = 1.0 # left
55 | box[range(white_space, 20-white_space), 20 - white_space] = 1.0 #right
56 | 
57 | for i in range(20):
58 |     vee[i,19-int(i/2)] = 1.0
59 |     vee[i,int(i/2)] = 1.0
60 | 
61 | # make some skewed versions of the shapes
62 | import skimage.transform as tf
63 | 
64 | def shear(X, skew):
65 |     rows = X.shape[0]
66 |     cols = X.shape[1]
67 |     ratioY = skew*cols/rows
68 |     matrix = np.array( [[1, ratioY, 0] ,[0, 1, 0] ,[0, 0, 1 ]])
69 |     tp=tf.ProjectiveTransform(matrix=matrix)
70 |     f = tf.warp(X, tp)
71 |     return f
72 | 
73 | # make some skewed versions of the shapes
74 | skewRange = np.linspace(-0.5,0.5,81)
75 | images = np.empty((3*len(skewRange),20,20))
76 | ys = np.empty((3*len(skewRange)))
77 | # make sheared versions of shapes
78 | for i,skew in enumerate(skewRange):
79 |     images[3*i] = shear(plus,skew)
80 |     images[3*i+1] = shear(box,skew)
81 |     images[3*i+2] = shear(vee,skew)
82 |     ys[3*i] = 0
83 |     ys[3*i+1] = 1
84 |     ys[3*i+2] = 2
85 | 
86 | title='skewed versions of shapes'
87 | ocr_utils.montage(images,title=title)
88 | 
89 | num_image=images.shape[0]
90 | images_reshaped = np.reshape(images,(num_image, 20*20))
91 | 
92 | #########################################################################
93 | # run a Logistic Regression on the raw features with 20 rows, 20 columns
94 | 
95 | from sklearn.linear_model import LogisticRegression
96 | from sklearn.model_selection import train_test_split
97 | 
98 | X_train , X_test, y_train, y_test = train_test_split(images_reshaped, ys, test_size=0.3, random_state=0)
99 | 
100 | lr = LogisticRegression()
101 | lr.fit(X_train, y_train)
102 | y_train_pred = lr.predict(X_train)
103 | y_test_pred = lr.predict(X_test)
104 | 
105 | print('\nTrain Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_train, y_train_pred), lr.coef_.shape))
106 | print('Test Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_test, y_test_pred), lr.coef_.shape))
107 | 
108 | #########################################################################
109 | # run Principal Component Analysis first, then
Logistic Regression
110 | 
111 | from sklearn.decomposition import PCA
112 | n_components = 2
113 | pca = PCA(n_components=n_components)
114 | X_train_pca = pca.fit_transform(X_train)
115 | X_test_pca = pca.transform(X_test)
116 | 
117 | print('\nPCA components = {}'.format(pca.components_.shape))
118 | 
119 | lr = LogisticRegression()
120 | logistic_fitted = lr.fit(X_train_pca, y_train)
121 | 
122 | y_train_pred = logistic_fitted.predict(X_train_pca)
123 | y_test_pred = logistic_fitted.predict(X_test_pca)
124 | 
125 | print('\nPCA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components,lr.coef_.shape))
126 | print('PCA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,lr.coef_.shape))
127 | 
128 | X_errors_image = X_test[y_test!=y_test_pred]
129 | y_errors = y_test[y_test!=y_test_pred]
130 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
131 | 
132 | # change to a 2D shape
133 | X_errors2D=np.reshape(X_errors_image, (X_errors_image.shape[0], 20, 20))
134 | ocr_utils.montage(X_errors2D,title='PCA Error Images, components={}'.format (n_components))
135 | 
136 | X_combined = np.vstack((X_train_pca, X_test_pca))
137 | y_combined = np.hstack((y_train, y_test))
138 | 
139 | ocr_utils.plot_decision_regions(
140 |     X=X_combined,
141 |     y=y_combined,
142 |     classifier=lr,
143 |     labels = ['PC1','PC2'] ,
144 |     title='logistic_regression after 2 component PCA')
145 | 
146 | #########################################################################
147 | # run Linear Discriminant Analysis first then Logistic Regression
148 | 
149 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
150 | n_components = 2
151 | lda = LDA(n_components=n_components)
152 | 
153 | X_train_lda = lda.fit_transform(X_train, y_train)
154 | X_test_lda = lda.transform(X_test)
155 | print('\nLDA components = {}'.format(lda.scalings_.shape))
156 | lr = LogisticRegression()
157 | logistic_fitted = lr.fit(X_train_lda, y_train)
158 | 
159 | y_train_pred = logistic_fitted.predict(X_train_lda)
160 | y_test_pred = logistic_fitted.predict(X_test_lda)
161 | 
162 | print('\nLDA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),lda.n_components,lr.coef_.shape))
163 | print('LDA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),lda.n_components,lr.coef_.shape))
164 | 
165 | X_errors_image = X_test[y_test!=y_test_pred]
166 | 
167 | # change to a 2D shape
168 | X_errors2D=np.reshape(X_errors_image, (X_errors_image.shape[0], 20, 20))
169 | ocr_utils.montage(X_errors2D,title='LDA Error Images, components={}'.format (n_components))
170 | 
171 | X_combined = np.vstack((X_train_lda, X_test_lda))
172 | y_combined = np.hstack((y_train, y_test))
173 | 
174 | ocr_utils.plot_decision_regions(
175 |     X=X_combined,
176 |     y=y_combined,
177 |     classifier=lr,
178 |     labels = ['LDA1','LDA2'] ,
179 |     title='logistic_regression after 2 component LDA')
180 | 
181 | print ('\n########################### No Errors ####################################')
182 | 
--------------------------------------------------------------------------------
/p314_k_means.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Jul 18, 2016
3 | k_means.py
4 | 
5 | K-means is an algorithm for finding clusters of similar data without
6 | supervision.
We give it the number of clusters we are looking for and
7 | it lumps the samples together. It does this by finding a centroid, assigning
8 | the closest samples to the centroid, recomputing the centroid from
9 | the mean of the assigned samples, and so on.
10 | 
11 | It basically uses a Euclidean distance to evaluate whether a sample
12 | belongs to a centroid. So it is good for spherical data, but has
13 | trouble with non-spherical data.
14 | 
15 | Unfortunately, the data in the ocr_utils is not spherical so we
16 | get some odd results.
17 | 
18 | For this program
19 | input a bunch of samples from ocr_utils,
20 | run K means on them and
21 | display the results.
22 | 
23 | Repeat this for k-means++, which places the initial centroids far away from
24 | each other.
25 | 
26 | Run an 'elbow plot' that uses the inertia values from each cluster
27 | versus the number of clusters. It shows how many clusters we need
28 | to get the inertia distortion values to stabilize.
29 | 
30 | Make some montage plots of images so that we can see what images are
31 | in the clusters.
32 | 
33 | from Python Machine Learning by Sebastian Raschka under the following license
34 | 
35 | The MIT License (MIT)
36 | 
37 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
38 | 
39 | Permission is hereby granted, free of charge, to any person obtaining a copy
40 | of this software and associated documentation files (the "Software"), to deal
41 | in the Software without restriction, including without limitation the rights
42 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43 | copies of the Software, and to permit persons to whom the Software is
44 | furnished to do so, subject to the following conditions:
45 | 
46 | The above copyright notice and this permission notice shall be included in all
47 | copies or substantial portions of the Software.
48 | 
49 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
55 | SOFTWARE.
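A minimal sketch of one k-means iteration (the names here are illustrative,
not from ocr_utils):

    import numpy as np
    # X: (n_samples, n_features); C: (k, n_features) current centroids
    d = np.linalg.norm(X[:, None, :] - C[None, :, :], axis=2)
    assign = d.argmin(axis=1)              # nearest centroid per sample
    C = np.array([X[assign == j].mean(axis=0) for j in range(len(C))])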
56 | 57 | @author: richard lyman 58 | ''' 59 | import numpy as np 60 | import ocr_utils 61 | import matplotlib.pyplot as plt 62 | n=200 63 | 64 | chars_to_train = range(48,51) 65 | columnsXY=(9,17) 66 | column_str = 'column_sum{}'.format(list(columnsXY)) 67 | skewRange = np.linspace(-0.5,0.5,81) 68 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 69 | 70 | # output the character label and the image and column sums 71 | output_feature_list = ['m_label','image',column_str] 72 | 73 | # read the complete image (20x20) = 400 pixels for each character 74 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 75 | output_feature_list=output_feature_list, 76 | random_state=0) 77 | 78 | y = ds.train.features[0][:n] 79 | X_image = ds.train.features[1][:n] 80 | X = ds.train.features[2][:n] 81 | 82 | # put the ASCII equivalent of the unique characters in y into the legend of the plot 83 | legend=[] 84 | for ys in np.unique(y): 85 | legend.append('{} \'{}\''.format(ys, chr(ys))) 86 | 87 | ocr_utils.scatter_plot(X=X, 88 | y=y, 89 | legend_entries=legend, 90 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 91 | title='k-means cluster E13B sum of columns') 92 | 93 | from sklearn.cluster import KMeans 94 | km = KMeans(n_clusters=3,init='random',n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1) 95 | y_km = km.fit_predict(X) 96 | 97 | legend=[] 98 | for ys in np.unique(y_km): 99 | legend.append('{}\''.format(ys)) 100 | 101 | plt.scatter(km.cluster_centers_[:,0], 102 | km.cluster_centers_[:,1], 103 | s=250, 104 | marker='*', 105 | c='red', 106 | label='centroids') 107 | 108 | ocr_utils.scatter_plot(X=X, 109 | y=y_km, 110 | legend_entries=legend, 111 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 112 | title='column sums k means centroids') 113 | 114 | km = KMeans(n_clusters=3,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1) 115 | y_km = km.fit_predict(X) 116 | 117 | legend=[] 118 | for ys in np.unique(y_km): 119 | legend.append('{}\''.format(ys)) 120 | 121 | plt.scatter(km.cluster_centers_[:,0], 122 | km.cluster_centers_[:,1], 123 | s=250, 124 | marker='*', 125 | c='red', 126 | label='k++') 127 | 128 | ocr_utils.scatter_plot(X=X, 129 | y=y_km, 130 | legend_entries='', 131 | axis_labels = ['column {} sum'.format(columnsXY[i]) for i in range(len(columnsXY))], 132 | title='column sums k++ means centroids') 133 | 134 | 135 | for i in range(0,km.cluster_centers_.shape[0]): 136 | image_index2 = np.argwhere(y_km == i) 137 | x2d = X_image[image_index2].reshape((image_index2.shape[0],ds.train.num_rows, ds.train.num_columns)) 138 | ocr_utils.montage(x2d,title='k++ cluster {}'.format(i)) 139 | 140 | ############################################## 141 | # separate the original images by cluster 142 | # print(km.cluster_centers_.shape) 143 | 144 | n=30000 145 | 146 | chars_to_train = range(48,58) 147 | columnsXY=range(0,20) 148 | column_str = 'column_sum{}'.format(list(columnsXY)) 149 | skewRange = np.linspace(-0.5,0.5,81) 150 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'} 151 | 152 | # output the character label and the image and column sums 153 | output_feature_list = ['m_label','image'] 154 | 155 | # read the complete image (20x20) = 400 pixels for each character 156 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict, 157 | output_feature_list=output_feature_list, 158 | random_state=0) 159 | 160 | y = ds.train.features[0][:n] 161 | X_image = 
ds.train.features[1][:n]
162 | # X = ds.train.features[2][:n]
163 | 
164 | distortions=[]
165 | for i in range(1,30):
166 |     km = KMeans(n_clusters=i,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1)
167 |     y_km = km.fit_predict(X_image)
168 |     distortions.append(km.inertia_)
169 | 
170 | plt.plot(range(1,30), distortions, marker='o')
171 | plt.xlabel('Number of clusters')
172 | plt.ylabel('Distortion')
173 | title = '2D image elbow distortion'
174 | plt.title(title)
175 | ocr_utils.show_figures(plt, title)
176 | 
177 | 
178 | km = KMeans(n_clusters=8,n_init=10,max_iter=300,tol=1e-04,random_state=0,n_jobs=-1)
179 | y_km = km.fit_predict(X_image)
180 | 
181 | nClusters = km.cluster_centers_.shape[0]
182 | x2d = []
183 | sz = np.zeros((nClusters))
184 | 
185 | for i in range(0,nClusters):
186 |     image_index2 = np.argwhere(y_km == i)
187 |     x2d.append( X_image[image_index2].reshape((image_index2.shape[0],ds.train.num_rows, ds.train.num_columns)))
188 |     print (i,x2d[i].shape[0])
189 |     sz[i] = image_index2.shape[0]
190 | 
191 | args= np.argsort(sz)[::-1]
192 | print(sz[args])
193 | print(args)
194 | for i in range(0,nClusters):
195 |     ocr_utils.montage(x2d[args[i]],title='2D image cluster {}'.format(i))
196 | 
197 | 
198 | print ('\n########################### No Errors ####################################')
199 | 
200 | 
--------------------------------------------------------------------------------
/p194_receiver_operating_characteristic.py:
--------------------------------------------------------------------------------
1 | ''' receiver_operating_characteristic.py
2 | 
3 | A receiver operating characteristic plot is a plot of the true positive rate
4 | against the false positive rate for a dataset with binary outcomes.
5 | 
6 | A threshold for determining whether a sample is positive or negative is
7 | the independent variable that is varied to produce the values in the graph.
8 | 
9 | The AUC, Area Under the Curve, can be calculated. The closer to 1.0, the
10 | better the classification.
11 | 
12 | Created on Jul 9, 2016
13 | 
14 | from Python Machine Learning by Sebastian Raschka under the following license
15 | 
16 | The MIT License (MIT)
17 | 
18 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
19 | 
20 | Permission is hereby granted, free of charge, to any person obtaining a copy
21 | of this software and associated documentation files (the "Software"), to deal
22 | in the Software without restriction, including without limitation the rights
23 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
24 | copies of the Software, and to permit persons to whom the Software is
25 | furnished to do so, subject to the following conditions:
26 | 
27 | The above copyright notice and this permission notice shall be included in all
28 | copies or substantial portions of the Software.
29 | 
30 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36 | SOFTWARE.
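For reference, the two rates plotted are TPR = TP / (TP + FN) and
FPR = FP / (FP + TN), with one point per threshold. A toy sketch using
scikit-learn (the labels and scores below are illustrative):

    from sklearn.metrics import roc_curve, auc
    fpr, tpr, thresholds = roc_curve([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8])
    print(auc(fpr, tpr))   # 0.75 for this toy score set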
37 | 38 | @author: richard lyman 39 | ''' 40 | # from sklearn.metrics import make_scorer,roc_curve, auc 41 | from scipy import interp 42 | import matplotlib.pyplot as plt 43 | import numpy as np 44 | import ocr_utils 45 | from sklearn.preprocessing import StandardScaler 46 | from sklearn.linear_model import LogisticRegression 47 | from sklearn.pipeline import Pipeline 48 | from sklearn.model_selection import StratifiedKFold 49 | from sklearn.decomposition import PCA 50 | from sklearn.model_selection import train_test_split 51 | from sklearn.metrics import make_scorer,precision_score,roc_curve, auc 52 | from sklearn.metrics import roc_auc_score, accuracy_score 53 | from sklearn.model_selection import cross_val_score 54 | 55 | if __name__ == '__main__': 56 | 57 | y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) 58 | from sklearn.preprocessing import LabelEncoder 59 | 60 | # the ROC is for data with a binary outcome. Change the ASCII characters to 0,1 61 | le = LabelEncoder() 62 | y = le.fit_transform(y) 63 | le.transform((48,51)) 64 | 65 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1) 66 | 67 | pipe_lr = Pipeline([('scl', StandardScaler()), 68 | ('pca', PCA(n_components=2)), 69 | ('clf', LogisticRegression(penalty='l2',random_state=0,C=100.0, solver='lbfgs'))]) 70 | 71 | # X_train2 = X_train[:, [4, 14]] 72 | X_train2 = X_train 73 | 74 | kfold = StratifiedKFold(n_splits=3, random_state=1) 75 | 76 | # scores = [] 77 | # for train_index, test_index in kfold.split(X_train, y_train): 78 | # pipe_lr.fit(X_train[train_index], y_train[train_index]) 79 | # score = pipe_lr.score(X_train[test_index], y_train[test_index]) 80 | # scores.append(score) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | # cv = StratifiedKFold(y_train,n_folds=3,random_state=1) 89 | fig = plt.figure(figsize=(7, 5)) 90 | 91 | mean_tpr = 0.0 92 | mean_fpr = np.linspace(0, 1, 100) 93 | all_tpr = [] 94 | i=0 95 | for train_index, test_index in kfold.split(X_train, y_train): 96 | probas = pipe_lr.fit(X_train2[train_index], 97 | y_train[train_index]).predict_proba(X_train2[test_index]) 98 | 99 | fpr, tpr, thresholds = roc_curve(y_train[test_index], 100 | probas[:, 1], 101 | pos_label=1) 102 | mean_tpr += interp(mean_fpr, fpr, tpr) 103 | mean_tpr[0] = 0.0 104 | roc_auc = auc(fpr, tpr) 105 | i=i+1 106 | plt.plot(fpr, 107 | tpr, 108 | lw=1, 109 | label='ROC fold %d (area = %0.2f)' 110 | % (i, roc_auc)) 111 | 112 | plt.plot([0, 1], 113 | [0, 1], 114 | linestyle='--', 115 | color=(0.6, 0.6, 0.6), 116 | label='random guessing') 117 | 118 | mean_tpr /= kfold.get_n_splits(X_train) 119 | mean_tpr[-1] = 1.0 120 | mean_auc = auc(mean_fpr, mean_tpr) 121 | plt.plot(mean_fpr, mean_tpr, 'k--', 122 | label='mean ROC (area = %0.2f)' % mean_auc, lw=2) 123 | plt.plot([0, 0, 1], 124 | [0, 1, 1], 125 | lw=2, 126 | linestyle=':', 127 | color='black', 128 | label='perfect performance') 129 | 130 | plt.xlim([-0.05, 1.05]) 131 | plt.ylim([-0.05, 1.05]) 132 | plt.xlabel('false positive rate') 133 | plt.ylabel('true positive rate') 134 | title='Receiver Operator Characteristic' 135 | plt.title(title) 136 | plt.legend(loc="lower right") 137 | plt.tight_layout() 138 | ocr_utils.show_figures(plt,title) 139 | 140 | 141 | pipe_lr = pipe_lr.fit(X_train2, y_train) 142 | # y_pred2 = pipe_lr.predict(X_test[:, [4, 14]]) 143 | y_pred2 = pipe_lr.predict(X_test) 144 | 145 | print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred2)) 146 | print('Accuracy: %.3f' % 
accuracy_score(y_true=y_test, y_pred=y_pred2)) 147 | #=================================================================================================================================================== 148 | #illustrates how to make a scorer using the precision evaluation value 149 | # for more than 2 classes for GridSearch 150 | # i.e. applies a binary scoring technique to multiclasses 151 | pos_label=range(48,58) 152 | # pre_scorer = make_scorer(score_func=precision_score, 153 | # pos_label=pos_label, 154 | # greater_is_better=True, 155 | # average='micro') 156 | 157 | from sklearn.svm import SVC 158 | y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = pos_label , nChars=4000, columns=(9,17), random_state=0) 159 | pipe_svc = Pipeline([('scl', StandardScaler()), 160 | ('clf', SVC(random_state=1))]) 161 | c_gamma_range = [0.01, 0.1, 1.0, 10.0] 162 | 163 | param_grid = [{'clf__C': c_gamma_range, 164 | 'clf__kernel': ['linear']}, 165 | {'clf__C': c_gamma_range, 166 | 'clf__gamma': c_gamma_range, 167 | 'clf__kernel': ['rbf'],}] 168 | from sklearn.model_selection import GridSearchCV 169 | gs = GridSearchCV(estimator=pipe_svc, 170 | param_grid=param_grid, 171 | scoring='accuracy', 172 | cv=5, 173 | n_jobs=-1) 174 | 175 | 176 | scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) 177 | print('\nSupport Vector Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) 178 | 179 | gs = gs.fit(X_train, y_train) 180 | print('Support Vector Machine Grid Search best score: {}'.format(gs.best_score_)) 181 | print('Support Vector Machine Grid Search best params: {}\n'.format(gs.best_params_)) 182 | 183 | print ('\n########################### No Errors ####################################') 184 | 185 | -------------------------------------------------------------------------------- /o1_top_secret_cnn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """# ========================================================================== 5 | 6 | # Copyright 2015 Google Inc. All Rights Reserved. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | # ============================================================================== 20 | 21 | 22 | encode a secret message in the angle of rotation of characters 23 | 24 | Train a neural network on rotated versions of characters with the output of 25 | the network being the angle of rotation. 26 | 27 | Thus, given a rotated character, the neural network will yield a value 28 | that is the amount of rotation of the character. 29 | 30 | Encode a test set by applying a secret message with one bit for each character. 31 | Decode the secret message by running the rotated characters through the 32 | neural network, yielding the pattern of bits. 
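A rough sketch of the mapping (one bit per character, matching the two-entry
skewRange defined below; the names here are illustrative):

    skews = [-0.2, 0.2]    # the two trained angles
    bit = 1                # next bit of the secret message
    angle = skews[bit]     # shear the glyph by this amount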
33 | 34 | 35 | @author: richard lyman 36 | 37 | """# ============================================================================== 38 | import ocr_utils 39 | 40 | 41 | import numpy as np 42 | from PIL import Image, ImageDraw 43 | import io 44 | #import n1_2cnv1fc as nnetwork 45 | #import n1_residual3x4 as nnetwork 46 | import n1_2cnv2fc as nnetwork 47 | import skimage.transform as af 48 | from bitarray import bitarray 49 | 50 | 51 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 52 | output_feature_list = ['orientation_one_hot','image'] 53 | dtype = np.float32 54 | 55 | skewRange = np.linspace(-0.2,0.2,2) 56 | 57 | ''' 58 | pick up the base character 59 | 60 | make a training set by rotating them through n angles 61 | 62 | train 63 | 64 | pick up the base characters 65 | encode the secret message n bits at a time into the characters 66 | this is the testing set 67 | 68 | test secret message yielding a vector of rotations 69 | 70 | convert the rotation back into bits 71 | 72 | assemble the bits into the secret message. 73 | ''' 74 | 75 | 76 | # pick up the base characters from training_image_file 77 | # produce some sheared versions 78 | # make into a training set 79 | # place in a ocr_utils TruthedCharacters class so we can use the 80 | # one hot and batch functions 81 | 82 | character_size = 100 83 | white_space=8 84 | 85 | image_file= '15-01-01 459_Mont_Lyman' 86 | image_file_jpg = image_file+'.jpg' 87 | 88 | df,t1 = ocr_utils.file_to_df(image_file,character_size,title='Characters to Train',white_space=white_space) 89 | 90 | shp = t1.shape 91 | totalN = len(skewRange)*shp[0] 92 | 93 | images=[] 94 | originalH=[] 95 | originalW=[] 96 | tops=[] 97 | lefts=[] 98 | orientation=[] 99 | recognized_label =[] 100 | 101 | 102 | 103 | for j in range(shp[0]): 104 | for i,skew in enumerate(skewRange): 105 | k = i+j*len(skewRange) 106 | 107 | images.append(ocr_utils.shear(t1[j],skew)) 108 | originalH.append(df['originalH'][j]) 109 | tops.append(df['m_top'][j]) 110 | originalW.append(df['originalW'][j]) 111 | lefts.append(df['m_left'][j]) 112 | 113 | orientation.append(skew) 114 | recognized_label.append( df['m_label'][j]) 115 | images=np.array(images) 116 | ocr_utils.montage(images, title='Base Characters Skewed') 117 | 118 | images = np.reshape(images,(images.shape[0],images.shape[1]*images.shape[2])) 119 | df = ocr_utils.make_df(images, character_size, character_size, originalH, originalW, tops, lefts, orientation, recognized_label ) 120 | #df = ocr_utils.make_df(images, character_size, character_size, bottoms, rights, tops, lefts, orientation, recognized_label ) 121 | 122 | 123 | # input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 124 | input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} 125 | output_feature_list = ['orientation_one_hot','image'] 126 | ds = ocr_utils.read_df(df,input_filters_dict = input_filters_dict, 127 | output_feature_list=output_feature_list, 128 | test_size = 0, 129 | engine_type='tensorflow', 130 | dtype=dtype) 131 | 132 | nn = nnetwork.network(ds.train) 133 | """# ============================================================================== 134 | 135 | Train and Evaluate the Model 136 | 137 | """# ============================================================================== 138 | 139 | nn.fit( ds.train , nEpochs=5000) 140 | 141 | ####################################################################################### 142 | 143 | # now that the font is trained, pick up some text and encode a message 144 | 
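# A sketch of the round trip performed below (added annotation, not original
# code): secret_message --(bitarray)--> bits --(one bit per glyph)--> skew
# index --(ocr_utils.shear)--> distorted glyph; decoding runs the glyphs back
# through the trained network to recover the skew indices, hence the bits.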
144 | image_file = '15-01-01 459_Mont_Lyman'
145 | image_file_jpg = image_file+'.jpg'
146 | df,t1 = ocr_utils.file_to_df(image_file,character_size, title = 'unencrypted file',white_space=white_space)
147 | 
148 | 
149 | secret_message = "top secret"
150 | a = bitarray()
151 | a.frombytes(secret_message.encode('utf_8'))
152 | 
153 | index = 0
154 | encoded_skews = []
155 | def convert_to_shear(a):
156 |     index = 0
157 |     while True:
158 |         if index < len(a):
159 |             bits = a[index:index+1].to01()
160 |             index += 1
161 |             #c = int(bits,2)
162 |             c = int(bits)
163 |             yield c
164 |         else:
165 |             yield -1
166 | 
167 | gen = convert_to_shear(a)
168 | 
169 | im = Image.open(image_file_jpg)
170 | img2 = Image.new('L',(im.width,im.height),color=255)   # PIL sizes are (width, height)
171 | img3 = Image.new('L',(im.width,im.height),color=255)
172 | draw = ImageDraw.Draw(img3)
173 | for i in range(t1.shape[0]):
174 |     left = int(df['m_left'][i])
175 |     right = left + int(df['originalW'][i])
176 |     top = int(df['m_top'][i])
177 |     bottom = top + int(df['originalH'][i])
178 |     skew_index = next(gen)
179 |     #print ('i={}, skew_index={}, left={}, top={}, right={}, bottom={}'.format(i,skew_index, left,top,right,bottom))
180 |     encoded_skews.append(skew_index)
181 |     if skew_index >= 0:
182 |         t1[i] = ocr_utils.shear(t1[i], skewRange[skew_index])
183 |     im_clip = Image.fromarray(256.0-t1[i]*256.0)
184 |     img2.paste(im_clip, box= (left , top))
185 |     img3.paste(im_clip, box= (left , top))
186 | 
187 | 
188 |     draw.rectangle((left,top,right+2*white_space,bottom+2*white_space), outline=0)
189 | 
190 | gen.close()
191 | 
192 | #######################################################################################
193 | image_file = '/tmp/plots/01_encrypted_file'
194 | image_file_jpg = image_file+'.jpg'
195 | img2.save(image_file_jpg)
196 | 
197 | 
198 | image_file3 = '/tmp/plots/01_03_encrypted_file_with_box'
199 | image_file3_jpg = image_file3+'.jpg'
200 | img3.save(image_file3_jpg)
201 | 
202 | ''' test the new encrypted file
203 | '''
204 | df,t1 = ocr_utils.file_to_df(image_file,character_size, title = 'Encrypted File',white_space=white_space)
205 | 
206 | ds = ocr_utils.read_df(df,input_filters_dict = input_filters_dict,
207 |                        output_feature_list=output_feature_list,
208 |                        test_size = 1,
209 |                        engine_type='tensorflow',
210 |                        dtype=dtype)
211 | 
212 | results = nn.predict(ds.test)
213 | correct_characters = []
214 | incorrect_characters = []
215 | for i,x in enumerate(df['m_label']):
216 |     try:
217 |         print('index={}, original character={}, result= {}, skew={}'.format(i, chr(int(x)),results[i], encoded_skews[i]) )
218 |         if encoded_skews[i] >= 0:
219 |             if results[i] == encoded_skews[i]:
220 |                 correct_characters.append(chr(int(x)))
221 |             else:
222 |                 incorrect_characters.append(chr(int(x)))
223 |     except IndexError:
224 |         print ('index out of bounds={}'.format(i))
225 | print ('correct characters={}'.format(correct_characters))
226 | print ('incorrect characters={}'.format(incorrect_characters))
227 | 
228 | print ('\n########################### No Errors ####################################')
229 | 
230 | 
--------------------------------------------------------------------------------
/p177_k_fold_cross_validation.py:
--------------------------------------------------------------------------------
1 | '''p177_k_fold_cross_validation.py
2 | k fold cross validation splits the training set into n parts and holds out
3 | a different 1/n as the validation set on each iteration. It is good for
4 | tuning parameters because every sample gets used for validation exactly once,
5 | which reduces the variance of the estimate of model performance.
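Example (an editor's sketch, assuming a feature array X and a label vector y):

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    for train_idx, val_idx in skf.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        print(model.score(X[val_idx], y[val_idx]))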
6 | 
7 | Using a pipeline automates the steps by chaining together, in one estimator,
8 | 1) scaling,
9 | 2) Principal Component Analysis, and
10 | 3) training
11 | 
12 | StratifiedKFold returns lists of the indices of the X samples and y
13 | target samples to be used for each fold
14 | 
15 | cross_val_score returns an accuracy score for the predictor
16 | from each fold.
17 | 
18 | Created on Jul 5, 2016
19 | 
20 | from Python Machine Learning by Sebastian Raschka under the following license
21 | 
22 | The MIT License (MIT)
23 | 
24 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
25 | 
26 | Permission is hereby granted, free of charge, to any person obtaining a copy
27 | of this software and associated documentation files (the "Software"), to deal
28 | in the Software without restriction, including without limitation the rights
29 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30 | copies of the Software, and to permit persons to whom the Software is
31 | furnished to do so, subject to the following conditions:
32 | 
33 | The above copyright notice and this permission notice shall be included in all
34 | copies or substantial portions of the Software.
35 | 
36 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 | SOFTWARE.
43 | 
44 | @author: richard lyman
45 | '''
46 | 
47 | 
48 | import ocr_utils
49 | import matplotlib.pyplot as plt
50 | import numpy as np
51 | from sklearn.model_selection import StratifiedKFold
52 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
53 | 
54 | if __name__ == '__main__':
55 |     #charsToTrain=range(48,58)
56 |     chars_to_train = range(48,58)
57 |     n_classes = len(chars_to_train)
58 | 
59 |     num_chars = 3000 # limit the number to speed up the calculation
60 | 
61 |     input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}
62 | 
63 |     # output the character label and the image and column sums
64 |     output_feature_list = ['m_label','image']
65 | 
66 |     # read the complete image (20x20) = 400 pixels for each character
67 |     ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
68 |                              output_feature_list=output_feature_list,
69 |                              random_state=0)
70 | 
71 |     y_train = ds.train.features[0][:num_chars]
72 |     X_train = ds.train.features[1][:num_chars]
73 | 
74 |     # y_test = ds.test.features[0]-48
75 |     # X_test = ds.test.features[1]
76 |     # y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=range(0,20), nChars=1000, test_size=0.3,random_state=0)
77 | 
78 |     from sklearn.linear_model import LogisticRegression
79 |     from sklearn.model_selection import train_test_split
80 | 
81 |     X_train , X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=0)
82 | 
83 |     from sklearn.preprocessing import StandardScaler
84 |     #
85 |     # sc = StandardScaler()
86 |     # X_train_std = sc.fit_transform(X_train)
87 |     # X_test_std = sc.transform(X_test)
88 | 
89 |     # X_train, X_test, y_train, y_test = \
90 |     #     train_test_split(X, y, test_size=0.20, random_state=1)
91 | 
92 |     from sklearn.decomposition import PCA
93 | 
94 |     from sklearn.pipeline import Pipeline
95 | 
96 |     num_planes = range(2,12)
97 | 
98 |     pca_scores = []
99 |     pca_std_dev = []
100 |     for num_PCA in num_planes:
101 |         print ('number of Principal Components = {}'.format(num_PCA))
102 |         pipe_lr = Pipeline([('scl', StandardScaler()),
103 |                             ('pca', PCA(n_components=num_PCA, svd_solver='full')),
104 |                             ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
105 | 
106 |         pipe_lr.fit(X_train, y_train)
107 |         print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
108 | 
109 |         kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # shuffle must be True when random_state is set
110 | 
111 |         scores = []
112 |         for train_index, test_index in kfold.split(X_train, y_train):
113 |             pipe_lr.fit(X_train[train_index], y_train[train_index])
114 |             score = pipe_lr.score(X_train[test_index], y_train[test_index])
115 |             scores.append(score)
116 |             #print ('train {} samples: {}'.format(len(train), train))
117 |             #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score))
118 | 
119 |         print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
120 |         from sklearn.model_selection import cross_val_score
121 | 
122 |         scores = cross_val_score(estimator=pipe_lr,
123 |                                  X=X_train,
124 |                                  y=y_train,
125 |                                  cv=10,
126 |                                  n_jobs=-1)
127 |         print('CV accuracy scores: %s' % scores)
128 |         print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
129 |         pca_scores.append(np.mean(scores))
130 |         pca_std_dev.append(np.std(scores))
131 | 
132 |     plt.plot(num_planes, pca_scores, marker='o')
133 |     plt.ylabel('Accuracy')
134 |     plt.xlabel('number of Principal Components')
135 |     title = 'Accuracy versus number of Principal Components'
136 |     plt.title(title)
137 |     plt.tight_layout()
138 |     ocr_utils.show_figures(plt, title)
139 | 
140 |     plt.plot(num_planes, pca_std_dev, marker='o')
141 |     plt.ylabel('Standard Deviation')
142 |     plt.xlabel('number of Principal Components')
143 |     title = 'Standard Deviation versus number of Principal Components'
144 |     plt.title(title)
145 |     plt.tight_layout()
146 |     ocr_utils.show_figures(plt, title)
147 | 
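    # Editor's aside (a sketch, not in the original): what the Pipeline buys us
    # per fold. fit() runs fit_transform through 'scl' and 'pca' and then fit on
    # 'clf'; score()/predict() run transform only. The scaler and PCA are thus
    # learned exclusively from each fold's training split, so nothing leaks in
    # from the validation split. pipe_demo is a hypothetical name.
    pipe_demo = Pipeline([('scl', StandardScaler()),
                          ('pca', PCA(n_components=2, svd_solver='full')),
                          ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
    # pipe_demo.fit(X_train, y_train); pipe_demo.score(X_test, y_test)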
148 |     pca_scores = []
149 |     pca_std_dev = []
150 |     for num_LDA in num_planes:
151 |         print ('number of Linear Discriminants = {}'.format(num_LDA))
152 |         pipe_lr = Pipeline([('scl', StandardScaler()),
153 |                             ('lda', LDA(n_components=min(num_LDA,n_classes-1), solver='eigen')),
154 |                             ('clf', LogisticRegression(random_state=1, multi_class='auto', solver='liblinear'))])
155 | 
156 |         kys = pipe_lr.get_params().keys()
157 |         print(kys)
158 |         # pipe_lr.set_params(lda__solver='eigen',clf__solver='liblinear',clf__multi_class='auto')
159 |         pipe_lr.fit(X_train, y_train)
160 |         print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
161 | 
162 |         kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # shuffle must be True when random_state is set
163 | 
164 |         scores = []
165 |         for train_index, test_index in kfold.split(X_train, y_train):
166 |             pipe_lr.fit(X_train[train_index], y_train[train_index])
167 |             score = pipe_lr.score(X_train[test_index], y_train[test_index])
168 |             scores.append(score)
169 |             #print ('train {} samples: {}'.format(len(train), train))
170 |             #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score))
171 | 
172 |         print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
173 | 
174 | 
175 |         scores = cross_val_score(estimator=pipe_lr,
176 |                                  X=X_train,
177 |                                  y=y_train,
178 |                                  cv=10,
179 |                                  n_jobs=-1)
180 |         print('CV accuracy scores: %s' % scores)
181 |         print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
182 |         pca_scores.append(np.mean(scores))
183 |         pca_std_dev.append(np.std(scores))
184 | 
185 |     plt.plot(num_planes, pca_scores, marker='o')
186 |     plt.ylabel('Accuracy')
187 |     plt.xlabel('number of Linear Discriminants')
188 |     title = 'Accuracy versus number of Linear Discriminants'
189 |     plt.title(title)
190 |     plt.tight_layout()
191 |     ocr_utils.show_figures(plt, title)
192 | 
193 |     plt.plot(num_planes, pca_std_dev, marker='o')
194 |     plt.ylabel('Standard Deviation')
195 |     plt.xlabel('number of Linear Discriminants')
196 |     title = 'Standard Deviation versus number of Linear Discriminants'
197 |     plt.title(title)
198 |     plt.tight_layout()
199 |     ocr_utils.show_figures(plt, title)
200 | 
201 |     print ('\n########################### No Errors ####################################')
--------------------------------------------------------------------------------
/n1_residual3x4.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | takes an image input and trains it to make an image output
4 | 
5 | funnels down to a 'key' and then goes back up to image
6 | 
7 | 
8 | 
9 | '''
10 | from tensorflow.compat import v1 as tf; tf.disable_v2_behavior()  # tf1-style placeholders and sessions need graph mode under TensorFlow 2
11 | import numpy as np
12 | from collections import namedtuple
13 | import datetime
14 | import ocr_utils
15 | from n0_network import base_network as b_network
16 | 
17 | class network(b_network):
18 |     ''' definition of the network
19 |     '''
20 |     def __init__(self, truthed_features, dtype=np.float32):
21 |         self._sess = tf.InteractiveSession()
22 | 
23 |         lst = []
24 |         extra_features_width = 0 # width of extra features
25 | 
26 |         """# ==============================================================================
27 | 
28 |         Placeholders
29 | 
30 |         Compute the size of various layers
31 | 
32 |         Create a tensorflow Placeholder for each feature of data returned from the
33 |         dataset
34 | 
35 |         """# ==============================================================================
36 | 
37 |         for i,nm in enumerate(truthed_features.feature_names):
38 | 
39 |             # features[0] is always the target. For instance it may be m_label_one_hot.
40 |             # The second, features[1], is the 'image' that is passed to the convolution layers.
41 |             # Any additional features bypass the convolution layers and go directly
42 |             # into the fully connected layer.
43 | 
44 |             # The width of the extra features is calculated in order to allocate
45 |             # the correct widths of weights and inputs.
46 |             # names are assigned to make them look pretty on the tensorboard graph.
47 | 
48 |             if i == 0:
49 |                 nm = 'y_'+nm
50 |             else:
51 |                 nm = 'x_'+nm
52 |             if i>1:
53 |                 extra_features_width += truthed_features.feature_width[i]
54 |             lst.append(tf.placeholder(dtype, shape=[None, truthed_features.feature_width[i]], name=nm))
55 | 
56 |         # ph is a named tuple with key names like 'image' and 'm_label', and values
57 |         # that are tensors. The display names on the Chrome tensorboard graph are
58 |         # 'y_m_label', 'x_image', 'x_upper_case', etc.
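        # Editor's aside (illustrative only; the _PHDemo names are made up): the
        # namedtuple built just below lets the placeholders be read by name or by
        # position, which is how the rest of this class uses them
        # (self._ph.image and self._ph[0]).
        _PHDemo = namedtuple('PHDemo', ['m_label_one_hot', 'image'])
        _demo = _PHDemo('target_tensor', 'image_tensor')
        assert _demo.image == _demo[1] and _demo[0] == _demo.m_label_one_hot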
59 | 
60 | 
61 |         Place_Holders = namedtuple('Place_Holders', truthed_features.feature_names)
62 |         self._ph = Place_Holders(*lst) # unpack placeholders into named Tuple
63 |         self._keep_prob = tf.placeholder(dtype,name='keep_prob')
64 |         self._nRows = truthed_features.num_rows #image height
65 |         self._nCols = truthed_features.num_columns #image width
66 |         nSections = 10
67 | 
68 |         in_out_width = self._nRows*self._nCols
69 |         internal_width = int(in_out_width/4)
70 |         w = list(range(nSections*3))
71 |         b = list(range(nSections*3))
72 |         h = list(range(nSections*3+1))
73 |         nFc1 = 2048 # size of fully connected layer
74 | 
75 |         nTarget = truthed_features.feature_width[0] # the number of one_hot features in the target, 'm_label'
76 | 
77 |         """# ==============================================================================
78 | 
79 |         Build a Multilayer Convolutional Network
80 | 
81 |         Weight Initialization
82 | 
83 |         """# ==============================================================================
84 | 
85 |         def weight_variable(shape, dtype):
86 |             initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
87 |             return tf.Variable(initial)
88 | 
89 |         def bias_variable(shape, dtype):
90 |             initial = tf.constant(0, shape=shape, dtype=dtype)
91 |             return tf.Variable(initial)
92 | 
93 |         def shapeOuts(n):
94 |             print ('n={}, hin={}, w={}, b={}, hout={}\n'.format(n, h[n].shape, w[n].shape, b[n].shape, h[n+1].shape))
95 | 
96 |         def section(n):
97 |             with tf.name_scope('section_'+str(n)+'_0') as scope:
98 |                 w[n] = weight_variable([in_out_width, internal_width],dtype)
99 |                 b[n] = bias_variable([internal_width],dtype)
100 |                 h[n+1] = tf.nn.relu(tf.matmul(h[n], w[n]) + b[n])
101 |                 shapeOuts(n)
102 | 
103 |             with tf.name_scope('section_'+str(n)+'_1') as scope:
104 |                 w[n+1] = weight_variable([internal_width, internal_width],dtype)
105 |                 b[n+1] = bias_variable([internal_width],dtype)
106 | 
107 |                 h[n+2] = tf.nn.relu(tf.matmul(h[n+1], w[n+1]) + b[n+1])
108 |                 shapeOuts(n+1)
109 | 
110 |             with tf.name_scope('section_'+str(n)+'_2') as scope:
111 |                 w[n+2] = weight_variable([internal_width, in_out_width],dtype)
112 |                 b[n+2] = bias_variable([in_out_width],dtype)
113 |                 z = tf.nn.relu(tf.matmul(h[n+2], w[n+2]) + b[n+2])
114 |                 h[n+3] = tf.add(z ,h[n]) #n+3 is the skip-connection output
115 | 
116 |                 print('z shape ={}'.format(z.shape))
117 |                 shapeOuts(n+2)
118 |             return
119 | 
120 |         def computeSize(s,tens):
121 |             sumC = 1
122 |             tShape = tens.get_shape()
123 |             nDims = len(tShape)
124 |             for i in range(nDims):
125 |                 sumC *= tShape[i]
126 |             print ('\t{}\t{}'.format(s,sumC),flush=True)
127 |             return sumC
128 | 
129 |         """# ==============================================================================
130 |         Build sectional network
131 | 
132 |         """# ==============================================================================
133 |         h[0] = self._ph[1]
134 |         for i in range(nSections):
135 |             section(3*i)
136 | 
137 |         """# ==============================================================================
138 |         Dropout
139 | 
140 |         """# ==============================================================================
141 |         self._keep_prob = tf.placeholder(dtype,name='keep_prob')  # note: re-creates the placeholder defined above; this later one is the one dropout uses
142 | 
143 |         with tf.name_scope("drop") as scope:
144 |             h_fc2_drop = tf.nn.dropout(h[nSections*3], self._keep_prob)
145 | 
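        # Editor's aside (a numpy miniature of the section() blocks above; the
        # _res_demo names are hypothetical). Each section maps the activation
        # from in_out_width down to internal_width and back, then adds the block
        # input -- the residual "skip" connection of h[n+3] = z + h[n].
        def _res_demo(h0, w1, w2, w3):
            h1 = np.maximum(0.0, h0 @ w1)   # relu(h0 w1): down to internal width
            h2 = np.maximum(0.0, h1 @ w2)   # relu(h1 w2): internal width again
            z = np.maximum(0.0, h2 @ w3)    # relu(h2 w3): back to in/out width
            return z + h0                   # skip connection preserves the shape
        _rng = np.random.RandomState(0)
        _h0 = _rng.rand(1, 8)
        assert _res_demo(_h0, _rng.rand(8, 2), _rng.rand(2, 2), _rng.rand(2, 8)).shape == _h0.shape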
146 |         """# ==============================================================================
147 | 
148 |         Readout Layer
149 | 
150 |         """# ==============================================================================
151 |         with tf.name_scope("softmax") as scope:
152 |             w_fc3 = weight_variable([in_out_width, nTarget],dtype)
153 |             b_fc3 = bias_variable([nTarget],dtype)
154 |             y_conv = tf.nn.softmax(tf.matmul(h_fc2_drop, w_fc3) + b_fc3)
155 | 
156 |         print ('network size:',flush=True)
157 |         total = 0
158 |         for i in range(nSections*3):
159 |             total = total + computeSize("w{}".format(i),w[i])
160 |         total = total + computeSize ("b_fc3",b_fc3) + \
161 |             computeSize ("w_fc3",w_fc3)
162 | 
163 |         print('\ttotal\t{}'.format(total),flush=True)
164 | 
165 | 
166 |         with tf.name_scope("reshape_x_image") as scope:
167 |             self._x_image = tf.reshape(self._ph.image, [-1,self._nCols,self._nRows,1])
168 | 
169 |         with tf.name_scope("xent") as scope:
170 |             # 1e-8 added to eliminate the crash of training when taking log of 0
171 |             cross_entropy = -tf.reduce_sum(self._ph[0]*tf.log(y_conv+1e-8))
172 |             ce_summ = tf.summary.scalar("cross entropy", cross_entropy)
173 | 
174 |         with tf.name_scope("train") as scope:
175 |             self._train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
176 | 
177 |         with tf.name_scope("test") as scope:
178 |             self._correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(self._ph[0],1))
179 |             self._prediction = tf.argmax(y_conv,1)
180 | 
181 |         self._accuracy = tf.reduce_mean(tf.cast(self._correct_prediction, dtype))
182 |         accuracy_summary = tf.summary.scalar("accuracy", self._accuracy)
183 | 
184 |         """# ==============================================================================
185 | 
186 |         Start TensorFlow Interactive Session
187 | 
188 |         """# ==============================================================================
189 | 
190 |         self._sess.run(tf.initialize_all_variables())
191 |         self._merged = tf.summary.merge_all()
192 |         tm = ""
193 |         tp = datetime.datetime.now().timetuple()
194 |         for i in range(4):
195 |             tm += str(tp[i])+'-'
196 |         tm += str(tp[4])
197 | 
198 |         # To see the results in Chrome,
199 |         # Run the following in terminal to activate server.
200 |         # tensorboard --logdir '/tmp/ds_logs/'
201 |         # See results on localhost:6006
202 | 
203 |         self._writer = tf.summary.FileWriter("/tmp/ds_logs/"+ tm, self._sess.graph)
204 | 
205 |     def computeSize(s,tens):
206 |         sumC = 1
207 |         tShape = tens.get_shape()
208 |         nDims = len(tShape)
209 |         for i in range(nDims):
210 |             sumC *= tShape[i].value
211 |         print ('\t{}\t{}'.format(s,sumC),flush=True)
212 |         return sumC
213 | 
214 | 
215 | 
216 |     def __exit__(self, exc_type, exc_value, traceback):
217 |         tf.reset_default_graph() # only necessary when iterating through fonts
218 |         self._sess.close()
219 | 
220 | 
221 |     def reset_graph(self):
222 |         tf.reset_default_graph() # only necessary when iterating through fonts
223 |         self._sess.close()
224 | 
225 |     #
226 |     # def encode(self):
227 |     #
228 |     #     return key
229 |     #
230 |     # def decode(self, key):
--------------------------------------------------------------------------------
/o3_top_secret_python_box.py:
--------------------------------------------------------------------------------
1 | '''
2 | Created on Oct 23, 2016
3 | 
4 | Created on Jul 12, 2016
5 | This program shows how Principal Component Analysis removes affine
6 | transformation distortions.
7 | 
8 | Parallel lines in an image remain parallel after an affine transformation.
9 | For instance, if an image is rotated or sheared, lines remain parallel.
10 | 
11 | PCA and LDA can remove affine transformations. This is shown by making 3 shapes
12 | and then making a number of sheared versions of the shapes.  Running
13 | Principal Component Analysis reduces the number of features necessary to
14 | recognize the shapes during Logistic Regression with 100% accuracy,
15 | down to 2 from 400 (20 columns by 20 rows).
16 | 
17 | We make three images and then make about 80 copies of each image created by
18 | shearing the original image.
19 | 
20 | Since there is very little noise introduced by the shearing, almost all of
21 | the explained variance is due to the shearing. PCA finds eigenvectors
22 | that line up with the shearing.
23 | 
24 | 1) For a couple of shapes, make sheared versions.
25 | 2) train and print accuracies without PCA
26 | 3) repeat, but use PCA first before training.
27 | 4) observe the improvement
28 | 
29 | Do the same thing for Linear Discriminant Analysis
30 | encode a secret message in the angle of rotation of characters
31 | 
32 | Train a neural network on rotated versions of characters with the output of
33 | the network being the angle of rotation.
34 | 
35 | Thus, given a rotated character, the neural network will yield a value
36 | that is the amount of rotation of the character.
37 | 
38 | Encode a test set by applying a secret message with one bit for each character.
39 | Decode the secret message by running the rotated characters through the
40 | neural network, yielding the pattern of bits.
41 | 
42 | pick up the base character
43 | 
44 | make a training set by rotating them through n angles
45 | 
46 | train
47 | 
48 | pick up the base characters
49 | encode the secret message n bits at a time into the characters
50 | this is the testing set
51 | 
52 | test secret message yielding a vector of rotations
53 | 
54 | convert the rotation back into bits
55 | 
56 | assemble the bits into the secret message.
57 | 
58 | 
59 | @author: richard lyman
60 | 
61 | 
62 | '''# ==============================================================================
63 | 
64 | import ocr_utils
65 | 
66 | 
67 | import numpy as np
68 | from PIL import Image, ImageDraw
69 | import io
70 | from sklearn.metrics import accuracy_score
71 | from sklearn.decomposition import PCA
72 | from sklearn.metrics import accuracy_score
73 | #from sklearn.model_selection
74 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
75 | from sklearn.linear_model import LogisticRegression
76 | from sklearn.model_selection import train_test_split
77 | 
78 | # input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))}
79 | # output_feature_list = ['orientation_one_hot','image']
80 | dtype = np.float32
81 | character_size = 100
82 | white_space = 10
83 | skewRange = np.linspace(-0.2,0.2,4)
84 | 
85 | class c_box(object):
86 |     def __init__(self, top, left, right, bottom):
87 |         self._top = top
88 |         self._left = left
89 |         self._right = right
90 |         self._bottom = bottom
91 | 
92 | 
93 | def find_min_max(sums):
94 |     case = 0
95 |     mins = []
96 |     maxes = []
97 |     for i,sum in enumerate(sums):
98 |         '''
99 |         case 0, going through area between characters
100 |             if sum == 0 stay in case 0
101 |             if sum != 0 set the top to i and switch to case 1
102 |         case 1, going through a character
103 |             if sum == 0 set the bottom to i and drop to case 0,
104 |                 also append the box to the list using
105 |                 left = 0, and right = the width of the image
106 |             if sum != 0 then continue in case 1
107 |         '''
108 | 
109 |         if case == 0 :
110 |             if sum != 0 :
111 |                 case = 1
112 |                 min = i
113 |         else:
114 |             if sum == 0 :
115 |                 case = 0
116 |                 max = i
117 |                 mins.append(min)
118 |                 maxes.append(max)
119 |     return mins, maxes
120 | 
121 | 
122 | 
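# Editor's aside (a quick check of find_min_max on a toy ink profile; the
# _demo names are hypothetical). Non-zero runs in the row/column sums mark
# character bands; each run yields its first index and the index just past it.
# Note that a run still open at the end of the profile is not emitted.
_demo_mins, _demo_maxes = find_min_max([0, 0, 5, 7, 0, 0, 3, 0])
assert (_demo_mins, _demo_maxes) == ([2, 6], [4, 7])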
123 | # pick up the base characters from training_image_file
124 | # produce some sheared versions
125 | # make into a training set
126 | # place in an ocr_utils TruthedCharacters class so we can use the
127 | # one hot and batch functions
128 | 
129 | im = Image.open('15-01-01 459_Mont_Lyman.jpg')  # the repo ships this scan as a .jpg
130 | #im = Image.open('CourierFont.png')
131 | im = im.convert(mode='L')
132 | data = 255-np.asarray( im, dtype="int32" )
133 | sums = np.sum(data,axis=1)
134 | mins, maxes = find_min_max(sums)
135 | boxes = []
136 | for top,bottom in zip(mins,maxes):
137 |     line = data[top:bottom]
138 |     line_sums = np.sum(line,axis=0)
139 |     lefts,rights = find_min_max(line_sums)
140 |     for left,right in zip(lefts,rights):
141 |         boxes.append(c_box(top,left,right,bottom))
142 | 
143 | images = []
144 | orientation = []
145 | recognized_label = []
146 | for box in boxes:
147 | 
148 |     img2 = Image.new('L',(character_size,character_size),color=255)
149 | 
150 |     img = im.crop(box=(box._left, box._top, box._right, box._bottom))
151 |     img2.paste(img,box=(white_space,white_space))
152 | 
153 |     imgByteArr = img2.tobytes()
154 |     lst = list(imgByteArr)
155 |     image = np.array(lst)/255.0
156 |     image = 1.0 - image
157 |     images.append(image)
158 | 
159 | height = im.height
160 | width = im.width
161 | 
162 | t1 = np.array(images)
163 | t1 = np.reshape(t1,(t1.shape[0],character_size,character_size))
164 | ocr_utils.montage(t1, title='characters from file')
165 | 
166 | shp = t1.shape
167 | totalN = len(skewRange)*shp[0]
168 | images = []
169 | import skimage.transform as af
170 | 
171 | for j in range(shp[0]):
172 |     for i,skew in enumerate(skewRange):
173 |         images.append(ocr_utils.shear(t1[j],skew))
174 |         orientation.append(skew)
175 | 
176 | images = np.array(images)
177 | ocr_utils.montage(images, title='characters being trained')
178 | images = np.reshape(images,(len(images),character_size*character_size))
179 | ys = ocr_utils.convert_to_unique(orientation)
180 | 
181 | 
182 | X_train , X_test, y_train, y_test = train_test_split(images, ys, test_size=0.3, random_state=0)
183 | print (y_test.shape)
184 | 
185 | lr = LogisticRegression()
186 | lr.fit(X_train, y_train)
187 | y_train_pred = lr.predict(X_train)
188 | y_test_pred = lr.predict(X_test)
189 | 
190 | print('\nTrain Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_train, y_train_pred), lr.coef_.shape))
191 | print('Test Accuracy: {:4.6f} coefficients={}'.format(accuracy_score(y_test, y_test_pred), lr.coef_.shape))
192 | 
193 | #########################################################################
194 | # run Principal Component Analysis first, then Logistic Regression
195 | 
196 | n_components = 2
197 | pca = PCA(n_components=n_components)
198 | 
199 | X_train_pca = pca.fit_transform(X_train)
200 | X_test_pca = pca.transform(X_test)
201 | 
202 | print('\nPCA components = {}'.format(pca.components_.shape))
203 | 
204 | lr = LogisticRegression()
205 | logistic_fitted = lr.fit(X_train_pca, y_train)
206 | 
207 | y_train_pred = logistic_fitted.predict(X_train_pca)
208 | y_test_pred = logistic_fitted.predict(X_test_pca)
209 | 
210 | print('\nPCA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components,lr.coef_.shape))
211 | print('PCA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,lr.coef_.shape))
212 | 
213 | X_errors_image = X_test[y_test!=y_test_pred]
214 | y_errors = y_test[y_test!=y_test_pred]
215 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
216 | 
217 | # change to a 2D shape
218 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], character_size, character_size))
219 | ocr_utils.montage(X_errors2D,title='PCA Error Images, components={}'.format(n_components))
220 | 
221 | X_combined = np.vstack((X_train_pca, X_test_pca))
222 | y_combined = np.hstack((y_train, y_test))
223 | 
224 | ocr_utils.plot_decision_regions(
225 |     X=X_combined,
226 |     y=y_combined,
227 |     classifier=lr,
228 |     labels = ['PC1','PC2'] ,
229 |     title='logistic_regression after 2 component PCA')
230 | 
231 | 
232 | #########################################################################
233 | # run Linear Discriminant Analysis first, then Logistic Regression
234 | 
235 | 
236 | n_components = 2
237 | lda = LinearDiscriminantAnalysis(n_components=n_components)
238 | 
239 | X_train_lda = lda.fit_transform(X_train, y_train)
240 | X_test_lda = lda.transform(X_test)
241 | 
242 | print('\nLDA components = {}'.format(lda.n_components))
243 | lr = LogisticRegression()
244 | logistic_fitted = lr.fit(X_train_lda, y_train)
245 | 
246 | y_train_pred = logistic_fitted.predict(X_train_lda)
247 | y_test_pred = logistic_fitted.predict(X_test_lda)
248 | 
249 | print('\nLDA Train Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_train, y_train_pred),lda.n_components,lr.coef_.shape))
250 | print('LDA Test Accuracy: {:4.6f}, n_components={} coefficients={}'.format(accuracy_score(y_test, y_test_pred),lda.n_components,lr.coef_.shape))
251 | 
252 | X_errors_image = X_test[y_test!=y_test_pred]
253 | 
254 | # change to a 2D shape
255 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], character_size, character_size))
256 | ocr_utils.montage(X_errors2D,title='LDA Error Images, components={}'.format(n_components))
257 | 
258 | X_combined = np.vstack((X_train_lda, X_test_lda))
259 | y_combined = np.hstack((y_train, y_test))
260 | if X_combined.shape[1] > 1:
261 |     ocr_utils.plot_decision_regions(
262 |         X=X_combined,
263 |         y=y_combined,
264 |         classifier=lr,
265 |         labels = ['LDA1','LDA2'] ,
266 |         title='logistic_regression after 2 component LDA')
267 | print ('\n########################### No Errors ####################################')
268 | 
--------------------------------------------------------------------------------
/p131_principal_component_analysis.py:
--------------------------------------------------------------------------------
1 | ''' p131_principal_component_analysis.py
2 | Principal Component Analysis reduces the dimensionality of the feature set by
3 | projecting it onto the components with the largest explained variance. It does this by
4 | 1) computing a covariance matrix for the features
5 | 2) finding the eigenvectors and eigenvalues of the matrix (the principal components)
6 | 3) computing the explained variance for the components and sorting them
7 | 
8 | Always standardize first, because PCA is sensitive to scaling
9 | 
10 | 
11 | Created on Jul 2, 2016
12 | 
13 | from Python Machine Learning by Sebastian Raschka under the following license
14 | 
15 | The MIT License (MIT)
16 | 
17 | Copyright (c) 2015, 2016 SEBASTIAN RASCHKA (mail@sebastianraschka.com)
18 | 
19 | Permission is hereby granted, free of charge, to any person obtaining a copy
20 | of this software and associated documentation files (the "Software"), to deal
21 | in the Software without restriction, including without limitation the rights
22 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
23 | copies of the Software, and to permit persons to whom the Software is
24 | furnished to do so, subject to the following conditions:
25 | 
26 | The above copyright notice and this permission notice shall be included in all
27 | copies or substantial portions of the Software.
28 | 
29 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
34 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35 | SOFTWARE.
36 | 
37 | @author: richard lyman
38 | '''
39 | n_components = 10 # number of PCA components to use for the final accuracy
40 | 
41 | import numpy as np
42 | import ocr_utils
43 | import matplotlib.pyplot as plt
44 | from sklearn.metrics import accuracy_score
45 | from sklearn.linear_model import LogisticRegression
46 | from sklearn.decomposition import PCA
47 | from sklearn.decomposition import KernelPCA
48 | 
49 | 
50 | chars_to_train = range(48,58)
51 | columnsXY = range(0,20)
52 | column_str = 'column_sum{}'.format(list(columnsXY))
53 | 
54 | input_filters_dict = {'m_label': chars_to_train, 'font': 'E13B'}
55 | 
56 | # output the character label and the image and column sums
57 | output_feature_list = ['m_label','image',column_str]
58 | 
59 | # read the complete image (20x20) = 400 pixels for each character
60 | ds = ocr_utils.read_data(input_filters_dict=input_filters_dict,
61 |                          output_feature_list=output_feature_list,
62 |                          test_size=.2,
63 |                          random_state=0)
64 | windows_limit = 5000 # uses too much memory for my 32 bit windows computer so limit the sample size
65 | y_train = ds.train.features[0][:windows_limit]
66 | X_train_image = ds.train.features[1][:windows_limit]
67 | X_train = ds.train.features[2][:windows_limit]
68 | 
69 | y_test = ds.test.features[0]
70 | X_test_image = ds.test.features[1]
71 | X_test = ds.test.features[2]
72 | 
73 | 
74 | cov_mat = np.cov(X_train_image.T)
75 | eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
76 | 
77 | print('\nEigenvalues \n%s' % eigen_vals[:2*n_components])
78 | 
79 | tot = sum(eigen_vals)
80 | var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
81 | cum_var_exp = np.cumsum(var_exp)
82 | var_exp = var_exp[:2*n_components]
83 | cum_var_exp = cum_var_exp[:2*n_components]
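# Editor's aside (a toy spectrum, illustrative only): the explained variance
# ratio is each eigenvalue over the eigenvalue total, largest first, and the
# cumulative sum shows how much variance the first k components retain.
_demo_eigs = [4.0, 3.0, 2.0, 1.0]
_demo_ratios = [e / sum(_demo_eigs) for e in sorted(_demo_eigs, reverse=True)]
assert _demo_ratios == [0.4, 0.3, 0.2, 0.1]
assert np.allclose(np.cumsum(_demo_ratios), [0.4, 0.7, 0.9, 1.0])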
84 | title = 'explained variance'
85 | plt.bar(range(1, len(var_exp)+1), var_exp, alpha=0.5, align='center', label='individual explained variance')
86 | plt.step(range(1, len(cum_var_exp)+1), cum_var_exp, where='mid',
87 |          label='cumulative explained variance')
88 | plt.ylabel('Explained variance ratio')
89 | plt.xlabel('Principal components')
90 | plt.legend(loc='best')
91 | plt.tight_layout()
92 | plt.title(title)
93 | ocr_utils.show_figures(plt,title)
94 | 
95 | # Make a list of (eigenvalue, eigenvector) tuples
96 | eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]
97 | 
98 | # Sort the (eigenvalue, eigenvector) tuples from high to low
99 | eigen_pairs.sort(key=lambda k: k[0], reverse=True)  # sort on the eigenvalue only; comparing the array half of the tuple would fail
100 | 
101 | # The eigenpairs with the highest explained variance
102 | w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
103 |                eigen_pairs[1][1][:, np.newaxis]))
104 | print('Matrix W:\n', w[:2*n_components,:])
105 | 
106 | X_train_pca = X_train_image.dot(w)
107 | print ('projection of first dataset sample on first 2 eigenvectors {}'.format(X_train_image[0].dot(w)))
108 | 
109 | markers = ('o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd')
110 | colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan','orange','green','brown','lightblue','pink')
111 | 
112 | for l, c, m in zip(np.unique(y_train), colors, markers):
113 |     plt.scatter(X_train_pca[y_train==l, 0],
114 |                 X_train_pca[y_train==l, 1],
115 |                 c=c, label=l, marker=m)
116 | 
117 | plt.xlabel('PC 1')
118 | plt.ylabel('PC 2')
119 | plt.legend(loc='lower left')
120 | plt.tight_layout()
121 | title = 'features mapped to two principal components'
122 | plt.title(title)
123 | ocr_utils.show_figures(plt,title)
124 | 
125 | ########################################################################################
126 | 
127 | 
128 | pca = PCA(n_components=2)
129 | 
130 | X_train_pca = pca.fit_transform(X_train_image)
131 | X_test_pca = pca.transform(X_test_image)
132 | 
133 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
134 | logistic_fitted = lr.fit(X_train_pca, y_train)
135 | 
136 | print('\nPCA Train Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_train, logistic_fitted.predict(X_train_pca)),pca.n_components))
137 | print('PCA Test Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_test, logistic_fitted.predict(X_test_pca)),pca.n_components))
138 | 
139 | title = 'train pc1 versus pc2'
140 | ocr_utils.plot_decision_regions(X=X_train_pca, y=y_train, classifier=lr, labels=['pc1','pc2'], title=title)
141 | 
142 | title = 'test pc1 versus pc2'
143 | ocr_utils.plot_decision_regions(X=X_test_pca, y=y_test, classifier=lr, labels=['pc1','pc2'], title=title)
144 | X_train_pca = pca.fit_transform(X_train_image)
145 | X_test_pca = pca.transform(X_test_image)
146 | 
147 | ########################################################################################
148 | pca = PCA(n_components=n_components)
149 | X_train_pca = pca.fit_transform(X_train_image)
150 | X_test_pca = pca.transform(X_test_image)
151 | 
152 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
153 | logistic_fitted = lr.fit(X_train_pca, y_train)
154 | 
155 | y_train_pred = logistic_fitted.predict(X_train_pca)
156 | y_test_pred = logistic_fitted.predict(X_test_pca)
157 | 
158 | print('\nPCA Train Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_train, y_train_pred),pca.n_components))
159 | print('PCA Test Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components))
160 | 
161 | X_errors_image = X_test_image[y_test!=y_test_pred]
162 | y_errors = y_test[y_test!=y_test_pred]
163 | X_errors_pca = X_test_pca[y_test!=y_test_pred]
164 | 
165 | X_orig = X_train_image[:500]
166 | title = 'originals'
167 | X2D = np.reshape(X_orig, (X_orig.shape[0], ds.train.num_rows, ds.train.num_columns))
168 | ocr_utils.montage(X2D,title=title)
169 | 
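# Editor's aside (a sketch, not in the original): a direct way to quantify what
# the n_components reconstruction below throws away is the mean squared
# reconstruction error of transform followed by inverse_transform.
_X_rec = pca.inverse_transform(pca.transform(X_train_image))
_rec_mse = np.mean((X_train_image - _X_rec) ** 2)
print('mean squared PCA reconstruction error: {:4.6f}'.format(_rec_mse))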
170 | X_orig = X_train_pca[:500]
171 | title = 'inverse original'
172 | X_inverse = pca.inverse_transform(X_orig)
173 | X2D = np.reshape(X_inverse, (X_inverse.shape[0], ds.train.num_rows, ds.train.num_columns))
174 | X2D = X2D - np.min(X2D)
175 | ocr_utils.montage(X2D,title=title)
176 | 
177 | # change to a 2D shape
178 | X_errors2D = np.reshape(X_errors_image, (X_errors_image.shape[0], ds.train.num_rows, ds.train.num_columns))
179 | ocr_utils.montage(X_errors2D,title='PCA Error Characters, components={}'.format(n_components))
180 | 
181 | title = 'inverse transform errors'
182 | X_inverse = pca.inverse_transform(X_errors_pca)
183 | X2D = np.reshape(X_inverse, (X_inverse.shape[0], ds.train.num_rows, ds.train.num_columns))
184 | X2D = X2D - np.min(X2D)
185 | ocr_utils.montage(X2D,title=title)
186 | 
187 | ########################################################################################
188 | kernel = 'rbf' # really slow
189 | pca = KernelPCA(n_components=2, kernel=kernel, gamma=15)
190 | 
191 | X_train_pca = pca.fit_transform(X_train_image)
192 | X_test_pca = pca.transform(X_test_image)
193 | 
194 | lr = LogisticRegression(solver='liblinear',multi_class='auto')
195 | logistic_fitted = lr.fit(X_train_pca, y_train)
196 | y_train_pred = logistic_fitted.predict(X_train_pca)
197 | y_test_pred = logistic_fitted.predict(X_test_pca)
198 | 
199 | print('\nKernel PCA Train Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_train, y_train_pred), pca.n_components,kernel))
200 | print('Kernel PCA Test Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_test, y_test_pred),pca.n_components,kernel))
201 | 
202 | title = 'train kernel {} pc1 versus pc2'.format(kernel)
203 | ocr_utils.plot_decision_regions(X=X_train_pca, y=y_train, classifier=lr, labels=['pc1','pc2'], title=title)
204 | 
205 | title = 'test kernel {} pc1 versus pc2'.format(kernel)
206 | ocr_utils.plot_decision_regions(X=X_test_pca, y=y_test, classifier=lr, labels=['pc1','pc2'], title=title)
207 | 
208 | 
209 | 
210 | 
211 | ########################################################################################
212 | # too slow on my computer
213 | 
214 | # pca = KernelPCA(n_components=n_components,kernel=kernel, gamma = 15)
215 | #
216 | # X_train_pca = pca.fit_transform(X_train_image)
217 | # X_test_pca = pca.transform(X_test_image)
218 | #
219 | # print ('n_components={}'.format(pca.n_components))
220 | #
221 | # lr = LogisticRegression()
222 | # logistic_fitted = lr.fit(X_train_pca, y_train)
223 | #
224 | # y_pred = logistic_fitted.predict(X_test_pca)
225 | # print('\nKernelPCA Train Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_train, logistic_fitted.predict(X_train_pca)), pca.n_components, kernel))
226 | # print('KernelPCA Test Accuracy: {:4.6f}, n_components={}, kernel={}'.format(accuracy_score(y_test, y_pred), pca.n_components, kernel))
227 | #
228 | # X_errors_image = X_test_image[y_test!=y_pred]
229 | # y_errors = y_test[y_test!=y_pred]
230 | #
231 | # error_images = X_errors_image.shape[0]
232 | #
233 | # # change to a 2D shape
234 | # X_errors2D=np.reshape(X_errors_image, (error_images, ds.train.num_rows, ds.train.num_columns))
235 | # ocr_utils.montage(X_errors2D,title='Kernel {} KernelPCA Error Characters, components={}'.format(kernel,n_components))
236 | 
237 | print ('\n########################### No Errors ####################################')
238 | 
239 | 
--------------------------------------------------------------------------------
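# Editor's appendix (a hedged sketch, not a file from this repository): if the
# full-size rbf KernelPCA above is too slow, one workaround is to fit the
# kernel map on a random subsample and then transform the full set. All names
# below are made up for illustration.
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X_demo = rng.rand(500, 400)                          # stand-in for 20x20 image rows
kpca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
kpca.fit(X_demo[rng.choice(len(X_demo), 100, replace=False)])  # fit on a subsample
X_demo_kpca = kpca.transform(X_demo)                 # project everything
print(X_demo_kpca.shape)                             # (500, 2)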