├── LICENSE
├── README.md
├── global.py
├── organize_flowers17.py
├── output
    ├── data.h5
    ├── global.md
    └── labels.h5
└── train_test.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Gogul
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Image Classification using Python and Machine Learning
 2 | 
 3 | This repo contains the code to perform a simple image classification task using Python and Machine Learning. We will apply global feature descriptors such as Color Histograms, Haralick Textures and Hu Moments to extract features from FLOWER17 dataset and use machine learning models to learn and predict.
 4 | 
 5 | [UPDATE]
 6 | Now, you can simply run `organize_flowers17.py` script to download and organize training data for this project. Also, I have updated the code to support only Python 3+ as Python 2+ faces end of life.
 7 | 
 8 | ## Summary of the project
 9 | * Global Feature Descriptors such as Color Histograms, Haralick Textures and Hu Moments are used on University of Oxford's FLOWER17 dataset.
10 | * Classifiers used are Logistic Regression, Linear Discriminant Analysis, K-Nearest Neighbors, Decision Trees, Random Forests, Gaussian Naive Bayes and Support Vector Machine.
11 | 
12 | ## Usage 
13 | 
14 | * `python organize_flowers17.py` - Downloads Flowers17 Dataset and organizes training set in disk.
15 | * `python global.py` - Extracts global features from training set and stores it in disk.
16 | * `python train_test.py` - Predicts the image class using the trained model.
17 | 
18 | Tutorial for this project is available at - [Image Classification using Python and Machine Learning](https://gogul09.github.io/software/image-classification-python)


--------------------------------------------------------------------------------
/global.py:
--------------------------------------------------------------------------------
  1 | #-----------------------------------
  2 | # GLOBAL FEATURE EXTRACTION
  3 | #-----------------------------------
  4 | from sklearn.preprocessing import LabelEncoder
  5 | from sklearn.preprocessing import MinMaxScaler
  6 | import numpy as np
  7 | import mahotas
  8 | import cv2
  9 | import os
 10 | import h5py
 11 | 
 12 | #--------------------
 13 | # tunable-parameters
 14 | #--------------------
 15 | images_per_class = 80
 16 | fixed_size       = tuple((500, 500))
 17 | train_path       = "dataset/train"
 18 | h5_data          = 'output/data.h5'
 19 | h5_labels        = 'output/labels.h5'
 20 | bins             = 8
 21 | 
 22 | # feature-descriptor-1: Hu Moments
 23 | def fd_hu_moments(image):
 24 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
 25 |     feature = cv2.HuMoments(cv2.moments(image)).flatten()
 26 |     return feature
 27 | 
 28 | # feature-descriptor-2: Haralick Texture
 29 | def fd_haralick(image):
 30 |     # convert the image to grayscale
 31 |     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
 32 |     # compute the haralick texture feature vector
 33 |     haralick = mahotas.features.haralick(gray).mean(axis=0)
 34 |     # return the result
 35 |     return haralick
 36 | 
 37 | # feature-descriptor-3: Color Histogram
 38 | def fd_histogram(image, mask=None):
 39 |     # convert the image to HSV color-space
 40 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
 41 |     # compute the color histogram
 42 |     hist  = cv2.calcHist([image], [0, 1, 2], None, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
 43 |     # normalize the histogram
 44 |     cv2.normalize(hist, hist)
 45 |     # return the histogram
 46 |     return hist.flatten()
 47 | 
 48 | # get the training labels
 49 | train_labels = os.listdir(train_path)
 50 | 
 51 | # sort the training labels
 52 | train_labels.sort()
 53 | print(train_labels)
 54 | 
 55 | # empty lists to hold feature vectors and labels
 56 | global_features = []
 57 | labels          = []
 58 | 
 59 | # loop over the training data sub-folders
 60 | for training_name in train_labels:
 61 |     # join the training data path and each species training folder
 62 |     dir = os.path.join(train_path, training_name)
 63 | 
 64 |     # get the current training label
 65 |     current_label = training_name
 66 | 
 67 |     # loop over the images in each sub-folder
 68 |     for x in range(1,images_per_class+1):
 69 |         # get the image file name
 70 |         file = dir + "/" + str(x) + ".jpg"
 71 | 
 72 |         # read the image and resize it to a fixed-size
 73 |         image = cv2.imread(file)
 74 |         image = cv2.resize(image, fixed_size)
 75 | 
 76 |         ####################################
 77 |         # Global Feature extraction
 78 |         ####################################
 79 |         fv_hu_moments = fd_hu_moments(image)
 80 |         fv_haralick   = fd_haralick(image)
 81 |         fv_histogram  = fd_histogram(image)
 82 | 
 83 |         ###################################
 84 |         # Concatenate global features
 85 |         ###################################
 86 |         global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])
 87 | 
 88 |         # update the list of labels and feature vectors
 89 |         labels.append(current_label)
 90 |         global_features.append(global_feature)
 91 | 
 92 |     print("[STATUS] processed folder: {}".format(current_label))
 93 | 
 94 | print("[STATUS] completed Global Feature Extraction...")
 95 | 
 96 | # get the overall feature vector size
 97 | print("[STATUS] feature vector size {}".format(np.array(global_features).shape))
 98 | 
 99 | # get the overall training label size
100 | print("[STATUS] training Labels {}".format(np.array(labels).shape))
101 | 
102 | # encode the target labels
103 | targetNames = np.unique(labels)
104 | le          = LabelEncoder()
105 | target      = le.fit_transform(labels)
106 | print("[STATUS] training labels encoded...")
107 | 
108 | # scale features in the range (0-1)
109 | scaler            = MinMaxScaler(feature_range=(0, 1))
110 | rescaled_features = scaler.fit_transform(global_features)
111 | print("[STATUS] feature vector normalized...")
112 | 
113 | print("[STATUS] target labels: {}".format(target))
114 | print("[STATUS] target labels shape: {}".format(target.shape))
115 | 
116 | # save the feature vector using HDF5
117 | h5f_data = h5py.File(h5_data, 'w')
118 | h5f_data.create_dataset('dataset_1', data=np.array(rescaled_features))
119 | 
120 | h5f_label = h5py.File(h5_labels, 'w')
121 | h5f_label.create_dataset('dataset_1', data=np.array(target))
122 | 
123 | h5f_data.close()
124 | h5f_label.close()
125 | 
126 | print("[STATUS] end of training..")


--------------------------------------------------------------------------------
/organize_flowers17.py:
--------------------------------------------------------------------------------
 1 | #-----------------------------------------
 2 | # DOWNLOAD AND ORGANIZE FLOWERS17 DATASET
 3 | #-----------------------------------------
 4 | import os
 5 | import glob
 6 | import datetime
 7 | import tarfile
 8 | import urllib.request
 9 | 
10 | def download_dataset(filename, url, work_dir):
11 | 	if not os.path.exists(filename):
12 | 		print("[INFO] Downloading flowers17 dataset....")
13 | 		filename, _ = urllib.request.urlretrieve(url + filename, filename)
14 | 		statinfo = os.stat(filename)
15 | 		print("[INFO] Succesfully downloaded " + filename + " " + str(statinfo.st_size) + " bytes.")
16 | 		untar(filename, work_dir)
17 | 
18 | def jpg_files(members):
19 | 	for tarinfo in members:
20 | 		if os.path.splitext(tarinfo.name)[1] == ".jpg":
21 | 			yield tarinfo
22 | 
23 | def untar(fname, path):
24 | 	tar = tarfile.open(fname)
25 | 	tar.extractall(path=path, members=jpg_files(tar))
26 | 	tar.close()
27 | 	print("[INFO] Dataset extracted successfully.")
28 | 
29 | #-------------------------
30 | # MAIN FUNCTION
31 | #-------------------------
32 | if __name__ == '__main__':
33 | 	flowers17_url  = "http://www.robots.ox.ac.uk/~vgg/data/flowers/17/"
34 | 	flowers17_name = "17flowers.tgz"
35 | 	train_dir      = "dataset"
36 | 
37 | 	if not os.path.exists(train_dir):
38 | 		os.makedirs(train_dir)
39 | 
40 | 	download_dataset(flowers17_name, flowers17_url, train_dir)
41 | 	if os.path.exists(train_dir + "\\jpg"):
42 | 		os.rename(train_dir + "\\jpg", train_dir + "\\train")
43 | 
44 | 
45 | 	# get the class label limit
46 | 	class_limit = 17
47 | 
48 | 	# take all the images from the dataset
49 | 	image_paths = glob.glob(train_dir + "\\train\\*.jpg")
50 | 
51 | 	# variables to keep track
52 | 	label = 0
53 | 	i = 0
54 | 	j = 80
55 | 
56 | 	# flower17 class names
57 | 	class_names = ["daffodil", "snowdrop", "lilyvalley", "bluebell", "crocus",
58 | 			   	   "iris", "tigerlily", "tulip", "fritillary", "sunflower", 
59 | 			       "daisy", "coltsfoot", "dandelion", "cowslip", "buttercup",
60 | 			       "windflower", "pansy"]
61 | 
62 | 	# loop over the class labels
63 | 	for x in range(1, class_limit+1):
64 | 		# create a folder for that class
65 | 		os.makedirs(train_dir + "\\train\\" + class_names[label])
66 | 		
67 | 		# get the current path
68 | 		cur_path = train_dir + "\\train\\" + class_names[label] + "\\"
69 | 		
70 | 		# loop over the images in the dataset
71 | 		for index, image_path in enumerate(image_paths[i:j], start=1):
72 | 			original_path   = image_path
73 | 			image_path      = image_path.split("\\")
74 | 			image_file_name = str(index) + ".jpg"
75 | 			os.rename(original_path, cur_path + image_file_name)
76 | 		
77 | 		i += 80
78 | 		j += 80
79 | 		label += 1


--------------------------------------------------------------------------------
/output/data.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WonderEagle/ImageClassify/f8833ae87a24117b7c2f2d4a0079bf76da362bf9/output/data.h5


--------------------------------------------------------------------------------
/output/global.md:
--------------------------------------------------------------------------------
1 | This is the folder where your extracted global features are stored.
2 | 
3 | This folder will have 2 files.
4 |   --data.h5
5 |   --labels.h5


--------------------------------------------------------------------------------
/output/labels.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WonderEagle/ImageClassify/f8833ae87a24117b7c2f2d4a0079bf76da362bf9/output/labels.h5


--------------------------------------------------------------------------------
/train_test.py:
--------------------------------------------------------------------------------
  1 | #-----------------------------------
  2 | # TRAINING OUR MODEL
  3 | #-----------------------------------
  4 | import h5py
  5 | import numpy as np
  6 | import os
  7 | import glob
  8 | import cv2
  9 | import joblib
 10 | import warnings
 11 | from matplotlib import pyplot
 12 | from sklearn.model_selection import train_test_split, cross_val_score
 13 | from sklearn.model_selection import KFold, StratifiedKFold
 14 | from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
 15 | from sklearn.linear_model import LogisticRegression
 16 | from sklearn.tree import DecisionTreeClassifier
 17 | from sklearn.ensemble import RandomForestClassifier
 18 | from sklearn.neighbors import KNeighborsClassifier
 19 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
 20 | from sklearn.naive_bayes import GaussianNB
 21 | from sklearn.svm import SVC
 22 | warnings.filterwarnings('ignore')
 23 | 
 24 | #--------------------
 25 | # tunable-parameters
 26 | #--------------------
 27 | num_trees = 100
 28 | test_size = 0.10
 29 | seed      = 9
 30 | train_path = "dataset/train"
 31 | test_path  = "dataset/test"
 32 | h5_data    = 'output/data.h5'
 33 | h5_labels  = 'output/labels.h5'
 34 | scoring    = "accuracy"
 35 | bins=8
 36 | fixed_size = tuple((500, 500))
 37 | # get the training labels
 38 | train_labels = os.listdir(train_path)
 39 | 
 40 | # sort the training labels
 41 | train_labels.sort()
 42 | 
 43 | if not os.path.exists(test_path):
 44 |     os.makedirs(test_path)
 45 | 
 46 | # create all the machine learning models
 47 | models = []
 48 | models.append(('LR', LogisticRegression(random_state=seed)))
 49 | models.append(('LDA', LinearDiscriminantAnalysis()))
 50 | models.append(('KNN', KNeighborsClassifier()))
 51 | models.append(('CART', DecisionTreeClassifier(random_state=seed)))
 52 | models.append(('RF', RandomForestClassifier(n_estimators=num_trees, random_state=seed)))
 53 | models.append(('NB', GaussianNB()))
 54 | models.append(('SVM', SVC(random_state=seed)))
 55 | 
 56 | # variables to hold the results and names
 57 | results = []
 58 | names   = []
 59 | 
 60 | # import the feature vector and trained labels
 61 | h5f_data  = h5py.File(h5_data, 'r')
 62 | h5f_label = h5py.File(h5_labels, 'r')
 63 | 
 64 | global_features_string = h5f_data['dataset_1']
 65 | global_labels_string   = h5f_label['dataset_1']
 66 | 
 67 | global_features = np.array(global_features_string)
 68 | global_labels   = np.array(global_labels_string)
 69 | 
 70 | h5f_data.close()
 71 | h5f_label.close()
 72 | 
 73 | # verify the shape of the feature vector and labels
 74 | print("[STATUS] features shape: {}".format(global_features.shape))
 75 | print("[STATUS] labels shape: {}".format(global_labels.shape))
 76 | 
 77 | print("[STATUS] training started...")
 78 | 
 79 | # split the training and testing data
 80 | (trainDataGlobal, testDataGlobal, trainLabelsGlobal, testLabelsGlobal) = train_test_split(
 81 |     np.array(global_features),np.array(global_labels), test_size=test_size,random_state=seed)
 82 | 
 83 | print("[STATUS] splitted train and test data...")
 84 | print("Train data  : {}".format(trainDataGlobal.shape))
 85 | print("Test data   : {}".format(testDataGlobal.shape))
 86 | print("Train labels: {}".format(trainLabelsGlobal.shape))
 87 | print("Test labels : {}".format(testLabelsGlobal.shape))
 88 | 
 89 | # 10-fold cross validation
 90 | for name, model in models:
 91 |     kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
 92 |     cv_results = cross_val_score(model, trainDataGlobal, trainLabelsGlobal, cv=kfold, scoring=scoring)
 93 |     results.append(cv_results)
 94 |     names.append(name)
 95 |     msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
 96 |     print(msg)
 97 | 
 98 | # boxplot algorithm comparison
 99 | fig = pyplot.figure()
100 | fig.suptitle('Machine Learning algorithm comparison')
101 | ax = fig.add_subplot(111)
102 | pyplot.boxplot(results)
103 | ax.set_xticklabels(names)
104 | pyplot.show()
105 | 
106 | #-----------------------------------
107 | # TESTING OUR MODEL
108 | #-----------------------------------
109 | 
110 | # to visualize results
111 | import matplotlib.pyplot as plt
112 | import mahotas
113 | from sklearn.preprocessing import MinMaxScaler
114 | # feature-descriptor-1: Hu Moments
115 | def fd_hu_moments(image):
116 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
117 |     feature = cv2.HuMoments(cv2.moments(image)).flatten()
118 |     return feature
119 | 
120 | # feature-descriptor-2: Haralick Texture
121 | def fd_haralick(image):
122 |     # convert the image to grayscale
123 |     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
124 |     # compute the haralick texture feature vector
125 |     haralick = mahotas.features.haralick(gray).mean(axis=0)
126 |     # return the result
127 |     return haralick
128 | 
129 | # feature-descriptor-3: Color Histogram
130 | def fd_histogram(image, mask=None):
131 |     # convert the image to HSV color-space
132 |     image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
133 |     # compute the color histogram
134 |     hist  = cv2.calcHist([image], [0, 1, 2], None, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
135 |     # normalize the histogram
136 |     cv2.normalize(hist, hist)
137 |     # return the histogram
138 |     return hist.flatten()
139 | # create the model - Random Forests
140 | clf  = RandomForestClassifier(n_estimators=num_trees, random_state=seed)
141 | 
142 | # fit the training data to the model
143 | clf.fit(trainDataGlobal, trainLabelsGlobal)
144 | 
145 | # loop through the test images
146 | for file in glob.glob(test_path + "/*.jpg"):
147 |     # read the image
148 |     image = cv2.imread(file)
149 | 
150 |     # resize the image
151 |     image = cv2.resize(image, fixed_size)
152 | 
153 |     ####################################
154 |     # Global Feature extraction
155 |     ####################################
156 |     fv_hu_moments = fd_hu_moments(image)
157 |     fv_haralick   = fd_haralick(image)
158 |     fv_histogram  = fd_histogram(image)
159 | 
160 |     ###################################
161 |     # Concatenate global features
162 |     ###################################
163 |     global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments]).reshape(1, -1)
164 |     # print(global_feature.reshape(-1, 1))
165 |     # scale features in the range (0-1)
166 |     scaler = MinMaxScaler(feature_range=(0, 1))
167 |     rescaled_feature = scaler.fit_transform(global_feature)
168 |     # predict label of test image
169 |     prediction = clf.predict(rescaled_feature.reshape(1,-1))[0]
170 |     print(prediction)
171 |     # show predicted label on image
172 |     cv2.putText(image, train_labels[prediction], (20,30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,255), 3)
173 | 
174 |     # display the output image
175 |     plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
176 |     plt.show()


--------------------------------------------------------------------------------