├── Mean Shift-part 39,40,41,42.py ├── Mean Shift-part 39,40.py ├── Mean Shift-part 41,42.py ├── README.md ├── Regression part 10 and Part 11.py ├── SVM-PART 28,Kernel-part 29,30,31.py ├── SVM-part 20,21,22.py ├── SVM-part 25,26,27.py ├── Tic-Tac-Toe AI.py ├── cluster-part34,35,36,37.py ├── k means...nearest neighbours-part 13,14,15.py ├── k means...nearest neighbours-part 16,17,18,19.py ├── k-means from scratch-part 37,38.py ├── kernel,cvxopt-part 32.py ├── regress ion part 4 and part 5.py ├── regression part 12.py ├── regression part 6,7,8,9.py ├── regression-part 1 and 2.py └── svm(final)-sklearn-part 33.py /Mean Shift-part 39,40,41,42.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 27 19:01:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn.cluster import MeanShift 9 | from sklearn.datasets.samples_generator import make_blobs 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | from matplotlib import style 13 | style.use("ggplot") 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | from pandas.api.types import is_numeric_dtype 17 | ############Basic Visulisation of Mean Shift 18 | #centers = [[1,1,1],[5,5,5],[3,10,10]] 19 | # 20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5) 21 | # 22 | #ms = MeanShift() 23 | #ms.fit(X) 24 | #labels = ms.labels_ 25 | #cluster_centers = ms.cluster_centers_ 26 | # 27 | #print(cluster_centers) 28 | #n_clusters_ = len(np.unique(labels)) 29 | #print("Number of estimated clusters:", n_clusters_) 30 | # 31 | #colors = 10*['r','g','b','c','k','y','m'] 32 | #fig = plt.figure() 33 | #ax = fig.add_subplot(111, projection='3d') 34 | # 35 | #for i in range(len(X)): 36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o') 37 | # 38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2], 39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10) 40 | 41 | ###################################################################### 42 | # -*- coding: utf-8 -*- 43 | #############Mean Shift on Titanic Dataset 44 | df = pd.read_excel('titanic.xls') 45 | orginal_df=pd.DataFrame.copy(df) 46 | 47 | 48 | df.drop(['body','name'], 1, inplace=True) 49 | #df.convert_objects(convert_numeric=True) 50 | print(df.head()) 51 | df.fillna(0,inplace=True) 52 | 53 | def handle_non_numerical_data(df): 54 | 55 | # handling non-numerical data: must convert. 56 | columns = df.columns.values 57 | 58 | for column in columns: 59 | text_digit_vals = {} 60 | def convert_to_int(val): 61 | return text_digit_vals[val] 62 | 63 | #print(column,df[column].dtype) 64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 65 | 66 | column_contents = df[column].values.tolist() 67 | #finding just the uniques 68 | unique_elements = set(column_contents) 69 | # great, found them. 70 | x = 0 71 | for unique in unique_elements: 72 | if unique not in text_digit_vals: 73 | # creating dict that contains new 74 | # id per unique string 75 | text_digit_vals[unique] = x 76 | x+=1 77 | # now we map the new "id" vlaue 78 | # to replace the string. 79 | df[column] = list(map(convert_to_int,df[column])) 80 | 81 | return df 82 | 83 | df = handle_non_numerical_data(df) 84 | print(df.head()) 85 | 86 | # add/remove features just to see impact they have. 
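# Note: a minimal equivalent sketch, assuming a newer pandas release where the
# positional axis argument to drop() is deprecated in favour of keywords:
#   df.drop(columns=['ticket','home.dest'], inplace=True)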
87 | df.drop(['ticket','home.dest'], 1, inplace=True) 88 | 89 | 90 | X = np.array(df.drop(['survived'], 1).astype(float)) 91 | X = preprocessing.scale(X) 92 | y = np.array(df['survived']) 93 | 94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 95 | 96 | clf = MeanShift()##change this 97 | clf.fit(X) 98 | 99 | 100 | labels=clf.labels_ 101 | cluster_centers=clf.cluster_centers_ 102 | 103 | orginal_df["cluster_group"]=np.nan 104 | ##########in order to check survial for each individual cluster formed by MeanShift and check accuracy of clustes formed 105 | for i in range(len(X)): 106 | orginal_df["cluster_group"].iloc[i]=labels[i] 107 | n_clusters_=len(np.unique(labels)) 108 | survival_rates={}###to see survival rate for different classes 109 | for i in range(n_clusters_): 110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster 111 | # print(temp_df) 112 | survival_cluster=temp_df[(temp_df["survived"]==1)] 113 | survival_rate=len(survival_cluster)/len(temp_df) 114 | survival_rates[i]=survival_rate 115 | #print(orginal_df[(orginal_df["cluster_group"]==2)]) 116 | ###Now you can use df.describe() to analyse the data for different classes 117 | -------------------------------------------------------------------------------- /Mean Shift-part 39,40.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 27 19:01:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn.cluster import MeanShift 9 | from sklearn.datasets.samples_generator import make_blobs 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | from matplotlib import style 13 | style.use("ggplot") 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | from pandas.api.types import is_numeric_dtype 17 | ############Basic Visulisation of Mean Shift 18 | #centers = [[1,1,1],[5,5,5],[3,10,10]] 19 | # 20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5) 21 | # 22 | #ms = MeanShift() 23 | #ms.fit(X) 24 | #labels = ms.labels_ 25 | #cluster_centers = ms.cluster_centers_ 26 | # 27 | #print(cluster_centers) 28 | #n_clusters_ = len(np.unique(labels)) 29 | #print("Number of estimated clusters:", n_clusters_) 30 | # 31 | #colors = 10*['r','g','b','c','k','y','m'] 32 | #fig = plt.figure() 33 | #ax = fig.add_subplot(111, projection='3d') 34 | # 35 | #for i in range(len(X)): 36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o') 37 | # 38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2], 39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10) 40 | 41 | ###################################################################### 42 | # -*- coding: utf-8 -*- 43 | #############Mean Shift on Titanic Dataset 44 | df = pd.read_excel('titanic.xls') 45 | orginal_df=pd.DataFrame.copy(df) 46 | 47 | 48 | df.drop(['body','name'], 1, inplace=True) 49 | #df.convert_objects(convert_numeric=True) 50 | print(df.head()) 51 | df.fillna(0,inplace=True) 52 | 53 | def handle_non_numerical_data(df): 54 | 55 | # handling non-numerical data: must convert. 
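# A minimal alternative sketch, assuming sklearn's LabelEncoder is acceptable here;
# it performs the same kind of string-to-integer mapping as the loop below:
#   from sklearn.preprocessing import LabelEncoder
#   for col in df.select_dtypes(exclude=[np.int64, np.float64]).columns:
#       df[col] = LabelEncoder().fit_transform(df[col].astype(str))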
56 | columns = df.columns.values 57 | 58 | for column in columns: 59 | text_digit_vals = {} 60 | def convert_to_int(val): 61 | return text_digit_vals[val] 62 | 63 | #print(column,df[column].dtype) 64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 65 | 66 | column_contents = df[column].values.tolist() 67 | #finding just the uniques 68 | unique_elements = set(column_contents) 69 | # great, found them. 70 | x = 0 71 | for unique in unique_elements: 72 | if unique not in text_digit_vals: 73 | # creating dict that contains new 74 | # id per unique string 75 | text_digit_vals[unique] = x 76 | x+=1 77 | # now we map the new "id" vlaue 78 | # to replace the string. 79 | df[column] = list(map(convert_to_int,df[column])) 80 | 81 | return df 82 | 83 | df = handle_non_numerical_data(df) 84 | print(df.head()) 85 | 86 | # add/remove features just to see impact they have. 87 | df.drop(['ticket','home.dest'], 1, inplace=True) 88 | 89 | 90 | X = np.array(df.drop(['survived'], 1).astype(float)) 91 | X = preprocessing.scale(X) 92 | y = np.array(df['survived']) 93 | 94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 95 | 96 | clf = MeanShift()##change this 97 | clf.fit(X) 98 | 99 | 100 | labels=clf.labels_ 101 | cluster_centers=clf.cluster_centers_ 102 | 103 | orginal_df["cluster_group"]=np.nan 104 | ##########in order to check survial for each individual cluster formed by MeanShift and check accuracy of clustes formed 105 | for i in range(len(X)): 106 | orginal_df["cluster_group"].iloc[i]=labels[i] 107 | n_clusters_=len(np.unique(labels)) 108 | survival_rates={}###to see survival rate for different classes 109 | for i in range(n_clusters_): 110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster 111 | # print(temp_df) 112 | survival_cluster=temp_df[(temp_df["survived"]==1)] 113 | survival_rate=len(survival_cluster)/len(temp_df) 114 | survival_rates[i]=survival_rate 115 | #print(orginal_df[(orginal_df["cluster_group"]==2)]) 116 | ###Now you can use df.describe() to analyse the data for different classes 117 | -------------------------------------------------------------------------------- /Mean Shift-part 41,42.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 28 12:43:31 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import matplotlib.pyplot as plt 8 | from matplotlib import style 9 | style.use("ggplot") 10 | import numpy as np 11 | from sklearn.cluster import KMeans 12 | import pandas as pd 13 | from sklearn import preprocessing 14 | from pandas.api.types import is_numeric_dtype 15 | from sklearn.datasets.samples_generator import make_blobs 16 | import random 17 | 18 | #####basic visulisation of k-means 19 | 20 | #plt.scatter(X[:,0],X[:,1],s=150) 21 | X,y=make_blobs(n_samples=50,centers=5,n_features=2) 22 | 23 | 24 | clf=KMeans(n_clusters=2) 25 | clf.fit(X) 26 | centroids=clf.cluster_centers_ 27 | labels=clf.labels_ 28 | colors=10*["g","r","c","b","k","o"] 29 | ######Making Meansift from scartch 30 | class Mean_Shift: 31 | def __init__(self,radius=None,radius_norm_step=100): 32 | self.radius=radius 33 | self.radius_norm_step=radius_norm_step 34 | def fit(self,data): 35 | 36 | if self.radius==None: 37 | all_data_centroid=np.average(data,axis=0) 38 | all_data_norm=np.linalg.norm(all_data_centroid) 39 | self.radius=all_data_norm/self.radius_norm_step 40 | 41 | 42 | centroids={} 43 | 44 | for i in 
range(len(data)): 45 | centroids[i]=data[i] 46 | weights = [i for i in range(self.radius_norm_step)][::-1] 47 | while True: 48 | new_centroids = [] 49 | for i in centroids: 50 | in_bandwidth = [] 51 | centroid = centroids[i] 52 | 53 | for featureset in data: 54 | #if np.linalg.norm(featureset-centroid) < self.radius: 55 | # in_bandwidth.append(featureset) 56 | distance = np.linalg.norm(featureset-centroid) 57 | if distance == 0: 58 | distance = 0.00000000001 59 | weight_index = int(distance/self.radius) 60 | if weight_index > self.radius_norm_step-1: 61 | weight_index = self.radius_norm_step-1 62 | 63 | to_add = (weights[weight_index]**2)*[featureset] 64 | in_bandwidth +=to_add 65 | 66 | 67 | new_centroid = np.average(in_bandwidth,axis=0) 68 | new_centroids.append(tuple(new_centroid)) 69 | 70 | uniques = sorted(list(set(new_centroids))) 71 | to_pop=[] 72 | for i in uniques: 73 | for ii in uniques: 74 | if i==ii: 75 | pass 76 | elif np.linalg.norm(np.array(i)-np.array(ii))<=self.radius: 77 | to_pop.append(ii) 78 | break 79 | for i in to_pop: 80 | try: 81 | uniques.remove(i) 82 | except: 83 | pass 84 | 85 | 86 | prev_centroids=dict(centroids) 87 | 88 | 89 | centroids={} 90 | for i in range(len(uniques)): 91 | centroids[i]=np.array(uniques[i]) 92 | optimized=True 93 | 94 | for i in centroids: 95 | if not np.array_equal(centroids[i],prev_centroids[i]): 96 | optimized=False 97 | 98 | if not optimized: 99 | break 100 | 101 | if optimized: 102 | break 103 | 104 | self.centroids=centroids 105 | 106 | self.classification={} 107 | for i in range(len(self.centroids)): 108 | self.classification[i]=[] 109 | for featureset in data: 110 | distances=[np.linalg.norm(featureset-self.centroids[centroid])for centroid in self.centroids] 111 | classification=distances.index(min(distances)) 112 | self.classification[classification].append(featureset) 113 | 114 | def predict(self,data): 115 | distances=[np.linalg.norm(data-self.centroids[centroid])for centroid in self.centroids] 116 | classification=distances.index(min(distances)) 117 | return classification 118 | clf=Mean_Shift() 119 | clf.fit(X) 120 | 121 | 122 | centroids=clf.centroids 123 | plt.scatter(X[:,0],X[:,1],s=150) 124 | 125 | for classification in clf.classification: 126 | color=colors[classification] 127 | for featureset in clf.classification[classification]: 128 | plt.scatter(featureset[0],featureset[1],marker="x",color=color,s=150) 129 | 130 | for c in centroids: 131 | plt.scatter(centroids[c][0],centroids[c][1],color="k",marker="*",s=150) 132 | 133 | plt.show() 134 | ##The below commented code is for when radius is hardcoded: 135 | #import matplotlib.pyplot as plt 136 | #from matplotlib import style 137 | #style.use('ggplot') 138 | #import numpy as np 139 | # 140 | #X = np.array([[1, 2], 141 | # [1.5, 1.8], 142 | # [5, 8 ], 143 | # [8, 8], 144 | # [1, 0.6], 145 | # [9,11], 146 | # [8,2], 147 | # [10,2], 148 | # [9,3],]) 149 | # 150 | ###plt.scatter(X[:,0], X[:,1], s=150) 151 | ###plt.show() 152 | # 153 | #colors = 10*["g","r","c","b","k"] 154 | # 155 | #class Mean_Shift: 156 | # def __init__(self, radius=4): 157 | # self.radius = radius 158 | # 159 | # def fit(self, data): 160 | # centroids = {} 161 | # 162 | # for i in range(len(data)): 163 | # centroids[i] = data[i] 164 | # 165 | # while True: 166 | # new_centroids = [] 167 | # for i in centroids: 168 | # in_bandwidth = [] 169 | # centroid = centroids[i] 170 | # for featureset in data: 171 | # if np.linalg.norm(featureset-centroid) < self.radius: 172 | # in_bandwidth.append(featureset) 173 | # 
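# (Note on this commented-out hard-radius variant: every point that falls within
#  self.radius contributes equally to the averaged centroid below, whereas the
#  active class above weights points by how close their distance bin is to the centroid.)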
174 | # new_centroid = np.average(in_bandwidth,axis=0)
175 | # new_centroids.append(tuple(new_centroid))
176 | #
177 | # uniques = sorted(list(set(new_centroids)))
178 | #
179 | # prev_centroids = dict(centroids)
180 | #
181 | # centroids = {}
182 | # for i in range(len(uniques)):
183 | # centroids[i] = np.array(uniques[i])
184 | #
185 | # optimized = True
186 | #
187 | # for i in centroids:
188 | # if not np.array_equal(centroids[i], prev_centroids[i]):
189 | # optimized = False
190 | # if not optimized:
191 | # break
192 | #
193 | # if optimized:
194 | # break
195 | #
196 | # self.centroids = centroids
197 | #
198 | #
199 | #
200 | #clf = Mean_Shift()
201 | #clf.fit(X)
202 | #
203 | #centroids = clf.centroids
204 | #
205 | #plt.scatter(X[:,0], X[:,1], s=150)
206 | #
207 | #for c in centroids:
208 | # plt.scatter(centroids[c][0], centroids[c][1], color='k', marker='*', s=150)
209 | #
210 | #plt.show()
211 |
212 |
213 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Algorithms
2 | Implementations of the major supervised and unsupervised machine learning algorithms.
3 |
4 | These files show my journey of learning machine learning. They are not a final, polished project; rather, they show how I learned the concepts and then tried to apply them.
5 |
6 | The files include both scikit-learn implementations and my own from-scratch constructions of these classical machine learning algorithms.
7 |
8 | Almost all of the main ML algorithms are covered (KNN, SVM, regression, mean shift).
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/Regression part 10 and Part 11.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 3 14:15:33 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | #Linear Regression Model from scratch:
9 | from statistics import mean
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from matplotlib import style
13 | import pylab
14 | style.use("fivethirtyeight")
15 | xs=np.array([1,2,3,4,5,6],dtype=np.float64)
16 | ys=np.array([5,4,6,5,6,7],dtype=np.float64)
17 |
18 | def best_fit_slope_and_intercept(xs,ys):
19 | m=((mean(xs)*mean(ys)) - (mean(xs*ys)))/((mean(xs)**2)-mean(xs**2))
20 | b=mean(ys)-m*mean(xs)
21 | return m,b
22 | m,b=best_fit_slope_and_intercept(xs,ys)
23 | print(m,b)
24 |
25 |
26 | def squared_error(ys_orgin,ys_line):
27 | return sum((ys_line-ys_orgin)**2)
28 | def coefficient_of_determination(ys_orgin,ys_line):
29 | y_mean_line=[mean(ys_orgin) for y in ys_orgin]
30 | square_error_regr=squared_error(ys_orgin,ys_line)
31 | square_error_regr_y_mean=squared_error(ys_orgin,y_mean_line)
32 | return 1-(square_error_regr)/(square_error_regr_y_mean)
33 | regression_line=[(m*x)+b for x in xs]
34 |
35 |
36 | r_square=coefficient_of_determination(ys,regression_line)
37 | print(r_square)
38 | predict_x=8
39 | predict_y=(m*predict_x)+b
40 | print(regression_line)
41 | plt.scatter(xs,ys)
42 | plt.scatter(predict_x,predict_y,color="g")
43 | plt.plot(xs,regression_line)
44 |
--------------------------------------------------------------------------------
/SVM-PART 28,Kernel-part 29,30,31.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 18 18:00:48 2019
4 |
5 | @author: 
MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | import numpy as np 11 | style.use("ggplot") 12 | 13 | class Support_Vector_Machine: 14 | def __init__(self, visualization=True): 15 | self.visualization = visualization 16 | self.colors = {1:'r',-1:'b'} 17 | if self.visualization: 18 | self.fig = plt.figure() 19 | self.ax = self.fig.add_subplot(1,1,1) 20 | ###traing the data to find w and b 21 | def fit(self,data): 22 | self.data=data 23 | ####{||w||:[w,b]} a dictionary which store for every modulus value of w , a associate vector w and b 24 | opt_dict={} 25 | 26 | 27 | ##These transforms are what we use to apply to vector w 28 | ##each time we step in order to know ever possible direction of a vector w and its 29 | ##associate b value whose value is affected by direction and store the highest b value, as modulus of w doesn't account 30 | #for direction,remeber in vector direction matters 31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]] 32 | 33 | 34 | 35 | 36 | 37 | all_data=[] 38 | ###this three loop takes all features of the associated class yi and make, 39 | ##a new list of these features and than take the max and min value associated with the 40 | ### this new list of feature and these max and min values are to be used for further convex optimization 41 | for yi in self.data: 42 | for featureset in self.data[yi]: 43 | for feature in featureset: 44 | all_data.append(feature) 45 | self.max_feature_value=max(all_data) 46 | self.min_feature_value=min(all_data) 47 | all_data=None 48 | step_size=[self.max_feature_value*0.1, 49 | self.max_feature_value*0.01, 50 | ##POINT OF EXPENSE 51 | self.max_feature_value*0.001] 52 | ###extremely expensive-b does not need to take precise step 53 | b_range_multiple=5 54 | 55 | #we dont need to take as small of steps 56 | #wit b as we do w 57 | b_multiple=5 58 | ###the first value of w and remeber to simplify things,we assume each element of vector w 59 | ### to be same 60 | latest_optimum=self.max_feature_value*10 61 | 62 | for step in step_size: 63 | ####remeber to simplify things,we assume each element of vector w 64 | ### to be same 65 | w=np.array([latest_optimum,latest_optimum]) 66 | 67 | 68 | #we can do this because convex alogrithm 69 | optimized=False 70 | while not optimized: 71 | ####setting a range for b 72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple), 73 | self.max_feature_value*b_range_multiple, 74 | step*b_multiple): 75 | for transformation in transforms: 76 | ##applying the different transformation to account for difeerent direction(w_t) 77 | w_t=w*transformation 78 | found_option=True 79 | #weakest link in the SVM fundamentally 80 | #SMO attempt to fix this a bit 81 | ##Running the data on all points is costly-svm weakness 82 | ##yi(xi.w+b)>=1(constraint) 83 | for i in self.data: 84 | for xi in self.data[i]: 85 | yi=i 86 | ###this condition check even if one point in our data doesnt fit the constraint with the give w vector 87 | if not yi*(np.dot(w_t,xi)+b)>=1: 88 | found_option=False 89 | #if w satisfies the constraint 90 | if found_option: 91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b] 92 | if w[0]<0: 93 | optimized=True 94 | print("optimized a step.") 95 | else: 96 | w=w-step 97 | #taking the smallest modulus w and taking new starting new point 98 | norms=sorted([n for n in opt_dict]) 99 | opt_choice=opt_dict[norms[0]] 100 | self.w=opt_choice[0] 101 | self.b=opt_choice[1] 102 | latest_optimum=opt_choice[0][0]+step*2 103 | def predict(self,features): 104 | ###sign(x.w+b) whatever 
the sign of the equation is 105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b) 106 | if classification != 0 and self.visualization: 107 | self.ax.scatter(features[0],features[1],s=200,marker='*', c=self.colors[classification]) 108 | return classification 109 | def visualize(self): 110 | [[self.ax.scatter(x[0],x[1],s=100,color=self.colors[i]) for x in data_dict[i]] for i in data_dict] 111 | def hyperplane(x,w,b,v): 112 | ###v=x.w+b 113 | ###the hyperplane function shows the support vector plannes and boudrt decision so: 114 | ###positive support vector(psv)=1 115 | ###nsv=-1 116 | ###decision boundary=0,want to find a plane with these associated v values and show them 117 | #hyperplane v=x.w+b 118 | ##x,y is an unknown point on the hyperplane 119 | # x_v and w_v are the vector 120 | # x_v= [x,y] 121 | # x_v.w_v+b =1 for postive sv 122 | ## this helps to find the value of y where value of hyperplance is 1 123 | return (-w[0]*x-b+v)/w[1] 124 | datarange = (self.min_feature_value*0.9,self.max_feature_value*1.1) 125 | hyp_x_min = datarange[0] 126 | hyp_x_max = datarange[1] 127 | #(w.x+b)=1 128 | #positive support vector hyperplane 129 | psv1=hyperplane(hyp_x_min,self.w,self.b,1) 130 | ##psv1 is going to be scalar value not vector and its going to be y given specific x and v value 131 | psv2=hyperplane(hyp_x_max,self.w,self.b,1) 132 | #ploting the associate coordinate of psv2 and psv1 to visualize the hyperplane where v is one ,remeber hyper equation is for y such that v is one 133 | self.ax.plot([hyp_x_min,hyp_x_max],[psv1,psv2],"k") 134 | ##doing the same thing and process for a value of v=-1: 135 | nsv1=hyperplane(hyp_x_min,self.w,self.b,-1) 136 | nsv2=hyperplane(hyp_x_max,self.w,self.b,-1) 137 | self.ax.plot([hyp_x_min,hyp_x_max],[nsv1,nsv2],"k") 138 | ###doing the same thing and process for a value of v=0: 139 | db1=hyperplane(hyp_x_min,self.w,self.b,0) 140 | db2=hyperplane(hyp_x_max,self.w,self.b,0) 141 | self.ax.plot([hyp_x_min,hyp_x_max],[db1,db2],"y--") 142 | 143 | #show the result 144 | plt.show() 145 | 146 | 147 | 148 | 149 | data_dict={-1:np.array([[1,7], 150 | [2,8], 151 | [3,8]]), 152 | 1:np.array([[5,1],[6,-1],[7,3]])} 153 | 154 | 155 | ###trial 1:@19:13 hours\ 156 | svm=Support_Vector_Machine() 157 | svm.fit(data=data_dict) 158 | predict_us=[[0,10],[1,3],[3,4],[3,5],[5,5],[6,-5],[5,8]] 159 | for p in predict_us: 160 | svm.predict(p) 161 | svm.visualize() 162 | ############################################-SVM COMPLETED:::: 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /SVM-part 20,21,22.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 11 19:21:35 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import preprocessing,neighbors,svm 10 | from sklearn.model_selection import cross_validate,train_test_split 11 | import pandas as pd 12 | import pickle 13 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 14 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 15 | #lose rest of the data 16 | df.replace("?",-9999,inplace=True) 17 | 18 | ###check for any useless data and drop it 19 | df.drop(["id"],1,inplace=True) 20 | #### X are the features and y is the label 21 | X=np.array(df.drop(["class"],1)) 22 | print(X) 23 | y=np.array(df["class"]) 24 | 25 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 26 | #####Using the 
classifer 27 | clf=svm.SVC() 28 | clf.fit(X_train,y_train) 29 | ###Saving the classifer 30 | with open("K_model","wb") as f: 31 | pickle.dump(clf,f) 32 | #Remeber the difference between accuracy and confidecnce 33 | accuracy=clf.score(X_test,y_test) 34 | print(accuracy) 35 | ####make prediction 36 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]]) 37 | example_measures=np.array(predict_X) 38 | print(example_measures) 39 | ###To make the array shape that sklearn understands and matches the the X features 40 | predict=clf.predict(example_measures.reshape(len(example_measures),-1)) 41 | print(predict) 42 | 43 | 44 | 45 | ###########the part 23,24 was theory -------------------------------------------------------------------------------- /SVM-part 25,26,27.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 15 19:05:10 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | import numpy as np 11 | style.use("ggplot") 12 | 13 | class Support_Vector_Machine(object): 14 | def __init___(self,visulization=True): 15 | self.visulization=visulization 16 | self.colors={1:"r",-1:"b"} 17 | if self.visulization: 18 | self.fig=plt.figure() 19 | self.ax=self.fig.add_subplot(1,1,1) 20 | ###traing the data to find w and b 21 | def fit(self,data): 22 | self.data=data 23 | ####{||w||:[w,b]} a dictionary which store for every modulus value of w , a associate vector w and b 24 | opt_dict={} 25 | 26 | 27 | ##These transforms are what we use to apply to vector w 28 | ##each time we step in order to know ever possible direction of a vector w and its 29 | ##associate b value whose value is affected by direction and store the highest b value, as modulus of w doesn't account 30 | #for direction,remeber in vector direction matters 31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]] 32 | 33 | 34 | 35 | 36 | 37 | all_data=[] 38 | ###this three loop takes all features of the associated class yi and make, 39 | ##a new list of these features and than take the max and min value associated with the 40 | ### this new list of feature and these max and min values are to be used for further convex optimization 41 | for yi in self.data: 42 | for featureset in self.data[yi]: 43 | for feature in featureset: 44 | all_data.append(feature) 45 | self.max_feature_value=max(all_data) 46 | self.min_feature_value=min(all_data) 47 | all_data=None 48 | step_size=[self.max_feature_value*0.1, 49 | self.max_feature_value*0.01, 50 | ##POINT OF EXPENSE 51 | self.max_feature_value*0.001] 52 | ###extremely expensive-b does not need to take precise step 53 | b_range_multiple=5 54 | 55 | #we dont need to take as small of steps 56 | #wit b as we do w 57 | b_multiple=5 58 | ###the first value of w and remeber to simplify things,we assume each element of vector w 59 | ### to be same 60 | latest_optimum=self.max_feature_value*10 61 | 62 | for step in step_size: 63 | ####remeber to simplify things,we assume each element of vector w 64 | ### to be same 65 | w=np.array([latest_optimum,latest_optimum]) 66 | 67 | 68 | #we can do this because convex alogrithm 69 | optimized=False 70 | while not optimized: 71 | ####setting a range for b 72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple), 73 | self.max_feature_value*b_range_multiple, 74 | step*b_multiple): 75 | for transformation in transforms: 76 | ##applying the different transformation to account for difeerent direction(w_t) 77 | 
w_t=w*transformation 78 | found_option=True 79 | #weakest link in the SVM fundamentally 80 | #SMO attempt to fix this a bit 81 | ##Running the data on all points is costly-svm weakness 82 | ##yi(xi.w+b)>=1(constraint) 83 | for i in self.data: 84 | for xi in self.data[i]: 85 | yi=i 86 | ###this condition check even if one point in our data doesnt fit the constraint with the give w vector 87 | if not yi*(np.dot(w_t,xi)+b)>=1: 88 | found_option=False 89 | #if w satisfies the constraint 90 | if found_option: 91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b] 92 | if w[0]<0: 93 | optimized=True 94 | print("optimized a step.") 95 | else: 96 | w=w-step 97 | #taking the smallest modulus w and taking new starting new point 98 | norms=sorted([n for n in opt_dict]) 99 | opt_choice=opt_dict[norms[0]] 100 | self.w=opt_choice[0] 101 | self.b=opt_choice[1] 102 | latest_optimum=opt_choice[0][0]+step*2 103 | def predict(self,features): 104 | ###sign(x.w+b) whatever the sign of the equation is 105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b) 106 | 107 | return classification 108 | 109 | 110 | 111 | 112 | 113 | 114 | data_dict={-1:np.array([[1,7], 115 | [2,8], 116 | [3,8]]), 117 | 1:np.array([[5,1],[6,-1],[7,3]])} 118 | -------------------------------------------------------------------------------- /Tic-Tac-Toe AI.py: -------------------------------------------------------------------------------- 1 | 15# -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Feb 15 23:11:53 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | board=[" " for x in range(10)] 9 | 10 | 11 | def insertLetter(letter,pos): 12 | board[pos]=letter 13 | def spaceisFree(pos): 14 | return board[pos]==" " 15 | def printBoard(board): 16 | print(' | | ') 17 | print(""+board[1]+ " | "+ board[2]+ " | "+ board[3]) 18 | print(' | |') 19 | print("----------") 20 | print(' | | ') 21 | print(""+board[4]+ " | "+ board[5]+ " | "+ board[6]) 22 | print(' | |') 23 | print("----------") 24 | print(' | |') 25 | print(""+board[7]+ " | "+ board[8]+ " | "+ board[9]) 26 | print(' | |') 27 | def isWinner(bo,le):##sorry for the long line 28 | return (bo[7]==le and bo[8]==le and bo[9]==le) or (bo[4]==le and bo[5]==le and bo[6]==le) or (bo[1]==le and bo[2]==le and bo[3]==le) or (bo[1]==le and bo[4]==le and bo[7]==le) or (bo[2]==le and bo[5]==le and bo[8]==le) or (bo[3]==le and bo[6]==le and bo[9]==le) or (bo[1]==le and bo[5]==le and bo[9]==le) or (bo[3]==le and bo[5]==le and bo[7]==le) 29 | def playerMove(): 30 | run=True 31 | while run: 32 | move=input("please select a position to place an X (1-9): ") 33 | try: 34 | move=int(move) 35 | if move>0 and move<10: 36 | if spaceisFree(move): 37 | run=False 38 | insertLetter("X",move) 39 | else: 40 | print("This space is occupied") 41 | else: 42 | print("Type a number witin the range") 43 | except: 44 | print("Type a number") 45 | 46 | def compMove(): 47 | possibleMoves=[x for x,letter in enumerate(board) if letter==" " and x!=0] 48 | move=0 49 | 50 | for let in ["O","X"]: 51 | for i in possibleMoves: 52 | boardCopy=board[:] 53 | boardCopy[i]=let 54 | if isWinner(boardCopy,let): 55 | move=i 56 | return move 57 | 58 | cornersOpen=[] 59 | for i in possibleMoves: 60 | if i in [1,3,7,9]: 61 | cornersOpen.append(i) 62 | if len(cornersOpen)>0: 63 | move=selectRandom(cornersOpen) 64 | return move 65 | 66 | if 5 in possibleMoves: 67 | move=5 68 | return move 69 | edgesOpen=[] 70 | for i in possibleMoves: 71 | if i in [2,4,6,8]: 72 | edgesOpen.append(i) 73 | if len(edgesOpen)>0: 74 | move=selectRandom(edgesOpen) 75 | 
return move 76 | 77 | def selectRandom(li): 78 | import random 79 | In =len(li) 80 | r=random.randrange(0,In) 81 | return li[r] 82 | def isBoardFull(): 83 | if board.count(" ")>1: 84 | return False 85 | else: 86 | return True 87 | def main(): 88 | print("Welcome to Tic Tac Toe") 89 | printBoard(board) 90 | 91 | while not (isBoardFull()): 92 | if not isWinner(board,'O'): 93 | playerMove() 94 | printBoard(board) 95 | else: 96 | print("Sorry AI win the game") 97 | break 98 | if not isWinner(board,'X'): 99 | move=compMove() 100 | if move==0: 101 | print("Tie Game!") 102 | else: 103 | insertLetter("O",move) 104 | print("computer placed an O in position ,move") 105 | printBoard(board) 106 | else: 107 | print("Yeap You ARE smarter than my AI") 108 | break 109 | if isBoardFull(): 110 | print("Tie Game") 111 | main() -------------------------------------------------------------------------------- /cluster-part34,35,36,37.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 25 15:06:06 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | style.use("ggplot") 11 | import numpy as np 12 | from sklearn.cluster import KMeans 13 | import pandas as pd 14 | from sklearn import preprocessing 15 | from pandas.api.types import is_numeric_dtype 16 | 17 | #####basic visulisation of k-means 18 | X=np.array([[1,2], 19 | [1.5,1.8], 20 | [5,8], 21 | [8,8], 22 | [1,0.6], 23 | [9,11]]) 24 | plt.scatter(X[:,0],X[:,1],s=150) 25 | 26 | 27 | 28 | clf=KMeans(n_clusters=2) 29 | clf.fit(X) 30 | centroids=clf.cluster_centers_ 31 | labels=clf.labels_ 32 | colors=["g.","r.","c.","b.","k.","o."] 33 | print(centroids) 34 | print(labels) 35 | for i in range(len(X)): 36 | plt.plot(X[i][0],X[i][1],colors[labels[i]],markersize=25) 37 | plt.scatter(centroids[:,0],centroids[:,1],marker="x",s=150) 38 | plt.show() 39 | ################################################################## 40 | ###Analysing Titanic dataset through K-means 41 | df=pd.read_excel("titanic.xls") 42 | df.drop(["body","name"],1,inplace=True) 43 | df.apply(pd.to_numeric, errors='ignore') 44 | df.fillna(0,inplace=True) 45 | c=df["age"].values.tolist() 46 | ###in order to convert text data to useable numeric data 47 | def handle_numeric_data(df): 48 | columns=df.columns.values 49 | # dtypes=dict(df.dtypes) 50 | for column in columns: 51 | text_digit_vals={} 52 | # dtype=dtypes[column] 53 | def convert_to_int(val): 54 | return text_digit_vals[val] 55 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 56 | columns_contents=df[column].values.tolist() 57 | 58 | unique_element=set(columns_contents) 59 | x=0 60 | for unique in unique_element: 61 | if unique not in text_digit_vals: 62 | text_digit_vals[unique]=x 63 | x+=1 64 | 65 | df[column]=list(map(convert_to_int,df[column])) 66 | return df 67 | df=handle_numeric_data(df) 68 | #################################################################### 69 | 70 | #df.drop(["sex","boat"],1,inplace=True) 71 | X=np.array(df.drop(["survived"],1),dtype=float) 72 | X=preprocessing.scale(X) 73 | y=np.array(df["survived"]) 74 | 75 | 76 | clf=KMeans(n_clusters=2) 77 | clf.fit(X) 78 | 79 | ########In unsupervised learning we do not have labels so we are going to use only X to fit the data 80 | ########then KMeans would label the data into two groups and to check accuracy of that labelling we will see 81 | #######what was the prediction of the KMeans and compare with 
binary representation of our y. 82 | correct=0 83 | labels=clf.labels_###use this 84 | print(labels) 85 | for i in range(len(X)): 86 | predict_me=np.array(X[i],dtype=float) 87 | predict_me=predict_me.reshape(-1,len(predict_me)) 88 | prediction=clf.predict(predict_me)###what does the label given by classifer 89 | if prediction[0]==y[i]:####did it label correctly 90 | correct+=1 91 | print(correct/len(X)) 92 | 93 | -------------------------------------------------------------------------------- /k means...nearest neighbours-part 13,14,15.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 5 13:45:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn import preprocessing,neighbors 9 | from sklearn.model_selection import cross_validate,train_test_split 10 | import pandas as pd 11 | import pickle 12 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 13 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 14 | #lose rest of the data 15 | df.replace("?",-9999,inplace=True) 16 | 17 | ###check for any useless data and drop it 18 | df.drop(["id"],1,inplace=True) 19 | #### X are the features and y is the label 20 | X=np.array(df.drop(["class"],1)) 21 | print(X) 22 | y=np.array(df["class"]) 23 | 24 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 25 | #####Using the classifer 26 | clf=neighbors.KNeighborsClassifier() 27 | clf.fit(X_train,y_train) 28 | ###Saving the classifer 29 | with open("K_model","wb") as f: 30 | pickle.dump(clf,f) 31 | #Remeber the difference between accuracy and confidecnce 32 | accuracy=clf.score(X_test,y_test) 33 | print(accuracy) 34 | ####make prediction 35 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]]) 36 | example_measures=np.array(predict_X) 37 | print(example_measures) 38 | ###To make the array shape that sklearn understands and matches the the X features 39 | predict=clf.predict(example_measures.reshape(len(example_measures),-1)) 40 | print(predict) 41 | 42 | 43 | 44 | 45 | 46 | 47 | ############################### 48 | #K-model from scratch: 49 | from math import sqrt 50 | #### Euclidean_Distance Basic Formula is: 51 | plot1=[1,3] 52 | plot2=[2,5] 53 | euclidean_distance=sqrt(((plot1[0]-plot2[0])**2)+((plot1[1]-plot2[1])**2)) 54 | print(euclidean_distance) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /k means...nearest neighbours-part 16,17,18,19.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 16:43:39 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | #K-model from scratch: 9 | from math import sqrt 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from matplotlib import style 13 | from collections import Counter 14 | import warnings 15 | import random 16 | import pandas as pd 17 | ###creating a dataset with labels and features 18 | dataset={"k":[[1,2],[2,3],[3,1]],"r":[[6,5],[7,7],[8,6]]} 19 | ##feature to be classified 20 | new_feature=[5,7] 21 | ####Looping over to scatter the plot 22 | #for i in dataset: 23 | # for ii in dataset[i]: 24 | # [plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] 25 | ###More pythontic way 26 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] 27 | #plt.scatter(new_feature[0],new_feature[1]) 28 | #plt.show() 29 | ###k alogorithm 30 | def 
k_nearest_neighbours(data,predict,k=3): 31 | if len(data) >=k: 32 | warnings.warn("K is set to a value less than total voting groups!") 33 | distances=[] 34 | for group in data: 35 | for features in data[group]: 36 | # euclidean_distance=np.sqrt(np.sum((np.array(features)-np.array(predict))**2)) or better: 37 | ###numpy fomula 38 | euclidean_distance=np.linalg.norm(np.array(features)-np.array(predict)) 39 | #####Making the euclidean distance list to sort later 40 | distances.append([euclidean_distance,group]) 41 | ###calculating votes to help us classify-lowest distance 42 | votes=[i[1] for i in sorted(distances)[:k]] 43 | votes_result=Counter(votes).most_common(1)[0][0] 44 | ###confidence measure how confident our classifer is about one single point in labelling that point-that is what porppotion of votes were infavour 45 | confidence=Counter(votes).most_common(1)[0][1]/k 46 | return votes_result,confidence 47 | 48 | #result=k_nearest_neighbours(dataset,new_feature,k=3) 49 | #print(result) 50 | ###showing the result,color is already k an r as variables. 51 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] 52 | #plt.scatter(new_feature[0],new_feature[1],color=result) 53 | #plt.show() 54 | ######################Comparing our model vs sklearn model 55 | #df=pd.read_csv("breast-cancer-wisconsin.data.txt") 56 | #df.replace("?",-9999,inplace=True) 57 | #df.drop(["id"],1,inplace=True) 58 | #full_data=df.astype(float).values.tolist() 59 | ######shuffling the inner lists of full_data 60 | #random.shuffle(full_data) 61 | # 62 | #########dividing the full data into train data and test data 63 | #test_size=0.4 64 | #train_set= {2:[],4:[]} 65 | #test_set={2:[],4:[]} 66 | #train_data=full_data[:-int(test_size*len(full_data))] 67 | #test_data=full_data[-int(test_size*len(full_data)):] 68 | # 69 | #for i in train_data: 70 | # ####associating the datas to classifiers in this case 2 or 4 71 | # train_set[i[-1]].append(i[:-1]) 72 | #for i in test_data: 73 | # ####associating the datas to classifiers in this case 2 or 4 74 | # test_set[i[-1]].append(i[:-1]) 75 | #correct=0 76 | #total=0 77 | #for group in test_set: 78 | # for data in test_set[group]: 79 | # vote,confidence=k_nearest_neighbours(train_set,data,k=5) 80 | # if group == vote: 81 | # correct+=1 82 | # else: 83 | # print(confidence) 84 | # total+=1 85 | # 86 | #print("Accuracy: ",correct/total) 87 | #########################copying the whole algorith down again to judge the accuracy in a numbe of trials: 88 | accuracies=[] 89 | n=25 90 | for i in range(n): 91 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 92 | df.replace("?",-9999,inplace=True) 93 | df.drop(["id"],1,inplace=True) 94 | full_data=df.astype(float).values.tolist() 95 | #####shuffling the inner lists of full_data 96 | random.shuffle(full_data) 97 | 98 | ########dividing the full data into train data and test data 99 | test_size=0.4 100 | train_set= {2:[],4:[]} 101 | test_set={2:[],4:[]} 102 | train_data=full_data[:-int(test_size*len(full_data))] 103 | test_data=full_data[-int(test_size*len(full_data)):] 104 | 105 | for i in train_data: 106 | ####associating the datas to classifiers in this case 2 or 4 107 | train_set[i[-1]].append(i[:-1]) 108 | for i in test_data: 109 | ####associating the datas to classifiers in this case 2 or 4 110 | test_set[i[-1]].append(i[:-1]) 111 | correct=0 112 | total=0 113 | for group in test_set: 114 | for data in test_set[group]: 115 | vote,confidence=k_nearest_neighbours(train_set,data,k=5) 116 | if group == vote: 117 
| correct+=1 118 | total+=1 119 | accuracies.append(correct/total) 120 | print("overall_accuracy(our algorithm) for", n ," steps = ", sum(accuracies)/len(accuracies)) 121 | ##############finally getting the sklearn algorithm and comparing it with our overall accuracy for a specific number of steps: 122 | accuracies_2=[] 123 | for i in range(n): 124 | from sklearn.model_selection import train_test_split 125 | from sklearn import neighbors 126 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 127 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 128 | #lose rest of the data 129 | df.replace("?",-9999,inplace=True) 130 | ###check for any useless data and drop it 131 | df.drop(["id"],1,inplace=True) 132 | #### X are the features and y is the label 133 | X=np.array(df.drop(["class"],1)) 134 | y=np.array(df["class"]) 135 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 136 | #####Using the classifer 137 | clf=neighbors.KNeighborsClassifier() 138 | clf.fit(X_train,y_train) 139 | #Remeber the difference between accuracy and confidecnce 140 | accuracy=clf.score(X_test,y_test) 141 | accuracies_2.append(accuracy) 142 | print("overall_accuracy(sk-learn alogorithm) for", n ," steps = ", sum(accuracies_2)/len(accuracies_2)) 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /k-means from scratch-part 37,38.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 26 17:11:21 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | ########################################## 9 | ###########-k-mean from scratch 10 | import matplotlib.pyplot as plt 11 | from matplotlib import style 12 | style.use("ggplot") 13 | import numpy as np 14 | from sklearn.cluster import KMeans 15 | import pandas as pd 16 | from sklearn import preprocessing 17 | from pandas.api.types import is_numeric_dtype 18 | 19 | #####basic visulisation 20 | #X=np.array([[1,2], 21 | # [1.5,1.8], 22 | # [5,8], 23 | # [8,8], 24 | # [1,0.6], 25 | # [9,11]]) 26 | 27 | #plt.scatter(X[:,0],X[:,1],s=150) 28 | 29 | colors=["g","r","c","b","k"] 30 | 31 | class K_Means: 32 | def __init__(self, k=2, tol=0.001, max_iter=300): 33 | self.k = k 34 | self.tol = tol 35 | self.max_iter = max_iter 36 | 37 | def fit(self,data): 38 | 39 | self.centroids = {} 40 | 41 | for i in range(self.k): 42 | self.centroids[i] = data[i] 43 | 44 | for i in range(self.max_iter): 45 | self.classifications = {} 46 | 47 | for i in range(self.k): 48 | self.classifications[i] = [] 49 | 50 | for featureset in data: 51 | distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids] 52 | classification = distances.index(min(distances)) 53 | self.classifications[classification].append(featureset) 54 | 55 | prev_centroids = dict(self.centroids) 56 | 57 | for classification in self.classifications: 58 | self.centroids[classification] = np.average(self.classifications[classification],axis=0) 59 | 60 | optimized = True 61 | 62 | for c in self.centroids: 63 | original_centroid = prev_centroids[c] 64 | current_centroid = self.centroids[c] 65 | if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol: 66 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0)) 67 | optimized = False 68 | 69 | if optimized: 70 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0)) 71 | break 72 | 73 | def predict(self,data): 74 
| distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids] 75 | classification = distances.index(min(distances)) 76 | return classification 77 | ###This code for basic visulisation of K_mean in 2d. 78 | #clf = K_Means() 79 | #clf.fit(X) 80 | 81 | #for centroid in clf.centroids: 82 | # plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1], 83 | # marker="o", color="k", s=150, linewidths=5) 84 | # 85 | #for classification in clf.classifications: 86 | # color = colors[classification] 87 | # for featureset in clf.classifications[classification]: 88 | # plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5) 89 | 90 | 91 | #unknowns=np.array([[1,3], 92 | # [8,9], 93 | # [0,3], 94 | # [5,4], 95 | # [6,4]]) 96 | #for unknown in unknowns: 97 | # classification=clf.predict(unknown) 98 | # plt.scatter(unknown[0],unknown[1],marker="*",color=colors[classification],s=150,linewidths=5) 99 | 100 | ###below is to compare our alg with sklearn 101 | # https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls 102 | df = pd.read_excel('titanic.xls') 103 | df.drop(['body','name'], 1, inplace=True) 104 | #df.convert_objects(convert_numeric=True) 105 | print(df.head()) 106 | df.fillna(0,inplace=True) 107 | 108 | def handle_non_numerical_data(df): 109 | 110 | # handling non-numerical data: must convert. 111 | columns = df.columns.values 112 | 113 | for column in columns: 114 | text_digit_vals = {} 115 | def convert_to_int(val): 116 | return text_digit_vals[val] 117 | 118 | #print(column,df[column].dtype) 119 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 120 | 121 | column_contents = df[column].values.tolist() 122 | #finding just the uniques 123 | unique_elements = set(column_contents) 124 | # great, found them. 125 | x = 0 126 | for unique in unique_elements: 127 | if unique not in text_digit_vals: 128 | # creating dict that contains new 129 | # id per unique string 130 | text_digit_vals[unique] = x 131 | x+=1 132 | # now we map the new "id" vlaue 133 | # to replace the string. 134 | df[column] = list(map(convert_to_int,df[column])) 135 | 136 | return df 137 | 138 | df = handle_non_numerical_data(df) 139 | print(df.head()) 140 | 141 | # add/remove features just to see impact they have. 
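# Note on the evaluation loop further below: K_Means assigns its two cluster ids
# (0 and 1) arbitrarily, so the printed score can come out as either p or 1-p.
# A minimal, hypothetical fix is to report the permutation-invariant value:
#   score = correct/len(X)
#   print(max(score, 1 - score))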
142 | df.drop(['ticket','home.dest'], 1, inplace=True) 143 | 144 | 145 | X = np.array(df.drop(['survived'], 1).astype(float)) 146 | X = preprocessing.scale(X) 147 | y = np.array(df['survived']) 148 | 149 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 150 | 151 | clf = K_Means()##change this 152 | clf.fit(X) 153 | 154 | correct = 0 155 | for i in range(len(X)): 156 | 157 | predict_me = np.array(X[i].astype(float)) 158 | predict_me = predict_me.reshape(-1, len(predict_me)) 159 | prediction = clf.predict(predict_me) 160 | if prediction == y[i]: 161 | correct += 1 162 | 163 | 164 | print(correct/len(X)) 165 | 166 | 167 | -------------------------------------------------------------------------------- /kernel,cvxopt-part 32.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 23 12:16:33 2019 4 | 5 | @author: Not my work 6 | """ 7 | 8 | # Mathieu Blondel, September 2010 9 | # License: BSD 3 clause 10 | # http://www.mblondel.org/journal/2010/09/19/support-vector-machines-in-python/ 11 | 12 | # visualizing what translating to another dimension does 13 | # and bringing back to 2D: 14 | # https://www.youtube.com/watch?v=3liCbRZPrZA 15 | 16 | # Docs: http://cvxopt.org/userguide/coneprog.html#quadratic-programming 17 | # Docs qp example: http://cvxopt.org/examples/tutorial/qp.html 18 | 19 | # Nice tutorial: 20 | # https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf 21 | 22 | 23 | import numpy as np 24 | from numpy import linalg 25 | import cvxopt 26 | import cvxopt.solvers 27 | 28 | def linear_kernel(x1, x2): 29 | return np.dot(x1, x2) 30 | 31 | def polynomial_kernel(x, y, p=3): 32 | return (1 + np.dot(x, y)) ** p 33 | 34 | def gaussian_kernel(x, y, sigma=5.0): 35 | return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2))) 36 | 37 | class SVM(object): 38 | 39 | def __init__(self, kernel=linear_kernel, C=None): 40 | self.kernel = kernel 41 | self.C = C 42 | if self.C is not None: self.C = float(self.C) 43 | 44 | def fit(self, X, y): 45 | n_samples, n_features = X.shape 46 | 47 | # Gram matrix 48 | K = np.zeros((n_samples, n_samples)) 49 | for i in range(n_samples): 50 | for j in range(n_samples): 51 | K[i,j] = self.kernel(X[i], X[j]) 52 | 53 | P = cvxopt.matrix(np.outer(y,y) * K) 54 | q = cvxopt.matrix(np.ones(n_samples) * -1) 55 | A = cvxopt.matrix(y, (1,n_samples)) 56 | b = cvxopt.matrix(0.0) 57 | 58 | if self.C is None: 59 | G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1)) 60 | h = cvxopt.matrix(np.zeros(n_samples)) 61 | else: 62 | tmp1 = np.diag(np.ones(n_samples) * -1) 63 | tmp2 = np.identity(n_samples) 64 | G = cvxopt.matrix(np.vstack((tmp1, tmp2))) 65 | tmp1 = np.zeros(n_samples) 66 | tmp2 = np.ones(n_samples) * self.C 67 | h = cvxopt.matrix(np.hstack((tmp1, tmp2))) 68 | 69 | # solve QP problem 70 | solution = cvxopt.solvers.qp(P, q, G, h, A, b) 71 | 72 | # Lagrange multipliers 73 | a = np.ravel(solution['x']) 74 | 75 | # Support vectors have non zero lagrange multipliers 76 | sv = a > 1e-5 77 | ind = np.arange(len(a))[sv] 78 | self.a = a[sv] 79 | self.sv = X[sv] 80 | self.sv_y = y[sv] 81 | print("%d support vectors out of %d points" % (len(self.a), n_samples)) 82 | 83 | # Intercept 84 | self.b = 0 85 | for n in range(len(self.a)): 86 | self.b += self.sv_y[n] 87 | self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv]) 88 | self.b /= len(self.a) 89 | 90 | # Weight vector 91 | if self.kernel == linear_kernel: 92 | self.w = np.zeros(n_features) 93 | 
for n in range(len(self.a)): 94 | self.w += self.a[n] * self.sv_y[n] * self.sv[n] 95 | else: 96 | self.w = None 97 | 98 | def project(self, X): 99 | if self.w is not None: 100 | return np.dot(X, self.w) + self.b 101 | else: 102 | y_predict = np.zeros(len(X)) 103 | for i in range(len(X)): 104 | s = 0 105 | for a, sv_y, sv in zip(self.a, self.sv_y, self.sv): 106 | s += a * sv_y * self.kernel(X[i], sv) 107 | y_predict[i] = s 108 | return y_predict + self.b 109 | 110 | def predict(self, X): 111 | return np.sign(self.project(X)) 112 | 113 | if __name__ == "__main__": 114 | import pylab as pl 115 | 116 | def gen_lin_separable_data(): 117 | # generate training data in the 2-d case 118 | mean1 = np.array([0, 2]) 119 | mean2 = np.array([2, 0]) 120 | cov = np.array([[0.8, 0.6], [0.6, 0.8]]) 121 | X1 = np.random.multivariate_normal(mean1, cov, 100) 122 | y1 = np.ones(len(X1)) 123 | X2 = np.random.multivariate_normal(mean2, cov, 100) 124 | y2 = np.ones(len(X2)) * -1 125 | return X1, y1, X2, y2 126 | 127 | def gen_non_lin_separable_data(): 128 | mean1 = [-1, 2] 129 | mean2 = [1, -1] 130 | mean3 = [4, -4] 131 | mean4 = [-4, 4] 132 | cov = [[1.0,0.8], [0.8, 1.0]] 133 | X1 = np.random.multivariate_normal(mean1, cov, 50) 134 | X1 = np.vstack((X1, np.random.multivariate_normal(mean3, cov, 50))) 135 | y1 = np.ones(len(X1)) 136 | X2 = np.random.multivariate_normal(mean2, cov, 50) 137 | X2 = np.vstack((X2, np.random.multivariate_normal(mean4, cov, 50))) 138 | y2 = np.ones(len(X2)) * -1 139 | return X1, y1, X2, y2 140 | 141 | def gen_lin_separable_overlap_data(): 142 | # generate training data in the 2-d case 143 | mean1 = np.array([0, 2]) 144 | mean2 = np.array([2, 0]) 145 | cov = np.array([[1.5, 1.0], [1.0, 1.5]]) 146 | X1 = np.random.multivariate_normal(mean1, cov, 100) 147 | y1 = np.ones(len(X1)) 148 | X2 = np.random.multivariate_normal(mean2, cov, 100) 149 | y2 = np.ones(len(X2)) * -1 150 | return X1, y1, X2, y2 151 | 152 | def split_train(X1, y1, X2, y2): 153 | X1_train = X1[:90] 154 | y1_train = y1[:90] 155 | X2_train = X2[:90] 156 | y2_train = y2[:90] 157 | X_train = np.vstack((X1_train, X2_train)) 158 | y_train = np.hstack((y1_train, y2_train)) 159 | return X_train, y_train 160 | 161 | def split_test(X1, y1, X2, y2): 162 | X1_test = X1[90:] 163 | y1_test = y1[90:] 164 | X2_test = X2[90:] 165 | y2_test = y2[90:] 166 | X_test = np.vstack((X1_test, X2_test)) 167 | y_test = np.hstack((y1_test, y2_test)) 168 | return X_test, y_test 169 | 170 | def plot_margin(X1_train, X2_train, clf): 171 | def f(x, w, b, c=0): 172 | # given x, return y such that [x,y] in on the line 173 | # w.x + b = c 174 | return (-w[0] * x - b + c) / w[1] 175 | 176 | pl.plot(X1_train[:,0], X1_train[:,1], "ro") 177 | pl.plot(X2_train[:,0], X2_train[:,1], "bo") 178 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g") 179 | 180 | # w.x + b = 0 181 | a0 = -4; a1 = f(a0, clf.w, clf.b) 182 | b0 = 4; b1 = f(b0, clf.w, clf.b) 183 | pl.plot([a0,b0], [a1,b1], "k") 184 | 185 | # w.x + b = 1 186 | a0 = -4; a1 = f(a0, clf.w, clf.b, 1) 187 | b0 = 4; b1 = f(b0, clf.w, clf.b, 1) 188 | pl.plot([a0,b0], [a1,b1], "k--") 189 | 190 | # w.x + b = -1 191 | a0 = -4; a1 = f(a0, clf.w, clf.b, -1) 192 | b0 = 4; b1 = f(b0, clf.w, clf.b, -1) 193 | pl.plot([a0,b0], [a1,b1], "k--") 194 | 195 | pl.axis("tight") 196 | pl.show() 197 | 198 | def plot_contour(X1_train, X2_train, clf): 199 | pl.plot(X1_train[:,0], X1_train[:,1], "ro") 200 | pl.plot(X2_train[:,0], X2_train[:,1], "bo") 201 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g") 202 | 203 | X1, X2 = 
np.meshgrid(np.linspace(-6,6,50), np.linspace(-6,6,50))
204 | X = np.array([[x1, x2] for x1, x2 in zip(np.ravel(X1), np.ravel(X2))])
205 | Z = clf.project(X).reshape(X1.shape)
206 | pl.contour(X1, X2, Z, [0.0], colors='k', linewidths=1, origin='lower')
207 | pl.contour(X1, X2, Z + 1, [0.0], colors='grey', linewidths=1, origin='lower')
208 | pl.contour(X1, X2, Z - 1, [0.0], colors='grey', linewidths=1, origin='lower')
209 |
210 | pl.axis("tight")
211 | pl.show()
212 |
213 | def test_linear():
214 | X1, y1, X2, y2 = gen_lin_separable_data()
215 | X_train, y_train = split_train(X1, y1, X2, y2)
216 | X_test, y_test = split_test(X1, y1, X2, y2)
217 |
218 | clf = SVM()
219 | clf.fit(X_train, y_train)
220 |
221 | y_predict = clf.predict(X_test)
222 | correct = np.sum(y_predict == y_test)
223 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
224 |
225 | plot_margin(X_train[y_train==1], X_train[y_train==-1], clf)
226 |
227 | def test_non_linear():
228 | X1, y1, X2, y2 = gen_non_lin_separable_data()
229 | X_train, y_train = split_train(X1, y1, X2, y2)
230 | X_test, y_test = split_test(X1, y1, X2, y2)
231 |
232 | clf = SVM(polynomial_kernel)
233 | clf.fit(X_train, y_train)
234 |
235 | y_predict = clf.predict(X_test)
236 | correct = np.sum(y_predict == y_test)
237 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
238 |
239 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
240 |
241 | def test_soft():
242 | X1, y1, X2, y2 = gen_lin_separable_overlap_data()
243 | X_train, y_train = split_train(X1, y1, X2, y2)
244 | X_test, y_test = split_test(X1, y1, X2, y2)
245 |
246 | clf = SVM(C=1000.1)
247 | clf.fit(X_train, y_train)
248 |
249 | y_predict = clf.predict(X_test)
250 | correct = np.sum(y_predict == y_test)
251 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
252 |
253 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
254 |
255 |
256 | test_linear()
257 | #test_non_linear()
258 | # test_soft()
259 | ################################-the above example is a detailed application of convex optimization (quadratic programming) to solving an SVM
260 | ##below is code for a simple demonstration of how QP and convex optimization work:
261 | # Import the necessary packages
262 | import numpy
263 | from cvxopt import matrix
264 | from cvxopt import solvers
265 | ####Remember the general form of a QP in cvxopt: minimize (1/2)x^T*P*x + q^T*x, subject to Gx
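##A minimal standalone sketch of cvxopt.solvers.qp on a made-up toy problem (illustration only, not the original example this section was introducing):
import numpy as np
from cvxopt import matrix, solvers

# minimize (1/2) x^T P x + q^T x   subject to   G x <= h
# toy problem: minimize x1^2 + x2^2 - 4*x1 - 6*x2   subject to   x1 >= 0, x2 >= 0
P = matrix(np.array([[2.0, 0.0], [0.0, 2.0]]))    # quadratic term (positive semidefinite)
q = matrix(np.array([-4.0, -6.0]))                # linear term
G = matrix(np.array([[-1.0, 0.0], [0.0, -1.0]]))  # -x <= 0 encodes x >= 0
h = matrix(np.array([0.0, 0.0]))

sol = solvers.qp(P, q, G, h)                      # no equality constraints in this toy problem
print(np.array(sol['x']))                         # optimum should be close to [2, 3]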