├── Mean Shift-part 39,40,41,42.py
├── Mean Shift-part 39,40.py
├── Mean Shift-part 41,42.py
├── README.md
├── Regression part 10 and Part 11.py
├── SVM-PART 28,Kernel-part 29,30,31.py
├── SVM-part 20,21,22.py
├── SVM-part 25,26,27.py
├── Tic-Tac-Toe AI.py
├── cluster-part34,35,36,37.py
├── k means...nearest neighbours-part 13,14,15.py
├── k means...nearest neighbours-part 16,17,18,19.py
├── k-means from scratch-part 37,38.py
├── kernel,cvxopt-part 32.py
├── regress ion part 4 and part 5.py
├── regression part 12.py
├── regression part 6,7,8,9.py
├── regression-part 1 and 2.py
└── svm(final)-sklearn-part 33.py
/Mean Shift-part 39,40,41,42.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 27 19:01:02 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 | import numpy as np
8 | from sklearn.cluster import MeanShift
9 | from sklearn.datasets import make_blobs
10 | import matplotlib.pyplot as plt
11 | from mpl_toolkits.mplot3d import Axes3D
12 | from matplotlib import style
13 | style.use("ggplot")
14 | import pandas as pd
15 | from sklearn import preprocessing
16 | from pandas.api.types import is_numeric_dtype
17 | ############Basic visualisation of Mean Shift
18 | #centers = [[1,1,1],[5,5,5],[3,10,10]]
19 | #
20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5)
21 | #
22 | #ms = MeanShift()
23 | #ms.fit(X)
24 | #labels = ms.labels_
25 | #cluster_centers = ms.cluster_centers_
26 | #
27 | #print(cluster_centers)
28 | #n_clusters_ = len(np.unique(labels))
29 | #print("Number of estimated clusters:", n_clusters_)
30 | #
31 | #colors = 10*['r','g','b','c','k','y','m']
32 | #fig = plt.figure()
33 | #ax = fig.add_subplot(111, projection='3d')
34 | #
35 | #for i in range(len(X)):
36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
37 | #
38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2],
39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10)
40 |
41 | ######################################################################
42 | # -*- coding: utf-8 -*-
43 | #############Mean Shift on Titanic Dataset
44 | df = pd.read_excel('titanic.xls')
45 | orginal_df=pd.DataFrame.copy(df)
46 |
47 |
48 | df.drop(['body','name'], 1, inplace=True)
49 | #df.convert_objects(convert_numeric=True)
50 | print(df.head())
51 | df.fillna(0,inplace=True)
52 |
53 | def handle_non_numerical_data(df):
54 |
55 | # handling non-numerical data: must convert.
56 | columns = df.columns.values
57 |
58 | for column in columns:
59 | text_digit_vals = {}
60 | def convert_to_int(val):
61 | return text_digit_vals[val]
62 |
63 | #print(column,df[column].dtype)
64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64:
65 |
66 | column_contents = df[column].values.tolist()
67 | #finding just the uniques
68 | unique_elements = set(column_contents)
69 | # great, found them.
70 | x = 0
71 | for unique in unique_elements:
72 | if unique not in text_digit_vals:
73 | # creating dict that contains new
74 | # id per unique string
75 | text_digit_vals[unique] = x
76 | x+=1
77 |                 # now we map the new "id" value
78 | # to replace the string.
79 | df[column] = list(map(convert_to_int,df[column]))
80 |
81 | return df
82 |
83 | df = handle_non_numerical_data(df)
84 | print(df.head())
85 |
86 | # add/remove features just to see impact they have.
87 | df.drop(['ticket','home.dest'], 1, inplace=True)
88 |
89 |
90 | X = np.array(df.drop(['survived'], 1).astype(float))
91 | X = preprocessing.scale(X)
92 | y = np.array(df['survived'])
93 |
94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5)
95 |
96 | clf = MeanShift()##change this
97 | clf.fit(X)
98 |
99 |
100 | labels=clf.labels_
101 | cluster_centers=clf.cluster_centers_
102 |
103 | orginal_df["cluster_group"]=np.nan
104 | ##########in order to check survival for each individual cluster formed by MeanShift and judge the quality of the clusters formed
105 | for i in range(len(X)):
106 | orginal_df["cluster_group"].iloc[i]=labels[i]
107 | n_clusters_=len(np.unique(labels))
108 | survival_rates={}###to see survival rate for different classes
109 | for i in range(n_clusters_):
110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster
111 | # print(temp_df)
112 | survival_cluster=temp_df[(temp_df["survived"]==1)]
113 | survival_rate=len(survival_cluster)/len(temp_df)
114 | survival_rates[i]=survival_rate
115 | #print(orginal_df[(orginal_df["cluster_group"]==2)])
116 | ###Now you can use df.describe() to analyse the data for different classes
117 |
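118 | ###Added sketch (an assumption, not part of the original tutorial): print the survival rate per
119 | ###cluster and use describe() on each cluster group, as the comment above suggests.
120 | print(survival_rates)
121 | for i in range(n_clusters_):
122 |     cluster_df = orginal_df[(orginal_df["cluster_group"] == float(i))]
123 |     print("cluster", i, "size", len(cluster_df))
124 |     print(cluster_df.describe())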
--------------------------------------------------------------------------------
/Mean Shift-part 39,40.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sun Jan 27 19:01:02 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 | import numpy as np
8 | from sklearn.cluster import MeanShift
9 | from sklearn.datasets import make_blobs
10 | import matplotlib.pyplot as plt
11 | from mpl_toolkits.mplot3d import Axes3D
12 | from matplotlib import style
13 | style.use("ggplot")
14 | import pandas as pd
15 | from sklearn import preprocessing
16 | from pandas.api.types import is_numeric_dtype
17 | ############Basic visualisation of Mean Shift
18 | #centers = [[1,1,1],[5,5,5],[3,10,10]]
19 | #
20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5)
21 | #
22 | #ms = MeanShift()
23 | #ms.fit(X)
24 | #labels = ms.labels_
25 | #cluster_centers = ms.cluster_centers_
26 | #
27 | #print(cluster_centers)
28 | #n_clusters_ = len(np.unique(labels))
29 | #print("Number of estimated clusters:", n_clusters_)
30 | #
31 | #colors = 10*['r','g','b','c','k','y','m']
32 | #fig = plt.figure()
33 | #ax = fig.add_subplot(111, projection='3d')
34 | #
35 | #for i in range(len(X)):
36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
37 | #
38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2],
39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10)
40 |
41 | ######################################################################
42 | # -*- coding: utf-8 -*-
43 | #############Mean Shift on Titanic Dataset
44 | df = pd.read_excel('titanic.xls')
45 | orginal_df=pd.DataFrame.copy(df)
46 |
47 |
48 | df.drop(['body','name'], 1, inplace=True)
49 | #df.convert_objects(convert_numeric=True)
50 | print(df.head())
51 | df.fillna(0,inplace=True)
52 |
53 | def handle_non_numerical_data(df):
54 |
55 | # handling non-numerical data: must convert.
56 | columns = df.columns.values
57 |
58 | for column in columns:
59 | text_digit_vals = {}
60 | def convert_to_int(val):
61 | return text_digit_vals[val]
62 |
63 | #print(column,df[column].dtype)
64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64:
65 |
66 | column_contents = df[column].values.tolist()
67 | #finding just the uniques
68 | unique_elements = set(column_contents)
69 | # great, found them.
70 | x = 0
71 | for unique in unique_elements:
72 | if unique not in text_digit_vals:
73 | # creating dict that contains new
74 | # id per unique string
75 | text_digit_vals[unique] = x
76 | x+=1
77 |                 # now we map the new "id" value
78 | # to replace the string.
79 | df[column] = list(map(convert_to_int,df[column]))
80 |
81 | return df
82 |
83 | df = handle_non_numerical_data(df)
84 | print(df.head())
85 |
86 | # add/remove features just to see impact they have.
87 | df.drop(['ticket','home.dest'], 1, inplace=True)
88 |
89 |
90 | X = np.array(df.drop(['survived'], 1).astype(float))
91 | X = preprocessing.scale(X)
92 | y = np.array(df['survived'])
93 |
94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5)
95 |
96 | clf = MeanShift()##change this
97 | clf.fit(X)
98 |
99 |
100 | labels=clf.labels_
101 | cluster_centers=clf.cluster_centers_
102 |
103 | orginal_df["cluster_group"]=np.nan
104 | ##########in order to check survival for each individual cluster formed by MeanShift and judge the quality of the clusters formed
105 | for i in range(len(X)):
106 | orginal_df["cluster_group"].iloc[i]=labels[i]
107 | n_clusters_=len(np.unique(labels))
108 | survival_rates={}###to see survival rate for different classes
109 | for i in range(n_clusters_):
110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster
111 | # print(temp_df)
112 | survival_cluster=temp_df[(temp_df["survived"]==1)]
113 | survival_rate=len(survival_cluster)/len(temp_df)
114 | survival_rates[i]=survival_rate
115 | #print(orginal_df[(orginal_df["cluster_group"]==2)])
116 | ###Now you can use df.describe() to analyse the data for different classes
117 |
--------------------------------------------------------------------------------
/Mean Shift-part 41,42.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Jan 28 12:43:31 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 | import matplotlib.pyplot as plt
8 | from matplotlib import style
9 | style.use("ggplot")
10 | import numpy as np
11 | from sklearn.cluster import KMeans
12 | import pandas as pd
13 | from sklearn import preprocessing
14 | from pandas.api.types import is_numeric_dtype
15 | from sklearn.datasets import make_blobs
16 | import random
17 |
18 | #####basic visualisation of k-means
19 |
20 | #plt.scatter(X[:,0],X[:,1],s=150)
21 | X,y=make_blobs(n_samples=50,centers=5,n_features=2)
22 |
23 |
24 | clf=KMeans(n_clusters=2)
25 | clf.fit(X)
26 | centroids=clf.cluster_centers_
27 | labels=clf.labels_
28 | colors=10*["g","r","c","b","k","y"]
29 | ######Making Mean Shift from scratch
30 | class Mean_Shift:
31 | def __init__(self,radius=None,radius_norm_step=100):
32 | self.radius=radius
33 | self.radius_norm_step=radius_norm_step
34 | def fit(self,data):
35 |
36 | if self.radius==None:
37 | all_data_centroid=np.average(data,axis=0)
38 | all_data_norm=np.linalg.norm(all_data_centroid)
39 | self.radius=all_data_norm/self.radius_norm_step
40 |
41 |
42 | centroids={}
43 |
44 | for i in range(len(data)):
45 | centroids[i]=data[i]
46 | weights = [i for i in range(self.radius_norm_step)][::-1]
47 | while True:
48 | new_centroids = []
49 | for i in centroids:
50 | in_bandwidth = []
51 | centroid = centroids[i]
52 |
53 | for featureset in data:
54 | #if np.linalg.norm(featureset-centroid) < self.radius:
55 | # in_bandwidth.append(featureset)
56 | distance = np.linalg.norm(featureset-centroid)
57 | if distance == 0:
58 | distance = 0.00000000001
59 | weight_index = int(distance/self.radius)
60 | if weight_index > self.radius_norm_step-1:
61 | weight_index = self.radius_norm_step-1
62 |
63 | to_add = (weights[weight_index]**2)*[featureset]
64 | in_bandwidth +=to_add
65 |
66 |
67 | new_centroid = np.average(in_bandwidth,axis=0)
68 | new_centroids.append(tuple(new_centroid))
69 |
70 | uniques = sorted(list(set(new_centroids)))
71 | to_pop=[]
72 | for i in uniques:
73 | for ii in uniques:
74 | if i==ii:
75 | pass
76 | elif np.linalg.norm(np.array(i)-np.array(ii))<=self.radius:
77 | to_pop.append(ii)
78 | break
79 | for i in to_pop:
80 | try:
81 | uniques.remove(i)
82 | except:
83 | pass
84 |
85 |
86 | prev_centroids=dict(centroids)
87 |
88 |
89 | centroids={}
90 | for i in range(len(uniques)):
91 | centroids[i]=np.array(uniques[i])
92 | optimized=True
93 |
94 | for i in centroids:
95 | if not np.array_equal(centroids[i],prev_centroids[i]):
96 | optimized=False
97 |
98 | if not optimized:
99 | break
100 |
101 | if optimized:
102 | break
103 |
104 | self.centroids=centroids
105 |
106 | self.classification={}
107 | for i in range(len(self.centroids)):
108 | self.classification[i]=[]
109 | for featureset in data:
110 | distances=[np.linalg.norm(featureset-self.centroids[centroid])for centroid in self.centroids]
111 | classification=distances.index(min(distances))
112 | self.classification[classification].append(featureset)
113 |
114 | def predict(self,data):
115 | distances=[np.linalg.norm(data-self.centroids[centroid])for centroid in self.centroids]
116 | classification=distances.index(min(distances))
117 | return classification
118 | clf=Mean_Shift()
119 | clf.fit(X)
120 |
121 |
122 | centroids=clf.centroids
123 | plt.scatter(X[:,0],X[:,1],s=150)
124 |
125 | for classification in clf.classification:
126 | color=colors[classification]
127 | for featureset in clf.classification[classification]:
128 | plt.scatter(featureset[0],featureset[1],marker="x",color=color,s=150)
129 |
130 | for c in centroids:
131 | plt.scatter(centroids[c][0],centroids[c][1],color="k",marker="*",s=150)
132 |
133 | plt.show()
134 | ##The below commented code is for when radius is hardcoded:
135 | #import matplotlib.pyplot as plt
136 | #from matplotlib import style
137 | #style.use('ggplot')
138 | #import numpy as np
139 | #
140 | #X = np.array([[1, 2],
141 | # [1.5, 1.8],
142 | # [5, 8 ],
143 | # [8, 8],
144 | # [1, 0.6],
145 | # [9,11],
146 | # [8,2],
147 | # [10,2],
148 | # [9,3],])
149 | #
150 | ###plt.scatter(X[:,0], X[:,1], s=150)
151 | ###plt.show()
152 | #
153 | #colors = 10*["g","r","c","b","k"]
154 | #
155 | #class Mean_Shift:
156 | # def __init__(self, radius=4):
157 | # self.radius = radius
158 | #
159 | # def fit(self, data):
160 | # centroids = {}
161 | #
162 | # for i in range(len(data)):
163 | # centroids[i] = data[i]
164 | #
165 | # while True:
166 | # new_centroids = []
167 | # for i in centroids:
168 | # in_bandwidth = []
169 | # centroid = centroids[i]
170 | # for featureset in data:
171 | # if np.linalg.norm(featureset-centroid) < self.radius:
172 | # in_bandwidth.append(featureset)
173 | #
174 | # new_centroid = np.average(in_bandwidth,axis=0)
175 | # new_centroids.append(tuple(new_centroid))
176 | #
177 | # uniques = sorted(list(set(new_centroids)))
178 | #
179 | # prev_centroids = dict(centroids)
180 | #
181 | # centroids = {}
182 | # for i in range(len(uniques)):
183 | # centroids[i] = np.array(uniques[i])
184 | #
185 | # optimized = True
186 | #
187 | # for i in centroids:
188 | # if not np.array_equal(centroids[i], prev_centroids[i]):
189 | # optimized = False
190 | # if not optimized:
191 | # break
192 | #
193 | # if optimized:
194 | # break
195 | #
196 | # self.centroids = centroids
197 | #
198 | #
199 | #
200 | #clf = Mean_Shift()
201 | #clf.fit(X)
202 | #
203 | #centroids = clf.centroids
204 | #
205 | #plt.scatter(X[:,0], X[:,1], s=150)
206 | #
207 | #for c in centroids:
208 | # plt.scatter(centroids[c][0], centroids[c][1], color='k', marker='*', s=150)
209 | #
210 | #plt.show()
211 |
212 |
213 |
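214 | ###Added sketch (an assumption, not part of the tutorial): cross-check the scratch version
215 | ###against sklearn's MeanShift, using estimate_bandwidth on the same blobs.
216 | from sklearn.cluster import MeanShift, estimate_bandwidth
217 | ms = MeanShift(bandwidth=estimate_bandwidth(X, quantile=0.3))
218 | ms.fit(X)
219 | print("scratch clusters:", len(clf.centroids), "sklearn clusters:", len(ms.cluster_centers_))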
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Algorithms
2 | Implementations of major supervised and unsupervised machine learning algorithms.
3 | 
4 | These files show my journey of learning machine learning. They are not a finished project, but rather show how I learned machine learning and then tried to apply those concepts.
5 | 
6 | The files include both scikit-learn based implementations and my own from-scratch constructions of these classical machine learning algorithms.
7 | 
8 | Almost all of the main ML algorithms are covered (KNN, SVM, regression, K-Means, Mean Shift).
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/Regression part 10 and Part 11.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 3 14:15:33 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | #Linear Regression Model from scratch:
9 | from statistics import mean
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from matplotlib import style
13 | import pylab
14 | style.use("fivethirtyeight")
15 | xs=np.array([1,2,3,4,5,6],dtype=np.float64)
16 | ys=np.array([5,4,6,5,6,7],dtype=np.float64)
17 |
18 | def best_fit_slope_and_intercept(xs,ys):
19 | m=((mean(xs)*mean(ys)) - (mean(xs*ys)))/((mean(xs)**2)-mean(xs**2))
20 | b=mean(ys)-m*mean(xs)
21 | return m,b
22 | m,b=best_fit_slope_and_intercept(xs,ys)
23 | print(m,b)
24 |
25 |
26 | def squared_error(ys_orgin,ys_line):
27 | return sum((ys_line-ys_orgin)**2)
28 | def coefficent_of_determination(ys_orgin,ys_line):
29 | y_mean_line=[mean(ys_orgin) for y in ys_orgin]
30 | square_error_regr=squared_error(ys_orgin,ys_line)
31 | square_error_regr_y_mean=squared_error(ys_orgin,y_mean_line)
32 | return 1-(square_error_regr)/(square_error_regr_y_mean)
33 | regression_line=[(m*x)+b for x in xs]
34 |
35 |
36 | r_square=coefficent_of_determination(ys,regression_line)
37 | print(r_square)
38 | predict_x=8
39 | predict_y=(m*predict_x)+b
40 | print(regression_line)
41 | plt.scatter(xs,ys)
42 | plt.scatter(predict_x,predict_y,color="g")
43 | plt.plot(xs,regression_line)
44 | plt.show()
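45 | ###Added sketch (not from the tutorial): cross-check the hand-derived slope and intercept
46 | ###against numpy.polyfit, which fits the same least-squares line.
47 | m_np,b_np=np.polyfit(xs,ys,1)
48 | print(m_np,b_np)###should match m and b above up to floating-point error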
--------------------------------------------------------------------------------
/SVM-PART 28,Kernel-part 29,30,31.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 18 18:00:48 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | import matplotlib.pyplot as plt
9 | from matplotlib import style
10 | import numpy as np
11 | style.use("ggplot")
12 |
13 | class Support_Vector_Machine:
14 | def __init__(self, visualization=True):
15 | self.visualization = visualization
16 | self.colors = {1:'r',-1:'b'}
17 | if self.visualization:
18 | self.fig = plt.figure()
19 | self.ax = self.fig.add_subplot(1,1,1)
20 |     ###training the data to find w and b
21 | def fit(self,data):
22 | self.data=data
23 |         ####{||w||:[w,b]}: a dictionary which stores, for every norm of w, the associated vector w and its b
24 | opt_dict={}
25 |
26 |
27 |         ##These transforms are applied to the vector w at each step so that every possible
28 |         ##sign combination (direction) of w is tried together with its associated b value,
29 |         ##because the norm of w does not account for direction, and for vectors
30 |         #direction matters
31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]]
32 |
33 |
34 |
35 |
36 |
37 | all_data=[]
38 |         ###these three loops collect every feature value across both classes into one list,
39 |         ##then take the max and min of that list; these max and min values set the scale
40 |         ### for the convex optimisation steps below
41 | for yi in self.data:
42 | for featureset in self.data[yi]:
43 | for feature in featureset:
44 | all_data.append(feature)
45 | self.max_feature_value=max(all_data)
46 | self.min_feature_value=min(all_data)
47 | all_data=None
48 | step_size=[self.max_feature_value*0.1,
49 | self.max_feature_value*0.01,
50 | ##POINT OF EXPENSE
51 | self.max_feature_value*0.001]
52 | ###extremely expensive-b does not need to take precise step
53 | b_range_multiple=5
54 |
55 |         #we don't need to take as small of steps
56 |         #with b as we do with w
57 | b_multiple=5
58 |         ###the starting value of w; remember, to simplify things we assume both elements
59 |         ### of the vector w are the same
60 | latest_optimum=self.max_feature_value*10
61 |
62 | for step in step_size:
63 |             ####remember, to simplify things we assume both elements of the vector w
64 |             ### are the same
65 | w=np.array([latest_optimum,latest_optimum])
66 |
67 |
68 |             #we can do this because the optimisation problem is convex
69 | optimized=False
70 | while not optimized:
71 | ####setting a range for b
72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
73 | self.max_feature_value*b_range_multiple,
74 | step*b_multiple):
75 | for transformation in transforms:
76 |                         ##applying each transformation to account for a different direction (w_t)
77 | w_t=w*transformation
78 | found_option=True
79 | #weakest link in the SVM fundamentally
80 | #SMO attempt to fix this a bit
81 | ##Running the data on all points is costly-svm weakness
82 | ##yi(xi.w+b)>=1(constraint)
83 | for i in self.data:
84 | for xi in self.data[i]:
85 | yi=i
86 |                                 ###this condition checks whether even one point in the data violates the constraint for the given w and b
87 | if not yi*(np.dot(w_t,xi)+b)>=1:
88 | found_option=False
89 | #if w satisfies the constraint
90 | if found_option:
91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b]
92 | if w[0]<0:
93 | optimized=True
94 | print("optimized a step.")
95 | else:
96 | w=w-step
97 |             #take the w with the smallest norm and use it to set a new starting point
98 | norms=sorted([n for n in opt_dict])
99 | opt_choice=opt_dict[norms[0]]
100 | self.w=opt_choice[0]
101 | self.b=opt_choice[1]
102 | latest_optimum=opt_choice[0][0]+step*2
103 | def predict(self,features):
104 | ###sign(x.w+b) whatever the sign of the equation is
105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b)
106 | if classification != 0 and self.visualization:
107 | self.ax.scatter(features[0],features[1],s=200,marker='*', c=self.colors[classification])
108 | return classification
109 | def visualize(self):
110 | [[self.ax.scatter(x[0],x[1],s=100,color=self.colors[i]) for x in data_dict[i]] for i in data_dict]
111 | def hyperplane(x,w,b,v):
112 | ###v=x.w+b
113 | ###the hyperplane function is used to draw the support vector planes and the decision boundary:
114 | ###positive support vector(psv)=1
115 | ###nsv=-1
116 | ###decision boundary=0,want to find a plane with these associated v values and show them
117 | #hyperplane v=x.w+b
118 | ##x,y is an unknown point on the hyperplane
119 | # x_v and w_v are the vector
120 | # x_v= [x,y]
121 | # x_v.w_v+b =1 for positive sv
122 | ## this helps to find the value of y where the value of the hyperplane is 1
123 | return (-w[0]*x-b+v)/w[1]
124 | datarange = (self.min_feature_value*0.9,self.max_feature_value*1.1)
125 | hyp_x_min = datarange[0]
126 | hyp_x_max = datarange[1]
127 | #(w.x+b)=1
128 | #positive support vector hyperplane
129 | psv1=hyperplane(hyp_x_min,self.w,self.b,1)
130 | ##psv1 is going to be scalar value not vector and its going to be y given specific x and v value
131 | psv2=hyperplane(hyp_x_max,self.w,self.b,1)
132 | #plotting psv1 and psv2 to visualise the hyperplane where v = 1 (the hyperplane equation returns y such that v = 1)
133 | self.ax.plot([hyp_x_min,hyp_x_max],[psv1,psv2],"k")
134 | ##doing the same thing and process for a value of v=-1:
135 | nsv1=hyperplane(hyp_x_min,self.w,self.b,-1)
136 | nsv2=hyperplane(hyp_x_max,self.w,self.b,-1)
137 | self.ax.plot([hyp_x_min,hyp_x_max],[nsv1,nsv2],"k")
138 | ###doing the same thing and process for a value of v=0:
139 | db1=hyperplane(hyp_x_min,self.w,self.b,0)
140 | db2=hyperplane(hyp_x_max,self.w,self.b,0)
141 | self.ax.plot([hyp_x_min,hyp_x_max],[db1,db2],"y--")
142 |
143 | #show the result
144 | plt.show()
145 |
146 |
147 |
148 |
149 | data_dict={-1:np.array([[1,7],
150 | [2,8],
151 | [3,8]]),
152 | 1:np.array([[5,1],[6,-1],[7,3]])}
153 |
154 |
155 | ###trial 1:@19:13 hours\
156 | svm=Support_Vector_Machine()
157 | svm.fit(data=data_dict)
158 | predict_us=[[0,10],[1,3],[3,4],[3,5],[5,5],[6,-5],[5,8]]
159 | for p in predict_us:
160 | svm.predict(p)
161 | svm.visualize()
162 | ############################################-SVM COMPLETED::::
163 |
164 |
165 |
166 |
167 |
168 |
169 |
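170 | ###Added sketch (an assumption, not part of the tutorial): cross-check the scratch SVM against
171 | ###sklearn's linear SVC on the same two-class data; the separating lines should be similar.
172 | from sklearn import svm as sk_svm
173 | X_check = np.vstack((data_dict[-1], data_dict[1]))
174 | y_check = np.array([-1, -1, -1, 1, 1, 1])
175 | sk_clf = sk_svm.SVC(kernel="linear", C=1e6)   # large C approximates a hard margin
176 | sk_clf.fit(X_check, y_check)
177 | print("sklearn w:", sk_clf.coef_, "b:", sk_clf.intercept_)
178 | print("scratch w:", svm.w, "b:", svm.b)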
--------------------------------------------------------------------------------
/SVM-part 20,21,22.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 11 19:21:35 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | import numpy as np
9 | from sklearn import preprocessing,neighbors,svm
10 | from sklearn.model_selection import cross_validate,train_test_split
11 | import pandas as pd
12 | import pickle
13 | df=pd.read_csv("breast-cancer-wisconsin.data.txt")
14 | ##fill missing data: -9999 will be treated as an outlier by our algorithm, so we don't
15 | #lose the rest of the data
16 | df.replace("?",-9999,inplace=True)
17 |
18 | ###check for any useless data and drop it
19 | df.drop(["id"],1,inplace=True)
20 | #### X are the features and y is the label
21 | X=np.array(df.drop(["class"],1))
22 | print(X)
23 | y=np.array(df["class"])
24 |
25 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
26 | #####Using the classifier
27 | clf=svm.SVC()
28 | clf.fit(X_train,y_train)
29 | ###Saving the classifier
30 | with open("K_model","wb") as f:
31 | pickle.dump(clf,f)
32 | #Remember the difference between accuracy and confidence
33 | accuracy=clf.score(X_test,y_test)
34 | print(accuracy)
35 | ####make prediction
36 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]])
37 | example_measures=np.array(predict_X)
38 | print(example_measures)
39 | ###reshape to the array shape that sklearn expects, matching the X features
40 | predict=clf.predict(example_measures.reshape(len(example_measures),-1))
41 | print(predict)
42 |
43 |
44 |
45 | ###########parts 23 and 24 were theory
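46 | 
47 | ###Added sketch (not part of the tutorial): reload the pickled classifier saved above and
48 | ###reuse it without retraining.
49 | with open("K_model","rb") as f:
50 |     loaded_clf=pickle.load(f)
51 | print(loaded_clf.score(X_test,y_test))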
--------------------------------------------------------------------------------
/SVM-part 25,26,27.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Jan 15 19:05:10 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | import matplotlib.pyplot as plt
9 | from matplotlib import style
10 | import numpy as np
11 | style.use("ggplot")
12 |
13 | class Support_Vector_Machine(object):
14 |     def __init__(self,visualization=True):
15 |         self.visualization=visualization
16 |         self.colors={1:"r",-1:"b"}
17 |         if self.visualization:
18 | self.fig=plt.figure()
19 | self.ax=self.fig.add_subplot(1,1,1)
20 |     ###training the data to find w and b
21 | def fit(self,data):
22 | self.data=data
23 |         ####{||w||:[w,b]}: a dictionary which stores, for every norm of w, the associated vector w and its b
24 | opt_dict={}
25 |
26 |
27 |         ##These transforms are applied to the vector w at each step so that every possible
28 |         ##sign combination (direction) of w is tried together with its associated b value,
29 |         ##because the norm of w does not account for direction, and for vectors
30 |         #direction matters
31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]]
32 |
33 |
34 |
35 |
36 |
37 | all_data=[]
38 |         ###these three loops collect every feature value across both classes into one list,
39 |         ##then take the max and min of that list; these max and min values set the scale
40 |         ### for the convex optimisation steps below
41 | for yi in self.data:
42 | for featureset in self.data[yi]:
43 | for feature in featureset:
44 | all_data.append(feature)
45 | self.max_feature_value=max(all_data)
46 | self.min_feature_value=min(all_data)
47 | all_data=None
48 | step_size=[self.max_feature_value*0.1,
49 | self.max_feature_value*0.01,
50 | ##POINT OF EXPENSE
51 | self.max_feature_value*0.001]
52 | ###extremely expensive-b does not need to take precise step
53 | b_range_multiple=5
54 |
55 |         #we don't need to take as small of steps
56 |         #with b as we do with w
57 | b_multiple=5
58 |         ###the starting value of w; remember, to simplify things we assume both elements
59 |         ### of the vector w are the same
60 | latest_optimum=self.max_feature_value*10
61 |
62 | for step in step_size:
63 |             ####remember, to simplify things we assume both elements of the vector w
64 |             ### are the same
65 | w=np.array([latest_optimum,latest_optimum])
66 |
67 |
68 |             #we can do this because the optimisation problem is convex
69 | optimized=False
70 | while not optimized:
71 | ####setting a range for b
72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple),
73 | self.max_feature_value*b_range_multiple,
74 | step*b_multiple):
75 | for transformation in transforms:
76 |                         ##applying each transformation to account for a different direction (w_t)
77 | w_t=w*transformation
78 | found_option=True
79 | #weakest link in the SVM fundamentally
80 | #SMO attempt to fix this a bit
81 | ##Running the data on all points is costly-svm weakness
82 | ##yi(xi.w+b)>=1(constraint)
83 | for i in self.data:
84 | for xi in self.data[i]:
85 | yi=i
86 |                                 ###this condition checks whether even one point in the data violates the constraint for the given w and b
87 | if not yi*(np.dot(w_t,xi)+b)>=1:
88 | found_option=False
89 | #if w satisfies the constraint
90 | if found_option:
91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b]
92 | if w[0]<0:
93 | optimized=True
94 | print("optimized a step.")
95 | else:
96 | w=w-step
97 |             #take the w with the smallest norm and use it to set a new starting point
98 | norms=sorted([n for n in opt_dict])
99 | opt_choice=opt_dict[norms[0]]
100 | self.w=opt_choice[0]
101 | self.b=opt_choice[1]
102 | latest_optimum=opt_choice[0][0]+step*2
103 | def predict(self,features):
104 | ###sign(x.w+b) whatever the sign of the equation is
105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b)
106 |
107 | return classification
108 |
109 |
110 |
111 |
112 |
113 |
114 | data_dict={-1:np.array([[1,7],
115 | [2,8],
116 | [3,8]]),
117 | 1:np.array([[5,1],[6,-1],[7,3]])}
118 |
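119 | ###Added sketch (an assumption, not part of this file): a minimal usage example of the class
120 | ###above on the sample data_dict, mirroring what the later "SVM-PART 28" script does.
121 | svm=Support_Vector_Machine()
122 | svm.fit(data=data_dict)
123 | print(svm.predict([3,5]))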
--------------------------------------------------------------------------------
/Tic-Tac-Toe AI.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Feb 15 23:11:53 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | board=[" " for x in range(10)]
9 |
10 |
11 | def insertLetter(letter,pos):
12 | board[pos]=letter
13 | def spaceisFree(pos):
14 | return board[pos]==" "
15 | def printBoard(board):
16 | print(' | | ')
17 | print(""+board[1]+ " | "+ board[2]+ " | "+ board[3])
18 | print(' | |')
19 | print("----------")
20 | print(' | | ')
21 | print(""+board[4]+ " | "+ board[5]+ " | "+ board[6])
22 | print(' | |')
23 | print("----------")
24 | print(' | |')
25 | print(""+board[7]+ " | "+ board[8]+ " | "+ board[9])
26 | print(' | |')
27 | def isWinner(bo,le):##sorry for the long line
28 | return (bo[7]==le and bo[8]==le and bo[9]==le) or (bo[4]==le and bo[5]==le and bo[6]==le) or (bo[1]==le and bo[2]==le and bo[3]==le) or (bo[1]==le and bo[4]==le and bo[7]==le) or (bo[2]==le and bo[5]==le and bo[8]==le) or (bo[3]==le and bo[6]==le and bo[9]==le) or (bo[1]==le and bo[5]==le and bo[9]==le) or (bo[3]==le and bo[5]==le and bo[7]==le)
29 | def playerMove():
30 | run=True
31 | while run:
32 | move=input("please select a position to place an X (1-9): ")
33 | try:
34 | move=int(move)
35 | if move>0 and move<10:
36 | if spaceisFree(move):
37 | run=False
38 | insertLetter("X",move)
39 | else:
40 | print("This space is occupied")
41 | else:
42 |                 print("Type a number within the range")
43 | except:
44 | print("Type a number")
45 |
46 | def compMove():
47 | possibleMoves=[x for x,letter in enumerate(board) if letter==" " and x!=0]
48 | move=0
49 |
50 | for let in ["O","X"]:
51 | for i in possibleMoves:
52 | boardCopy=board[:]
53 | boardCopy[i]=let
54 | if isWinner(boardCopy,let):
55 | move=i
56 | return move
57 |
58 | cornersOpen=[]
59 | for i in possibleMoves:
60 | if i in [1,3,7,9]:
61 | cornersOpen.append(i)
62 | if len(cornersOpen)>0:
63 | move=selectRandom(cornersOpen)
64 | return move
65 |
66 | if 5 in possibleMoves:
67 | move=5
68 | return move
69 | edgesOpen=[]
70 | for i in possibleMoves:
71 | if i in [2,4,6,8]:
72 | edgesOpen.append(i)
73 | if len(edgesOpen)>0:
74 | move=selectRandom(edgesOpen)
75 | return move
76 |
77 | def selectRandom(li):
78 | import random
79 | In =len(li)
80 | r=random.randrange(0,In)
81 | return li[r]
82 | def isBoardFull():
83 | if board.count(" ")>1:
84 | return False
85 | else:
86 | return True
87 | def main():
88 | print("Welcome to Tic Tac Toe")
89 | printBoard(board)
90 |
91 | while not (isBoardFull()):
92 | if not isWinner(board,'O'):
93 | playerMove()
94 | printBoard(board)
95 | else:
96 |             print("Sorry, the AI won the game")
97 | break
98 | if not isWinner(board,'X'):
99 | move=compMove()
100 | if move==0:
101 | print("Tie Game!")
102 | else:
103 | insertLetter("O",move)
104 |                 print("computer placed an O in position",move)
105 | printBoard(board)
106 | else:
107 |             print("Yep, you ARE smarter than my AI")
108 | break
109 | if isBoardFull():
110 | print("Tie Game")
111 | main()
--------------------------------------------------------------------------------
/cluster-part34,35,36,37.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 25 15:06:06 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | import matplotlib.pyplot as plt
9 | from matplotlib import style
10 | style.use("ggplot")
11 | import numpy as np
12 | from sklearn.cluster import KMeans
13 | import pandas as pd
14 | from sklearn import preprocessing
15 | from pandas.api.types import is_numeric_dtype
16 |
17 | #####basic visualisation of k-means
18 | X=np.array([[1,2],
19 | [1.5,1.8],
20 | [5,8],
21 | [8,8],
22 | [1,0.6],
23 | [9,11]])
24 | plt.scatter(X[:,0],X[:,1],s=150)
25 |
26 |
27 |
28 | clf=KMeans(n_clusters=2)
29 | clf.fit(X)
30 | centroids=clf.cluster_centers_
31 | labels=clf.labels_
32 | colors=["g.","r.","c.","b.","k.","o."]
33 | print(centroids)
34 | print(labels)
35 | for i in range(len(X)):
36 | plt.plot(X[i][0],X[i][1],colors[labels[i]],markersize=25)
37 | plt.scatter(centroids[:,0],centroids[:,1],marker="x",s=150)
38 | plt.show()
39 | ##################################################################
40 | ###Analysing Titanic dataset through K-means
41 | df=pd.read_excel("titanic.xls")
42 | df.drop(["body","name"],1,inplace=True)
43 | df=df.apply(pd.to_numeric, errors='ignore')
44 | df.fillna(0,inplace=True)
45 | c=df["age"].values.tolist()
46 | ###in order to convert text data to usable numeric data
47 | def handle_numeric_data(df):
48 | columns=df.columns.values
49 | # dtypes=dict(df.dtypes)
50 | for column in columns:
51 | text_digit_vals={}
52 | # dtype=dtypes[column]
53 | def convert_to_int(val):
54 | return text_digit_vals[val]
55 | if df[column].dtype != np.int64 and df[column].dtype != np.float64:
56 | columns_contents=df[column].values.tolist()
57 |
58 | unique_element=set(columns_contents)
59 | x=0
60 | for unique in unique_element:
61 | if unique not in text_digit_vals:
62 | text_digit_vals[unique]=x
63 | x+=1
64 |
65 | df[column]=list(map(convert_to_int,df[column]))
66 | return df
67 | df=handle_numeric_data(df)
68 | ####################################################################
69 |
70 | #df.drop(["sex","boat"],1,inplace=True)
71 | X=np.array(df.drop(["survived"],1),dtype=float)
72 | X=preprocessing.scale(X)
73 | y=np.array(df["survived"])
74 |
75 |
76 | clf=KMeans(n_clusters=2)
77 | clf.fit(X)
78 |
79 | ########In unsupervised learning we do not have labels, so we fit on X alone;
80 | ########KMeans then assigns the data to two groups, and to check how good that grouping is
81 | #######we compare the cluster assigned to each row with the binary "survived" value in y.
82 | correct=0
83 | labels=clf.labels_###use this
84 | print(labels)
85 | for i in range(len(X)):
86 | predict_me=np.array(X[i],dtype=float)
87 | predict_me=predict_me.reshape(-1,len(predict_me))
88 |     prediction=clf.predict(predict_me)###the cluster label assigned by the classifier
89 |     if prediction[0]==y[i]:####did it match the true label
90 | correct+=1
91 | print(correct/len(X))
92 |
93 |
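94 | ###Added note (an assumption, not from the tutorial): KMeans numbers its clusters arbitrarily,
95 | ###so the ratio above may come out as either p or 1-p; the effective accuracy is the larger one.
96 | accuracy=correct/len(X)
97 | print(max(accuracy,1-accuracy))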
--------------------------------------------------------------------------------
/k means...nearest neighbours-part 13,14,15.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jan 5 13:45:02 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 | import numpy as np
8 | from sklearn import preprocessing,neighbors
9 | from sklearn.model_selection import cross_validate,train_test_split
10 | import pandas as pd
11 | import pickle
12 | df=pd.read_csv("breast-cancer-wisconsin.data.txt")
13 | ##fill missing data: -9999 will be treated as an outlier by our algorithm, so we don't
14 | #lose the rest of the data
15 | df.replace("?",-9999,inplace=True)
16 |
17 | ###check for any useless data and drop it
18 | df.drop(["id"],1,inplace=True)
19 | #### X are the features and y is the label
20 | X=np.array(df.drop(["class"],1))
21 | print(X)
22 | y=np.array(df["class"])
23 |
24 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
25 | #####Using the classifier
26 | clf=neighbors.KNeighborsClassifier()
27 | clf.fit(X_train,y_train)
28 | ###Saving the classifier
29 | with open("K_model","wb") as f:
30 | pickle.dump(clf,f)
31 | #Remember the difference between accuracy and confidence
32 | accuracy=clf.score(X_test,y_test)
33 | print(accuracy)
34 | ####make prediction
35 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]])
36 | example_measures=np.array(predict_X)
37 | print(example_measures)
38 | ###reshape to the array shape that sklearn expects, matching the X features
39 | predict=clf.predict(example_measures.reshape(len(example_measures),-1))
40 | print(predict)
41 |
42 |
43 |
44 |
45 |
46 |
47 | ###############################
48 | #K-model from scratch:
49 | from math import sqrt
50 | #### Euclidean_Distance Basic Formula is:
51 | plot1=[1,3]
52 | plot2=[2,5]
53 | euclidean_distance=sqrt(((plot1[0]-plot2[0])**2)+((plot1[1]-plot2[1])**2))
54 | print(euclidean_distance)
55 |
56 |
57 |
58 |
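59 | ###Added sketch (not from the tutorial): the same euclidean distance via numpy, which
60 | ###generalises to any number of features (np is already imported above).
61 | print(np.linalg.norm(np.array(plot1)-np.array(plot2)))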
--------------------------------------------------------------------------------
/k means...nearest neighbours-part 16,17,18,19.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Tue Jan 8 16:43:39 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | #K-model from scratch:
9 | from math import sqrt
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from matplotlib import style
13 | from collections import Counter
14 | import warnings
15 | import random
16 | import pandas as pd
17 | ###creating a dataset with labels and features
18 | dataset={"k":[[1,2],[2,3],[3,1]],"r":[[6,5],[7,7],[8,6]]}
19 | ##feature to be classified
20 | new_feature=[5,7]
21 | ####Looping over to scatter the plot
22 | #for i in dataset:
23 | # for ii in dataset[i]:
24 | # [plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]]
25 | ###More pythonic way
26 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset]
27 | #plt.scatter(new_feature[0],new_feature[1])
28 | #plt.show()
29 | ###k-nearest-neighbours algorithm
30 | def k_nearest_neighbours(data,predict,k=3):
31 | if len(data) >=k:
32 | warnings.warn("K is set to a value less than total voting groups!")
33 | distances=[]
34 | for group in data:
35 | for features in data[group]:
36 | # euclidean_distance=np.sqrt(np.sum((np.array(features)-np.array(predict))**2)) or better:
37 |             ###numpy formula
38 | euclidean_distance=np.linalg.norm(np.array(features)-np.array(predict))
39 | #####Making the euclidean distance list to sort later
40 | distances.append([euclidean_distance,group])
41 | ###calculating votes to help us classify-lowest distance
42 | votes=[i[1] for i in sorted(distances)[:k]]
43 | votes_result=Counter(votes).most_common(1)[0][0]
44 |     ###confidence measures how confident the classifier is about a single point, i.e. what proportion of the k votes were in favour of the winning class
45 | confidence=Counter(votes).most_common(1)[0][1]/k
46 | return votes_result,confidence
47 |
48 | #result=k_nearest_neighbours(dataset,new_feature,k=3)
49 | #print(result)
50 | ###showing the result; the group keys "k" and "r" double as the plot colors.
51 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset]
52 | #plt.scatter(new_feature[0],new_feature[1],color=result)
53 | #plt.show()
54 | ######################Comparing our model vs sklearn model
55 | #df=pd.read_csv("breast-cancer-wisconsin.data.txt")
56 | #df.replace("?",-9999,inplace=True)
57 | #df.drop(["id"],1,inplace=True)
58 | #full_data=df.astype(float).values.tolist()
59 | ######shuffling the inner lists of full_data
60 | #random.shuffle(full_data)
61 | #
62 | #########dividing the full data into train data and test data
63 | #test_size=0.4
64 | #train_set= {2:[],4:[]}
65 | #test_set={2:[],4:[]}
66 | #train_data=full_data[:-int(test_size*len(full_data))]
67 | #test_data=full_data[-int(test_size*len(full_data)):]
68 | #
69 | #for i in train_data:
70 | # ####associating the datas to classifiers in this case 2 or 4
71 | # train_set[i[-1]].append(i[:-1])
72 | #for i in test_data:
73 | # ####associating the datas to classifiers in this case 2 or 4
74 | # test_set[i[-1]].append(i[:-1])
75 | #correct=0
76 | #total=0
77 | #for group in test_set:
78 | # for data in test_set[group]:
79 | # vote,confidence=k_nearest_neighbours(train_set,data,k=5)
80 | # if group == vote:
81 | # correct+=1
82 | # else:
83 | # print(confidence)
84 | # total+=1
85 | #
86 | #print("Accuracy: ",correct/total)
87 | #########################repeating the whole algorithm to judge the accuracy over a number of trials:
88 | accuracies=[]
89 | n=25
90 | for i in range(n):
91 | df=pd.read_csv("breast-cancer-wisconsin.data.txt")
92 | df.replace("?",-9999,inplace=True)
93 | df.drop(["id"],1,inplace=True)
94 | full_data=df.astype(float).values.tolist()
95 | #####shuffling the inner lists of full_data
96 | random.shuffle(full_data)
97 |
98 | ########dividing the full data into train data and test data
99 | test_size=0.4
100 | train_set= {2:[],4:[]}
101 | test_set={2:[],4:[]}
102 | train_data=full_data[:-int(test_size*len(full_data))]
103 | test_data=full_data[-int(test_size*len(full_data)):]
104 |
105 | for i in train_data:
106 |         ####appending each row to its class bucket, in this case 2 or 4
107 | train_set[i[-1]].append(i[:-1])
108 | for i in test_data:
109 |         ####appending each row to its class bucket, in this case 2 or 4
110 | test_set[i[-1]].append(i[:-1])
111 | correct=0
112 | total=0
113 | for group in test_set:
114 | for data in test_set[group]:
115 | vote,confidence=k_nearest_neighbours(train_set,data,k=5)
116 | if group == vote:
117 | correct+=1
118 | total+=1
119 | accuracies.append(correct/total)
120 | print("overall_accuracy(our algorithm) for", n ," steps = ", sum(accuracies)/len(accuracies))
121 | ##############finally getting the sklearn algorithm and comparing it with our overall accuracy for a specific number of steps:
122 | accuracies_2=[]
123 | for i in range(n):
124 | from sklearn.model_selection import train_test_split
125 | from sklearn import neighbors
126 | df=pd.read_csv("breast-cancer-wisconsin.data.txt")
127 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont,
128 | #lose rest of the data
129 | df.replace("?",-9999,inplace=True)
130 | ###check for any useless data and drop it
131 | df.drop(["id"],1,inplace=True)
132 | #### X are the features and y is the label
133 | X=np.array(df.drop(["class"],1))
134 | y=np.array(df["class"])
135 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
136 | #####Using the classifer
137 | clf=neighbors.KNeighborsClassifier()
138 | clf.fit(X_train,y_train)
139 |     #Remember the difference between accuracy and confidence
140 | accuracy=clf.score(X_test,y_test)
141 | accuracies_2.append(accuracy)
142 | print("overall_accuracy(sk-learn algorithm) for", n ," steps = ", sum(accuracies_2)/len(accuracies_2))
143 |
144 |
145 |
146 |
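147 | ###Added sketch (not from the tutorial): classify the toy point defined at the top of this
148 | ###file and report the winning vote plus the confidence described above.
149 | vote,confidence=k_nearest_neighbours(dataset,new_feature,k=3)
150 | print(vote,confidence)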
--------------------------------------------------------------------------------
/k-means from scratch-part 37,38.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat Jan 26 17:11:21 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | ##########################################
9 | ###########-k-mean from scratch
10 | import matplotlib.pyplot as plt
11 | from matplotlib import style
12 | style.use("ggplot")
13 | import numpy as np
14 | from sklearn.cluster import KMeans
15 | import pandas as pd
16 | from sklearn import preprocessing
17 | from pandas.api.types import is_numeric_dtype
18 |
19 | #####basic visualisation
20 | #X=np.array([[1,2],
21 | # [1.5,1.8],
22 | # [5,8],
23 | # [8,8],
24 | # [1,0.6],
25 | # [9,11]])
26 |
27 | #plt.scatter(X[:,0],X[:,1],s=150)
28 |
29 | colors=["g","r","c","b","k"]
30 |
31 | class K_Means:
32 | def __init__(self, k=2, tol=0.001, max_iter=300):
33 | self.k = k
34 | self.tol = tol
35 | self.max_iter = max_iter
36 |
37 | def fit(self,data):
38 |
39 | self.centroids = {}
40 |
41 | for i in range(self.k):
42 | self.centroids[i] = data[i]
43 |
44 | for i in range(self.max_iter):
45 | self.classifications = {}
46 |
47 | for i in range(self.k):
48 | self.classifications[i] = []
49 |
50 | for featureset in data:
51 | distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids]
52 | classification = distances.index(min(distances))
53 | self.classifications[classification].append(featureset)
54 |
55 | prev_centroids = dict(self.centroids)
56 |
57 | for classification in self.classifications:
58 | self.centroids[classification] = np.average(self.classifications[classification],axis=0)
59 |
60 | optimized = True
61 |
62 | for c in self.centroids:
63 | original_centroid = prev_centroids[c]
64 | current_centroid = self.centroids[c]
65 | if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol:
66 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0))
67 | optimized = False
68 |
69 | if optimized:
70 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0))
71 | break
72 |
73 | def predict(self,data):
74 | distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids]
75 | classification = distances.index(min(distances))
76 | return classification
77 | ###This code is for basic visualisation of K_Means in 2d.
78 | #clf = K_Means()
79 | #clf.fit(X)
80 |
81 | #for centroid in clf.centroids:
82 | # plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1],
83 | # marker="o", color="k", s=150, linewidths=5)
84 | #
85 | #for classification in clf.classifications:
86 | # color = colors[classification]
87 | # for featureset in clf.classifications[classification]:
88 | # plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5)
89 |
90 |
91 | #unknowns=np.array([[1,3],
92 | # [8,9],
93 | # [0,3],
94 | # [5,4],
95 | # [6,4]])
96 | #for unknown in unknowns:
97 | # classification=clf.predict(unknown)
98 | # plt.scatter(unknown[0],unknown[1],marker="*",color=colors[classification],s=150,linewidths=5)
99 |
100 | ###below we run our scratch K_Means on the Titanic data (the same pipeline as the sklearn script) so the results can be compared
101 | # https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
102 | df = pd.read_excel('titanic.xls')
103 | df.drop(['body','name'], 1, inplace=True)
104 | #df.convert_objects(convert_numeric=True)
105 | print(df.head())
106 | df.fillna(0,inplace=True)
107 |
108 | def handle_non_numerical_data(df):
109 |
110 | # handling non-numerical data: must convert.
111 | columns = df.columns.values
112 |
113 | for column in columns:
114 | text_digit_vals = {}
115 | def convert_to_int(val):
116 | return text_digit_vals[val]
117 |
118 | #print(column,df[column].dtype)
119 | if df[column].dtype != np.int64 and df[column].dtype != np.float64:
120 |
121 | column_contents = df[column].values.tolist()
122 | #finding just the uniques
123 | unique_elements = set(column_contents)
124 | # great, found them.
125 | x = 0
126 | for unique in unique_elements:
127 | if unique not in text_digit_vals:
128 | # creating dict that contains new
129 | # id per unique string
130 | text_digit_vals[unique] = x
131 | x+=1
132 |                 # now we map the new "id" value
133 | # to replace the string.
134 | df[column] = list(map(convert_to_int,df[column]))
135 |
136 | return df
137 |
138 | df = handle_non_numerical_data(df)
139 | print(df.head())
140 |
141 | # add/remove features just to see impact they have.
142 | df.drop(['ticket','home.dest'], 1, inplace=True)
143 |
144 |
145 | X = np.array(df.drop(['survived'], 1).astype(float))
146 | X = preprocessing.scale(X)
147 | y = np.array(df['survived'])
148 |
149 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5)
150 |
151 | clf = K_Means()##change this
152 | clf.fit(X)
153 |
154 | correct = 0
155 | for i in range(len(X)):
156 |
157 | predict_me = np.array(X[i].astype(float))
158 | predict_me = predict_me.reshape(-1, len(predict_me))
159 | prediction = clf.predict(predict_me)
160 | if prediction == y[i]:
161 | correct += 1
162 |
163 |
164 | print(correct/len(X))
165 |
166 |
167 |
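168 | ###Added sketch (an assumption, not part of the tutorial): fit sklearn's KMeans on the same
169 | ###scaled X and compare its centroids with the scratch implementation's.
170 | sk_clf = KMeans(n_clusters=2, n_init=10)
171 | sk_clf.fit(X)
172 | print("sklearn centroids:\n", sk_clf.cluster_centers_)
173 | print("scratch centroids:\n", clf.centroids)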
--------------------------------------------------------------------------------
/kernel,cvxopt-part 32.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Jan 23 12:16:33 2019
4 |
5 | @author: Not my work
6 | """
7 |
8 | # Mathieu Blondel, September 2010
9 | # License: BSD 3 clause
10 | # http://www.mblondel.org/journal/2010/09/19/support-vector-machines-in-python/
11 |
12 | # visualizing what translating to another dimension does
13 | # and bringing back to 2D:
14 | # https://www.youtube.com/watch?v=3liCbRZPrZA
15 |
16 | # Docs: http://cvxopt.org/userguide/coneprog.html#quadratic-programming
17 | # Docs qp example: http://cvxopt.org/examples/tutorial/qp.html
18 |
19 | # Nice tutorial:
20 | # https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf
21 |
22 |
23 | import numpy as np
24 | from numpy import linalg
25 | import cvxopt
26 | import cvxopt.solvers
27 |
28 | def linear_kernel(x1, x2):
29 | return np.dot(x1, x2)
30 |
31 | def polynomial_kernel(x, y, p=3):
32 | return (1 + np.dot(x, y)) ** p
33 |
34 | def gaussian_kernel(x, y, sigma=5.0):
35 | return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2)))
36 |
37 | class SVM(object):
38 |
39 | def __init__(self, kernel=linear_kernel, C=None):
40 | self.kernel = kernel
41 | self.C = C
42 | if self.C is not None: self.C = float(self.C)
43 |
44 | def fit(self, X, y):
45 | n_samples, n_features = X.shape
46 |
47 | # Gram matrix
48 | K = np.zeros((n_samples, n_samples))
49 | for i in range(n_samples):
50 | for j in range(n_samples):
51 | K[i,j] = self.kernel(X[i], X[j])
52 |
53 | P = cvxopt.matrix(np.outer(y,y) * K)
54 | q = cvxopt.matrix(np.ones(n_samples) * -1)
55 | A = cvxopt.matrix(y, (1,n_samples))
56 | b = cvxopt.matrix(0.0)
57 |
58 | if self.C is None:
59 | G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1))
60 | h = cvxopt.matrix(np.zeros(n_samples))
61 | else:
62 | tmp1 = np.diag(np.ones(n_samples) * -1)
63 | tmp2 = np.identity(n_samples)
64 | G = cvxopt.matrix(np.vstack((tmp1, tmp2)))
65 | tmp1 = np.zeros(n_samples)
66 | tmp2 = np.ones(n_samples) * self.C
67 | h = cvxopt.matrix(np.hstack((tmp1, tmp2)))
68 |
69 | # solve QP problem
70 | solution = cvxopt.solvers.qp(P, q, G, h, A, b)
71 |
72 | # Lagrange multipliers
73 | a = np.ravel(solution['x'])
74 |
75 | # Support vectors have non zero lagrange multipliers
76 | sv = a > 1e-5
77 | ind = np.arange(len(a))[sv]
78 | self.a = a[sv]
79 | self.sv = X[sv]
80 | self.sv_y = y[sv]
81 | print("%d support vectors out of %d points" % (len(self.a), n_samples))
82 |
83 | # Intercept
84 | self.b = 0
85 | for n in range(len(self.a)):
86 | self.b += self.sv_y[n]
87 | self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv])
88 | self.b /= len(self.a)
89 |
90 | # Weight vector
91 | if self.kernel == linear_kernel:
92 | self.w = np.zeros(n_features)
93 | for n in range(len(self.a)):
94 | self.w += self.a[n] * self.sv_y[n] * self.sv[n]
95 | else:
96 | self.w = None
97 |
98 | def project(self, X):
99 | if self.w is not None:
100 | return np.dot(X, self.w) + self.b
101 | else:
102 | y_predict = np.zeros(len(X))
103 | for i in range(len(X)):
104 | s = 0
105 | for a, sv_y, sv in zip(self.a, self.sv_y, self.sv):
106 | s += a * sv_y * self.kernel(X[i], sv)
107 | y_predict[i] = s
108 | return y_predict + self.b
109 |
110 | def predict(self, X):
111 | return np.sign(self.project(X))
112 |
113 | if __name__ == "__main__":
114 | import pylab as pl
115 |
116 | def gen_lin_separable_data():
117 | # generate training data in the 2-d case
118 | mean1 = np.array([0, 2])
119 | mean2 = np.array([2, 0])
120 | cov = np.array([[0.8, 0.6], [0.6, 0.8]])
121 | X1 = np.random.multivariate_normal(mean1, cov, 100)
122 | y1 = np.ones(len(X1))
123 | X2 = np.random.multivariate_normal(mean2, cov, 100)
124 | y2 = np.ones(len(X2)) * -1
125 | return X1, y1, X2, y2
126 |
127 | def gen_non_lin_separable_data():
128 | mean1 = [-1, 2]
129 | mean2 = [1, -1]
130 | mean3 = [4, -4]
131 | mean4 = [-4, 4]
132 | cov = [[1.0,0.8], [0.8, 1.0]]
133 | X1 = np.random.multivariate_normal(mean1, cov, 50)
134 | X1 = np.vstack((X1, np.random.multivariate_normal(mean3, cov, 50)))
135 | y1 = np.ones(len(X1))
136 | X2 = np.random.multivariate_normal(mean2, cov, 50)
137 | X2 = np.vstack((X2, np.random.multivariate_normal(mean4, cov, 50)))
138 | y2 = np.ones(len(X2)) * -1
139 | return X1, y1, X2, y2
140 |
141 | def gen_lin_separable_overlap_data():
142 | # generate training data in the 2-d case
143 | mean1 = np.array([0, 2])
144 | mean2 = np.array([2, 0])
145 | cov = np.array([[1.5, 1.0], [1.0, 1.5]])
146 | X1 = np.random.multivariate_normal(mean1, cov, 100)
147 | y1 = np.ones(len(X1))
148 | X2 = np.random.multivariate_normal(mean2, cov, 100)
149 | y2 = np.ones(len(X2)) * -1
150 | return X1, y1, X2, y2
151 |
152 | def split_train(X1, y1, X2, y2):
153 | X1_train = X1[:90]
154 | y1_train = y1[:90]
155 | X2_train = X2[:90]
156 | y2_train = y2[:90]
157 | X_train = np.vstack((X1_train, X2_train))
158 | y_train = np.hstack((y1_train, y2_train))
159 | return X_train, y_train
160 |
161 | def split_test(X1, y1, X2, y2):
162 | X1_test = X1[90:]
163 | y1_test = y1[90:]
164 | X2_test = X2[90:]
165 | y2_test = y2[90:]
166 | X_test = np.vstack((X1_test, X2_test))
167 | y_test = np.hstack((y1_test, y2_test))
168 | return X_test, y_test
169 |
170 | def plot_margin(X1_train, X2_train, clf):
171 | def f(x, w, b, c=0):
172 | # given x, return y such that [x,y] in on the line
173 | # w.x + b = c
174 | return (-w[0] * x - b + c) / w[1]
175 |
176 | pl.plot(X1_train[:,0], X1_train[:,1], "ro")
177 | pl.plot(X2_train[:,0], X2_train[:,1], "bo")
178 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g")
179 |
180 | # w.x + b = 0
181 | a0 = -4; a1 = f(a0, clf.w, clf.b)
182 | b0 = 4; b1 = f(b0, clf.w, clf.b)
183 | pl.plot([a0,b0], [a1,b1], "k")
184 |
185 | # w.x + b = 1
186 | a0 = -4; a1 = f(a0, clf.w, clf.b, 1)
187 | b0 = 4; b1 = f(b0, clf.w, clf.b, 1)
188 | pl.plot([a0,b0], [a1,b1], "k--")
189 |
190 | # w.x + b = -1
191 | a0 = -4; a1 = f(a0, clf.w, clf.b, -1)
192 | b0 = 4; b1 = f(b0, clf.w, clf.b, -1)
193 | pl.plot([a0,b0], [a1,b1], "k--")
194 |
195 | pl.axis("tight")
196 | pl.show()
197 |
198 | def plot_contour(X1_train, X2_train, clf):
199 | pl.plot(X1_train[:,0], X1_train[:,1], "ro")
200 | pl.plot(X2_train[:,0], X2_train[:,1], "bo")
201 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g")
202 |
203 | X1, X2 = np.meshgrid(np.linspace(-6,6,50), np.linspace(-6,6,50))
204 | X = np.array([[x1, x2] for x1, x2 in zip(np.ravel(X1), np.ravel(X2))])
205 | Z = clf.project(X).reshape(X1.shape)
206 | pl.contour(X1, X2, Z, [0.0], colors='k', linewidths=1, origin='lower')
207 | pl.contour(X1, X2, Z + 1, [0.0], colors='grey', linewidths=1, origin='lower')
208 | pl.contour(X1, X2, Z - 1, [0.0], colors='grey', linewidths=1, origin='lower')
209 |
210 | pl.axis("tight")
211 | pl.show()
212 |
213 | def test_linear():
214 | X1, y1, X2, y2 = gen_lin_separable_data()
215 | X_train, y_train = split_train(X1, y1, X2, y2)
216 | X_test, y_test = split_test(X1, y1, X2, y2)
217 |
218 | clf = SVM()
219 | clf.fit(X_train, y_train)
220 |
221 | y_predict = clf.predict(X_test)
222 | correct = np.sum(y_predict == y_test)
223 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
224 |
225 | plot_margin(X_train[y_train==1], X_train[y_train==-1], clf)
226 |
227 | def test_non_linear():
228 | X1, y1, X2, y2 = gen_non_lin_separable_data()
229 | X_train, y_train = split_train(X1, y1, X2, y2)
230 | X_test, y_test = split_test(X1, y1, X2, y2)
231 |
232 | clf = SVM(polynomial_kernel)
233 | clf.fit(X_train, y_train)
234 |
235 | y_predict = clf.predict(X_test)
236 | correct = np.sum(y_predict == y_test)
237 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
238 |
239 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
240 |
241 | def test_soft():
242 | X1, y1, X2, y2 = gen_lin_separable_overlap_data()
243 | X_train, y_train = split_train(X1, y1, X2, y2)
244 | X_test, y_test = split_test(X1, y1, X2, y2)
245 |
246 | clf = SVM(C=1000.1)
247 | clf.fit(X_train, y_train)
248 |
249 | y_predict = clf.predict(X_test)
250 | correct = np.sum(y_predict == y_test)
251 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
252 |
253 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
254 |
255 |
256 | test_linear()
257 | #test_non_linear()
258 | # test_soft()
259 | ################################the above example is a detailed application of convex optimisation (quadratic programming) to solving an SVM
260 | ##below is a simpler example of how QP and convex optimisation work:
261 | # Import the necessary packages
262 | import numpy
263 | from cvxopt import matrix
264 | from cvxopt import solvers
265 | ####Remember the standard QP form used by cvxopt: minimise (1/2)x^T*P*x + q^T*x, subject to Gx <= h and Ax = b
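266 | ####Added sketch (following the cvxopt QP tutorial linked in the header; the numbers are an
267 | ####illustrative assumption): minimise 2*x1^2 + x2^2 + x1*x2 + x1 + x2 subject to x1>=0, x2>=0, x1+x2=1
268 | P = matrix([[4.0, 1.0], [1.0, 2.0]])    # P must be positive semidefinite; note the 1/2 factor in the objective
269 | q = matrix([1.0, 1.0])
270 | G = matrix([[-1.0, 0.0], [0.0, -1.0]])  # -x1 <= 0, -x2 <= 0  (i.e. x1, x2 >= 0)
271 | h = matrix([0.0, 0.0])
272 | A = matrix([1.0, 1.0], (1, 2))          # x1 + x2 = 1
273 | b = matrix(1.0)
274 | sol = solvers.qp(P, q, G, h, A, b)
275 | print(sol['x'])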