├── Mean Shift-part 39,40,41,42.py ├── Mean Shift-part 39,40.py ├── Mean Shift-part 41,42.py ├── README.md ├── Regression part 10 and Part 11.py ├── SVM-PART 28,Kernel-part 29,30,31.py ├── SVM-part 20,21,22.py ├── SVM-part 25,26,27.py ├── Tic-Tac-Toe AI.py ├── cluster-part34,35,36,37.py ├── k means...nearest neighbours-part 13,14,15.py ├── k means...nearest neighbours-part 16,17,18,19.py ├── k-means from scratch-part 37,38.py ├── kernel,cvxopt-part 32.py ├── regress ion part 4 and part 5.py ├── regression part 12.py ├── regression part 6,7,8,9.py ├── regression-part 1 and 2.py └── svm(final)-sklearn-part 33.py /Mean Shift-part 39,40,41,42.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 27 19:01:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn.cluster import MeanShift 9 | from sklearn.datasets.samples_generator import make_blobs 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | from matplotlib import style 13 | style.use("ggplot") 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | from pandas.api.types import is_numeric_dtype 17 | ############Basic Visulisation of Mean Shift 18 | #centers = [[1,1,1],[5,5,5],[3,10,10]] 19 | # 20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5) 21 | # 22 | #ms = MeanShift() 23 | #ms.fit(X) 24 | #labels = ms.labels_ 25 | #cluster_centers = ms.cluster_centers_ 26 | # 27 | #print(cluster_centers) 28 | #n_clusters_ = len(np.unique(labels)) 29 | #print("Number of estimated clusters:", n_clusters_) 30 | # 31 | #colors = 10*['r','g','b','c','k','y','m'] 32 | #fig = plt.figure() 33 | #ax = fig.add_subplot(111, projection='3d') 34 | # 35 | #for i in range(len(X)): 36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o') 37 | # 38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2], 39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10) 40 | 41 | ###################################################################### 42 | # -*- coding: utf-8 -*- 43 | #############Mean Shift on Titanic Dataset 44 | df = pd.read_excel('titanic.xls') 45 | orginal_df=pd.DataFrame.copy(df) 46 | 47 | 48 | df.drop(['body','name'], 1, inplace=True) 49 | #df.convert_objects(convert_numeric=True) 50 | print(df.head()) 51 | df.fillna(0,inplace=True) 52 | 53 | def handle_non_numerical_data(df): 54 | 55 | # handling non-numerical data: must convert. 56 | columns = df.columns.values 57 | 58 | for column in columns: 59 | text_digit_vals = {} 60 | def convert_to_int(val): 61 | return text_digit_vals[val] 62 | 63 | #print(column,df[column].dtype) 64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 65 | 66 | column_contents = df[column].values.tolist() 67 | #finding just the uniques 68 | unique_elements = set(column_contents) 69 | # great, found them. 70 | x = 0 71 | for unique in unique_elements: 72 | if unique not in text_digit_vals: 73 | # creating dict that contains new 74 | # id per unique string 75 | text_digit_vals[unique] = x 76 | x+=1 77 | # now we map the new "id" vlaue 78 | # to replace the string. 79 | df[column] = list(map(convert_to_int,df[column])) 80 | 81 | return df 82 | 83 | df = handle_non_numerical_data(df) 84 | print(df.head()) 85 | 86 | # add/remove features just to see impact they have. 
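# Note: a minimal equivalent sketch, assuming a newer pandas release where the
# positional axis argument to drop() is deprecated in favour of keywords:
#   df.drop(columns=['ticket','home.dest'], inplace=True)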
87 | df.drop(['ticket','home.dest'], 1, inplace=True) 88 | 89 | 90 | X = np.array(df.drop(['survived'], 1).astype(float)) 91 | X = preprocessing.scale(X) 92 | y = np.array(df['survived']) 93 | 94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 95 | 96 | clf = MeanShift()##change this 97 | clf.fit(X) 98 | 99 | 100 | labels=clf.labels_ 101 | cluster_centers=clf.cluster_centers_ 102 | 103 | orginal_df["cluster_group"]=np.nan 104 | ##########in order to check survial for each individual cluster formed by MeanShift and check accuracy of clustes formed 105 | for i in range(len(X)): 106 | orginal_df["cluster_group"].iloc[i]=labels[i] 107 | n_clusters_=len(np.unique(labels)) 108 | survival_rates={}###to see survival rate for different classes 109 | for i in range(n_clusters_): 110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster 111 | # print(temp_df) 112 | survival_cluster=temp_df[(temp_df["survived"]==1)] 113 | survival_rate=len(survival_cluster)/len(temp_df) 114 | survival_rates[i]=survival_rate 115 | #print(orginal_df[(orginal_df["cluster_group"]==2)]) 116 | ###Now you can use df.describe() to analyse the data for different classes 117 | -------------------------------------------------------------------------------- /Mean Shift-part 39,40.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 27 19:01:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn.cluster import MeanShift 9 | from sklearn.datasets.samples_generator import make_blobs 10 | import matplotlib.pyplot as plt 11 | from mpl_toolkits.mplot3d import Axes3D 12 | from matplotlib import style 13 | style.use("ggplot") 14 | import pandas as pd 15 | from sklearn import preprocessing 16 | from pandas.api.types import is_numeric_dtype 17 | ############Basic Visulisation of Mean Shift 18 | #centers = [[1,1,1],[5,5,5],[3,10,10]] 19 | # 20 | #X, _ = make_blobs(n_samples = 100, centers = centers, cluster_std = 1.5) 21 | # 22 | #ms = MeanShift() 23 | #ms.fit(X) 24 | #labels = ms.labels_ 25 | #cluster_centers = ms.cluster_centers_ 26 | # 27 | #print(cluster_centers) 28 | #n_clusters_ = len(np.unique(labels)) 29 | #print("Number of estimated clusters:", n_clusters_) 30 | # 31 | #colors = 10*['r','g','b','c','k','y','m'] 32 | #fig = plt.figure() 33 | #ax = fig.add_subplot(111, projection='3d') 34 | # 35 | #for i in range(len(X)): 36 | # ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o') 37 | # 38 | #ax.scatter(cluster_centers[:,0],cluster_centers[:,1],cluster_centers[:,2], 39 | # marker="x",color='k', s=150, linewidths = 5, zorder=10) 40 | 41 | ###################################################################### 42 | # -*- coding: utf-8 -*- 43 | #############Mean Shift on Titanic Dataset 44 | df = pd.read_excel('titanic.xls') 45 | orginal_df=pd.DataFrame.copy(df) 46 | 47 | 48 | df.drop(['body','name'], 1, inplace=True) 49 | #df.convert_objects(convert_numeric=True) 50 | print(df.head()) 51 | df.fillna(0,inplace=True) 52 | 53 | def handle_non_numerical_data(df): 54 | 55 | # handling non-numerical data: must convert. 
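# A minimal alternative sketch, assuming sklearn's LabelEncoder is acceptable here;
# it performs the same kind of string-to-integer mapping as the loop below:
#   from sklearn.preprocessing import LabelEncoder
#   for col in df.select_dtypes(exclude=[np.int64, np.float64]).columns:
#       df[col] = LabelEncoder().fit_transform(df[col].astype(str))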
56 | columns = df.columns.values 57 | 58 | for column in columns: 59 | text_digit_vals = {} 60 | def convert_to_int(val): 61 | return text_digit_vals[val] 62 | 63 | #print(column,df[column].dtype) 64 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 65 | 66 | column_contents = df[column].values.tolist() 67 | #finding just the uniques 68 | unique_elements = set(column_contents) 69 | # great, found them. 70 | x = 0 71 | for unique in unique_elements: 72 | if unique not in text_digit_vals: 73 | # creating dict that contains new 74 | # id per unique string 75 | text_digit_vals[unique] = x 76 | x+=1 77 | # now we map the new "id" vlaue 78 | # to replace the string. 79 | df[column] = list(map(convert_to_int,df[column])) 80 | 81 | return df 82 | 83 | df = handle_non_numerical_data(df) 84 | print(df.head()) 85 | 86 | # add/remove features just to see impact they have. 87 | df.drop(['ticket','home.dest'], 1, inplace=True) 88 | 89 | 90 | X = np.array(df.drop(['survived'], 1).astype(float)) 91 | X = preprocessing.scale(X) 92 | y = np.array(df['survived']) 93 | 94 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 95 | 96 | clf = MeanShift()##change this 97 | clf.fit(X) 98 | 99 | 100 | labels=clf.labels_ 101 | cluster_centers=clf.cluster_centers_ 102 | 103 | orginal_df["cluster_group"]=np.nan 104 | ##########in order to check survial for each individual cluster formed by MeanShift and check accuracy of clustes formed 105 | for i in range(len(X)): 106 | orginal_df["cluster_group"].iloc[i]=labels[i] 107 | n_clusters_=len(np.unique(labels)) 108 | survival_rates={}###to see survival rate for different classes 109 | for i in range(n_clusters_): 110 | temp_df=orginal_df[(orginal_df["cluster_group"]==float(i))]###make a temp dataframe for each cluster 111 | # print(temp_df) 112 | survival_cluster=temp_df[(temp_df["survived"]==1)] 113 | survival_rate=len(survival_cluster)/len(temp_df) 114 | survival_rates[i]=survival_rate 115 | #print(orginal_df[(orginal_df["cluster_group"]==2)]) 116 | ###Now you can use df.describe() to analyse the data for different classes 117 | -------------------------------------------------------------------------------- /Mean Shift-part 41,42.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 28 12:43:31 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import matplotlib.pyplot as plt 8 | from matplotlib import style 9 | style.use("ggplot") 10 | import numpy as np 11 | from sklearn.cluster import KMeans 12 | import pandas as pd 13 | from sklearn import preprocessing 14 | from pandas.api.types import is_numeric_dtype 15 | from sklearn.datasets.samples_generator import make_blobs 16 | import random 17 | 18 | #####basic visulisation of k-means 19 | 20 | #plt.scatter(X[:,0],X[:,1],s=150) 21 | X,y=make_blobs(n_samples=50,centers=5,n_features=2) 22 | 23 | 24 | clf=KMeans(n_clusters=2) 25 | clf.fit(X) 26 | centroids=clf.cluster_centers_ 27 | labels=clf.labels_ 28 | colors=10*["g","r","c","b","k","o"] 29 | ######Making Meansift from scartch 30 | class Mean_Shift: 31 | def __init__(self,radius=None,radius_norm_step=100): 32 | self.radius=radius 33 | self.radius_norm_step=radius_norm_step 34 | def fit(self,data): 35 | 36 | if self.radius==None: 37 | all_data_centroid=np.average(data,axis=0) 38 | all_data_norm=np.linalg.norm(all_data_centroid) 39 | self.radius=all_data_norm/self.radius_norm_step 40 | 41 | 42 | centroids={} 43 | 44 | for i in 
range(len(data)): 45 | centroids[i]=data[i] 46 | weights = [i for i in range(self.radius_norm_step)][::-1] 47 | while True: 48 | new_centroids = [] 49 | for i in centroids: 50 | in_bandwidth = [] 51 | centroid = centroids[i] 52 | 53 | for featureset in data: 54 | #if np.linalg.norm(featureset-centroid) < self.radius: 55 | # in_bandwidth.append(featureset) 56 | distance = np.linalg.norm(featureset-centroid) 57 | if distance == 0: 58 | distance = 0.00000000001 59 | weight_index = int(distance/self.radius) 60 | if weight_index > self.radius_norm_step-1: 61 | weight_index = self.radius_norm_step-1 62 | 63 | to_add = (weights[weight_index]**2)*[featureset] 64 | in_bandwidth +=to_add 65 | 66 | 67 | new_centroid = np.average(in_bandwidth,axis=0) 68 | new_centroids.append(tuple(new_centroid)) 69 | 70 | uniques = sorted(list(set(new_centroids))) 71 | to_pop=[] 72 | for i in uniques: 73 | for ii in uniques: 74 | if i==ii: 75 | pass 76 | elif np.linalg.norm(np.array(i)-np.array(ii))<=self.radius: 77 | to_pop.append(ii) 78 | break 79 | for i in to_pop: 80 | try: 81 | uniques.remove(i) 82 | except: 83 | pass 84 | 85 | 86 | prev_centroids=dict(centroids) 87 | 88 | 89 | centroids={} 90 | for i in range(len(uniques)): 91 | centroids[i]=np.array(uniques[i]) 92 | optimized=True 93 | 94 | for i in centroids: 95 | if not np.array_equal(centroids[i],prev_centroids[i]): 96 | optimized=False 97 | 98 | if not optimized: 99 | break 100 | 101 | if optimized: 102 | break 103 | 104 | self.centroids=centroids 105 | 106 | self.classification={} 107 | for i in range(len(self.centroids)): 108 | self.classification[i]=[] 109 | for featureset in data: 110 | distances=[np.linalg.norm(featureset-self.centroids[centroid])for centroid in self.centroids] 111 | classification=distances.index(min(distances)) 112 | self.classification[classification].append(featureset) 113 | 114 | def predict(self,data): 115 | distances=[np.linalg.norm(data-self.centroids[centroid])for centroid in self.centroids] 116 | classification=distances.index(min(distances)) 117 | return classification 118 | clf=Mean_Shift() 119 | clf.fit(X) 120 | 121 | 122 | centroids=clf.centroids 123 | plt.scatter(X[:,0],X[:,1],s=150) 124 | 125 | for classification in clf.classification: 126 | color=colors[classification] 127 | for featureset in clf.classification[classification]: 128 | plt.scatter(featureset[0],featureset[1],marker="x",color=color,s=150) 129 | 130 | for c in centroids: 131 | plt.scatter(centroids[c][0],centroids[c][1],color="k",marker="*",s=150) 132 | 133 | plt.show() 134 | ##The below commented code is for when radius is hardcoded: 135 | #import matplotlib.pyplot as plt 136 | #from matplotlib import style 137 | #style.use('ggplot') 138 | #import numpy as np 139 | # 140 | #X = np.array([[1, 2], 141 | # [1.5, 1.8], 142 | # [5, 8 ], 143 | # [8, 8], 144 | # [1, 0.6], 145 | # [9,11], 146 | # [8,2], 147 | # [10,2], 148 | # [9,3],]) 149 | # 150 | ###plt.scatter(X[:,0], X[:,1], s=150) 151 | ###plt.show() 152 | # 153 | #colors = 10*["g","r","c","b","k"] 154 | # 155 | #class Mean_Shift: 156 | # def __init__(self, radius=4): 157 | # self.radius = radius 158 | # 159 | # def fit(self, data): 160 | # centroids = {} 161 | # 162 | # for i in range(len(data)): 163 | # centroids[i] = data[i] 164 | # 165 | # while True: 166 | # new_centroids = [] 167 | # for i in centroids: 168 | # in_bandwidth = [] 169 | # centroid = centroids[i] 170 | # for featureset in data: 171 | # if np.linalg.norm(featureset-centroid) < self.radius: 172 | # in_bandwidth.append(featureset) 173 | # 
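# (Note on this commented-out hard-radius variant: every point that falls within
#  self.radius contributes equally to the averaged centroid below, whereas the
#  active class above weights points by how close their distance bin is to the centroid.)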
174 | # new_centroid = np.average(in_bandwidth,axis=0)
175 | # new_centroids.append(tuple(new_centroid))
176 | #
177 | # uniques = sorted(list(set(new_centroids)))
178 | #
179 | # prev_centroids = dict(centroids)
180 | #
181 | # centroids = {}
182 | # for i in range(len(uniques)):
183 | # centroids[i] = np.array(uniques[i])
184 | #
185 | # optimized = True
186 | #
187 | # for i in centroids:
188 | # if not np.array_equal(centroids[i], prev_centroids[i]):
189 | # optimized = False
190 | # if not optimized:
191 | # break
192 | #
193 | # if optimized:
194 | # break
195 | #
196 | # self.centroids = centroids
197 | #
198 | #
199 | #
200 | #clf = Mean_Shift()
201 | #clf.fit(X)
202 | #
203 | #centroids = clf.centroids
204 | #
205 | #plt.scatter(X[:,0], X[:,1], s=150)
206 | #
207 | #for c in centroids:
208 | # plt.scatter(centroids[c][0], centroids[c][1], color='k', marker='*', s=150)
209 | #
210 | #plt.show()
211 |
212 |
213 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Algorithms
2 | Implementations of the major supervised and unsupervised machine learning algorithms.
3 |
4 | These files show my journey of learning machine learning. They are not a final, polished project; rather, they show how I learned the concepts and then tried to apply them.
5 |
6 | The files include both scikit-learn implementations and my own from-scratch constructions of these classical machine learning algorithms.
7 |
8 | Almost all of the main ML algorithms are covered (KNN, SVM, regression, mean shift).
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/Regression part 10 and Part 11.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Jan 3 14:15:33 2019
4 |
5 | @author: MMOHTASHIM
6 | """
7 |
8 | #Linear Regression Model from scratch:
9 | from statistics import mean
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from matplotlib import style
13 | import pylab
14 | style.use("fivethirtyeight")
15 | xs=np.array([1,2,3,4,5,6],dtype=np.float64)
16 | ys=np.array([5,4,6,5,6,7],dtype=np.float64)
17 |
18 | def best_fit_slope_and_intercept(xs,ys):
19 | m=((mean(xs)*mean(ys)) - (mean(xs*ys)))/((mean(xs)**2)-mean(xs**2))
20 | b=mean(ys)-m*mean(xs)
21 | return m,b
22 | m,b=best_fit_slope_and_intercept(xs,ys)
23 | print(m,b)
24 |
25 |
26 | def squared_error(ys_orgin,ys_line):
27 | return sum((ys_line-ys_orgin)**2)
28 | def coefficient_of_determination(ys_orgin,ys_line):
29 | y_mean_line=[mean(ys_orgin) for y in ys_orgin]
30 | square_error_regr=squared_error(ys_orgin,ys_line)
31 | square_error_regr_y_mean=squared_error(ys_orgin,y_mean_line)
32 | return 1-(square_error_regr)/(square_error_regr_y_mean)
33 | regression_line=[(m*x)+b for x in xs]
34 |
35 |
36 | r_square=coefficient_of_determination(ys,regression_line)
37 | print(r_square)
38 | predict_x=8
39 | predict_y=(m*predict_x)+b
40 | print(regression_line)
41 | plt.scatter(xs,ys)
42 | plt.scatter(predict_x,predict_y,color="g")
43 | plt.plot(xs,regression_line)
44 |
--------------------------------------------------------------------------------
/SVM-PART 28,Kernel-part 29,30,31.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 18 18:00:48 2019
4 |
5 | @author: 
MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | import numpy as np 11 | style.use("ggplot") 12 | 13 | class Support_Vector_Machine: 14 | def __init__(self, visualization=True): 15 | self.visualization = visualization 16 | self.colors = {1:'r',-1:'b'} 17 | if self.visualization: 18 | self.fig = plt.figure() 19 | self.ax = self.fig.add_subplot(1,1,1) 20 | ###traing the data to find w and b 21 | def fit(self,data): 22 | self.data=data 23 | ####{||w||:[w,b]} a dictionary which store for every modulus value of w , a associate vector w and b 24 | opt_dict={} 25 | 26 | 27 | ##These transforms are what we use to apply to vector w 28 | ##each time we step in order to know ever possible direction of a vector w and its 29 | ##associate b value whose value is affected by direction and store the highest b value, as modulus of w doesn't account 30 | #for direction,remeber in vector direction matters 31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]] 32 | 33 | 34 | 35 | 36 | 37 | all_data=[] 38 | ###this three loop takes all features of the associated class yi and make, 39 | ##a new list of these features and than take the max and min value associated with the 40 | ### this new list of feature and these max and min values are to be used for further convex optimization 41 | for yi in self.data: 42 | for featureset in self.data[yi]: 43 | for feature in featureset: 44 | all_data.append(feature) 45 | self.max_feature_value=max(all_data) 46 | self.min_feature_value=min(all_data) 47 | all_data=None 48 | step_size=[self.max_feature_value*0.1, 49 | self.max_feature_value*0.01, 50 | ##POINT OF EXPENSE 51 | self.max_feature_value*0.001] 52 | ###extremely expensive-b does not need to take precise step 53 | b_range_multiple=5 54 | 55 | #we dont need to take as small of steps 56 | #wit b as we do w 57 | b_multiple=5 58 | ###the first value of w and remeber to simplify things,we assume each element of vector w 59 | ### to be same 60 | latest_optimum=self.max_feature_value*10 61 | 62 | for step in step_size: 63 | ####remeber to simplify things,we assume each element of vector w 64 | ### to be same 65 | w=np.array([latest_optimum,latest_optimum]) 66 | 67 | 68 | #we can do this because convex alogrithm 69 | optimized=False 70 | while not optimized: 71 | ####setting a range for b 72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple), 73 | self.max_feature_value*b_range_multiple, 74 | step*b_multiple): 75 | for transformation in transforms: 76 | ##applying the different transformation to account for difeerent direction(w_t) 77 | w_t=w*transformation 78 | found_option=True 79 | #weakest link in the SVM fundamentally 80 | #SMO attempt to fix this a bit 81 | ##Running the data on all points is costly-svm weakness 82 | ##yi(xi.w+b)>=1(constraint) 83 | for i in self.data: 84 | for xi in self.data[i]: 85 | yi=i 86 | ###this condition check even if one point in our data doesnt fit the constraint with the give w vector 87 | if not yi*(np.dot(w_t,xi)+b)>=1: 88 | found_option=False 89 | #if w satisfies the constraint 90 | if found_option: 91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b] 92 | if w[0]<0: 93 | optimized=True 94 | print("optimized a step.") 95 | else: 96 | w=w-step 97 | #taking the smallest modulus w and taking new starting new point 98 | norms=sorted([n for n in opt_dict]) 99 | opt_choice=opt_dict[norms[0]] 100 | self.w=opt_choice[0] 101 | self.b=opt_choice[1] 102 | latest_optimum=opt_choice[0][0]+step*2 103 | def predict(self,features): 104 | ###sign(x.w+b) whatever 
the sign of the equation is 105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b) 106 | if classification != 0 and self.visualization: 107 | self.ax.scatter(features[0],features[1],s=200,marker='*', c=self.colors[classification]) 108 | return classification 109 | def visualize(self): 110 | [[self.ax.scatter(x[0],x[1],s=100,color=self.colors[i]) for x in data_dict[i]] for i in data_dict] 111 | def hyperplane(x,w,b,v): 112 | ###v=x.w+b 113 | ###the hyperplane function shows the support vector plannes and boudrt decision so: 114 | ###positive support vector(psv)=1 115 | ###nsv=-1 116 | ###decision boundary=0,want to find a plane with these associated v values and show them 117 | #hyperplane v=x.w+b 118 | ##x,y is an unknown point on the hyperplane 119 | # x_v and w_v are the vector 120 | # x_v= [x,y] 121 | # x_v.w_v+b =1 for postive sv 122 | ## this helps to find the value of y where value of hyperplance is 1 123 | return (-w[0]*x-b+v)/w[1] 124 | datarange = (self.min_feature_value*0.9,self.max_feature_value*1.1) 125 | hyp_x_min = datarange[0] 126 | hyp_x_max = datarange[1] 127 | #(w.x+b)=1 128 | #positive support vector hyperplane 129 | psv1=hyperplane(hyp_x_min,self.w,self.b,1) 130 | ##psv1 is going to be scalar value not vector and its going to be y given specific x and v value 131 | psv2=hyperplane(hyp_x_max,self.w,self.b,1) 132 | #ploting the associate coordinate of psv2 and psv1 to visualize the hyperplane where v is one ,remeber hyper equation is for y such that v is one 133 | self.ax.plot([hyp_x_min,hyp_x_max],[psv1,psv2],"k") 134 | ##doing the same thing and process for a value of v=-1: 135 | nsv1=hyperplane(hyp_x_min,self.w,self.b,-1) 136 | nsv2=hyperplane(hyp_x_max,self.w,self.b,-1) 137 | self.ax.plot([hyp_x_min,hyp_x_max],[nsv1,nsv2],"k") 138 | ###doing the same thing and process for a value of v=0: 139 | db1=hyperplane(hyp_x_min,self.w,self.b,0) 140 | db2=hyperplane(hyp_x_max,self.w,self.b,0) 141 | self.ax.plot([hyp_x_min,hyp_x_max],[db1,db2],"y--") 142 | 143 | #show the result 144 | plt.show() 145 | 146 | 147 | 148 | 149 | data_dict={-1:np.array([[1,7], 150 | [2,8], 151 | [3,8]]), 152 | 1:np.array([[5,1],[6,-1],[7,3]])} 153 | 154 | 155 | ###trial 1:@19:13 hours\ 156 | svm=Support_Vector_Machine() 157 | svm.fit(data=data_dict) 158 | predict_us=[[0,10],[1,3],[3,4],[3,5],[5,5],[6,-5],[5,8]] 159 | for p in predict_us: 160 | svm.predict(p) 161 | svm.visualize() 162 | ############################################-SVM COMPLETED:::: 163 | 164 | 165 | 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /SVM-part 20,21,22.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 11 19:21:35 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import numpy as np 9 | from sklearn import preprocessing,neighbors,svm 10 | from sklearn.model_selection import cross_validate,train_test_split 11 | import pandas as pd 12 | import pickle 13 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 14 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 15 | #lose rest of the data 16 | df.replace("?",-9999,inplace=True) 17 | 18 | ###check for any useless data and drop it 19 | df.drop(["id"],1,inplace=True) 20 | #### X are the features and y is the label 21 | X=np.array(df.drop(["class"],1)) 22 | print(X) 23 | y=np.array(df["class"]) 24 | 25 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 26 | #####Using the 
classifer 27 | clf=svm.SVC() 28 | clf.fit(X_train,y_train) 29 | ###Saving the classifer 30 | with open("K_model","wb") as f: 31 | pickle.dump(clf,f) 32 | #Remeber the difference between accuracy and confidecnce 33 | accuracy=clf.score(X_test,y_test) 34 | print(accuracy) 35 | ####make prediction 36 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]]) 37 | example_measures=np.array(predict_X) 38 | print(example_measures) 39 | ###To make the array shape that sklearn understands and matches the the X features 40 | predict=clf.predict(example_measures.reshape(len(example_measures),-1)) 41 | print(predict) 42 | 43 | 44 | 45 | ###########the part 23,24 was theory -------------------------------------------------------------------------------- /SVM-part 25,26,27.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 15 19:05:10 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | import numpy as np 11 | style.use("ggplot") 12 | 13 | class Support_Vector_Machine(object): 14 | def __init___(self,visulization=True): 15 | self.visulization=visulization 16 | self.colors={1:"r",-1:"b"} 17 | if self.visulization: 18 | self.fig=plt.figure() 19 | self.ax=self.fig.add_subplot(1,1,1) 20 | ###traing the data to find w and b 21 | def fit(self,data): 22 | self.data=data 23 | ####{||w||:[w,b]} a dictionary which store for every modulus value of w , a associate vector w and b 24 | opt_dict={} 25 | 26 | 27 | ##These transforms are what we use to apply to vector w 28 | ##each time we step in order to know ever possible direction of a vector w and its 29 | ##associate b value whose value is affected by direction and store the highest b value, as modulus of w doesn't account 30 | #for direction,remeber in vector direction matters 31 | transforms=[[1,1],[-1,1],[-1,-1],[1,-1]] 32 | 33 | 34 | 35 | 36 | 37 | all_data=[] 38 | ###this three loop takes all features of the associated class yi and make, 39 | ##a new list of these features and than take the max and min value associated with the 40 | ### this new list of feature and these max and min values are to be used for further convex optimization 41 | for yi in self.data: 42 | for featureset in self.data[yi]: 43 | for feature in featureset: 44 | all_data.append(feature) 45 | self.max_feature_value=max(all_data) 46 | self.min_feature_value=min(all_data) 47 | all_data=None 48 | step_size=[self.max_feature_value*0.1, 49 | self.max_feature_value*0.01, 50 | ##POINT OF EXPENSE 51 | self.max_feature_value*0.001] 52 | ###extremely expensive-b does not need to take precise step 53 | b_range_multiple=5 54 | 55 | #we dont need to take as small of steps 56 | #wit b as we do w 57 | b_multiple=5 58 | ###the first value of w and remeber to simplify things,we assume each element of vector w 59 | ### to be same 60 | latest_optimum=self.max_feature_value*10 61 | 62 | for step in step_size: 63 | ####remeber to simplify things,we assume each element of vector w 64 | ### to be same 65 | w=np.array([latest_optimum,latest_optimum]) 66 | 67 | 68 | #we can do this because convex alogrithm 69 | optimized=False 70 | while not optimized: 71 | ####setting a range for b 72 | for b in np.arange(-1*(self.max_feature_value*b_range_multiple), 73 | self.max_feature_value*b_range_multiple, 74 | step*b_multiple): 75 | for transformation in transforms: 76 | ##applying the different transformation to account for difeerent direction(w_t) 77 | 
w_t=w*transformation 78 | found_option=True 79 | #weakest link in the SVM fundamentally 80 | #SMO attempt to fix this a bit 81 | ##Running the data on all points is costly-svm weakness 82 | ##yi(xi.w+b)>=1(constraint) 83 | for i in self.data: 84 | for xi in self.data[i]: 85 | yi=i 86 | ###this condition check even if one point in our data doesnt fit the constraint with the give w vector 87 | if not yi*(np.dot(w_t,xi)+b)>=1: 88 | found_option=False 89 | #if w satisfies the constraint 90 | if found_option: 91 | opt_dict[np.linalg.norm(w_t)]=[w_t,b] 92 | if w[0]<0: 93 | optimized=True 94 | print("optimized a step.") 95 | else: 96 | w=w-step 97 | #taking the smallest modulus w and taking new starting new point 98 | norms=sorted([n for n in opt_dict]) 99 | opt_choice=opt_dict[norms[0]] 100 | self.w=opt_choice[0] 101 | self.b=opt_choice[1] 102 | latest_optimum=opt_choice[0][0]+step*2 103 | def predict(self,features): 104 | ###sign(x.w+b) whatever the sign of the equation is 105 | classification=np.sign(np.dot(np.array(features),self.w)+self.b) 106 | 107 | return classification 108 | 109 | 110 | 111 | 112 | 113 | 114 | data_dict={-1:np.array([[1,7], 115 | [2,8], 116 | [3,8]]), 117 | 1:np.array([[5,1],[6,-1],[7,3]])} 118 | -------------------------------------------------------------------------------- /Tic-Tac-Toe AI.py: -------------------------------------------------------------------------------- 1 | 15# -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Feb 15 23:11:53 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | board=[" " for x in range(10)] 9 | 10 | 11 | def insertLetter(letter,pos): 12 | board[pos]=letter 13 | def spaceisFree(pos): 14 | return board[pos]==" " 15 | def printBoard(board): 16 | print(' | | ') 17 | print(""+board[1]+ " | "+ board[2]+ " | "+ board[3]) 18 | print(' | |') 19 | print("----------") 20 | print(' | | ') 21 | print(""+board[4]+ " | "+ board[5]+ " | "+ board[6]) 22 | print(' | |') 23 | print("----------") 24 | print(' | |') 25 | print(""+board[7]+ " | "+ board[8]+ " | "+ board[9]) 26 | print(' | |') 27 | def isWinner(bo,le):##sorry for the long line 28 | return (bo[7]==le and bo[8]==le and bo[9]==le) or (bo[4]==le and bo[5]==le and bo[6]==le) or (bo[1]==le and bo[2]==le and bo[3]==le) or (bo[1]==le and bo[4]==le and bo[7]==le) or (bo[2]==le and bo[5]==le and bo[8]==le) or (bo[3]==le and bo[6]==le and bo[9]==le) or (bo[1]==le and bo[5]==le and bo[9]==le) or (bo[3]==le and bo[5]==le and bo[7]==le) 29 | def playerMove(): 30 | run=True 31 | while run: 32 | move=input("please select a position to place an X (1-9): ") 33 | try: 34 | move=int(move) 35 | if move>0 and move<10: 36 | if spaceisFree(move): 37 | run=False 38 | insertLetter("X",move) 39 | else: 40 | print("This space is occupied") 41 | else: 42 | print("Type a number witin the range") 43 | except: 44 | print("Type a number") 45 | 46 | def compMove(): 47 | possibleMoves=[x for x,letter in enumerate(board) if letter==" " and x!=0] 48 | move=0 49 | 50 | for let in ["O","X"]: 51 | for i in possibleMoves: 52 | boardCopy=board[:] 53 | boardCopy[i]=let 54 | if isWinner(boardCopy,let): 55 | move=i 56 | return move 57 | 58 | cornersOpen=[] 59 | for i in possibleMoves: 60 | if i in [1,3,7,9]: 61 | cornersOpen.append(i) 62 | if len(cornersOpen)>0: 63 | move=selectRandom(cornersOpen) 64 | return move 65 | 66 | if 5 in possibleMoves: 67 | move=5 68 | return move 69 | edgesOpen=[] 70 | for i in possibleMoves: 71 | if i in [2,4,6,8]: 72 | edgesOpen.append(i) 73 | if len(edgesOpen)>0: 74 | move=selectRandom(edgesOpen) 75 | 
return move 76 | 77 | def selectRandom(li): 78 | import random 79 | In =len(li) 80 | r=random.randrange(0,In) 81 | return li[r] 82 | def isBoardFull(): 83 | if board.count(" ")>1: 84 | return False 85 | else: 86 | return True 87 | def main(): 88 | print("Welcome to Tic Tac Toe") 89 | printBoard(board) 90 | 91 | while not (isBoardFull()): 92 | if not isWinner(board,'O'): 93 | playerMove() 94 | printBoard(board) 95 | else: 96 | print("Sorry AI win the game") 97 | break 98 | if not isWinner(board,'X'): 99 | move=compMove() 100 | if move==0: 101 | print("Tie Game!") 102 | else: 103 | insertLetter("O",move) 104 | print("computer placed an O in position ,move") 105 | printBoard(board) 106 | else: 107 | print("Yeap You ARE smarter than my AI") 108 | break 109 | if isBoardFull(): 110 | print("Tie Game") 111 | main() -------------------------------------------------------------------------------- /cluster-part34,35,36,37.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 25 15:06:06 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | from matplotlib import style 10 | style.use("ggplot") 11 | import numpy as np 12 | from sklearn.cluster import KMeans 13 | import pandas as pd 14 | from sklearn import preprocessing 15 | from pandas.api.types import is_numeric_dtype 16 | 17 | #####basic visulisation of k-means 18 | X=np.array([[1,2], 19 | [1.5,1.8], 20 | [5,8], 21 | [8,8], 22 | [1,0.6], 23 | [9,11]]) 24 | plt.scatter(X[:,0],X[:,1],s=150) 25 | 26 | 27 | 28 | clf=KMeans(n_clusters=2) 29 | clf.fit(X) 30 | centroids=clf.cluster_centers_ 31 | labels=clf.labels_ 32 | colors=["g.","r.","c.","b.","k.","o."] 33 | print(centroids) 34 | print(labels) 35 | for i in range(len(X)): 36 | plt.plot(X[i][0],X[i][1],colors[labels[i]],markersize=25) 37 | plt.scatter(centroids[:,0],centroids[:,1],marker="x",s=150) 38 | plt.show() 39 | ################################################################## 40 | ###Analysing Titanic dataset through K-means 41 | df=pd.read_excel("titanic.xls") 42 | df.drop(["body","name"],1,inplace=True) 43 | df.apply(pd.to_numeric, errors='ignore') 44 | df.fillna(0,inplace=True) 45 | c=df["age"].values.tolist() 46 | ###in order to convert text data to useable numeric data 47 | def handle_numeric_data(df): 48 | columns=df.columns.values 49 | # dtypes=dict(df.dtypes) 50 | for column in columns: 51 | text_digit_vals={} 52 | # dtype=dtypes[column] 53 | def convert_to_int(val): 54 | return text_digit_vals[val] 55 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 56 | columns_contents=df[column].values.tolist() 57 | 58 | unique_element=set(columns_contents) 59 | x=0 60 | for unique in unique_element: 61 | if unique not in text_digit_vals: 62 | text_digit_vals[unique]=x 63 | x+=1 64 | 65 | df[column]=list(map(convert_to_int,df[column])) 66 | return df 67 | df=handle_numeric_data(df) 68 | #################################################################### 69 | 70 | #df.drop(["sex","boat"],1,inplace=True) 71 | X=np.array(df.drop(["survived"],1),dtype=float) 72 | X=preprocessing.scale(X) 73 | y=np.array(df["survived"]) 74 | 75 | 76 | clf=KMeans(n_clusters=2) 77 | clf.fit(X) 78 | 79 | ########In unsupervised learning we do not have labels so we are going to use only X to fit the data 80 | ########then KMeans would label the data into two groups and to check accuracy of that labelling we will see 81 | #######what was the prediction of the KMeans and compare with 
binary representation of our y. 82 | correct=0 83 | labels=clf.labels_###use this 84 | print(labels) 85 | for i in range(len(X)): 86 | predict_me=np.array(X[i],dtype=float) 87 | predict_me=predict_me.reshape(-1,len(predict_me)) 88 | prediction=clf.predict(predict_me)###what does the label given by classifer 89 | if prediction[0]==y[i]:####did it label correctly 90 | correct+=1 91 | print(correct/len(X)) 92 | 93 | -------------------------------------------------------------------------------- /k means...nearest neighbours-part 13,14,15.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 5 13:45:02 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | import numpy as np 8 | from sklearn import preprocessing,neighbors 9 | from sklearn.model_selection import cross_validate,train_test_split 10 | import pandas as pd 11 | import pickle 12 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 13 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 14 | #lose rest of the data 15 | df.replace("?",-9999,inplace=True) 16 | 17 | ###check for any useless data and drop it 18 | df.drop(["id"],1,inplace=True) 19 | #### X are the features and y is the label 20 | X=np.array(df.drop(["class"],1)) 21 | print(X) 22 | y=np.array(df["class"]) 23 | 24 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 25 | #####Using the classifer 26 | clf=neighbors.KNeighborsClassifier() 27 | clf.fit(X_train,y_train) 28 | ###Saving the classifer 29 | with open("K_model","wb") as f: 30 | pickle.dump(clf,f) 31 | #Remeber the difference between accuracy and confidecnce 32 | accuracy=clf.score(X_test,y_test) 33 | print(accuracy) 34 | ####make prediction 35 | predict_X=np.array([[4,2,1,1,1,2,3,2,1],[4,2,2,1,2,2,3,2,1]]) 36 | example_measures=np.array(predict_X) 37 | print(example_measures) 38 | ###To make the array shape that sklearn understands and matches the the X features 39 | predict=clf.predict(example_measures.reshape(len(example_measures),-1)) 40 | print(predict) 41 | 42 | 43 | 44 | 45 | 46 | 47 | ############################### 48 | #K-model from scratch: 49 | from math import sqrt 50 | #### Euclidean_Distance Basic Formula is: 51 | plot1=[1,3] 52 | plot2=[2,5] 53 | euclidean_distance=sqrt(((plot1[0]-plot2[0])**2)+((plot1[1]-plot2[1])**2)) 54 | print(euclidean_distance) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /k means...nearest neighbours-part 16,17,18,19.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 8 16:43:39 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | #K-model from scratch: 9 | from math import sqrt 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from matplotlib import style 13 | from collections import Counter 14 | import warnings 15 | import random 16 | import pandas as pd 17 | ###creating a dataset with labels and features 18 | dataset={"k":[[1,2],[2,3],[3,1]],"r":[[6,5],[7,7],[8,6]]} 19 | ##feature to be classified 20 | new_feature=[5,7] 21 | ####Looping over to scatter the plot 22 | #for i in dataset: 23 | # for ii in dataset[i]: 24 | # [plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] 25 | ###More pythontic way 26 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] 27 | #plt.scatter(new_feature[0],new_feature[1]) 28 | #plt.show() 29 | ###k alogorithm 30 | def 
k_nearest_neighbours(data,predict,k=3): 31 | if len(data) >=k: 32 | warnings.warn("K is set to a value less than total voting groups!") 33 | distances=[] 34 | for group in data: 35 | for features in data[group]: 36 | # euclidean_distance=np.sqrt(np.sum((np.array(features)-np.array(predict))**2)) or better: 37 | ###numpy fomula 38 | euclidean_distance=np.linalg.norm(np.array(features)-np.array(predict)) 39 | #####Making the euclidean distance list to sort later 40 | distances.append([euclidean_distance,group]) 41 | ###calculating votes to help us classify-lowest distance 42 | votes=[i[1] for i in sorted(distances)[:k]] 43 | votes_result=Counter(votes).most_common(1)[0][0] 44 | ###confidence measure how confident our classifer is about one single point in labelling that point-that is what porppotion of votes were infavour 45 | confidence=Counter(votes).most_common(1)[0][1]/k 46 | return votes_result,confidence 47 | 48 | #result=k_nearest_neighbours(dataset,new_feature,k=3) 49 | #print(result) 50 | ###showing the result,color is already k an r as variables. 51 | #[[plt.scatter(ii[0],ii[1],s=100,color=i) for ii in dataset[i]] for i in dataset] 52 | #plt.scatter(new_feature[0],new_feature[1],color=result) 53 | #plt.show() 54 | ######################Comparing our model vs sklearn model 55 | #df=pd.read_csv("breast-cancer-wisconsin.data.txt") 56 | #df.replace("?",-9999,inplace=True) 57 | #df.drop(["id"],1,inplace=True) 58 | #full_data=df.astype(float).values.tolist() 59 | ######shuffling the inner lists of full_data 60 | #random.shuffle(full_data) 61 | # 62 | #########dividing the full data into train data and test data 63 | #test_size=0.4 64 | #train_set= {2:[],4:[]} 65 | #test_set={2:[],4:[]} 66 | #train_data=full_data[:-int(test_size*len(full_data))] 67 | #test_data=full_data[-int(test_size*len(full_data)):] 68 | # 69 | #for i in train_data: 70 | # ####associating the datas to classifiers in this case 2 or 4 71 | # train_set[i[-1]].append(i[:-1]) 72 | #for i in test_data: 73 | # ####associating the datas to classifiers in this case 2 or 4 74 | # test_set[i[-1]].append(i[:-1]) 75 | #correct=0 76 | #total=0 77 | #for group in test_set: 78 | # for data in test_set[group]: 79 | # vote,confidence=k_nearest_neighbours(train_set,data,k=5) 80 | # if group == vote: 81 | # correct+=1 82 | # else: 83 | # print(confidence) 84 | # total+=1 85 | # 86 | #print("Accuracy: ",correct/total) 87 | #########################copying the whole algorith down again to judge the accuracy in a numbe of trials: 88 | accuracies=[] 89 | n=25 90 | for i in range(n): 91 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 92 | df.replace("?",-9999,inplace=True) 93 | df.drop(["id"],1,inplace=True) 94 | full_data=df.astype(float).values.tolist() 95 | #####shuffling the inner lists of full_data 96 | random.shuffle(full_data) 97 | 98 | ########dividing the full data into train data and test data 99 | test_size=0.4 100 | train_set= {2:[],4:[]} 101 | test_set={2:[],4:[]} 102 | train_data=full_data[:-int(test_size*len(full_data))] 103 | test_data=full_data[-int(test_size*len(full_data)):] 104 | 105 | for i in train_data: 106 | ####associating the datas to classifiers in this case 2 or 4 107 | train_set[i[-1]].append(i[:-1]) 108 | for i in test_data: 109 | ####associating the datas to classifiers in this case 2 or 4 110 | test_set[i[-1]].append(i[:-1]) 111 | correct=0 112 | total=0 113 | for group in test_set: 114 | for data in test_set[group]: 115 | vote,confidence=k_nearest_neighbours(train_set,data,k=5) 116 | if group == vote: 117 
| correct+=1 118 | total+=1 119 | accuracies.append(correct/total) 120 | print("overall_accuracy(our algorithm) for", n ," steps = ", sum(accuracies)/len(accuracies)) 121 | ##############finally getting the sklearn algorithm and comparing it with our overall accuracy for a specific number of steps: 122 | accuracies_2=[] 123 | for i in range(n): 124 | from sklearn.model_selection import train_test_split 125 | from sklearn import neighbors 126 | df=pd.read_csv("breast-cancer-wisconsin.data.txt") 127 | ##fill missing data,-9999 will be treated as outlier in our algorithm and dont, 128 | #lose rest of the data 129 | df.replace("?",-9999,inplace=True) 130 | ###check for any useless data and drop it 131 | df.drop(["id"],1,inplace=True) 132 | #### X are the features and y is the label 133 | X=np.array(df.drop(["class"],1)) 134 | y=np.array(df["class"]) 135 | X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2) 136 | #####Using the classifer 137 | clf=neighbors.KNeighborsClassifier() 138 | clf.fit(X_train,y_train) 139 | #Remeber the difference between accuracy and confidecnce 140 | accuracy=clf.score(X_test,y_test) 141 | accuracies_2.append(accuracy) 142 | print("overall_accuracy(sk-learn alogorithm) for", n ," steps = ", sum(accuracies_2)/len(accuracies_2)) 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /k-means from scratch-part 37,38.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Jan 26 17:11:21 2019 4 | 5 | @author: MMOHTASHIM 6 | """ 7 | 8 | ########################################## 9 | ###########-k-mean from scratch 10 | import matplotlib.pyplot as plt 11 | from matplotlib import style 12 | style.use("ggplot") 13 | import numpy as np 14 | from sklearn.cluster import KMeans 15 | import pandas as pd 16 | from sklearn import preprocessing 17 | from pandas.api.types import is_numeric_dtype 18 | 19 | #####basic visulisation 20 | #X=np.array([[1,2], 21 | # [1.5,1.8], 22 | # [5,8], 23 | # [8,8], 24 | # [1,0.6], 25 | # [9,11]]) 26 | 27 | #plt.scatter(X[:,0],X[:,1],s=150) 28 | 29 | colors=["g","r","c","b","k"] 30 | 31 | class K_Means: 32 | def __init__(self, k=2, tol=0.001, max_iter=300): 33 | self.k = k 34 | self.tol = tol 35 | self.max_iter = max_iter 36 | 37 | def fit(self,data): 38 | 39 | self.centroids = {} 40 | 41 | for i in range(self.k): 42 | self.centroids[i] = data[i] 43 | 44 | for i in range(self.max_iter): 45 | self.classifications = {} 46 | 47 | for i in range(self.k): 48 | self.classifications[i] = [] 49 | 50 | for featureset in data: 51 | distances = [np.linalg.norm(featureset-self.centroids[centroid]) for centroid in self.centroids] 52 | classification = distances.index(min(distances)) 53 | self.classifications[classification].append(featureset) 54 | 55 | prev_centroids = dict(self.centroids) 56 | 57 | for classification in self.classifications: 58 | self.centroids[classification] = np.average(self.classifications[classification],axis=0) 59 | 60 | optimized = True 61 | 62 | for c in self.centroids: 63 | original_centroid = prev_centroids[c] 64 | current_centroid = self.centroids[c] 65 | if np.sum((current_centroid-original_centroid)/original_centroid*100.0) > self.tol: 66 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0)) 67 | optimized = False 68 | 69 | if optimized: 70 | print(np.sum((current_centroid-original_centroid)/original_centroid*100.0)) 71 | break 72 | 73 | def predict(self,data): 74 
| distances = [np.linalg.norm(data-self.centroids[centroid]) for centroid in self.centroids] 75 | classification = distances.index(min(distances)) 76 | return classification 77 | ###This code for basic visulisation of K_mean in 2d. 78 | #clf = K_Means() 79 | #clf.fit(X) 80 | 81 | #for centroid in clf.centroids: 82 | # plt.scatter(clf.centroids[centroid][0], clf.centroids[centroid][1], 83 | # marker="o", color="k", s=150, linewidths=5) 84 | # 85 | #for classification in clf.classifications: 86 | # color = colors[classification] 87 | # for featureset in clf.classifications[classification]: 88 | # plt.scatter(featureset[0], featureset[1], marker="x", color=color, s=150, linewidths=5) 89 | 90 | 91 | #unknowns=np.array([[1,3], 92 | # [8,9], 93 | # [0,3], 94 | # [5,4], 95 | # [6,4]]) 96 | #for unknown in unknowns: 97 | # classification=clf.predict(unknown) 98 | # plt.scatter(unknown[0],unknown[1],marker="*",color=colors[classification],s=150,linewidths=5) 99 | 100 | ###below is to compare our alg with sklearn 101 | # https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls 102 | df = pd.read_excel('titanic.xls') 103 | df.drop(['body','name'], 1, inplace=True) 104 | #df.convert_objects(convert_numeric=True) 105 | print(df.head()) 106 | df.fillna(0,inplace=True) 107 | 108 | def handle_non_numerical_data(df): 109 | 110 | # handling non-numerical data: must convert. 111 | columns = df.columns.values 112 | 113 | for column in columns: 114 | text_digit_vals = {} 115 | def convert_to_int(val): 116 | return text_digit_vals[val] 117 | 118 | #print(column,df[column].dtype) 119 | if df[column].dtype != np.int64 and df[column].dtype != np.float64: 120 | 121 | column_contents = df[column].values.tolist() 122 | #finding just the uniques 123 | unique_elements = set(column_contents) 124 | # great, found them. 125 | x = 0 126 | for unique in unique_elements: 127 | if unique not in text_digit_vals: 128 | # creating dict that contains new 129 | # id per unique string 130 | text_digit_vals[unique] = x 131 | x+=1 132 | # now we map the new "id" vlaue 133 | # to replace the string. 134 | df[column] = list(map(convert_to_int,df[column])) 135 | 136 | return df 137 | 138 | df = handle_non_numerical_data(df) 139 | print(df.head()) 140 | 141 | # add/remove features just to see impact they have. 
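# Note on the evaluation loop further below: K_Means assigns its two cluster ids
# (0 and 1) arbitrarily, so the printed score can come out as either p or 1-p.
# A minimal, hypothetical fix is to report the permutation-invariant value:
#   score = correct/len(X)
#   print(max(score, 1 - score))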
142 | df.drop(['ticket','home.dest'], 1, inplace=True) 143 | 144 | 145 | X = np.array(df.drop(['survived'], 1).astype(float)) 146 | X = preprocessing.scale(X) 147 | y = np.array(df['survived']) 148 | 149 | #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.5) 150 | 151 | clf = K_Means()##change this 152 | clf.fit(X) 153 | 154 | correct = 0 155 | for i in range(len(X)): 156 | 157 | predict_me = np.array(X[i].astype(float)) 158 | predict_me = predict_me.reshape(-1, len(predict_me)) 159 | prediction = clf.predict(predict_me) 160 | if prediction == y[i]: 161 | correct += 1 162 | 163 | 164 | print(correct/len(X)) 165 | 166 | 167 | -------------------------------------------------------------------------------- /kernel,cvxopt-part 32.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jan 23 12:16:33 2019 4 | 5 | @author: Not my work 6 | """ 7 | 8 | # Mathieu Blondel, September 2010 9 | # License: BSD 3 clause 10 | # http://www.mblondel.org/journal/2010/09/19/support-vector-machines-in-python/ 11 | 12 | # visualizing what translating to another dimension does 13 | # and bringing back to 2D: 14 | # https://www.youtube.com/watch?v=3liCbRZPrZA 15 | 16 | # Docs: http://cvxopt.org/userguide/coneprog.html#quadratic-programming 17 | # Docs qp example: http://cvxopt.org/examples/tutorial/qp.html 18 | 19 | # Nice tutorial: 20 | # https://courses.csail.mit.edu/6.867/wiki/images/a/a7/Qp-cvxopt.pdf 21 | 22 | 23 | import numpy as np 24 | from numpy import linalg 25 | import cvxopt 26 | import cvxopt.solvers 27 | 28 | def linear_kernel(x1, x2): 29 | return np.dot(x1, x2) 30 | 31 | def polynomial_kernel(x, y, p=3): 32 | return (1 + np.dot(x, y)) ** p 33 | 34 | def gaussian_kernel(x, y, sigma=5.0): 35 | return np.exp(-linalg.norm(x-y)**2 / (2 * (sigma ** 2))) 36 | 37 | class SVM(object): 38 | 39 | def __init__(self, kernel=linear_kernel, C=None): 40 | self.kernel = kernel 41 | self.C = C 42 | if self.C is not None: self.C = float(self.C) 43 | 44 | def fit(self, X, y): 45 | n_samples, n_features = X.shape 46 | 47 | # Gram matrix 48 | K = np.zeros((n_samples, n_samples)) 49 | for i in range(n_samples): 50 | for j in range(n_samples): 51 | K[i,j] = self.kernel(X[i], X[j]) 52 | 53 | P = cvxopt.matrix(np.outer(y,y) * K) 54 | q = cvxopt.matrix(np.ones(n_samples) * -1) 55 | A = cvxopt.matrix(y, (1,n_samples)) 56 | b = cvxopt.matrix(0.0) 57 | 58 | if self.C is None: 59 | G = cvxopt.matrix(np.diag(np.ones(n_samples) * -1)) 60 | h = cvxopt.matrix(np.zeros(n_samples)) 61 | else: 62 | tmp1 = np.diag(np.ones(n_samples) * -1) 63 | tmp2 = np.identity(n_samples) 64 | G = cvxopt.matrix(np.vstack((tmp1, tmp2))) 65 | tmp1 = np.zeros(n_samples) 66 | tmp2 = np.ones(n_samples) * self.C 67 | h = cvxopt.matrix(np.hstack((tmp1, tmp2))) 68 | 69 | # solve QP problem 70 | solution = cvxopt.solvers.qp(P, q, G, h, A, b) 71 | 72 | # Lagrange multipliers 73 | a = np.ravel(solution['x']) 74 | 75 | # Support vectors have non zero lagrange multipliers 76 | sv = a > 1e-5 77 | ind = np.arange(len(a))[sv] 78 | self.a = a[sv] 79 | self.sv = X[sv] 80 | self.sv_y = y[sv] 81 | print("%d support vectors out of %d points" % (len(self.a), n_samples)) 82 | 83 | # Intercept 84 | self.b = 0 85 | for n in range(len(self.a)): 86 | self.b += self.sv_y[n] 87 | self.b -= np.sum(self.a * self.sv_y * K[ind[n],sv]) 88 | self.b /= len(self.a) 89 | 90 | # Weight vector 91 | if self.kernel == linear_kernel: 92 | self.w = np.zeros(n_features) 93 | 
for n in range(len(self.a)): 94 | self.w += self.a[n] * self.sv_y[n] * self.sv[n] 95 | else: 96 | self.w = None 97 | 98 | def project(self, X): 99 | if self.w is not None: 100 | return np.dot(X, self.w) + self.b 101 | else: 102 | y_predict = np.zeros(len(X)) 103 | for i in range(len(X)): 104 | s = 0 105 | for a, sv_y, sv in zip(self.a, self.sv_y, self.sv): 106 | s += a * sv_y * self.kernel(X[i], sv) 107 | y_predict[i] = s 108 | return y_predict + self.b 109 | 110 | def predict(self, X): 111 | return np.sign(self.project(X)) 112 | 113 | if __name__ == "__main__": 114 | import pylab as pl 115 | 116 | def gen_lin_separable_data(): 117 | # generate training data in the 2-d case 118 | mean1 = np.array([0, 2]) 119 | mean2 = np.array([2, 0]) 120 | cov = np.array([[0.8, 0.6], [0.6, 0.8]]) 121 | X1 = np.random.multivariate_normal(mean1, cov, 100) 122 | y1 = np.ones(len(X1)) 123 | X2 = np.random.multivariate_normal(mean2, cov, 100) 124 | y2 = np.ones(len(X2)) * -1 125 | return X1, y1, X2, y2 126 | 127 | def gen_non_lin_separable_data(): 128 | mean1 = [-1, 2] 129 | mean2 = [1, -1] 130 | mean3 = [4, -4] 131 | mean4 = [-4, 4] 132 | cov = [[1.0,0.8], [0.8, 1.0]] 133 | X1 = np.random.multivariate_normal(mean1, cov, 50) 134 | X1 = np.vstack((X1, np.random.multivariate_normal(mean3, cov, 50))) 135 | y1 = np.ones(len(X1)) 136 | X2 = np.random.multivariate_normal(mean2, cov, 50) 137 | X2 = np.vstack((X2, np.random.multivariate_normal(mean4, cov, 50))) 138 | y2 = np.ones(len(X2)) * -1 139 | return X1, y1, X2, y2 140 | 141 | def gen_lin_separable_overlap_data(): 142 | # generate training data in the 2-d case 143 | mean1 = np.array([0, 2]) 144 | mean2 = np.array([2, 0]) 145 | cov = np.array([[1.5, 1.0], [1.0, 1.5]]) 146 | X1 = np.random.multivariate_normal(mean1, cov, 100) 147 | y1 = np.ones(len(X1)) 148 | X2 = np.random.multivariate_normal(mean2, cov, 100) 149 | y2 = np.ones(len(X2)) * -1 150 | return X1, y1, X2, y2 151 | 152 | def split_train(X1, y1, X2, y2): 153 | X1_train = X1[:90] 154 | y1_train = y1[:90] 155 | X2_train = X2[:90] 156 | y2_train = y2[:90] 157 | X_train = np.vstack((X1_train, X2_train)) 158 | y_train = np.hstack((y1_train, y2_train)) 159 | return X_train, y_train 160 | 161 | def split_test(X1, y1, X2, y2): 162 | X1_test = X1[90:] 163 | y1_test = y1[90:] 164 | X2_test = X2[90:] 165 | y2_test = y2[90:] 166 | X_test = np.vstack((X1_test, X2_test)) 167 | y_test = np.hstack((y1_test, y2_test)) 168 | return X_test, y_test 169 | 170 | def plot_margin(X1_train, X2_train, clf): 171 | def f(x, w, b, c=0): 172 | # given x, return y such that [x,y] in on the line 173 | # w.x + b = c 174 | return (-w[0] * x - b + c) / w[1] 175 | 176 | pl.plot(X1_train[:,0], X1_train[:,1], "ro") 177 | pl.plot(X2_train[:,0], X2_train[:,1], "bo") 178 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g") 179 | 180 | # w.x + b = 0 181 | a0 = -4; a1 = f(a0, clf.w, clf.b) 182 | b0 = 4; b1 = f(b0, clf.w, clf.b) 183 | pl.plot([a0,b0], [a1,b1], "k") 184 | 185 | # w.x + b = 1 186 | a0 = -4; a1 = f(a0, clf.w, clf.b, 1) 187 | b0 = 4; b1 = f(b0, clf.w, clf.b, 1) 188 | pl.plot([a0,b0], [a1,b1], "k--") 189 | 190 | # w.x + b = -1 191 | a0 = -4; a1 = f(a0, clf.w, clf.b, -1) 192 | b0 = 4; b1 = f(b0, clf.w, clf.b, -1) 193 | pl.plot([a0,b0], [a1,b1], "k--") 194 | 195 | pl.axis("tight") 196 | pl.show() 197 | 198 | def plot_contour(X1_train, X2_train, clf): 199 | pl.plot(X1_train[:,0], X1_train[:,1], "ro") 200 | pl.plot(X2_train[:,0], X2_train[:,1], "bo") 201 | pl.scatter(clf.sv[:,0], clf.sv[:,1], s=100, c="g") 202 | 203 | X1, X2 = 
np.meshgrid(np.linspace(-6,6,50), np.linspace(-6,6,50))
204 | X = np.array([[x1, x2] for x1, x2 in zip(np.ravel(X1), np.ravel(X2))])
205 | Z = clf.project(X).reshape(X1.shape)
206 | pl.contour(X1, X2, Z, [0.0], colors='k', linewidths=1, origin='lower')
207 | pl.contour(X1, X2, Z + 1, [0.0], colors='grey', linewidths=1, origin='lower')
208 | pl.contour(X1, X2, Z - 1, [0.0], colors='grey', linewidths=1, origin='lower')
209 |
210 | pl.axis("tight")
211 | pl.show()
212 |
213 | def test_linear():
214 | X1, y1, X2, y2 = gen_lin_separable_data()
215 | X_train, y_train = split_train(X1, y1, X2, y2)
216 | X_test, y_test = split_test(X1, y1, X2, y2)
217 |
218 | clf = SVM()
219 | clf.fit(X_train, y_train)
220 |
221 | y_predict = clf.predict(X_test)
222 | correct = np.sum(y_predict == y_test)
223 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
224 |
225 | plot_margin(X_train[y_train==1], X_train[y_train==-1], clf)
226 |
227 | def test_non_linear():
228 | X1, y1, X2, y2 = gen_non_lin_separable_data()
229 | X_train, y_train = split_train(X1, y1, X2, y2)
230 | X_test, y_test = split_test(X1, y1, X2, y2)
231 |
232 | clf = SVM(polynomial_kernel)
233 | clf.fit(X_train, y_train)
234 |
235 | y_predict = clf.predict(X_test)
236 | correct = np.sum(y_predict == y_test)
237 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
238 |
239 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
240 |
241 | def test_soft():
242 | X1, y1, X2, y2 = gen_lin_separable_overlap_data()
243 | X_train, y_train = split_train(X1, y1, X2, y2)
244 | X_test, y_test = split_test(X1, y1, X2, y2)
245 |
246 | clf = SVM(C=1000.1)
247 | clf.fit(X_train, y_train)
248 |
249 | y_predict = clf.predict(X_test)
250 | correct = np.sum(y_predict == y_test)
251 | print("%d out of %d predictions correct" % (correct, len(y_predict)))
252 |
253 | plot_contour(X_train[y_train==1], X_train[y_train==-1], clf)
254 |
255 |
256 | test_linear()
257 | #test_non_linear()
258 | # test_soft()
259 | ################################-the above example is a detailed application of convex optimization (quadratic programming) to solving an SVM
260 | ##below is code for a simple demonstration of how QP and convex optimization work:
261 | # Import the necessary packages
262 | import numpy
263 | from cvxopt import matrix
264 | from cvxopt import solvers
265 | ####Remember the general form of a QP in cvxopt: minimize (1/2)x^T*P*x + q^T*x, subject to Gx
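##A minimal standalone sketch of cvxopt.solvers.qp on a made-up toy problem (illustration only, not the original example this section was introducing):
import numpy as np
from cvxopt import matrix, solvers

# minimize (1/2) x^T P x + q^T x   subject to   G x <= h
# toy problem: minimize x1^2 + x2^2 - 4*x1 - 6*x2   subject to   x1 >= 0, x2 >= 0
P = matrix(np.array([[2.0, 0.0], [0.0, 2.0]]))    # quadratic term (positive semidefinite)
q = matrix(np.array([-4.0, -6.0]))                # linear term
G = matrix(np.array([[-1.0, 0.0], [0.0, -1.0]]))  # -x <= 0 encodes x >= 0
h = matrix(np.array([0.0, 0.0]))

sol = solvers.qp(P, q, G, h)                      # no equality constraints in this toy problem
print(np.array(sol['x']))                         # optimum should be close to [2, 3]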