├── .ipynb_checkpoints ├── FeatureWeightedFuzzyX-checkpoint.ipynb ├── Sample_Weighting-checkpoint.ipynb └── Untitled-checkpoint.ipynb ├── Literature ├── featureweightlearning.pdf └── yeung2002(2).pdf ├── Notebooks ├── .ipynb_checkpoints │ ├── FeatureWeightedFuzzyX-checkpoint.ipynb │ ├── Module_Test-checkpoint.ipynb │ ├── Sample_Weighting-checkpoint.ipynb │ └── Untitled1-checkpoint.ipynb ├── FeatureWeightedFuzzyX.ipynb ├── Module_Test.ipynb ├── NoiseReduction.png ├── Sample_Weighting.ipynb ├── Untitled.ipynb └── Untitled1.ipynb ├── README.md ├── figures ├── Iris_W.png ├── Iris_WO.png ├── Loss.png ├── NoiseReduction.png ├── fuzzymatrix.png ├── gradient.png └── solve_for_beta.png └── src ├── __init__.py ├── __pycache__ └── feature_learning.cpython-36.pyc └── feature_learning.py /.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Weighted Fuzzy C-means Classifier\n", 8 | "This project was originally meant to be a part of the D-BOM order mining tool, but was foreseen becoming the bulk of the actual work. For that reason, I gave it a stand alone repo. This notebook will explore how I will turn this into a library. I will use the iris dataset for testing. This notebook will first attemp to build fuzzy c-means from scratch, and then it will attempt to introduce the feature weighted learning component. \n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 17, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from sklearn.datasets import load_iris\n", 18 | "import numpy as np\n", 19 | "iris = load_iris()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 18, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "array([[5.1, 3.5, 1.4, 0.2],\n", 31 | " [4.9, 3. , 1.4, 0.2],\n", 32 | " [4.7, 3.2, 1.3, 0.2],\n", 33 | " [4.6, 3.1, 1.5, 0.2],\n", 34 | " [5. , 3.6, 1.4, 0.2],\n", 35 | " [5.4, 3.9, 1.7, 0.4],\n", 36 | " [4.6, 3.4, 1.4, 0.3],\n", 37 | " [5. 
, 3.4, 1.5, 0.2],\n", 38 | " [4.4, 2.9, 1.4, 0.2],\n", 39 | " [4.9, 3.1, 1.5, 0.1]])" 40 | ] 41 | }, 42 | "execution_count": 18, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "X = iris.data\n", 49 | "X[:10, :]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 41, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "scales = np.amax(X, axis = 0)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 44, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Basic Fuzzy c-means clustering \n", 68 | "c = 3\n", 69 | "epsilon = .05\n", 70 | "m = 2\n", 71 | "V = np.empty((c,4))\n", 72 | "for i in np.arange(0,c):\n", 73 | " c_ = np.random.random((c,1))* scales[i]\n", 74 | " V[:, i:i+1] = c_\n", 75 | " \n", 76 | "fuzz = np.random.random(X.shape) " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 45, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "array([[6.90067416e-001, 4.03567345e+000, 1.96562883e+000,\n", 88 | " 6.95024462e-310],\n", 89 | " [4.36526976e+000, 1.52127116e+000, 6.67822419e+000,\n", 90 | " 6.95024462e-310],\n", 91 | " [5.17583961e+000, 2.75369530e+000, 2.57638666e+000,\n", 92 | " 6.95028791e-310]])" 93 | ] 94 | }, 95 | "execution_count": 45, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "V" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 46, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "dist_weights = [1,1,1,1]\n", 111 | "def euclidean(X, V, weights):\n", 112 | " dists = X- V\n", 113 | " return np.sqrt((dists * weights) **2)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 47, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "def update(X, V):\n", 123 | " for v_i in V.shape[0]:\n", 124 | " numerator = fuzz[i]**m * X\n", 125 | " denominator = fuzz[i]**m\n", 126 | " v_i = numerator/denominator\n", 127 | " \n", 128 | " return \n", 129 | " " 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.6.6" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 2 161 | } 162 | -------------------------------------------------------------------------------- /Literature/featureweightlearning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/Literature/featureweightlearning.pdf -------------------------------------------------------------------------------- /Literature/yeung2002(2).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/Literature/yeung2002(2).pdf -------------------------------------------------------------------------------- 
/Notebooks/.ipynb_checkpoints/Module_Test-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module Testing\n", 8 | "This notebook is meant to test the code repo built and ensure it runs accurately" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys \n", 18 | "sys.path.insert(0, '/home/colin/Desktop/FWFCM/src')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import feature_learning\n", 28 | "from sklearn.datasets import load_iris\n", 29 | "import numpy as np\n", 30 | "from sklearn.preprocessing import StandardScaler\n", 31 | "iris = load_iris()\n", 32 | "X = iris.data\n", 33 | "X[:10, :]\n", 34 | "X = StandardScaler().fit_transform(X)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "99 Iterations Required\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "max_iter = 100\n", 52 | "threshold= .00001\n", 53 | "w = feature_learning.return_weighted_distance(X, threshold = threshold, max_iter = max_iter, n = 15)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "fcm = feature_learning.c_means(threshold = .03, max_iter = 100)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "fcm.fit(X, w)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "array([[ 0.12570625, -0.62975002, 0.33890973, 0.25467352],\n", 83 | " [ 1.08686586, -0.05200863, 1.10661401, 1.1292369 ],\n", 84 | " [-1.01263132, 0.83811185, -1.29881138, -1.24810065]])" 85 | ] 86 | }, 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "fcm.cluster_centers" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 7, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "from itertools import combinations\n", 103 | "import matplotlib.pyplot as plt\n", 104 | "def to_color(u):\n", 105 | " colors = []\n", 106 | " for i in range(0, u.shape[0]):\n", 107 | " colors.append(tuple((u[i, 0], u[i, 1], u[i, 2])))\n", 108 | " return colors \n", 109 | " \n", 110 | "\n", 111 | "def _2D_Plot(X_, V, colorlist, columns):\n", 112 | " fig, ax = plt.subplots(2,3, figsize = (15,10))\n", 113 | " axlist = [ (i,j) for i in np.arange(0,3) for j in np.arange(0,3)]\n", 114 | " for idx, axcombo in enumerate(combinations([0,1,2,3], 2)):\n", 115 | " axl = axlist[idx]\n", 116 | " axl = ax[axl[0], axl[1]]\n", 117 | " axl.scatter(X_[:, axcombo[0]], X_[:, axcombo[1]], c = colorlist, alpha = .5)\n", 118 | " axl.scatter(V[:, axcombo[0]], V[:, axcombo[1]], marker ='x', c=['red', 'green', 'blue'], s =200)\n", 119 | " axl.set_xlabel(columns[axcombo[0]])\n", 120 | " axl.set_ylabel(columns[axcombo[1]])\n", 121 | " #plt.show()\n", 122 | " return fig\n", 123 | "V = fcm.cluster_centers\n", 124 | "u = fcm.fuzzy_partition\n", 125 | "colorlist = to_color(u)\n", 126 | "columns = iris['feature_names']\n", 127 | "fig = _2D_Plot(X, 
V,colorlist, columns)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 8, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "0.863453892472934" 139 | ] 140 | }, 141 | "execution_count": 8, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "fcm.f_p_coeff" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 9, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "array([[0. , 0. , 1. , 0.01548039]])" 159 | ] 160 | }, 161 | "execution_count": 9, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "fcm.weights" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 10, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "array([[ 0.12570625, -0.62975002, 0.33890973, 0.25467352],\n", 179 | " [ 1.08686586, -0.05200863, 1.10661401, 1.1292369 ],\n", 180 | " [-1.01263132, 0.83811185, -1.29881138, -1.24810065]])" 181 | ] 182 | }, 183 | "execution_count": 10, 184 | "metadata": {}, 185 | "output_type": "execute_result" 186 | } 187 | ], 188 | "source": [ 189 | "fcm.cluster_centers" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 11, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "data": { 199 | "text/plain": [ 200 | "array([[6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 201 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 202 | " [3.18794184e-03, 1.53258878e-03, 9.95279469e-01],\n", 203 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 204 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 205 | " [7.24899867e-03, 3.18385129e-03, 9.89567150e-01],\n", 206 | " [6.08040121e-04, 2.86336135e-04, 9.99105624e-01],\n", 207 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 208 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 209 | " [9.39396592e-05, 4.32820022e-05, 9.99862778e-01],\n", 210 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 211 | " [2.12399565e-03, 9.56175533e-04, 9.96919829e-01],\n", 212 | " [6.11005073e-04, 2.87737773e-04, 9.99101257e-01],\n", 213 | " [1.28746051e-02, 6.42946841e-03, 9.80695926e-01],\n", 214 | " [7.39984381e-03, 3.62762008e-03, 9.88972536e-01],\n", 215 | " [9.38400131e-05, 4.32348219e-05, 9.99862925e-01],\n", 216 | " [3.19079899e-03, 1.53394138e-03, 9.95275260e-01],\n", 217 | " [6.08040121e-04, 2.86336135e-04, 9.99105624e-01],\n", 218 | " [7.24519464e-03, 3.18221452e-03, 9.89572591e-01],\n", 219 | " [9.07195587e-05, 4.17974757e-05, 9.99867483e-01],\n", 220 | " [7.24493691e-03, 3.18214184e-03, 9.89572921e-01],\n", 221 | " [9.38400131e-05, 4.32348219e-05, 9.99862925e-01],\n", 222 | " [1.92925307e-02, 9.80517703e-03, 9.70902292e-01],\n", 223 | " [7.25634926e-03, 3.18705221e-03, 9.89556599e-01],\n", 224 | " [2.90350853e-02, 1.20799423e-02, 9.58884972e-01],\n", 225 | " [2.12399565e-03, 9.56175533e-04, 9.96919829e-01],\n", 226 | " [2.12744162e-03, 9.57707248e-04, 9.96914851e-01],\n", 227 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 228 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 229 | " [2.12399565e-03, 9.56175533e-04, 9.96919829e-01],\n", 230 | " [2.12399565e-03, 9.56175533e-04, 9.96919829e-01],\n", 231 | " [9.38400131e-05, 4.32348219e-05, 9.99862925e-01],\n", 232 | " [9.39396592e-05, 4.32820022e-05, 9.99862778e-01],\n", 233 | " [6.08053787e-04, 2.86345031e-04, 
9.99105601e-01],\n", 234 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 235 | " [7.39984381e-03, 3.62762008e-03, 9.88972536e-01],\n", 236 | " [3.18794184e-03, 1.53258878e-03, 9.95279469e-01],\n", 237 | " [6.11005073e-04, 2.87737773e-04, 9.99101257e-01],\n", 238 | " [3.18794184e-03, 1.53258878e-03, 9.95279469e-01],\n", 239 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 240 | " [3.18801011e-03, 1.53261003e-03, 9.95279380e-01],\n", 241 | " [3.18801011e-03, 1.53261003e-03, 9.95279380e-01],\n", 242 | " [3.18794184e-03, 1.53258878e-03, 9.95279469e-01],\n", 243 | " [2.14432872e-03, 9.65296742e-04, 9.96890375e-01],\n", 244 | " [2.90415247e-02, 1.20822484e-02, 9.58876227e-01],\n", 245 | " [6.08040121e-04, 2.86336135e-04, 9.99105624e-01],\n", 246 | " [2.12399565e-03, 9.56175533e-04, 9.96919829e-01],\n", 247 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 248 | " [9.07528881e-05, 4.18132408e-05, 9.99867434e-01],\n", 249 | " [6.08053787e-04, 2.86345031e-04, 9.99105601e-01],\n", 250 | " [8.85150875e-01, 1.04692160e-01, 1.01569646e-02],\n", 251 | " [9.83333380e-01, 1.43868671e-02, 2.27975312e-03],\n", 252 | " [6.73545028e-01, 3.09378704e-01, 1.70762682e-02],\n", 253 | " [9.40892685e-01, 4.06181668e-02, 1.84891479e-02],\n", 254 | " [9.47309302e-01, 4.68414506e-02, 5.84924782e-03],\n", 255 | " [9.83339378e-01, 1.43811052e-02, 2.27951716e-03],\n", 256 | " [8.85078271e-01, 1.04761541e-01, 1.01601882e-02],\n", 257 | " [6.55699649e-01, 1.25997651e-01, 2.18302700e-01],\n", 258 | " [9.47324295e-01, 4.68263691e-02, 5.84933544e-03],\n", 259 | " [9.10453411e-01, 5.76612590e-02, 3.18853301e-02],\n", 260 | " [7.53174635e-01, 1.13051212e-01, 1.33774154e-01],\n", 261 | " [9.86470603e-01, 1.03712087e-02, 3.15818832e-03],\n", 262 | " [9.40817045e-01, 4.06657047e-02, 1.85172504e-02],\n", 263 | " [8.85150875e-01, 1.04692160e-01, 1.01569646e-02],\n", 264 | " [7.97393846e-01, 1.02370983e-01, 1.00235171e-01],\n", 265 | " [9.98531093e-01, 1.22533234e-03, 2.43575028e-04],\n", 266 | " [9.83333380e-01, 1.43868671e-02, 2.27975312e-03],\n", 267 | " [9.66595067e-01, 2.43228311e-02, 9.08210139e-03],\n", 268 | " [9.83333380e-01, 1.43868671e-02, 2.27975312e-03],\n", 269 | " [9.10416139e-01, 5.76786502e-02, 3.19052111e-02],\n", 270 | " [7.93165196e-01, 1.92588782e-01, 1.42460228e-02],\n", 271 | " [9.40892685e-01, 4.06181668e-02, 1.84891479e-02],\n", 272 | " [6.73545028e-01, 3.09378704e-01, 1.70762682e-02],\n", 273 | " [8.85145526e-01, 1.04692989e-01, 1.01614841e-02],\n", 274 | " [9.98135960e-01, 1.49479468e-03, 3.69245368e-04],\n", 275 | " [9.98531093e-01, 1.22533234e-03, 2.43575028e-04],\n", 276 | " [7.93427146e-01, 1.92335609e-01, 1.42372449e-02],\n", 277 | " [5.34064714e-01, 4.48034516e-01, 1.79007702e-02],\n", 278 | " [9.83333380e-01, 1.43868671e-02, 2.27975312e-03],\n", 279 | " [7.53174635e-01, 1.13051212e-01, 1.33774154e-01],\n", 280 | " [8.76114566e-01, 7.41739555e-02, 4.97114790e-02],\n", 281 | " [8.38340731e-01, 8.92836851e-02, 7.23755836e-02],\n", 282 | " [9.10438176e-01, 5.76667363e-02, 3.18950875e-02],\n", 283 | " [3.90519659e-01, 5.92974633e-01, 1.65057083e-02],\n", 284 | " [9.83333380e-01, 1.43868671e-02, 2.27975312e-03],\n", 285 | " [9.83301024e-01, 1.44150461e-02, 2.28393013e-03],\n", 286 | " [8.85124319e-01, 1.04718072e-01, 1.01576087e-02],\n", 287 | " [9.98523495e-01, 1.23164175e-03, 2.44863430e-04],\n", 288 | " [9.66682815e-01, 2.42613932e-02, 9.05579202e-03],\n", 289 | " [9.40892685e-01, 4.06181668e-02, 1.84891479e-02],\n", 290 | " [9.98497885e-01, 1.25297294e-03, 2.49142102e-04],\n", 
291 | " [9.47326948e-01, 4.68249353e-02, 5.84811704e-03],\n", 292 | " [9.40878449e-01, 4.06265320e-02, 1.84950189e-02],\n", 293 | " [6.55699649e-01, 1.25997651e-01, 2.18302700e-01],\n", 294 | " [9.86473954e-01, 1.03680870e-02, 3.15795938e-03],\n", 295 | " [9.86454214e-01, 1.03829121e-02, 3.16287369e-03],\n", 296 | " [9.86473954e-01, 1.03680870e-02, 3.15795938e-03],\n", 297 | " [9.98135960e-01, 1.49479468e-03, 3.69245368e-04],\n", 298 | " [4.90965003e-01, 1.23053684e-01, 3.85981312e-01],\n", 299 | " [9.66682815e-01, 2.42613932e-02, 9.05579202e-03],\n", 300 | " [3.10766664e-02, 9.64815480e-01, 4.10785333e-03],\n", 301 | " [3.90252558e-01, 5.93246302e-01, 1.65011393e-02],\n", 302 | " [1.56356171e-02, 9.82458258e-01, 1.90612468e-03],\n", 303 | " [7.10038829e-03, 9.92252756e-01, 6.46855758e-04],\n", 304 | " [4.30972439e-03, 9.95209128e-01, 4.81147750e-04],\n", 305 | " [1.33540360e-01, 8.40840915e-01, 2.56187253e-02],\n", 306 | " [9.83249096e-01, 1.44600914e-02, 2.29081212e-03],\n", 307 | " [8.43632824e-02, 9.01930968e-01, 1.37057498e-02],\n", 308 | " [4.33887700e-03, 9.95176819e-01, 4.84304272e-04],\n", 309 | " [4.84226556e-02, 9.44683417e-01, 6.89392720e-03],\n", 310 | " [3.90174096e-01, 5.93324383e-01, 1.65015209e-02],\n", 311 | " [1.53485088e-01, 8.37141414e-01, 9.37349812e-03],\n", 312 | " [3.09254345e-02, 9.66570900e-01, 2.50366525e-03],\n", 313 | " [5.33778451e-01, 4.48317023e-01, 1.79045260e-02],\n", 314 | " [3.89913388e-01, 5.93574040e-01, 1.65125726e-02],\n", 315 | " [1.53414194e-01, 8.37210239e-01, 9.37556676e-03],\n", 316 | " [3.09940662e-02, 9.66497331e-01, 2.50860314e-03],\n", 317 | " [1.47935268e-01, 8.22273695e-01, 2.97910367e-02],\n", 318 | " [1.73653196e-01, 7.88138780e-01, 3.82080240e-02],\n", 319 | " [5.34240849e-01, 4.47856053e-01, 1.79030981e-02],\n", 320 | " [5.55045814e-05, 9.99938869e-01, 5.62690068e-06],\n", 321 | " [6.73081101e-01, 3.09830889e-01, 1.70880099e-02],\n", 322 | " [1.47940740e-01, 8.22268053e-01, 2.97912065e-02],\n", 323 | " [6.73298694e-01, 3.09621483e-01, 1.70798229e-02],\n", 324 | " [1.49953367e-05, 9.99983485e-01, 1.51992272e-06],\n", 325 | " [3.10344038e-02, 9.64864708e-01, 4.10088776e-03],\n", 326 | " [7.93165196e-01, 1.92588782e-01, 1.42460228e-02],\n", 327 | " [6.73298694e-01, 3.09621483e-01, 1.70798229e-02],\n", 328 | " [7.04599991e-03, 9.92311971e-01, 6.42029245e-04],\n", 329 | " [4.42621469e-03, 9.95079750e-01, 4.94035204e-04],\n", 330 | " [4.83769394e-02, 9.44737488e-01, 6.88557270e-03],\n", 331 | " [1.01657954e-01, 8.80818410e-01, 1.75236354e-02],\n", 332 | " [7.06025094e-03, 9.92296359e-01, 6.43390023e-04],\n", 333 | " [3.90619214e-01, 5.92871650e-01, 1.65091354e-02],\n", 334 | " [7.39953763e-03, 9.91926391e-01, 6.74071042e-04],\n", 335 | " [4.83788706e-02, 9.44734250e-01, 6.88687904e-03],\n", 336 | " [7.13727884e-03, 9.92212157e-01, 6.50564623e-04],\n", 337 | " [3.09940662e-02, 9.66497331e-01, 2.50860314e-03],\n", 338 | " [7.93165196e-01, 1.92588782e-01, 1.42460228e-02],\n", 339 | " [7.79165729e-02, 9.16557038e-01, 5.52638894e-03],\n", 340 | " [7.13727884e-03, 9.92212157e-01, 6.50564623e-04],\n", 341 | " [3.89970579e-01, 5.93521040e-01, 1.65083809e-02],\n", 342 | " [3.90252558e-01, 5.93246302e-01, 1.65011393e-02],\n", 343 | " [1.56625913e-02, 9.82427762e-01, 1.90964704e-03],\n", 344 | " [1.52701815e-04, 9.99831814e-01, 1.54839538e-05],\n", 345 | " [2.59089559e-01, 7.27555956e-01, 1.33544854e-02],\n", 346 | " [5.33876792e-01, 4.48220900e-01, 1.79023080e-02],\n", 347 | " [2.59192873e-01, 7.27456801e-01, 1.33503261e-02],\n", 348 | " 
[7.79457049e-02, 9.16524102e-01, 5.53019287e-03],\n", 349 | " [3.90336315e-01, 5.93161975e-01, 1.65017101e-02]])" 350 | ] 351 | }, 352 | "execution_count": 11, 353 | "metadata": {}, 354 | "output_type": "execute_result" 355 | } 356 | ], 357 | "source": [ 358 | "fcm.fuzzy_partition" 359 | ] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.6.6" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } 384 | -------------------------------------------------------------------------------- /Notebooks/.ipynb_checkpoints/Sample_Weighting-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Weighted Fuzzy C-means Classifier Visuals" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import matplotlib.pyplot as plt\n", 17 | "import numpy as np\n", 18 | "from sklearn.cluster import KMeans\n", 19 | "\n", 20 | "np.random.seed(0)\n", 21 | "n_points_per_cluster = 250\n", 22 | "\n", 23 | "C1 = [-.5, -1.5] + .4 * np.random.randn(n_points_per_cluster, 2)\n", 24 | "C2 = [.5,0] + .25 * np.random.randn(n_points_per_cluster, 2)\n", 25 | "C3 = [0, 1.5] + .4 * np.random.randn(n_points_per_cluster, 2)\n", 26 | "\n", 27 | "C1 = np.hstack((C1, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 28 | "C2 = np.hstack((C2, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 29 | "C3 = np.hstack((C3, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 30 | "X_ = np.vstack((C1, C2, C3))\n", 31 | "from mpl_toolkits.mplot3d import Axes3D\n", 32 | "fig3D = plt.figure(figsize=(20, 20))\n", 33 | "ax = fig3D.add_subplot(221, projection='3d')\n", 34 | "ax.scatter(C1[:, 0], C1[:, 1],C1[:, 2], c= 'red', alpha = .5)\n", 35 | "ax.scatter(C2[:, 0], C2[:, 1],C2[:, 2], c= 'green', alpha = .5)\n", 36 | "ax.scatter(C3[:, 0], C3[:, 1],C3[:, 2], c= 'blue', alpha = .5)\n", 37 | "ax.set_ylim(-3,3)\n", 38 | "ax.set_xlim(-3,3)\n", 39 | "ax.set_title(\"Clusters With Noise Feature\")\n", 40 | "ax2 = fig3D.add_subplot(222, projection='3d')\n", 41 | "zeros = np.zeros((n_points_per_cluster, 1))\n", 42 | "ax2.scatter(C1[:, 0], C1[:, 1],zeros, c= 'red', alpha = .5)\n", 43 | "ax2.scatter(C2[:, 0], C2[:, 1],zeros, c= 'green', alpha = .5)\n", 44 | "ax2.scatter(C3[:, 0], C3[:, 1],zeros, c= 'blue', alpha = .5)\n", 45 | "ax2.set_ylim(-3,3)\n", 46 | "ax2.set_xlim(-3,3)\n", 47 | "ax2.set_zlim(-3,3)\n", 48 | "ax2.set_title(\"Clusters Without Noise Feature\")\n", 49 | "k = KMeans(3)\n", 50 | "k.fit(X_)\n", 51 | "l = k.labels_\n", 52 | "ax3 = fig3D.add_subplot(223, projection='3d')\n", 53 | "ax3.scatter(X_[:, 0], X_[:, 1],X_[:, 2], c=l, alpha = .5)\n", 54 | "ax3.set_ylim(-3,3)\n", 55 | "ax3.set_xlim(-3,3)\n", 56 | "ax3.set_zlim(-3,3)\n", 57 | "ax3.set_title(\"K-means With Noise Features\")\n", 58 | "\n", 59 | "import sys \n", 60 | "sys.path.insert(0, '/home/colin/Desktop/FWFCM/src')\n", 61 | "from feature_learning import return_weighted_distance\n", 62 | "p = return_weighted_distance(X_, threshold = 0.00005, n = 20)\n", 63 | 
"\n", 64 | "for i in range(3):\n", 65 | " X_[:, i] = X_[:, i] * p[:, i] * 1.5\n", 66 | "k.fit(X_)\n", 67 | "l = k.labels_\n", 68 | "zeros = np.zeros((n_points_per_cluster*3, 1))\n", 69 | "ax4 = fig3D.add_subplot(224, projection='3d')\n", 70 | "ax4.set_title(\"K-means With Learned and Scaled Features\")\n", 71 | "ax4.scatter(X_[:, 0], X_[:, 1],X_[:, 2], c=l, alpha = .5)\n", 72 | "ax4.set_ylim(-3,3)\n", 73 | "ax4.set_xlim(-3,3)\n", 74 | "ax4.set_zlim(-3,3)\n", 75 | "fig3D.savefig(\"NoiseReduction.png\")\n", 76 | "fig3D.show()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "p" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "plt.show()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.6.6" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /Notebooks/.ipynb_checkpoints/Untitled1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /Notebooks/Module_Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Module Testing\n", 8 | "This notebook is meant to test the code repo built and ensure it runs accurately" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import sys \n", 18 | "sys.path.insert(0, '/home/colin/Desktop/Projects/feature_learning/src')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import feature_learning\n", 28 | "from sklearn.datasets import load_iris\n", 29 | "import numpy as np\n", 30 | "from sklearn.preprocessing import StandardScaler\n", 31 | "iris = load_iris()\n", 32 | "X = iris.data\n", 33 | "X[:10, :]\n", 34 | "X = StandardScaler().fit_transform(X)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "1 Iterations Required\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "max_iter = 100\n", 52 | "threshold= .00001\n", 53 | "w = feature_learning.return_weighted_distance(X, threshold = threshold, max_iter = max_iter, n = 15)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "fcm = feature_learning.c_means(threshold = .03, max_iter = 100)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | 
"execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "ename": "ValueError", 72 | "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", 73 | "output_type": "error", 74 | "traceback": [ 75 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 76 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 77 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 78 | "\u001b[0;32m~/Desktop/Projects/feature_learning/src/feature_learning.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, weights, log_argmax)\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mnum_iter\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mmax_iter\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0mu2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 279\u001b[0;31m \u001b[0mV\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mJm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0md\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweights\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 280\u001b[0m \u001b[0mJ\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mJ\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mJm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 281\u001b[0m \u001b[0mnum_iter\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 79 | "\u001b[0;32m~/Desktop/Projects/feature_learning/src/feature_learning.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(X, u2, m, weights, n, c)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0;31m# update distance matrix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 175\u001b[0;31m \u001b[0md\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpairwise_distances\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mV\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetric\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweighted_euclidean\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'weights'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mweights\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 176\u001b[0m \u001b[0;31m#update fuzziness\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 177\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 80 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36mpairwise_distances\u001b[0;34m(X, Y, metric, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[0mfunc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpartial\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdistance\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcdist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmetric\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmetric\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1405\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1406\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_parallel_pairwise\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1407\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1408\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 81 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36m_parallel_pairwise\u001b[0;34m(X, Y, func, n_jobs, **kwds)\u001b[0m\n\u001b[1;32m 1065\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1066\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0meffective_n_jobs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1067\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1068\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1069\u001b[0m \u001b[0;31m# TODO: in some cases, backend='threading' may be appropriate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 82 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in \u001b[0;36m_pairwise_callable\u001b[0;34m(X, Y, metric, **kwds)\u001b[0m\n\u001b[1;32m 1079\u001b[0m \"\"\"Handle the callable case for pairwise_{distances,kernels}\n\u001b[1;32m 1080\u001b[0m \"\"\"\n\u001b[0;32m-> 1081\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_pairwise_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1082\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1083\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mX\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mY\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 83 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/metrics/pairwise.py\u001b[0m in 
\u001b[0;36mcheck_pairwise_arrays\u001b[0;34m(X, Y, precomputed, dtype)\u001b[0m\n\u001b[1;32m 111\u001b[0m warn_on_dtype=warn_on_dtype, estimator=estimator)\n\u001b[1;32m 112\u001b[0m Y = check_array(Y, accept_sparse='csr', dtype=dtype,\n\u001b[0;32m--> 113\u001b[0;31m warn_on_dtype=warn_on_dtype, estimator=estimator)\n\u001b[0m\u001b[1;32m 114\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mprecomputed\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 84 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[1;32m 571\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 572\u001b[0m _assert_all_finite(array,\n\u001b[0;32m--> 573\u001b[0;31m allow_nan=force_all_finite == 'allow-nan')\n\u001b[0m\u001b[1;32m 574\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 575\u001b[0m \u001b[0mshape_repr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_shape_repr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 85 | "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[0;34m(X, allow_nan)\u001b[0m\n\u001b[1;32m 54\u001b[0m not allow_nan and not np.isfinite(X).all()):\n\u001b[1;32m 55\u001b[0m \u001b[0mtype_err\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infinity'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mallow_nan\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'NaN, infinity'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 56\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg_err\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtype_err\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 57\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 58\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 86 | "\u001b[0;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." 
87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "fcm.fit(X, w)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "fcm.cluster_centers" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "from itertools import combinations\n", 110 | "import matplotlib.pyplot as plt\n", 111 | "def to_color(u):\n", 112 | " colors = []\n", 113 | " for i in range(0, u.shape[0]):\n", 114 | " colors.append(tuple((u[i, 0], u[i, 1], u[i, 2])))\n", 115 | " return colors \n", 116 | " \n", 117 | "\n", 118 | "def _2D_Plot(X_, V, colorlist, columns):\n", 119 | " fig, ax = plt.subplots(2,3, figsize = (15,10))\n", 120 | " axlist = [ (i,j) for i in np.arange(0,3) for j in np.arange(0,3)]\n", 121 | " for idx, axcombo in enumerate(combinations([0,1,2,3], 2)):\n", 122 | " axl = axlist[idx]\n", 123 | " axl = ax[axl[0], axl[1]]\n", 124 | " axl.scatter(X_[:, axcombo[0]], X_[:, axcombo[1]], c = colorlist, alpha = .5)\n", 125 | " axl.scatter(V[:, axcombo[0]], V[:, axcombo[1]], marker ='x', c=['red', 'green', 'blue'], s =200)\n", 126 | " axl.set_xlabel(columns[axcombo[0]])\n", 127 | " axl.set_ylabel(columns[axcombo[1]])\n", 128 | " #plt.show()\n", 129 | " return fig\n", 130 | "V = fcm.cluster_centers\n", 131 | "u = fcm.fuzzy_partition\n", 132 | "colorlist = to_color(u)\n", 133 | "columns = iris['feature_names']\n", 134 | "fig = _2D_Plot(X, V,colorlist, columns)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "fcm.f_p_coeff" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "fcm.weights" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "fcm.cluster_centers" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "fcm.fuzzy_partition" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.7" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /Notebooks/NoiseReduction.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/Notebooks/NoiseReduction.png -------------------------------------------------------------------------------- /Notebooks/Sample_Weighting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Weighted Fuzzy C-means Classifier Visuals" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "30 Iterations Required\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import matplotlib.pyplot as plt\n", 25 | "import numpy as np\n", 26 | "from sklearn.cluster import KMeans\n", 27 | "\n", 28 | "np.random.seed(0)\n", 29 | "n_points_per_cluster = 250\n", 30 | "\n", 31 | "C1 = [-.5, -1.5] + .4 * np.random.randn(n_points_per_cluster, 2)\n", 32 | "C2 = [.5,0] + .25 * np.random.randn(n_points_per_cluster, 2)\n", 33 | "C3 = [0, 1.5] + .4 * np.random.randn(n_points_per_cluster, 2)\n", 34 | "\n", 35 | "C1 = np.hstack((C1, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 36 | "C2 = np.hstack((C2, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 37 | "C3 = np.hstack((C3, 1 * np.random.randn(n_points_per_cluster, 1)))\n", 38 | "X_ = np.vstack((C1, C2, C3))\n", 39 | "from mpl_toolkits.mplot3d import Axes3D\n", 40 | "fig3D = plt.figure(figsize=(20, 20))\n", 41 | "ax = fig3D.add_subplot(221, projection='3d')\n", 42 | "ax.scatter(C1[:, 0], C1[:, 1],C1[:, 2], c= 'red', alpha = .5)\n", 43 | "ax.scatter(C2[:, 0], C2[:, 1],C2[:, 2], c= 'green', alpha = .5)\n", 44 | "ax.scatter(C3[:, 0], C3[:, 1],C3[:, 2], c= 'blue', alpha = .5)\n", 45 | "ax.set_ylim(-3,3)\n", 46 | "ax.set_xlim(-3,3)\n", 47 | "ax.set_title(\"Clusters With Noise Feature\")\n", 48 | "ax2 = fig3D.add_subplot(222, projection='3d')\n", 49 | "zeros = np.zeros((n_points_per_cluster, 1))\n", 50 | "ax2.scatter(C1[:, 0], C1[:, 1],zeros, c= 'red', alpha = .5)\n", 51 | "ax2.scatter(C2[:, 0], C2[:, 1],zeros, c= 'green', alpha = .5)\n", 52 | "ax2.scatter(C3[:, 0], C3[:, 1],zeros, c= 'blue', alpha = .5)\n", 53 | "ax2.set_ylim(-3,3)\n", 54 | "ax2.set_xlim(-3,3)\n", 55 | "ax2.set_zlim(-3,3)\n", 56 | "ax2.set_title(\"Clusters Without Noise Feature\")\n", 57 | "k = KMeans(3)\n", 58 | "k.fit(X_)\n", 59 | "l = k.labels_\n", 60 | "ax3 = fig3D.add_subplot(223, projection='3d')\n", 61 | "ax3.scatter(X_[:, 0], X_[:, 1],X_[:, 2], c=l, alpha = .5)\n", 62 | "ax3.set_ylim(-3,3)\n", 63 | "ax3.set_xlim(-3,3)\n", 64 | "ax3.set_zlim(-3,3)\n", 65 | "ax3.set_title(\"K-means With Noise Features\")\n", 66 | "\n", 67 | "import sys \n", 68 | "sys.path.insert(0, '/home/colin/Desktop/FWFCM/src')\n", 69 | "from feature_learning import return_weighted_distance\n", 70 | "p = return_weighted_distance(X_, threshold = 0.00005, n = 20)\n", 71 | "\n", 72 | "for i in range(3):\n", 73 | " X_[:, i] = X_[:, i] * p[:, i] * 1.5\n", 74 | "k.fit(X_)\n", 75 | "l = k.labels_\n", 76 | "zeros = np.zeros((n_points_per_cluster*3, 1))\n", 77 | "ax4 = fig3D.add_subplot(224, projection='3d')\n", 78 | "ax4.set_title(\"K-means With Learned and Scaled Features\")\n", 79 | "ax4.scatter(X_[:, 0], X_[:, 1],X_[:, 2], c=l, alpha = .5)\n", 80 | "ax4.set_ylim(-3,3)\n", 81 | "ax4.set_xlim(-3,3)\n", 82 | "ax4.set_zlim(-3,3)\n", 83 | 
"fig3D.savefig(\"NoiseReduction.png\")\n", 84 | "fig3D.show()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "array([[0.11590097, 1. , 0.16391792]])" 96 | ] 97 | }, 98 | "execution_count": 2, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "p" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "plt.show()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.6.6" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /Notebooks/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Weighted Fuzzy C-means Classifier\n", 8 | "This project was originally meant to be a part of the D-BOM order mining tool, but was foreseen becoming the bulk of the actual work. For that reason, I gave it a stand alone repo. This notebook will explore how I will turn this into a library. I will use the iris dataset for testing. This notebook will first attemp to build fuzzy c-means from scratch, and then it will attempt to introduce the feature weighted learning component. \n" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 17, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "from sklearn.datasets import load_iris\n", 18 | "import numpy as np\n", 19 | "iris = load_iris()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 18, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "array([[5.1, 3.5, 1.4, 0.2],\n", 31 | " [4.9, 3. , 1.4, 0.2],\n", 32 | " [4.7, 3.2, 1.3, 0.2],\n", 33 | " [4.6, 3.1, 1.5, 0.2],\n", 34 | " [5. , 3.6, 1.4, 0.2],\n", 35 | " [5.4, 3.9, 1.7, 0.4],\n", 36 | " [4.6, 3.4, 1.4, 0.3],\n", 37 | " [5. 
, 3.4, 1.5, 0.2],\n", 38 | " [4.4, 2.9, 1.4, 0.2],\n", 39 | " [4.9, 3.1, 1.5, 0.1]])" 40 | ] 41 | }, 42 | "execution_count": 18, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "X = iris.data\n", 49 | "X[:10, :]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 41, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "scales = np.amax(X, axis = 0)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 44, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Basic Fuzzy c-means clustering \n", 68 | "c = 3\n", 69 | "epsilon = .05\n", 70 | "m = 2\n", 71 | "V = np.empty((c,4))\n", 72 | "for i in np.arange(0,c):\n", 73 | " c_ = np.random.random((c,1))* scales[i]\n", 74 | " V[:, i:i+1] = c_\n", 75 | " \n", 76 | "fuzz = np.random.random(X.shape) " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 45, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "array([[6.90067416e-001, 4.03567345e+000, 1.96562883e+000,\n", 88 | " 6.95024462e-310],\n", 89 | " [4.36526976e+000, 1.52127116e+000, 6.67822419e+000,\n", 90 | " 6.95024462e-310],\n", 91 | " [5.17583961e+000, 2.75369530e+000, 2.57638666e+000,\n", 92 | " 6.95028791e-310]])" 93 | ] 94 | }, 95 | "execution_count": 45, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "V" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 46, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "dist_weights = [1,1,1,1]\n", 111 | "def euclidean(X, V, weights):\n", 112 | " dists = X- V\n", 113 | " return np.sqrt((dists * weights) **2)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 47, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "def update(X, V):\n", 123 | " #update fuzziness matrix\n", 124 | " for v_i in V.shape[0]:\n", 125 | " numerator = fuzz[i]**m * X\n", 126 | " denominator = fuzz[i]**m\n", 127 | " v_i = numerator/denominator\n", 128 | " \n", 129 | " #update distance matrix\n", 130 | " d = np.empty(X.shape)\n", 131 | " for d_j in np.arange(0,X.shape[0]):\n", 132 | " x_jk = X[d_j, :]\n", 133 | " for d_k in np.arange(0,c)\n", 134 | " \n", 135 | " \n", 136 | " return \n", 137 | " " 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": "Python 3", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.6.6" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | -------------------------------------------------------------------------------- /Notebooks/Untitled1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Feature Weighted Fuzzy C-Means Classifier 2 | 3 | When trying to 
cluster data into separate groups, it is often difficult to know which features to include and how much importance to give to each. In this sense, feature weight assignment can be seen as a generalization of feature selection. The purpose of this library is to take a data set and learn a weight in [0,1] for each feature. The learned weights are then evaluated with a weighted fuzzy c-means classifier on the Iris data set from the UCI Machine Learning Repository. 4 | 5 | ## Algorithm 6 | 7 | The guiding principle of similarity-based clustering is that similar objects fall within the same cluster and dissimilar objects fall in different clusters, which is no different from the goal of most conventional clustering algorithms. With similarity-based clustering, a measure must be defined to determine how similar two objects are. This similarity measure is based on distance, and different distance metrics can be employed, but it always yields a value in [0,1], where 0 means no similarity and 1 means the objects are identical. To measure feature importance, we use a weighted Euclidean distance function. The similarity measure is defined as follows: 8 | 9 | ![Fuzzy Equation](figures/fuzzymatrix.png) 10 | 11 | Here β is a value we must solve for, w represents the feature weight vector used in the distance function, and d represents the pairwise distances between objects. To solve for β, we use the assumption that, with the standard weights (all 1's), the similarity matrix should be uniformly distributed over [0,1] and therefore have a mean of 0.5. So to find β, we solve the equation: 12 | 13 | ![Beta Equation](figures/solve_for_beta.png) 14 | 15 | With a weighted Euclidean distance, this similarity matrix can be used to identify which features introduce noise and which ones are important to clustering. The ultimate goal is to minimize the "fuzziness" of the similarity matrix, pushing values in the middle (i.e., near 0.5) toward either 1 or 0. For this purpose we use the loss metric: 16 | 17 | ![Loss Equation](figures/Loss.png) 18 | 19 | 20 | Here (1) represents the base weights (all 1's), and ρ represents the fuzzy similarity matrix obtained when a given weight vector is used in the Euclidean distance between points p and q.  21 | We can then use gradient descent on this loss function to minimize it with respect to the weights. Gradient descent is one of the most common optimization algorithms in machine learning; it finds good parameters of a function by following its gradient, the vector of partial derivatives. By taking steps proportional to the negative of the gradient, we move toward a local minimum of the function. The weights are updated until either the maximum number of iterations is reached or the loss converges, using the partial derivative of the loss with respect to the weights:  22 | ![Gradient Descent Update](figures/gradient.png) where n is the learning rate. The learning rate is an important parameter: too small and the optimization requires excessive computation, too large and it may never converge.  23 | If you think of the data as a 3D graph, the effect is like stretching or shrinking each axis so that the points form tighter groups that sit further away from each other. We are not actually changing the locations of the data; we are only transforming how we measure the distances that drive our similarity metrics.
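To make the weight-learning loop concrete, here is a minimal sketch of the procedure described above. This is not the implementation in src/feature_learning.py: the exponential similarity, the simple fuzziness loss, the numerical gradient, and the helper names (similarity_matrix, solve_beta, learn_weights) are all illustrative assumptions standing in for the equations shown in the figures.

```
# Illustrative sketch only; not the library's implementation in src/feature_learning.py.
import numpy as np
from sklearn.metrics import pairwise_distances


def similarity_matrix(X, weights, beta):
    """Similarity in [0, 1] from a weighted Euclidean distance (weights scale the columns)."""
    d = pairwise_distances(X * weights)
    return np.exp(-beta * d)


def solve_beta(X, target_mean=0.5):
    """Choose beta so that exp(-beta * mean pairwise distance) equals target_mean."""
    d = pairwise_distances(X)
    return -np.log(target_mean) / d.mean()


def fuzziness(s):
    """Loss that is largest when similarities sit near 0.5 and zero when they reach 0 or 1."""
    return np.mean(1.0 - (2.0 * s - 1.0) ** 2)


def learn_weights(X, lr=0.05, n_iter=200, eps=1e-3, tol=1e-5):
    """Gradient descent on the fuzziness loss with respect to the feature weights."""
    w = np.ones(X.shape[1])
    beta = solve_beta(X)
    for _ in range(n_iter):
        base = fuzziness(similarity_matrix(X, w, beta))
        grad = np.zeros_like(w)
        for j in range(w.size):  # simple numerical gradient, one feature at a time
            w_eps = w.copy()
            w_eps[j] += eps
            grad[j] = (fuzziness(similarity_matrix(X, w_eps, beta)) - base) / eps
        w_new = np.clip(w - lr * grad, 0.0, None)  # step against the gradient, keep weights non-negative
        if np.abs(w_new - w).max() < tol:  # stop once the weights stop moving
            w = w_new
            break
        w = w_new
    return w / max(w.max(), 1e-12)  # normalize so the largest weight is 1
```

The idea is that features which only add noise get down-weighted, as in the constructed example below; the library's own return_weighted_distance, used in the Usage section further down, is what the notebooks in this repository actually call.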
24 | Here is a constructed example where I introduce 3 clusters with separate centroids on the first two variables and add a third noise variable that makes the clustering more difficult. Points are colored by the true cluster labels assigned when the data was created. When the noise variable is removed, the clusters are much easier to identify.  25 | 26 | ![Noise Reduction](figures/NoiseReduction.png) 27 | 28 | As you can see, k-means had a harder time identifying the true clusters because it had to incorporate the noise feature.  29 | 30 | 31 | ## Measuring Improvement 32 | A good way to demonstrate the effectiveness of the learned weights is fuzzy c-means, a relative of the commonly used k-means algorithm. It works in a very similar fashion to k-means, but produces a fuzzy partition matrix rather than a single cluster label per point.  33 | The fuzzy partition matrix is a set of membership weights that measure how close each point is to each cluster center, much like the similarity matrix used earlier. It can also be computed with a weighted distance metric, into which we can feed our newly learned optimal weights; these memberships then feed back into updating the cluster centers. As in k-means, the cluster centers shift with each iteration until the maximum number of iterations or an improvement threshold is reached. 34 | In fuzzy c-means the goal is very similar to our original loss function: we want less "fuzziness", with every point as close as possible to its own cluster center and far from the others. A good measure of the fuzzy clustering result is Dunn's partition coefficient, a sum over the components of the fuzzy partition matrix.  35 | Let's try fuzzy c-means on the Iris data set with and without our learned feature weights. Here is the output of fuzzy c-means plotted for every pair of variables, assuming 3 clusters (which we know from the data set).  36 | ![Iris Without Weights](figures/Iris_WO.png) 37 | 38 | 39 | 40 | Notice how the boundaries between some clusters are poorly defined; because multiple features are weighted equally, the boundaries can be blurred. Now, applying the feature weight learning approach, we get normalized distance weights of: 41 | {'sepal length': 0.0, 'sepal width': 0.0, 'petal length': 1.0, 'petal width': 0.0258} 42 | 43 | ![Iris With Weights](figures/Iris_W.png) 44 | 45 | There are still fuzzy boundaries, mostly on the features we assigned zero weight, but the algorithm put a major focus on petal length. We obtained similar clusters, stronger boundaries (on some feature pairs), and the fuzzy partition coefficient increased from .70 to .86, an increase of ~23%! 46 | We also now know that if we wanted to generate rules for classifying the flowers, we could focus on 2 features instead of 4. 47 | 48 | 49 | ## Usage 50 | 51 | To use this module, first learn the feature weights by calling return_weighted_distance, then run the fuzzy c-means classifier by fitting it on the data with the returned weights. 52 | 53 | 54 | ``` 55 | from feature_learning import return_weighted_distance, c_means 56 | w = return_weighted_distance(X) 57 | fcm = c_means() 58 | fcm.fit(X, w) 59 | 60 | ``` 61 | 62 | The resulting class has the following attributes:
62 | The resulting class has the following Attributes:
63 | cluster_centers - location of cluster centers
64 | fuzzy_partition - fuzzy partition matrix
65 | f_p_coeff - fuzzy partition coefficient 66 | -------------------------------------------------------------------------------- /figures/Iris_W.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/Iris_W.png -------------------------------------------------------------------------------- /figures/Iris_WO.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/Iris_WO.png -------------------------------------------------------------------------------- /figures/Loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/Loss.png -------------------------------------------------------------------------------- /figures/NoiseReduction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/NoiseReduction.png -------------------------------------------------------------------------------- /figures/fuzzymatrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/fuzzymatrix.png -------------------------------------------------------------------------------- /figures/gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/gradient.png -------------------------------------------------------------------------------- /figures/solve_for_beta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/figures/solve_for_beta.png -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/src/__init__.py -------------------------------------------------------------------------------- /src/__pycache__/feature_learning.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Csinclair0/feature_learning/25afe1538f040c0a314712fdeae58c52c6c12c80/src/__pycache__/feature_learning.cpython-36.pyc -------------------------------------------------------------------------------- /src/feature_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import pairwise_distances 3 | import math 4 | import warnings 5 | from scipy.sparse import triu 6 | 7 | warnings.filterwarnings('ignore') 8 | 9 | 10 | def weighted_euclidean(X, V, weights): 11 | """Weighted euclidean distance function 12 | 13 | Parameters 14 | ---------- 15 | X : array 16 | first object 17 | V : array 18 | second object 19 | weights : array 20 | feature weights 21 | 22 | Returns 23 | ------- 24 | float 25 | 
weighted distance
26 | 
27 |     """
28 |     dists = X - V
29 |     return np.sqrt(np.sum((dists * weights) ** 2))
30 | 
31 | 
32 | def single_delta(X, V, F):
33 |     """Distance along a single feature
34 | 
35 |     Parameters
36 |     ----------
37 |     X : array
38 |         first object
39 |     V : array
40 |         second object
41 |     F : int
42 |         index of the feature to compare
43 | 
44 |     Returns
45 |     -------
46 |     float
47 |         difference on the single feature F
48 | 
49 |     """
50 |     d = X[F] - V[F]
51 |     return d
52 | 
53 | 
54 | def calc_beta(X, d):
55 |     """Calculate the beta value for feature weight learning
56 | 
57 |     Parameters
58 |     ----------
59 |     X : array
60 |         data set
61 |     d : array
62 |         distance matrix
63 | 
64 |     Returns
65 |     -------
66 |     float
67 |         beta value
68 | 
69 |     """
70 |     n = X.shape[0]
71 |     for b in np.linspace(0, 1, 10000):
72 |         p = 1/(1+b*d)
73 |         p = triu(p, 1).toarray()  # keep each pair only once
74 |         if (2 / (n*(n-1))) * np.sum(p) < .5:  # mean similarity has dropped below 0.5
75 |             return b
76 | 
77 | 
78 | def return_weights(X, b, d, mincols, threshold, learning_rate, max_iter):
79 |     """Returns learned feature weights, given the data set, beta, the distance matrix and the minimum number of columns
80 | 
81 |     Parameters
82 |     ----------
83 |     X : array
84 |         data set
85 |     b : float
86 |         beta value
87 |     d : array
88 |         distance matrix
89 |     mincols : int
90 |         minimum number of columns to return that have weights
91 |     threshold : float
92 |         minimum improvement below which learning stops
93 |     learning_rate : float
94 |         learning rate for gradient descent
95 | 
96 |     Returns
97 |     -------
98 |     array
99 |         learned feature weights
100 | 
101 |     """
102 | 
103 |     w = np.empty((1, X.shape[1]))
104 |     w.fill(1)
105 |     p_1 = 1/(1+b*d)  # baseline similarity matrix with all weights equal to 1
106 |     n = X.shape[0]
107 |     E_old = 1
108 |     for i in np.arange(0, max_iter):
109 |         d = pairwise_distances(X, X, metric = weighted_euclidean, **{'weights':w})
110 |         grad_w = np.empty((1, X.shape[1]))
111 |         part_pq = -b/((1+b*d)**2)  # partial derivative of the similarity w.r.t. the distance
112 |         p = 1/(1+b*d)  # similarity matrix under the current weights
113 |         E = (2/(n*(n-1))) * np.sum(triu(.5*((p*(1-p_1) + p_1*(1-p))), 1).toarray())  # fuzziness loss
114 |         if E_old - E < threshold:
115 |             break
116 |         E_old = E
117 |         part_eq = (1-2*p_1)
118 |         w_valid = np.where(w > 0)[1]  # features that still carry a non-zero weight
119 | 
120 |         if w_valid.shape[0] == mincols:
121 |             break
122 | 
123 |         for j in w_valid:
124 |             d_w = pairwise_distances(X, X, metric = single_delta, **{'F':j})
125 |             part_w = w[0, j]*(d_w)**2 / d  # zero distances give nan; handled by np.nansum below
126 |             part_w = triu(part_w, 1).toarray()
127 |             grad_w_j = 1/(n*(n-1)) * part_eq * part_pq * part_w
128 |             grad_w_j = triu(grad_w_j, 1).toarray()
129 |             grad_w[0, j] = np.nansum(grad_w_j)
130 |         grad_w = grad_w * learning_rate
131 |         w = w - grad_w
132 |         w = w.clip(min=0)
133 |         #if i % 100 == 0: #and i > 0:
134 |             #print("Iteration {} Finished".format(i))
135 |             #print("Weights : {} ".format(w))
136 |             #print("Function Improvement : {}".format(E))
137 | 
138 |     wmax = np.max(w)
139 |     w = w / wmax  # normalize so the largest weight is 1
140 |     print("{} Iterations Required".format(i))
141 |     return w
142 | 
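# update() below performs one iteration of the standard weighted fuzzy c-means
# equations described in the README:
#   cluster centers : V_k = sum_i(u_ik ** m * X_i) / sum_i(u_ik ** m)
#   memberships     : u_ik = 1 / sum_j((d_ik / d_ij) ** (2 / (m - 1)))
# where d is the weighted Euclidean distance between points and centers.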
143 | def update(X, u2, m, weights, n, c):
144 |     """Perform one update of the fuzzy c-means process
145 | 
146 |     Parameters
147 |     ----------
148 |     X : array
149 |         data being clustered
150 |     u2 : array
151 |         current fuzzy partition (membership) matrix
152 |     m : float
153 |         fuzzification factor
154 |     weights : array
155 |         feature weights for the distance metric
156 |     n : int
157 |         sample size
158 |     c : int
159 |         number of clusters
160 | 
161 |     Returns
162 |     -------
163 |     V, u, J, d : updated cluster centers, fuzzy partition matrix, loss and distance matrix
164 | 
165 |     """
166 |     u = u2.copy()
167 |     um = u ** m
168 |     # update cluster centers matrix
169 |     numerator = um.T.dot(X)
170 |     denominator = um.T.sum(axis = 1)
171 |     V = numerator.T/(denominator)
172 |     V = V.T
173 | 
174 |     # update distance matrix
175 |     d = pairwise_distances(X, V, metric = weighted_euclidean, **{'weights':weights})
176 |     # update fuzzy memberships
177 |     for i in np.arange(0, n):
178 |         for j in np.arange(0, c):
179 |             newdenom = (d[i, j] / d[i, :]) ** (2/(m-1))
180 |             u[i, j] = 1 / np.sum(newdenom, axis = 0)
181 | 
182 |     # update loss
183 |     J = (u * d**2).sum()
184 | 
185 |     return V, u, J, d
186 | 
187 | def return_weighted_distance(X, mincols = 0, sample_size = 1, threshold = .0005, n = 1, max_iter = 1000):
188 |     """Takes in a data set and completes the entire process of feature weight learning.
189 |     1. Calculate Pairwise Distances
190 |     2. Calculate Beta
191 |     3. Learn Feature weights through gradient descent
192 | 
193 |     Parameters
194 |     ----------
195 |     X : array
196 |         data set
197 |     mincols : int
198 |         minimum number of columns to return
199 |     sample_size : float
200 |         fraction of dataset to use in feature weight learning
201 |     threshold : float
202 |         minimum improvement below which learning stops
203 | 
204 |     Returns
205 |     -------
206 |     array
207 |         learned feature weights
208 | 
209 |     """
210 |     numsample = math.ceil(sample_size * X.shape[0])
211 |     sample = np.random.choice(X.shape[0], numsample, replace = False)
212 |     X_S = X  #X[sample] : row subsampling is currently disabled
213 |     d = pairwise_distances(X_S, X_S, metric = 'euclidean')
214 |     b = calc_beta(X_S, d)
215 |     w = return_weights(X_S, b, d, mincols, threshold, n, max_iter)
216 |     w = w.reshape(1,-1)
217 |     return w
218 | 
219 | 
220 | class c_means():
221 |     """Fuzzy c-means class
222 | 
223 |     Parameters
224 |     ----------
225 |     c : int
226 |         number of clusters
227 |     m : float
228 |         fuzzification index
229 |     max_iter : int
230 |         maximum iterations
231 |     threshold : float
232 |         threshold of improvement
233 | 
234 |     Attributes
235 |     ----------
236 |     cluster_centers : locations of the cluster centers (set by fit)
237 |     fuzzy_partition : fuzzy partition matrix (set by fit)
238 |     f_p_coeff : fuzzy partition coefficient (set by fit)
239 |     loss : loss value per iteration (set by fit)
240 | 
241 |     """
242 |     def __init__(self, c = 3, m = 2, max_iter = 1000, threshold = .01):
243 |         self.c_ = c
244 |         self.max_iter = max_iter
245 |         self.threshold = threshold
246 |         self.m = m
247 | 
248 |     def fit(self, X, weights, log_argmax = False):
249 |         """Performs clustering using the given weights
250 | 
251 |         Parameters
252 |         ----------
253 |         X : array
254 |             data to be clustered
255 |         weights : array
256 |             feature weights for the distance metric
257 | 
258 |         Returns
259 |         -------
260 |         None
261 |             results are stored as attributes of the class
262 | 
263 |         """
264 |         self.weights = weights
265 |         c = self.c_
266 |         m = self.m
267 |         self.u_ = np.empty((X.shape[0], c))
268 |         d = X.shape[1]
269 |         n = X.shape[0]
270 |         V = np.random.random((c, d))
271 |         u_0 = np.random.random((n, c))
272 |         J = np.zeros(0)
273 |         num_iter = 0
274 |         u = u_0
275 |         max_iter = self.max_iter
276 |         threshold = self.threshold
277 |         while num_iter < max_iter - 1:
278 |             u2 = u.copy()
279 |             V, u, Jm, d = update(X, u2, m, weights, n, c)
280 |             J = np.hstack((J, Jm))
281 |             num_iter += 1
282 | 
283 |             if np.linalg.norm(u - u2) < threshold:
284 |                 break
285 | 
286 |         self.error_improvement = np.linalg.norm(u - u_0)
287 |         self.f_p_coeff = np.trace(u.dot(u.T)) / float(n)  # Dunn's fuzzy partition coefficient
288 |         self.cluster_centers = V
289 |         self.fuzzy_partition = u
290 |         self.loss = J
291 | 
292 |         if log_argmax == True:
293 |             self.cluster = np.argmax(self.fuzzy_partition, axis = 1)  # hard cluster labels from the fuzzy partition matrix
294 | 
295 | 
296 | 
297 | 
--------------------------------------------------------------------------------