├── LIFT ├── README.md ├── sample_data.mat └── test.ipynb ├── README.md ├── SVD └── SearchEngine_HW3_SVD.ipynb └── cppKMeans ├── cppKMeans.cpp └── cppKMeans_input.txt /LIFT/README.md: -------------------------------------------------------------------------------- 1 | # About 2 | LIFT是一个multi-label learning的方法,阅读Min-Ling Zhang的关于LIFT的论文后,对该算法产生了基本的了解。 3 | 4 | 论文提到该方法在多标签学习上面能取得较好的效果,作者给出的参考代码语言使用的是matlab,单独的学习算法选择了svm,用到了libsvm。 5 | 6 | 为了个人应用该算法,参考了源码后我使用python重写了该算法,主要用到了sklearn库,单独的学习算法也可以选择sklearn的众多算法。 7 | 8 | 代码没有经过很多测试,如果出现错误,请联系我,感谢指教! 9 | 10 | 另外可以查看 [我blog上对该算法的总结](https://blog.csdn.net/timso1997/article/details/80920902) 11 | 12 | 13 | # Source 14 | Paper: 15 | 16 | LIFT: Multi-Label Learning with Label-Specific Features 17 | 18 | By Min-Ling Zhang 19 | 20 | [Paper download](http://cse.seu.edu.cn/PersonalPage/zhangml/files/IJCAI'11.pdf) 21 | 22 | [Code download](http://cse.seu.edu.cn/PersonalPage/zhangml/files/LIFT.rar) 23 | -------------------------------------------------------------------------------- /LIFT/sample_data.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tims13/MachineLearning/5102f79344e83da712c1e3409cc37179f07a0405/LIFT/sample_data.mat -------------------------------------------------------------------------------- /LIFT/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import math\n", 10 | "\n", 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import scipy.io as sio\n", 14 | "\n", 15 | "from sklearn.metrics import accuracy_score\n", 16 | "from sklearn.metrics import roc_auc_score\n", 17 | "\n", 18 | "from sklearn.cluster import KMeans\n", 19 | "from sklearn.svm import SVC\n", 20 | "\n", 21 | "from sklearn.model_selection import train_test_split" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# load mat\n", 31 | "mmat= sio.loadmat(\"sample_data.mat\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "test_data= mmat['test_data']\n", 41 | "test_target= mmat['test_target']\n", 42 | "train_data= mmat['train_data']\n", 43 | "train_target= mmat['train_target']\n", 44 | "\n", 45 | "train_target= train_target.T\n", 46 | "test_target= test_target.T" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "numpy.ndarray" 58 | ] 59 | }, 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "type(train_data)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "(400, 5)" 78 | ] 79 | }, 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "train_target.shape" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "Performing clustering:1/5\n", 99 | "Performing clustering:2/5\n", 100 | "Performing clustering:3/5\n", 101 | "Performing clustering:4/5\n", 102 | "Performing clustering:5/5\n", 103 | "Building classifiers: :1/5\n", 104 | "Building classifiers: :2/5\n", 105 | "Building classifiers: :3/5\n", 106 | "Building classifiers: :4/5\n", 107 | "Building classifiers: :5/5\n", 108 | "Predicting: :1/5\n", 109 | "The accuracy is: 0.880000\n", 110 | "Predicting: :2/5\n", 111 | "The accuracy is: 0.785000\n", 112 | "Predicting: :3/5\n", 113 | "The accuracy is: 0.700000\n", 114 | "Predicting: :4/5\n", 115 | "The accuracy is: 0.840000\n", 116 | "Predicting: :5/5\n", 117 | "The accuracy is: 0.835000\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "ratio= 0.1\n", 123 | "\n", 124 | "def mLIFT(train_data,train_target,test_data,test_target,ratio):\n", 125 | " num_train, dim= train_data.shape\n", 126 | " num_test, num_class= test_target.shape\n", 127 | " \n", 128 | " P_Centers= []\n", 129 | " N_Centers= []\n", 130 | " \n", 131 | " ##### KMeans, and save the centers\n", 132 | " for i in range(num_class):\n", 133 | " print (\"Performing clustering:%d/%d\" %(i+ 1, num_class))\n", 134 | " \n", 135 | " p_data= train_data[train_target[:,i]== 1]\n", 136 | " n_data= train_data[train_target[:,i]== -1]\n", 137 | " \n", 138 | " k1= int(min( math.ceil(p_data.shape[0]* ratio), math.ceil(n_data.shape[0]* ratio) ))\n", 139 | " #print(\"k1= k2= %d\" %k1)\n", 140 | " k2= k1;\n", 141 | " \n", 142 | " if(k1== 0):\n", 143 | " POS_C=[]\n", 144 | " zero_kmeans= KMeans(n_clusters= min(50, num_train)).fit(train_data)\n", 145 | " NEG_C= zero_kmeans.cluster_centers_\n", 146 | " else:\n", 147 | " # Positive\n", 148 | " if(p_data.shape[0]== 1):\n", 149 | " POS_C= p_data\n", 150 | " else:\n", 151 | " p_kmeans= KMeans(n_clusters= k1).fit(p_data)\n", 152 | " POS_C= p_kmeans.cluster_centers_\n", 153 | " # Negative\n", 154 | " if(n_data.shape[0]== 1):\n", 155 | " NEG_C= n_data\n", 156 | " else:\n", 157 | " n_kmeans= KMeans(n_clusters= k2).fit(n_data)\n", 158 | " NEG_C= n_kmeans.cluster_centers_\n", 159 | " \n", 160 | " # Save the cluster centers\n", 161 | " P_Centers.append(POS_C)\n", 162 | " N_Centers.append(NEG_C)\n", 163 | " \n", 164 | " #print(\"The size of P_Canters is %d\\n\" %len(P_Centers))\n", 165 | " \n", 166 | " ##### Do the map and save the models\n", 167 | " Models= []\n", 168 | " for i in range(num_class):\n", 169 | " print (\"Building classifiers: :%d/%d\" %(i+ 1, num_class))\n", 170 | " centers= np.vstack((P_Centers[i], N_Centers[i]))\n", 171 | " num_center= centers.shape[0]\n", 172 | " # print(num_center)\n", 173 | " data= []\n", 174 | " \n", 175 | " if(num_center>= 5000):\n", 176 | " print(\"Too many cluster center!\")\n", 177 | " break\n", 178 | " else:\n", 179 | " blocksize= 5000- num_center\n", 180 | " num_block= int(math.ceil(num_train/ blocksize))\n", 181 | " # print(num_block)\n", 182 | " \n", 183 | " mFirst= True\n", 184 | " for j in range(num_block- 1):\n", 185 | " print(j)\n", 186 | " low= j* blocksize\n", 187 | " high= (j+ 1)* blocksize\n", 188 | " # Calculate the distance\n", 189 | " for k in range(num_center):\n", 190 | " diff= train_data[low:high, :]- centers[k]\n", 191 | " Eu_diff= np.linalg.norm(diff, axis=1)\n", 192 | " if(mFirst== True):\n", 193 | " mFirst= False\n", 194 | " data_temp= Eu_diff\n", 195 | " else:\n", 196 | " data_temp= np.vstack((data_temp, Eu_diff))\n", 197 | " \n", 198 | " \n", 199 | " low= (num_block- 1)* blocksize\n", 200 | " high= num_train\n", 201 | " \n", 202 | " # Calculate the distance\n", 203 | " for j in range(num_center):\n", 204 | " diff= train_data[low:high,:]- centers[j]\n", 205 | " Eu_diff= np.linalg.norm(diff, axis=1)\n", 206 | " if(mFirst== True):\n", 207 | " mFirst= False\n", 208 | " data_temp= Eu_diff\n", 209 | " else:\n", 210 | " data_temp= np.vstack((data_temp, Eu_diff))\n", 211 | " \n", 212 | " data= data_temp.T\n", 213 | " \n", 214 | " training_instance_matrix= data\n", 215 | " training_label_vector= train_target[:,i]\n", 216 | " \n", 217 | " model_this= SVC(C= 10, probability=True).fit(training_instance_matrix, training_label_vector)\n", 218 | " #model_this= LogisticRegression(C= 0.03).fit(training_instance_matrix, training_label_vector)\n", 219 | " #model_this= DecisionTreeClassifier().fit(training_instance_matrix, training_label_vector)\n", 220 | " #model_this = AdaBoostClassifier(DecisionTreeClassifier(),\n", 221 | " #algorithm=\"SAMME\",\n", 222 | " #n_estimators=50, learning_rate=0.8).fit(training_instance_matrix, training_label_vector)\n", 223 | " Models.append(model_this)\n", 224 | " \n", 225 | " ##### Predict\n", 226 | " for i in range(num_class):\n", 227 | " print (\"Predicting: :%d/%d\" %(i+ 1, num_class))\n", 228 | " centers= np.vstack((P_Centers[i], N_Centers[i]))\n", 229 | " num_center= centers.shape[0]\n", 230 | " # print(num_center)\n", 231 | " data= []\n", 232 | " \n", 233 | " if(num_center>= 5000):\n", 234 | " print(\"Too many cluster center!\")\n", 235 | " break\n", 236 | " else:\n", 237 | " blocksize= 5000- num_center\n", 238 | " num_block= int(math.ceil(num_test/ blocksize))\n", 239 | " # print(num_block)\n", 240 | " \n", 241 | " mFirst= True\n", 242 | " for j in range(num_block- 1):\n", 243 | " print(j)\n", 244 | " low= j* blocksize\n", 245 | " high= (j+ 1)* blocksize\n", 246 | " # Calculate the distance\n", 247 | " for k in range(num_center):\n", 248 | " diff= test_data[low:high, :]- centers[k]\n", 249 | " Eu_diff= np.linalg.norm(diff, axis=1)\n", 250 | " if(mFirst== True):\n", 251 | " mFirst= False\n", 252 | " data_temp= Eu_diff\n", 253 | " else:\n", 254 | " data_temp= np.vstack((data_temp, Eu_diff))\n", 255 | " \n", 256 | " \n", 257 | " low= (num_block- 1)* blocksize\n", 258 | " high= num_train\n", 259 | " \n", 260 | " # Calculate the distance\n", 261 | " for j in range(num_center):\n", 262 | " diff= test_data[low:high,:]- centers[j]\n", 263 | " Eu_diff= np.linalg.norm(diff, axis=1)\n", 264 | " if(mFirst== True):\n", 265 | " mFirst= False\n", 266 | " data_temp= Eu_diff\n", 267 | " else:\n", 268 | " data_temp= np.vstack((data_temp, Eu_diff))\n", 269 | " \n", 270 | " data= data_temp.T\n", 271 | " # print(data.shape)\n", 272 | " \n", 273 | " testing_instance_matrix= data;\n", 274 | " testing_label_vector= test_target[:, i]\n", 275 | " \n", 276 | " predicted_label= Models[i].predict(testing_instance_matrix)\n", 277 | " \n", 278 | " #print (predicted_label)\n", 279 | "\n", 280 | " print(\"The accuracy is: %f\" %accuracy_score(testing_label_vector, predicted_label))\n", 281 | " #print(roc_auc_score(testing_label_vector, predicted_label))\n", 282 | " \n", 283 | " return 1\n", 284 | "\n", 285 | "\n", 286 | "mLIFT(train_data,train_target,test_data,test_target,ratio);" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 2", 300 | "language": "python", 301 | "name": "python2" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 2 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython2", 313 | "version": "2.7.14" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 1 318 | } 319 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MachineLearning 2 | 3 | LIFT: MinLing Zhang的多标签分类算法 4 | 5 | cppKMeans: Kmeans的cpp简单实现,HKUST读研时,DataMining课程的作业产物 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /SVD/SearchEngine_HW3_SVD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import math\n", 11 | "import nltk\n", 12 | "from nltk.corpus import stopwords \n", 13 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 14 | "from nltk.corpus import stopwords\n", 15 | "from nltk.stem import PorterStemmer\n", 16 | "from nltk import bigrams\n", 17 | "from nltk import FreqDist\n", 18 | "from nltk import ngrams\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "# Preprocess" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "[nltk_data] Downloading package punkt to /Users/tim/nltk_data...\n", 39 | "[nltk_data] Package punkt is already up-to-date!\n", 40 | "[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...\n", 41 | "[nltk_data] Package stopwords is already up-to-date!\n" 42 | ] 43 | }, 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "True" 48 | ] 49 | }, 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "nltk.download('punkt')\n", 57 | "nltk.download('stopwords')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "with open('./paper.json', 'r') as f:\n", 67 | " papers = json.load(f)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "papers_50 = papers[0:50]" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "50" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "len(papers_50)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "def get_stemmed_tokens(string_source):\n", 106 | " # word tokenize after lower the string\n", 107 | " word_tokens = word_tokenize(string_source.lower())\n", 108 | " # remove the punctuations\n", 109 | " word_tokens = [word.lower() for word in word_tokens if word.isalpha()]\n", 110 | " # stop words filter\n", 111 | " stop_words = set(stopwords.words('english'))\n", 112 | " filtered_tokens = [w for w in word_tokens if not w in stop_words]\n", 113 | " # stemming\n", 114 | " stemmer = PorterStemmer()\n", 115 | " stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]\n", 116 | " return stemmed_tokens" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "paper_token_freq = []\n", 126 | "tokens = set()\n", 127 | "for index in range(len(papers_50)):\n", 128 | " # print(index)\n", 129 | " paper_abstract = papers_50[index]['abstract']\n", 130 | " stemmed_tokens = get_stemmed_tokens(paper_abstract)\n", 131 | " token_freq = dict()\n", 132 | " # print(stemmed_tokens)\n", 133 | " # count the tokens in the stemmed_tokens\n", 134 | " for token in stemmed_tokens:\n", 135 | " tokens.add(token)\n", 136 | " if token in token_freq.keys():\n", 137 | " token_freq[token] = token_freq[token] + 1\n", 138 | " else:\n", 139 | " token_freq[token] = 1\n", 140 | " token_freq_sorted = sorted(token_freq.items(), key=lambda kv:kv[1], reverse=True)\n", 141 | " paper_token_freq.append(token_freq_sorted)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "('model', 5)" 153 | ] 154 | }, 155 | "execution_count": 10, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "paper_token_freq[0][0]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 11, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "token2index = dict()\n", 178 | "count = 0\n", 179 | "for token in tokens:\n", 180 | " token2index[token] = count\n", 181 | " count = count + 1" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 12, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "(1094, 50)" 193 | ] 194 | }, 195 | "execution_count": 12, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "# create the term-doc matrix\n", 202 | "\n", 203 | "term_doc_mat = np.zeros([len(tokens), len(papers_50)])\n", 204 | "term_doc_mat.shape" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 13, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "'model'" 216 | ] 217 | }, 218 | "execution_count": 13, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "paper_token_freq[0][0][0]" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 97, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "for paper_index in range(len(paper_token_freq)):\n", 234 | " # print(paper_index)\n", 235 | " for token_freq in paper_token_freq[paper_index]:\n", 236 | " # print(token_freq)\n", 237 | " token = token_freq[0]\n", 238 | " freq = token_freq[1]\n", 239 | " token_index = token2index[token]\n", 240 | " # print(paper_index, token, token_index, freq)\n", 241 | " term_doc_mat[token_index][paper_index] = freq" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "#### normalize" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 16, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "for c in range(term_doc_mat.shape[1]):\n", 258 | " l2_norm = np.linalg.norm(term_doc_mat[:,c])\n", 259 | " # print(c, l2_norm)\n", 260 | " term_doc_mat[:,c] = term_doc_mat[:,c]/l2_norm" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 19, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "array([0. , 0. , 0. , 0.17609018, 0. ,\n", 272 | " 0. , 0. , 0. , 0.15476465, 0. ,\n", 273 | " 0. , 0. , 0. , 0. , 0. ,\n", 274 | " 0. , 0. , 0. , 0. , 0. ,\n", 275 | " 0. , 0. , 0. , 0. , 0. ,\n", 276 | " 0. , 0. , 0.1062988 , 0. , 0.07495317,\n", 277 | " 0. , 0. , 0. , 0. , 0. ,\n", 278 | " 0. , 0.07018624, 0. , 0. , 0. ,\n", 279 | " 0. , 0. , 0. , 0. , 0. ,\n", 280 | " 0. , 0.58658846, 0. , 0. , 0. ])" 281 | ] 282 | }, 283 | "execution_count": 19, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "term_doc_mat[token2index['knowledg']]" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 20, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "array([0.37796447, 0. , 0. , 0.17609018, 0.41812101,\n", 301 | " 0.20412415, 0. , 0.21428571, 0.38691162, 0.15665209,\n", 302 | " 0. , 0.3592106 , 0. , 0. , 0.32929278,\n", 303 | " 0.28603878, 0.51116565, 0.12171612, 0.17087153, 0.31980107,\n", 304 | " 0.0751646 , 0.1069045 , 0.14704292, 0.21997067, 0.4 ,\n", 305 | " 0.19156526, 0.28074496, 0.6909422 , 0.42874646, 0.14990634,\n", 306 | " 0.29704426, 0.10482848, 0.09053575, 0.31799936, 0.4417261 ,\n", 307 | " 0. , 0.3509312 , 0.24174689, 0.24019223, 0.3125 ,\n", 308 | " 0.08421519, 0. , 0.29488391, 0.19900744, 0. ,\n", 309 | " 0.1833397 , 0.14664712, 0. , 0.38124643, 0. ])" 310 | ] 311 | }, 312 | "execution_count": 20, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "term_doc_mat[token2index['model']]" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "# SVD" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 26, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "u, s, vh = np.linalg.svd(term_doc_mat, full_matrices=False)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 27, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "array([2.86096186, 1.3653995 , 1.27032358, 1.22987015, 1.19443846,\n", 346 | " 1.17806692, 1.15387011, 1.12738649, 1.09528575, 1.07138045,\n", 347 | " 1.06540577, 1.05129034, 1.04110368, 1.01120723, 0.99593028,\n", 348 | " 0.98741442, 0.97725382, 0.9746004 , 0.96690456, 0.94757842,\n", 349 | " 0.93954294, 0.93568809, 0.92170802, 0.91380586, 0.89307389,\n", 350 | " 0.88637231, 0.88359827, 0.87342525, 0.85572452, 0.84734158,\n", 351 | " 0.8404466 , 0.83379732, 0.8260985 , 0.81362107, 0.79924362,\n", 352 | " 0.7853955 , 0.77452919, 0.77164883, 0.76060635, 0.74874168,\n", 353 | " 0.73608489, 0.72600944, 0.71404007, 0.70582593, 0.69096365,\n", 354 | " 0.686157 , 0.66801739, 0.66629895, 0.61695201, 0.59705741])" 355 | ] 356 | }, 357 | "execution_count": 27, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "s" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 28, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "(1094, 50) (50,) (50, 50)\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "print(u.shape, s.shape, vh.shape)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 36, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "u_2 = u[:,0:2]\n", 390 | "s_2 = s[0:2]\n", 391 | "vh_2 = vh[0:2,:]" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 41, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "data": { 401 | "text/plain": [ 402 | "(1094, 2)" 403 | ] 404 | }, 405 | "execution_count": 41, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "u_2.shape" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 42, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "array([2.86096186, 1.3653995 ])" 423 | ] 424 | }, 425 | "execution_count": 42, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "s_2" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 43, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "(2, 50)" 443 | ] 444 | }, 445 | "execution_count": 43, 446 | "metadata": {}, 447 | "output_type": "execute_result" 448 | } 449 | ], 450 | "source": [ 451 | "vh_2.shape" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 44, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "# reconstrut with rank 2\n", 461 | "term_doc_mat_2 = np.dot(u_2 * s_2, vh_2)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 45, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "array([ 0.00697243, -0.00811431, -0.00532235, -0.00197454, 0.01068318,\n", 473 | " 0.01161913, -0.00696591, 0.00165031, 0.01174326, 0.00252241,\n", 474 | " -0.00302361, 0.00626328, -0.00662105, -0.00510015, 0.00715214,\n", 475 | " 0.00948894, 0.01572016, 0.00134019, 0.00123861, 0.01114175,\n", 476 | " -0.00267239, 0.00181414, -0.00027136, 0.002495 , 0.01233359,\n", 477 | " -0.00246744, 0.00804274, 0.01885076, 0.01393888, 0.00703787,\n", 478 | " 0.00495371, -0.00535733, 0.00105639, 0.00753421, 0.00779568,\n", 479 | " -0.00420969, 0.00738203, 0.00787108, 0.00575727, 0.00886788,\n", 480 | " -0.00077753, -0.00350003, 0.0094726 , 0.00162726, -0.0034297 ,\n", 481 | " 0.00411899, 0.00239836, -0.00826835, 0.00897435, -0.00476588])" 482 | ] 483 | }, 484 | "execution_count": 45, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "term_doc_mat_2[0]" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 46, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/plain": [ 501 | "(1094, 50)" 502 | ] 503 | }, 504 | "execution_count": 46, 505 | "metadata": {}, 506 | "output_type": "execute_result" 507 | } 508 | ], 509 | "source": [ 510 | "term_doc_mat_2.shape" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "# Query" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 32, 523 | "metadata": {}, 524 | "outputs": [ 525 | { 526 | "data": { 527 | "text/plain": [ 528 | "['knowledg', 'graph']" 529 | ] 530 | }, 531 | "execution_count": 32, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "query = get_stemmed_tokens('knowledge graph')\n", 538 | "query" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 33, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "(1094,)" 550 | ] 551 | }, 552 | "execution_count": 33, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "query_vec = np.zeros([term_doc_mat_2.shape[0]])\n", 559 | "query_vec.shape" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 34, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "query_vec[token2index['knowledg']] = 1.0\n", 569 | "query_vec[token2index['graph']] = 1.0" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 35, 575 | "metadata": {}, 576 | "outputs": [ 577 | { 578 | "name": "stdout", 579 | "output_type": "stream", 580 | "text": [ 581 | "1.0 1.0\n" 582 | ] 583 | } 584 | ], 585 | "source": [ 586 | "print(query_vec[token2index['knowledg']], query_vec[token2index['graph']])" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "# Mapping query to rank-2 space" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 49, 606 | "metadata": {}, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/plain": [ 611 | "array([[2.86096186, 0. ],\n", 612 | " [0. , 1.3653995 ]])" 613 | ] 614 | }, 615 | "execution_count": 49, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "s_2_mat = np.diag(s_2)\n", 622 | "s_2_mat" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 50, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "array([[0.3495328 , 0. ],\n", 634 | " [0. , 0.73238638]])" 635 | ] 636 | }, 637 | "execution_count": 50, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "# compute the inv of s_2\n", 644 | "s_inv = np.linalg.inv(s_2_mat)\n", 645 | "s_inv" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "u d t(v)" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 53, 660 | "metadata": {}, 661 | "outputs": [ 662 | { 663 | "data": { 664 | "text/plain": [ 665 | "array([-0.0371808 , -0.04075829])" 666 | ] 667 | }, 668 | "execution_count": 53, 669 | "metadata": {}, 670 | "output_type": "execute_result" 671 | } 672 | ], 673 | "source": [ 674 | "query_vec_2 = np.dot(np.dot(query_vec.T, u_2), s_inv)\n", 675 | "query_vec_2" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "# Compute cos" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 56, 695 | "metadata": {}, 696 | "outputs": [ 697 | { 698 | "data": { 699 | "text/plain": [ 700 | "(2, 50)" 701 | ] 702 | }, 703 | "execution_count": 56, 704 | "metadata": {}, 705 | "output_type": "execute_result" 706 | } 707 | ], 708 | "source": [ 709 | "vh_2.shape" 710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": 67, 715 | "metadata": {}, 716 | "outputs": [ 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "(50,)" 721 | ] 722 | }, 723 | "execution_count": 67, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "# compute inner product\n", 730 | "np.dot(query_vec_2, vh_2).shape" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 57, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "text/plain": [ 741 | "(50,)" 742 | ] 743 | }, 744 | "execution_count": 57, 745 | "metadata": {}, 746 | "output_type": "execute_result" 747 | } 748 | ], 749 | "source": [ 750 | "np.dot(query_vec_2, vh_2).shape" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 58, 756 | "metadata": {}, 757 | "outputs": [ 758 | { 759 | "data": { 760 | "text/plain": [ 761 | "(2,)" 762 | ] 763 | }, 764 | "execution_count": 58, 765 | "metadata": {}, 766 | "output_type": "execute_result" 767 | } 768 | ], 769 | "source": [ 770 | "query_vec_2.shape" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 61, 776 | "metadata": {}, 777 | "outputs": [ 778 | { 779 | "data": { 780 | "text/plain": [ 781 | "array([-0.0371808 , -0.04075829])" 782 | ] 783 | }, 784 | "execution_count": 61, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "query_vec_2" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 60, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "data": { 800 | "text/plain": [ 801 | "0.05516928648255209" 802 | ] 803 | }, 804 | "execution_count": 60, 805 | "metadata": {}, 806 | "output_type": "execute_result" 807 | } 808 | ], 809 | "source": [ 810 | "# compute l2_norm of query\n", 811 | "query_vec_2_norm = np.linalg.norm(query_vec_2)\n", 812 | "query_vec_2_norm" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 66, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "data": { 822 | "text/plain": [ 823 | "array([0.20174977, 0.27456174, 0.22248241, 0.24983308, 0.2174912 ,\n", 824 | " 0.22078464, 0.25952688, 0.14012625, 0.22746821, 0.1325822 ,\n", 825 | " 0.14369121, 0.17456237, 0.21062667, 0.21443231, 0.18055046,\n", 826 | " 0.17967539, 0.29752757, 0.08367503, 0.15293573, 0.21410344,\n", 827 | " 0.14629374, 0.1018657 , 0.20775419, 0.17301232, 0.23878486,\n", 828 | " 0.23394927, 0.16703174, 0.3567018 , 0.26609081, 0.14403617,\n", 829 | " 0.15051523, 0.25865527, 0.16531627, 0.19507977, 0.185571 ,\n", 830 | " 0.14652253, 0.22480121, 0.15222304, 0.12488625, 0.16981437,\n", 831 | " 0.18091916, 0.15457465, 0.17922928, 0.15072183, 0.1792327 ,\n", 832 | " 0.1265568 , 0.13909051, 0.30391799, 0.20642174, 0.20569964])" 833 | ] 834 | }, 835 | "execution_count": 66, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "# compute l2_norm of vh_2\n", 842 | "vh_2_norm = np.linalg.norm(vh_2, axis=0)\n", 843 | "vh_2_norm" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 69, 849 | "metadata": {}, 850 | "outputs": [ 851 | { 852 | "data": { 853 | "text/plain": [ 854 | "(50,)" 855 | ] 856 | }, 857 | "execution_count": 69, 858 | "metadata": {}, 859 | "output_type": "execute_result" 860 | } 861 | ], 862 | "source": [ 863 | "(query_vec_2_norm * vh_2_norm).shape" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [] 872 | }, 873 | { 874 | "cell_type": "code", 875 | "execution_count": 70, 876 | "metadata": {}, 877 | "outputs": [], 878 | "source": [ 879 | "# compute cos\n", 880 | "similarities = np.dot(query_vec_2, vh_2) / (query_vec_2_norm * vh_2_norm)" 881 | ] 882 | }, 883 | { 884 | "cell_type": "code", 885 | "execution_count": 71, 886 | "metadata": {}, 887 | "outputs": [ 888 | { 889 | "data": { 890 | "text/plain": [ 891 | "array([ 0.70514207, -0.49991428, -0.38925342, -0.08012003, 0.95290003,\n", 892 | " 0.98686457, -0.44644639, 0.29037549, 0.98937496, 0.42425238,\n", 893 | " -0.33310164, 0.7284993 , -0.53717996, -0.38655397, 0.79391288,\n", 894 | " 0.99952375, 0.99554765, 0.36886327, 0.22187839, 0.99441873,\n", 895 | " -0.27925277, 0.40189678, 0.04516554, 0.33937997, 0.98970219,\n", 896 | " -0.1306334 , 0.93762527, 0.99656906, 0.99799914, 0.94886865,\n", 897 | " 0.6758419 , -0.32668208, 0.18993838, 0.77663652, 0.83528603,\n", 898 | " -0.48366097, 0.67452861, 0.99041457, 0.90427789, 0.9964103 ,\n", 899 | " -0.01150859, -0.36427163, 0.99755822, 0.27214636, -0.29607251,\n", 900 | " 0.66933092, 0.39147694, -0.45362384, 0.86030206, -0.3745371 ])" 901 | ] 902 | }, 903 | "execution_count": 71, 904 | "metadata": {}, 905 | "output_type": "execute_result" 906 | } 907 | ], 908 | "source": [ 909 | "similarities" 910 | ] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | "# Sort and rank" 917 | ] 918 | }, 919 | { 920 | "cell_type": "code", 921 | "execution_count": 72, 922 | "metadata": {}, 923 | "outputs": [ 924 | { 925 | "data": { 926 | "text/plain": [ 927 | "array([15, 28, 42, 27, 39, 16, 19, 37, 24, 8, 5, 4, 29, 26, 38, 48, 34,\n", 928 | " 14, 33, 11, 0, 30, 36, 45, 9, 21, 46, 17, 23, 7, 43, 18, 32, 22,\n", 929 | " 40, 3, 25, 20, 44, 31, 10, 41, 49, 13, 2, 6, 47, 35, 1, 12])" 930 | ] 931 | }, 932 | "execution_count": 72, 933 | "metadata": {}, 934 | "output_type": "execute_result" 935 | } 936 | ], 937 | "source": [ 938 | "(-similarities).argsort()" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [] 947 | }, 948 | { 949 | "cell_type": "markdown", 950 | "metadata": {}, 951 | "source": [ 952 | "# Plot" 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": 74, 958 | "metadata": {}, 959 | "outputs": [ 960 | { 961 | "data": { 962 | "text/plain": [ 963 | "((1094, 2), (2, 50), (2,))" 964 | ] 965 | }, 966 | "execution_count": 74, 967 | "metadata": {}, 968 | "output_type": "execute_result" 969 | } 970 | ], 971 | "source": [ 972 | "u_2.shape, vh_2.shape, query_vec_2.shape" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 80, 978 | "metadata": {}, 979 | "outputs": [ 980 | { 981 | "data": { 982 | "text/plain": [ 983 | "(50,)" 984 | ] 985 | }, 986 | "execution_count": 80, 987 | "metadata": {}, 988 | "output_type": "execute_result" 989 | } 990 | ], 991 | "source": [ 992 | "vh_2[0,:].shape" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": null, 998 | "metadata": {}, 999 | "outputs": [], 1000 | "source": [] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 76, 1005 | "metadata": {}, 1006 | "outputs": [], 1007 | "source": [ 1008 | "import matplotlib.pyplot as plt" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 96, 1014 | "metadata": {}, 1015 | "outputs": [ 1016 | { 1017 | "data": { 1018 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWaElEQVR4nO3dbawc133f8e8vpB2nSWyJESVRlli6AJFIoWLZJiSlDgLbEgOZDSK9KAoJjUMULqgH2nXQFKkMtUHzooBeFIEdlHRNOG4YJIgbJHZFuGocibFhGElsUbGsB9MyFdeGCbKiIjlOigJyJP/7YnfL1Wr37sPs3b135/sBFjuzc7jnnHt5fzt75sxMqgpJ0ur7gWU3QJK0GAa+JLWEgS9JLWHgS1JLGPiS1BJbl92AtVxyySW1a9euZTdDkjaNRx999K+ravuwbRs68Hft2sXJkyeX3QxJ2jSSfGvUNod0JKklDHxJagkDX5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAlzTaoUOwdWvnWZuegS9ptI9+FF5+ufOsTc/AlzTanXfCli2dZ2162ch3vNq7d295aQVJmlySR6tq77Bt7uFL2tg8jjA3Br6kjc3jCHMzl8BPckuSp5M8k+TeIdv/eZLHu48/S/LmedQrqQU8jjA3jcfwk2wBvg7sA84AjwB3VNVX+8r8Y+BUVX0nybuB/1BVN4x7b8fwJWk66z2Gfz3wTFV9o6q+B3wCuLW/QFX9WVV9p7v6F8CVc6hXkjSFeQT+G4Fv962f6b42ynuB/zmHeiVJU5jHHa8y5LWh40RJ3kkn8H9m5JslB4GDADt37pxD8yRJMJ89/DPAVX3rVwJnBwsl+SngY8CtVfX8qDerqqNVtbeq9m7fPvS2jJKkGcwj8B8Bdid5U5LXArcDx/sLJNkJfBJ4T1V9fQ51SpKm1Djwq+ol4H3AZ4BTwB9U1VNJ7kpyV7fYrwE/BhxJ8lgSp95Ii+KJS+ry0grSqtu6tXPi0pYt8NJLy27NxnfoUOckrzvvhMOHl92aqXlpBanNPHFpOit8Zq+BL626w4c7e/abcG91KVb4A9IhHUlaIQ7pSJIMfElqCwNfklrCwJe02jwP4f8z8CWtthWeZjktA1/SalvhaZbTclqmJK0Qp2VK0rRWcOzfwJekYVZw7N/Al6RhVnDs3zF8SVohjuFLkgx8SZrZJjuwa+BL0qw22YFdA1+SZrXJDuzOJfCT3JLk6STPJLl3yPafSPLnSV5M8m/mUackLcyooZtNdnOZxoGfZAtwGHg3cA1wR5JrBoq9APwr4D81rU+SFm6TDd2MMo89/OuBZ6rqG1X1PeATwK39BarqfFU9Avz9HOqTpMXaZEM3o8wj8N8IfLtv/Uz3NUlaDfMculnizJ55BH6GvDbz2VxJDiY5meTkc88916BZkrRAkwb5EoeH5hH4Z4Cr+tavBM7O+mZVdbSq9lbV3u3btzdunCStm/6QnyTIDx3qlAG4+urFtLHPPAL/EWB3kjcleS1wO3B8Du8rabPZZCciNdYf8r1x/quvHv0z6P8wOHVqce3sahz4VfUS8D7gM8Ap4A+q6qkkdyW5CyDJ5UnOAP8a+HdJziR5fdO6JS3BWqG+2WezTPuB1X8w9/DhzvOTT47+Gdx5JyQXlhfMi6dJms7WrZ1A27KlcyCzX29ooxeAy9CkDWv1bZp/D3DPPZ3nBf88vHiapPlZa4riRjgRqcm3jN64+qzj672fzT33dH4GG+wbj4EvaTobIdTX0mTOfG9cfdbx9cGfzWBbBoeMrr22M8Rz7bWz1TclA1/Sckw6Xj7tuHqTD6RpPyzGta2/LYcOwZEjr9zjf/LJVz6vM8fwJS3HpOPlTcfV19M0bRs2vn/kSOd5zx544om5NMkxfEkbz6R70xv5sgaTtu3QIfj+9zvL/eP70BnSOXVqIVNZDXxJyzHp0MusQzTzOidg2Pv0XoO129Yr95GPQFXnw6F/fD/pvL6gA7sO6UhaTfMaChr2PsNeGzYdtH8YBy4M3fSfmdvT2/NvyCEdSe3T5ABs//Kw9+l/rX8vvn9PvX8Yp+fJJzszcnoHb+HCiVgf+cj6D+tU1YZ9vO1tbytJG8A991Rt2dJ53mwmbfuWLVXQee5fHvc+vbJQlVzY3ns9ubB9kkfDnzFwskZk6tJDfa2HgS9tEMMCsIlFfoD0B+9adfa3aVj7Rr3PPfdcCOvez6f32rRhP4ef8VqB75COpPHmPVNmkWeg9to+7uDouIPDvbNvB9/n8OHO+Hv/z6c33bKqM24/iQVcY8fAlzTevM+uXeRUy17bB0N5LcM+kHpn3yavfp/ehdOOHIEfGIjVSU+qqvWfQGPgS1qsZV1grRf8MH665loHan/yJ4f/m96HQ9Pg7n07WAdOy5S0WMs+c3ZeV8Qc/Pe9SyckcPfdr552OY0Guey0TEkbx7LPnJ2m/mEnXY26IBp0gvruu1857XIWGXbn2Obcw5ekUSb5NjBYZvBkq1nNmM3u4UvSLEZ9G+jf8+/N3nnDGzqvDbuW/rZt69/WCRj4ktqjyaWWR92wvDd754UXOq+dOnVhVn3vqpgvvDB9W9dhWGcugZ/kliRPJ3kmyb1DtifJb3a3P57krfOoV5LGGhbUR45MdxmDwWvZ9+/595b37Hn15RbWccbNLBqP4SfZAnwd2AecAR4B7qiqr/aV2Q+8H9gP3AB8uKpuGPfejuFLaqx/jL03Vx4mn6Vz7bWvnEu/Z09nL37UtNLB8k3MkM/rPYZ/PfBMVX2jqr4HfAK4daDMrcDvdM/8/QvgoiQ75lC3JK2tf2982FmxgwaHffrDe9u2zvrgRdJGld9g5rGH/0+BW6rqX3bX3wPcUFXv6yvzaeD+qvpCd/0E8G+ras3dd/fwJS1c/zeCq68eHeC9yxn3j7VXvXJ9z55mHwAbcA9/2JGFwVZOUqZTMDmY5GSSk88991zjxkna5OZ1I5NJ9X8jGAzrbds62/qvXd8L+GEHWZ94olN+g5hH4J8BrupbvxI4O0MZAKrqaFXtraq927dvn0PzJG1qi7zQGrxyZs7ghc+++91XX1Po7rsvLF977YXlH/zBzofAEs62HWUegf8IsDvJm5K8FrgdOD5Q5jjwS93ZOjcC362qc3OoW9KqW+aZuU88cWF65ag2HD584Wqc/d8IXnxxce2c0FzOtO3OwvkQsAX4eFX9xyR3AVTVf0kS4D8DtwD/F/gX48bvwTF8SRtcb6rnG97QmWu/bdtsc+6HWYczbb20giSNM+oKn5NeRmGWDwIvrSBJSzB4HKF3IHnYZRR6ks5QUHIh7Ce9Gco67Ygb+JI0zuBxhN4HwKlTFy6f0NM747Z3ieT+8J5kiuY6jroY+JI0Sv+lj/tn5ww7mQs6e/M/+7Od13tj+9OY9BvAjBzDl6RRprlZSn9ZmH465p49nVlBDTmGL6ndZj15a5opocMuqDbsssijLpU8h7Afxz18SatvWbdVHJzF09uL7z8rt39oaA7cw5fUTv2zaUbtqc/r0g1r3Q7xnns6B2N7e/G9vfxt21595u46cg9f0uqa5RaFo4yaiz/qfdYqv47fONzDl9ROk4zBTzpOP+6aPoPv07thypEj42+EviAGvqTV1X8htCZlYHhIrzUc1D9O3wv+a68dPs1zQRzSkaRZDZuKOTikM+ya+ut48NghHUkaNI+DtcOmYva+AfS+OTzxxIWrbfbf93YJ3MOX1E7jDpyOO0i7QbmHL0mDe/TjDpwu+sYrC2DgS2qHwQAfd7B2mTdeWScGvqR2mDbAJ52902/R99+dkmP4kjQvy7qEQx/H8CVpETb4MFCjwE+yLclDSU53ny8eUe7jSc4nmeDq/5K0Sc0yDLRATffw7wVOVNVu4ER3fZjfpnMDc0nSkjQN/FuBY93lY8BtwwpV1eeBOd3KXZI0i6aBf1lVnQPoPl/avEmSpPWwdVyBJA8Dlw/ZdN/8mwNJDgIHAXbu3LkeVUhSK40N/Kq6edS2JM8m2VFV55LsAM43bVBVHQWOQmdaZtP3kyR1NB3SOQ4c6C4fAB5o+H6SpHXSNPDvB/YlOQ3s666T5IokD/YKJfl94M+BH09yJsl7G9YrSZrS2CGdtVTV88BNQ14/C+zvW7+jST2SpOY801aSWsLAl6SWMPAlqSUMfElqCQNfklrCwJe0Wjb4TUiWycCXtFpW8F6082LgS1otG/wmJMvkLQ4laYV4i0NJkoEvSW1h4EtSSxj4ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLVEo8BPsi3JQ0lOd58vHlLmqiSfTXIqyVNJPtCkTknSbJru4d8LnKiq3cCJ7vqgl4BfqaqrgRuBQ0muaVivJGlKTQP/VuBYd/kYcNtggao6V1V/2V3+O+AU8MaG9UqSptQ08C+rqnPQCXbg0rUKJ9kFvAX4YsN6JUlT2jquQJKHgcuHbLpvmoqS/AjwR8AvV9XfrlHuIHAQYOfOndNUIUlaw9jAr6qbR21L8mySHVV1LskO4PyIcq+hE/a/V1WfHFPfUeAodC6PPK59kqTJNB3SOQ4c6C4fAB4YLJAkwG8Bp6rqNxrWJ0maUdPAvx/Yl+Q0sK+7TpIrkjzYLfN24D3Au5I81n3sb1ivJGlKY4d01lJVzwM3DXn9LLC/u/wFIE3qkSQ155m2ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLWEgS9JLWHgS1JLGPiS1BIGviS1hIEvSS1h4EtSSxj4ktQSBr4ktYSBL0ktYeBLUksY+JLUEga+JLVEo8BPsi3JQ0lOd58vHlLmdUm+lOQrSZ5K8utN6pQkzabpHv69wImq2g2c6K4PehF4V1W9GbgOuCXJjQ3rlSRNqWng3woc6y4fA24bLFAd/6e7+pruoxrWK0maUtPAv6yqzgF0ny8dVijJliSPAeeBh6rqiw3rlSRNaeu4AkkeBi4fsum+SSupqpeB65JcBHwqyZ6qenJEfQeBgwA7d+6ctApJ0hhjA7+qbh61LcmzSXZU1bkkO+jswa/1Xn+T5HPALcDQwK+qo8BRgL179zr0I0lz0nRI5zhwoLt8AHhgsECS7d09e5L8EHAz8LWG9UqSptQ08O8H9iU5DezrrpPkiiQPdsvsAD6b5HHgETpj+J9uWK8kaUpjh3TWUlXPAzcNef0ssL+7/Djwlib1SJKa80xbSWoJA1+SWsLAl6SWMPAlqSUMfElqCQNfklrCwJekljDwJaklDHxJagkDX5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAl6SWMPAlqSUMfElqCQNfklqiUeAn2ZbkoSSnu88Xr1F2S5IvJ/F+tpK0BE338O8FTlTVbuBEd32UDwCnGtYnSZpR08C/FTjWXT4G3DasUJIrgX8CfKxhfZKkGTUN/Muq6hxA9/nSEeU+BPwq8P2G9UmSZrR1XIEkDwOXD9l03yQVJPl54HxVPZrkHROUPwgcBNi5c+ckVUiSJjA28Kvq5lHbkjybZEdVnUuyAzg/pNjbgV9Ish94HfD6JL9bVb84or6jwFGAvXv31iSdkCSN13RI5zhwoLt8AHhgsEBVfbCqrqyqXcDtwJ+OCntJ0vppGvj3A/uSnAb2dddJckWSB5s2TpI0P2OHdNZSVc8DNw15/Sywf8jrnwM+16ROSdJsPNNWklrCwJekljDwJaklDHxJagkDX5JawsCXpJYw8CWpJQx8SWoJA1+SWsLAl6SWMPAlqSUMfElqCQNfklrCwJekljDwJaklDHxJagkDX5JawsCXpJZodIvDJNuA/wbsAr4J/LOq+s6Qct8E/g54GXipqvY2qVeSNL2me/j3Aieqajdwors+yjur6jrDXpKWo2ng3woc6y4fA25r+H6SpHXSNPAvq6pzAN3nS0eUK+BPkjya5GDDOsc7dAi2bu08S5IASFWtXSB5GLh8yKb7gGNVdVFf2e9U1cVD3uOKqjqb5FLgIeD9VfX5EfUdBA4C7Ny5823f+ta3Ju3LBVu3wssvw5Yt8NJL0/97Sdqkkjw6auh87B5+Vd1cVXuGPB4Ank2yo1vJDuD8iPc4230+D3wKuH6N+o5W1d6q2rt9+/bxvRvmzjs7YX/nnbP9e0laQU2HdI4DB7rLB4AHBgsk+eEkP9pbBn4OeLJhvWs7fLizZ3/48LpWI0mbSdPAvx/Yl+Q0sK+7TpIrkjzYLXMZ8IUkXwG+BPyPqvrjhvVKkqbUaB5+VT0P3DTk9bPA/u7yN4A3N6lHktScZ9pKUksY+JLUEga+JLWEgS9JLWHgS1JLjD3TdpmSPAdMeqrtJcBfr2NzFm2V+rNKfYHV6o992bhm7c8/rKqhZ61u6MCfRpKTq3QlzlXqzyr1BVarP/Zl41qP/jikI0ktYeBLUkusUuAfXXYD5myV+rNKfYHV6o992bjm3p+VGcOXJK1tlfbwJUlrMPAlqSU2beAn2ZbkoSSnu8+vutNWt9xFSf4wydeSnEry04tu6ySm6M83kzyR5LEkJxfdzklM2pdu2S1Jvpzk04ts4zQm6U+S1yX5UpKvJHkqya8vo63jTNiXq5J8tvv38lSSDyyjreNM8Tfz8STnk6zvfThmkOSWJE8neSbJvUO2J8lvdrc/nuStTerbtIEP3AucqKrdwInu+jAfBv64qn6CzmWaTy2ofdOatD8A76yq6zbwnONp+vIBNu7vpGeS/rwIvKuq3gxcB9yS5MbFNXFik/TlJeBXqupq4EbgUJJrFtjGSU36/+y3gVsW1ahJJdkCHAbeDVwD3DHk5/xuYHf3cRD4SKNKq2pTPoCngR3d5R3A00PKvB74X3QPTm/kxyT96W77JnDJsts7p75cSecP9V3Ap5fd7qb96Sv/D4C/BG5Ydtub9qVb7gFg37Lb3qQvwC7gyWW3eaBNPw18pm/9g8AHB8p8FLhjWJ9neWzmPfzLquocQPf50iFl/hHwHPBfu8MGH+veZnEjmqQ/AAX8SZJHuzd834gm7cuHgF8Fvr+gds1qov50h6ceo3Nv54eq6ouLa+LEJv3dAJBkF/AWYNP3ZQN6I/DtvvUz3demLTOxRne8Wm9JHgYuH7LpvgnfYivwVuD9VfXFJB+m87Xv38+piVOZQ38A3l5VZ5NcCjyU5GtV9fn5tHByTfuS5OeB81X1aJJ3zLFpM5nH76aqXgauS3IR8Kkke6pq4ePGc/p/RpIfAf4I+OWq+tt5tG1a8+rLBpUhrw3Ok5+kzMQ2dOBX1c2jtiV5NsmOqjqXZAedvapBZ4AzfXtaf8ja48nrag79oTq3j6Sqzif5FHA9sPDAn0Nf3g78QpL9wOuA1yf53ar6xXVq8prm8bvpe6+/SfI5OuPGCw/8efQlyWvohP3vVdUn16mpY83z97IBnQGu6lu/Ejg7Q5mJbeYhnePAge7yATrjjK9QVf8b+HaSH+++dBPw1cU0b2pj+5Pkh5P8aG8Z+DmWECgTmOR388GqurKqdgG3A3+6rLCfwCS/m+3dPXuS/BBwM/C1RTVwCpP0JcBvAaeq6jcW2LZpje3LBvcIsDvJm5K8ls7fwfGBMseBX+rO1rkR+G5vGGsmyz5w0eCAx4/ROeB3uvu8rfv6FcCDfeWuA04CjwP/Hbh42W2ftT90jkl8pft4Crhv2e1u8rvpK/8ONvZB20l+Nz8FfLn7/+xJ4NeW3e4GffkZOsMGjwOPdR/7l932Wf+fAb8PnAP+ns4e83uX3fa+tu0Hvg78Ve/vGbgLuKu7HDozef4KeALY26Q+L60gSS2xmYd0JElTMPAlqSUMfElqCQNfklrCwJekljDwJaklDHxJaon/B+KVEH5fjE33AAAAAElFTkSuQmCC\n", 1019 | "text/plain": [ 1020 | "
" 1021 | ] 1022 | }, 1023 | "metadata": { 1024 | "needs_background": "light" 1025 | }, 1026 | "output_type": "display_data" 1027 | } 1028 | ], 1029 | "source": [ 1030 | "plt.scatter(u_2[:,0], u_2[:,1], c='red', s=4)\n", 1031 | "plt.show()" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": 95, 1037 | "metadata": {}, 1038 | "outputs": [ 1039 | { 1040 | "data": { 1041 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbmElEQVR4nO3dbXBc1Z3n8e/fso0gQzKDEcTBdiR2HIi9so3dRfGY2ICNcWVsHiprDJl1cCiXE8RmakOBU97iKXnB7NaS7AbvUJohPGwRx5TBgypAwWDsnTiGwRLRALbxEyHQi8sWZjaEMgbL+u+LbimN6JbUure7T9/7+1Sp1N33dt+jq6ufzj33nHPN3RERkeQbU+sCiIhIdSjwRURSQoEvIpISCnwRkZRQ4IuIpMTYWhdgKKeeeqo3NzfXuhgiInWjq6vrPXdvKrYs6MBvbm6ms7Oz1sUQEakbZvb7UstiadIxs4VmttvM9pnZ6iLLl5jZq2bWbWadZnZRHNsVEZGRi1zDN7MGYC0wH8gC282sw913Fqy2CehwdzezGcBjwNlRty0iIiMXRw3/XGCfu7/p7p8AvwSWFK7g7h/6n4b0fg7Q8F4RkSqLI/DPAN4peJ7Nv/YpZnaVmb0BPAWsKPVhZrYy3+zT2dPTE0PxREQE4gl8K/LaZ2rw7r7R3c8GrgR+VOrD3L3d3TPunmlqKnqhWURERiGOwM8CkwueTwLeLbWyu/8z8O/M7NQYti0iIiMUR+BvB6aaWYuZjQeuBToKVzCzvzQzyz+eDYwHDsewbREp0NcHBw+CJsGVYiIHvrv3Am3As8Au4DF332Fmq8xsVX61a4DXzaybXI+epa55mUVi1dcH8+bBpEkwd27uuUghCzl3M5mMa+CVyMgcPJgL+95eGDsWslk4/fRal0qqzcy63D1TbJnm0hFJiNNOgwsuyIX9BRfknosUCnpqBREZOTPYvBl6enJhb8X6z0mqKfBFEmTMGDXjSGlq0hERSQkFvohISijwRaQqNEag9hT4IlJxGiMQBgW+iFRcTw9s25YbI7BtW+65VJ8CX0QqTmMEwqBumSJScRojEAYFvohUhcYI1J6adEREUkKBLyLDUpfKZFDgi8iQ1KUyORT4IjIkdalMDgW+iAxJXSqTQ710RGRI6lKZHKrhi1RYEi549nepVNjXNwW+SAXpgqeERIEvUkG64CkhUeCLVJAueEpIdNFWpIJ0wVNCosAXqTDNISOhUJOOiEhKKPBFRFJCgS8ikhKxBL6ZLTSz3Wa2z8xWF1l+vZm9mv/aZmYz49iuiFRPEgaQlStpP3PkwDezBmAtcAUwDVhmZtMGrfY74OvuPgP4EdAedbsiUj1pHECWxJ85jhr+ucA+d3/T3T8BfgksKVzB3be5+7/ln74ETIphuyJSJWkcQJbEnzmOwD8DeKfgeTb/WinfAZ4ptdDMVppZp5l19iRhD4skQLEBZElr7hgsiYPm4gj8YkNJih4CZjaPXODfVurD3L3d3TPunmlqaoqheCISVf8AsmwWtmzJhXzSmjsGG/wzJ2HQXByBnwUmFzyfBLw7eCUzmwH8A7DE3Q/HsF0RqaLCGTOT2NxRTNJmCY0j8LcDU82sxczGA9cCHYUrmNkU4Angr919TwzbFJEaKmzuOP/8XI0/qU07SRI58N29F2gDngV2AY+5+w4zW2Vmq/Kr3Q5MAP6XmXWbWWfU7UrtJL3tVobX39zx9tu5x5MnJ7dpJ0nMA/6rzWQy3tmp/w0h6e+qtm1broa3eXPutLfaZdBkZGE4eDDXjt/bm6vtZ7OaN6jWzKzL3TPFlmmkrZSl1m23SewbXc+S2JMlyRT4UpZa/4HX+h+OfFoSe7IkmQJfylLrP/Ba/8ORz0paT5Yk03z4UrZazu+uG4qIjJ4CX+qObigiMjpq0hERSQkFvohISijwRURSQoEvImXTaOv6pMAXkbJo8Fv9UuCLSFk0+K1+KfBFpCwa/Fa/1A9fRMqiwW/1SzV8ESlbiNMphHIhOZRyFKPAF5G6F8qF5FDKUYoCX2oi5FqQ1J9QLiSHUo5SFPhSdaHXgqT+hHIhOZRylKKLtlJ1xWpBmgxNojCDTZvgjTdg2rTaXVsI/YK2avhSdaHXgkpRM1T1lLuv+/rg0kvhnHNyZ4+1PGsM8YJ2PwW+VF2tb6IyGmqGqp7R7OvQ285DocCXmgi5FlSMAqV6RrOv6/WssdoU+CIjoECpntHs63o8a6wFXbQVGYHQL8YlyWj3te6ENjwFvsgIKVCqR/u6MtSkIyKSErEEvpktNLPdZrbPzFYXWX62mb1oZh+b2S1xbFNERMoTuUnHzBqAtcB8IAtsN7MOd99ZsNr7wH8Croy6PRERGZ04avjnAvvc/U13/wT4JbCkcAV3P+Tu24FjMWxPRERGIY7APwN4p+B5Nv+aiIgEJI7AL9ZpatSDz81spZl1mllnj0a3iIjEJo7AzwKTC55PAt4d7Ye5e7u7Z9w909TUFLlwIiKSE0fgbwemmlmLmY0HrgU6YvhckYrQJGiSVpED3917gTbgWWAX8Ji77zCzVWa2CsDMvmhmWeA/A//FzLJm9vmo2xYplyZBkzQzD7iak8lkvLOzs9bFkAQ5eDAX9r29ublaslmN6JRkMbMud88UW6aRtpIqmgRN0kxz6UiqjHZirr4+TZwm9U81fEmdcufiV7u/JIUCX2QYuvmJJIUCX2qiHrpG9pexqUnt/pIMCnypunpoIiks47x5sGmT7qYk9U+BL1VXD00kg8t4+PDI2v3r4cxF0kuBL1VXD10jR1PGejhzkXRTt0ypunq4P+xoyljszEWDuiQkquFLTZTbNbIWyi1jPZy5SLqphi8Sk3o4c5F0Uw1fpExDXZithzMXSS8FvkgZdGFW6pkCX6QM9dClVKQUBb5IGYa7MKt++BIyBb5IGfovzBYbdavmHgmdAl+kTKUuzCaluUdnKcmlwBeJSRL64dfyLEX/aCpPgS8Sk6Gae+pFrc5S1BxWHQp8kRjVez/8Wp2lJKU5LHQKfBEZUKuzlCQ0h9UDTa0gIp/Sf5ZSTZqWojoU+CIShFr8o0kbNemIiKSEAl9EJCUU+CIiKRFL4JvZQjPbbWb7zGx1keVmZv8zv/xVM5sdx3ZFRGTkIge+mTUAa4ErgGnAMjObNmi1K4Cp+a+VwN9F3e5QNGJPakXHnoQsjhr+ucA+d3/T3T8BfgksGbTOEuARz3kJ+HMzmxjDtj9DI/aSL9RQ1bEnoYsj8M8A3il4ns2/Vu46AJjZSjPrNLPOnlEMt9OIvWQLOVR17Eno4gj8YkMkBte9RrJO7kX3dnfPuHumqamp7MKEOmIv1FppvQk5VEM99kT6xRH4WWBywfNJwLujWCcWIU5gFXKttN6EHKohHnsiheII/O3AVDNrMbPxwLVAx6B1OoD/mO+tcx7wB3c/EMO2iwptAquQa6VDCfGsJPRQDe3YEykUOfDdvRdoA54FdgGPufsOM1tlZqvyqz0NvAnsA/4e+F7U7daTkGulpYR8VqJQFRkd85Cqb4NkMhnv7OysdTFi0ddXXxNDHTyYC/ve3tw/qmxW85yI1AMz63L3TLFlGmlbJfVWK63HsxIRGZpmy5SiNF2tSPIo8KUkTVcrkixq0qkDIfaWEZH6o8APXDV6y+gfikg6KPADV+k+/CF3vxSReCnwA1fp3jL1OihMRMqnwA9cpUeWqvulSGA6OqCtLfc9ZuqlUwcq0VumcCCYul+KBKKjA5YtgyNH4MEHYd06WLw4to9XDT+FBrfbQ30NCksDXUhPqeeey4U95L4/91ysH6/ATyG124dNF9JTbMECOOmk3OOTTso9j5ECP4XUbh82/UNOscWLc804N90Ue3MOqA0/lTRtQtj6/yFv26Z/yKm0eHHsQd9PgR+AWsykqWkTwqV/yFIpatKpMbXXSjH1Nruq1AcFfo2pvVZEqkWBX2O6gCoi1aI2/BpTe20y1dsdziQdVMMPgNprk0XXZSRUCnype6GNStV1GQmVAl/qWoi1aV2XkVCpDV/qWrHadK3HF+i6jIRKNXypa6HWpnVdRkKkGr7UNdWmRUZOgS91T9NEiIyMmnRERFIiUuCb2Slm9k9mtjf//S9KrPdzMztkZq9H2Z5IkoXWvVSSJ2oNfzWwyd2nApvyz4t5CFgYcVsiiRVi91JJnqiBvwR4OP/4YeDKYiu5+z8D70fclkhiabCWVEPUwD/d3Q8A5L9H7hRnZivNrNPMOnt01EtKhNq9VJJl2F46ZvY88MUii9bEXxxw93agHSCTyag1cxiapCsZ1L1UqmHYwHf3y0otM7ODZjbR3Q+Y2UTgUKylkyH1t/v23wpv8+ZcF0WpT+peKpUWNR46gOX5x8uBJyN+npRB7b4iUo6ogX8PMN/M9gLz888xsy+Z2dP9K5nZOuBF4Cwzy5rZdyJuV1C7b+jUzVJCE2mkrbsfBi4t8vq7wKKC58uibEeKU7tvuNTcJiHSIVjnNElXmNTcJiFS4ItUgJrbJESaPE2kAtTcJiFS4ItUiLpZSmjUpCMikhIKfJGAqCunVJICXyQQI50xU/8UZLQU+FITCq3PGklXTk2jLFEo8OVTqhHECq3iRtKVU/37JQoFvgyoVhArtIrr78qZzcKWLcW7cqp/v0ShwE+o0dTUqxXECq3Shhs5PZJ/CiKlKPATaLQ19WoFsUIrGk2nIaOlgVcJVKymPpIBQNUcHapBSSLVpxp+AkWpqav2KJJcquEnkOZxEZFiFPgJpSYTERlMTToiIimhwBcRSQkFvohISijwRURSQoEvIpISCnwRkZRQ4NcRTSmcHPpdSi0o8OuEphRODv0upVYU+HVCUwonh36XUisK/DqhKYWTQ79LqZVIUyuY2SnAeqAZeAv4D+7+b4PWmQw8AnwR6APa3f1/RNluGml+nOTQ71JqJWoNfzWwyd2nApvyzwfrBX7g7l8FzgNuMrNpEbebSprJMjn0u5RaiBr4S4CH848fBq4cvIK7H3D3V/KP/wjsAs6IuF0RESlT1MA/3d0PQC7YgSFbI82sGTgH+Jch1llpZp1m1tmjq1kiIrEZtg3fzJ4n1/4+2JpyNmRmfwY8DvyNu39Qaj13bwfaATKZjHopi9S5Y8eOkc1mOXr0aK2LkiiNjY1MmjSJcePGjfg9wwa+u19WapmZHTSzie5+wMwmAodKrDeOXNg/6u5PjLh0IlL3stksJ598Ms3NzZguWsTC3Tl8+DDZbJaWlpYRvy9qk04HsDz/eDnw5OAVLPcbfgDY5e73RtyeiNSZo0ePMmHCBIV9jMyMCRMmlH3WFDXw7wHmm9leYH7+OWb2JTN7Or/OhcBfA5eYWXf+a1HE7YpIHVHYx280+zRSP3x3PwxcWuT1d4FF+cdbAf22RURqTCNtRSTxGhoamDVrFtOnT2fmzJnce++99NXJJEbd3d08/fTTw684ArqJuYgk3oknnkh3dzcAhw4d4rrrruMPf/gDd911V20LNgLd3d10dnayaFH0lnDV8EUkOJWcPvq0006jvb2d++67D3fn6NGj3HDDDbS2tnLOOeewefNmAI4fP84tt9xCa2srM2bM4Gc/+xkAzc3NvPfeewB0dnYyd+5cAO68806WL1/OggULaG5u5oknnuDWW2+ltbWVhQsXcuzYMQC6urr4+te/zpw5c7j88ss5cOAAAHPnzuW2227j3HPP5Stf+Qq//vWv+eSTT7j99ttZv349s2bNYv369ZF+dgW+iASlGtNHn3nmmfT19XHo0CHWrl0LwGuvvca6detYvnw5R48epb29nd/97nf89re/5dVXX+X6668f9nP379/PU089xZNPPsm3vvUt5s2bx2uvvcaJJ57IU089xbFjx7j55pvZsGEDXV1drFixgjVr/jSkqbe3l5dffpmf/vSn3HXXXYwfP567776bpUuX0t3dzdKlSyP93GrSEZGgFJs++vTT49+O508ftm7dys033wzA2WefzZe//GX27NnD888/z6pVqxg7NheTp5xyyrCfecUVVzBu3DhaW1s5fvw4CxcuBKC1tZW33nqL3bt38/rrrzN//nwgdxYxceLEgfdfffXVAMyZM4e33nortp+1nwI/xfr6NGOjhKd/+uht2yo3ffSbb75JQ0MDp5122kDwD+buRbs+jh07duCC7+B+8CeccAIAY8aMYdy4cQPvHzNmDL29vbg706dP58UXXyy6zf73NzQ00NvbO7ofbghq0kkp3XVJQtU/fXQ2C1u2xF8Z6enpYdWqVbS1tWFmfO1rX+PRRx8FYM+ePbz99tucddZZLFiwgPvvv38geN9//30g14bf1dUFwOOPP17Wts866yx6enoGAv/YsWPs2LFjyPecfPLJ/PGPfyxrO6Uo8FNKd12SkMU9ffRHH3000C3zsssuY8GCBdxxxx0AfO973+P48eO0traydOlSHnroIU444QRuvPFGpkyZwowZM5g5cya/+MUvALjjjjv4/ve/z8UXX0xDQ0NZ5Rg/fjwbNmzgtttuY+bMmcyaNYtt27YN+Z558+axc+fOWC7aWqnTmRBkMhnv7OysdTESyT1Xs+8/ba5ETUoEYNeuXXz1q1+tdTESqdi+NbMud88UW19t+Cmluy6JpI8CP8X6T5tFJB3Uhi8ikhIKfBGRlFDgi4ikhAJfRCQlFPgiknjZbJYlS5YwdepUzjzzTNra2vj4449rXayqU+CLSKK5O1dffTVXXnkle/fuZe/evXz00UfceuutkT/7+PHjMZSwehT4IhKejg5oa8t9j+iFF16gsbGRG264AcjNU/OTn/yERx55hPvuu4+2traBdb/xjW+wZcsWAJ577jnOP/98Zs+ezTe/+U0+/PBDIDe1wt13381FF13EPffcw+zZswfev3fvXubMmRO5zJWiwBeRsHR0wLJlsHZt7nvE0N+xY8dnQvjzn/88zc3NJScoe++99/jxj3/M888/zyuvvEImk+Hee+8dWN7Y2MjWrVtZs2YNX/jCFwZurvLggw/y7W9/O1J5K0mBLyJhee45OHIk9/jIkdzzCErNejnUtDIvvfQSO3fu5MILL2TWrFk8/PDD/P73vx9YXjgv/Y033siDDz7I8ePHWb9+Pdddd12k8laSAl9EwrJgAZx0Uu7xSSflnkcwffp0Bs/J9cEHH3Dw4EEmTJjwqXvb9k937O7Mnz+f7u5uuru72blzJw888MDAep/73OcGHl9zzTU888wz/OpXv2LOnDlMmDAhUnkrKfWBX8lbqYnIKCxeDOvWwU035b4vXhzp4y699FKOHDnCI488AuQutP7gBz+gra2NlpYWuru76evr45133uHll18G4LzzzuM3v/kN+/btA+DIkSPs2bOn6Oc3NjZy+eWX893vfnfgOkGoUh34mhNeJFCLF8N990UOewAzY+PGjWzYsIGpU6cyYcIExowZw5o1a7jwwgtpaWmhtbWVW265ZeACbFNTEw899BDLli1jxowZnHfeebzxxhslt3H99ddjZiyIeDZSaamePK1at1ITkdqaPHkyHfmLv9u2bWPZsmV0dXUxZ86cgZufDHbJJZewffv2z7xe7NaDW7duZcWKFWXPj19tqQ78atxKTUTCcsEFF3zqAmxUV111Ffv37+eFF16I7TMrJdWBrznhRSSqjRs31roIIxapDd/MTjGzfzKzvfnvf1FknUYze9nM/tXMdpjZXVG2Gbe4b6UmIp8V8p316tVo9mnUi7argU3uPhXYlH8+2MfAJe4+E5gFLDSz8yJuV0TqRGNjI4cPH1box8jdOXz4MI2NjWW9L2qTzhJgbv7xw8AW4LZBBXPgw/zTcfkv/eZFUmLSpElks1l6enpqXZREaWxsZNKkSWW9J2rgn+7uBwDc/YCZFb3saWYNQBfwl8Bad/+XUh9oZiuBlQBTpkyJWDwRqbVx48bR0tJS62IIIwh8M3se+GKRRWtGuhF3Pw7MMrM/Bzaa2b9399dLrNsOtANkMhmdCYiIxGTYwHf3y0otM7ODZjYxX7ufCBwa5rP+n5ltARYCRQNfREQqI+pF2w5gef7xcuDJwSuYWVO+Zo+ZnQhcBpQesiYiIhVhUa6cm9kE4DFgCvA28E13f9/MvgT8g7svMrMZ5C7oNpD7B/OYu989ws/vAUYzQuJU4L1RvK+aVMboQi8fhF/G0MsH4ZcxtPJ92d2bii2IFPihMrNOd8/UuhxDURmjC718EH4ZQy8fhF/G0MtXKNWTp4mIpIkCX0QkJZIa+O21LsAIqIzRhV4+CL+MoZcPwi9j6OUbkMg2fBER+ayk1vBFRGQQBb6ISErUVeCPcDrmyWa22cx25adj/n7Bsv9mZm+Y2atmtrFgQFizmX1kZt35r/trWMaS7zezH5rZPjPbbWaXV6p8+fV+bmaHzOz1Qa+vL9hPb5lZd/71qu7DYcp4p5n934KyLCpYFsI+DOI4HKaMoRyHC/Pb2WdmqwteD+k4LFXGih6Ho+LudfMF/Fdgdf7xauBvi6wzEZidf3wysAeYln++ABibf/y3/e8HmoHXAylj0fcD04B/BU4AWoD9QEMlypdf9jVg9lD7BfjvwO212IdDlRG4E7ilyPpB7MNQjsNhyljz45DcYM39wJnA+Px2p4V0HA5Vxkofh6P6maqxkdgKC7uBifnHE4HdI3jPk8D8Iq9fBTxagYMkUhlLvR/4IfDDgvc8C5xfyfINtV8AA94BptZyHxbb7hB/aEHtw1COwxL7sObHIXA+8Gyp318Ix+FQZaz0cTiar7pq0mHQdMzAkHehNbNm4Byg2HTMK4BnCp63mNlvzez/mNnFNSxjqfefQe7A7pfNv1bR8g3hYuCgu+8teK0m+7CEtnyTyc8LTsVD24cQyHFYxvuruQ9Hsq1aH4fDlbGSx2HZgrunrcUwHXP+c/4MeBz4G3f/YNCyNUAv0H+7+gPAFHc/bGZzgH80s+mD31fNMhZbvchrRfvUxlW+YSwD1hU8r8k+LOHvgB+R2z8/InfKv4LA9mEox2GZqrkPR7KtWh+HQ5Ux8nEYt+AC32OYjtnMxpEL0kfd/YlBy5YD3wAu9fz5lLt/TO5WjLh7l5ntB74CdNagjKXenwUmF6w3CXi3UuUbipmNBa4G5hRss+r7sBR3P1jwWX8P/Cr/NKR9GMRxOIQQjsMhtxXIcViyjHEch3GrtyadkUzHbMADwC53v3fQsoXkbsG42N2PFLzeZLm7cmFmZwJTgTdrUcYh3t8BXGtmJ5hZS76ML1eifCNwGfCGu2f7X6j2PhxK/o+z31X86d4LQezDUI7DUb6/mvtwOzDVzFrMbDxwbf59/UI4DkuWsQrHYfmqcaEgri9gArmbpe/Nfz8l//qXgKfzjy8id3r0KtCd/1qUX7aPXNtZ/+v351+/BthB7sr5K8Bf1bCMRd+fX7aG3BX93cAVlSpf/vk6cqfHx8jVSL5TsOwhYNWgz63qPhyqjMD/Bl7L798O8hfeQtmHoRyHw5QxlONwEblebPuBNYM+I5TjsGgZK30cjuZLUyuIiKREvTXpiIjIKCnwRURSQoEvIpISCnwRkZRQ4IuIpIQCX0QkJRT4IiIp8f8BwHrCsiUfVU4AAAAASUVORK5CYII=\n", 1042 | "text/plain": [ 1043 | "
" 1044 | ] 1045 | }, 1046 | "metadata": { 1047 | "needs_background": "light" 1048 | }, 1049 | "output_type": "display_data" 1050 | } 1051 | ], 1052 | "source": [ 1053 | "plt.scatter(vh_2[0,:], vh_2[1,:], c='blue', s=5, label='Document')\n", 1054 | "plt.scatter(query_vec_2[0], query_vec_2[1], c='red', s=10, label='Query')\n", 1055 | "plt.legend(loc='lower right')\n", 1056 | "plt.show()" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "code", 1061 | "execution_count": null, 1062 | "metadata": {}, 1063 | "outputs": [], 1064 | "source": [] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "metadata": {}, 1069 | "source": [ 1070 | "### Useful debugging code" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "code", 1075 | "execution_count": 29, 1076 | "metadata": {}, 1077 | "outputs": [], 1078 | "source": [ 1079 | "similaries = []\n", 1080 | "for i in range(term_doc_mat.shape[1]):\n", 1081 | " s = np.dot(query_vec, term_doc_mat[:,i]) / (np.linalg.norm(query_vec) * np.linalg.norm(term_doc_mat[:,i]))\n", 1082 | " # print(i,s)\n", 1083 | " similaries.append(s)" 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": 30, 1089 | "metadata": {}, 1090 | "outputs": [ 1091 | { 1092 | "data": { 1093 | "text/plain": [ 1094 | "array([46, 3, 8, 27, 29, 36, 0, 28, 30, 31, 32, 33, 34, 35, 38, 39, 40,\n", 1095 | " 41, 42, 43, 44, 45, 47, 37, 26, 24, 48, 1, 2, 4, 5, 6, 7, 9,\n", 1096 | " 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 49])" 1097 | ] 1098 | }, 1099 | "execution_count": 30, 1100 | "metadata": {}, 1101 | "output_type": "execute_result" 1102 | } 1103 | ], 1104 | "source": [ 1105 | "sim = np.array(similaries)\n", 1106 | "(-sim).argsort()" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "code", 1111 | "execution_count": 73, 1112 | "metadata": {}, 1113 | "outputs": [ 1114 | { 1115 | "data": { 1116 | "text/plain": [ 1117 | "[('depress', 4),\n", 1118 | " ('interview', 4),\n", 1119 | " ('model', 3),\n", 1120 | " ('latent', 3),\n", 1121 | " ('assess', 2),\n", 1122 | " ('pattern', 2),\n", 1123 | " ('propos', 2),\n", 1124 | " ('categor', 2),\n", 1125 | " ('prompt', 2),\n", 1126 | " ('categori', 2),\n", 1127 | " ('accur', 1),\n", 1128 | " ('diagnos', 1),\n", 1129 | " ('requir', 1),\n", 1130 | " ('analysi', 1),\n", 1131 | " ('henc', 1),\n", 1132 | " ('autom', 1),\n", 1133 | " ('method', 1),\n", 1134 | " ('linguist', 1),\n", 1135 | " ('could', 1),\n", 1136 | " ('help', 1),\n", 1137 | " ('psychiatr', 1),\n", 1138 | " ('profession', 1),\n", 1139 | " ('make', 1),\n", 1140 | " ('faster', 1),\n", 1141 | " ('inform', 1),\n", 1142 | " ('decis', 1),\n", 1143 | " ('diagnosi', 1),\n", 1144 | " ('jlpc', 1),\n", 1145 | " ('analyz', 1),\n", 1146 | " ('transcript', 1),\n", 1147 | " ('identifi', 1),\n", 1148 | " ('jointli', 1),\n", 1149 | " ('allow', 1),\n", 1150 | " ('defin', 1),\n", 1151 | " ('convers', 1),\n", 1152 | " ('context', 1),\n", 1153 | " ('influenc', 1),\n", 1154 | " ('languag', 1),\n", 1155 | " ('individu', 1),\n", 1156 | " ('show', 1),\n", 1157 | " ('outperform', 1),\n", 1158 | " ('competit', 1),\n", 1159 | " ('baselin', 1),\n", 1160 | " ('provid', 1),\n", 1161 | " ('psycholinguist', 1),\n", 1162 | " ('insight', 1)]" 1163 | ] 1164 | }, 1165 | "execution_count": 73, 1166 | "metadata": {}, 1167 | "output_type": "execute_result" 1168 | } 1169 | ], 1170 | "source": [ 1171 | "paper_token_freq[15]" 1172 | ] 1173 | }, 1174 | { 1175 | "cell_type": "code", 1176 | "execution_count": null, 1177 | "metadata": {}, 1178 | "outputs": [], 1179 | "source": [] 1180 | } 1181 | ], 1182 | "metadata": { 1183 | "kernelspec": { 1184 | "display_name": "Python 3", 1185 | "language": "python", 1186 | "name": "python3" 1187 | }, 1188 | "language_info": { 1189 | "codemirror_mode": { 1190 | "name": "ipython", 1191 | "version": 3 1192 | }, 1193 | "file_extension": ".py", 1194 | "mimetype": "text/x-python", 1195 | "name": "python", 1196 | "nbconvert_exporter": "python", 1197 | "pygments_lexer": "ipython3", 1198 | "version": "3.7.4" 1199 | } 1200 | }, 1201 | "nbformat": 4, 1202 | "nbformat_minor": 4 1203 | } 1204 | -------------------------------------------------------------------------------- /cppKMeans/cppKMeans.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | 9 | // the iteration times 10 | const int T = 5; 11 | 12 | int N; 13 | int K, D; 14 | 15 | typedef struct Point { 16 | vector datas; 17 | int cluster; 18 | Point(int D) { 19 | datas = vector(D, 0); 20 | cluster = -1; 21 | } 22 | Point operator +(const Point& a) { 23 | Point res(this->datas.size()); 24 | for (int i= 0; i< this->datas.size(); i++) { 25 | res.datas[i] = this->datas[i] + a.datas[i]; 26 | } 27 | return res; 28 | } 29 | void operator +=(const Point& a) { 30 | for (int i= 0; i< this->datas.size(); i++) { 31 | this->datas[i] = this->datas[i] + a.datas[i]; 32 | } 33 | } 34 | void devide(const int& a) { 35 | for (int i= 0; i< this->datas.size(); i++) { 36 | this->datas[i] = this->datas[i] / (double)a; 37 | } 38 | } 39 | } Point; 40 | 41 | void printPoints(vector v) { 42 | for (int i= 0; i< v.size(); i++) { 43 | for (int j= 0; j< v[i].datas.size(); j++) { 44 | cout<< v[i].datas[j]<< " "; 45 | } 46 | cout<< v[i].cluster<< endl; 47 | } 48 | } 49 | 50 | double cal_distance(Point& p1, Point& p2) { 51 | // Euclidean Distance 52 | double res = 0; 53 | for (int i= 0; i< D; i++) { 54 | double temp = (double) (p1.datas[i] - p2.datas[i]); 55 | res += temp * temp; 56 | } 57 | res = sqrt(res); 58 | return res; 59 | } 60 | 61 | void assign_to_center(Point& p, vector& clusters) { 62 | double min_d = DBL_MAX; 63 | int min_index = -1; 64 | for (int i= 0; i< clusters.size(); i++) { 65 | double d = cal_distance(p, clusters[i]); 66 | if (d < min_d) { 67 | min_d = d; 68 | min_index = i; 69 | } 70 | } 71 | p.cluster = min_index; 72 | } 73 | 74 | int main() { 75 | cout<< "please enter the dimension of the data points"<< endl; 76 | cin>> D; 77 | cout<< "please enter the num of data points"<< endl; 78 | cin>> N; 79 | vector points(N, Point(D)); 80 | cout<< "please enter the data points"<< endl; 81 | for (int i= 0; i< N; i++) { 82 | for (int j= 0; j< D; j++) { 83 | cin>> points[i].datas[j]; 84 | } 85 | } 86 | // printPoints(points); 87 | cout<< "please enter the cluster num K"<< endl; 88 | cin>> K; 89 | vector clusters = vector(K, Point(D)); 90 | cout<< "please enter the initial means"<< endl; 91 | for (int i= 0; i< K; i++) { 92 | for (int j= 0; j< D; j++) { 93 | cin>> clusters[i].datas[j]; 94 | } 95 | clusters[i].cluster = i; 96 | } 97 | // printPoints(clusters); 98 | for (int t = 0; t < T; t++) { 99 | /* 100 | assign the points to according centers. 101 | */ 102 | for (int i= 0; i< N; i++) { 103 | assign_to_center(points[i], clusters); 104 | } 105 | /* 106 | recal the clusters. 107 | */ 108 | vector c_count = vector(K, 0); 109 | for (int i= 0; i< K; i++) { 110 | clusters[i].datas.assign(D, 0.0); 111 | } 112 | for (int i= 0; i< N; i++) { 113 | int c = points[i].cluster; 114 | ++c_count[c]; 115 | clusters[c] += points[i]; 116 | } 117 | for (int i= 0; i< K; i++) { 118 | clusters[i].devide(c_count[i]); 119 | } 120 | // print each step 121 | cout<< endl; 122 | cout<< "T = "<< t<< endl; 123 | cout<< "PRINT POINTS"<< endl; 124 | printPoints(points); 125 | cout<< "PRINT CLUSTERS"<< endl; 126 | printPoints(clusters); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /cppKMeans/cppKMeans_input.txt: -------------------------------------------------------------------------------- 1 | 2 2 | 8 3 | 17 12 4 | 5 12 5 | 17 14 6 | 5 16 7 | 20 15 8 | 3 9 9 | 12 3 10 | 12 32 11 | 2 12 | 12 3 13 | 12 32 14 | 15 | 2 16 | 8 17 | 17 12 18 | 5 12 19 | 17 14 20 | 5 16 21 | 20 15 22 | 3 9 23 | 12 3 24 | 12 32 25 | 2 26 | 5 12 27 | 17 12 28 | 29 | 2 30 | 8 31 | 17 12 32 | 5 12 33 | 17 14 34 | 5 16 35 | 20 15 36 | 3 9 37 | 12 3 38 | 12 32 39 | 3 40 | 12 3 41 | 12 32 42 | 5 12 43 | 44 | 2 45 | 8 46 | 17 12 47 | 5 12 48 | 17 14 49 | 5 16 50 | 20 15 51 | 3 9 52 | 12 3 53 | 12 32 54 | 4 55 | 12 3 56 | 12 32 57 | 5 12 58 | 17 12 --------------------------------------------------------------------------------