├── README.md └── fiedlerembedding.py /README.md: -------------------------------------------------------------------------------- 1 | A Python implementation of Fiedler Embeddings 2 | 3 | Fiedler Embedding was first proposed by Bruce Hendriksen as an alternative to LSA in 4 | http://www.siam.org/meetings/sdm06/workproceed/Text%20Mining/hendrickson22.pdf 5 | http://www.sandia.gov/~bahendr/papers/Fiedler-LSA.pdf 6 | 7 | Xi Wang uses Fielder Embedding in "Extracting Social Dimensions using Fiedler Embedding" 8 | http://ial.eecs.ucf.edu/pdf/Sukthankar-Xi-SocialCom2011.pdf 9 | 10 | Please Note: The code is still very experimental and yet to be implemented into a class. -------------------------------------------------------------------------------- /fiedlerembedding.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | from scipy import linalg,array,dot,mat 3 | from scipy.sparse.linalg import eigsh 4 | from math import * 5 | from pprint import pprint 6 | 7 | def createLaplacian(DoctermMatrix): 8 | # Create word-word mapping matrix 9 | # WW and DD to be eventually constructed from an external source 10 | # Created from sum of word and docs (as per example in Hendrickson paper) 11 | #WW = diag(add.reduce(DoctermMatrix),0) 12 | #DD = diag(add.reduce(DoctermMatrix, axis=1),0) 13 | 14 | # Create word-word mapping matrix 15 | # Initially constructed from the doc-word matrix 16 | WW = dot(DoctermMatrix.transpose(),DoctermMatrix) * (-1) 17 | # Create doc-doc mapping matrix 18 | DD = dot(DoctermMatrix,DoctermMatrix.transpose()) * (-1) 19 | 20 | B = DoctermMatrix * (-1) 21 | BT = B.transpose() 22 | 23 | # Create Block Matrix L 24 | # L is a (nodocs + noterms) by (nodocs + noterms) matrix 25 | # --- --- 26 | # WW BT 27 | # B DD 28 | # --- --- 29 | L = bmat('WW,BT; B,DD') 30 | 31 | # Make sure the diagonal values make the row-sums add to zero 32 | #Set the diagonals of L to 0 33 | fill_diagonal(L, 0) 34 | L_diag = L.sum(axis=1) * (-1) 35 | fill_diagonal(L, L_diag) 36 | return L 37 | 38 | 39 | def fiedlerEmbeddedSpace(L,k): 40 | # L = Laplacian 41 | # k = dimension after dimension reduction 42 | 43 | # Perform Eigen Decomposition on the Laplacian matrix L where L = V * D * (VT) where VT is Transpose of V 44 | # V and D are the eigenvectors and eigenvalues 45 | 46 | # Need the k+1 eigenvalues (non zero) and eigenvectors 47 | # ie the smallest eigenvalue is not included 48 | # Eigenvalues must be in increasing order 49 | 50 | #evals, evecs = eigsh(L, (k+1), which='SM', maxiter=5000) 51 | # Note if you have scipy 0.11 consider using shift invert mode 52 | # evals_small, evecs_small = eigsh(X, 3, sigma=0, which='LM') 53 | 54 | eval_k, evecs_k = eigsh(L, k, which='SM', return_eigenvectors=True) 55 | #fieldler_vector = sigma[1], eigenvects[:, 1] 56 | 57 | # Make S the k-dimensional embedded space S = (Dk^0.5) * VkT 58 | # where Dk and Vk are the k eigenvalues and corresponding 59 | eval_k = diag(eval_k,0)**0.5 60 | S = dot(eval_k,evecs_k.T) 61 | return S 62 | 63 | def query(S,q): 64 | '''Takes S the k-dimensional embedded space and the query vector as a parameter''' 65 | q_norm = q/linalg.norm(q) # normalize query vector 66 | qpos = dot(S,q_norm) 67 | return qpos 68 | 69 | def knnMatches(S,qpos,K): 70 | """ find the K nearest neighbours of in embedded space S """ 71 | qpos = qpos.T 72 | diff = (S.T - qpos)**2 73 | diff_sum = array(add.reduce(diff, axis=0)) 74 | diff_sum = diff_sum**0.5 75 | idx = argsort(diff_sum) 76 | return idx[:K] 77 | 78 | ''' 79 | # Example document-term matrix 80 | # Sentences from http://web.eecs.utk.edu/~berry/order/node4.html#SECTION00022000000000000000 81 | # nine docs c1- c5 related to human-computer interaction and m1- m4 related to graph theory. 82 | # Docs: 83 | # C1 = Human machine interface for Lab ABC computer applications 84 | # C2 = A survey of user opinion of computer system response time 85 | # C3 = The EPS user interface management system 86 | # C4 = System and human system engineering testing of EPS 87 | # C5 = Relation of user-perceived response time to error measurement 88 | # M1 = The generation of random, binary, unordered trees 89 | # M2 = The intersection graph of paths in trees 90 | # M3 = Graph minors IV: Widths of trees and quasi-ordering 91 | # M4 = Graph minors: A survey 92 | ''' 93 | termanddocvect = ["computer", "EPS", "human", "interface", "response", "system", "time", "user", "minors", "survey","trees", "graph", "C1", "C2", "C3", "C4", "C5", "M1", "M2", "M3", "M4"] 94 | 95 | # Vector dimensions: computer, EPS, human, interface, response, system, time, user, minors, survey,trees, graph 96 | docterm=array([[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0], 97 | [1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0], 98 | [0.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 99 | [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0], 100 | [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 101 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0], 102 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0], 103 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0], 104 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0]]) 105 | 106 | # Create Laplacian block Matrix 107 | L = createLaplacian(docterm) 108 | 109 | k = 2 110 | S = fiedlerEmbeddedSpace(L,k) 111 | 112 | print S 113 | 114 | # query for human 115 | q = array([0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) 116 | qpos = query(S,q) 117 | print qpos 118 | 119 | matches = knnMatches(S,qpos,3) 120 | print matches 121 | 122 | for i in matches: 123 | print termanddocvect[i] 124 | 125 | --------------------------------------------------------------------------------