├── sample.csv ├── readme.md ├── nmf.py ├── nmftoxml.py ├── englishstop.txt ├── SurveyQuestionThemes.py └── porter.py /sample.csv: -------------------------------------------------------------------------------- 1 | "id","text" 2 | "1","Human machine interface for lab abc computer applications" 3 | "2","A survey of user opinion of computer system response time" 4 | "3","The EPS user interface management system" 5 | "4","System and human system engineering testing of EPS" 6 | "5","Relation of user perceived response time to error measurement" 7 | "6","The generation of random binary unordered trees" 8 | "7","The intersection graph of paths in trees" 9 | "8","Graph minors IV Widths of trees and well quasi ordering" 10 | "9","Graph minors A survey" -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | NMFtoXML 2 | 3 | NMFtoXML is a set of python scripts that imports text from a csv file, runs Non-negative Matrix Factorization and then outputs the resulting clusters to XML. 4 | 5 | Dependencies: 6 | Numpy is required. 7 | The projected gradient NMF algorithm is used. The algorithm was ported to Python by Anthony Di Franco. 8 | The PorterStemmer algorithm by Vivake Gupta. 9 | 10 | The script can be run from the command line. 11 | 12 | The format is: python nmftoxml.py filename directory numberofthemes showdocs outputfilename 13 | 14 | eg: 15 | python nmftoxml.py sample.csv C:\nmftoxml 2 1 sample.xml 16 | 17 | -------------------------------------------------------------------------------- /nmf.py: -------------------------------------------------------------------------------- 1 | # NMF by alternative non-negative least squares using projected gradients 2 | # Author: Chih-Jen Lin, National Taiwan University 3 | # Python/numpy translation: Anthony Di Franco 4 | 5 | from numpy import * 6 | from numpy.linalg import norm 7 | from time import time 8 | from sys import stdout 9 | 10 | def nmf(V,Winit,Hinit,tol,timelimit,maxiter): 11 | """ 12 | (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter) 13 | W,H: output solution 14 | Winit,Hinit: initial solution 15 | tol: tolerance for a relative stopping condition 16 | timelimit, maxiter: limit of time and iterations 17 | """ 18 | 19 | W = Winit; H = Hinit; initt = time(); 20 | 21 | gradW = dot(W, dot(H, H.T)) - dot(V, H.T) 22 | gradH = dot(dot(W.T, W), H) - dot(W.T, V) 23 | initgrad = norm(r_[gradW, gradH.T]) 24 | #print 'Init gradient norm %f' % initgrad 25 | tolW = max(0.001,tol)*initgrad 26 | tolH = tolW 27 | 28 | for iter in xrange(1,maxiter): 29 | # stopping condition 30 | projnorm = norm(r_[gradW[logical_or(gradW<0, W>0)], 31 | gradH[logical_or(gradH<0, H>0)]]) 32 | if projnorm < tol*initgrad or time() - initt > timelimit: break 33 | 34 | (W, gradW, iterW) = nlssubprob(V.T,H.T,W.T,tolW,1000) 35 | W = W.T 36 | gradW = gradW.T 37 | 38 | if iterW==1: tolW = 0.1 * tolW 39 | 40 | (H,gradH,iterH) = nlssubprob(V,W,H,tolH,1000) 41 | if iterH==1: tolH = 0.1 * tolH 42 | 43 | if iter % 10 == 0: stdout.write('.') 44 | 45 | return (W,H) 46 | 47 | def nlssubprob(V,W,Hinit,tol,maxiter): 48 | """ 49 | H, grad: output solution and gradient 50 | iter: #iterations used 51 | V, W: constant matrices 52 | Hinit: initial solution 53 | tol: stopping tolerance 54 | maxiter: limit of iterations 55 | """ 56 | 57 | H = Hinit 58 | WtV = dot(W.T, V) 59 | WtW = dot(W.T, W) 60 | 61 | alpha = 1; beta = 0.1; 62 | for iter in xrange(1, maxiter): 63 | grad = dot(WtW, H) - WtV 64 | projgrad = norm(grad[logical_or(grad < 0, H >0)]) 65 | if projgrad < tol: break 66 | 67 | # search step size 68 | for inner_iter in xrange(1,20): 69 | Hn = H - alpha*grad 70 | Hn = where(Hn > 0, Hn, 0) 71 | d = Hn-H 72 | gradd = sum(grad * d) 73 | dQd = sum(dot(WtW,d) * d) 74 | suff_decr = 0.99*gradd + 0.5*dQd < 0; 75 | if inner_iter == 1: 76 | decr_alpha = not suff_decr; Hp = H; 77 | if decr_alpha: 78 | if suff_decr: 79 | H = Hn; break; 80 | else: 81 | alpha = alpha * beta; 82 | else: 83 | if not suff_decr or (Hp == Hn).all(): 84 | H = Hp; break; 85 | else: 86 | alpha = alpha/beta; Hp = Hn; 87 | 88 | return (H, grad, iter) -------------------------------------------------------------------------------- /nmftoxml.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Programmed by Aneesha Bakharia 3 | ''' 4 | 5 | import re 6 | from numpy import * 7 | import csv 8 | import os 9 | import sys 10 | import platform 11 | 12 | import nmf as nmf # import non negative matrix factorization algorithm 13 | import porter as porter # import porter.py - porter stemmer algorithm 14 | import SurveyQuestionThemes as surveythemer 15 | 16 | def remove_html_tags(data): 17 | p = re.compile(r'<.*?>') 18 | return p.sub('', data) 19 | 20 | def findthemes(nothemes,wordlist,questionresponses,inc_articles,outputfile): 21 | #print questionresponses 22 | synonym_wordlists = [] 23 | synonym_wordlist = wordlist 24 | synonym_wordlists = synonym_wordlist.splitlines() 25 | exclude_wordlist = [] 26 | 27 | stop_path = "englishstop.txt" 28 | stop_words = surveythemer.loadStopWords(stop_path) 29 | 30 | surveyQuestionResponse = [] 31 | surveyQuestionResponseNID = [] 32 | 33 | for response in questionresponses: 34 | newresp = remove_html_tags(response["text"]) 35 | surveyQuestionResponse.append(newresp) 36 | surveyQuestionResponseNID.append(response["id"]) 37 | 38 | listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = surveythemer.getItemWords(surveyQuestionResponse,stop_words) 39 | wordMatrix, listOfWordsIncluded, wordCount, fc, ic = surveythemer.createWordMatrix(listOfAllWords,listOfSurveyQuestionWords) 40 | pc = nothemes 41 | #size of input matrix 42 | ic=shape(wordMatrix)[0] 43 | fc=shape(wordMatrix)[1] 44 | # Random initialization 45 | w=array([[random.random() for j in range(pc)] for i in range(ic)]) 46 | h=array([[random.random() for i in range(fc)] for i in range(pc)]) 47 | nmfresult = "" 48 | themes = "" 49 | weights,themes = nmf.nmf(wordMatrix,w,h,0.001, 10, 500) 50 | themexml = surveythemer.display_themes(weights,themes,listOfWordsIncluded,surveyQuestionResponse, stemdictionary, wordCount, inc_articles, surveyQuestionResponseNID) 51 | f = open(outputfile, 'w') 52 | f.write(themexml) 53 | f.close() 54 | return 55 | 56 | fileName = sys.argv[1] 57 | directoryName = sys.argv[2] 58 | filepath = os.path.abspath(directoryName + '/' + fileName) 59 | nothemes = int(sys.argv[3]) 60 | showdocs = int(sys.argv[4]) 61 | outputfile = sys.argv[5] 62 | 63 | outputfiledir = os.path.abspath(directoryName + '/data/' + outputfile) 64 | 65 | data = csv.reader(open(filepath)) 66 | 67 | questionresponses = [] 68 | # Read the column names from the first line of the file 69 | fields = data.next() 70 | items = "" 71 | count = 1 72 | for row in data: 73 | #print count 74 | count = count + 1 75 | # Zip together the field names and values 76 | items = zip(fields, row) 77 | item = {} 78 | # Add the value to our dictionary 79 | for (name, value) in items: 80 | item[name] = value.strip() 81 | questionresponses.append(item) 82 | 83 | findthemes(nothemes,"",questionresponses,showdocs,outputfiledir) -------------------------------------------------------------------------------- /englishstop.txt: -------------------------------------------------------------------------------- 1 | a 2 | a's 3 | able 4 | about 5 | above 6 | according 7 | accordingly 8 | across 9 | actually 10 | after 11 | afterwards 12 | again 13 | against 14 | ain't 15 | all 16 | allow 17 | allows 18 | almost 19 | alone 20 | along 21 | already 22 | also 23 | although 24 | always 25 | am 26 | among 27 | amongst 28 | an 29 | and 30 | another 31 | any 32 | anybody 33 | anyhow 34 | anyone 35 | anything 36 | anyway 37 | anyways 38 | anywhere 39 | apart 40 | appear 41 | appreciate 42 | appropriate 43 | are 44 | aren't 45 | around 46 | as 47 | aside 48 | ask 49 | asking 50 | associated 51 | at 52 | available 53 | away 54 | awfully 55 | b 56 | be 57 | became 58 | because 59 | become 60 | becomes 61 | becoming 62 | been 63 | before 64 | beforehand 65 | behind 66 | being 67 | believe 68 | below 69 | beside 70 | besides 71 | best 72 | better 73 | between 74 | beyond 75 | both 76 | brief 77 | but 78 | by 79 | c 80 | c'mon 81 | c's 82 | came 83 | can 84 | can't 85 | cannot 86 | cant 87 | cause 88 | causes 89 | certain 90 | certainly 91 | changes 92 | clearly 93 | co 94 | com 95 | come 96 | comes 97 | concerning 98 | consequently 99 | consider 100 | considering 101 | contain 102 | containing 103 | contains 104 | corresponding 105 | could 106 | couldn't 107 | course 108 | currently 109 | d 110 | definitely 111 | described 112 | despite 113 | did 114 | didn't 115 | different 116 | do 117 | does 118 | doesn't 119 | doing 120 | don't 121 | done 122 | down 123 | downwards 124 | during 125 | e 126 | each 127 | edu 128 | eg 129 | eight 130 | either 131 | else 132 | elsewhere 133 | enough 134 | entirely 135 | especially 136 | et 137 | etc 138 | even 139 | ever 140 | every 141 | everybody 142 | everyone 143 | everything 144 | everywhere 145 | ex 146 | exactly 147 | example 148 | except 149 | f 150 | far 151 | few 152 | fifth 153 | first 154 | five 155 | followed 156 | following 157 | follows 158 | for 159 | former 160 | formerly 161 | forth 162 | four 163 | from 164 | further 165 | furthermore 166 | g 167 | get 168 | gets 169 | getting 170 | given 171 | gives 172 | go 173 | goes 174 | going 175 | gone 176 | got 177 | gotten 178 | greetings 179 | h 180 | had 181 | hadn't 182 | happens 183 | hardly 184 | has 185 | hasn't 186 | have 187 | haven't 188 | having 189 | he 190 | he's 191 | hello 192 | help 193 | hence 194 | her 195 | here 196 | here's 197 | hereafter 198 | hereby 199 | herein 200 | hereupon 201 | hers 202 | herself 203 | hi 204 | him 205 | himself 206 | his 207 | hither 208 | hopefully 209 | how 210 | howbeit 211 | however 212 | i 213 | i'd 214 | i'll 215 | i'm 216 | i've 217 | ie 218 | if 219 | ignored 220 | immediate 221 | in 222 | inasmuch 223 | inc 224 | indeed 225 | indicate 226 | indicated 227 | indicates 228 | inner 229 | insofar 230 | instead 231 | into 232 | inward 233 | is 234 | isn't 235 | it 236 | it'd 237 | it'll 238 | it's 239 | its 240 | itself 241 | j 242 | just 243 | k 244 | keep 245 | keeps 246 | kept 247 | know 248 | knows 249 | known 250 | l 251 | last 252 | lately 253 | later 254 | latter 255 | latterly 256 | least 257 | less 258 | lest 259 | let 260 | let's 261 | like 262 | liked 263 | likely 264 | little 265 | look 266 | looking 267 | looks 268 | ltd 269 | m 270 | mainly 271 | many 272 | may 273 | maybe 274 | me 275 | mean 276 | meanwhile 277 | merely 278 | might 279 | more 280 | moreover 281 | most 282 | mostly 283 | much 284 | must 285 | my 286 | myself 287 | n 288 | name 289 | namely 290 | nd 291 | near 292 | nearly 293 | necessary 294 | need 295 | needs 296 | neither 297 | never 298 | nevertheless 299 | new 300 | next 301 | nine 302 | no 303 | nobody 304 | non 305 | none 306 | noone 307 | nor 308 | normally 309 | not 310 | nothing 311 | novel 312 | now 313 | nowhere 314 | o 315 | obviously 316 | of 317 | off 318 | often 319 | oh 320 | ok 321 | okay 322 | old 323 | on 324 | once 325 | one 326 | ones 327 | only 328 | onto 329 | or 330 | other 331 | others 332 | otherwise 333 | ought 334 | our 335 | ours 336 | ourselves 337 | out 338 | outside 339 | over 340 | overall 341 | own 342 | p 343 | particular 344 | particularly 345 | per 346 | perhaps 347 | placed 348 | please 349 | plus 350 | possible 351 | presumably 352 | probably 353 | provides 354 | q 355 | que 356 | quite 357 | qv 358 | r 359 | rather 360 | rd 361 | re 362 | really 363 | reasonably 364 | regarding 365 | regardless 366 | regards 367 | relatively 368 | respectively 369 | right 370 | s 371 | said 372 | same 373 | saw 374 | say 375 | saying 376 | says 377 | second 378 | secondly 379 | see 380 | seeing 381 | seem 382 | seemed 383 | seeming 384 | seems 385 | seen 386 | self 387 | selves 388 | sensible 389 | sent 390 | serious 391 | seriously 392 | seven 393 | several 394 | shall 395 | she 396 | should 397 | shouldn't 398 | since 399 | six 400 | so 401 | some 402 | somebody 403 | somehow 404 | someone 405 | something 406 | sometime 407 | sometimes 408 | somewhat 409 | somewhere 410 | soon 411 | sorry 412 | specified 413 | specify 414 | specifying 415 | still 416 | sub 417 | such 418 | sup 419 | sure 420 | t 421 | t's 422 | take 423 | taken 424 | tell 425 | tends 426 | th 427 | than 428 | thank 429 | thanks 430 | thanx 431 | that 432 | that's 433 | thats 434 | the 435 | their 436 | theirs 437 | them 438 | themselves 439 | then 440 | thence 441 | there 442 | there's 443 | thereafter 444 | thereby 445 | therefore 446 | therein 447 | theres 448 | thereupon 449 | these 450 | they 451 | they'd 452 | they'll 453 | they're 454 | they've 455 | think 456 | third 457 | this 458 | thorough 459 | thoroughly 460 | those 461 | though 462 | three 463 | through 464 | throughout 465 | thru 466 | thus 467 | to 468 | together 469 | too 470 | took 471 | toward 472 | towards 473 | tried 474 | tries 475 | truly 476 | try 477 | trying 478 | twice 479 | two 480 | u 481 | un 482 | under 483 | unfortunately 484 | unless 485 | unlikely 486 | until 487 | unto 488 | up 489 | upon 490 | us 491 | use 492 | used 493 | useful 494 | uses 495 | using 496 | usually 497 | uucp 498 | v 499 | value 500 | various 501 | very 502 | via 503 | viz 504 | vs 505 | w 506 | want 507 | wants 508 | was 509 | wasn't 510 | way 511 | we 512 | we'd 513 | we'll 514 | we're 515 | we've 516 | welcome 517 | well 518 | went 519 | were 520 | weren't 521 | what 522 | what's 523 | whatever 524 | when 525 | whence 526 | whenever 527 | where 528 | where's 529 | whereafter 530 | whereas 531 | whereby 532 | wherein 533 | whereupon 534 | wherever 535 | whether 536 | which 537 | while 538 | whither 539 | who 540 | who's 541 | whoever 542 | whole 543 | whom 544 | whose 545 | why 546 | will 547 | willing 548 | wish 549 | with 550 | within 551 | without 552 | won't 553 | wonder 554 | would 555 | would 556 | wouldn't 557 | x 558 | y 559 | yes 560 | yet 561 | you 562 | you'd 563 | you'll 564 | you're 565 | you've 566 | your 567 | yours 568 | yourself 569 | yourselves 570 | z 571 | zero -------------------------------------------------------------------------------- /SurveyQuestionThemes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | # Programmed by Aneesha Bakharia 3 | ''' 4 | 5 | import re # import regular expression module 6 | from numpy import * # import numpy for matrix & array features 7 | from numpy.linalg import svd 8 | from numpy.linalg import norm 9 | #import nnmf # import non negative matrix factorization algorithm 10 | import porter # import porter.py - porter stemmer algorithm 11 | from xml.dom.minidom import Document 12 | ## 13 | # A collection of functions for using Non Negative Matrix Factorization to find themes in open ended survey questions 14 | 15 | ## 16 | # Utility function to return a list of all words that are have a length greater than a specified number of characters. 17 | # @param text The text that must be split in to words. 18 | # @param minWordReturnSize The minimum no of characters a word must have to be included. 19 | def separatewords(text,minWordReturnSize): 20 | #splitter=re.compile('\\W*') 21 | splitter=re.compile('[^a-zA-Z0-9_\\+\\-]') 22 | return [singleWord.lower() for singleWord in splitter.split(text) if len(singleWord)>minWordReturnSize] 23 | 24 | ## 25 | # Utility function to sort a dictionary by Value in decending order 26 | # Not the most efficient implementation - may need to refactor if speed becomes an issue 27 | # @param dictionary The dictionary data structure. 28 | # @return list A list of keys sorted by their value. 29 | def sortDictionaryByValues(dictionary): 30 | """ Returns the keys of dictionary d sorted by their values """ 31 | items=dictionary.items() 32 | backitems=[ [v[1],v[0]] for v in items] 33 | backitems.sort() 34 | backitems.reverse() 35 | return [ backitems[i][1] for i in range(0,len(backitems))] 36 | 37 | ## 38 | # Utility function to load stop words from a file and return as a list of words 39 | # @param stopWordFile Path and file name of a file containing stop words. 40 | # @return list A list of stop words. 41 | def loadStopWords(stopWordFile): 42 | stopWords = [] 43 | for line in open(stopWordFile): 44 | for word in line.split( ): #in case more than one per line 45 | stopWords.append(word) 46 | return stopWords 47 | 48 | ## 49 | # Utility function to remove stop words that are present in a list of words 50 | # @param wordlist Unfiltered list of words 51 | # @param stopwordlist List of stops words 52 | # @return list Filtered list of words 53 | def removeStopWords(wordlist,stopwordlist): 54 | return [word for word in wordlist if word not in stopwordlist] 55 | 56 | ## 57 | # This function returns a list of all words that are have a length greater than a specified number of characters. 58 | # @param text The text that must be split in to words. 59 | # @param minWordReturnSize The minimum no of characters a word must have to be included. 60 | def getItemWords(list_of_words,stop_words): 61 | stemmer=porter.PorterStemmer() 62 | allwords={} 63 | itemwords=[] 64 | itemtitles=[] 65 | ec=0 66 | stemlist = {} 67 | # Loop over every item in list_of_words 68 | for item in list_of_words: 69 | words=separatewords(item,1) 70 | words = removeStopWords(words,stop_words) 71 | itemwords.append({}) 72 | itemtitles.append("Response " + str(ec+1)) 73 | # Increase the counts for this word in allwords and in articlewords 74 | for word in words: 75 | unstemmedword = word 76 | word=stemmer.stem(word,0,len(word)-1) 77 | if word in stemlist: 78 | temp = stemlist[word] 79 | try: 80 | temp.index(unstemmedword) 81 | except ValueError: 82 | temp.append(unstemmedword) 83 | stemlist[word] = temp 84 | else: 85 | temp = [] 86 | temp.append(unstemmedword) 87 | stemlist[word] = temp 88 | allwords.setdefault(word,0) 89 | allwords[word]+=1 90 | itemwords[ec].setdefault(word,0) 91 | itemwords[ec][word]+=1 92 | ec+=1 93 | return allwords,itemwords,itemtitles,stemlist 94 | 95 | 96 | ## 97 | # Returns the document (row) and words (columns) matrix and the list of words as a vector 98 | def createWordMatrix(all_words,words_inItems): 99 | #print all_words 100 | #print words_inItems 101 | wordvector=[] 102 | # Only take words that are common but not too common 103 | for w,c in all_words.items(): 104 | wordvector.append(w) 105 | #if c base element 142 | themes_xml = doc.createElement("themes") 143 | doc.appendChild(themes_xml) 144 | 145 | for row in range(norows): 146 | theme_html = "" 147 | # Create the main element 148 | theme_xml = doc.createElement("theme") 149 | theme_xml.setAttribute("id", str(count)) 150 | theme_xml.setAttribute("title", "Theme " + str(count)) 151 | themes_xml.appendChild(theme_xml) 152 | 153 | ThemeDictionary["T" + str(count)] = "Theme " + str(count) 154 | 155 | # Create a element 156 | words_xml = doc.createElement("words") 157 | theme_xml.appendChild(words_xml) 158 | 159 | for col in range(nocols): 160 | d1[list_Of_Included_Words[col]] = themes_matrix[row,col] 161 | themes_list_in_order = sortDictionaryByValues(d1) 162 | for it in themes_list_in_order[0:6]: 163 | word_index = list_Of_Included_Words.index(it) 164 | TermDictionary[it] = stemdictionary[it] 165 | TermInTheme["T" + str(count) + "_" + it] = str("%.2f" % d1[it]) 166 | 167 | word_xml = doc.createElement("word") 168 | word_xml.setAttribute("weight", str(d1[it])) 169 | wordtext_xml = doc.createTextNode(it) 170 | word_xml.appendChild(wordtext_xml) 171 | words_xml.appendChild(word_xml) 172 | 173 | theme_html = theme_html + "

" 174 | if (inc_articles==1): 175 | # Print articles/items/survey responses that map to Theme/Feature 176 | articlesInTheme = {} 177 | articleweights=[] 178 | for article in range(noarticles): 179 | if (weights_matrix[article,row] > 0.08): 180 | articlesInTheme[article] = weights_matrix[article,row] 181 | articleweights.append(weights_matrix[article,row]) 182 | DocDictionary["D" + str(article)] = "Doc" + str(article) 183 | DocInTheme["T" + str(count) + "_D" + str(article)] = str("%.2f" % weights_matrix[article,row]) 184 | else: 185 | # Capture all non attached documents 186 | DocDictionary["D" + str(article)] = "Doc" + str(article) 187 | max_weight_val = max(articleweights) 188 | min_weight_val = min(articleweights) 189 | 190 | articles_In_Theme_Order = sortDictionaryByValues(articlesInTheme) 191 | 192 | # Create a element 193 | reponses_xml = doc.createElement("responses") 194 | theme_xml.appendChild(reponses_xml) 195 | 196 | for article_no in articles_In_Theme_Order: 197 | 198 | response_xml = doc.createElement("response") 199 | response_xml.setAttribute("id", str(surveyQuestionResponseNID[article_no])) 200 | response_xml.setAttribute("weight", str(("%.2f" % weights_matrix[article_no,row]))) 201 | 202 | responsetext_xml = doc.createTextNode(item_Titles[article_no]) 203 | response_xml.appendChild(responsetext_xml) 204 | 205 | reponses_xml.appendChild(response_xml) 206 | 207 | # Create Rendered element 208 | rendered_xml = doc.createElement("rendered") 209 | theme_xml.appendChild(rendered_xml) 210 | 211 | count = count + 1 212 | return doc.toprettyxml(indent=" ") 213 | 214 | 215 | # Strange way to determine if NaN in Python? 216 | def isNaN(x): 217 | return (x == x) == False 218 | -------------------------------------------------------------------------------- /porter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Porter Stemming Algorithm 4 | This is the Porter stemming algorithm, ported to Python from the 5 | version coded up in ANSI C by the author. It may be be regarded 6 | as canonical, in that it follows the algorithm presented in 7 | 8 | Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 9 | no. 3, pp 130-137, 10 | 11 | only differing from it at the points maked --DEPARTURE-- below. 12 | 13 | See also http://www.tartarus.org/~martin/PorterStemmer 14 | 15 | The algorithm as described in the paper could be exactly replicated 16 | by adjusting the points of DEPARTURE, but this is barely necessary, 17 | because (a) the points of DEPARTURE are definitely improvements, and 18 | (b) no encoding of the Porter stemmer I have seen is anything like 19 | as exact as this version, even with the points of DEPARTURE! 20 | 21 | Vivake Gupta (v@nano.com) 22 | 23 | Release 1: January 2001 24 | 25 | Further adjustments by Santiago Bruno (bananabruno@gmail.com) 26 | to allow word input not restricted to one word per line, leading 27 | to: 28 | 29 | release 2: July 2008 30 | """ 31 | 32 | import sys 33 | 34 | class PorterStemmer: 35 | 36 | def __init__(self): 37 | """The main part of the stemming algorithm starts here. 38 | b is a buffer holding a word to be stemmed. The letters are in b[k0], 39 | b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is 40 | readjusted downwards as the stemming progresses. Zero termination is 41 | not in fact used in the algorithm. 42 | 43 | Note that only lower case sequences are stemmed. Forcing to lower case 44 | should be done before stem(...) is called. 45 | """ 46 | 47 | self.b = "" # buffer for word to be stemmed 48 | self.k = 0 49 | self.k0 = 0 50 | self.j = 0 # j is a general offset into the string 51 | 52 | def cons(self, i): 53 | """cons(i) is TRUE <=> b[i] is a consonant.""" 54 | if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': 55 | return 0 56 | if self.b[i] == 'y': 57 | if i == self.k0: 58 | return 1 59 | else: 60 | return (not self.cons(i - 1)) 61 | return 1 62 | 63 | def m(self): 64 | """m() measures the number of consonant sequences between k0 and j. 65 | if c is a consonant sequence and v a vowel sequence, and <..> 66 | indicates arbitrary presence, 67 | 68 | gives 0 69 | vc gives 1 70 | vcvc gives 2 71 | vcvcvc gives 3 72 | .... 73 | """ 74 | n = 0 75 | i = self.k0 76 | while 1: 77 | if i > self.j: 78 | return n 79 | if not self.cons(i): 80 | break 81 | i = i + 1 82 | i = i + 1 83 | while 1: 84 | while 1: 85 | if i > self.j: 86 | return n 87 | if self.cons(i): 88 | break 89 | i = i + 1 90 | i = i + 1 91 | n = n + 1 92 | while 1: 93 | if i > self.j: 94 | return n 95 | if not self.cons(i): 96 | break 97 | i = i + 1 98 | i = i + 1 99 | 100 | def vowelinstem(self): 101 | """vowelinstem() is TRUE <=> k0,...j contains a vowel""" 102 | for i in range(self.k0, self.j + 1): 103 | if not self.cons(i): 104 | return 1 105 | return 0 106 | 107 | def doublec(self, j): 108 | """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" 109 | if j < (self.k0 + 1): 110 | return 0 111 | if (self.b[j] != self.b[j-1]): 112 | return 0 113 | return self.cons(j) 114 | 115 | def cvc(self, i): 116 | """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant 117 | and also if the second c is not w,x or y. this is used when trying to 118 | restore an e at the end of a short e.g. 119 | 120 | cav(e), lov(e), hop(e), crim(e), but 121 | snow, box, tray. 122 | """ 123 | if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): 124 | return 0 125 | ch = self.b[i] 126 | if ch == 'w' or ch == 'x' or ch == 'y': 127 | return 0 128 | return 1 129 | 130 | def ends(self, s): 131 | """ends(s) is TRUE <=> k0,...k ends with the string s.""" 132 | length = len(s) 133 | if s[length - 1] != self.b[self.k]: # tiny speed-up 134 | return 0 135 | if length > (self.k - self.k0 + 1): 136 | return 0 137 | if self.b[self.k-length+1:self.k+1] != s: 138 | return 0 139 | self.j = self.k - length 140 | return 1 141 | 142 | def setto(self, s): 143 | """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" 144 | length = len(s) 145 | self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] 146 | self.k = self.j + length 147 | 148 | def r(self, s): 149 | """r(s) is used further down.""" 150 | if self.m() > 0: 151 | self.setto(s) 152 | 153 | def step1ab(self): 154 | """step1ab() gets rid of plurals and -ed or -ing. e.g. 155 | 156 | caresses -> caress 157 | ponies -> poni 158 | ties -> ti 159 | caress -> caress 160 | cats -> cat 161 | 162 | feed -> feed 163 | agreed -> agree 164 | disabled -> disable 165 | 166 | matting -> mat 167 | mating -> mate 168 | meeting -> meet 169 | milling -> mill 170 | messing -> mess 171 | 172 | meetings -> meet 173 | """ 174 | if self.b[self.k] == 's': 175 | if self.ends("sses"): 176 | self.k = self.k - 2 177 | elif self.ends("ies"): 178 | self.setto("i") 179 | elif self.b[self.k - 1] != 's': 180 | self.k = self.k - 1 181 | if self.ends("eed"): 182 | if self.m() > 0: 183 | self.k = self.k - 1 184 | elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): 185 | self.k = self.j 186 | if self.ends("at"): self.setto("ate") 187 | elif self.ends("bl"): self.setto("ble") 188 | elif self.ends("iz"): self.setto("ize") 189 | elif self.doublec(self.k): 190 | self.k = self.k - 1 191 | ch = self.b[self.k] 192 | if ch == 'l' or ch == 's' or ch == 'z': 193 | self.k = self.k + 1 194 | elif (self.m() == 1 and self.cvc(self.k)): 195 | self.setto("e") 196 | 197 | def step1c(self): 198 | """step1c() turns terminal y to i when there is another vowel in the stem.""" 199 | if (self.ends("y") and self.vowelinstem()): 200 | self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] 201 | 202 | def step2(self): 203 | """step2() maps double suffices to single ones. 204 | so -ization ( = -ize plus -ation) maps to -ize etc. note that the 205 | string before the suffix must give m() > 0. 206 | """ 207 | if self.b[self.k - 1] == 'a': 208 | if self.ends("ational"): self.r("ate") 209 | elif self.ends("tional"): self.r("tion") 210 | elif self.b[self.k - 1] == 'c': 211 | if self.ends("enci"): self.r("ence") 212 | elif self.ends("anci"): self.r("ance") 213 | elif self.b[self.k - 1] == 'e': 214 | if self.ends("izer"): self.r("ize") 215 | elif self.b[self.k - 1] == 'l': 216 | if self.ends("bli"): self.r("ble") # --DEPARTURE-- 217 | # To match the published algorithm, replace this phrase with 218 | # if self.ends("abli"): self.r("able") 219 | elif self.ends("alli"): self.r("al") 220 | elif self.ends("entli"): self.r("ent") 221 | elif self.ends("eli"): self.r("e") 222 | elif self.ends("ousli"): self.r("ous") 223 | elif self.b[self.k - 1] == 'o': 224 | if self.ends("ization"): self.r("ize") 225 | elif self.ends("ation"): self.r("ate") 226 | elif self.ends("ator"): self.r("ate") 227 | elif self.b[self.k - 1] == 's': 228 | if self.ends("alism"): self.r("al") 229 | elif self.ends("iveness"): self.r("ive") 230 | elif self.ends("fulness"): self.r("ful") 231 | elif self.ends("ousness"): self.r("ous") 232 | elif self.b[self.k - 1] == 't': 233 | if self.ends("aliti"): self.r("al") 234 | elif self.ends("iviti"): self.r("ive") 235 | elif self.ends("biliti"): self.r("ble") 236 | elif self.b[self.k - 1] == 'g': # --DEPARTURE-- 237 | if self.ends("logi"): self.r("log") 238 | # To match the published algorithm, delete this phrase 239 | 240 | def step3(self): 241 | """step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" 242 | if self.b[self.k] == 'e': 243 | if self.ends("icate"): self.r("ic") 244 | elif self.ends("ative"): self.r("") 245 | elif self.ends("alize"): self.r("al") 246 | elif self.b[self.k] == 'i': 247 | if self.ends("iciti"): self.r("ic") 248 | elif self.b[self.k] == 'l': 249 | if self.ends("ical"): self.r("ic") 250 | elif self.ends("ful"): self.r("") 251 | elif self.b[self.k] == 's': 252 | if self.ends("ness"): self.r("") 253 | 254 | def step4(self): 255 | """step4() takes off -ant, -ence etc., in context vcvc.""" 256 | if self.b[self.k - 1] == 'a': 257 | if self.ends("al"): pass 258 | else: return 259 | elif self.b[self.k - 1] == 'c': 260 | if self.ends("ance"): pass 261 | elif self.ends("ence"): pass 262 | else: return 263 | elif self.b[self.k - 1] == 'e': 264 | if self.ends("er"): pass 265 | else: return 266 | elif self.b[self.k - 1] == 'i': 267 | if self.ends("ic"): pass 268 | else: return 269 | elif self.b[self.k - 1] == 'l': 270 | if self.ends("able"): pass 271 | elif self.ends("ible"): pass 272 | else: return 273 | elif self.b[self.k - 1] == 'n': 274 | if self.ends("ant"): pass 275 | elif self.ends("ement"): pass 276 | elif self.ends("ment"): pass 277 | elif self.ends("ent"): pass 278 | else: return 279 | elif self.b[self.k - 1] == 'o': 280 | if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass 281 | elif self.ends("ou"): pass 282 | # takes care of -ous 283 | else: return 284 | elif self.b[self.k - 1] == 's': 285 | if self.ends("ism"): pass 286 | else: return 287 | elif self.b[self.k - 1] == 't': 288 | if self.ends("ate"): pass 289 | elif self.ends("iti"): pass 290 | else: return 291 | elif self.b[self.k - 1] == 'u': 292 | if self.ends("ous"): pass 293 | else: return 294 | elif self.b[self.k - 1] == 'v': 295 | if self.ends("ive"): pass 296 | else: return 297 | elif self.b[self.k - 1] == 'z': 298 | if self.ends("ize"): pass 299 | else: return 300 | else: 301 | return 302 | if self.m() > 1: 303 | self.k = self.j 304 | 305 | def step5(self): 306 | """step5() removes a final -e if m() > 1, and changes -ll to -l if 307 | m() > 1. 308 | """ 309 | self.j = self.k 310 | if self.b[self.k] == 'e': 311 | a = self.m() 312 | if a > 1 or (a == 1 and not self.cvc(self.k-1)): 313 | self.k = self.k - 1 314 | if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: 315 | self.k = self.k -1 316 | 317 | def stem(self, p, i, j): 318 | """In stem(p,i,j), p is a char pointer, and the string to be stemmed 319 | is from p[i] to p[j] inclusive. Typically i is zero and j is the 320 | offset to the last character of a string, (p[j+1] == '\0'). The 321 | stemmer adjusts the characters p[i] ... p[j] and returns the new 322 | end-point of the string, k. Stemming never increases word length, so 323 | i <= k <= j. To turn the stemmer into a module, declare 'stem' as 324 | extern, and delete the remainder of this file. 325 | """ 326 | # copy the parameters into statics 327 | self.b = p 328 | self.k = j 329 | self.k0 = i 330 | if self.k <= self.k0 + 1: 331 | return self.b # --DEPARTURE-- 332 | 333 | # With this line, strings of length 1 or 2 don't go through the 334 | # stemming process, although no mention is made of this in the 335 | # published algorithm. Remove the line to match the published 336 | # algorithm. 337 | 338 | self.step1ab() 339 | self.step1c() 340 | self.step2() 341 | self.step3() 342 | self.step4() 343 | self.step5() 344 | return self.b[self.k0:self.k+1] 345 | 346 | 347 | if __name__ == '__main__': 348 | p = PorterStemmer() 349 | if len(sys.argv) > 1: 350 | for f in sys.argv[1:]: 351 | infile = open(f, 'r') 352 | while 1: 353 | output = '' 354 | word = '' 355 | line = infile.readline() 356 | if line == '': 357 | break 358 | for c in line: 359 | if c.isalpha(): 360 | word += c.lower() 361 | else: 362 | if word: 363 | output += p.stem(word, 0,len(word)-1) 364 | word = '' 365 | output += c.lower() 366 | print output, 367 | infile.close() 368 | --------------------------------------------------------------------------------