├── sample.csv
├── readme.md
├── nmf.py
├── nmftoxml.py
├── englishstop.txt
├── SurveyQuestionThemes.py
└── porter.py


/sample.csv:
--------------------------------------------------------------------------------
 1 | "id","text"
 2 | "1","Human machine interface for lab abc computer applications"
 3 | "2","A survey of user opinion of computer system response time"
 4 | "3","The EPS user interface management system"
 5 | "4","System and human system engineering testing of EPS"
 6 | "5","Relation of user perceived response time to error measurement"
 7 | "6","The generation of random binary unordered trees"
 8 | "7","The intersection graph of paths in trees"
 9 | "8","Graph minors IV Widths of trees and well quasi ordering"
10 | "9","Graph minors A survey"


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | NMFtoXML
 2 | 
 3 | NMFtoXML is a set of python scripts that imports text from a csv file, runs Non-negative Matrix Factorization and then outputs the resulting clusters to XML.
 4 | 
 5 | Dependencies:
 6 | Numpy is required.
 7 | The projected gradient NMF algorithm is used. The algorithm was ported to Python by Anthony Di Franco.
 8 | The PorterStemmer algorithm by Vivake Gupta.
 9 | 
10 | The script can be run from the command line.
11 | 
12 | The format is: python nmftoxml.py filename directory numberofthemes showdocs outputfilename
13 | 
14 | eg:
15 | python nmftoxml.py sample.csv C:\nmftoxml 2 1 sample.xml
16 | 
17 | 


--------------------------------------------------------------------------------
/nmf.py:
--------------------------------------------------------------------------------
 1 | # NMF by alternative non-negative least squares using projected gradients
 2 | # Author: Chih-Jen Lin, National Taiwan University
 3 | # Python/numpy translation: Anthony Di Franco
 4 | 
 5 | from numpy import *
 6 | from numpy.linalg import norm
 7 | from time import time
 8 | from sys import stdout
 9 | 
10 | def nmf(V,Winit,Hinit,tol,timelimit,maxiter):
11 |  """
12 |  (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter)
13 |  W,H: output solution
14 |  Winit,Hinit: initial solution
15 |  tol: tolerance for a relative stopping condition
16 |  timelimit, maxiter: limit of time and iterations
17 |  """
18 | 
19 |  W = Winit; H = Hinit; initt = time();
20 | 
21 |  gradW = dot(W, dot(H, H.T)) - dot(V, H.T)
22 |  gradH = dot(dot(W.T, W), H) - dot(W.T, V)
23 |  initgrad = norm(r_[gradW, gradH.T])
24 |  #print 'Init gradient norm %f' % initgrad 
25 |  tolW = max(0.001,tol)*initgrad
26 |  tolH = tolW
27 | 
28 |  for iter in xrange(1,maxiter):
29 |   # stopping condition
30 |   projnorm = norm(r_[gradW[logical_or(gradW<0, W>0)],
31 |                                  gradH[logical_or(gradH<0, H>0)]])
32 |   if projnorm < tol*initgrad or time() - initt > timelimit: break
33 |   
34 |   (W, gradW, iterW) = nlssubprob(V.T,H.T,W.T,tolW,1000)
35 |   W = W.T
36 |   gradW = gradW.T
37 |   
38 |   if iterW==1: tolW = 0.1 * tolW
39 | 
40 |   (H,gradH,iterH) = nlssubprob(V,W,H,tolH,1000)
41 |   if iterH==1: tolH = 0.1 * tolH
42 | 
43 |   if iter % 10 == 0: stdout.write('.')
44 | 
45 |  return (W,H)
46 | 
47 | def nlssubprob(V,W,Hinit,tol,maxiter):
48 |  """
49 |  H, grad: output solution and gradient
50 |  iter: #iterations used
51 |  V, W: constant matrices
52 |  Hinit: initial solution
53 |  tol: stopping tolerance
54 |  maxiter: limit of iterations
55 |  """
56 |  
57 |  H = Hinit
58 |  WtV = dot(W.T, V)
59 |  WtW = dot(W.T, W) 
60 | 
61 |  alpha = 1; beta = 0.1;
62 |  for iter in xrange(1, maxiter):  
63 |   grad = dot(WtW, H) - WtV
64 |   projgrad = norm(grad[logical_or(grad < 0, H >0)])
65 |   if projgrad < tol: break
66 | 
67 |   # search step size 
68 |   for inner_iter in xrange(1,20):
69 |    Hn = H - alpha*grad
70 |    Hn = where(Hn > 0, Hn, 0)
71 |    d = Hn-H
72 |    gradd = sum(grad * d)
73 |    dQd = sum(dot(WtW,d) * d)
74 |    suff_decr = 0.99*gradd + 0.5*dQd < 0;
75 |    if inner_iter == 1:
76 |     decr_alpha = not suff_decr; Hp = H;
77 |    if decr_alpha: 
78 |     if suff_decr:
79 |      H = Hn; break;
80 |     else:
81 |      alpha = alpha * beta;
82 |    else:
83 |       if not suff_decr or (Hp == Hn).all():
84 |        H = Hp; break;
85 |       else:
86 |        alpha = alpha/beta; Hp = Hn;
87 | 
88 |  return (H, grad, iter)


--------------------------------------------------------------------------------
/nmftoxml.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | # Programmed by Aneesha Bakharia
 3 | '''
 4 | 
 5 | import re 
 6 | from numpy import *
 7 | import csv 
 8 | import os
 9 | import sys
10 | import platform
11 | 
12 | import nmf as nmf # import non negative matrix factorization algorithm
13 | import porter as porter # import porter.py - porter stemmer algorithm
14 | import SurveyQuestionThemes as surveythemer
15 | 
16 | def remove_html_tags(data):
17 |     p = re.compile(r'<.*?>')
18 |     return p.sub('', data)
19 | 
20 | def findthemes(nothemes,wordlist,questionresponses,inc_articles,outputfile):
21 |     #print questionresponses
22 |     synonym_wordlists = []
23 |     synonym_wordlist = wordlist
24 |     synonym_wordlists = synonym_wordlist.splitlines()
25 |     exclude_wordlist = []
26 | 
27 |     stop_path = "englishstop.txt"
28 |     stop_words = surveythemer.loadStopWords(stop_path)
29 | 
30 |     surveyQuestionResponse = []
31 |     surveyQuestionResponseNID = []
32 |     
33 |     for response in questionresponses:
34 |       newresp = remove_html_tags(response["text"])
35 |       surveyQuestionResponse.append(newresp)
36 |       surveyQuestionResponseNID.append(response["id"])
37 |         
38 |     listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = surveythemer.getItemWords(surveyQuestionResponse,stop_words)
39 |     wordMatrix, listOfWordsIncluded, wordCount, fc, ic = surveythemer.createWordMatrix(listOfAllWords,listOfSurveyQuestionWords)
40 |     pc = nothemes
41 |     #size of input matrix
42 |     ic=shape(wordMatrix)[0]
43 |     fc=shape(wordMatrix)[1]
44 |     # Random initialization
45 |     w=array([[random.random() for j in range(pc)] for i in range(ic)])
46 |     h=array([[random.random() for i in range(fc)] for i in range(pc)])
47 |     nmfresult = ""
48 |     themes = ""
49 |     weights,themes = nmf.nmf(wordMatrix,w,h,0.001, 10, 500)
50 |     themexml = surveythemer.display_themes(weights,themes,listOfWordsIncluded,surveyQuestionResponse, stemdictionary, wordCount, inc_articles, surveyQuestionResponseNID)
51 |     f = open(outputfile, 'w')
52 |     f.write(themexml)
53 |     f.close()  
54 |     return 
55 |  
56 | fileName = sys.argv[1]
57 | directoryName = sys.argv[2]
58 | filepath = os.path.abspath(directoryName + '/' + fileName)
59 | nothemes = int(sys.argv[3])
60 | showdocs = int(sys.argv[4])
61 | outputfile = sys.argv[5]
62 | 
63 | outputfiledir = os.path.abspath(directoryName + '/data/' + outputfile)
64 | 
65 | data = csv.reader(open(filepath))  
66 | 
67 | questionresponses = []
68 | # Read the column names from the first line of the file  
69 | fields = data.next()  
70 | items = ""
71 | count = 1
72 | for row in data:  
73 |   #print count
74 |   count = count + 1
75 |   # Zip together the field names and values  
76 |   items = zip(fields, row)  
77 |   item = {}  
78 |   # Add the value to our dictionary  
79 |   for (name, value) in items:  
80 |     item[name] = value.strip() 
81 |   questionresponses.append(item)
82 | 
83 | findthemes(nothemes,"",questionresponses,showdocs,outputfiledir)


--------------------------------------------------------------------------------
/englishstop.txt:
--------------------------------------------------------------------------------
  1 | a
  2 | a's
  3 | able
  4 | about
  5 | above
  6 | according
  7 | accordingly
  8 | across
  9 | actually
 10 | after
 11 | afterwards
 12 | again
 13 | against
 14 | ain't
 15 | all
 16 | allow
 17 | allows
 18 | almost
 19 | alone
 20 | along
 21 | already
 22 | also
 23 | although
 24 | always
 25 | am
 26 | among
 27 | amongst
 28 | an
 29 | and
 30 | another
 31 | any
 32 | anybody
 33 | anyhow
 34 | anyone
 35 | anything
 36 | anyway
 37 | anyways
 38 | anywhere
 39 | apart
 40 | appear
 41 | appreciate
 42 | appropriate
 43 | are
 44 | aren't
 45 | around
 46 | as
 47 | aside
 48 | ask
 49 | asking
 50 | associated
 51 | at
 52 | available
 53 | away
 54 | awfully
 55 | b
 56 | be
 57 | became
 58 | because
 59 | become
 60 | becomes
 61 | becoming
 62 | been
 63 | before
 64 | beforehand
 65 | behind
 66 | being
 67 | believe
 68 | below
 69 | beside
 70 | besides
 71 | best
 72 | better
 73 | between
 74 | beyond
 75 | both
 76 | brief
 77 | but
 78 | by
 79 | c
 80 | c'mon
 81 | c's
 82 | came
 83 | can
 84 | can't
 85 | cannot
 86 | cant
 87 | cause
 88 | causes
 89 | certain
 90 | certainly
 91 | changes
 92 | clearly
 93 | co
 94 | com
 95 | come
 96 | comes
 97 | concerning
 98 | consequently
 99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero


--------------------------------------------------------------------------------
/SurveyQuestionThemes.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | # Programmed by Aneesha Bakharia
  3 | '''
  4 | 
  5 | import re # import regular expression module
  6 | from numpy import * # import numpy for matrix & array features
  7 | from numpy.linalg import svd
  8 | from numpy.linalg import norm
  9 | #import nnmf # import non negative matrix factorization algorithm
 10 | import porter # import porter.py - porter stemmer algorithm
 11 | from xml.dom.minidom import Document
 12 | ##
 13 | # A collection of functions for using Non Negative Matrix Factorization to find themes in open ended survey questions
 14 | 
 15 | ##
 16 | # Utility function to return a list of all words that are have a length greater than a specified number of characters.
 17 | # @param text The text that must be split in to words.
 18 | # @param minWordReturnSize The minimum no of characters a word must have to be included.
 19 | def separatewords(text,minWordReturnSize):
 20 |   #splitter=re.compile('\\W*')
 21 |   splitter=re.compile('[^a-zA-Z0-9_\\+\\-]')
 22 |   return [singleWord.lower() for singleWord in splitter.split(text) if len(singleWord)>minWordReturnSize]
 23 | 
 24 | ##
 25 | # Utility function to sort a dictionary by Value in decending order
 26 | # Not the most efficient implementation - may need to refactor if speed becomes an issue
 27 | # @param dictionary The dictionary data structure.
 28 | # @return list A list of keys sorted by their value.
 29 | def sortDictionaryByValues(dictionary):
 30 |     """ Returns the keys of dictionary d sorted by their values """
 31 |     items=dictionary.items()
 32 |     backitems=[ [v[1],v[0]] for v in items]
 33 |     backitems.sort()
 34 |     backitems.reverse()
 35 |     return [ backitems[i][1] for i in range(0,len(backitems))]
 36 | 
 37 | ##
 38 | # Utility function to load stop words from a file and return as a list of words
 39 | # @param stopWordFile Path and file name of a file containing stop words.
 40 | # @return list A list of stop words.
 41 | def loadStopWords(stopWordFile):
 42 |     stopWords = []
 43 |     for line in open(stopWordFile):
 44 |         for word in line.split( ): #in case more than one per line
 45 |             stopWords.append(word)
 46 |     return stopWords
 47 | 
 48 | ##
 49 | # Utility function to remove stop words that are present in a list of words
 50 | # @param wordlist Unfiltered list of words
 51 | # @param stopwordlist List of stops words
 52 | # @return list Filtered list of words
 53 | def removeStopWords(wordlist,stopwordlist):
 54 |     return [word for word in wordlist if word not in stopwordlist]
 55 | 
 56 | ##
 57 | # This function returns a list of all words that are have a length greater than a specified number of characters.
 58 | # @param text The text that must be split in to words.
 59 | # @param minWordReturnSize The minimum no of characters a word must have to be included.
 60 | def getItemWords(list_of_words,stop_words):
 61 |   stemmer=porter.PorterStemmer()
 62 |   allwords={}
 63 |   itemwords=[]
 64 |   itemtitles=[]
 65 |   ec=0
 66 |   stemlist = {}
 67 |   # Loop over every item in list_of_words
 68 |   for item in list_of_words:
 69 |       words=separatewords(item,1)
 70 |       words = removeStopWords(words,stop_words)
 71 |       itemwords.append({})
 72 |       itemtitles.append("Response " + str(ec+1))
 73 |       # Increase the counts for this word in allwords and in articlewords
 74 |       for word in words:
 75 |         unstemmedword = word
 76 |         word=stemmer.stem(word,0,len(word)-1)
 77 | 	if word in stemlist:
 78 | 		temp = stemlist[word]
 79 |  		try:
 80 |         		temp.index(unstemmedword)
 81 |    		except ValueError:
 82 | 			temp.append(unstemmedword)
 83 | 			stemlist[word] = temp
 84 | 	else:
 85 | 		temp = []
 86 | 		temp.append(unstemmedword)
 87 | 		stemlist[word] = temp	
 88 |         allwords.setdefault(word,0)
 89 |         allwords[word]+=1
 90 |         itemwords[ec].setdefault(word,0)
 91 |         itemwords[ec][word]+=1
 92 |       ec+=1
 93 |   return allwords,itemwords,itemtitles,stemlist
 94 | 
 95 | 
 96 | ##
 97 | # Returns the document (row) and words (columns) matrix and the list of words as a vector
 98 | def createWordMatrix(all_words,words_inItems):
 99 |     #print all_words
100 |     #print words_inItems
101 |     wordvector=[]
102 |     # Only take words that are common but not too common
103 |     for w,c in all_words.items():
104 |         wordvector.append(w)
105 |         #if c<len(words_inItems)*0.6: #*0.2
106 |     # Create the word matrix
107 |     cols = len(wordvector)
108 |     rows = len(words_inItems)
109 |     l1=[[(wrd in f and f[wrd] or 0) for wrd in wordvector] for f in words_inItems]
110 |     wordMatrix = array(l1)
111 |     wordCount = []
112 |     sum = 0
113 |     for c in range(0, cols):
114 |         for r in range(0, rows):
115 |             sum += wordMatrix[r][c]
116 |         wordCount.append(sum)
117 |         sum = 0
118 |     return wordMatrix, wordvector, wordCount, cols, rows
119 | 
120 | 
121 | ##
122 | # Display themes, top six words in a theme and associated documents in theme.
123 | def display_themes(weights_matrix,themes_matrix,list_Of_Included_Words,item_Titles, stemdictionary, wordCount, inc_articles, surveyQuestionResponseNID):
124 |     # Data structures for Graph Representation
125 |     ThemeDictionary = {}
126 |     DocDictionary = {}
127 |     TermDictionary = {}
128 |     # Data structures to represent relationships
129 |     DocInTheme = {}
130 |     TermInTheme = {}
131 |     # Start of NMF interpretation
132 |     norows,nocols = shape(themes_matrix)
133 |     noarticles,nofeatures = shape(weights_matrix)
134 |     d1={}
135 |     count = 1
136 |     theme_html = ""
137 |     themes_html = ""
138 |     # setup XML structure
139 |     # Create the minidom document
140 |     doc = Document()
141 |     # Create the <themes> base element
142 |     themes_xml = doc.createElement("themes")
143 |     doc.appendChild(themes_xml)
144 |     
145 |     for row in range(norows):
146 | 	theme_html = ""
147 | 	# Create the main <theme> element
148 | 	theme_xml = doc.createElement("theme")
149 | 	theme_xml.setAttribute("id", str(count))
150 | 	theme_xml.setAttribute("title", "Theme " + str(count))
151 | 	themes_xml.appendChild(theme_xml)
152 | 	
153 |         ThemeDictionary["T" + str(count)] = "Theme " + str(count)
154 | 	
155 | 	# Create a <words> element
156 | 	words_xml = doc.createElement("words")
157 | 	theme_xml.appendChild(words_xml)
158 | 	
159 |         for col in range(nocols):
160 |             d1[list_Of_Included_Words[col]] = themes_matrix[row,col]
161 |         themes_list_in_order = sortDictionaryByValues(d1)
162 |         for it in themes_list_in_order[0:6]:
163 |           word_index = list_Of_Included_Words.index(it)
164 |           TermDictionary[it] = stemdictionary[it]
165 |           TermInTheme["T" + str(count) + "_" + it] = str("%.2f" % d1[it])
166 | 	  
167 | 	  word_xml = doc.createElement("word")
168 | 	  word_xml.setAttribute("weight", str(d1[it]))
169 | 	  wordtext_xml = doc.createTextNode(it)
170 | 	  word_xml.appendChild(wordtext_xml)
171 | 	  words_xml.appendChild(word_xml)
172 | 	  
173 | 	theme_html = theme_html + "</p>"
174 | 	if (inc_articles==1):
175 |           # Print articles/items/survey responses that map to Theme/Feature
176 |           articlesInTheme = {}
177 |           articleweights=[]
178 |           for article in range(noarticles):
179 |             if (weights_matrix[article,row] > 0.08):
180 |                 articlesInTheme[article] = weights_matrix[article,row]
181 |                 articleweights.append(weights_matrix[article,row])
182 |                 DocDictionary["D" + str(article)] = "Doc" + str(article)
183 |                 DocInTheme["T" + str(count) + "_D" + str(article)] = str("%.2f" % weights_matrix[article,row])
184 |             else:
185 |                 # Capture all non attached documents
186 |                 DocDictionary["D" + str(article)] = "Doc" + str(article)
187 |           max_weight_val = max(articleweights)
188 |           min_weight_val = min(articleweights)
189 | 
190 |           articles_In_Theme_Order = sortDictionaryByValues(articlesInTheme)
191 | 
192 | 	  # Create a <responses> element
193 | 	  reponses_xml = doc.createElement("responses")
194 | 	  theme_xml.appendChild(reponses_xml)
195 | 	  
196 | 	  for article_no in articles_In_Theme_Order:
197 | 	    
198 | 	    response_xml = doc.createElement("response")
199 | 	    response_xml.setAttribute("id", str(surveyQuestionResponseNID[article_no]))
200 | 	    response_xml.setAttribute("weight", str(("%.2f" % weights_matrix[article_no,row])))
201 | 
202 | 	    responsetext_xml = doc.createTextNode(item_Titles[article_no])
203 | 	    response_xml.appendChild(responsetext_xml)
204 | 
205 | 	    reponses_xml.appendChild(response_xml)
206 | 
207 | 	  # Create Rendered element
208 | 	  rendered_xml = doc.createElement("rendered")
209 | 	  theme_xml.appendChild(rendered_xml)
210 | 
211 | 	count = count + 1
212 |     return doc.toprettyxml(indent="  ")
213 |    
214 | 
215 | # Strange way to determine if NaN in Python?
216 | def isNaN(x):
217 |     return (x == x) == False
218 |     


--------------------------------------------------------------------------------
/porter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """Porter Stemming Algorithm
  4 | This is the Porter stemming algorithm, ported to Python from the
  5 | version coded up in ANSI C by the author. It may be be regarded
  6 | as canonical, in that it follows the algorithm presented in
  7 | 
  8 | Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  9 | no. 3, pp 130-137,
 10 | 
 11 | only differing from it at the points maked --DEPARTURE-- below.
 12 | 
 13 | See also http://www.tartarus.org/~martin/PorterStemmer
 14 | 
 15 | The algorithm as described in the paper could be exactly replicated
 16 | by adjusting the points of DEPARTURE, but this is barely necessary,
 17 | because (a) the points of DEPARTURE are definitely improvements, and
 18 | (b) no encoding of the Porter stemmer I have seen is anything like
 19 | as exact as this version, even with the points of DEPARTURE!
 20 | 
 21 | Vivake Gupta (v@nano.com)
 22 | 
 23 | Release 1: January 2001
 24 | 
 25 | Further adjustments by Santiago Bruno (bananabruno@gmail.com)
 26 | to allow word input not restricted to one word per line, leading
 27 | to:
 28 | 
 29 | release 2: July 2008
 30 | """
 31 | 
 32 | import sys
 33 | 
 34 | class PorterStemmer:
 35 | 
 36 |     def __init__(self):
 37 |         """The main part of the stemming algorithm starts here.
 38 |         b is a buffer holding a word to be stemmed. The letters are in b[k0],
 39 |         b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
 40 |         readjusted downwards as the stemming progresses. Zero termination is
 41 |         not in fact used in the algorithm.
 42 | 
 43 |         Note that only lower case sequences are stemmed. Forcing to lower case
 44 |         should be done before stem(...) is called.
 45 |         """
 46 | 
 47 |         self.b = ""  # buffer for word to be stemmed
 48 |         self.k = 0
 49 |         self.k0 = 0
 50 |         self.j = 0   # j is a general offset into the string
 51 | 
 52 |     def cons(self, i):
 53 |         """cons(i) is TRUE <=> b[i] is a consonant."""
 54 |         if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
 55 |             return 0
 56 |         if self.b[i] == 'y':
 57 |             if i == self.k0:
 58 |                 return 1
 59 |             else:
 60 |                 return (not self.cons(i - 1))
 61 |         return 1
 62 | 
 63 |     def m(self):
 64 |         """m() measures the number of consonant sequences between k0 and j.
 65 |         if c is a consonant sequence and v a vowel sequence, and <..>
 66 |         indicates arbitrary presence,
 67 | 
 68 |            <c><v>       gives 0
 69 |            <c>vc<v>     gives 1
 70 |            <c>vcvc<v>   gives 2
 71 |            <c>vcvcvc<v> gives 3
 72 |            ....
 73 |         """
 74 |         n = 0
 75 |         i = self.k0
 76 |         while 1:
 77 |             if i > self.j:
 78 |                 return n
 79 |             if not self.cons(i):
 80 |                 break
 81 |             i = i + 1
 82 |         i = i + 1
 83 |         while 1:
 84 |             while 1:
 85 |                 if i > self.j:
 86 |                     return n
 87 |                 if self.cons(i):
 88 |                     break
 89 |                 i = i + 1
 90 |             i = i + 1
 91 |             n = n + 1
 92 |             while 1:
 93 |                 if i > self.j:
 94 |                     return n
 95 |                 if not self.cons(i):
 96 |                     break
 97 |                 i = i + 1
 98 |             i = i + 1
 99 | 
100 |     def vowelinstem(self):
101 |         """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
102 |         for i in range(self.k0, self.j + 1):
103 |             if not self.cons(i):
104 |                 return 1
105 |         return 0
106 | 
107 |     def doublec(self, j):
108 |         """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
109 |         if j < (self.k0 + 1):
110 |             return 0
111 |         if (self.b[j] != self.b[j-1]):
112 |             return 0
113 |         return self.cons(j)
114 | 
115 |     def cvc(self, i):
116 |         """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
117 |         and also if the second c is not w,x or y. this is used when trying to
118 |         restore an e at the end of a short  e.g.
119 | 
120 |            cav(e), lov(e), hop(e), crim(e), but
121 |            snow, box, tray.
122 |         """
123 |         if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
124 |             return 0
125 |         ch = self.b[i]
126 |         if ch == 'w' or ch == 'x' or ch == 'y':
127 |             return 0
128 |         return 1
129 | 
130 |     def ends(self, s):
131 |         """ends(s) is TRUE <=> k0,...k ends with the string s."""
132 |         length = len(s)
133 |         if s[length - 1] != self.b[self.k]: # tiny speed-up
134 |             return 0
135 |         if length > (self.k - self.k0 + 1):
136 |             return 0
137 |         if self.b[self.k-length+1:self.k+1] != s:
138 |             return 0
139 |         self.j = self.k - length
140 |         return 1
141 | 
142 |     def setto(self, s):
143 |         """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
144 |         length = len(s)
145 |         self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
146 |         self.k = self.j + length
147 | 
148 |     def r(self, s):
149 |         """r(s) is used further down."""
150 |         if self.m() > 0:
151 |             self.setto(s)
152 | 
153 |     def step1ab(self):
154 |         """step1ab() gets rid of plurals and -ed or -ing. e.g.
155 | 
156 |            caresses  ->  caress
157 |            ponies    ->  poni
158 |            ties      ->  ti
159 |            caress    ->  caress
160 |            cats      ->  cat
161 | 
162 |            feed      ->  feed
163 |            agreed    ->  agree
164 |            disabled  ->  disable
165 | 
166 |            matting   ->  mat
167 |            mating    ->  mate
168 |            meeting   ->  meet
169 |            milling   ->  mill
170 |            messing   ->  mess
171 | 
172 |            meetings  ->  meet
173 |         """
174 |         if self.b[self.k] == 's':
175 |             if self.ends("sses"):
176 |                 self.k = self.k - 2
177 |             elif self.ends("ies"):
178 |                 self.setto("i")
179 |             elif self.b[self.k - 1] != 's':
180 |                 self.k = self.k - 1
181 |         if self.ends("eed"):
182 |             if self.m() > 0:
183 |                 self.k = self.k - 1
184 |         elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
185 |             self.k = self.j
186 |             if self.ends("at"):   self.setto("ate")
187 |             elif self.ends("bl"): self.setto("ble")
188 |             elif self.ends("iz"): self.setto("ize")
189 |             elif self.doublec(self.k):
190 |                 self.k = self.k - 1
191 |                 ch = self.b[self.k]
192 |                 if ch == 'l' or ch == 's' or ch == 'z':
193 |                     self.k = self.k + 1
194 |             elif (self.m() == 1 and self.cvc(self.k)):
195 |                 self.setto("e")
196 | 
197 |     def step1c(self):
198 |         """step1c() turns terminal y to i when there is another vowel in the stem."""
199 |         if (self.ends("y") and self.vowelinstem()):
200 |             self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
201 | 
202 |     def step2(self):
203 |         """step2() maps double suffices to single ones.
204 |         so -ization ( = -ize plus -ation) maps to -ize etc. note that the
205 |         string before the suffix must give m() > 0.
206 |         """
207 |         if self.b[self.k - 1] == 'a':
208 |             if self.ends("ational"):   self.r("ate")
209 |             elif self.ends("tional"):  self.r("tion")
210 |         elif self.b[self.k - 1] == 'c':
211 |             if self.ends("enci"):      self.r("ence")
212 |             elif self.ends("anci"):    self.r("ance")
213 |         elif self.b[self.k - 1] == 'e':
214 |             if self.ends("izer"):      self.r("ize")
215 |         elif self.b[self.k - 1] == 'l':
216 |             if self.ends("bli"):       self.r("ble") # --DEPARTURE--
217 |             # To match the published algorithm, replace this phrase with
218 |             #   if self.ends("abli"):      self.r("able")
219 |             elif self.ends("alli"):    self.r("al")
220 |             elif self.ends("entli"):   self.r("ent")
221 |             elif self.ends("eli"):     self.r("e")
222 |             elif self.ends("ousli"):   self.r("ous")
223 |         elif self.b[self.k - 1] == 'o':
224 |             if self.ends("ization"):   self.r("ize")
225 |             elif self.ends("ation"):   self.r("ate")
226 |             elif self.ends("ator"):    self.r("ate")
227 |         elif self.b[self.k - 1] == 's':
228 |             if self.ends("alism"):     self.r("al")
229 |             elif self.ends("iveness"): self.r("ive")
230 |             elif self.ends("fulness"): self.r("ful")
231 |             elif self.ends("ousness"): self.r("ous")
232 |         elif self.b[self.k - 1] == 't':
233 |             if self.ends("aliti"):     self.r("al")
234 |             elif self.ends("iviti"):   self.r("ive")
235 |             elif self.ends("biliti"):  self.r("ble")
236 |         elif self.b[self.k - 1] == 'g': # --DEPARTURE--
237 |             if self.ends("logi"):      self.r("log")
238 |         # To match the published algorithm, delete this phrase
239 | 
240 |     def step3(self):
241 |         """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
242 |         if self.b[self.k] == 'e':
243 |             if self.ends("icate"):     self.r("ic")
244 |             elif self.ends("ative"):   self.r("")
245 |             elif self.ends("alize"):   self.r("al")
246 |         elif self.b[self.k] == 'i':
247 |             if self.ends("iciti"):     self.r("ic")
248 |         elif self.b[self.k] == 'l':
249 |             if self.ends("ical"):      self.r("ic")
250 |             elif self.ends("ful"):     self.r("")
251 |         elif self.b[self.k] == 's':
252 |             if self.ends("ness"):      self.r("")
253 | 
254 |     def step4(self):
255 |         """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
256 |         if self.b[self.k - 1] == 'a':
257 |             if self.ends("al"): pass
258 |             else: return
259 |         elif self.b[self.k - 1] == 'c':
260 |             if self.ends("ance"): pass
261 |             elif self.ends("ence"): pass
262 |             else: return
263 |         elif self.b[self.k - 1] == 'e':
264 |             if self.ends("er"): pass
265 |             else: return
266 |         elif self.b[self.k - 1] == 'i':
267 |             if self.ends("ic"): pass
268 |             else: return
269 |         elif self.b[self.k - 1] == 'l':
270 |             if self.ends("able"): pass
271 |             elif self.ends("ible"): pass
272 |             else: return
273 |         elif self.b[self.k - 1] == 'n':
274 |             if self.ends("ant"): pass
275 |             elif self.ends("ement"): pass
276 |             elif self.ends("ment"): pass
277 |             elif self.ends("ent"): pass
278 |             else: return
279 |         elif self.b[self.k - 1] == 'o':
280 |             if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
281 |             elif self.ends("ou"): pass
282 |             # takes care of -ous
283 |             else: return
284 |         elif self.b[self.k - 1] == 's':
285 |             if self.ends("ism"): pass
286 |             else: return
287 |         elif self.b[self.k - 1] == 't':
288 |             if self.ends("ate"): pass
289 |             elif self.ends("iti"): pass
290 |             else: return
291 |         elif self.b[self.k - 1] == 'u':
292 |             if self.ends("ous"): pass
293 |             else: return
294 |         elif self.b[self.k - 1] == 'v':
295 |             if self.ends("ive"): pass
296 |             else: return
297 |         elif self.b[self.k - 1] == 'z':
298 |             if self.ends("ize"): pass
299 |             else: return
300 |         else:
301 |             return
302 |         if self.m() > 1:
303 |             self.k = self.j
304 | 
305 |     def step5(self):
306 |         """step5() removes a final -e if m() > 1, and changes -ll to -l if
307 |         m() > 1.
308 |         """
309 |         self.j = self.k
310 |         if self.b[self.k] == 'e':
311 |             a = self.m()
312 |             if a > 1 or (a == 1 and not self.cvc(self.k-1)):
313 |                 self.k = self.k - 1
314 |         if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
315 |             self.k = self.k -1
316 | 
317 |     def stem(self, p, i, j):
318 |         """In stem(p,i,j), p is a char pointer, and the string to be stemmed
319 |         is from p[i] to p[j] inclusive. Typically i is zero and j is the
320 |         offset to the last character of a string, (p[j+1] == '\0'). The
321 |         stemmer adjusts the characters p[i] ... p[j] and returns the new
322 |         end-point of the string, k. Stemming never increases word length, so
323 |         i <= k <= j. To turn the stemmer into a module, declare 'stem' as
324 |         extern, and delete the remainder of this file.
325 |         """
326 |         # copy the parameters into statics
327 |         self.b = p
328 |         self.k = j
329 |         self.k0 = i
330 |         if self.k <= self.k0 + 1:
331 |             return self.b # --DEPARTURE--
332 | 
333 |         # With this line, strings of length 1 or 2 don't go through the
334 |         # stemming process, although no mention is made of this in the
335 |         # published algorithm. Remove the line to match the published
336 |         # algorithm.
337 | 
338 |         self.step1ab()
339 |         self.step1c()
340 |         self.step2()
341 |         self.step3()
342 |         self.step4()
343 |         self.step5()
344 |         return self.b[self.k0:self.k+1]
345 | 
346 | 
347 | if __name__ == '__main__':
348 |     p = PorterStemmer()
349 |     if len(sys.argv) > 1:
350 |         for f in sys.argv[1:]:
351 |             infile = open(f, 'r')
352 |             while 1:
353 |                 output = ''
354 |                 word = ''
355 |                 line = infile.readline()
356 |                 if line == '':
357 |                     break
358 |                 for c in line:
359 |                     if c.isalpha():
360 |                         word += c.lower()
361 |                     else:
362 |                         if word:
363 |                             output += p.stem(word, 0,len(word)-1)
364 |                             word = ''
365 |                         output += c.lower()
366 |                 print output,
367 |             infile.close()
368 | 


--------------------------------------------------------------------------------