├── sample.csv
├── readme.md
├── nmf.py
├── nmftoxml.py
├── englishstop.txt
├── SurveyQuestionThemes.py
└── porter.py
/sample.csv:
--------------------------------------------------------------------------------
1 | "id","text"
2 | "1","Human machine interface for lab abc computer applications"
3 | "2","A survey of user opinion of computer system response time"
4 | "3","The EPS user interface management system"
5 | "4","System and human system engineering testing of EPS"
6 | "5","Relation of user perceived response time to error measurement"
7 | "6","The generation of random binary unordered trees"
8 | "7","The intersection graph of paths in trees"
9 | "8","Graph minors IV Widths of trees and well quasi ordering"
10 | "9","Graph minors A survey"
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | NMFtoXML
2 |
3 | NMFtoXML is a set of Python scripts that import text from a CSV file, run Non-negative Matrix Factorization, and then output the resulting clusters to XML.
4 |
5 | Dependencies:
6 | Numpy is required.
7 | The projected gradient NMF algorithm is used. The algorithm was ported to Python by Anthony Di Franco.
8 | The Porter stemmer algorithm was implemented in Python by Vivake Gupta.
9 |
10 | The script can be run from the command line.
11 |
12 | The format is: python nmftoxml.py filename directory numberofthemes showdocs outputfilename
13 |
14 | eg:
15 | python nmftoxml.py sample.csv C:\nmftoxml 2 1 sample.xml
16 |
17 |
--------------------------------------------------------------------------------
/nmf.py:
--------------------------------------------------------------------------------
1 | # NMF by alternative non-negative least squares using projected gradients
2 | # Author: Chih-Jen Lin, National Taiwan University
3 | # Python/numpy translation: Anthony Di Franco
4 |
5 | from numpy import *
6 | from numpy.linalg import norm
7 | from time import time
8 | from sys import stdout
9 |
def nmf(V, Winit, Hinit, tol, timelimit, maxiter):
    """
    (W,H) = nmf(V,Winit,Hinit,tol,timelimit,maxiter)
    NMF by alternating non-negative least squares using projected gradients.

    W,H: output solution
    Winit,Hinit: initial solution
    tol: tolerance for a relative stopping condition
    timelimit, maxiter: limit of time (seconds) and outer iterations
    """
    W = Winit
    H = Hinit
    initt = time()

    # Gradients of 0.5*||V - WH||_F^2 with respect to W and H.
    gradW = dot(W, dot(H, H.T)) - dot(V, H.T)
    gradH = dot(dot(W.T, W), H) - dot(W.T, V)
    initgrad = norm(r_[gradW, gradH.T])
    # Subproblem tolerances start loose and are tightened below when a
    # subproblem converges immediately.
    tolW = max(0.001, tol) * initgrad
    tolH = tolW

    # range(1, maxiter+1) performs maxiter outer iterations (the previous
    # xrange(1, maxiter) stopped one short and was Python-2 only); 'it'
    # avoids shadowing the builtin iter().
    for it in range(1, maxiter + 1):
        # Stopping condition: projected gradient norm relative to the
        # initial gradient norm, or the wall-clock time limit.
        projnorm = norm(r_[gradW[logical_or(gradW < 0, W > 0)],
                           gradH[logical_or(gradH < 0, H > 0)]])
        if projnorm < tol * initgrad or time() - initt > timelimit:
            break

        # Solve for W with H fixed, via the transposed subproblem.
        (W, gradW, iterW) = nlssubprob(V.T, H.T, W.T, tolW, 1000)
        W = W.T
        gradW = gradW.T
        # Subproblem stopped immediately: its tolerance was too loose.
        if iterW == 1:
            tolW = 0.1 * tolW

        # Solve for H with W fixed.
        (H, gradH, iterH) = nlssubprob(V, W, H, tolH, 1000)
        if iterH == 1:
            tolH = 0.1 * tolH

        # Progress indicator, one dot per 10 outer iterations.
        if it % 10 == 0:
            stdout.write('.')

    return (W, H)
46 |
def nlssubprob(V, W, Hinit, tol, maxiter):
    """
    Projected-gradient solver for the non-negative least squares
    subproblem  min_{H >= 0} 0.5*||V - W H||_F^2.

    H, grad: output solution and its gradient
    it: number of iterations used
    V, W: constant matrices
    Hinit: initial solution
    tol: stopping tolerance on the projected gradient norm
    maxiter: limit of iterations
    """
    H = Hinit
    # Precompute the constant pieces of the gradient: W'V and W'W.
    WtV = dot(W.T, V)
    WtW = dot(W.T, W)

    alpha = 1   # step size, adapted by the backtracking search below
    beta = 0.1  # step-size shrink/grow factor
    # range(1, maxiter+1) performs maxiter iterations (the previous
    # xrange(1, maxiter) stopped one short and was Python-2 only); 'it'
    # avoids shadowing the builtin iter().
    for it in range(1, maxiter + 1):
        grad = dot(WtW, H) - WtV
        # Projected gradient: only components that are free to move
        # (negative gradient, or strictly positive H) count.
        projgrad = norm(grad[logical_or(grad < 0, H > 0)])
        if projgrad < tol:
            break

        # Backtracking line search for an acceptable step size.
        for inner_iter in range(1, 20):
            Hn = H - alpha * grad
            Hn = where(Hn > 0, Hn, 0)  # project onto the feasible set H >= 0
            d = Hn - H
            gradd = sum(grad * d)
            dQd = sum(dot(WtW, d) * d)
            # Sufficient-decrease condition from Lin's projected gradient
            # method (sigma = 0.01).
            suff_decr = 0.99 * gradd + 0.5 * dQd < 0
            if inner_iter == 1:
                decr_alpha = not suff_decr
                Hp = H
            if decr_alpha:
                # Shrink alpha until the decrease condition holds.
                if suff_decr:
                    H = Hn
                    break
                else:
                    alpha = alpha * beta
            else:
                # Grow alpha while the condition keeps holding.
                if not suff_decr or (Hp == Hn).all():
                    H = Hp
                    break
                else:
                    alpha = alpha / beta
                    Hp = Hn

    return (H, grad, it)
--------------------------------------------------------------------------------
/nmftoxml.py:
--------------------------------------------------------------------------------
1 | '''
2 | # Programmed by Aneesha Bakharia
3 | '''
4 |
5 | import re
6 | from numpy import *
7 | import csv
8 | import os
9 | import sys
10 | import platform
11 |
12 | import nmf as nmf # import non negative matrix factorization algorithm
13 | import porter as porter # import porter.py - porter stemmer algorithm
14 | import SurveyQuestionThemes as surveythemer
15 |
# Matches any HTML/XML tag; non-greedy so adjacent tags match separately.
# Compiled once at import time rather than on every call.
_TAG_RE = re.compile(r'<.*?>')

def remove_html_tags(data):
    """Return *data* with all HTML/XML tags stripped out."""
    return _TAG_RE.sub('', data)
19 |
def findthemes(nothemes, wordlist, questionresponses, inc_articles, outputfile):
    """
    Run NMF over the survey responses and write the resulting theme
    clusters to *outputfile* as XML.

    nothemes: number of themes (NMF factorization rank)
    wordlist: newline-separated synonym word list (currently unused downstream)
    questionresponses: list of dicts with "id" and "text" keys
    inc_articles: whether documents are included in the XML output
    outputfile: path of the XML file to write
    """
    # Stop words are loaded from a file in the current working directory.
    stop_words = surveythemer.loadStopWords("englishstop.txt")

    surveyQuestionResponse = []
    surveyQuestionResponseNID = []
    for response in questionresponses:
        # Strip markup before tokenising.
        surveyQuestionResponse.append(remove_html_tags(response["text"]))
        surveyQuestionResponseNID.append(response["id"])

    listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = surveythemer.getItemWords(surveyQuestionResponse, stop_words)
    wordMatrix, listOfWordsIncluded, wordCount, fc, ic = surveythemer.createWordMatrix(listOfAllWords, listOfSurveyQuestionWords)

    pc = nothemes
    # Size of the input matrix: ic documents (rows) x fc words (columns).
    ic = shape(wordMatrix)[0]
    fc = shape(wordMatrix)[1]
    # Random non-negative initialization of the two factor matrices.
    w = array([[random.random() for j in range(pc)] for i in range(ic)])
    h = array([[random.random() for j in range(fc)] for i in range(pc)])
    weights, themes = nmf.nmf(wordMatrix, w, h, 0.001, 10, 500)

    themexml = surveythemer.display_themes(weights, themes, listOfWordsIncluded, surveyQuestionResponse, stemdictionary, wordCount, inc_articles, surveyQuestionResponseNID)
    # Context manager guarantees the file is closed even if the write fails.
    with open(outputfile, 'w') as f:
        f.write(themexml)
55 |
# --- Command-line entry point ---------------------------------------------
# Usage: python nmftoxml.py filename directory numberofthemes showdocs outputfilename
fileName = sys.argv[1]
directoryName = sys.argv[2]
filepath = os.path.abspath(directoryName + '/' + fileName)
nothemes = int(sys.argv[3])
showdocs = int(sys.argv[4])
outputfile = sys.argv[5]

outputfiledir = os.path.abspath(directoryName + '/data/' + outputfile)

questionresponses = []
# Open the CSV explicitly so the handle can be closed when we are done
# (csv.reader(open(...)) leaked the file descriptor).
csvfile = open(filepath)
try:
    data = csv.reader(csvfile)
    # Read the column names from the first line of the file.
    # next(data) works on both Python 2.6+ and 3; data.next() was 2-only.
    fields = next(data)
    for row in data:
        # Zip field names with values to build one dict per response.
        item = {}
        for (name, value) in zip(fields, row):
            item[name] = value.strip()
        questionresponses.append(item)
finally:
    csvfile.close()

findthemes(nothemes, "", questionresponses, showdocs, outputfiledir)
--------------------------------------------------------------------------------
/englishstop.txt:
--------------------------------------------------------------------------------
1 | a
2 | a's
3 | able
4 | about
5 | above
6 | according
7 | accordingly
8 | across
9 | actually
10 | after
11 | afterwards
12 | again
13 | against
14 | ain't
15 | all
16 | allow
17 | allows
18 | almost
19 | alone
20 | along
21 | already
22 | also
23 | although
24 | always
25 | am
26 | among
27 | amongst
28 | an
29 | and
30 | another
31 | any
32 | anybody
33 | anyhow
34 | anyone
35 | anything
36 | anyway
37 | anyways
38 | anywhere
39 | apart
40 | appear
41 | appreciate
42 | appropriate
43 | are
44 | aren't
45 | around
46 | as
47 | aside
48 | ask
49 | asking
50 | associated
51 | at
52 | available
53 | away
54 | awfully
55 | b
56 | be
57 | became
58 | because
59 | become
60 | becomes
61 | becoming
62 | been
63 | before
64 | beforehand
65 | behind
66 | being
67 | believe
68 | below
69 | beside
70 | besides
71 | best
72 | better
73 | between
74 | beyond
75 | both
76 | brief
77 | but
78 | by
79 | c
80 | c'mon
81 | c's
82 | came
83 | can
84 | can't
85 | cannot
86 | cant
87 | cause
88 | causes
89 | certain
90 | certainly
91 | changes
92 | clearly
93 | co
94 | com
95 | come
96 | comes
97 | concerning
98 | consequently
99 | consider
100 | considering
101 | contain
102 | containing
103 | contains
104 | corresponding
105 | could
106 | couldn't
107 | course
108 | currently
109 | d
110 | definitely
111 | described
112 | despite
113 | did
114 | didn't
115 | different
116 | do
117 | does
118 | doesn't
119 | doing
120 | don't
121 | done
122 | down
123 | downwards
124 | during
125 | e
126 | each
127 | edu
128 | eg
129 | eight
130 | either
131 | else
132 | elsewhere
133 | enough
134 | entirely
135 | especially
136 | et
137 | etc
138 | even
139 | ever
140 | every
141 | everybody
142 | everyone
143 | everything
144 | everywhere
145 | ex
146 | exactly
147 | example
148 | except
149 | f
150 | far
151 | few
152 | fifth
153 | first
154 | five
155 | followed
156 | following
157 | follows
158 | for
159 | former
160 | formerly
161 | forth
162 | four
163 | from
164 | further
165 | furthermore
166 | g
167 | get
168 | gets
169 | getting
170 | given
171 | gives
172 | go
173 | goes
174 | going
175 | gone
176 | got
177 | gotten
178 | greetings
179 | h
180 | had
181 | hadn't
182 | happens
183 | hardly
184 | has
185 | hasn't
186 | have
187 | haven't
188 | having
189 | he
190 | he's
191 | hello
192 | help
193 | hence
194 | her
195 | here
196 | here's
197 | hereafter
198 | hereby
199 | herein
200 | hereupon
201 | hers
202 | herself
203 | hi
204 | him
205 | himself
206 | his
207 | hither
208 | hopefully
209 | how
210 | howbeit
211 | however
212 | i
213 | i'd
214 | i'll
215 | i'm
216 | i've
217 | ie
218 | if
219 | ignored
220 | immediate
221 | in
222 | inasmuch
223 | inc
224 | indeed
225 | indicate
226 | indicated
227 | indicates
228 | inner
229 | insofar
230 | instead
231 | into
232 | inward
233 | is
234 | isn't
235 | it
236 | it'd
237 | it'll
238 | it's
239 | its
240 | itself
241 | j
242 | just
243 | k
244 | keep
245 | keeps
246 | kept
247 | know
248 | knows
249 | known
250 | l
251 | last
252 | lately
253 | later
254 | latter
255 | latterly
256 | least
257 | less
258 | lest
259 | let
260 | let's
261 | like
262 | liked
263 | likely
264 | little
265 | look
266 | looking
267 | looks
268 | ltd
269 | m
270 | mainly
271 | many
272 | may
273 | maybe
274 | me
275 | mean
276 | meanwhile
277 | merely
278 | might
279 | more
280 | moreover
281 | most
282 | mostly
283 | much
284 | must
285 | my
286 | myself
287 | n
288 | name
289 | namely
290 | nd
291 | near
292 | nearly
293 | necessary
294 | need
295 | needs
296 | neither
297 | never
298 | nevertheless
299 | new
300 | next
301 | nine
302 | no
303 | nobody
304 | non
305 | none
306 | noone
307 | nor
308 | normally
309 | not
310 | nothing
311 | novel
312 | now
313 | nowhere
314 | o
315 | obviously
316 | of
317 | off
318 | often
319 | oh
320 | ok
321 | okay
322 | old
323 | on
324 | once
325 | one
326 | ones
327 | only
328 | onto
329 | or
330 | other
331 | others
332 | otherwise
333 | ought
334 | our
335 | ours
336 | ourselves
337 | out
338 | outside
339 | over
340 | overall
341 | own
342 | p
343 | particular
344 | particularly
345 | per
346 | perhaps
347 | placed
348 | please
349 | plus
350 | possible
351 | presumably
352 | probably
353 | provides
354 | q
355 | que
356 | quite
357 | qv
358 | r
359 | rather
360 | rd
361 | re
362 | really
363 | reasonably
364 | regarding
365 | regardless
366 | regards
367 | relatively
368 | respectively
369 | right
370 | s
371 | said
372 | same
373 | saw
374 | say
375 | saying
376 | says
377 | second
378 | secondly
379 | see
380 | seeing
381 | seem
382 | seemed
383 | seeming
384 | seems
385 | seen
386 | self
387 | selves
388 | sensible
389 | sent
390 | serious
391 | seriously
392 | seven
393 | several
394 | shall
395 | she
396 | should
397 | shouldn't
398 | since
399 | six
400 | so
401 | some
402 | somebody
403 | somehow
404 | someone
405 | something
406 | sometime
407 | sometimes
408 | somewhat
409 | somewhere
410 | soon
411 | sorry
412 | specified
413 | specify
414 | specifying
415 | still
416 | sub
417 | such
418 | sup
419 | sure
420 | t
421 | t's
422 | take
423 | taken
424 | tell
425 | tends
426 | th
427 | than
428 | thank
429 | thanks
430 | thanx
431 | that
432 | that's
433 | thats
434 | the
435 | their
436 | theirs
437 | them
438 | themselves
439 | then
440 | thence
441 | there
442 | there's
443 | thereafter
444 | thereby
445 | therefore
446 | therein
447 | theres
448 | thereupon
449 | these
450 | they
451 | they'd
452 | they'll
453 | they're
454 | they've
455 | think
456 | third
457 | this
458 | thorough
459 | thoroughly
460 | those
461 | though
462 | three
463 | through
464 | throughout
465 | thru
466 | thus
467 | to
468 | together
469 | too
470 | took
471 | toward
472 | towards
473 | tried
474 | tries
475 | truly
476 | try
477 | trying
478 | twice
479 | two
480 | u
481 | un
482 | under
483 | unfortunately
484 | unless
485 | unlikely
486 | until
487 | unto
488 | up
489 | upon
490 | us
491 | use
492 | used
493 | useful
494 | uses
495 | using
496 | usually
497 | uucp
498 | v
499 | value
500 | various
501 | very
502 | via
503 | viz
504 | vs
505 | w
506 | want
507 | wants
508 | was
509 | wasn't
510 | way
511 | we
512 | we'd
513 | we'll
514 | we're
515 | we've
516 | welcome
517 | well
518 | went
519 | were
520 | weren't
521 | what
522 | what's
523 | whatever
524 | when
525 | whence
526 | whenever
527 | where
528 | where's
529 | whereafter
530 | whereas
531 | whereby
532 | wherein
533 | whereupon
534 | wherever
535 | whether
536 | which
537 | while
538 | whither
539 | who
540 | who's
541 | whoever
542 | whole
543 | whom
544 | whose
545 | why
546 | will
547 | willing
548 | wish
549 | with
550 | within
551 | without
552 | won't
553 | wonder
554 | would
555 | would
556 | wouldn't
557 | x
558 | y
559 | yes
560 | yet
561 | you
562 | you'd
563 | you'll
564 | you're
565 | you've
566 | your
567 | yours
568 | yourself
569 | yourselves
570 | z
571 | zero
--------------------------------------------------------------------------------
/SurveyQuestionThemes.py:
--------------------------------------------------------------------------------
1 | '''
2 | # Programmed by Aneesha Bakharia
3 | '''
4 |
5 | import re # import regular expression module
6 | from numpy import * # import numpy for matrix & array features
7 | from numpy.linalg import svd
8 | from numpy.linalg import norm
9 | #import nnmf # import non negative matrix factorization algorithm
10 | import porter # import porter.py - porter stemmer algorithm
11 | from xml.dom.minidom import Document
12 | ##
13 | # A collection of functions for using Non Negative Matrix Factorization to find themes in open ended survey questions
14 |
15 | ##
16 | # Utility function to return a list of all words that are have a length greater than a specified number of characters.
17 | # @param text The text that must be split in to words.
18 | # @param minWordReturnSize The minimum no of characters a word must have to be included.
# Splits on any character that is not alphanumeric, '_', '+' or '-'.
# Compiled once at import time rather than on every call.
_WORD_SPLITTER = re.compile(r'[^a-zA-Z0-9_\+\-]')

def separatewords(text, minWordReturnSize):
    # Lower-case the tokens and keep only those strictly longer than
    # minWordReturnSize characters (empty strings from the split are dropped).
    return [singleWord.lower() for singleWord in _WORD_SPLITTER.split(text)
            if len(singleWord) > minWordReturnSize]
23 |
24 | ##
25 | # Utility function to sort a dictionary by Value in decending order
26 | # Not the most efficient implementation - may need to refactor if speed becomes an issue
27 | # @param dictionary The dictionary data structure.
28 | # @return list A list of keys sorted by their value.
def sortDictionaryByValues(dictionary):
    """Return the keys of *dictionary* sorted by their values, descending.

    Ties on the value are broken by the key, also descending — the same
    ordering the previous [value, key] list sort-and-reverse produced.
    """
    return [key for value, key in
            sorted(((v, k) for k, v in dictionary.items()), reverse=True)]
36 |
37 | ##
38 | # Utility function to load stop words from a file and return as a list of words
39 | # @param stopWordFile Path and file name of a file containing stop words.
40 | # @return list A list of stop words.
def loadStopWords(stopWordFile):
    """Load stop words from *stopWordFile* and return them as a list.

    A line may contain more than one word; words are split on whitespace.
    """
    stopWords = []
    # Context manager guarantees the file handle is closed (the previous
    # open() without close leaked the descriptor).
    with open(stopWordFile) as stopFile:
        for line in stopFile:
            stopWords.extend(line.split())
    return stopWords
47 |
48 | ##
49 | # Utility function to remove stop words that are present in a list of words
50 | # @param wordlist Unfiltered list of words
51 | # @param stopwordlist List of stops words
52 | # @return list Filtered list of words
def removeStopWords(wordlist, stopwordlist):
    """Return *wordlist* with every word in *stopwordlist* removed, order kept."""
    # Build the set once: O(1) membership tests instead of an O(n) list
    # scan per word.
    stopset = set(stopwordlist)
    return [word for word in wordlist if word not in stopset]
55 |
56 | ##
57 | # This function returns a list of all words that are have a length greater than a specified number of characters.
58 | # @param text The text that must be split in to words.
59 | # @param minWordReturnSize The minimum no of characters a word must have to be included.
def getItemWords(list_of_words, stop_words):
    """
    Tokenise, stop-word-filter and Porter-stem each item in *list_of_words*.

    Returns a 4-tuple:
      allwords: dict mapping stem -> total count across all items
      itemwords: list of per-item dicts mapping stem -> count in that item
      itemtitles: generated titles ("Response 1", "Response 2", ...)
      stemlist: dict mapping stem -> list of distinct unstemmed source words
    """
    stemmer = porter.PorterStemmer()
    allwords = {}
    itemwords = []
    itemtitles = []
    stemlist = {}
    for ec, item in enumerate(list_of_words):
        # Words of length <= 1 are dropped, then stop words are removed.
        words = removeStopWords(separatewords(item, 1), stop_words)
        itemwords.append({})
        itemtitles.append("Response " + str(ec + 1))
        # Increase the counts for this word in allwords and in itemwords.
        for word in words:
            unstemmedword = word
            word = stemmer.stem(word, 0, len(word) - 1)
            # Remember each distinct surface form that maps to this stem
            # (membership test replaces the old try/index/except dance).
            variants = stemlist.setdefault(word, [])
            if unstemmedword not in variants:
                variants.append(unstemmedword)
            allwords.setdefault(word, 0)
            allwords[word] += 1
            itemwords[ec].setdefault(word, 0)
            itemwords[ec][word] += 1
    return allwords, itemwords, itemtitles, stemlist
94 |
95 |
96 | ##
97 | # Returns the document (row) and words (columns) matrix and the list of words as a vector
98 | def createWordMatrix(all_words,words_inItems):
99 | #print all_words
100 | #print words_inItems
101 | wordvector=[]
102 | # Only take words that are common but not too common
103 | for w,c in all_words.items():
104 | wordvector.append(w)
105 | #if c