├── pipeline ├── __init__.py ├── stmt │ ├── generate-topic-index.py │ ├── extract-doc-index.py │ ├── extract-term-freqs.py │ ├── lda-learn.sh │ ├── generate-label-term-distributions.py │ └── lda-learn.scala ├── prepare_vis_for_client.sh ├── train_mallet.sh ├── train_stmt.sh ├── utf8_utils.py ├── io_utils.py ├── import_mallet.py ├── tokenize.py ├── compute_saliency.py ├── prepare_data_for_client.py ├── import_stmt.py ├── api_utils.py ├── compute_similarity.py └── compute_seriation.py ├── client-src ├── web.sh ├── FullTermTopicProbabilityModel.js ├── termite.css ├── SeriatedTermTopicProbabilityModel.js ├── ViewParameters.js ├── InteractionObjects.css ├── UserControlViews.js ├── QueryString.js ├── TermFrequencyModel.js ├── StateModel.js ├── html5slider.js ├── FilteredTermTopicProbabilityModel.js ├── index.html ├── TermFrequencyView.js └── TermTopicMatrixView.js ├── .gitignore ├── README.md ├── CHANGE_LOG ├── example.cfg ├── LICENSE ├── setup.sh ├── README.old └── execute.py /pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /client-src/web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting a local web server at http://localhost:8888/" 4 | python -m SimpleHTTPServer 8888 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.min.js 4 | /lib/ 5 | /client-lib/ 6 | /client-src/d3.v3.js 7 | /client-src/jquery.js 8 | /client-src/underscore.js 9 | /client-src/backbone.js 10 | /client-src/data 11 | /mallet-* 12 | /stmt-* 13 | -------------------------------------------------------------------------------- /client-src/FullTermTopicProbabilityModel.js: 
-------------------------------------------------------------------------------- 1 | /* 2 | FullTermTopicProbabilityModel.js 3 | 4 | Currently does nothing. 5 | 6 | Designed to take in complete list of terms, topics, and matrix. 7 | 8 | Passes subset of complete inputs to SeriatedTermTopicProbabilityModel. 9 | */ 10 | 11 | 12 | function FullTermTopicProbabilityModel() 13 | { 14 | // nothing here for now 15 | } 16 | -------------------------------------------------------------------------------- /pipeline/stmt/generate-topic-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate topic-index.txt' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | parser.add_argument( 'topicCount', type = int, help = 'Number of topics' ) 9 | args = parser.parse_args() 10 | path = args.path 11 | topicCount = args.topicCount 12 | 13 | f = "{}/topic-index.txt".format( path ) 14 | w = open( f, 'w' ) 15 | for i in range( topicCount ) : 16 | w.write( 'Topic {}\n'.format( i+1 ) ) 17 | w.close() 18 | -------------------------------------------------------------------------------- /pipeline/stmt/extract-doc-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate doc-index.txt from document-topic-distributions.csv' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | args = parser.parse_args() 9 | path = args.path 10 | 11 | lines = open( '{}/document-topic-distributions.csv'.format( path ) ).read().splitlines() 12 | writer = open( '{}/doc-index.txt'.format( path ), 'w' ) 13 | for line in lines : 14 | values = line.split( ',' ) 15 | writer.write( '{}\n'.format( values[0] ) 
) 16 | writer.close() 17 | -------------------------------------------------------------------------------- /pipeline/stmt/extract-term-freqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate label-term-distributions.csv from topic-term-distributions.csv.' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | args = parser.parse_args() 9 | path = args.path 10 | 11 | lines = open( '{}/term-counts.csv'.format( path ) ).read().splitlines() 12 | writer = open( '{}/term-freqs.txt'.format( path ), 'w' ) 13 | for line in lines : 14 | values = line.split( ',' ) 15 | writer.write( '{}\t{}\n'.format( values[0], values[1] ) ) 16 | writer.close() 17 | -------------------------------------------------------------------------------- /pipeline/prepare_vis_for_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copies files necessary to run the client to the specified path's public_html directory 4 | 5 | EXPECTED_ARGS=1 6 | if [ $# -lt $EXPECTED_ARGS ] 7 | then 8 | echo "Usage: `basename $0` project_path" 9 | exit -1 10 | fi 11 | 12 | ROOT=$1 # path to public_html 13 | CLIENT_SRC=client-src/ 14 | CLIENT_LIB=client-lib/ 15 | 16 | echo "Copying js files..." 17 | for JS_FILE in d3.v3 jquery backbone underscore FullTermTopicProbabilityModel SeriatedTermTopicProbabilityModel FilteredTermTopicProbabilityModel TermFrequencyModel TermTopicMatrixView TermFrequencyView ViewParameters StateModel UserControlViews QueryString html5slider 18 | do 19 | cp $CLIENT_LIB/$JS_FILE.min.js $ROOT/public_html/ 20 | done 21 | 22 | echo "Copying CSS file..." 23 | for CSS_FILE in InteractionObjects termite 24 | do 25 | cp $CLIENT_SRC/$CSS_FILE.css $ROOT/public_html/ 26 | done 27 | 28 | echo "Copying local server file..." 
29 | cp $CLIENT_SRC/web.sh $ROOT/public_html/ 30 | 31 | echo "Copying HTML file..." 32 | cp $CLIENT_SRC/index.html $ROOT/public_html/ 33 | 34 | # rename HTML's imported javascript files to use the minified versions 35 | echo "Renaming library dependencies in HTML file..." 36 | sed -i='' 's|\.js|.min.js|g' $ROOT/public_html/index.html 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Current Development 2 | =================== 3 | 4 | Starting in 2014, we have split Termite into two components: 5 | * **[Termite Data Server](http://github.com/uwdata/termite-data-server)** for processing the output of topic models and distributing the content as a web service 6 | * **[Termite Visualizations](http://github.com/uwdata/termite-visualizations)** for visualizing topic model outputs in a web browser 7 | 8 | Our goals are to: 9 | * support multiple topic modeling tools 10 | * reduce the cost of developing new visualizations through shared infrastructure 11 | * allow multiple visualizations to interact with any number of topic modeling software and with other visualizations 12 | 13 | Please see the respective repositories for the latest software and additional information. 14 | 15 | Termite 16 | ------- 17 | 18 | Termite is a visualization tool for inspecting the output of statistical topic models based on the techniques described in the following publication. For more details about this repository, see the file "README.old". 19 | 20 | **Termite: Visualization Techniques for Assessing Textual Topic Models** 21 | Jason Chuang, Christopher D. 
Manning, Jeffrey Heer 22 | Computer Science Dept, Stanford University 23 | http://vis.stanford.edu/papers/termite 24 | 25 | -------------------------------------------------------------------------------- /CHANGE_LOG: -------------------------------------------------------------------------------- 1 | Termite Topic Model Visualization 2 | Jason Chuang, Ashley Jin 3 | http://termite.stanford.edu 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | Version 1.0 (Released on Feb 1, 2013) 8 | 9 | Data processing pipeline: 10 | - Tokenize a text corpus. 11 | - Build a topic model using MALLET or STMT. 12 | - Compute term similarity and saliency statistics. 13 | - Pre-compute term seriation. 14 | - Generate a visualization viewable in a web browser. 15 | 16 | Visualization: 17 | - Display topical term frequency using a tabular circular view. 18 | - Display global term frequency using a bar chart. 19 | - Embed (save/load) visualization states using URL. 20 | - Options for selecting the number of frequent/salient terms to display. 21 | 22 | -------------------------------------------------------------------------------- 23 | 24 | Version 1.1 (Released on March 30, 2013) 25 | 26 | Data processing pipeline: 27 | - Updated similarity computation. 28 | - Fixed minor bugs. 29 | 30 | Visualization: 31 | - Select and color latent topics. 32 | - Display top terms belonging to selected topics. 33 | - Re-order terms by topical frequency. 34 | - Brushing-n-linking on mouse over. 
35 | 36 | -------------------------------------------------------------------------------- 37 | -------------------------------------------------------------------------------- /pipeline/train_mallet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EXPECTED_ARGS=3 4 | if [ $# -lt $EXPECTED_ARGS ] 5 | then 6 | echo "Usage: `basename $0` input-file output-path num-topics" 7 | exit -1 8 | fi 9 | 10 | MALLET=mallet-2.0.7/ 11 | INPUT=$1 12 | OUTPUT=$2 13 | TOPICS=$3 14 | 15 | 16 | 17 | echo "--------------------------------------------------------------------------------" 18 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 19 | echo 20 | 21 | if [ ! -d $OUTPUT ]; then 22 | echo "Creating output folder..." 23 | mkdir $OUTPUT 24 | fi 25 | 26 | echo "Importing data into Mallet..." 27 | $MALLET/bin/mallet import-file \ 28 | --input $INPUT \ 29 | --output $OUTPUT/text.vectors \ 30 | --line-regex "^(\S*)\t(.*)$" \ 31 | --token-regex "\S+" \ 32 | --name 0 --label 1 --data 2 \ 33 | --remove-stopwords true --encoding utf-8 --keep-sequence 34 | # --remove-stopwords false --encoding utf-8 --keep-sequence 35 | 36 | echo "Learning latent topics..." 
37 | $MALLET/bin/mallet train-topics \ 38 | --input $OUTPUT/text.vectors \ 39 | --output-model $OUTPUT/output.model \ 40 | --output-topic-keys $OUTPUT/output-topic-keys.txt \ 41 | --topic-word-weights-file $OUTPUT/topic-word-weights.txt \ 42 | --word-topic-counts-file $OUTPUT/word-topic-counts.txt \ 43 | --num-topics $TOPICS 44 | 45 | echo "--------------------------------------------------------------------------------" 46 | -------------------------------------------------------------------------------- /client-src/termite.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #ccc; 3 | cursor: default; 4 | } 5 | div, p { 6 | padding: 0; 7 | margin: 0; 8 | border: 0; 9 | } 10 | 11 | #pageBackground { 12 | padding: 20px; 13 | } 14 | #pageFrame { 15 | border: 1px solid #999; 16 | box-shadow: 0 0 25px #999; 17 | background-color: #f3f3f3; 18 | } 19 | #pageHeader { 20 | border-bottom: 1px solid #999; 21 | } 22 | 23 | #pageLogo { 24 | font-family: Georgia; 25 | padding: 20px 150px 20px 30px; 26 | } 27 | #pageLogo .title { 28 | font-weight: bold; 29 | font-size: 18pt; 30 | } 31 | #pageLogo .subtitle { 32 | font-size: 12pt; 33 | color: #666; 34 | } 35 | #pageLogo .credits { 36 | font-size: 9pt; 37 | color: #666; 38 | } 39 | 40 | #pageControl { 41 | color: #666; 42 | font-family: Verdana; 43 | font-size: 8pt; 44 | padding: 10px 20px; 45 | border-left: 1px solid #999; 46 | } 47 | 48 | #pageHeader .headerObject { 49 | display: inline-block; 50 | vertical-align: top; 51 | } 52 | #pageHeader .headerObject div.line { 53 | padding: 0; 54 | margin: 0; 55 | height: 20px; 56 | } 57 | #pageContent { 58 | padding: 20px 40px; 59 | background-color: #fff; 60 | } 61 | #pageDetails { 62 | color: #999; 63 | font-family: Verdana; 64 | font-size: 8pt; 65 | } 66 | 67 | #pageFooter { 68 | color: #666; 69 | font-family: Verdana; 70 | font-size: 8pt; 71 | border-top: 1px solid #999; 72 | padding: 5px; 73 | } 
-------------------------------------------------------------------------------- /example.cfg: -------------------------------------------------------------------------------- 1 | [Corpus] 2 | 3 | # Currently only support one format: file 4 | # In the future: file, folder, lucene 5 | 6 | format = file 7 | path = corpus/example-documents.txt 8 | 9 | ### these both work for unicode encoded corpus files: 10 | # tokenization = [^ ]+ 11 | tokenization = whitespace 12 | 13 | 14 | # ----------------------------------------------------------------------------- 15 | 16 | [TopicModel] 17 | 18 | # Two topic models 19 | # Supported libraries: mallet, stmt 20 | library = mallet 21 | ; library = stmt 22 | 23 | # Path to save topic model outputs 24 | path = output/example-project/topic-model 25 | 26 | # Number of topics to train 27 | num_topics = 20 28 | 29 | # ----------------------------------------------------------------------------- 30 | 31 | [Termite] 32 | 33 | # Currently only support one format: file 34 | # In the future: file, database 35 | format = file 36 | 37 | # Path to save Termite-internal working files 38 | path = output/example-project 39 | 40 | # Number of terms to seriate 41 | number_of_seriated_terms = 400 42 | 43 | # ----------------------------------------------------------------------------- 44 | 45 | [Misc] 46 | 47 | # Miscellaneous program configurations 48 | 49 | ;logging = 10 # Display all debug messages 50 | ;logging = 20 # Display info messages 51 | ;logging = 30 # Display only warnings 52 | ;logging = 40 # Display only errors 53 | -------------------------------------------------------------------------------- /pipeline/train_stmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EXPECTED_ARGS=3 4 | if [ $# -lt $EXPECTED_ARGS ] 5 | then 6 | echo "Usage: `basename $0` input-file output-path num-topics" 7 | exit -1 8 | fi 9 | 10 | STMT_JAR=stmt-0.4.0/ 11 | STMT_LIB=pipeline/stmt/ 12 | INPUT=$1 
13 | OUTPUT=$2 14 | TOPICS=$3 15 | ITERS=1000 16 | 17 | echo "--------------------------------------------------------------------------------" 18 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 19 | echo 20 | 21 | echo "java -Xmx2g -jar $STMT_JAR/tmt-0.4.0.jar $STMT_LIB/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS" 22 | java -Xmx2g -jar $STMT_JAR/tmt-0.4.0.jar $STMT_LIB/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS 23 | 24 | echo "Mark file iteration as 'final-iters'..." 25 | ln -s `printf '%05d' $ITERS`/ $OUTPUT/final-iters 26 | 27 | echo "Unpack topic-term distribution..." 28 | gunzip -c $OUTPUT/final-iters/topic-term-distributions.csv.gz > $OUTPUT/topic-term-distributions.csv 29 | 30 | echo "Generate topic-index (list of topics)..." 31 | $STMT_LIB/generate-topic-index.py $OUTPUT $TOPICS 32 | 33 | echo "Copy term-index (list of terms)..." 34 | cp $OUTPUT/final-iters/term-index.txt $OUTPUT/term-index.txt 35 | 36 | echo "Extract doc-index (list of documents)..." 37 | $STMT_LIB/extract-doc-index.py $OUTPUT 38 | 39 | echo "Extract list of term frequencies..." 40 | $STMT_LIB/extract-term-freqs.py $OUTPUT 41 | 42 | echo "--------------------------------------------------------------------------------" 43 | -------------------------------------------------------------------------------- /pipeline/stmt/lda-learn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check for proper number of command line args. 4 | EXPECTED_ARGS=6 5 | if [ $# -lt $EXPECTED_ARGS ] 6 | then 7 | echo "Usage: `basename $0` input-file output-path iters topics term-smoothing topic-smoothing" 8 | exit -1 9 | fi 10 | 11 | STMT_PATH=stmt-0.4.0 # do not assign to PATH: that clobbers the shell's command search path 12 | INPUT=$1 13 | OUTPUT=$2 14 | ITERS=$3 15 | TOPICS=$4 16 | TERM_SMOOTHING=$5 17 | TOPIC_SMOOTHING=$6 18 | 19 | 20 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 
21 | echo "java -Xmx2g -jar $STMT_PATH/tmt-0.4.0.jar $STMT_PATH/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS $TERM_SMOOTHING $TOPIC_SMOOTHING" 22 | java -Xmx2g -jar $STMT_PATH/tmt-0.4.0.jar $STMT_PATH/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS $TERM_SMOOTHING $TOPIC_SMOOTHING 23 | 24 | 25 | 26 | 27 | #echo "Generate summary page..." 28 | #stmt/summarize.py $OUTPUT stmt 29 | 30 | echo "Mark file iteration as 'final-iters'..." 31 | ln -s `printf '%05d' $ITERS`/ $OUTPUT/final-iters 32 | 33 | echo "Unpack topic-term distribution..." 34 | gunzip -c $OUTPUT/final-iters/topic-term-distributions.csv.gz > $OUTPUT/topic-term-distributions.csv 35 | 36 | 37 | 38 | echo "Generate topic-index (list of topics)..." 39 | $STMT_PATH/generate-topic-index.py $OUTPUT $TOPICS 40 | 41 | echo "Copy term-index (list of terms)..." 42 | cp $OUTPUT/final-iters/term-index.txt $OUTPUT/term-index.txt 43 | 44 | echo "Extract doc-index (list of documents)..." 45 | $STMT_PATH/extract-doc-index.py $OUTPUT 46 | 47 | echo "Extract list of term frequencies..." 48 | $STMT_PATH/extract-term-freqs.py $OUTPUT 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Leland Stanford Junior University 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 
11 | * Neither the name of the Leland Stanford Junior University nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /client-src/SeriatedTermTopicProbabilityModel.js: -------------------------------------------------------------------------------- 1 | /* 2 | SeriatedTermTopicProbabilityModel.js 3 | 4 | Currently: Reads in input file to get seriated terms, topics, term information 5 | (e.g. saliency), and matrix of similarity values. 6 | 7 | 8 | Designed to take in a subset of the full list of terms, topics, and matrix. 
9 | */ 10 | 11 | var SeriatedTermTopicProbabilityModel = Backbone.Model.extend({ 12 | defaults : { 13 | "matrix" : null, 14 | "termIndex" : null, 15 | "topicIndex" : null, 16 | "sparseMatrix" : null // currently null 17 | }, 18 | url : "data/seriated-parameters.json", 19 | initialize : function() { 20 | this.parentModel = null; 21 | } 22 | }); 23 | 24 | /** 25 | * Initialize seriated's parent model 26 | * 27 | * @private 28 | */ 29 | SeriatedTermTopicProbabilityModel.prototype.initModel = function ( fullModel ) { 30 | this.parentModel = filteredModel; 31 | }; 32 | 33 | /** 34 | * Loads matrix, termIndex, and topicIndex from the model's "url" 35 | * and triggers a loaded event that the next model (child model) listens to. 36 | * (This function is called after the state model loaded event is fired) 37 | * 38 | * @param { string } the location of datafile to load values from 39 | * @return { void } 40 | */ 41 | SeriatedTermTopicProbabilityModel.prototype.load = function () { 42 | var successHandler = function( model, response, options ) 43 | { 44 | this.trigger("loaded:seriated"); 45 | 46 | }.bind(this); 47 | var errorHandler = function( model, xhr, options ) { }.bind(this); 48 | this.fetch({ 49 | add : true, 50 | success : successHandler, 51 | error : errorHandler 52 | }); 53 | }; -------------------------------------------------------------------------------- /pipeline/utf8_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Modified from 'The Python Standard Library' 6 | 13.1. 
csv — CSV File Reading and Writing 7 | http://docs.python.org/2/library/csv.html 8 | """ 9 | 10 | import csv, codecs, cStringIO 11 | 12 | class UTF8Recoder: 13 | """ 14 | Iterator that reads an encoded stream and reencodes the input to UTF-8 15 | """ 16 | def __init__(self, f, encoding): 17 | self.reader = codecs.getreader(encoding)(f) 18 | 19 | def __iter__(self): 20 | return self 21 | 22 | def next(self): 23 | return self.reader.next().encode("utf-8") 24 | 25 | class UnicodeReader: 26 | """ 27 | A CSV reader which will iterate over lines in the CSV file "f", 28 | which is encoded in the given encoding. 29 | """ 30 | 31 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", delimiter="\t", **kwds): 32 | f = UTF8Recoder(f, encoding) 33 | self.reader = csv.reader(f, dialect=dialect, delimiter=delimiter, **kwds) 34 | 35 | def next(self): 36 | row = self.reader.next() 37 | return [unicode(s, "utf-8") for s in row] 38 | 39 | def __iter__(self): 40 | return self 41 | 42 | class UnicodeWriter: 43 | """ 44 | A CSV writer which will write rows to CSV file "f", 45 | which is encoded in the given encoding. 46 | """ 47 | 48 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", delimiter="\t", **kwds): 49 | # Redirect output to a queue 50 | self.queue = cStringIO.StringIO() 51 | self.writer = csv.writer(self.queue, dialect=dialect, delimiter=delimiter, **kwds) 52 | self.stream = f 53 | self.encoder = codecs.getincrementalencoder(encoding)() 54 | 55 | def writerow(self, row): 56 | self.writer.writerow([s.encode("utf-8") for s in row]) 57 | # Fetch UTF-8 output from the queue ... 58 | data = self.queue.getvalue() 59 | data = data.decode("utf-8", "ignore") 60 | # ... 
and reencode it into the target encoding 61 | data = self.encoder.encode(data) 62 | # write to the target stream 63 | self.stream.write(data) 64 | # empty queue 65 | self.queue.truncate(0) 66 | 67 | def writerows(self, rows): 68 | for row in rows: 69 | self.writerow(row) -------------------------------------------------------------------------------- /pipeline/stmt/generate-label-term-distributions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import re 6 | 7 | parser = argparse.ArgumentParser( description = 'Generate label-term-distributions.csv from topic-term-distributions.csv.' ) 8 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 9 | args = parser.parse_args() 10 | path = args.path 11 | 12 | ################################################################################ 13 | 14 | # Get topics 15 | topics = [] 16 | f = '{}/final-iters/topic-index.txt'.format( path ) 17 | for line in open( f ).read().splitlines() : 18 | topics.append( line ) 19 | 20 | # Get labels (Skip BACKGROUND) 21 | labels = [] 22 | f = '{}/final-iters/label-index.txt'.format( path ) 23 | for line in open( f ).read().splitlines() : 24 | if ( line != 'BACKGROUND' ) : 25 | labels.append( line ) 26 | 27 | ################################################################################ 28 | 29 | # Match labels and topics 30 | match = [] 31 | for i in range( len( topics ) ) : 32 | topic = topics[i] 33 | match.append( -1 ) 34 | 35 | for j in range( len( labels ) ) : 36 | label = labels[j] 37 | m = re.match( r'{} \- \d+'.format( re.escape(label) ), topic ) 38 | if m is not None: 39 | match[i] = j 40 | 41 | if ( match[i] == -1 ) : 42 | match[i] = len(labels) 43 | labels.append( "Topic{:02d}".format( len(labels)+1 ) ) 44 | 45 | #print labels 46 | #print match 47 | 48 | # Merge rows of TOPIC-term distributions 49 | tally = [] 50 | for label in labels: 
51 | tally.append( [] ) 52 | 53 | f = '{}/topic-term-distributions.csv'.format( path ) 54 | lines = open( f ).read().splitlines() 55 | assert( len(lines) == len(topics) ) 56 | for i in range( len( topics ) ) : 57 | values = lines[i].split( ',' ) 58 | for j in range( len( values ) ) : 59 | values[j] = float( values[j] ) 60 | target = match[i] 61 | if ( len( tally[target] ) == 0 ) : 62 | tally[target] = values 63 | else : 64 | for j in range( len( values ) ) : 65 | tally[target][j] += values[j] 66 | 67 | ################################################################################ 68 | 69 | # Output topics 70 | f = '{}/topic-index.txt'.format( path ) 71 | w = open( f, 'w' ) 72 | for topic in topics : 73 | w.write( topic + '\n' ) 74 | w.close() 75 | 76 | # Output labels 77 | f = '{}/label-index.txt'.format( path ) 78 | w = open( f, 'w' ) 79 | for label in labels : 80 | w.write( label + '\n' ) 81 | w.close() 82 | 83 | # Output LABEL-term distributions 84 | f = '{}/label-term-distributions.csv'.format( path ) 85 | w = open( f, 'w' ) 86 | for values in tally : 87 | for j in range( len( values ) ) : 88 | values[j] = str( values[j] ) 89 | w.write( ','.join( values ) + '\n' ) 90 | w.close() 91 | 92 | -------------------------------------------------------------------------------- /client-src/ViewParameters.js: -------------------------------------------------------------------------------- 1 | /* 2 | ViewParameters.js 3 | 4 | This file contains some final parameters for the view elements. 
5 | 6 | Parameters include: 7 | -defaults for different objects 8 | -functions to assign colors to events 9 | -functions to generate consistent class tags for objects based on term or topic 10 | */ 11 | //===================================================================================== 12 | // VIEW PARAMS 13 | //===================================================================================== 14 | var THRESHHOLD = 0.01; 15 | 16 | var HIGHLIGHT = "red"; 17 | var DEFAULT = "default"; 18 | var DESELECT = "deselect"; 19 | 20 | var colorNames = ["orange", "blue", "green", "purple", "brown", "pink"]; 21 | var colorObjs = []; 22 | 23 | /** 24 | * Initializes the color objects to all free or according to the given object 25 | * 26 | * @param { list } list of used colors (should be initialized with usage:true) 27 | */ 28 | function initColorObjects( loadObj ) { 29 | if(loadObj === null){ 30 | for( var index = 0; index < colorNames.length; index++ ) { 31 | colorObjs.push({color: colorNames[index], usage: false}); 32 | } 33 | } 34 | else{ 35 | // load some initial usage from passed object (from state) 36 | } 37 | } 38 | /** 39 | * Returns the first free color if any. 
Marks returned color as used if not DEFAULT 40 | */ 41 | function getColor() { 42 | var color = DEFAULT; 43 | for( var index = 0; index < colorObjs.length; index++ ){ 44 | if( !(colorObjs[index].usage) ){ 45 | color = colorObjs[index].color; 46 | colorObjs[index].usage = true; 47 | break; 48 | } 49 | } 50 | return color; 51 | } 52 | /** 53 | * Marks the given color as usage:false if that color name exists 54 | * 55 | * @param { string } name of color to be freed 56 | */ 57 | function freeColor( color ) { 58 | if( color !== DEFAULT ){ 59 | for( var index = 0; index < colorObjs.length; index++ ){ 60 | if( color === colorObjs[index].color){ 61 | colorObjs[index].usage = false; 62 | break; 63 | } 64 | } 65 | } 66 | }; 67 | function claimColor( color ){ 68 | if( color !== DEFAULT ){ 69 | for( var index = 0; index < colorObjs.length; index++ ){ 70 | if( color === colorObjs[index].color){ 71 | colorObjs[index].usage = true; 72 | break; 73 | } 74 | } 75 | } 76 | }; 77 | 78 | /** 79 | * consistent d3 class labeling helper functions 80 | * 81 | * @param { string, int } term or topic to use in classname 82 | * @return { string } class name based on input 83 | */ 84 | function getTopicClassTag( topic ){ 85 | return "__topic_" + sanitize(topic); 86 | } 87 | function getTermClassTag( term ){ 88 | return "__term_" + sanitize(term); 89 | } 90 | function sanitize( text ){ 91 | // Need to account for non-alphanumeric characters 92 | // Return a unique identifier for any input string 93 | return text.replace( /[^A-Za-z0-9]/g, "_" ); 94 | } 95 | /** end class labeling helper functions **/ 96 | -------------------------------------------------------------------------------- /pipeline/stmt/lda-learn.scala: -------------------------------------------------------------------------------- 1 | // tells Scala where to find the TMT classes 2 | import scalanlp.io._; 3 | import scalanlp.stage._; 4 | import scalanlp.stage.text._; 5 | import scalanlp.text.tokenize._; 6 | import 
scalanlp.pipes.Pipes.global._; 7 | 8 | import edu.stanford.nlp.tmt.stage._; 9 | import edu.stanford.nlp.tmt.model.lda._; 10 | 11 | 12 | 13 | if ( args.length < 2 ) 14 | { 15 | System.err.println( "Arguments: inputFile outputPath [numTopics] [numIters] [termSmoothing] [topicSmoothing]" ); 16 | System.err.println( " inputFile: tab-delimited file containing the training corpus" ); 17 | System.err.println( " (first column = docID, second column = text)" ); 18 | System.err.println( " outputPath: path for saving output model data" ); 19 | System.err.println( " numOfTopics: number of topics to train [default=20]" ); 20 | System.err.println( " maxIters: number of iterations to execute [default=1000]" ); 21 | System.err.println( " termSmoothing: [default=0.01]" ); 22 | System.err.println( "topicSmoothing: [default=0.01]" ); 23 | System.exit( -1 ); 24 | } 25 | 26 | 27 | val inputFile = args(0); 28 | val outputPath = args(1); 29 | val indexColumn = 1; 30 | val textColumn = 2; 31 | 32 | 33 | val numOfTopics = if ( args.length > 2 ) { args(2).toInt } else { 20 }; 34 | val maxIters = if ( args.length > 3 ) { args(3).toInt } else { 1000 }; 35 | val termSmoothing = if ( args.length > 4 ) { args(4).toDouble } else { 0.01 }; 36 | val topicSmoothing = if ( args.length > 5 ) { args(5).toDouble } else { 0.01 }; 37 | 38 | System.err.println( "LDA Learning Parameters..." 
); 39 | System.err.println( " inputFile = " + inputFile ); 40 | System.err.println( " outputPath = " + outputPath ); 41 | System.err.println( " numOfTopics = " + numOfTopics ); 42 | System.err.println( " maxIters = " + maxIters ); 43 | System.err.println( " termSmoothing = " + termSmoothing ); 44 | System.err.println( "topicSmoothing = " + topicSmoothing ); 45 | System.err.println(); 46 | 47 | 48 | val alphabetsOnly = { 49 | RegexSearchTokenizer( "[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*" ) ~> // keep tokens with alphabets 50 | CaseFolder() ~> // fold to lower case 51 | StopWordFilter( "en" ) // remove common English words 52 | } 53 | 54 | System.err.println( "Loading source text..." ); 55 | val source = TSVFile( inputFile ) ~> IDColumn( indexColumn ); 56 | val text = source ~> Column( textColumn ) ~> TokenizeWith( alphabetsOnly ) ~> TermCounter(); 57 | 58 | 59 | System.err.println( "Defining dataset and model..." ); 60 | val dataset = LDADataset( text ); 61 | 62 | 63 | val modelParams = LDAModelParams( numTopics=numOfTopics, dataset=dataset, topicSmoothing=topicSmoothing, termSmoothing=termSmoothing ); 64 | val modelPath = file( outputPath ); 65 | 66 | System.err.println( "Learning LDA topics..." ); 67 | val model = TrainCVB0LDA( modelParams, dataset, output=modelPath, maxIterations=maxIters ); 68 | val perDocTopicDistributions = InferCVB0DocumentTopicDistributions( model, dataset ); 69 | 70 | System.err.println( "Writing term counts to disk..." ); 71 | val termCounts = text.meta[ TermCounts ]; 72 | CSVFile( file( outputPath + "/term-counts.csv" ) ).write( 73 | { 74 | for ( term <- termCounts.index.iterator ) yield ( term, termCounts.getTF( term ), termCounts.getDF( term ) ) 75 | } 76 | ); 77 | 78 | //System.err.println( "Writing topics per doc..." 
) 79 | //CSVFile( file( outputPath + "/topics-per-doc.csv" ) ).write( perDocTopicDistributions ); 80 | -------------------------------------------------------------------------------- /pipeline/io_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import os 6 | from utf8_utils import UnicodeReader, UnicodeWriter 7 | 8 | def CheckAndMakeDirs( path ): 9 | if not os.path.exists( path ): 10 | os.makedirs( path ) 11 | 12 | def ReadAsList( filename ): 13 | """ 14 | Return a list of values. 15 | Each value corresponds to a line of the input file. 16 | """ 17 | data = [] 18 | with open( filename, 'r' ) as f: 19 | lines = f.read().decode( 'utf-8' ).splitlines() 20 | for line in lines: 21 | data.append( line ) 22 | return data 23 | 24 | def ReadAsVector( filename ): 25 | vector = [] 26 | with open( filename, 'r' ) as f: 27 | lines = f.read().decode( 'utf-8' ).splitlines() 28 | for line in lines: 29 | vector.append( float( line ) ) 30 | return vector 31 | 32 | def ReadAsMatrix( filename ): 33 | matrix = [] 34 | with open( filename, 'r' ) as f: 35 | lines = UnicodeReader( f ) 36 | for line in lines: 37 | matrix.append( map( float, line ) ) 38 | return matrix 39 | 40 | def ReadAsSparseVector( filename ): 41 | vector = {} 42 | with open( filename, 'r' ) as f: 43 | lines = UnicodeReader( f ) 44 | for ( key, value ) in lines: 45 | vector[ key ] = float( value ) 46 | return vector 47 | 48 | def ReadAsSparseMatrix( filename ): 49 | matrix = {} 50 | with open( filename, 'r' ) as f: 51 | lines = UnicodeReader( f ) 52 | for ( aKey, bKey, value ) in lines: 53 | matrix[ (aKey, bKey) ] = float( value ) 54 | return matrix 55 | 56 | def ReadAsJson( filename ): 57 | """ 58 | Expect a dict of values. 59 | Write dict as-is to disk as a JSON object. 
60 | """ 61 | data = None 62 | with open( filename, 'r' ) as f: 63 | data = json.load( f, encoding = 'utf-8' ) 64 | return data 65 | 66 | def WriteAsList( data, filename ): 67 | with open( filename, 'w' ) as f: 68 | for element in data: 69 | f.write( element.encode( 'utf-8' ) + '\n' ) 70 | 71 | def WriteAsVector( vector, filename ): 72 | with open( filename, 'w' ) as f: 73 | for element in vector: 74 | f.write( str( vector ) + '\n' ) 75 | 76 | def WriteAsMatrix( matrix, filename ): 77 | with open( filename, 'w' ) as f: 78 | writer = UnicodeWriter( f ) 79 | for row in matrix: 80 | writer.writerow( map( str, row ) ) 81 | 82 | def WriteAsSparseVector( vector, filename ): 83 | """ 84 | Expect a sparse vector (dict) of values. 85 | Generate a tab-delimited file, with 2 columns. 86 | Write key as the 1st column; write cell value as the 2nd column. 87 | """ 88 | sortedKeys = sorted( vector.keys(), key = lambda key : -vector[key] ) 89 | with open( filename, 'w' ) as f: 90 | writer = UnicodeWriter( f ) 91 | for key in sortedKeys: 92 | writer.writerow( [ key, str( vector[key] ) ] ) 93 | 94 | def WriteAsSparseMatrix( matrix, filename ): 95 | """ 96 | Expect a sparse matrix (two-level dict) of values. 97 | Generate a tab-delimited file, with 3 columns. 98 | Write two keys as the 1st and 2nd columns; write cell value as the 3rd column. 99 | """ 100 | sortedKeys = sorted( matrix.keys(), key = lambda key : -matrix[key] ) 101 | with open( filename, 'w' ) as f: 102 | writer = UnicodeWriter( f ) 103 | for ( aKey, bKey ) in sortedKeys: 104 | writer.writerow( [ aKey, bKey, str( matrix[ (aKey, bKey) ] ) ] ) 105 | 106 | def WriteAsJson( data, filename ): 107 | """ 108 | Expect a dict of values. 109 | Write dict as-is to disk as a JSON object. 110 | """ 111 | with open( filename, 'w' ) as f: 112 | json.dump( data, f, encoding = 'utf-8', indent = 2, sort_keys = True ) 113 | 114 | def WriteAsTabDelimited( data, filename, fields ): 115 | """ 116 | Expect a list of dict values. 
117 | Take in a list of output fields. 118 | Write specified fields to disk, as a tab-delimited file (with header row). 119 | """ 120 | with open( filename, 'w' ) as f: 121 | writer = UnicodeWriter( f ) 122 | writer.writerow( fields ) 123 | for element in data: 124 | values = [] 125 | for field in fields: 126 | if not type( element[field] ) is unicode: 127 | values.append( str( element[field] ) ) 128 | else: 129 | values.append( element[field] ) 130 | writer.writerow( values ) 131 | -------------------------------------------------------------------------------- /client-src/InteractionObjects.css: -------------------------------------------------------------------------------- 1 | /* matrix color objects */ 2 | line.normal { 3 | stroke: #808080; 4 | stroke-opacity: 0.25; 5 | stroke-width: 0.5px; 6 | } 7 | line.blue{ 8 | stroke: #1f77b4; 9 | stroke-opacity: 1; 10 | stroke-width: 0.5px; 11 | } 12 | line.orange { 13 | stroke: #ff7f0e; 14 | stroke-opacity: 1; 15 | stroke-width: 0.5px; 16 | } 17 | line.green { 18 | stroke: #2ca02c; 19 | stroke-opacity: 1; 20 | stroke-width: 0.5px; 21 | } 22 | line.purple { 23 | stroke: #9467bd; 24 | stroke-opacity: 1; 25 | stroke-width: 0.5px; 26 | } 27 | line.brown { 28 | stroke: #8c564b; 29 | stroke-opacity: 1; 30 | stroke-width: 0.5px; 31 | } 32 | line.pink { 33 | stroke: #e377c2; 34 | stroke-opacity: 1; 35 | stroke-width: 0.5px; 36 | } 37 | line.red { 38 | stroke: #933 ; 39 | stroke-opacity: 1 ; 40 | stroke-width: 0.5px ; 41 | } 42 | 43 | text { 44 | user-select: none; 45 | -webkit-user-select: none; 46 | -moz-user-select: none; 47 | font-family: Verdana; 48 | font-size: 10px; 49 | } 50 | text.topLabel { 51 | text-anchor: start; 52 | cursor: pointer; 53 | } 54 | text.leftLabel { 55 | text-anchor: end; 56 | } 57 | text.normal { 58 | fill: #808080; 59 | stroke: #808080; 60 | stroke-opacity: 0; 61 | stroke-width: 0px; 62 | } 63 | text.blue { 64 | fill: #1f77b4; 65 | stroke: #1f77b4; 66 | stroke-opacity: 1; 67 | stroke-width: 0.5px; 68 
| } 69 | text.orange { 70 | fill: #ff7f0e; 71 | stroke: #ff7f0e; 72 | stroke-opacity: 1; 73 | stroke-width: 0.5px; 74 | } 75 | text.green { 76 | fill: #2ca02c; 77 | stroke: #2ca02c; 78 | stroke-opacity: 1; 79 | stroke-width: 0.5px; 80 | } 81 | text.purple { 82 | fill: #9467bd; 83 | stroke: #9467bd; 84 | stroke-opacity: 1; 85 | stroke-width: 0.5px; 86 | } 87 | text.brown { 88 | fill: #8c564b; 89 | stroke: #8c564b; 90 | stroke-opacity: 1; 91 | stroke-width: 0.5px; 92 | } 93 | text.pink { 94 | fill: #e377c2; 95 | stroke: #e377c2; 96 | stroke-opacity: 1; 97 | stroke-width: 0.5px; 98 | } 99 | text.red { 100 | fill: #933 ; 101 | stroke: #933 ; 102 | stroke-opacity: 1 ; 103 | stroke-width: 0.5px ; 104 | } 105 | 106 | circle.normal { 107 | fill: #808080; 108 | fill-opacity: 0.4; 109 | stroke: #808080; 110 | stroke-opacity: 0.8; 111 | } 112 | circle.blue { 113 | fill: #1f77b4; 114 | fill-opacity: 0.5; 115 | stroke: #1f77b4; 116 | stroke-opacity: 1; 117 | stroke-width: 0.5px; 118 | } 119 | circle.orange { 120 | fill: #ff7f0e; 121 | fill-opacity: 0.5; 122 | stroke: #ff7f0e; 123 | stroke-opacity: 1; 124 | stroke-width: 0.5px; 125 | } 126 | circle.green { 127 | fill: #2ca02c; 128 | fill-opacity: 0.5; 129 | stroke: #2ca02c; 130 | stroke-opacity: 1; 131 | stroke-width: 0.5px; 132 | } 133 | circle.purple { 134 | fill: #9467bd; 135 | fill-opacity: 0.5; 136 | stroke: #9467bd; 137 | stroke-opacity: 1; 138 | stroke-width: 0.5px; 139 | } 140 | circle.brown { 141 | fill: #8c564b; 142 | fill-opacity: 0.5; 143 | stroke: #8c564b; 144 | stroke-opacity: 1; 145 | stroke-width: 0.5px; 146 | } 147 | circle.pink { 148 | fill: #e377c2; 149 | fill-opacity: 0.5; 150 | stroke: #e377c2; 151 | stroke-opacity: 1; 152 | stroke-width: 0.5px; 153 | } 154 | circle.red { 155 | fill: #933 ; 156 | fill-opacity: 0.5 ; 157 | stroke: #933 ; 158 | stroke-opacity: 1 ; 159 | stroke-width: 0.5px ; 160 | } 161 | /* histogram color objects */ 162 | text.termLabel{ 163 | text-anchor: end; 164 | } 165 | text.HISTnormal 
{ 166 | fill: #808080; 167 | stroke: #808080; 168 | stroke-opacity: 0; 169 | stroke-width: 0px; 170 | } 171 | text.HISTorange { 172 | fill: #ff7f0e; 173 | stroke: #ff7f0e; 174 | stroke-opacity: 1; 175 | stroke-width: 0.5px; 176 | } 177 | text.HISTred { 178 | fill: #933 ; 179 | stroke: #933 ; 180 | stroke-opacity: 1 ; 181 | stroke-width: 0.5px ; 182 | } 183 | 184 | line.termFreqBar { 185 | stroke: #808080; 186 | stroke-opacity: 0.4; 187 | stroke-width: 5px; 188 | } 189 | line.HISTnormal { 190 | stroke: #000; 191 | stroke-opacity: 0; 192 | stroke-width: 5px; 193 | } 194 | 195 | .HISTblue { 196 | fill: #1f77b4; 197 | stroke: #1f77b4; 198 | stroke-opacity: 1; 199 | stroke-width: 5px; 200 | } 201 | .HISTorange { 202 | fill: #ff7f0e; 203 | stroke: #ff7f0e; 204 | stroke-opacity: 1; 205 | stroke-width: 5px; 206 | } 207 | .HISTgreen { 208 | fill: #2ca02c; 209 | stroke: #2ca02c; 210 | stroke-opacity: 1; 211 | stroke-width: 5px; 212 | } 213 | 214 | .HISTpurple { 215 | fill: #9467bd; 216 | stroke: #9467bd; 217 | stroke-opacity: 1; 218 | stroke-width: 5px; 219 | } 220 | .HISTbrown { 221 | fill: #8c564b; 222 | stroke: #8c564b; 223 | stroke-opacity: 1; 224 | stroke-width: 5px; 225 | } 226 | .HISTpink { 227 | fill: #e377c2; 228 | stroke: #e377c2; 229 | stroke-opacity: 1; 230 | stroke-width: 5px; 231 | } 232 | .HISTred { 233 | fill: #933; 234 | stroke: #933; 235 | stroke-opacity: 1; 236 | stroke-width: 5px; 237 | } 238 | line.HISTred { 239 | fill: #933 ; 240 | stroke: #933 ; 241 | stroke-opacity: 1 ; 242 | stroke-width: 5px ; 243 | } -------------------------------------------------------------------------------- /pipeline/import_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | from utf8_utils import UnicodeReader 10 | from api_utils import ModelAPI 11 | 12 | class ImportMallet( object ): 13 | 14 | 
""" 15 | Copies mallet file formats into Termite internal format. 16 | """ 17 | 18 | # Files generated by Mallet 19 | TOPIC_WORD_WEIGHTS = 'topic-word-weights.txt' 20 | 21 | def __init__( self, logging_level ): 22 | self.logger = logging.getLogger( 'ImportMallet' ) 23 | self.logger.setLevel( logging_level ) 24 | handler = logging.StreamHandler( sys.stderr ) 25 | handler.setLevel( logging_level ) 26 | self.logger.addHandler( handler ) 27 | 28 | def execute( self, model_library, model_path, data_path ): 29 | 30 | assert model_library is not None 31 | assert model_library == 'mallet' 32 | assert model_path is not None 33 | assert data_path is not None 34 | 35 | self.logger.info( '--------------------------------------------------------------------------------' ) 36 | self.logger.info( 'Importing a Mallet model...' ) 37 | self.logger.info( ' topic model = %s (%s)', model_path, model_library ) 38 | self.logger.info( ' output = %s', data_path ) 39 | 40 | self.logger.info( 'Connecting to data...' ) 41 | self.model = ModelAPI( data_path ) 42 | 43 | self.logger.info( 'Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS ) 44 | self.extractTopicWordWeights( model_path ) 45 | 46 | self.logger.info( 'Writing data to disk...' 
# Method of the ImportMallet class (flattened listing).
def extractTopicWordWeights( self, model_path ):
    """
    Read Mallet's 'topic-word-weights.txt' (rows of topic, word, value)
    and store a dense term-by-topic matrix, plus term and topic indexes,
    on self.model.
    """
    data = {}
    words = []
    topics = []

    # Read in content of file (sparse matrix representation)
    filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
    with open( filename, 'r' ) as f:
        lines = UnicodeReader( f )
        for ( topic, word, value ) in lines:
            topic = int( topic )
            if topic not in data:
                data[ topic ] = {}
            data[ topic ][ word ] = float( value )
            words.append( word )
            topics.append( topic )

    # Get list of terms and topic indexes
    term_index = sorted( frozenset( words ) )
    topic_index = sorted( frozenset( topics ) )

    # Build dense matrix representation.
    # ROBUSTNESS FIX: use .get() with a 0.0 default so a (topic, term)
    # pair that is absent from the sparse input no longer raises KeyError.
    # (Mallet usually emits every pair, but this must not be assumed.)
    matrix = []
    for term in term_index :
        matrix.append( [ data[ topic ].get( term, 0.0 ) for topic in topic_index ] )

    # Generate topic labels
    topic_str_index = [ 'Topic {}'.format(d) for d in topic_index ]

    self.model.term_topic_matrix = matrix
    self.model.term_index = term_index
    self.model.topic_index = topic_str_index
) 94 | args = parser.parse_args() 95 | 96 | model_library = None 97 | model_path = None 98 | data_path = None 99 | logging_level = 20 100 | 101 | # Read in default values from the configuration file 102 | config = ConfigParser.RawConfigParser() 103 | config.read( args.config_file ) 104 | model_library = config.get( 'TopicModel', 'library' ) 105 | model_path = config.get( 'TopicModel', 'path' ) 106 | data_path = config.get( 'Termite', 'path' ) 107 | if config.has_section( 'Misc' ): 108 | if config.has_option( 'Misc', 'logging' ): 109 | logging_level = config.getint( 'Misc', 'logging' ) 110 | 111 | # Read in user-specifiec values from the program arguments 112 | if args.model_library is not None: 113 | model_library = args.model_library 114 | if args.model_path is not None: 115 | model_path = args.model_path 116 | if args.data_path is not None: 117 | data_path = args.data_path 118 | if args.logging is not None: 119 | logging_level = args.logging 120 | 121 | ImportMallet( logging_level ).execute( model_library, model_path, data_path ) 122 | 123 | if __name__ == '__main__': 124 | main() -------------------------------------------------------------------------------- /client-src/UserControlViews.js: -------------------------------------------------------------------------------- 1 | // Expects to be bound to the state model 2 | var TotalTermsView = Backbone.View.extend({ 3 | el : 'div.TotalTermsView', 4 | render : function() 5 | { 6 | d3.select(this.el).text( this.model.get("totalTerms") ); 7 | } 8 | }); 9 | 10 | // Affinity Number Terms 11 | // Need to bound to the state model 12 | var AffinityNumTermsView = Backbone.View.extend({ 13 | el : 'div.AffinityNumTermsView', 14 | render : function() 15 | { 16 | d3.select(this.el).text( this.model.get("numAffinityTerms") ); 17 | } 18 | }); 19 | 20 | // Expects to be bound to the state model 21 | var AffinityNumTermsSlider = Backbone.View.extend({ 22 | el : 'input.AffinityNumTermsSlider', 23 | events : { 24 | 'change' : 
// Salient-terms counter (read-only display); expects to be bound to the state model.
var SalientNumTermsView = Backbone.View.extend({
	el : 'div.SalientNumTermsView',
	render : function() {
		d3.select( this.el ).text( this.model.get("numSalientTerms") );
	}
});

// Slider controlling the salient-term count; expects to be bound to the state model.
// Pushes slider changes into the model and mirrors model changes back into the slider.
var SalientNumTermsSlider = Backbone.View.extend({
	el : 'input.SalientNumTermsSlider',
	events : {
		'change' : function(e) {
			this.model.set("numSalientTerms", parseInt(e.target.value));
		}
	},
	initialize : function() {
		var view = this;
		view.model.on( "change:numSalientTerms", function() {
			d3.select(view.el)[0][0].value = view.model.get("numSalientTerms");
		}, view);
	}
});

// User-defined terms that were found in the model; expects to be bound to the state model.
var FoundTermsView = Backbone.View.extend({
	el : 'div.FoundTermsView',
	render : function() {
		d3.select( this.el ).text( this.model.get("foundTerms") );
	}
});

// User-defined terms that were NOT found; expects to be bound to the state model.
// The prefix label and the list are shown only when there is something to report.
var UnfoundTermsView = Backbone.View.extend({
	el : 'div.UnfoundTermsView',
	prefix : 'div.UnfoundTermsPrefix',
	render : function() {
		var unfound = this.model.get("unfoundTerms");
		if ( unfound === "" ) {
			d3.select( this.prefix ).style("visibility", "hidden");
			d3.select( this.el ).style("visibility", "hidden");
		} else {
			d3.select( this.prefix ).style("visibility", "visible");
			d3.select( this.el ).style("visibility", "visible").text( unfound );
		}
	}
});
Backbone.View.extend({ 89 | el: 'input.UserDefinedTermsBox', 90 | events : { 91 | 'keyup' : function(e) { 92 | this.model.setVisibleTerms(e.target.value); 93 | } 94 | }, 95 | initialize : function() { 96 | this.model.on( "change:visibleTerms", function(value) { 97 | d3.select(this.el)[0][0].value = this.model.get("visibleTerms").join(", "); 98 | }, this); 99 | } 100 | }); 101 | 102 | // Expects to be bound to the state model 103 | var AddTopTwenty = Backbone.View.extend({ 104 | el: 'input.TopTwentyAddition', 105 | events : { 106 | 'change' : function(e) { 107 | this.model.set("addTopTwenty", e.target.checked); 108 | } 109 | }, 110 | initialize : function() { 111 | this.model.on( "change:addTopTwenty", function(value) { 112 | d3.select(this.el)[0][0].checked = this.model.get("addTopTwenty"); 113 | }, this); 114 | } 115 | }); 116 | 117 | // Expects to be bound to the state model 118 | var SortDescription = Backbone.View.extend({ 119 | el: 'div.SortDescription', 120 | render : function() 121 | { 122 | var sort = this.model.get("sortType"); 123 | var topic = this.model.get("doubleClickTopic"); 124 | var output = ""; 125 | if( sort === "" ) 126 | output = "default"; 127 | else if (sort === "asc") 128 | output = "ascending on topic #" + topic; 129 | else 130 | output = "descending on topic #" + topic; 131 | d3.select(this.el).text( output ); 132 | }, 133 | initialize : function() { 134 | // TODO: call render's function? 
135 | this.model.on( "change:sortType change:doubleClickTopic", function(value) { 136 | var sort = this.model.get("sortType"); 137 | var topic = this.model.get("doubleClickTopic"); 138 | var output = ""; 139 | if( sort === "" ) 140 | output = "default"; 141 | else if (sort === "asc") 142 | output = "ascending on topic #" + topic; 143 | else 144 | output = "descending on topic #" + topic; 145 | d3.select(this.el).text( output ); 146 | }, this); 147 | } 148 | }); 149 | 150 | // Expects to be bound to the state model 151 | var ClearAllButton = Backbone.View.extend({ 152 | el: 'button.clearAll', 153 | events : { 154 | 'click' : function(e) { 155 | this.model.clearAllSelectedTopics(); 156 | } 157 | } 158 | }); 159 | 160 | // Expects to be bound to the state model 161 | var ClearSortButton = Backbone.View.extend({ 162 | el: 'button.clearSort', 163 | events : { 164 | 'click' : function(e) { 165 | this.model.clearSorting(); 166 | } 167 | } 168 | }); 169 | -------------------------------------------------------------------------------- /pipeline/tokenize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | import argparse 7 | import logging 8 | import ConfigParser 9 | from api_utils import DocumentsAPI, TokensAPI 10 | 11 | class Tokenize( object ): 12 | 13 | """ 14 | Takes in the input corpus doc and writes it out as a list of tokens. 15 | 16 | Currently, supports only single document corpus with one document per line of format: 17 | doc_iddocument_content 18 | (Two fields delimited by tab.) 19 | 20 | Support for multiple files, directory(ies), and Lucene considered for future releases. 
21 | """ 22 | 23 | WHITESPACE_TOKENIZATION = r'[^ ]+' 24 | ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*' 25 | ALPHA_TOKENIZATION = r'[A-Za-z_]+' 26 | UNICODE_TOKENIZATION = r'[\w]+' 27 | DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION 28 | 29 | def __init__( self, logging_level ): 30 | self.logger = logging.getLogger( 'Tokenize' ) 31 | self.logger.setLevel( logging_level ) 32 | handler = logging.StreamHandler( sys.stderr ) 33 | handler.setLevel( logging_level ) 34 | self.logger.addHandler( handler ) 35 | 36 | def execute( self, corpus_format, corpus_path, data_path, tokenization ): 37 | assert corpus_format is not None 38 | assert corpus_path is not None 39 | assert data_path is not None 40 | if tokenization is None: 41 | tokenization = Tokenize.DEFAULT_TOKENIZATION 42 | elif tokenization == 'unicode': 43 | tokenization = Tokenize.UNICODE_TOKENIZATION 44 | elif tokenization == 'whitespace': 45 | tokenization = Tokenize.WHITESPACE_TOKENIZATION 46 | elif tokenization == 'alpha': 47 | tokenization = Tokenize.ALPHA_TOKENIZATION 48 | elif tokenization == 'alphanumeric': 49 | tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION 50 | 51 | self.logger.info( '--------------------------------------------------------------------------------' ) 52 | self.logger.info( 'Tokenizing source corpus...' ) 53 | self.logger.info( ' corpus_path = %s (%s)', corpus_path, corpus_format ) 54 | self.logger.info( ' data_path = %s', data_path ) 55 | self.logger.info( ' tokenization = %s', tokenization ) 56 | 57 | self.logger.info( 'Connecting to data...' ) 58 | self.documents = DocumentsAPI( corpus_format, corpus_path ) 59 | self.tokens = TokensAPI( data_path ) 60 | 61 | self.logger.info( 'Reading from disk...' ) 62 | self.documents.read() 63 | 64 | self.logger.info( 'Tokenizing...' ) 65 | self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) ) 66 | 67 | self.logger.info( 'Writing to disk...' 
# Methods of the Tokenize class (flattened listing).

def TokenizeDocuments( self, tokenizer ):
    """Tokenize every document in self.documents.data and store each token
    list in self.tokens.data, keyed by document ID."""
    for docID, docContent in self.documents.data.iteritems():
        self.tokens.data[ docID ] = self.TokenizeDocument( docContent, tokenizer )

def TokenizeDocument( self, text, tokenizer ):
    """Return the lower-cased tokens of 'text' matched by 'tokenizer'
    (a compiled regex or pattern string), in order of appearance."""
    return [ token.lower() for token in re.findall( tokenizer, text ) ]
) 93 | args = parser.parse_args() 94 | 95 | # Declare parameters 96 | corpus_format = None 97 | corpus_path = None 98 | tokenization = None 99 | data_path = None 100 | logging_level = 20 101 | 102 | # Read in default values from the configuration file 103 | if args.config_file is not None: 104 | config = ConfigParser.RawConfigParser() 105 | config.read( args.config_file ) 106 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'format' ): 107 | corpus_format = config.get( 'Corpus', 'format' ) 108 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'path' ): 109 | corpus_path = config.get( 'Corpus', 'path' ) 110 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'tokenization' ): 111 | tokenization = config.get( 'Corpus', 'tokenization' ) 112 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 113 | data_path = config.get( 'Termite', 'path' ) 114 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 115 | logging_level = config.getint( 'Misc', 'logging' ) 116 | 117 | # Read in user-specifiec values from the program arguments 118 | if args.corpus_format is not None: 119 | corpus_format = args.corpus_format 120 | if args.corpus_path is not None: 121 | corpus_path = args.corpus_path 122 | if args.tokenization is not None: 123 | tokenization = args.tokenization 124 | if args.data_path is not None: 125 | data_path = args.data_path 126 | if args.logging is not None: 127 | logging_level = args.logging 128 | 129 | Tokenize( logging_level ).execute( corpus_format, corpus_path, data_path, tokenization ) 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /client-src/QueryString.js: -------------------------------------------------------------------------------- 1 | function QueryString() 2 | { 3 | this.parameters = []; 4 | } 5 | 6 | QueryString.prototype.parameters = function() 7 | { 8 | 
// Register a scalar query-string parameter.
// 'identifier' defaults to 'name'; 'decoder' defaults to "str"; 'encoder'
// defaults to the supplied decoder, or "str" when neither is given.
QueryString.prototype.addValueParameter = function( name, identifier, decoder, encoder )
{
	var id = ( identifier === undefined || identifier === null ) ? name : identifier;
	var enc = ( encoder === undefined || encoder === null )
		? ( ( decoder === undefined || decoder === null ) ? "str" : decoder )
		: encoder;
	var dec = ( decoder === undefined || decoder === null ) ? "str" : decoder;

	this.parameters.push( {
		'name' : name,
		'identifier' : id,
		'isArray' : false,
		'decoder' : this.valueDecoder( dec ),
		'encoder' : this.valueEncoder( enc )
	} );
	return this;
}
// Register an array-valued query-string parameter; same defaulting rules
// as addValueParameter, but decode/encode element-wise.
QueryString.prototype.addArrayParameter = function( name, identifier, decoder, encoder )
{
	var id = ( identifier === undefined || identifier === null ) ? name : identifier;
	var enc = ( encoder === undefined || encoder === null )
		? ( ( decoder === undefined || decoder === null ) ? "str" : decoder )
		: encoder;
	var dec = ( decoder === undefined || decoder === null ) ? "str" : decoder;

	this.parameters.push( {
		'name' : name,
		'identifier' : id,
		'isArray' : true,
		'decoder' : this.arrayDecoder( dec ),
		'encoder' : this.arrayEncoder( enc )
	} );
	return this;
}
// Resolve 'decoder' into a function: functions pass through; the strings
// "int" and "float" map to the matching parser; any other string yields
// the identity; anything else yields null.
QueryString.prototype.valueDecoder = function( decoder )
{
	if ( typeof decoder == "function" )
		return decoder;
	if ( typeof decoder != "string" )
		return null;
	switch ( decoder )
	{
		case "int"   : return function(d) { return parseInt(d,10) };
		case "float" : return function(d) { return parseFloat(d) };
		default      : return function(d) { return d };
	}
}
72 | return encoder; 73 | if ( typeof encoder == "string" ) 74 | { 75 | return function(d) { return String(d) }; 76 | } 77 | return null; 78 | } 79 | QueryString.prototype.arrayDecoder = function( decoder ) 80 | { 81 | if ( typeof decoder == "function" ) 82 | return decoder; 83 | var g = function(values) 84 | { 85 | var f = this.valueDecoder(decoder); 86 | var states = []; 87 | values.forEach( function(d) { states.push( f(d) ) } ); 88 | return states; 89 | } 90 | return g.bind(this); 91 | } 92 | QueryString.prototype.arrayEncoder = function( encoder ) 93 | { 94 | if ( typeof encoder == "function" ) 95 | return encoder; 96 | var g = function(states) 97 | { 98 | var f = this.valueEncoder(encoder); 99 | var values = []; 100 | states.forEach( function(d) { values.push( f(d) ) } ); 101 | return values; 102 | } 103 | return g.bind(this); 104 | } 105 | 106 | QueryString.prototype.read = function( states ) 107 | { 108 | if ( states === undefined || states === null ) 109 | states = {}; 110 | for ( var i in this.parameters ) 111 | { 112 | var p = this.parameters[i]; 113 | if ( p.isArray ) 114 | { 115 | var values = this.getValues( p.identifier ); 116 | if ( values.length > 0 ) 117 | states[p.name] = p.decoder( values ); 118 | } 119 | else 120 | { 121 | var value = this.getValue( p.identifier ); 122 | if ( value != null ) 123 | states[p.name] = p.decoder( value ); 124 | } 125 | } 126 | return states; 127 | } 128 | QueryString.prototype.write = function( states, replaceBrowserHistoryEntry, pageStates, pageTitle ) 129 | { 130 | if ( replaceBrowserHistoryEntry === undefined || typeof replaceBrowserHistoryEntry != "boolean" ) 131 | replaceBrowserHistoryEntry = false; 132 | if ( pageStates === undefined ) 133 | pageStates = null; 134 | if ( pageTitle === undefined ) 135 | pageTitle = null; 136 | 137 | var s = []; 138 | for ( var i in this.parameters ) 139 | { 140 | var p = this.parameters[i]; 141 | if ( p.name in states ) 142 | { 143 | if ( p.isArray ) 144 | { 145 | var values = 
p.encoder( states[p.name] ); 146 | for ( var j in values ) 147 | if ( values[j].length > 0 ) 148 | s.push( p.identifier + "=" + escape( values[j] ) ); 149 | } 150 | else 151 | { 152 | var value = p.encoder( states[p.name] ); 153 | if ( value.length > 0 ) 154 | s.push( p.identifier + "=" + escape( value ) ); 155 | } 156 | } 157 | } 158 | 159 | var protocol = window.location.protocol; 160 | var server = window.location.host; 161 | var path = window.location.pathname; 162 | var pageURL = protocol + '//' + server + path + ( s.length > 0 ? "?" + s.join( "&" ) : "" ); 163 | 164 | if ( replaceBrowserHistoryEntry ) 165 | history.replaceState( pageStates, pageTitle, pageURL ); 166 | else 167 | history.pushState( pageStates, pageTitle, pageURL ); 168 | } 169 | 170 | QueryString.prototype.getValue = function( key ) 171 | { 172 | var regex = this.getKeyRegex( key ); 173 | var match = regex.exec( window.location.href ); 174 | if ( match === null ) 175 | return null; 176 | else 177 | return unescape( match[1] ); 178 | } 179 | QueryString.prototype.getValues = function( key ) 180 | { 181 | var regex = this.getKeyRegex( key ); 182 | var matches = window.location.href.match( regex ); 183 | if ( matches === null ) 184 | return []; 185 | else 186 | { 187 | for ( var i = 0; i < matches.length; i ++ ) 188 | { 189 | var regex = this.getKeyRegex( key ); 190 | var match = regex.exec( matches[i] ); 191 | matches[i] = unescape( match[1] ); 192 | } 193 | return matches; 194 | } 195 | } 196 | QueryString.prototype.getKeyRegex = function( key ) 197 | { 198 | return new RegExp( "[\\?&]" + key + "=([^&]*)", "g" ); 199 | } 200 | 201 | -------------------------------------------------------------------------------- /pipeline/compute_saliency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import math 10 | from api_utils 
import ModelAPI, SaliencyAPI 11 | 12 | class ComputeSaliency( object ): 13 | """ 14 | Distinctiveness and saliency. 15 | 16 | Compute term distinctiveness and term saliency, based on 17 | the term probability distributions associated with a set of 18 | latent topics. 19 | 20 | Input is term-topic probability distribution, stored in 3 separate files: 21 | 'term-topic-matrix.txt' contains the entries of the matrix. 22 | 'term-index.txt' contains the terms corresponding to the rows of the matrix. 23 | 'topic-index.txt' contains the topic labels corresponding to the columns of the matrix. 24 | 25 | Output is a list of term distinctiveness and saliency values, 26 | in two duplicate formats, a tab-delimited file and a JSON object: 27 | 'term-info.txt' 28 | 'term-info.json' 29 | 30 | An auxiliary output is a list topic weights (i.e., the number of 31 | tokens in the corpus assigned to each latent topic) in two 32 | duplicate formats, a tab-delimited file and a JSON object: 33 | 'topic-info.txt' 34 | 'topic-info.json' 35 | """ 36 | 37 | def __init__( self, logging_level ): 38 | self.logger = logging.getLogger( 'ComputeSaliency' ) 39 | self.logger.setLevel( logging_level ) 40 | handler = logging.StreamHandler( sys.stderr ) 41 | handler.setLevel( logging_level ) 42 | self.logger.addHandler( handler ) 43 | 44 | def execute( self, data_path ): 45 | 46 | assert data_path is not None 47 | 48 | self.logger.info( '--------------------------------------------------------------------------------' ) 49 | self.logger.info( 'Computing term saliency...' ) 50 | self.logger.info( ' data_path = %s', data_path ) 51 | 52 | self.logger.info( 'Connecting to data...' ) 53 | self.model = ModelAPI( data_path ) 54 | self.saliency = SaliencyAPI( data_path ) 55 | 56 | self.logger.info( 'Reading data from disk...' ) 57 | self.model.read() 58 | 59 | self.logger.info( 'Computing...' 
) 60 | self.computeTopicInfo() 61 | self.computeTermInfo() 62 | self.rankResults() 63 | 64 | self.logger.info( 'Writing data to disk...' ) 65 | self.saliency.write() 66 | 67 | self.logger.info( '--------------------------------------------------------------------------------' ) 68 | 69 | def computeTopicInfo( self ): 70 | topic_weights = [ sum(x) for x in zip( *self.model.term_topic_matrix ) ] 71 | topic_info = [] 72 | for i in range(self.model.topic_count): 73 | topic_info.append( { 74 | 'topic' : self.model.topic_index[i], 75 | 'weight' : topic_weights[i] 76 | } ) 77 | 78 | self.saliency.topic_info = topic_info 79 | 80 | def computeTermInfo( self ): 81 | """Iterate over the list of terms. Compute frequency, distinctiveness, saliency.""" 82 | 83 | topic_marginal = self.getNormalized( [ d['weight'] for d in self.saliency.topic_info ] ) 84 | term_info = [] 85 | for i in range(self.model.term_count): 86 | term = self.model.term_index[i] 87 | counts = self.model.term_topic_matrix[i] 88 | frequency = sum( counts ) 89 | probs = self.getNormalized( counts ) 90 | distinctiveness = self.getKLDivergence( probs, topic_marginal ) 91 | saliency = frequency * distinctiveness 92 | term_info.append( { 93 | 'term' : term, 94 | 'saliency' : saliency, 95 | 'frequency' : frequency, 96 | 'distinctiveness' : distinctiveness, 97 | 'rank' : None, 98 | 'visibility' : 'default' 99 | } ) 100 | self.saliency.term_info = term_info 101 | 102 | def getNormalized( self, counts ): 103 | """Rescale a list of counts, so they represent a proper probability distribution.""" 104 | tally = sum( counts ) 105 | if tally == 0: 106 | probs = [ d for d in counts ] 107 | else: 108 | probs = [ d / tally for d in counts ] 109 | return probs 110 | 111 | def getKLDivergence( self, P, Q ): 112 | """Compute KL-divergence from P to Q""" 113 | divergence = 0 114 | assert len(P) == len(Q) 115 | for i in range(len(P)): 116 | p = P[i] 117 | q = Q[i] 118 | assert p >= 0 119 | assert q >= 0 120 | if p > 0: 121 | 
divergence += p * math.log( p / q ) 122 | return divergence 123 | 124 | def rankResults( self ): 125 | """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency.""" 126 | self.saliency.topic_info = sorted( self.saliency.topic_info, key = lambda topic_weight : -topic_weight['weight'] ) 127 | self.saliency.term_info = sorted( self.saliency.term_info, key = lambda term_freq : -term_freq['saliency'] ) 128 | for i, element in enumerate( self.saliency.term_info ): 129 | element['rank'] = i 130 | 131 | #-------------------------------------------------------------------------------# 132 | 133 | def main(): 134 | parser = argparse.ArgumentParser( description = 'Compute term saliency for TermiteVis.' ) 135 | parser.add_argument( 'config_file', type = str, default = None , help = 'Path of Termite configuration file.' ) 136 | parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' ) 137 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' 
 )
	args = parser.parse_args()

	# Defaults; logging level 20 == logging.INFO.
	data_path = None
	logging_level = 20

	# Read in default values from the configuration file
	if args.config_file is not None:
		config = ConfigParser.RawConfigParser()
		config.read( args.config_file )
		if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ):
			data_path = config.get( 'Termite', 'path' )
		if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ):
			logging_level = config.getint( 'Misc', 'logging' )

	# Read in user-specified values from the program arguments
	# (command-line flags override the configuration file).
	if args.data_path is not None:
		data_path = args.data_path
	if args.logging is not None:
		logging_level = args.logging

	ComputeSaliency( logging_level ).execute( data_path )

if __name__ == '__main__':
	main()

-------------------------------------------------------------------------------- /pipeline/prepare_data_for_client.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse
import ConfigParser  # Python 2 stdlib; renamed to configparser in Python 3
import logging

from api_utils import ModelAPI, SaliencyAPI, SeriationAPI, ClientAPI

class PrepareDataForClient( object ):
	"""
	Reformats data necessary for client to run.

	Extracts a subset of the complete term list and term-topic matrix and writes
	the subset to a separate file. Also, generates JSON file that merges/packages term
	information with the actual term.

	Input is term-topic probability distribution and term information, stored in 4 files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
23 | 'term-info.txt' contains information about individual terms. 24 | 25 | Output is a subset of terms and matrix, as well as the term subset's information. 26 | Number of files created or copied: 5 27 | 'submatrix-term-index.txt' 28 | 'submatrix-topic-index.txt' 29 | 'submatrix-term-topic.txt' 30 | 'term-info.json' 31 | 'term-info.txt' 32 | """ 33 | 34 | def __init__( self, logging_level ): 35 | self.logger = logging.getLogger( 'PrepareDataForClient' ) 36 | self.logger.setLevel( logging_level ) 37 | handler = logging.StreamHandler( sys.stderr ) 38 | handler.setLevel( logging_level ) 39 | self.logger.addHandler( handler ) 40 | 41 | def execute( self, data_path ): 42 | 43 | assert data_path is not None 44 | 45 | self.logger.info( '--------------------------------------------------------------------------------' ) 46 | self.logger.info( 'Preparing data for client...' ) 47 | self.logger.info( ' data_path = %s', data_path ) 48 | 49 | self.logger.info( 'Connecting to data...' ) 50 | self.model = ModelAPI( data_path ) 51 | self.saliency = SaliencyAPI( data_path ) 52 | self.seriation = SeriationAPI( data_path ) 53 | self.client = ClientAPI( data_path ) 54 | 55 | self.logger.info( 'Reading data from disk...' ) 56 | self.model.read() 57 | self.saliency.read() 58 | self.seriation.read() 59 | 60 | self.logger.info( 'Preparing parameters for seriated matrix...' ) 61 | self.prepareSeriatedParameters() 62 | 63 | self.logger.info( 'Preparing parameters for filtered matrix...' ) 64 | self.prepareFilteredParameters() 65 | 66 | self.logger.info( 'Preparing global term freqs...' ) 67 | self. prepareGlobalTermFreqs() 68 | 69 | self.logger.info( 'Writing data to disk...' 
) 70 | self.client.write() 71 | 72 | def prepareSeriatedParameters( self ): 73 | topic_index = self.model.topic_index 74 | term_index = self.model.term_index 75 | term_topic_matrix = self.model.term_topic_matrix 76 | term_ordering = self.seriation.term_ordering 77 | term_topic_submatrix = [] 78 | term_subindex = [] 79 | for term in term_ordering: 80 | if term in term_index: 81 | index = term_index.index( term ) 82 | term_topic_submatrix.append( term_topic_matrix[ index ] ) 83 | term_subindex.append( term ) 84 | else: 85 | self.logger.info( 'ERROR: Term (%s) does not appear in the list of seriated terms', term ) 86 | 87 | self.client.seriated_parameters = { 88 | 'termIndex' : term_subindex, 89 | 'topicIndex' : topic_index, 90 | 'matrix' : term_topic_submatrix 91 | } 92 | 93 | def prepareFilteredParameters( self ): 94 | term_rank_map = { term: value for value, term in enumerate( self.seriation.term_iter_index ) } 95 | term_order_map = { term: value for value, term in enumerate( self.seriation.term_ordering ) } 96 | term_saliency_map = { d['term']: d['saliency'] for d in self.saliency.term_info } 97 | term_distinctiveness_map = { d['term'] : d['distinctiveness'] for d in self.saliency.term_info } 98 | 99 | self.client.filtered_parameters = { 100 | 'termRankMap' : term_rank_map, 101 | 'termOrderMap' : term_order_map, 102 | 'termSaliencyMap' : term_saliency_map, 103 | 'termDistinctivenessMap' : term_distinctiveness_map 104 | } 105 | 106 | def prepareGlobalTermFreqs( self ): 107 | topic_index = self.model.topic_index 108 | term_index = self.model.term_index 109 | term_topic_matrix = self.model.term_topic_matrix 110 | term_ordering = self.seriation.term_ordering 111 | term_topic_submatrix = [] 112 | term_subindex = [] 113 | for term in term_ordering: 114 | if term in term_index: 115 | index = term_index.index( term ) 116 | term_topic_submatrix.append( term_topic_matrix[ index ] ) 117 | term_subindex.append( term ) 118 | else: 119 | self.logger.info( 'ERROR: Term (%s) does 
not appear in the list of seriated terms', term ) 120 | 121 | term_freqs = { d['term']: d['frequency'] for d in self.saliency.term_info } 122 | 123 | self.client.global_term_freqs = { 124 | 'termIndex' : term_subindex, 125 | 'topicIndex' : topic_index, 126 | 'matrix' : term_topic_submatrix, 127 | 'termFreqMap' : term_freqs 128 | } 129 | 130 | def main(): 131 | parser = argparse.ArgumentParser( description = 'Prepare data for client.' ) 132 | parser.add_argument( 'config_file', type = str, default = None , help = 'Path of Termite configuration file.' ) 133 | parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' ) 134 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' ) 135 | args = parser.parse_args() 136 | 137 | args = parser.parse_args() 138 | 139 | data_path = None 140 | logging_level = 20 141 | 142 | # Read in default values from the configuration file 143 | if args.config_file is not None: 144 | config = ConfigParser.RawConfigParser() 145 | config.read( args.config_file ) 146 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 147 | data_path = config.get( 'Termite', 'path' ) 148 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 149 | logging_level = config.getint( 'Misc', 'logging' ) 150 | 151 | # Read in user-specifiec values from the program arguments 152 | if args.data_path is not None: 153 | data_path = args.data_path 154 | if args.logging is not None: 155 | logging_level = args.logging 156 | 157 | PrepareDataForClient( logging_level ).execute( data_path ) 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Termite Set-Up Script 4 | # 5 | # Run once to 6 | # - download necessary library files 7 | # - minify 
client javascript files
#

# Working directories for downloaded libraries and generated client assets.
LIBRARY=lib/
STMT=stmt-0.4.0/
CLIENT_SRC=client-src/
CLIENT_LIB=client-lib/

if [ ! -d $LIBRARY ]
then
	echo
	echo "Creating a library folder: $LIBRARY"
	mkdir $LIBRARY
fi

if [ ! -d $CLIENT_LIB ]
then
	echo
	echo "Creating the client template folder: $CLIENT_LIB"
	mkdir $CLIENT_LIB
fi

#------------------------------------------------------------------------------#
# D3 Visualization Javascript Library
# Source + minified build are extracted from the release zip; the license is
# kept alongside the other third-party licenses in $LIBRARY.

echo
echo "Downloading D3 javascript library..."
curl --insecure --location https://github.com/mbostock/d3/releases/download/v3.4.1/d3.v3.zip > $LIBRARY/d3.v3.zip

echo
echo "Uncompressing D3 javascript library..."
unzip $LIBRARY/d3.v3.zip d3.v3.js -d $CLIENT_SRC
unzip $LIBRARY/d3.v3.zip d3.v3.min.js -d $CLIENT_LIB

echo
echo "Extracting D3 license..."
unzip $LIBRARY/d3.v3.zip LICENSE -d $LIBRARY
mv $LIBRARY/LICENSE $LIBRARY/LICENSE-d3

#------------------------------------------------------------------------------#
# jQuery Javascript Library
# jQuery ships pre-minified; the GitHub archive is fetched only for its license.

echo
echo "Downloading jQuery javascript library..."
curl --insecure --location http://code.jquery.com/jquery-1.9.1.js > $CLIENT_SRC/jquery.js
curl --insecure --location http://code.jquery.com/jquery-1.9.1.min.js > $CLIENT_LIB/jquery.min.js

echo
echo "Downloading jQuery GitHub archive..."
curl --insecure --location http://github.com/jquery/jquery/archive/master.zip > $LIBRARY/jquery.zip

echo
echo "Extracting jQuery license..."
unzip $LIBRARY/jquery.zip jquery-master/MIT-LICENSE.txt -d $LIBRARY
mv $LIBRARY/jquery-master/MIT-LICENSE.txt $LIBRARY/LICENSE-jquery
rmdir $LIBRARY/jquery-master

#------------------------------------------------------------------------------#
# Underscore Javascript Library

echo
echo "Downloading Underscore GitHub archive..."
curl --insecure --location http://github.com/documentcloud/underscore/archive/master.zip > $LIBRARY/underscore.zip

echo
echo "Uncompressing Underscore javascript library..."
unzip $LIBRARY/underscore.zip underscore-master/underscore.js -d $LIBRARY
unzip $LIBRARY/underscore.zip underscore-master/underscore-min.js -d $LIBRARY
mv $LIBRARY/underscore-master/underscore.js $CLIENT_SRC/underscore.js
mv $LIBRARY/underscore-master/underscore-min.js $CLIENT_LIB/underscore.min.js

echo
echo "Extracting Underscore license..."
unzip $LIBRARY/underscore.zip underscore-master/LICENSE -d $LIBRARY
mv $LIBRARY/underscore-master/LICENSE $LIBRARY/LICENSE-underscore
rmdir $LIBRARY/underscore-master

#------------------------------------------------------------------------------#
# Backbone Javascript Library

echo
echo "Downloading Backbone GitHub archive..."
curl --insecure --location http://github.com/documentcloud/backbone/archive/master.zip > $LIBRARY/backbone.zip

echo
echo "Uncompressing Backbone javascript library..."
unzip $LIBRARY/backbone.zip backbone-master/backbone.js -d $LIBRARY
unzip $LIBRARY/backbone.zip backbone-master/backbone-min.js -d $LIBRARY
mv $LIBRARY/backbone-master/backbone.js $CLIENT_SRC/backbone.js
mv $LIBRARY/backbone-master/backbone-min.js $CLIENT_LIB/backbone.min.js

echo
echo "Extracting Backbone license..."
unzip $LIBRARY/backbone.zip backbone-master/LICENSE -d $LIBRARY
mv $LIBRARY/backbone-master/LICENSE $LIBRARY/LICENSE-backbone
rmdir $LIBRARY/backbone-master

#------------------------------------------------------------------------------#
# Mallet (topic modeling library)

echo
echo "Downloading MALLET (MAchine Learning for LanguagE Toolkit)..."
curl --insecure --location http://mallet.cs.umass.edu/dist/mallet-2.0.7.tar.gz > $LIBRARY/mallet-2.0.7.tar.gz

echo
echo "Uncompressing MALLET..."
tar -zxvf $LIBRARY/mallet-2.0.7.tar.gz mallet-2.0.7

echo
echo "Extracting MALLET License..."
cp mallet-2.0.7/LICENSE $LIBRARY/LICENSE-mallet

#------------------------------------------------------------------------------#
# Stanford Topic Modeling Toolkit
# The jar goes into its own versioned folder ($STMT) because the pipeline
# scripts invoke it by that path; only the source zip lands in $LIBRARY.

echo
echo "Downloading STMT (Stanford Topic Modeling Toolkit)..."
if [ ! -d $STMT ]
then
	echo
	echo "Creating a folder for STMT: $STMT"
	mkdir $STMT
fi
curl --insecure --location http://nlp.stanford.edu/software/tmt/tmt-0.4/tmt-0.4.0.jar > $STMT/tmt-0.4.0.jar
curl --insecure --location http://nlp.stanford.edu/software/tmt/tmt-0.4/tmt-0.4.0-src.zip > $LIBRARY/tmt-0.4.0-src.zip

echo
echo "Extracting STMT License..."
unzip $LIBRARY/tmt-0.4.0-src.zip LICENSE -d $LIBRARY
cp $LIBRARY/LICENSE $LIBRARY/LICENSE-stmt

#------------------------------------------------------------------------------#
# Google closure compiler for Javascript

echo
echo "Downloading Google Closure Compiler..."
curl --insecure --location http://dl.google.com/closure-compiler/compiler-latest.zip > $LIBRARY/compiler-latest.zip

echo
echo "Uncompressing Google Closure Compiler..."
unzip $LIBRARY/compiler-latest.zip compiler.jar -d $LIBRARY
mv $LIBRARY/compiler.jar $LIBRARY/closure-compiler.jar

echo
echo "Extracting Google Closure Compiler License..."
unzip $LIBRARY/compiler-latest.zip COPYING -d $LIBRARY
cp $LIBRARY/COPYING $LIBRARY/LICENSE-closure-compiler

#------------------------------------------------------------------------------#
# Slider for Firefox

echo
echo "Minifying html5slider.js"
java -jar $LIBRARY/closure-compiler.jar --js=$CLIENT_SRC/html5slider.js --js_output_file=$CLIENT_LIB/html5slider.min.js

#------------------------------------------------------------------------------#
# Minify javascript files
# One closure-compiler invocation per client source file; output lands in
# $CLIENT_LIB next to the third-party minified builds.

echo
echo "Minifying javascript files..."

for JS_FILE in FullTermTopicProbabilityModel SeriatedTermTopicProbabilityModel FilteredTermTopicProbabilityModel TermFrequencyModel TermTopicMatrixView TermFrequencyView ViewParameters StateModel UserControlViews QueryString
do
	echo "    Minifying $JS_FILE"
	java -jar $LIBRARY/closure-compiler.jar --js=$CLIENT_SRC/$JS_FILE.js --js_output_file=$CLIENT_LIB/$JS_FILE.min.js
done

-------------------------------------------------------------------------------- /pipeline/import_stmt.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse
import ConfigParser  # Python 2 stdlib; renamed to configparser in Python 3
import logging

from utf8_utils import UnicodeReader
from api_utils import ModelAPI

class ImportStmt( object ):

	"""
	Copies STMT file formats into Termite internal format.
16 | """ 17 | 18 | # Files generated by STMT 19 | TERM_INDEX = 'term-index.txt' 20 | TOPIC_INDEX = 'topic-index.txt' 21 | DOCUMENT_INDEX = 'doc-index.txt' 22 | TOPIC_TERM = 'topic-term-distributions.csv' 23 | DOCUMENT_TOPIC = 'document-topic-distributions.csv' 24 | 25 | def __init__( self, logging_level ): 26 | self.logger = logging.getLogger( 'ImportStmt' ) 27 | self.logger.setLevel( logging_level ) 28 | handler = logging.StreamHandler( sys.stderr ) 29 | handler.setLevel( logging_level ) 30 | self.logger.addHandler( handler ) 31 | 32 | def execute( self, model_library, model_path, data_path ): 33 | 34 | assert model_library is not None 35 | assert model_library == 'stmt' 36 | assert model_path is not None 37 | assert data_path is not None 38 | 39 | self.logger.info( '--------------------------------------------------------------------------------' ) 40 | self.logger.info( 'Importing an STMT model...' ) 41 | self.logger.info( ' topic model = %s (%s)', model_path, model_library ) 42 | self.logger.info( ' output = %s', data_path ) 43 | 44 | self.logger.info( 'Connecting to data...' 
) 45 | self.model = ModelAPI( data_path ) 46 | 47 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TERM_INDEX ) 48 | self.model.term_index = self.readAsList( model_path, ImportStmt.TERM_INDEX ) 49 | self.model.term_count = len(self.model.term_index) 50 | 51 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX ) 52 | self.model.topic_index = self.readAsList( model_path, ImportStmt.TOPIC_INDEX ) 53 | self.model.topic_count = len(self.model.topic_index) 54 | 55 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX ) 56 | self.model.document_index = self.readAsList( model_path, ImportStmt.DOCUMENT_INDEX ) 57 | self.model.document_count = len(self.model.document_index) 58 | 59 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM ) 60 | self.topic_term_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.TOPIC_TERM ) 61 | 62 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC ) 63 | self.document_topic_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.DOCUMENT_TOPIC ) 64 | 65 | self.logger.info( 'Extracting term-topic matrix...' ) 66 | self.extractTermTopicMatrix() 67 | 68 | self.logger.info( 'Extracting document-topic matrix...' ) 69 | self.extractDocumentTopicMatrix() 70 | 71 | self.logger.info( 'Writing data to disk...' ) 72 | self.model.write() 73 | 74 | def readAsList( self, model_path, filename ): 75 | data = [] 76 | filename = '{}/{}'.format( model_path, filename ) 77 | with open( filename, 'r' ) as f: 78 | data = f.read().decode( 'utf-8' ).splitlines() 79 | return data 80 | 81 | # Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file 82 | def readCsvAsMatrixStr( self, model_path, filename ): 83 | """ 84 | Return a matrix (list of list) of string values. 85 | Each row corresponds to a line of the input file. 86 | Each cell (in a row) corresponds to a comma-separated value (in each line). 
87 | """ 88 | data = [] 89 | filename = '{}/{}'.format( model_path, filename ) 90 | with open( filename, 'r' ) as f: 91 | lines = UnicodeReader( f, delimiter = ',' ) 92 | data = [ d for d in lines ] 93 | return data 94 | 95 | def extractDocumentTopicMatrix( self ): 96 | """ 97 | Extract document-topic matrix. 98 | Probability distributions are stored from the 2nd column onward in the document-topic distributions. 99 | """ 100 | matrix = [] 101 | for line in self.document_topic_counts: 102 | matrix.append( map( float, line[1:self.model.topic_count+1] ) ) 103 | self.model.document_topic_matrix = matrix 104 | 105 | def extractTermTopicMatrix( self ): 106 | """ 107 | Extract term-topic matrix. 108 | Transpose the input topic-term distributions. 109 | Ensure all values are greater than or equal to 0. 110 | """ 111 | matrix = [ [0] * self.model.topic_count ] * self.model.term_count 112 | for j, line in enumerate( self.topic_term_counts ): 113 | for i, value in enumerate(line): 114 | matrix[i][j] = max( 0, float(value) ) 115 | self.model.term_topic_matrix = matrix 116 | 117 | def main(): 118 | parser = argparse.ArgumentParser( description = 'Import results from STMT (Stanford Topic-Modeling Toolbox) into Termite.' ) 119 | parser.add_argument( 'config_file' , type = str, default = None , help = 'Path of Termite configuration file.' 
) 120 | parser.add_argument( '--topic-model-library', type = str, dest = 'model_library', help = 'Override topic model format' ) 121 | parser.add_argument( '--topic-model-path' , type = str, dest = 'model_path' , help = 'Override topic model path' ) 122 | parser.add_argument( '--data-path' , type = str, dest = 'data_path' , help = 'Override data path' ) 123 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level' ) 124 | args = parser.parse_args() 125 | 126 | model_library = None 127 | model_path = None 128 | data_path = None 129 | logging_level = 20 130 | 131 | # Read in default values from the configuration file 132 | config = ConfigParser.RawConfigParser() 133 | config.read( args.config_file ) 134 | model_library = config.get( 'TopicModel', 'library' ) 135 | model_path = config.get( 'TopicModel', 'path' ) 136 | data_path = config.get( 'Termite', 'path' ) 137 | if config.has_section( 'Misc' ): 138 | if config.has_option( 'Misc', 'logging' ): 139 | logging_level = config.getint( 'Misc', 'logging' ) 140 | 141 | # Read in user-specifiec values from the program arguments 142 | if args.model_library is not None: 143 | model_library = args.model_library 144 | if args.model_path is not None: 145 | model_path = args.model_path 146 | if args.data_path is not None: 147 | data_path = args.data_path 148 | if args.logging is not None: 149 | logging_level = args.logging 150 | 151 | ImportStmt( logging_level ).execute( model_library, model_path, data_path ) 152 | 153 | if __name__ == '__main__': 154 | main() 155 | -------------------------------------------------------------------------------- /client-src/TermFrequencyModel.js: -------------------------------------------------------------------------------- 1 | /* 2 | TermFrequencyModel.js 3 | This model processes and packages data for the term frequency view. 
4 | 5 | Initialization: 6 | Load term frequency from 'data/global-term-freqs.json' 7 | 8 | Data Update: 9 | Listens to FilteredTermTopicProbabilityModel (events: ) 10 | 11 | Details: 12 | -------- 13 | Pulls data from FilteredTermTopicProbilityModel. The model loads some parameters from 14 | the url. On updates, the model receives a list 15 | of terms and generates a list of item(term, frequency) (same order as the term list 16 | received as input). 17 | */ 18 | var TermFrequencyModel = Backbone.Model.extend({ 19 | defaults : { 20 | "termIndex" : null, 21 | "totalTermFreqs": {}, 22 | "topicalFreqMatrix": [], 23 | "colorList": [], 24 | "selectedTopics": {} 25 | }, 26 | url : "data/global-term-freqs.json", 27 | initialize : function() { 28 | this.parentModel = null; 29 | this.stateModel = null; 30 | 31 | // original data 32 | this.originalMatrix = null; 33 | this.originalTopicIndex = null; 34 | this.originalTermIndex = null; 35 | 36 | // mappings 37 | this.termFreqMap = null; 38 | 39 | // iteractions 40 | // TODO: (later) clean up these. 
Definitely don't need all of these variables 41 | this.selectedTopics = {}; 42 | this.colorList = []; 43 | this.colorToTopic = {}; 44 | this.topicalFreqs = null; 45 | } 46 | }); 47 | 48 | /** 49 | * Initialize Term Frequency Model's parent and state model 50 | * 51 | * @private 52 | */ 53 | TermFrequencyModel.prototype.initModels = function( parent, state ){ 54 | this.parentModel = parent; 55 | this.stateModel = state; 56 | }; 57 | 58 | /** 59 | * Initialize all topics' selection status to null (called once by load) 60 | * 61 | * @private 62 | */ 63 | TermFrequencyModel.prototype.defaultSelection = function(){ 64 | var topicIndex = this.parentModel.get("topicIndex"); 65 | for( var i = 0; i < topicIndex.length; i++ ){ 66 | this.selectedTopics[i] = null; 67 | } 68 | this.set("selectedTopics", this.selectedTopics); 69 | }; 70 | 71 | /** 72 | * Loads matrix, termIndex, topicIndex, and term to frequency mapping from the model's "url" 73 | * and triggers a loaded event that the next model (child model) listens to. Also, pulls 74 | * any selected topics from state model and processes them. 
75 | * (This function is called after the filtered model loaded event is fired) 76 | * 77 | * @param { string } the location of datafile to load values from 78 | * @return { void } 79 | */ 80 | TermFrequencyModel.prototype.load = function(){ 81 | var successHandler = function( model, response, options ) 82 | { 83 | this.set("termIndex", this.parentModel.get("termIndex")); 84 | 85 | this.originalMatrix = response.matrix; 86 | this.originalTopicIndex = response.topicIndex; 87 | this.originalTermIndex = response.termIndex; 88 | 89 | this.termFreqMap = response.termFreqMap; 90 | this.defaultSelection(); 91 | this.getTotalTermFreqs(); 92 | 93 | // process selected topics from the saved state 94 | var coloredTopics = this.stateModel.get("selectedTopics"); 95 | var colorList = []; 96 | for( var obj in coloredTopics){ 97 | claimColor( coloredTopics[obj] ); 98 | colorList.push({"topic":obj, "color":coloredTopics[obj]}); 99 | } 100 | colorList.sort(function(a, b) {return colorNames.indexOf(a.color) - colorNames.indexOf(b.color)}); 101 | for( var i = 0; i < colorList.length; i++){ 102 | this.selectTopic({"topic": colorList[i].topic, "color": colorList[i].color} ); 103 | } 104 | 105 | // signal completion 106 | this.trigger("loaded:freqModel"); 107 | 108 | }.bind(this); 109 | var errorHandler = function( model, xhr, options ) { }.bind(this); 110 | this.fetch({ 111 | add : false, 112 | success : successHandler, 113 | error : errorHandler 114 | }); 115 | }; 116 | 117 | /** 118 | * Calls appropriate functions to update based on data change(s) 119 | */ 120 | TermFrequencyModel.prototype.update = function(){ 121 | this.generateTopicalMatrix( true ); 122 | this.getTotalTermFreqs(); 123 | this.set("termIndex", this.parentModel.get("termIndex")); 124 | }; 125 | 126 | /** 127 | * Finds total frequency for each term in termIndex 128 | * 129 | * @private 130 | */ 131 | TermFrequencyModel.prototype.getTotalTermFreqs = function(){ 132 | var frequencies = {}; 133 | var terms = 
this.parentModel.get("termIndex"); 134 | for( var i = 0; i < terms.length; i++){ 135 | frequencies[terms[i]] = this.termFreqMap[terms[i]]; 136 | } 137 | this.set("totalTermFreqs", frequencies); 138 | }; 139 | 140 | /** 141 | * Finds frequency / topic for each term in termIndex and each topic in selectedTopics 142 | * 143 | * @private 144 | */ 145 | TermFrequencyModel.prototype.generateTopicalMatrix = function( keepQuiet ) { 146 | var frequencies = []; 147 | var terms = this.parentModel.get("termIndex"); 148 | for( var index = 0; index < this.colorList.length; index++){ 149 | var tempList = []; 150 | var topic = this.colorToTopic[this.colorList[index]]; 151 | for( var i = 0; i < terms.length; i++){ 152 | var termIndex = this.originalTermIndex.indexOf(terms[i]); 153 | tempList.push(this.originalMatrix[termIndex][topic]); 154 | } 155 | frequencies.push(tempList); 156 | } 157 | this.topicalFreqs = frequencies; 158 | this.set("topicalFreqMatrix", frequencies, {silent: keepQuiet}); 159 | this.set("colorList", this.colorList); 160 | this.set("selectedTopics", this.selectedTopics); 161 | return frequencies; 162 | }; 163 | 164 | /** 165 | * Called by term frequency view. 
Returns frequency / topic for every term in termIndex 166 | * 167 | * @this { TermFrequencyModel } 168 | * @param { int } target topic index 169 | * @return { array } list of topical frequencies in termIndex ordering 170 | */ 171 | TermFrequencyModel.prototype.getTopicalsForTopic = function( topic ) { 172 | var frequencies = []; 173 | var terms = this.get("termIndex"); 174 | for( var i = 0; i < terms.length; i++){ 175 | var termIndex = this.originalTermIndex.indexOf(terms[i]); 176 | frequencies.push(this.originalMatrix[termIndex][topic]); 177 | } 178 | return frequencies; 179 | }; 180 | 181 | // interactions 182 | /** 183 | * Behavior when topic is selected 184 | * 185 | * @this { TermFrequencyModel } 186 | * @param { object } topic: target topic index, color: associated color 187 | * @return { void } 188 | */ 189 | TermFrequencyModel.prototype.selectTopic = function( obj ) { 190 | var topic = obj.topic; 191 | var color = obj.color; 192 | var topicIndex = this.parentModel.get("topicIndex"); 193 | if( topic !== null){ 194 | 195 | // if color is DEFAULT, the event can be treated as a deselect 196 | if( color === DEFAULT) { 197 | if(this.selectedTopics[topic] !== null){ 198 | var index = this.colorList.indexOf(this.selectedTopics[topic]); 199 | this.colorList.splice(index,1); 200 | this.selectedTopics[topic] = null; 201 | delete this.colorToTopic[color]; 202 | 203 | } else { 204 | return; 205 | } 206 | } 207 | // only add if this topic wasn't added previously 208 | else if(this.selectedTopics[topic] === null) { 209 | this.selectedTopics[topic] = color; 210 | this.colorList.push(color); 211 | this.colorToTopic[color] = topic; 212 | } 213 | 214 | // recompute the topical matrix 215 | this.generateTopicalMatrix( false ); 216 | } 217 | }; 218 | -------------------------------------------------------------------------------- /client-src/StateModel.js: -------------------------------------------------------------------------------- 1 | var StateModel = 
/*
 StateModel: Backbone model holding the user-visible state of the
 visualization — slider values, user-entered terms, topic selections,
 sorting, and highlighting.  View models are attached via initModel().
*/
var StateModel = Backbone.Model.extend({
	defaults : {
		"numAffinityTerms" : 25,
		"numSalientTerms" : 0,
		"visibleTerms" : [],
		"totalTerms" : 25,
		"foundTerms" : "",
		"unfoundTerms" : "",
		"sortType": "",
		"addTopTwenty": false,
		"highlightedTerm" : "",
		"highlightedTopic" : null,
		"selectedTopics" : {},
		"doubleClickTopic": null,
		"selectedTopicsStr": "" // serialized form used by load/save state
	},
	initialize : function() {
		this.matrixView = null;
		this.termFreqView = null;
	}
});

/**
 * Attach the two view models this state model coordinates.
 *
 * @private
 */
StateModel.prototype.initModel = function ( matrix, histogram ){
	this.matrixView = matrix;
	this.termFreqView = histogram;
};

// User Defined Terms
/**
 * Update the user-control feedback strings.
 *
 * @this {state model}
 * @param { array } termList list of terms
 * @param { boolean } keepQuiet whether or not the event should be silent
 */
StateModel.prototype.setFoundTerms = function( termList, keepQuiet ) {
	this.set( "foundTerms", termList.join(", "), {silent: keepQuiet});
};
StateModel.prototype.setUnfoundTerms = function( termList, keepQuiet ){
	var feedback = ( termList.length > 0 && termList[0] != "" ) ? termList.join(", ") : "";
	this.set( "unfoundTerms", feedback, {silent: keepQuiet});
};
StateModel.prototype.setVisibleTerms = function ( userSpecifiedVisibleTerms ) {
	// accept space, comma, or semicolon separated input
	this.set( "visibleTerms", userSpecifiedVisibleTerms.split(/[ ,;]+/g) );
};
/** end user defined control feedback **/


/**
 * Handles selecting topics using click event. Uses functions freeColor and
 * getColor that are defined in ViewParameters.  Fires "color:topic" so views
 * can recolor themselves.
 *
 * @this {state model}
 * @param { int } topicIndex index of clicked topic
 */
StateModel.prototype.selectTopic = function( topicIndex ) {
	var selected = this.get("selectedTopics");
	var color = DEFAULT;
	if( topicIndex in selected ) {
		// already selected: release its color and drop the selection
		freeColor( selected[topicIndex] );
		delete selected[topicIndex];
	} else {
		// newly selected: grab the next available color
		color = getColor();
		selected[topicIndex] = color;
	}
	// fire event to signify topic coloring may have changed
	this.trigger("color:topic", { "topic":topicIndex, "color": color } );
};
/**
 * Clears all topic selections (currently inefficiently implemented)
 */
StateModel.prototype.clearAllSelectedTopics = function() {
	console.log("clear all topics");
	var selected = this.get("selectedTopics");
	for( var t in selected ){
		freeColor( selected[t] );
		delete selected[t];
		this.trigger("color:topic", {"topic":t, "color":DEFAULT} );
	}
};
/** end topic selection code **/

/**
 * Returns the next sort direction for a double-clicked topic label,
 * cycling desc -> asc -> none on repeated clicks of the same topic.
 *
 * @this {state model}
 * @param { int } topicIndex index of double clicked topic
 */
StateModel.prototype.getSortType = function ( topicIndex ){
	var cycle = ["desc", "asc", ""];
	if( this.get("doubleClickTopic") !== topicIndex )
		return cycle[0];
	return cycle[ (cycle.indexOf(this.get("sortType")) + 1) % cycle.length ];
};
/**
 * Record a double-click on a topic label: advances the sort cycle
 * (desc -> asc -> none) and stores which topic drives sorting.
 *
 * @this {state model}
 * @param { int } topicIndex index of double clicked topic
 */
StateModel.prototype.setDoubleClickTopic = function ( topicIndex ){
	var type = this.getSortType(topicIndex);
	if( type === "")
		this.set( "doubleClickTopic", null);
	else
		this.set( "doubleClickTopic", topicIndex);
	this.set( "sortType", type);
};
StateModel.prototype.clearSorting = function(){
	this.set( "doubleClickTopic", null);
	this.set( "sortType", "");
};
/** end double click event code **/

/**
 * Handles highlighting events triggered by mouseover and mouseout
 *
 * @param { string } term target term
 */
StateModel.prototype.setHighlightedTerm = function( term ) {
	this.set("highlightedTerm", term );
};
/**
 * @param { int } topic index of target topic
 */
StateModel.prototype.setHighlightedTopic = function( topic ) {
	this.set("highlightedTopic", topic );
};
/** end highlight event code **/


/**
 * load from query string including decoding some values
 *
 * @this {state model}
 */
StateModel.prototype.loadStatesFromQueryString = function() {

	// Parses "#topic:<i>#color:<c>#topic:<j>#color:<d>..." into the
	// selectedTopics map.
	var decodeString = function( str ){
		var topicLabel = "#topic:";
		var colorLabel = "#color:";

		// extract color and topic pairs
		while( str.length > 0) {
			var topicIndex = str.indexOf(topicLabel);
			var colorIndex = str.indexOf(colorLabel);

			// BUG FIX: the original looped forever on malformed input —
			// when either label was missing, str was never shortened.
			if( topicIndex < 0 || colorIndex < 0 )
				break;

			var topic = parseInt( str.substring(topicIndex + topicLabel.length, colorIndex), 10 );
			var color = null;
			var nextPair = str.indexOf(topicLabel, colorIndex + colorLabel.length);
			if( nextPair >= 0 ){ // there's another pair
				color = str.substring(colorIndex + colorLabel.length, nextPair);
				str = str.substring(nextPair);
			} else { // no more pairs; get rid of trailing characters
				color = str.substring(colorIndex + colorLabel.length).replace( /[^A-Za-z0-9]/g, "" );
				str = "";
			}
			this.get("selectedTopics")[topic] = color;
		}
	}.bind(this);

	var qs = new QueryString();
	qs.addValueParameter( 'numAffinityTerms', 'na', 'int' );
	qs.addValueParameter( 'numSalientTerms', 'ns', 'int' );
	qs.addArrayParameter( 'visibleTerms', 't' );
	qs.addValueParameter( 'sortType', 'st', 'str');
	qs.addValueParameter( 'doubleClickTopic', 'dct', 'int');
	qs.addValueParameter( 'addTopTwenty', 'att', 'str');
	qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str');

	var states = qs.read();
	for ( var key in states ){
		if(key === "doubleClickTopic" && states[key] === -1){
			this.set(key, null); // -1 encodes "no sorting topic"
		}
		else if( key === "selectedTopicsStr" && states[key] !== ""){
			// decode string
			decodeString( states[key] );
			this.set(key, states[key]);
		}
		else if( key === "addTopTwenty"){
			// strip URL artifacts before comparing against "false"
			this.set(key, states[key].replace( /[^A-Za-z0-9]/g, "" ) !== "false");
		}
		else
			this.set( key, states[key] );
	}

	this.trigger( "loaded:states" );
	this.trigger( "sending:colors", this.get("selectedTopics"));
};

/**
 * save current state to query string
 *
 * @this {state model}
 */
StateModel.prototype.saveStatesToQueryString = function() {
	var qs = new QueryString();
	qs.addValueParameter( 'numAffinityTerms', 'na', 'int' );
	qs.addValueParameter( 'numSalientTerms', 'ns', 'int' );
	qs.addArrayParameter( 'visibleTerms', 't' );
	qs.addValueParameter( 'sortType', 'st', 'str');
	qs.addValueParameter( 'doubleClickTopic', 'dct', 'int');
	qs.addValueParameter( 'addTopTwenty', 'att', 'str');

	// encode selected topic/color pairs as "#topic:<i>#color:<c>..."
	var selectedTopics = this.get("selectedTopics");
	var strVersion = "";
	for( var i in selectedTopics){
		if(selectedTopics[i] !== DEFAULT)
			strVersion += "#topic:" + i + "#color:" + selectedTopics[i];
	}
	this.set("selectedTopicsStr", strVersion);
	qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str');

	var keys = [ 'numAffinityTerms', 'numSalientTerms', 'visibleTerms', 'sortType', 'doubleClickTopic', 'addTopTwenty', 'selectedTopicsStr' ];
	var states = {};
	for ( var i in keys )
	{
		var key = keys[i];
		// null doubleClickTopic is stored as -1 (int parameter)
		states[key] = (key === "doubleClickTopic" && this.get(key) === null) ? -1 : this.get(key);
	}

	qs.write( states );
};
if(selectedTopics[i] !== DEFAULT) 226 | strVersion += "#topic:" + i + "#color:" + selectedTopics[i]; 227 | } 228 | this.set("selectedTopicsStr", strVersion); 229 | qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str'); 230 | 231 | var keys = [ 'numAffinityTerms', 'numSalientTerms', 'visibleTerms', 'sortType', 'doubleClickTopic', 'addTopTwenty', 'selectedTopicsStr' ]; 232 | var states = {}; 233 | for ( var i in keys ) 234 | { 235 | var key = keys[i]; 236 | if(key === "doubleClickTopic" && this.get(key) === null){ 237 | states[key] = -1; 238 | } 239 | else 240 | states[key] = this.get(key); 241 | } 242 | 243 | qs.write( states ); 244 | }; -------------------------------------------------------------------------------- /README.old: -------------------------------------------------------------------------------- 1 | README for Termite, a topic model visualization tool. 2 | 3 | --------------- 4 | INFORMATION 5 | --------------- 6 | Termite is a visualization tool for inspecting the output of statistical 7 | topic models based on the techniques described in the following publication: 8 | 9 | Termite: Visualization Techniques for Assessing Textual Topic Models 10 | Jason Chuang, Christopher D. Manning, Jeffrey Heer 11 | Computer Science Dept, Stanford University 12 | http://vis.stanford.edu/papers/termite 13 | 14 | This tool is developed by: 15 | * Jason Chuang 16 | * Ashley Jin 17 | 18 | and is distributed under the BSD-3 license. 19 | 20 | ----------- 21 | LICENSE 22 | ----------- 23 | Copyright (c) 2013, Leland Stanford Junior University 24 | All rights reserved. 25 | 26 | Redistribution and use in source and binary forms, with or without 27 | modification, are permitted provided that the following conditions are met: 28 | * Redistributions of source code must retain the above copyright 29 | notice, this list of conditions and the following disclaimer. 
 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
 * Neither the name of Leland Stanford Junior University nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL LELAND STANFORD JUNIOR UNIVERSITY BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

----------------------
ADDITIONAL CREDITS
----------------------
Termite requires the use of the following libraries and tools.
We thank their respective authors for developing and distributing the tools.

Mallet: Machine learning for language toolkit
  Project website: http://mallet.cs.umass.edu
  Developed by Andrew McCallum, et al.
  Distributed under a CPL license: lib/LICENSE-mallet

STMT: Stanford topic modeling toolbox
  Project website: http://nlp.stanford.edu/software/tmt
  Developed by Daniel Ramage, et al.
  Distributed under a GNU license: lib/LICENSE-stmt

D3 javascript visualization library
  Project website: http://d3js.org
  Developed by Mike Bostock, et al.
67 | Distributed under a BSD license: lib/LICENSE-d3 68 | 69 | Google closure javascript compiler 70 | Project website: https://developers.google.com/closure/compiler/ 71 | Developed by Google engineers 72 | Distributed under an Apache license: lib/LICENSE-closure-compiler 73 | 74 | Backbone 75 | Project website: http://backbonejs.org 76 | Developed by Jeremy Ashkenas, DocumentCloud Inc. 77 | Distributed under a MIT license: lib/LICENSE-backbone 78 | 79 | Underscore 80 | Project website: http://underscorejs.org 81 | Developed by Jeremy Ashkenas, DocumentCloud Inc. 82 | Distributed under a MIT license: lib/LICENSE-underscore 83 | 84 | jQuery 85 | Project website: http://jquery.com 86 | Developed by the jQuery Foundation 87 | Distributed under a MIT license: lib/LICENSE-jquery 88 | 89 | html5slider 90 | Project website: https://github.com/fryn/html5slider 91 | Developed by Frank Yan 92 | Distributed under an MIT license: http://opensource.org/licenses/MIT 93 | 94 | ------------------ 95 | ONE-TIME SETUP 96 | ------------------ 97 | Run the setup script to fetch the following tools, libraries, and prepare all javascript 98 | files needed by Termite. This script only needs to be run once when Termite is first 99 | downloaded onto a new machine. 100 | >> ./setup.sh 101 | 102 | Libraries fetched include: 103 | * mallet 104 | * stmt 105 | * closure-compiler.js 106 | * d3.v3.js 107 | * jquery.js 108 | * underscore.js 109 | * backbone.js 110 | * html5slider.js 111 | 112 | The script creates a minified version of all javascript files. 113 | 114 | ----------------------- 115 | BUILD A TOPIC MODEL 116 | ----------------------- 117 | Building a topic model in Termite requires running a single python script. The script reads 118 | in an input text corpus, and produces an output folder whose content can be inspected 119 | using a web browser, described in the next section. 120 | 121 | Customize configuration file with the following information. 
A sample configuration 122 | file can be found in 'example.cfg' 123 | [Corpus] path to text corpus 124 | [TopicModel] directory for holding topic model outputs 125 | [TopicModel] number of topics to train 126 | [TopicModel] topic model (either mallet or stmt) 127 | [Termite] number of terms to seriate 128 | [Termite] path to save Termite-internal working files 129 | 130 | Process the text corpus, and build a topic model by running the execution script. 131 | Execution time will vary depending on the size of the corpus. 132 | >> ./execute.py 133 | 134 | The execution script calls in order: 135 | 1. pipeline/tokenize.py Tokenize the text corpus 136 | 2. pipeline/train_mallet.py Train a topic model using MALLET 137 | pipeline/train_stmt.py or STMT. 138 | 3. pipeline/compute_saliency.py Compute term saliency 139 | 4. pipeline/compute_similarity.py Compute term similarity 140 | 5. pipeline/compute_seriation.py: Seriates terms 141 | 6. pipeline/prepare_data_for_client.py: Generates datafiles for client 142 | 7. pipeline/prepare_vis_for_client.py: Copies necessary scripts for client 143 | 144 | ---------------------------- 145 | VIEW TOPIC MODEL OUTPUTS 146 | ---------------------------- 147 | You are now ready to visualize the topic model outputs! Termite's output can be viewed in 148 | a web browser. To view the files locally (on your own computer), you need to set up a local 149 | web server. Alternatively, you may copy the output folder to a web server to publish the results. 150 | 151 | Termite outputs are stored in the 'public_html' subfolder within the output directory. 152 | 153 | To set up a local webserver: 154 | 1. Change into output directory (specified in the configuration file) 155 | >> cd /public_html 156 | 2. Start a local server using python 157 | >> ./web.sh 158 | 3. Open http://localhost:8888 in a modern web browser (Chrome, Safari, Firefox, or Opera) 159 | to view a visualization of the model outputs. 
160 | 161 | To publish the results on a webserver: 162 | 1. Copy public_html directory to your remote server. 163 | 164 | ----------------------------- 165 | TOPIC MODEL VISUALIZATION 166 | ----------------------------- 167 | The visualization should consist of a matrix view with a frequency bar view to the right. 168 | The top right contains user controls, two slider bars and one input text box. User 169 | controls are explained in more detail in the 'VISUALIZATION USE CASES' section. 170 | 171 | When the mouse is placed over terms, topics, circles, or bars in the matrix or frequency 172 | bar view, the associated term, topic, bar, and circles will be highlighted in both views. 173 | 174 | --------------------------- 175 | VISUALIZATION USE CASES 176 | --------------------------- 177 | The Termite visualization tool has a set of user controls in the top right section of the 178 | webpage. The user may specify 179 | * the number of terms with highest affinity to show 180 | * the number of terms with highest saliency to show 181 | * always display specific terms 182 | * always display 20 most frequent terms belonging to selected topics 183 | * click on a topic to select/color the topic 184 | * mouse over a topic/term to highlight the topic/term 185 | (Note: the highest affinity term set and highest saliency term set may contain overlapping words) 186 | -------------------------------------------------------------------------------- /execute.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import time 10 | import os 11 | from pipeline.tokenize import Tokenize 12 | from pipeline.import_mallet import ImportMallet 13 | from pipeline.import_stmt import ImportStmt 14 | from pipeline.compute_saliency import ComputeSaliency 15 | from pipeline.compute_similarity import ComputeSimilarity 16 | from 
class Execute( object ):

    """
    Runs the entire data processing pipeline and sets up the client.

    Executes the data processing scripts in order:
      1. tokenize.py: Tokenize corpus
      2. train_stmt.sh / train_mallet.sh: Train model
      3. compute_saliency.py: Compute term saliency
      4. compute_similarity.py: Compute term similarity
      5. compute_seriation.py: Seriates terms
      6. prepare_data_for_client.py: Generates datafiles for client
      7. prepare_vis_for_client.sh: Copies necessary scripts for client

    Input is a configuration file specifying the target corpus and the
    destination directory.  Creates multiple directories that store files
    from each stage of the pipeline, among them the public_html directory
    that holds all client files.
    """

    # Used when both the configuration file and the command line omit num_topics.
    DEFAULT_NUM_TOPICS = 25

    def __init__( self, logging_level ):
        self.logger = logging.getLogger( 'Execute' )
        self.logger.setLevel( logging_level )
        # Avoid stacking a duplicate stderr handler (and duplicated log
        # lines) when Execute is instantiated more than once per process.
        if not self.logger.handlers:
            handler = logging.StreamHandler( sys.stderr )
            handler.setLevel( logging_level )
            self.logger.addHandler( handler )

    def _log_time( self ):
        """Log the current wall-clock time; pipeline stages can be slow."""
        self.logger.info( 'Current time = {}'.format( time.ctime() ) )

    def _run_command( self, command ):
        """Run a shell command, logging a warning if it exits non-zero."""
        status = os.system( command )
        if status != 0:
            # The original ignored the exit status entirely: a failed
            # training run silently corrupted the later pipeline stages.
            self.logger.warning( 'Command exited with status %s: %s', status, command )

    def execute( self, corpus_format, corpus_path, tokenization, model_library, model_path, data_path, num_topics, number_of_seriated_terms ):
        """
        Run the full pipeline: tokenize, train (stmt or mallet), compute
        saliency/similarity/seriation, and prepare the client data files.
        Raises AssertionError when a required argument is missing.
        """
        assert corpus_format is not None
        assert corpus_path is not None
        assert model_library is not None
        assert model_library == 'stmt' or model_library == 'mallet'
        assert model_path is not None
        assert data_path is not None
        if num_topics is None:
            num_topics = Execute.DEFAULT_NUM_TOPICS
        assert number_of_seriated_terms is not None

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Tokenizing source corpus...' )
        self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format )
        self.logger.info( '    model_path = %s (%s)', model_path, model_library )
        self.logger.info( '    data_path = %s', data_path )
        self.logger.info( '    num_topics = %d', num_topics )
        self.logger.info( '    number_of_seriated_terms = %s', number_of_seriated_terms )
        self.logger.info( '--------------------------------------------------------------------------------' )
        self._log_time()

        Tokenize( self.logger.level ).execute( corpus_format, corpus_path, data_path, tokenization )
        self._log_time()

        tokens_path = data_path + '/tokens/tokens.txt'
        if model_library == 'stmt':
            self._run_command( 'pipeline/train_stmt.sh {} {} {}'.format( tokens_path, model_path, num_topics ) )
            ImportStmt( self.logger.level ).execute( model_library, model_path, data_path )
        if model_library == 'mallet':
            self._run_command( 'pipeline/train_mallet.sh {} {} {}'.format( tokens_path, model_path, num_topics ) )
            ImportMallet( self.logger.level ).execute( model_library, model_path, data_path )
        self._log_time()

        ComputeSaliency( self.logger.level ).execute( data_path )
        self._log_time()

        ComputeSimilarity( self.logger.level ).execute( data_path )
        self._log_time()

        ComputeSeriation( self.logger.level ).execute( data_path, number_of_seriated_terms )
        self._log_time()

        PrepareDataForClient( self.logger.level ).execute( data_path )
        self._log_time()

        self._run_command( 'pipeline/prepare_vis_for_client.sh {}'.format( data_path ) )
        self._log_time()
def main():
    """Parse CLI arguments and the configuration file, then run the pipeline.

    CLI options override the corresponding configuration-file values.
    """
    parser = argparse.ArgumentParser( description = 'Prepare data for Termite.' )
    parser.add_argument( 'config_file'     , type = str, help = 'Termite configuration file.' )
    parser.add_argument( '--corpus-format' , type = str, dest = 'corpus_format', help = 'Override corpus format in the config file.' )
    parser.add_argument( '--corpus-path'   , type = str, dest = 'corpus_path'  , help = 'Override corpus path in the config file.' )
    parser.add_argument( '--model-library' , type = str, dest = 'model_library', help = 'Override model library in the config file.' )
    parser.add_argument( '--model-path'    , type = str, dest = 'model_path'   , help = 'Override model path in the config file.' )
    # BUG FIX: the flag was misspelled '--num-topcis'.  The correct spelling
    # is now primary; the old spelling is kept as an alias for compatibility.
    parser.add_argument( '--num-topics', '--num-topcis', type = int, dest = 'num_topics', help = 'Override number of topics in the config file.' )
    parser.add_argument( '--data-path'     , type = str, dest = 'data_path'    , help = 'Override data path in the config file.' )
    parser.add_argument( '--number-of-seriated-terms', type = int, dest = 'number_of_seriated_terms', help = 'Override the number of terms to seriate.' )
    parser.add_argument( '--logging'       , type = int, dest = 'logging'      , help = 'Override logging level specified in config file.' )
    args = parser.parse_args()

    corpus_format = None
    corpus_path = None
    # BUG FIX: tokenization was previously unbound (NameError at the
    # Execute(...) call) when the config file had no Corpus/tokenization
    # option, since there is no CLI override for it.
    tokenization = None
    model_library = None
    model_path = None
    data_path = None
    num_topics = None
    number_of_seriated_terms = None
    logging_level = 20

    # Read in default values from the configuration file
    config = ConfigParser.RawConfigParser()
    config.read( args.config_file )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'format' ):
        corpus_format = config.get( 'Corpus', 'format' )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'path' ):
        corpus_path = config.get( 'Corpus', 'path' )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'tokenization' ):
        tokenization = config.get( 'Corpus', 'tokenization' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'library' ):
        model_library = config.get( 'TopicModel', 'library' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'path' ):
        model_path = config.get( 'TopicModel', 'path' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'num_topics' ):
        num_topics = config.getint( 'TopicModel', 'num_topics' )
    if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ):
        data_path = config.get( 'Termite', 'path' )
    if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'number_of_seriated_terms' ):
        number_of_seriated_terms = config.getint( 'Termite', 'number_of_seriated_terms' )
    if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ):
        logging_level = config.getint( 'Misc', 'logging' )

    # Read in user-specified values from the program arguments
    if args.corpus_format is not None:
        corpus_format = args.corpus_format
    if args.corpus_path is not None:
        corpus_path = args.corpus_path
    if args.model_library is not None:
        model_library = args.model_library
    if args.model_path is not None:
        model_path = args.model_path
    if args.num_topics is not None:
        num_topics = args.num_topics
    if args.data_path is not None:
        data_path = args.data_path
    if args.number_of_seriated_terms is not None:
        number_of_seriated_terms = args.number_of_seriated_terms
    if args.logging is not None:
        logging_level = args.logging

    Execute( logging_level ).execute( corpus_format, corpus_path, tokenization, model_library, model_path, data_path, num_topics, number_of_seriated_terms )
class DocumentsAPI( object ):
    """Loads the raw corpus: a file of docID<TAB>docContent rows, UTF-8 encoded."""

    # Only flat-file corpora are supported at present.
    ACCEPTABLE_FORMATS = frozenset( [ 'file' ] )

    def __init__( self, format, path ):
        assert format in DocumentsAPI.ACCEPTABLE_FORMATS
        self.format = format
        self.path = path
        # BUG FIX: was initialized as a list although read() fills a dict;
        # start with an empty dict so the attribute type is stable.
        self.data = {}

    def read( self ):
        """Read the corpus file into self.data as { docID : docContent }."""
        self.data = {}
        with open( self.path, 'r' ) as f:
            lines = f.read().decode( 'utf-8', 'ignore' ).splitlines()
        for line in lines:
            # Split only on the first tab so document text may itself
            # contain tabs (the original raised ValueError in that case).
            docID, docContent = line.split( '\t', 1 )
            self.data[ docID ] = docContent
class TokensAPI( object ):
    """Reads and writes the tokenized corpus, one tab-delimited row per document."""

    SUBFOLDER = 'tokens'
    TOKENS = 'tokens.txt'

    def __init__( self, path ):
        # All token files live in <path>/tokens/
        self.path = '{}/{}/'.format( path, TokensAPI.SUBFOLDER )
        self.data = {}

    def read( self ):
        """Load docID -> token-list mappings from tokens.txt."""
        self.data = {}
        filename = self.path + TokensAPI.TOKENS
        with open( filename, 'r' ) as f:
            for ( docID, docTokens ) in UnicodeReader( f ):
                self.data[ docID ] = docTokens.split( ' ' )

    def write( self ):
        """Write docID -> token-list mappings, creating the folder if needed."""
        CheckAndMakeDirs( self.path )
        filename = self.path + TokensAPI.TOKENS
        with open( filename, 'w' ) as f:
            writer = UnicodeWriter( f )
            for ( docID, docTokens ) in self.data.iteritems():
                writer.writerow( [ docID, ' '.join( docTokens ) ] )

class ModelAPI( object ):
    """Reads, verifies, and writes the raw topic model: topic/term indices plus
    the term-by-topic matrix."""

    SUBFOLDER = 'model'
    TOPIC_INDEX = 'topic-index.txt'
    TERM_INDEX = 'term-index.txt'
    TERM_TOPIC_MATRIX = 'term-topic-matrix.txt'

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, ModelAPI.SUBFOLDER )
        self.topic_index = []        # one label per topic
        self.term_index = []         # one string per term
        self.topic_count = 0
        self.term_count = 0
        self.term_topic_matrix = []  # term_count rows x topic_count columns

    def read( self ):
        """Load indices and matrix from disk, then sanity-check their shapes."""
        self.topic_index = ReadAsList( self.path + ModelAPI.TOPIC_INDEX )
        self.term_index = ReadAsList( self.path + ModelAPI.TERM_INDEX )
        self.term_topic_matrix = ReadAsMatrix( self.path + ModelAPI.TERM_TOPIC_MATRIX )
        self.verify()

    def verify( self ):
        """Recompute counts and assert the matrix dimensions match the indices."""
        self.topic_count = len( self.topic_index )
        self.term_count = len( self.term_index )
        assert len( self.term_topic_matrix ) == self.term_count
        for row in self.term_topic_matrix:
            assert len( row ) == self.topic_count

    def write( self ):
        """Verify, then persist indices and matrix, creating the folder if needed."""
        self.verify()
        CheckAndMakeDirs( self.path )
        WriteAsList( self.topic_index, self.path + ModelAPI.TOPIC_INDEX )
        WriteAsList( self.term_index, self.path + ModelAPI.TERM_INDEX )
        WriteAsMatrix( self.term_topic_matrix, self.path + ModelAPI.TERM_TOPIC_MATRIX )
class SaliencyAPI( object ):
    """Reads and writes per-term and per-topic saliency data (JSON + tab-delimited)."""

    SUBFOLDER = 'saliency'
    TOPIC_WEIGHTS = 'topic-info.json'
    TOPIC_WEIGHTS_TXT = 'topic-info.txt'
    # BUG FIX: the two *_FIELDS constants previously had swapped contents
    # relative to their names (the term fields were stored under
    # TOPIC_WEIGHTS_FIELDS and vice versa).  Both the values and their uses
    # in write() are swapped here, so the files produced are unchanged.
    TOPIC_WEIGHTS_FIELDS = [ 'topic', 'weight' ]
    TERM_SALIENCY = 'term-info.json'
    TERM_SALIENCY_TXT = 'term-info.txt'
    TERM_SALIENCY_FIELDS = [ 'term', 'saliency', 'frequency', 'distinctiveness', 'rank', 'visibility' ]

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, SaliencyAPI.SUBFOLDER )
        self.term_info = {}    # per-term saliency statistics
        self.topic_info = {}   # per-topic weights

    def read( self ):
        """Load term and topic info from their JSON files."""
        self.term_info = ReadAsJson( self.path + SaliencyAPI.TERM_SALIENCY )
        self.topic_info = ReadAsJson( self.path + SaliencyAPI.TOPIC_WEIGHTS )

    def write( self ):
        """Write term and topic info as JSON plus tab-delimited text."""
        CheckAndMakeDirs( self.path )
        WriteAsJson( self.term_info, self.path + SaliencyAPI.TERM_SALIENCY )
        WriteAsTabDelimited( self.term_info, self.path + SaliencyAPI.TERM_SALIENCY_TXT, SaliencyAPI.TERM_SALIENCY_FIELDS )
        WriteAsJson( self.topic_info, self.path + SaliencyAPI.TOPIC_WEIGHTS )
        WriteAsTabDelimited( self.topic_info, self.path + SaliencyAPI.TOPIC_WEIGHTS_TXT, SaliencyAPI.TOPIC_WEIGHTS_FIELDS )

class SimilarityAPI( object ):
    """Reads and writes term similarity statistics.

    Only the combined G2 matrix is currently read and written; the other
    intermediate statistics (occurrence/co-occurrence counts and the
    per-source G2 matrices) are computed upstream but not persisted here.
    """

    SUBFOLDER = 'similarity'
    DOCUMENT_OCCURRENCE = 'document-occurrence.txt'
    DOCUMENT_COOCCURRENCE = 'document-cooccurrence.txt'
    WINDOW_OCCURRENCE = 'window-occurrence.txt'
    WINDOW_COOCCURRENCE = 'window-cooccurrence.txt'
    UNIGRAM_COUNTS = 'unigram-counts.txt'
    BIGRAM_COUNTS = 'bigram-counts.txt'
    DOCUMENT_G2 = 'document-g2.txt'
    WINDOW_G2 = 'window-g2.txt'
    # NOTE: constant name is a historical typo for COLLOCATION_G2;
    # kept unchanged for compatibility with any external references.
    COLLOCATAPIN_G2 = 'collocation-g2.txt'
    COMBINED_G2 = 'combined-g2.txt'

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, SimilarityAPI.SUBFOLDER )
        self.document_occurrence = {}
        self.document_cooccurrence = {}
        self.window_occurrence = {}
        self.window_cooccurrence = {}
        self.unigram_counts = {}
        self.bigram_counts = {}
        self.document_g2 = {}
        self.window_g2 = {}
        # BUG FIX: this attribute was misspelled 'collcation_g2', so the
        # read/write code referencing 'collocation_g2' would have hit an
        # uninitialized attribute.
        self.collocation_g2 = {}
        self.combined_g2 = {}

    def read( self ):
        """Load the combined G2 matrix (the only statistic persisted)."""
        self.combined_g2 = ReadAsSparseMatrix( self.path + SimilarityAPI.COMBINED_G2 )

    def write( self ):
        """Write the combined G2 matrix, creating the folder if needed."""
        CheckAndMakeDirs( self.path )
        WriteAsSparseMatrix( self.combined_g2, self.path + SimilarityAPI.COMBINED_G2 )
SimilarityAPI.COMBINED_G2 ) 163 | 164 | class SeriationAPI( object ): 165 | SUBFOLDER = 'seriation' 166 | TERM_ORDERING = 'term-ordering.txt' 167 | TERM_ITER_INDEX = 'term-iter-index.txt' 168 | 169 | def __init__( self, path ): 170 | self.path = '{}/{}/'.format( path, SeriationAPI.SUBFOLDER ) 171 | self.term_ordering = [] 172 | self.term_iter_index = [] 173 | 174 | def read( self ): 175 | self.term_ordering = ReadAsList( self.path + SeriationAPI.TERM_ORDERING ) 176 | self.term_iter_index = ReadAsList( self.path + SeriationAPI.TERM_ITER_INDEX ) 177 | 178 | def write( self ): 179 | CheckAndMakeDirs( self.path ) 180 | WriteAsList( self.term_ordering, self.path + SeriationAPI.TERM_ORDERING ) 181 | WriteAsList( self.term_iter_index, self.path + SeriationAPI.TERM_ITER_INDEX ) 182 | 183 | class ClientAPI( object ): 184 | SUBFOLDER = 'public_html/data' 185 | SERIATED_PARAMETERS = 'seriated-parameters.json' 186 | FILTERED_PARAMETERS = 'filtered-parameters.json' 187 | GLOBAL_TERM_FREQS = 'global-term-freqs.json' 188 | 189 | def __init__( self, path ): 190 | self.path = '{}/{}/'.format( path, ClientAPI.SUBFOLDER ) 191 | self.seriated_parameters = {} 192 | self.filtered_parameters = {} 193 | self.global_term_freqs = {} 194 | 195 | def read( self ): 196 | self.seriated_parameters = ReadAsJson( self.path + ClientAPI.SERIATED_PARAMETERS ) 197 | self.filtered_parameters = ReadAsJson( self.path + ClientAPI.FILTERED_PARAMETERS ) 198 | self.global_term_freqs = ReadAsJson( self.path + ClientAPI.GLOBAL_TERM_FREQS ) 199 | 200 | def write( self ): 201 | CheckAndMakeDirs( self.path ) 202 | WriteAsJson( self.seriated_parameters, self.path + ClientAPI.SERIATED_PARAMETERS ) 203 | WriteAsJson( self.filtered_parameters, self.path + ClientAPI.FILTERED_PARAMETERS ) 204 | WriteAsJson( self.global_term_freqs, self.path + ClientAPI.GLOBAL_TERM_FREQS ) 205 | -------------------------------------------------------------------------------- /client-src/html5slider.js: 
--------------------------------------------------------------------------------
/*
html5slider - a JS implementation of <input type=range> for Firefox 16 and up
https://github.com/fryn/html5slider

Copyright (c) 2010-2012 Frank Yan, <http://frankyan.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// NOTE(review): vendored third-party polyfill (MIT).  Only comments were
// added during review; the executable code is unchanged.

(function() {

// test for native support
var test = document.createElement('input');
try {
  test.type = 'range';
  if (test.type == 'range')
    return;
} catch (e) {
  return;
}

// test for required property support
// (Gecko-only: relies on -moz-appearance, mozSetImageElement, MutationObserver)
test.style.background = 'linear-gradient(red, red)';
if (!test.style.backgroundImage || !('MozAppearance' in test.style) ||
    !document.mozSetImageElement || !this.MutationObserver)
  return;

var scale;
var isMac = navigator.platform == 'MacIntel';
// platform-specific thumb metrics used for hit-testing and sizing
var thumb = {
  radius: isMac ? 9 : 6,
  width: isMac ? 22 : 12,
  height: isMac ? 16 : 20
};
// the track is drawn entirely with a CSS gradient background
var track = 'linear-gradient(transparent ' + (isMac ?
  '6px, #999 6px, #999 7px, #ccc 8px, #bbb 9px, #bbb 10px, transparent 10px' :
  '9px, #999 9px, #bbb 10px, #fff 11px, transparent 11px') +
  ', transparent)';
var styles = {
  'min-width': thumb.width + 'px',
  'min-height': thumb.height + 'px',
  'max-height': thumb.height + 'px',
  padding: '0 0 ' + (isMac ? '2px' : '1px'),
  border: 0,
  'border-radius': 0,
  cursor: 'default',
  'text-indent': '-999999px' // -moz-user-select: none; breaks mouse capture
};
// MutationObserver config: watch only the attributes that affect rendering
var options = {
  attributes: true,
  attributeFilter: ['min', 'max', 'step', 'value']
};
var forEach = Array.prototype.forEach;
var onChange = document.createEvent('HTMLEvents');
onChange.initEvent('change', true, false);

if (document.readyState == 'loading')
  document.addEventListener('DOMContentLoaded', initialize, true);
else
  initialize();

function initialize() {
  // create initial sliders
  forEach.call(document.querySelectorAll('input[type=range]'), transform);
  // create sliders on-the-fly
  new MutationObserver(function(mutations) {
    mutations.forEach(function(mutation) {
      if (mutation.addedNodes)
        forEach.call(mutation.addedNodes, function(node) {
          check(node);
          if (node.childElementCount)
            forEach.call(node.querySelectorAll('input'), check);
        });
    });
  }).observe(document, { childList: true, subtree: true });
}

// transform an input if it is declared type=range but fell back to type=text
function check(input) {
  if (input.localName == 'input' && input.type != 'range' &&
      input.getAttribute('type') == 'range')
    transform(input);
}

// turn one <input> into a functional slider (styling, value emulation,
// attribute syncing, and mouse/keyboard interaction)
function transform(slider) {

  var isValueSet, areAttrsSet, isChanged, isClick, prevValue, rawValue, prevX;
  var min, max, step, range, value = slider.value;

  // lazily create shared slider affordance
  if (!scale) {
    scale = document.body.appendChild(document.createElement('hr'));
    style(scale, {
      '-moz-appearance': isMac ? 'scale-horizontal' : 'scalethumb-horizontal',
      display: 'block',
      visibility: 'visible',
      opacity: 1,
      position: 'fixed',
      top: '-999999px'
    });
    // registers the off-screen element as the -moz-element() thumb image
    document.mozSetImageElement('__sliderthumb__', scale);
  }

  // reimplement value and type properties
  // (the getter/setter pair is deliberately re-installed inside setValue
  // because the `delete`/assign dance clears the accessors)
  var getValue = function() { return '' + value; };
  var setValue = function setValue(val) {
    value = '' + val;
    isValueSet = true;
    draw();
    delete slider.value;
    slider.value = value;
    slider.__defineGetter__('value', getValue);
    slider.__defineSetter__('value', setValue);
  };
  slider.__defineGetter__('value', getValue);
  slider.__defineSetter__('value', setValue);
  slider.__defineGetter__('type', function() { return 'range'; });

  // sync properties with attributes
  ['min', 'max', 'step'].forEach(function(prop) {
    if (slider.hasAttribute(prop))
      areAttrsSet = true;
    slider.__defineGetter__(prop, function() {
      return this.hasAttribute(prop) ? this.getAttribute(prop) : '';
    });
    slider.__defineSetter__(prop, function(val) {
      val === null ? this.removeAttribute(prop) : this.setAttribute(prop, val);
    });
  });

  // initialize slider
  slider.readOnly = true;
  style(slider, styles);
  update();

  new MutationObserver(function(mutations) {
    mutations.forEach(function(mutation) {
      if (mutation.attributeName != 'value') {
        update();
        areAttrsSet = true;
      }
      // note that value attribute only sets initial value
      else if (!isValueSet) {
        value = slider.getAttribute('value');
        draw();
      }
    });
  }).observe(slider, options);

  slider.addEventListener('mousedown', onDragStart, true);
  slider.addEventListener('keydown', onKeyDown, true);
  slider.addEventListener('focus', onFocus, true);
  slider.addEventListener('blur', onBlur, true);

  function onDragStart(e) {
    isClick = true;
    setTimeout(function() { isClick = false; }, 0);
    if (e.button || !range)
      return;
    var width = parseFloat(getComputedStyle(this, 0).width);
    var multiplier = (width - thumb.width) / range;
    if (!multiplier)
      return;
    // distance between click and center of thumb
    var dev = e.clientX - this.getBoundingClientRect().left - thumb.width / 2 -
              (value - min) * multiplier;
    // if click was not on thumb, move thumb to click location
    if (Math.abs(dev) > thumb.radius) {
      isChanged = true;
      // `-= -x` (rather than `+= x`) forces numeric coercion of the value
      this.value -= -dev / multiplier;
    }
    rawValue = value;
    prevX = e.clientX;
    this.addEventListener('mousemove', onDrag, true);
    this.addEventListener('mouseup', onDragEnd, true);
  }

  function onDrag(e) {
    var width = parseFloat(getComputedStyle(this, 0).width);
    var multiplier = (width - thumb.width) / range;
    if (!multiplier)
      return;
    // track the un-snapped value so dragging accumulates sub-step motion
    rawValue += (e.clientX - prevX) / multiplier;
    prevX = e.clientX;
    isChanged = true;
    this.value = rawValue;
  }

  function onDragEnd() {
    this.removeEventListener('mousemove', onDrag, true);
    this.removeEventListener('mouseup', onDragEnd, true);
  }

  function onKeyDown(e) {
    if (e.keyCode > 36 && e.keyCode < 41) { // 37-40: left, up, right, down
      onFocus.call(this);
      isChanged = true;
      this.value = value + (e.keyCode == 38 || e.keyCode == 39 ? step : -step);
    }
  }

  function onFocus() {
    if (!isClick)
      this.style.boxShadow = !isMac ? '0 0 0 2px #fb0' :
        'inset 0 0 20px rgba(0,127,255,.1), 0 0 1px rgba(0,127,255,.4)';
  }

  function onBlur() {
    this.style.boxShadow = '';
  }

  // determines whether value is valid number in attribute form
  function isAttrNum(value) {
    return !isNaN(value) && +value == parseFloat(value);
  }

  // validates min, max, and step attributes and redraws
  function update() {
    min = isAttrNum(slider.min) ? +slider.min : 0;
    max = isAttrNum(slider.max) ? +slider.max : 100;
    if (max < min)
      max = min > 100 ? min : 100;
    step = isAttrNum(slider.step) && slider.step > 0 ? +slider.step : 1;
    range = max - min;
    draw(true);
  }

  // recalculates value property
  function calc() {
    if (!isValueSet && !areAttrsSet)
      value = slider.getAttribute('value');
    if (!isAttrNum(value))
      value = (min + max) / 2;; // (stray second semicolon kept verbatim)
    // snap to step intervals (WebKit sometimes does not - bug?)
    value = Math.round((value - min) / step) * step + min;
    if (value < min)
      value = min;
    else if (value > max)
      value = min + ~~(range / step) * step;
  }

  // renders slider using CSS background ;)
  function draw(attrsModified) {
    calc();
    // fire 'change' only when user interaction produced a new value
    if (isChanged && value != prevValue)
      slider.dispatchEvent(onChange);
    isChanged = false;
    if (!attrsModified && value == prevValue)
      return;
    prevValue = value;
    var position = range ? (value - min) / range * 100 : 0;
    var bg = '-moz-element(#__sliderthumb__) ' + position + '% no-repeat, ';
    style(slider, { background: bg + track });
  }

}

// apply a map of CSS properties with !important priority
function style(element, styles) {
  for (var prop in styles)
    element.style.setProperty(prop, styles[prop], 'important');
}

})();
--------------------------------------------------------------------------------
/client-src/FilteredTermTopicProbabilityModel.js:
--------------------------------------------------------------------------------
/*
FilteredTermTopicProbabilityModel.js

This model is responsible for modifying data based on user inputs/controls
Current user control changes:
	-number of terms to show based on BEA choice order
	-number of terms to show based on saliency score (desc order)
	-specific terms to always show in the list of terms
	-whether or not to add top "twenty" terms of selected topics
	-sorting

Details:
--------
Pulls data from SeriatedTermTopicProbabilityModel on initialize.
Afterwards, this model is called when the user controls on the website are changed.
At that time, the new "user defined" state is passed to the update function.
17 | */ 18 | 19 | var FilteredTermTopicProbabilityModel = Backbone.Model.extend({ 20 | defaults : { 21 | "matrix" : null, 22 | "termIndex" : null, 23 | "topicIndex" : null, 24 | "sparseMatrix" : null 25 | }, 26 | url : "data/filtered-parameters.json", 27 | initialize : function() { 28 | this.stateModel = null; 29 | this.parentModel = null; 30 | 31 | // mappings 32 | this.termRankMap = null; 33 | this.termOrderMap = null; 34 | this.rowIndexMap = null; 35 | this.termDistinctivenessMap = null; 36 | this.termSaliencyList = []; 37 | 38 | // interaction related variables 39 | this.selectedTopics = {}; 40 | this.visibleTopTerms = {}; 41 | } 42 | }); 43 | 44 | /** 45 | * Initialize filtered's parent and state model 46 | * 47 | * @private 48 | */ 49 | FilteredTermTopicProbabilityModel.prototype.initModel = function( model, state ){ 50 | this.parentModel = model; 51 | this.stateModel = state; 52 | }; 53 | 54 | /** 55 | * Initialize all topics' selection status to false (called once by load) 56 | * 57 | * @private 58 | */ 59 | FilteredTermTopicProbabilityModel.prototype.defaultSelection = function(){ 60 | var topicIndex = this.parentModel.get("topicIndex"); 61 | for( var i = 0; i < topicIndex.length; i++ ){ 62 | this.selectedTopics[i] = false; 63 | } 64 | }; 65 | 66 | /** 67 | * Loads various mappings from the model's "url" 68 | * and triggers a loaded event that the next model (child model) listens to. 
69 | * (This function is called after the seriated model loaded event is fired) 70 | * 71 | * @param { string } the location of datafile to load values from 72 | * @return { void } 73 | */ 74 | FilteredTermTopicProbabilityModel.prototype.load = function() { 75 | var initRowIndexMap = function( termIndex ){ 76 | this.rowIndexMap = {}; 77 | for ( var i = 0; i < termIndex.length; i++ ){ 78 | this.rowIndexMap[termIndex[i]] = i; 79 | } 80 | }.bind(this); 81 | 82 | var initTermSaliencyList = function( saliencyMap ){ 83 | termSaliencyList = []; 84 | tempList = []; 85 | for ( var term in saliencyMap ){ 86 | tempList.push([term, saliencyMap[term]]); 87 | } 88 | tempList.sort(function(a, b) {return b[1] - a[1]}); 89 | for( var i = 0; i < tempList.length; i++ ){ 90 | this.termSaliencyList.push(tempList[i][0]); 91 | } 92 | }.bind(this); 93 | 94 | var successHandler = function( model, response, options ) 95 | { 96 | var keepQuiet = false; 97 | this.termRankMap = response.termRankMap; 98 | this.termOrderMap = response.termOrderMap; 99 | this.termDistinctivenessMap = response.termDistinctivenessMap; 100 | initRowIndexMap( this.parentModel.get("termIndex") ); 101 | initTermSaliencyList( response.termSaliencyMap ); 102 | 103 | this.initTopTermLists(); 104 | this.defaultSelection(); 105 | this.filter( keepQuiet ); 106 | 107 | var coloredTopics = this.stateModel.get("selectedTopics"); 108 | for( var obj in coloredTopics){ 109 | claimColor( coloredTopics[obj] ); 110 | this.selectTopic({"topic": obj, "color": coloredTopics[obj]} ); 111 | } 112 | 113 | this.trigger('loaded:filtered'); 114 | 115 | }.bind(this); 116 | var errorHandler = function( model, xhr, options ) { }.bind(this); 117 | this.fetch({ 118 | add : false, 119 | success : successHandler, 120 | error : errorHandler 121 | }); 122 | }; 123 | 124 | /** 125 | * Generates list of top twenty terms per topic in original topicIndex (called in load) 126 | * 127 | * @private 128 | */ 129 | 
FilteredTermTopicProbabilityModel.prototype.initTopTermLists = function() { 130 | var termIndex = this.parentModel.get("termIndex"); 131 | var topicIndex = this.parentModel.get("topicIndex"); 132 | 133 | var colFirstMatrix = generateColumnFirst(this.parentModel.get("matrix")); 134 | 135 | var termsPerTopic = 20; 136 | this.topTermLists = {}; 137 | for( var i = 0; i < topicIndex.length; i++){ 138 | this.topTermLists[i] = []; 139 | 140 | // get term freqs for this topic 141 | var topicalFrequencies = colFirstMatrix[i]; 142 | 143 | // sort the terms by topical frequency 144 | var indices = new Array(termIndex.length); 145 | for(var j = 0; j < termIndex.length; j++) 146 | indices[j] = j; 147 | indices.sort(function (a, b) { return topicalFrequencies[a] < topicalFrequencies[b] ? 1 : topicalFrequencies[a] > topicalFrequencies[b] ? -1 : 0; }); 148 | 149 | // take the top 20 (unless there are fewer than 20) 150 | var count = 0; 151 | while(count < 20 && indices[count] > THRESHHOLD){ 152 | this.topTermLists[i].push(termIndex[indices[count]]); 153 | count++; 154 | } 155 | } 156 | }; 157 | 158 | /** 159 | * Calls appropriate functions to update based on data change(s) 160 | */ 161 | FilteredTermTopicProbabilityModel.prototype.update = function( obj ) 162 | { 163 | this.filter( false ); 164 | }; 165 | 166 | /** 167 | * adds top twenty term list of selected topics to the visibleTopTerms list 168 | * 169 | * @private 170 | */ 171 | FilteredTermTopicProbabilityModel.prototype.addTopTerms = function() { 172 | for( var obj in this.selectedTopics){ 173 | if(this.selectedTopics[obj]) 174 | this.visibleTopTerms[obj] = this.topTermLists[obj]; 175 | } 176 | }; 177 | 178 | /** 179 | * Refreshes the termIndex and ordering based on user changes 180 | * 181 | * @param { boolean } determines whether certain "set"s should trigger change events 182 | * @return { void } 183 | */ 184 | FilteredTermTopicProbabilityModel.prototype.filter = function( keepQuiet ) { 185 | var original_submatrix = 
this.parentModel.get("matrix"); 186 | var original_termIndex = this.parentModel.get("termIndex"); 187 | var original_topicIndex = this.parentModel.get("topicIndex"); 188 | 189 | var userDefinedTerms = this.stateModel.get("visibleTerms").slice(0); 190 | if(this.stateModel.get("addTopTwenty")) 191 | this.addTopTerms(); 192 | else 193 | this.visibleTopTerms = {}; 194 | 195 | var affinityLimit = this.stateModel.get("numAffinityTerms"); 196 | var saliencyLimit = this.stateModel.get("numSalientTerms"); 197 | 198 | var foundTerms = []; 199 | var subset = []; 200 | // choose terms to keep 201 | var chooseTerm = function( term ){ 202 | if( userDefinedTerms.indexOf( term ) >= 0 ){ 203 | foundTerms.push(term); 204 | return true; 205 | } 206 | if( this.termRankMap[term] < affinityLimit ){ 207 | return true; 208 | } 209 | if( this.termSaliencyList.indexOf( term ) >= 0 && this.termSaliencyList.indexOf( term ) < saliencyLimit ){ 210 | return true; 211 | } 212 | for(var topicNo in this.visibleTopTerms){ 213 | if( this.visibleTopTerms[topicNo].indexOf( term ) >= 0 ) 214 | return true; 215 | } 216 | return false; 217 | }.bind(this); 218 | 219 | // sort the terms 220 | var sortType = this.stateModel.get("sortType"); 221 | for ( var i = 0; i < original_termIndex.length; i++ ){ 222 | var term = original_termIndex[i]; 223 | if( chooseTerm( term ) ){ 224 | if(sortType === "") 225 | subset.push( [term, this.termOrderMap[ term ]] ); 226 | else if( sortType === "desc") { 227 | var topic = this.stateModel.get("doubleClickTopic"); 228 | subset.push( [term, 1 / (original_submatrix[this.rowIndexMap[term]][topic]*this.termDistinctivenessMap[term])]); 229 | } 230 | else if( sortType === "asc") { 231 | var topic = this.stateModel.get("doubleClickTopic"); 232 | subset.push( [term, original_submatrix[this.rowIndexMap[term]][topic]*this.termDistinctivenessMap[term]]); 233 | } 234 | } 235 | } 236 | // find out which user defined terms were found in the dataset 237 | for( var i = 0; i < 
foundTerms.length; i++){ 238 | userDefinedTerms.splice(userDefinedTerms.indexOf(foundTerms[i]),1); 239 | } 240 | subset.sort(function(a, b) {return a[1] - b[1]}); 241 | 242 | // update model and state attributes 243 | matrix = []; 244 | termIndex = [] 245 | for(var j = 0; j < subset.length; j++){ 246 | var term = subset[j][0]; 247 | termIndex.push(term); 248 | matrix.push(original_submatrix[this.rowIndexMap[term]]); 249 | } 250 | this.set("topicIndex", original_topicIndex, { silent: keepQuiet } ); 251 | this.set("termIndex", termIndex, { silent: keepQuiet } ); 252 | this.set("matrix", matrix, { silent: keepQuiet} ); 253 | this.set("sparseMatrix", generateSparseMatrix.bind(this)(), {silent: keepQuiet}); 254 | 255 | this.stateModel.setFoundTerms(foundTerms, keepQuiet); 256 | this.stateModel.setUnfoundTerms(userDefinedTerms, keepQuiet); 257 | this.stateModel.set("totalTerms", termIndex.length); 258 | }; 259 | 260 | /** 261 | * Behavior when topic is selected 262 | * 263 | * @this { FilteredTermTopicProbabilityModel } 264 | * @param { object } topic: target topic index, color: associated color 265 | * @return { void } 266 | */ 267 | FilteredTermTopicProbabilityModel.prototype.selectTopic = function( obj ) { 268 | var topic = obj.topic; 269 | var colorClass = obj.color; 270 | var topicIndex = this.parentModel.get("topicIndex"); 271 | if( topic !== null){ 272 | 273 | // if color is DEFAULT, the event can be treated as a deselect 274 | if( colorClass === DEFAULT){ 275 | if(this.selectedTopics[topic]){ 276 | delete this.visibleTopTerms[topic]; 277 | this.selectedTopics[topic] = false; 278 | this.filter( false ); 279 | } 280 | return; 281 | } 282 | 283 | // only add if this topic wasn't added previously 284 | if(this.selectedTopics[topic] === false) { 285 | this.selectedTopics[topic] = true; 286 | this.filter( false ); 287 | } 288 | } 289 | }; -------------------------------------------------------------------------------- /pipeline/compute_similarity.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import math 10 | import itertools 11 | from api_utils import TokensAPI, SimilarityAPI 12 | 13 | class ComputeSimilarity( object ): 14 | """ 15 | Similarity measures. 16 | 17 | Compute term similarity based on co-occurrence and 18 | collocation likelihoods. 19 | """ 20 | 21 | DEFAULT_SLIDING_WINDOW_SIZE = 10 22 | MAX_FREQ = 100.0 23 | 24 | def __init__( self, logging_level ): 25 | self.logger = logging.getLogger( 'ComputeSimilarity' ) 26 | self.logger.setLevel( logging_level ) 27 | handler = logging.StreamHandler( sys.stderr ) 28 | handler.setLevel( logging_level ) 29 | self.logger.addHandler( handler ) 30 | 31 | def execute( self, data_path, sliding_window_size = None ): 32 | 33 | assert data_path is not None 34 | if sliding_window_size is None: 35 | sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE 36 | 37 | self.logger.info( '--------------------------------------------------------------------------------' ) 38 | self.logger.info( 'Computing term similarity...' ) 39 | self.logger.info( ' data_path = %s', data_path ) 40 | self.logger.info( ' sliding_window_size = %d', sliding_window_size ) 41 | 42 | self.logger.info( 'Connecting to data...' ) 43 | self.tokens = TokensAPI( data_path ) 44 | self.similarity = SimilarityAPI( data_path ) 45 | 46 | self.logger.info( 'Reading data from disk...' ) 47 | self.tokens.read() 48 | 49 | self.logger.info( 'Computing document co-occurrence...' ) 50 | self.computeDocumentCooccurrence() 51 | 52 | self.logger.info( 'Computing sliding-window co-occurrence...' ) 53 | self.computeSlidingWindowCooccurrence( sliding_window_size ) 54 | 55 | self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' 
) 56 | self.computeTokenCounts() 57 | 58 | self.logger.info( 'Computing document co-occurrence likelihood...' ) 59 | self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence ) 60 | 61 | self.logger.info( 'Computing sliding-window co-occurrence likelihood...' ) 62 | self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence ) 63 | 64 | self.logger.info( 'Computing collocation likelihood...' ) 65 | self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts ) 66 | 67 | self.combineSimilarityMatrices() 68 | 69 | self.logger.info( 'Writing data to disk...' ) 70 | self.similarity.write() 71 | 72 | self.logger.info( '--------------------------------------------------------------------------------' ) 73 | 74 | def incrementCount( self, occurrence, key ): 75 | if key not in occurrence: 76 | occurrence[ key ] = 1 77 | else: 78 | occurrence[ key ] += 1 79 | 80 | def computeDocumentCooccurrence( self ): 81 | document_count = 0 82 | occurrence = {} 83 | cooccurrence = {} 84 | for docID, docTokens in self.tokens.data.iteritems(): 85 | self.logger.debug( ' %s (%d tokens)', docID, len(docTokens) ) 86 | tokenSet = frozenset(docTokens) 87 | document_count += 1 88 | for token in tokenSet: 89 | self.incrementCount( occurrence, token ) 90 | for aToken in tokenSet: 91 | for bToken in tokenSet: 92 | if aToken < bToken: 93 | self.incrementCount( cooccurrence, (aToken, bToken) ) 94 | 95 | self.document_count = document_count 96 | self.similarity.document_occurrence = occurrence 97 | self.similarity.document_cooccurrence = cooccurrence 98 | 99 | def computeSlidingWindowCooccurrence( self, sliding_window_size ): 100 | window_count = 0 101 | occurrence = {} 102 | cooccurrence = {} 103 | for docID, docTokens in self.tokens.data.iteritems(): 104 | allWindowTokens = 
self.getSlidingWindowTokens( docTokens, sliding_window_size ) 105 | self.logger.debug( ' %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) ) 106 | for windowTokens in allWindowTokens: 107 | tokenSet = frozenset(windowTokens) 108 | window_count += 1 109 | for token in tokenSet: 110 | self.incrementCount( occurrence, token ) 111 | for aToken in tokenSet: 112 | for bToken in tokenSet: 113 | if aToken < bToken: 114 | self.incrementCount( cooccurrence, (aToken, bToken) ) 115 | 116 | self.window_count = window_count 117 | self.similarity.window_occurrence = occurrence 118 | self.similarity.window_cooccurrence = cooccurrence 119 | 120 | def getSlidingWindowTokens( self, tokens, sliding_window_size ): 121 | allWindows = [] 122 | aIndex = 0 - sliding_window_size 123 | bIndex = len(tokens) + sliding_window_size 124 | for index in range( aIndex, bIndex ): 125 | a = max( 0 , index - sliding_window_size ) 126 | b = min( len(tokens) , index + sliding_window_size ) 127 | allWindows.append( tokens[a:b] ) 128 | return allWindows 129 | 130 | def computeTokenCounts( self ): 131 | token_count = sum( len(docTokens) for docTokens in self.tokens.data.itervalues() ) 132 | 133 | unigram_counts = {} 134 | for docTokens in self.tokens.data.itervalues(): 135 | for token in docTokens: 136 | self.incrementCount( unigram_counts, token ) 137 | 138 | bigram_counts = {} 139 | for docTokens in self.tokens.data.itervalues(): 140 | prevToken = None 141 | for currToken in docTokens: 142 | if prevToken is not None: 143 | self.incrementCount( bigram_counts, (prevToken, currToken) ) 144 | prevToken = currToken 145 | 146 | self.token_count = token_count 147 | self.similarity.unigram_counts = unigram_counts 148 | self.similarity.bigram_counts = bigram_counts 149 | 150 | def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ): 151 | assert B_given_A >= 0 152 | assert B_given_notA >= 0 153 | assert any_given_A >= B_given_A 154 | assert any_given_notA >= 
B_given_notA 155 | 156 | a = float( B_given_A ) 157 | b = float( B_given_notA ) 158 | c = float( any_given_A ) 159 | d = float( any_given_notA ) 160 | E1 = c * ( a + b ) / ( c + d ) 161 | E2 = d * ( a + b ) / ( c + d ) 162 | 163 | g2a = 0 164 | g2b = 0 165 | if a > 0: 166 | g2a = a * math.log( a / E1 ) 167 | if b > 0: 168 | g2b = b * math.log( b / E2 ) 169 | return 2 * ( g2a + g2b ) 170 | 171 | def getG2( self, freq_all, freq_ab, freq_a, freq_b ): 172 | assert freq_all >= freq_a 173 | assert freq_all >= freq_b 174 | assert freq_a >= freq_ab 175 | assert freq_b >= freq_ab 176 | assert freq_all >= 0 177 | assert freq_ab >= 0 178 | assert freq_a >= 0 179 | assert freq_b >= 0 180 | 181 | B_given_A = freq_ab 182 | B_given_notA = freq_b - freq_ab 183 | any_given_A = freq_a 184 | any_given_notA = freq_all - freq_a 185 | 186 | return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA ) 187 | 188 | def getG2Stats( self, max_count, occurrence, cooccurrence ): 189 | g2_stats = {} 190 | freq_all = max_count 191 | for ( firstToken, secondToken ) in cooccurrence: 192 | freq_a = occurrence[ firstToken ] 193 | freq_b = occurrence[ secondToken ] 194 | freq_ab = cooccurrence[ (firstToken, secondToken) ] 195 | 196 | scale = ComputeSimilarity.MAX_FREQ / freq_all 197 | rescaled_freq_all = freq_all * scale 198 | rescaled_freq_a = freq_a * scale 199 | rescaled_freq_b = freq_b * scale 200 | rescaled_freq_ab = freq_ab * scale 201 | if rescaled_freq_a > 1.0 and rescaled_freq_b > 1.0: 202 | g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b ) 203 | return g2_stats 204 | 205 | def combineSimilarityMatrices( self ): 206 | self.logger.info( 'Combining similarity matrices...' 
) 207 | self.similarity.combined_g2 = {} 208 | 209 | keys_queued = [] 210 | for key in self.similarity.document_g2: 211 | ( firstToken, secondToken ) = key 212 | otherKey = ( secondToken, firstToken ) 213 | keys_queued.append( key ) 214 | keys_queued.append( otherKey ) 215 | for key in self.similarity.window_g2: 216 | ( firstToken, secondToken ) = key 217 | otherKey = ( secondToken, firstToken ) 218 | keys_queued.append( key ) 219 | keys_queued.append( otherKey ) 220 | for key in self.similarity.collocation_g2: 221 | keys_queued.append( key ) 222 | 223 | keys_processed = {} 224 | for key in keys_queued: 225 | keys_processed[ key ] = False 226 | 227 | for key in keys_queued: 228 | if not keys_processed[ key ]: 229 | keys_processed[ key ] = True 230 | 231 | ( firstToken, secondToken ) = key 232 | if firstToken < secondToken: 233 | orderedKey = key 234 | else: 235 | orderedKey = ( secondToken, firstToken ) 236 | score = 0.0 237 | if orderedKey in self.similarity.document_g2: 238 | score += self.similarity.document_g2[ orderedKey ] 239 | if orderedKey in self.similarity.window_g2: 240 | score += self.similarity.window_g2[ orderedKey ] 241 | if key in self.similarity.collocation_g2: 242 | score += self.similarity.collocation_g2[ key ] 243 | if score > 0.0: 244 | self.similarity.combined_g2[ key ] = score 245 | 246 | #-------------------------------------------------------------------------------# 247 | 248 | def main(): 249 | parser = argparse.ArgumentParser( description = 'Compute term similarity for TermiteVis.' ) 250 | parser.add_argument( 'config_file' , type = str, default = None , help = 'Path of Termite configuration file.' ) 251 | parser.add_argument( '--data-path' , type = str, dest = 'data_path' , help = 'Override data path.' ) 252 | parser.add_argument( '--sliding-window-size', type = int, dest = 'sliding_window_size', help = 'Override sliding window size.' 
) 253 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' ) 254 | args = parser.parse_args() 255 | 256 | data_path = None 257 | sliding_window_size = None 258 | logging_level = 20 259 | 260 | # Read in default values from the configuration file 261 | if args.config_file is not None: 262 | config = ConfigParser.RawConfigParser() 263 | config.read( args.config_file ) 264 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 265 | data_path = config.get( 'Termite', 'path' ) 266 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'sliding_window_size' ): 267 | sliding_window_size = config.get( 'Termite', 'sliding_window_size' ) 268 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 269 | logging_level = config.getint( 'Misc', 'logging' ) 270 | 271 | # Read in user-specifiec values from the program arguments 272 | if args.data_path is not None: 273 | data_path = args.data_path 274 | if args.sliding_window_size is not None: 275 | sliding_window_size = args.sliding_window_size 276 | if args.logging is not None: 277 | logging_level = args.logging 278 | 279 | ComputeSimilarity( logging_level ).execute( data_path, sliding_window_size ) 280 | 281 | if __name__ == '__main__': 282 | main() 283 | -------------------------------------------------------------------------------- /pipeline/compute_seriation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import time 10 | from operator import itemgetter 11 | from api_utils import SaliencyAPI, SimilarityAPI, SeriationAPI 12 | 13 | class ComputeSeriation( object ): 14 | """Seriation algorithm. 15 | 16 | Re-order words to improve promote the legibility of multi-word 17 | phrases and reveal the clustering of related terms. 
class ComputeSeriation( object ):
    """Seriation algorithm.

    Re-orders words to promote the legibility of multi-word phrases and to
    reveal the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its 'ranking'
    (i.e., the iteration in which a term was seriated).

    Changes in this revision:
      * getEnergyChange(): fixed a key mismatch — the post-bond membership test
        checked (next_term, candidate) but then indexed (candidate, next_term).
        combined_g2 is not guaranteed symmetric (collocation scores are stored
        in one direction only), so the old code could raise KeyError or silently
        drop a real bond. Both test and lookup now use (candidate, next_term).
      * Debug `print` statements replaced by self.logger.debug(), consistent
        with the logging already used elsewhere in this class.
    """

    # Default number of terms to seriate when the caller supplies none.
    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__( self, logging_level ):
        """Set up a dedicated logger that writes to stderr at `logging_level`."""
        self.logger = logging.getLogger( 'ComputeSeriation' )
        self.logger.setLevel( logging_level )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        self.logger.addHandler( handler )

    def execute( self, data_path, numSeriatedTerms = None ):
        """Run the full seriation pass.

        Reads saliency and similarity data from `data_path`, seriates up to
        `numSeriatedTerms` terms, and writes the resulting ordering back to
        disk via SeriationAPI.

        Raises AssertionError if data_path is None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Computing term seriation...' )
        self.logger.info( ' data_path = %s', data_path )
        self.logger.info( ' number_of_seriated_terms = %d', numSeriatedTerms )

        self.logger.info( 'Connecting to data...' )
        self.saliency = SaliencyAPI( data_path )
        self.similarity = SimilarityAPI( data_path )
        self.seriation = SeriationAPI( data_path )

        self.logger.info( 'Reading data from disk...' )
        self.saliency.read()
        self.similarity.read()

        self.logger.info( 'Reshaping saliency data...' )
        self.reshape()

        self.logger.info( 'Computing seriation...' )
        self.compute( numSeriatedTerms )

        self.logger.info( 'Writing data to disk...' )
        self.seriation.write()

        self.logger.info( '--------------------------------------------------------------------------------' )

    def reshape( self ):
        """Index the saliency records by term for O(1) lookups during seriation."""
        # Candidate pool size: only terms ranked within the current ordering
        # length plus this many are considered in each iteration.
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}
        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append( term )
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute( self, numSeriatedTerms ):
        """Greedily seriate `numSeriatedTerms` terms.

        Each iteration picks the (term, position) pair with the largest energy
        gain and inserts it into the growing ordering. Results are stored in
        self.seriation.term_ordering (the ordering) and
        self.seriation.term_iter_index (the iteration each term was added).
        """
        # Elicit from user (1) the number of terms to output and (2) a list of
        # terms that should be included in the output... set in init (i.e. read
        # from config file).
        start_time = time.time()
        candidateTerms = self.orderedTermList
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        # buffers[i] caches the bond broken when inserting at position i.
        self.buffers = [0,0]

        preBest = []
        postBest = []

        for iteration in range(numSeriatedTerms):
            self.logger.debug( "Iteration no. %s", iteration )

            addedTerm = 0
            if len(self.seriation.term_iter_index) > 0:
                addedTerm = self.seriation.term_iter_index[-1]
            # Best-energy bookkeeping is initialized once the first term exists.
            if iteration == 1:
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(preBest, postBest, addedTerm)
            (candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers) = self.iterate_eff(candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers, self.bestEnergies, iteration)

        self.logger.debug( "---------------" )
        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: " + str(seriation_time))

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def initBestEnergies(self, firstTerm, candidateTerms):
        """For every candidate, record its bond with `firstTerm` in both directions.

        Returns (preBest, postBest): lists of (candidate, score) where preBest
        scores the bond (candidate, firstTerm) and postBest (firstTerm, candidate).
        Missing pairs score 0.
        """
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            pre_score = 0
            post_score = 0

            # preBest: candidate placed immediately before firstTerm
            if (candidate, firstTerm) in self.similarity.combined_g2:
                pre_score = self.similarity.combined_g2[(candidate, firstTerm)]
            # postBest: candidate placed immediately after firstTerm
            if (firstTerm, candidate) in self.similarity.combined_g2:
                post_score = self.similarity.combined_g2[(firstTerm, candidate)]

            preBest.append((candidate, pre_score))
            postBest.append((candidate, post_score))

        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Refresh per-candidate best bonds after `addedTerm` joined the ordering.

        Removes addedTerm from the candidate bookkeeping and returns
        (preBest, postBest, bestEnergies) where bestEnergies is the summed
        pre+post best score per candidate, sorted descending.
        """
        if addedTerm == 0:
            # Nothing has been seriated yet; no energies to update.
            return (preBest, postBest, [])

        term_order = [x[0] for x in preBest]
        # compare candidate terms' bests against the newly added term
        remove_index = -1
        for existingIndex in range(len(preBest)):
            term = term_order[existingIndex]
            if term == addedTerm:
                remove_index = existingIndex

            # check pre energies
            if (term, addedTerm) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(term, addedTerm)] > preBest[existingIndex][1]:
                    preBest[existingIndex] = (term, self.similarity.combined_g2[(term, addedTerm)])
            # check post energies
            if (addedTerm, term) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(addedTerm, term)] > postBest[existingIndex][1]:
                    postBest[existingIndex] = (term, self.similarity.combined_g2[(addedTerm, term)])

        # remove the added term's preBest and postBest scores
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        # create and sort the bestEnergies list
        energyMax = [sum(pair) for pair in zip([x[1] for x in preBest], [y[1] for y in postBest])]
        bestEnergies = zip([x[0] for x in preBest], energyMax)

        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff( self, candidateTerms, term_ordering, term_iter_index, buffers, bestEnergies, iteration_no ):
        """Perform one greedy insertion, scanning candidates in best-energy order.

        Uses the sorted bestEnergies bound to terminate the candidate scan early
        once no remaining candidate can beat the best energy change found so far.
        Returns the updated (candidateTerms, term_ordering, term_iter_index, buffers).
        """
        maxEnergyChange = 0.0
        maxTerm = ""
        maxPosition = 0

        if len(bestEnergies) != 0:
            bestEnergy_terms = [x[0] for x in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        breakout_counter = 0
        for candidate_index in range(len(bestEnergy_terms)):
            breakout_counter += 1
            candidate = bestEnergy_terms[candidate_index]
            # Try the candidate at every insertion position (0..len inclusive).
            for position in range(len(term_ordering)+1):
                current_buffer = buffers[position]
                candidateRank = self.termRank[candidate]
                # Skip terms ranked far below the current candidate window.
                if candidateRank <= (len(term_ordering) + self.candidateSize):
                    current_energy_change = self.getEnergyChange(candidate, position, term_ordering, current_buffer, iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position
            # check for early termination: remaining candidates are sorted by
            # best achievable energy, so none can beat the current max.
            if candidate_index < len(bestEnergy_terms)-1 and len(bestEnergies) != 0:
                if maxEnergyChange >= (2*(bestEnergies[candidate_index][1] + current_buffer)):
                    self.logger.debug( "#-------- breaking out early ---------#" )
                    self.logger.debug( "candidates checked: %s", breakout_counter )
                    break

        self.logger.debug( "change in energy: %s", maxEnergyChange )
        self.logger.debug( "maxTerm: %s", maxTerm )
        self.logger.debug( "maxPosition: %s", maxPosition )

        candidateTerms.remove(maxTerm)

        # update buffers: record the bond values broken/created by the insertion
        buf_score = 0
        if len(term_ordering) == 0:
            buffers = buffers
        elif maxPosition >= len(term_ordering):
            # appended at the end
            if (term_ordering[-1], maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(term_ordering[-1], maxTerm)]
            buffers.insert(len(buffers)-1, buf_score)
        elif maxPosition == 0:
            # prepended at the front
            if (maxTerm, term_ordering[0]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[0])]
            buffers.insert(1, buf_score)
        else:
            # inserted in the middle: replace the split buffer, add a new one
            if (term_ordering[maxPosition-1], maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(term_ordering[maxPosition-1], maxTerm)]
            buffers[maxPosition] = buf_score

            buf_score = 0
            if (maxTerm, term_ordering[maxPosition]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[maxPosition])]
            buffers.insert(maxPosition+1, buf_score)

        # update term ordering and ranking
        if maxPosition >= len(term_ordering):
            term_ordering.append(maxTerm)
        else:
            term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Score inserting `candidateTerm` at `position` in `term_list`.

        Returns 2 * (bond-to-predecessor + bond-to-successor - broken-buffer).
        On the very first iteration, falls back to a frequency*saliency prior
        since no ordering exists yet.
        """
        prevBond = 0.0
        postBond = 0.0

        # first iteration only
        if iteration_no == 0:
            current_freq = 0.0
            current_saliency = 0.0

            if candidateTerm in self.termFreqs:
                current_freq = self.termFreqs[candidateTerm]
            if candidateTerm in self.termSaliency:
                current_saliency = self.termSaliency[candidateTerm]
            return 0.001 * current_freq * current_saliency

        # get previous term
        if position > 0:
            prev_term = term_list[position-1]
            if (prev_term, candidateTerm) in self.similarity.combined_g2:
                prevBond = self.similarity.combined_g2[(prev_term, candidateTerm)]

        # get next term
        if position < len(term_list):
            next_term = term_list[position]
            # BUG FIX: test and index the SAME key. The original checked
            # (next_term, candidateTerm) but indexed (candidateTerm, next_term);
            # combined_g2 is not guaranteed symmetric, so that could KeyError
            # or miss a real bond.
            if (candidateTerm, next_term) in self.similarity.combined_g2:
                postBond = self.similarity.combined_g2[(candidateTerm, next_term)]

        return 2*(prevBond + postBond - currentBuffer)
def main():
    """Command-line entry point for the seriation pipeline stage."""
    parser = argparse.ArgumentParser( description = 'Compute term seriation for TermiteVis.' )
    parser.add_argument( 'config_file', type = str, default = None, help = 'Path of Termite configuration file.' )
    parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' )
    parser.add_argument( '--number-of-seriated-terms', type = int, dest = 'number_of_seriated_terms', help = 'Override the number of terms to seriate.' )
    parser.add_argument( '--logging', type = int, dest = 'logging', help = 'Override logging level.' )
    args = parser.parse_args()

    # Hard-coded defaults (logging level 20 == INFO).
    data_path = None
    number_of_seriated_terms = None
    logging_level = 20

    # Layer 1: values from the configuration file, when one is given.
    if args.config_file is not None:
        config = ConfigParser.RawConfigParser()
        config.read( args.config_file )
        has = lambda section, option: config.has_section( section ) and config.has_option( section, option )
        if has( 'Termite', 'path' ):
            data_path = config.get( 'Termite', 'path' )
        if has( 'Termite', 'number_of_seriated_terms' ):
            number_of_seriated_terms = config.getint( 'Termite', 'number_of_seriated_terms' )
        if has( 'Misc', 'logging' ):
            logging_level = config.getint( 'Misc', 'logging' )

    # Layer 2: explicit command-line flags take precedence over the file.
    if args.data_path is not None:
        data_path = args.data_path
    if args.number_of_seriated_terms is not None:
        number_of_seriated_terms = args.number_of_seriated_terms
    if args.logging is not None:
        logging_level = args.logging

    ComputeSeriation( logging_level ).execute( data_path, number_of_seriated_terms )

if __name__ == '__main__':
    main()
190 |
191 | 248 |
249 | 250 | 251 | 252 | 253 | 254 |
255 |
256 |

This visualization shows the topical distribution of words in a corpus.

257 |

The area of a circle is proportional to a word's frequency in a topic.

258 |
259 |
260 | 263 |
264 |
/*
	TermFrequencyView.js

	This view is responsible for generating the term frequency view.

	Details:
	--------
	Receives list of terms and associated frequencies from TermFrequencyModel.

	Additionally, uses parameters defined in ViewParameters.js.
*/

// Styling defaults for term-label text.
var TERMFREQ_TEXT_DEFAULT = {
	FILL_COLOR: "#808080",
	STROKE_OPACITY: 0,
	FILL_OPACITY: 1
};
// Styling defaults for the horizontal term-frequency bars.
var TERMFREQ_BAR_DEFAULT = {
	STROKE_COLOR: "#808080",
	STROKE_WIDTH: 5,
	STROKE_OPACITY: 0.4
};
// Vertical packing (row height in px) for histogram rows.
// NOTE(review): packing() always returns 12 and ignores the DENSE/LOOSE
// constants — presumably adaptive packing was disabled; confirm before reuse.
var HISTOGRAM_ENCODING_PARAMETERS = {
	NUM_TOPICS : 0,
	setNumTopics : function(numTopics) { this.NUM_TOPICS = numTopics; },
	DENSE_NUM_TOPICS: 50,
	LOOSE_NUM_TOPICS: 20,
	DENSE_PACKING: 12,
	LOOSE_PACKING: 18,
	packing : function()
	{
		return 12;
	}
};
// Padding and size constants for the histogram container.
// NOTE(review): identifier is misspelled ("HISTORGRAM") but is referenced
// under exactly this name throughout the file; do not rename in isolation.
var HISTORGRAM_CONTAINER_PADDING = {
	left_separation: 10,
	top: 60,
	left: 130,
	right: 20,
	bottom: 60,
	width: 150,
	fullWidth : function() { return this.left + this.right + this.width },
	fullHeight : function( numTopics, numTerms ) { return this.top + this.bottom + HISTOGRAM_ENCODING_PARAMETERS.packing() * numTerms }
};

var TermFrequencyView = Backbone.View.extend({
	initialize : function() {
		// Model supplying termIndex / frequency data (set via initModel).
		this.parentModel = null;

		// encoders
		this.ys = null;          // linear scale: term row index -> y position
		this.line_length = null; // linear scale: frequency -> bar length (px)

		// svg layers
		this.svg = null;
		this.svgTermLabelLayer = null;
		this.svgTermBarLayer = null;
		this.overlayLayer = null;
		this.overlayLineLayer = null;
		this.svgTopicalBarLayer = null;
		this.svgTermHighlightLayer = null;

		// interaction variables
		this.highlightedTerm = null;
		this.highlightedTopic = null;

		this.selectedTopics = [];
		this.colorClassPrefix = "HIST";
		this.normalColor = "normal";

		// bar highlighting
		this.totalOffsets = [];   // per-term sum of stacked topical bar heights
		this.prevHighlightColor = this.normalColor;
		this.useOffset = false;   // true while highlighting a non-selected topic
	}
});

/**
 * Initialize Term Frequency View's parent model
 *
 * NOTE(review): the `state` argument is accepted but unused here.
 *
 * @private
 */
TermFrequencyView.prototype.initModel = function( model, state ){
	this.parentModel = model;
};

/**
 * Initialize/render histogram view's elements for the first time
 *
 * @private
 */
TermFrequencyView.prototype.load = function(){
	this.renderInit();
	this.renderUpdate();
};

/**
 * Updates the view (public encapsulation used in index.html)
 */
TermFrequencyView.prototype.update = function() {
	this.renderUpdate();
};

/**
 * Transforms the topical frequency matrix into a form appropriate for d3 stacked bars.
 * Also refreshes this.totalOffsets with the per-term stacked totals,
 * which the topic-highlight code uses to offset its hover bar.
 *
 * @private
 */
TermFrequencyView.prototype.prepareStackedBars = function() {
	var matrix = this.parentModel.get("topicalFreqMatrix");
	if( matrix.length === 0)
		return [];

	// Re-shape each row into the {x, y} records d3.layout.stack expects.
	var remapped = matrix.map( function(layer){
		return layer.map( function(d, j) { return { x : j, y : d }; } );
	});

	var stackedTransformer = d3.layout.stack();
	var stackedData = stackedTransformer(remapped);

	// update totalOffsets (for highlighting use)
	this.totalOffsets = [];
	if(stackedData.length > 0){
		for( var j = 0; j < stackedData[0].length; j++){
			var sum = 0.0;
			for( var i = 0; i < stackedData.length; i++){
				sum += stackedData[i][j].y;
			}
			this.totalOffsets[j] = sum;
		}
	}
	return stackedData;
};
/**
 * Initialize histogram view's elements
 *	-svg layers
 *	-encoders
 *	-etc.
 *
 * @private
 */
TermFrequencyView.prototype.renderInit = function() {
	var termIndex = this.parentModel.get("termIndex");
	var termFreq = this.parentModel.get("totalTermFreqs");

	// Compute encoders
	this.ys = d3.scale.linear();

	// Bar length is scaled so the most frequent term spans the full width.
	var maxFreq = 0.0;
	for( var i = 0; i < termIndex.length; i++ ) {
		if(termFreq[termIndex[i]] > maxFreq)
			maxFreq = termFreq[termIndex[i]];
	}
	this.line_length = d3.scale.linear().domain([0, maxFreq]).range( [ 0, HISTORGRAM_CONTAINER_PADDING.width ] );

	// init svg layers (each translated into the padded drawing area)
	var container = d3.select(this.el);
	this.svg = container.append( "svg:svg" )
		.style( "cursor", "default" )
		.style( "width", HISTORGRAM_CONTAINER_PADDING.fullWidth() + "px" )
	this.svgTermLabelLayer = this.svg.append( "svg:g" )
		.attr( "class", "termLabelLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTermBarLayer = this.svg.append( "svg:g" )
		.attr( "class", "termBarLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.overlayLayer = this.svg.append( "svg:g" )
		.attr( "class", "overlayLayer")
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTopicalBarLayer = this.svg.append( "svg:g" )
		.attr( "class", "topicalBarLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTermHighlightLayer = this.svg.append( "svg:g" )
		.attr( "class", "termHighlightLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
};

/**
 * Update histogram view's elements based on parent model's termIndex and term frequencies
 *
 * Each section below follows the d3 v3 data-join pattern:
 * exit().remove(), then enter().append() with static attributes,
 * then a re-selection applying the data-dependent attributes.
 *
 * @private
 */
TermFrequencyView.prototype.renderUpdate = function() {
	var termIndex = this.parentModel.get("termIndex");
	var termFreq = this.parentModel.get("totalTermFreqs");

	this.svg
		.style( "height", HISTORGRAM_CONTAINER_PADDING.fullHeight( HISTOGRAM_ENCODING_PARAMETERS.NUM_TOPICS, termIndex.length ) + "px" )

	this.ys.domain( [ 0, termIndex.length ] )
		.range( [ 0, termIndex.length * HISTOGRAM_ENCODING_PARAMETERS.packing()] );

	// term labels (left-hand column of words)
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex ).exit().remove();
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex ).enter().append( "svg:text" )
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this))
		.attr( "x", -HISTORGRAM_CONTAINER_PADDING.left_separation )
		.attr( "y", 3 )
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex )
		.attr( "class", function(d) { return ["termLabel", "HISTnormal", getTermClassTag(d)].join(" ") })
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this))
		.text( function(d) { return d } );

	// total-frequency bars (one grey line per term)
	this.svgTermBarLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTermBarLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
	this.svgTermBarLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["termFreqBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )
		.attr( "x2", function(d) { return this.line_length(termFreq[d]) }.bind(this) )

	// stacked per-topic overlay segments
	var stackedData = this.prepareStackedBars();
	var colors = this.parentModel.get("colorList");

	this.overlayLayer.selectAll( "g" ).data(stackedData).exit().remove();
	this.overlayLayer.selectAll( "g" ).data(stackedData).enter().append("svg:g")
	this.gLayer = this.overlayLayer.selectAll( "g" ).data(stackedData)
		.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) )

	// one line segment per (topic layer, term): offset by the stacked y0
	this.gLayer.selectAll("line").data(function(d) {return d;}).exit().remove();
	this.gLayer.selectAll("line").data(function(d, i) { return d;}).enter().append("svg:line")
		.attr("y1", 0)
		.attr("y2", 0)
	this.gLayer.selectAll("line").data(function(d, i) { return d;})
		.attr("class", function(d,i){return ["line", getTermClassTag(termIndex[i])].join(" ") })
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr("x1", function(d){ return this.line_length(d.y0)}.bind(this) )
		.attr("x2", function(d){return this.line_length(d.y0) + this.line_length(d.y)}.bind(this) )

	// topical hover bars (start collapsed; highlight() stretches them)
	this.svgTopicalBarLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTopicalBarLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
		.attr( "x2", this.line_length(0) )
	this.svgTopicalBarLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["topicalFreqBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )

	// per-term highlight bars (full-length, toggled via CSS class)
	this.svgTermHighlightLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTermHighlightLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
		.style( "fill" , "none")
	this.svgTermHighlightLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["termHighlightBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )
		.attr( "x2", function(d) { return this.line_length(termFreq[d]) }.bind(this) )
};

// interactions
/**
 * Calls appropriate functions to deal with topic highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { int } value is the target topic
 * @return { void }
 */
TermFrequencyView.prototype.onHighlightTopicChanged = function( model, value ) {
	var topic = value;
	if(topic === null)
		this.unhighlight( false, true );
	else
		this.highlight( null, topic );
};
/**
 * Calls appropriate functions to deal with term highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { string } value is the target term
 * @return { void }
 */
TermFrequencyView.prototype.onHighlightTermChanged = function( model, value ) {
	var term = value;
	if(term === "")
		this.unhighlight( true, false );
	else
		this.highlight( term, null );
};
/**
 * Unhighlights elements based on term and/or topic
 *
 * `term` and `topic` are booleans here (which target kind to clear);
 * the actual term/topic cleared is the one remembered in
 * this.highlightedTerm / this.highlightedTopic.
 *
 * @private
 */
TermFrequencyView.prototype.unhighlight = function( term, topic ) {
	// unhighlight term
	if( term ){
		term = this.highlightedTerm;
		this.highlightedTerm = null;
		this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, false)

		this.svgTermHighlightLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, false)
	}

	// unhighlight topic
	if( topic ){
		topic = this.highlightedTopic;
		var termIndex = this.parentModel.get("termIndex");
		var topicals = this.parentModel.getTopicalsForTopic(topic);
		this.highlightedTopic = null;

		// clear labels (and hover bars) for every term visible in this topic
		for( var i = 0; i < termIndex.length; i++){
			var term = termIndex[i];
			if( topicals[i]> THRESHHOLD ){
				this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
					.classed(this.colorClassPrefix + HIGHLIGHT, false)

				if( this.useOffset ){
					// make highlight bars invisible by collapsing them to zero length
					this.svgTopicalBarLayer.selectAll("." + getTermClassTag(term))
						.classed(this.colorClassPrefix + HIGHLIGHT, false)
						.attr( "x2", this.line_length(0))
						.attr( "x1", this.line_length(0));
				}
			}
		}

		// reset layers to their per-topic selection colors
		var colors = this.parentModel.get("colorList");
		this.gLayer = this.overlayLayer.selectAll( "g" )
			.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) );

		// reset variables
		this.prevHighlightColor = this.normalColor;
		this.useOffset = false;
	}
};
/**
 * Highlights elements based on term and/or topic
 *
 * Exactly one of `term` / `topic` is expected to be non-null.
 *
 * @private
 */
TermFrequencyView.prototype.highlight = function( term, topic ) {
	// highlight term
	if( term !== null ){
		this.highlightedTerm = term;
		this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, true)

		this.svgTermHighlightLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, true)
	}
	// highlight topic
	else if( topic !== null ){
		var termIndex = this.parentModel.get("termIndex");
		var topicals = this.parentModel.getTopicalsForTopic(topic);
		this.highlightedTopic = topic;

		// highlight labels
		var stackedData = this.prepareStackedBars();
		var selectedTopics = this.parentModel.get("selectedTopics");
		var colors = this.parentModel.get("colorList").slice();

		// decide how to "highlight" bars
		if( selectedTopics[topic] !== null){
			// previously selected topic: recolor its existing overlay layer
			this.prevHighlightColor = selectedTopics[topic];
			colors[colors.indexOf(selectedTopics[topic])] = HIGHLIGHT;
			this.gLayer = this.overlayLayer.selectAll( "g" )
				.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) );
		} else {
			// unselected topic: draw an extra bar offset past the stacked total
			this.useOffset = true;
		}
		for( var i = 0; i < termIndex.length; i++){
			var term = termIndex[i];
			if( topicals[i]> THRESHHOLD ){
				this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
					.classed(this.colorClassPrefix + HIGHLIGHT, true)

				// highlight bars
				if( this.useOffset ) {
					// use the offset computed by prepareStackedBars()
					var offset = 0;
					if( this.totalOffsets.length > 0)
						offset = this.totalOffsets[i];

					this.svgTopicalBarLayer.selectAll("." + getTermClassTag(term))
						.classed(this.colorClassPrefix + HIGHLIGHT, true)
						.attr( "x2", this.line_length(offset + topicals[i]))
						.attr( "x1", this.line_length(offset));
				}
			}
		}
	}
};
/*
	TermTopicMatrixView.js

	This view is responsible for generating the term:topic similarity matrix.

	Details:
	--------
	Pulls list of ordered terms, topics, and similarity values from
	FilteredTermTopicProbabilityModel.

	Additionally, uses parameters defined in ViewParameters.js.
*/
// Padding and overall-size helpers for the matrix container.
var MATRIX_CONTAINER_PADDING = {
	left_separation: 8,
	top_separation: 5,
	left: 110,
	right: 20,
	top: 60,
	bottom: 60,
	fullWidth : function( numTopics ) { return this.left + this.right + MATRIX_ENCODING_PARAMETERS.packing() * numTopics },
	fullHeight : function( numTopics, numTerms ) { return this.top + this.bottom + MATRIX_ENCODING_PARAMETERS.packing() * numTerms }
};

// Visual-encoding parameters for the matrix of circles.
// NOTE(review): as in TermFrequencyView, packing() always returns 12 and
// ignores the DENSE/LOOSE constants.
var MATRIX_ENCODING_PARAMETERS = {
	NUM_TOPICS : 0,
	NUM_TERMS : 0,
	MATRIX : null,
	setNumTopics : function(numTopics) { this.NUM_TOPICS = numTopics; },
	setNumTerms : function(numTerms) { this.NUM_TERMS = numTerms; },
	setMatrix : function(matrix) { this.MATRIX = matrix; },
	DENSE_NUM_TOPICS: 50,
	LOOSE_NUM_TOPICS: 20,
	DENSE_PACKING: 12,
	LOOSE_PACKING: 18,
	packing : function()
	{
		return 12;
	},
	TARGET_PIXEL_DENSITY : 0.20,
	// Compute a radius scale factor so the circles' total area covers
	// TARGET_PIXEL_DENSITY of the full matrix area.
	radius : function( sparseMatrix, numTopics, numTerms ) // matrix view
	{
		var totalCirclePixels = 0.0;
		for ( var i in sparseMatrix )
			totalCirclePixels += sparseMatrix[i].value * Math.PI;
		// Add up # pixels: prob * Math.PI;
		var totalMatrixPixels = numTopics * numTerms * this.packing() * this.packing();

		var targetPixels = ( totalMatrixPixels * this.TARGET_PIXEL_DENSITY );
		var observedPixels = totalCirclePixels;
		var areaScale = targetPixels / observedPixels;
		var radiusScale = Math.sqrt( areaScale );

		// NOTE(review): this recomputation is unused — its result is never read.
		var totalCirclePixels = 0.0;
		for ( var i in sparseMatrix )
			totalCirclePixels += radiusScale * radiusScale * ( sparseMatrix[i].value ) * Math.PI;

		return radiusScale;
	}
};

var TermTopicMatrixView = Backbone.View.extend({
	initialize : function() {
		// Model supplying termIndex / topicIndex / sparseMatrix (set via initModel).
		this.parentModel = null;

		// encodings
		this.xs = null;  // linear scale: topic column -> x position
		this.ys = null;  // linear scale: term row -> y position
		this.rs = null;  // sqrt scale: probability -> circle radius

		// svg layers
		this.svg = null;
		this.xGridlineLayer = null;
		this.yGridlineLayer = null;
		this.matrixLayer = null;
		this.leftLabelLayer = null;
		this.topLabelLayer = null;

		// interaction variables
		this.selectedTopics = [];  // per-topic selection color class
		this.normalColor = "normal";

		this.highlightedTerm = null;
		this.highlightedTopic = null;

		// selection colors restored from saved state, if any
		this.receivedColors = null;

	}
});
/**
 * Initialize matrix view's parent model
 *
 * @private
 */
TermTopicMatrixView.prototype.initModel = function( model ) {
	this.parentModel = model;
};

/**
 * Receives information about selected topics that were restored from saved state
 *
 */
TermTopicMatrixView.prototype.receiveSelectedTopics = function( obj ){
	this.receivedColors = obj;
};

/**
 * Initialize/render matrix view's elements for the first time,
 * then re-apply any topic selections restored from saved state.
 *
 * @private
 */
TermTopicMatrixView.prototype.load = function(){
	this.renderInit();
	this.renderUpdate();

	for( var obj in this.selectedTopics ){
		this.selectTopic(obj, this.selectedTopics[obj]);
	}
};

/**
 * Initialize all topics' selection color to DEFAULT (used by renderInit only)
 *
 * @private
 */
TermTopicMatrixView.prototype.defaultSelection = function(){
	var topicIndex = this.parentModel.get("topicIndex");
	for( var i = 0; i < topicIndex.length; i++ ){
		this.selectedTopics[i] = this.normalColor;
		// Restored state, when present, overrides the default color.
		if( this.receivedColors !== null && this.receivedColors[i] !== undefined){
			this.selectedTopics[i] = this.receivedColors[i];
		}
	}
};

/**
 * Initialize matrix view's elements
 *	-svg layers
 *	-encoders
 *	-etc.
 *
 * @private
 */
TermTopicMatrixView.prototype.renderInit = function(){
	var matrix = this.parentModel.get("sparseMatrix");
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	this.defaultSelection();

	this.xs = d3.scale.linear();
	this.ys = d3.scale.linear();

	// sqrt scale so circle AREA (not radius) is proportional to value
	this.rs = d3.scale.sqrt()
		.domain( [ 0, 1 ] )
		.range( [ 0, MATRIX_ENCODING_PARAMETERS.radius( matrix, topicIndex.length, termIndex.length ) ] );

	var container = d3.select( this.el );
	this.svg = container.append( "svg:svg" )

	this.initMatrixView();
	this.initTopLabelView();
	this.initLeftLabelView();
};

/**
 * Update matrix view's elements based on parent model's termIndex, topicIndex, and matrix
 *
 * @private
 */
TermTopicMatrixView.prototype.renderUpdate = function(){
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	this.xs
		.domain( [ 0, topicIndex.length ] )
		.range( [ MATRIX_CONTAINER_PADDING.left, MATRIX_CONTAINER_PADDING.left + topicIndex.length * MATRIX_ENCODING_PARAMETERS.packing() ] );
	this.ys
		.domain( [ 0, termIndex.length ] )
		.range( [ MATRIX_CONTAINER_PADDING.top, MATRIX_CONTAINER_PADDING.top + termIndex.length * MATRIX_ENCODING_PARAMETERS.packing() ] );
	this.svg
		.style( "width", MATRIX_CONTAINER_PADDING.fullWidth( topicIndex.length ) + "px" )
		.style( "height", MATRIX_CONTAINER_PADDING.fullHeight( topicIndex.length, termIndex.length ) + "px" )

	this.updateMatrixView();
	this.updateTopLabelView();
	this.updateLeftLabelView();
};

/**
 * Init and update functions for each layer
 *
 * @private
 */
TermTopicMatrixView.prototype.initMatrixView = function(){
	this.xGridlineLayer = this.svg.append( "svg:g" ).attr( "class", "xGridlineLayer" );
	this.yGridlineLayer = this.svg.append( "svg:g" ).attr( "class", "yGridlineLayer" );
	this.matrixLayer = this.svg.append( "svg:g" ).attr( "class", "matrixLayer" );
};
TermTopicMatrixView.prototype.updateMatrixView = function(){
	var matrix = this.parentModel.get("sparseMatrix");
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	// circles: one per non-zero term/topic entry (d3 data-join pattern)
	this.matrixLayer.selectAll( "circle" ).data( matrix ).exit().remove();
	this.matrixLayer.selectAll( "circle" ).data( matrix ).enter().append( "svg:circle" )
		.on( "mouseout", function() { this.trigger( "mouseout:term", ""); this.trigger( "mouseout:topic", null); }.bind(this) )
	this.matrixLayer.selectAll( "circle" ).data( matrix )
		.attr( "class", function(d) { return [ "matrixElement", this.selectedTopics[d.topicIndex], getTopicClassTag(d.topicName), getTermClassTag(d.term) ].join(" ") }.bind(this))
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d.term); this.trigger( "mouseover:topic", d.topicIndex); }.bind(this) )
		.on( "click", function (d) { this.trigger( "click:topic", d.topicIndex ) }.bind(this))
		.attr( "cx", function(d) { return this.xs(d.topicIndex+0.5) }.bind(this) )
		.attr( "cy", function(d) { return this.ys(d.termIndex+0.5) }.bind(this) )
		.attr( "r", function(d) { return this.rs(d.value) }.bind(this) )

	// horizontal gridlines (one per term row)
	this.xGridlineLayer.selectAll( "line" ).data( termIndex ).exit().remove();
	this.xGridlineLayer.selectAll( "line" ).data( termIndex ).enter().append( "svg:line" )
		.attr( "x1", this.xs(0.5) )
	this.xGridlineLayer.selectAll( "line" ).data( termIndex )
		.attr( "class", function(d) { return [ "verticalLine", this.normalColor, getTermClassTag(d) ].join(" ") }.bind(this))
		.attr( "x2", this.xs(topicIndex.length-0.5) )
		.attr( "y1", function(d,i) { return this.ys(i+0.5) }.bind(this) )
		.attr( "y2", function(d,i) { return this.ys(i+0.5) }.bind(this) )

	// vertical gridlines (one per topic column, colored by selection state)
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex ).exit().remove();
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex ).enter().append( "svg:line" )
		.attr( "y1", this.ys(0.5) )
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex )
		.attr( "class", function(d, i) { return [ "verticalLine", this.selectedTopics[i], getTopicClassTag(d)].join(" ") }.bind(this))
		.attr( "x1", function(d,i){ return this.xs(i+0.5) }.bind(this) )
		.attr( "x2", function(d,i){ return this.xs(i+0.5) }.bind(this) )
		.attr( "y2", this.ys(termIndex.length-0.5) )
};
TermTopicMatrixView.prototype.initTopLabelView = function(){
	this.topLabelLayer = this.svg.append( "svg:g" )
		.attr( "class", "topLabelLayer" );
};
}.bind(this)) 247 | .on( "mouseover", function(d, i) { this.trigger( "mouseover:topic", i ) }.bind(this)) 248 | .attr( "transform", function(d,i) { return "translate(" + this.xs(i+0.5) + "," + (this.ys(0)-MATRIX_CONTAINER_PADDING.top_separation) + ") rotate(270)" }.bind(this) ) 249 | .text( function(d) { return d } ) 250 | .on( "click", function(d, i) { 251 | dblclickTimer = setTimeout(function(){ clickWork(d, i)}, 200); 252 | }) 253 | .on( "dblclick", function(d, i){ 254 | clearTimeout(dblclickTimer); 255 | dblclickTimer = null; 256 | this.trigger( "doubleClick:topic", i) 257 | }.bind(this)) 258 | 259 | var clickWork = function(d, i) { 260 | if(dblclickTimer === null) 261 | return; 262 | else { 263 | this.trigger( "click:topic", i) 264 | } 265 | }.bind(this); 266 | }; 267 | TermTopicMatrixView.prototype.initLeftLabelView = function(){ 268 | this.leftLabelLayer = this.svg.append( "svg:g" ) 269 | .attr( "class", "leftLabelLayer" ); 270 | }; 271 | TermTopicMatrixView.prototype.updateLeftLabelView = function(){ 272 | var termIndex = this.parentModel.get("termIndex"); 273 | 274 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ).exit().remove(); 275 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ).enter().append( "svg:text" ) 276 | .on( "mouseout", function() { this.trigger( "mouseout:term", "") }.bind(this)) 277 | .attr( "y", 3 ) 278 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ) 279 | .attr( "class", function(d) { return ["leftLabel", this.normalColor, getTermClassTag(d)].join(" ") }.bind(this)) 280 | .on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this)) 281 | .attr( "transform", function(d,i) { return "translate(" + (this.xs(0)-MATRIX_CONTAINER_PADDING.left_separation) + "," + this.ys(i+0.5) + ")" }.bind(this) ) 282 | .text( function(d) { return d } ) 283 | }; 284 | /** end init and update functions **/ 285 | 286 | /** 287 | * Updates the view (public encapsulation used in index.html) 288 | */ 289 | 
TermTopicMatrixView.prototype.update = function() {
    this.renderUpdate();
};


// Interactions
/**
 * Calls appropriate functions to deal with term highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { string } value is the target term ("" means "no term selected")
 * @return { void }
 */
TermTopicMatrixView.prototype.onSelectionTermChanged = function( model, value ) {
    var term = value;
    if(term === "")
        this.unhighlight( true, false );
    else
        this.highlight( term, null );
};
/**
 * Calls appropriate functions to deal with topic highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { int } value is the target topic index (null means "no topic")
 * @return { void }
 */
TermTopicMatrixView.prototype.onSelectionTopicChanged = function( model, value ) {
    var topic = value;
    if(topic === null)
        this.unhighlight( false, true );
    else
        this.highlight( null, topic );
};

/**
 * Highlights elements based on term and/or topic
 *
 * @param { string|null } term   term to highlight, or null to skip terms
 * @param { int|null }    topic  topic index to highlight, or null to skip topics
 * @private
 */
TermTopicMatrixView.prototype.highlight = function( term, topic ) {
    if( term !== null ){
        this.highlightedTerm = term;
        this.svg.selectAll("." + getTermClassTag(term))
            .classed(HIGHLIGHT, true)
    }

    if( topic !== null ){
        var topicIndex = this.parentModel.get("topicIndex");
        var termIndex = this.parentModel.get("termIndex");
        var matrix = this.parentModel.get("matrix");

        this.highlightedTopic = topic;
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topic]))
            .classed(HIGHLIGHT, true)

        // Also highlight the left-hand labels of terms that are prominent
        // in this topic (value above the display threshold).
        // NOTE: loop variable renamed so it no longer shadows the "term"
        // parameter of this function.
        for( var i = 0; i < termIndex.length; i++){
            var rowTerm = termIndex[i];
            if( matrix[i][topic] > THRESHHOLD ){
                this.leftLabelLayer.selectAll("." + getTermClassTag(rowTerm))
                    .classed(HIGHLIGHT, false || true)
            }
        }
    }
};
/**
 * Unhighlights elements based on term and/or topic
 *
 * @param { boolean } term   when true, clear the current term highlight
 * @param { boolean } topic  when true, clear the current topic highlight
 * @private
 */
TermTopicMatrixView.prototype.unhighlight = function( term, topic ) {
    if( term && this.highlightedTerm !== null){
        this.svg.selectAll("." + getTermClassTag(this.highlightedTerm))
            .classed(HIGHLIGHT, false)

        this.highlightedTerm = null;
    }

    // BUGFIX: the guard previously read "this.hightlightedTopic" (typo),
    // which is always undefined, so the branch ran even when no topic was
    // highlighted and selected elements by the class tag of "undefined".
    if( topic && this.highlightedTopic !== null){
        var topicIndex = this.parentModel.get("topicIndex");
        var termIndex = this.parentModel.get("termIndex");
        var matrix = this.parentModel.get("matrix");

        var topicNo = this.highlightedTopic;
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topicNo]))
            .classed(HIGHLIGHT, false)

        // Clear the label highlights set by highlight() for this topic.
        for( var i = 0; i < termIndex.length; i++){
            var rowTerm = termIndex[i];
            if( matrix[i][topicNo] > THRESHHOLD ){
                this.leftLabelLayer.selectAll("." + getTermClassTag(rowTerm))
                    .classed(HIGHLIGHT, false)
            }
        }

        this.highlightedTopic = null;
    }
};

/**
 * Calls appropriate functions to deal with topic selection event elements
 *
 * @param { object } obj contains both target topic index and associated color
 * @return { void }
 */
TermTopicMatrixView.prototype.clickTopic = function( obj ){
    this.selectTopic(obj.topic, obj.color);
};
/**
 * topic selection behavior: swaps the topic's current color class for the
 * requested one on every element tagged with that topic, and records it.
 *
 * @param { int|null } topic       topic index to recolor (null is a no-op)
 * @param { string }   colorClass  CSS color class; DEFAULT maps to normalColor
 * @private
 */
TermTopicMatrixView.prototype.selectTopic = function( topic, colorClass ) {
    var topicIndex = this.parentModel.get("topicIndex");
    if( topic !== null){

        if( colorClass === DEFAULT)
            colorClass = this.normalColor;

        var oldColor = this.selectedTopics[topic];

        // set new color: remove the old color class, add the new one
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topic]))
            .classed(oldColor, false)
            .classed(colorClass, true)

        this.selectedTopics[topic] = colorClass;
    }
};