├── pipeline ├── __init__.py ├── stmt │ ├── generate-topic-index.py │ ├── extract-doc-index.py │ ├── extract-term-freqs.py │ ├── lda-learn.sh │ ├── generate-label-term-distributions.py │ └── lda-learn.scala ├── prepare_vis_for_client.sh ├── train_mallet.sh ├── train_stmt.sh ├── utf8_utils.py ├── io_utils.py ├── import_mallet.py ├── tokenize.py ├── compute_saliency.py ├── prepare_data_for_client.py ├── import_stmt.py ├── api_utils.py ├── compute_similarity.py └── compute_seriation.py ├── client-src ├── web.sh ├── FullTermTopicProbabilityModel.js ├── termite.css ├── SeriatedTermTopicProbabilityModel.js ├── ViewParameters.js ├── InteractionObjects.css ├── UserControlViews.js ├── QueryString.js ├── TermFrequencyModel.js ├── StateModel.js ├── html5slider.js ├── FilteredTermTopicProbabilityModel.js ├── index.html ├── TermFrequencyView.js └── TermTopicMatrixView.js ├── .gitignore ├── README.md ├── CHANGE_LOG ├── example.cfg ├── LICENSE ├── setup.sh ├── README.old └── execute.py /pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | -------------------------------------------------------------------------------- /client-src/web.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Starting a local web server at http://localhost:8888/" 4 | python -m SimpleHTTPServer 8888 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.pyc 3 | *.min.js 4 | /lib/ 5 | /client-lib/ 6 | /client-src/d3.v3.js 7 | /client-src/jquery.js 8 | /client-src/underscore.js 9 | /client-src/backbone.js 10 | /client-src/data 11 | /mallet-* 12 | /stmt-* 13 | -------------------------------------------------------------------------------- /client-src/FullTermTopicProbabilityModel.js: 
-------------------------------------------------------------------------------- 1 | /* 2 | FullTermTopicProbabilityModel.js 3 | 4 | Currently does nothing. 5 | 6 | Designed to take in complete list of terms, topics, and matrix. 7 | 8 | Passes subset of complete inputs to SeriatedTermTopicProbabilityModel. 9 | */ 10 | 11 | 12 | function FullTermTopicProbabilityModel() 13 | { 14 | // nothing here for now 15 | } 16 | -------------------------------------------------------------------------------- /pipeline/stmt/generate-topic-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate topic-index.txt' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | parser.add_argument( 'topicCount', type = int, help = 'Number of topics' ) 9 | args = parser.parse_args() 10 | path = args.path 11 | topicCount = args.topicCount 12 | 13 | f = "{}/topic-index.txt".format( path ) 14 | w = open( f, 'w' ) 15 | for i in range( topicCount ) : 16 | w.write( 'Topic {}\n'.format( i+1 ) ) 17 | w.close() 18 | -------------------------------------------------------------------------------- /pipeline/stmt/extract-doc-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate doc-index.txt from document-topic-distributions.csv' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | args = parser.parse_args() 9 | path = args.path 10 | 11 | lines = open( '{}/document-topic-distributions.csv'.format( path ) ).read().splitlines() 12 | writer = open( '{}/doc-index.txt'.format( path ), 'w' ) 13 | for line in lines : 14 | values = line.split( ',' ) 15 | writer.write( '{}\n'.format( values[0] ) 
) 16 | writer.close() 17 | -------------------------------------------------------------------------------- /pipeline/stmt/extract-term-freqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser( description = 'Generate label-term-distributions.csv from topic-term-distributions.csv.' ) 7 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 8 | args = parser.parse_args() 9 | path = args.path 10 | 11 | lines = open( '{}/term-counts.csv'.format( path ) ).read().splitlines() 12 | writer = open( '{}/term-freqs.txt'.format( path ), 'w' ) 13 | for line in lines : 14 | values = line.split( ',' ) 15 | writer.write( '{}\t{}\n'.format( values[0], values[1] ) ) 16 | writer.close() 17 | -------------------------------------------------------------------------------- /pipeline/prepare_vis_for_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Copies files necessary to run the client to the specified path's public_html directory 4 | 5 | EXPECTED_ARGS=1 6 | if [ $# -lt $EXPECTED_ARGS ] 7 | then 8 | echo "Usage: `basename $0` project_path" 9 | exit -1 10 | fi 11 | 12 | ROOT=$1 # path to public_html 13 | CLIENT_SRC=client-src/ 14 | CLIENT_LIB=client-lib/ 15 | 16 | echo "Copying js files..." 17 | for JS_FILE in d3.v3 jquery backbone underscore FullTermTopicProbabilityModel SeriatedTermTopicProbabilityModel FilteredTermTopicProbabilityModel TermFrequencyModel TermTopicMatrixView TermFrequencyView ViewParameters StateModel UserControlViews QueryString html5slider 18 | do 19 | cp $CLIENT_LIB/$JS_FILE.min.js $ROOT/public_html/ 20 | done 21 | 22 | echo "Copying CSS file..." 23 | for CSS_FILE in InteractionObjects termite 24 | do 25 | cp $CLIENT_SRC/$CSS_FILE.css $ROOT/public_html/ 26 | done 27 | 28 | echo "Copying local server file..." 
29 | cp $CLIENT_SRC/web.sh $ROOT/public_html/ 30 | 31 | echo "Copying HTML file..." 32 | cp $CLIENT_SRC/index.html $ROOT/public_html/ 33 | 34 | # rename HTML's imported javascript files to use the minified versions 35 | echo "Renaming library dependencies in HTML file..." 36 | sed -i='' 's|\.js|.min.js|g' $ROOT/public_html/index.html 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Current Development 2 | =================== 3 | 4 | Starting in 2014, we have split Termite into two components: 5 | * **[Termite Data Server](http://github.com/uwdata/termite-data-server)** for processing the output of topic models and distributing the content as a web service 6 | * **[Termite Visualizations](http://github.com/uwdata/termite-visualizations)** for visualizing topic model outputs in a web browser 7 | 8 | Our goals are to: 9 | * support multiple topic modeling tools 10 | * reduce the cost of developing new visualizations through shared infrastructure 11 | * allow multiple visualizations to interact with any number of topic modeling software and with other visualizations 12 | 13 | Please see the respective repositories for the latest software and additional information. 14 | 15 | Termite 16 | ------- 17 | 18 | Termite is a visualization tool for inspecting the output of statistical topic models based on the techniques described in the following publication. For more details about this repository, see the file "README.old". 19 | 20 | **Termite: Visualization Techniques for Assessing Textual Topic Models** 21 | Jason Chuang, Christopher D. 
Manning, Jeffrey Heer 22 | Computer Science Dept, Stanford University 23 | http://vis.stanford.edu/papers/termite 24 | 25 | -------------------------------------------------------------------------------- /CHANGE_LOG: -------------------------------------------------------------------------------- 1 | Termite Topic Model Visualization 2 | Jason Chuang, Ashley Jin 3 | http://termite.stanford.edu 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | Version 1.0 (Released on Feb 1, 2013) 8 | 9 | Data processing pipeline: 10 | - Tokenize a text corpus. 11 | - Build a topic model using MALLET or STMT. 12 | - Compute term similarity and saliency statistics. 13 | - Pre-compute term seriation. 14 | - Generate a visualization viewable in a web browser. 15 | 16 | Visualization: 17 | - Display topical term frequency using a tabular circular view. 18 | - Display global term frequency using a bar chart. 19 | - Embed (save/load) visualization states using URL. 20 | - Options for selecting the number of frequent/salient terms to display. 21 | 22 | -------------------------------------------------------------------------------- 23 | 24 | Version 1.1 (Released on March 30, 2013) 25 | 26 | Data processing pipeline: 27 | - Updated similarity computation. 28 | - Fixed minor bugs. 29 | 30 | Visualization: 31 | - Select and color latent topics. 32 | - Display top terms belonging to selected topics. 33 | - Re-order terms by topical frequency. 34 | - Brushing-n-linking on mouse over. 
35 | 36 | -------------------------------------------------------------------------------- 37 | -------------------------------------------------------------------------------- /pipeline/train_mallet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EXPECTED_ARGS=3 4 | if [ $# -lt $EXPECTED_ARGS ] 5 | then 6 | echo "Usage: `basename $0` input-file output-path num-topics" 7 | exit -1 8 | fi 9 | 10 | MALLET=mallet-2.0.7/ 11 | INPUT=$1 12 | OUTPUT=$2 13 | TOPICS=$3 14 | 15 | 16 | 17 | echo "--------------------------------------------------------------------------------" 18 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 19 | echo 20 | 21 | if [ ! -d $OUTPUT ]; then 22 | echo "Creating output folder..." 23 | mkdir $OUTPUT 24 | fi 25 | 26 | echo "Importing data into Mallet..." 27 | $MALLET/bin/mallet import-file \ 28 | --input $INPUT \ 29 | --output $OUTPUT/text.vectors \ 30 | --line-regex "^(\S*)\t(.*)$" \ 31 | --token-regex "\S+" \ 32 | --name 0 --label 1 --data 2 \ 33 | --remove-stopwords true --encoding utf-8 --keep-sequence 34 | # --remove-stopwords false --encoding utf-8 --keep-sequence 35 | 36 | echo "Learning latent topics..." 
37 | $MALLET/bin/mallet train-topics \ 38 | --input $OUTPUT/text.vectors \ 39 | --output-model $OUTPUT/output.model \ 40 | --output-topic-keys $OUTPUT/output-topic-keys.txt \ 41 | --topic-word-weights-file $OUTPUT/topic-word-weights.txt \ 42 | --word-topic-counts-file $OUTPUT/word-topic-counts.txt \ 43 | --num-topics $TOPICS 44 | 45 | echo "--------------------------------------------------------------------------------" 46 | -------------------------------------------------------------------------------- /client-src/termite.css: -------------------------------------------------------------------------------- 1 | body { 2 | background-color: #ccc; 3 | cursor: default; 4 | } 5 | div, p { 6 | padding: 0; 7 | margin: 0; 8 | border: 0; 9 | } 10 | 11 | #pageBackground { 12 | padding: 20px; 13 | } 14 | #pageFrame { 15 | border: 1px solid #999; 16 | box-shadow: 0 0 25px #999; 17 | background-color: #f3f3f3; 18 | } 19 | #pageHeader { 20 | border-bottom: 1px solid #999; 21 | } 22 | 23 | #pageLogo { 24 | font-family: Georgia; 25 | padding: 20px 150px 20px 30px; 26 | } 27 | #pageLogo .title { 28 | font-weight: bold; 29 | font-size: 18pt; 30 | } 31 | #pageLogo .subtitle { 32 | font-size: 12pt; 33 | color: #666; 34 | } 35 | #pageLogo .credits { 36 | font-size: 9pt; 37 | color: #666; 38 | } 39 | 40 | #pageControl { 41 | color: #666; 42 | font-family: Verdana; 43 | font-size: 8pt; 44 | padding: 10px 20px; 45 | border-left: 1px solid #999; 46 | } 47 | 48 | #pageHeader .headerObject { 49 | display: inline-block; 50 | vertical-align: top; 51 | } 52 | #pageHeader .headerObject div.line { 53 | padding: 0; 54 | margin: 0; 55 | height: 20px; 56 | } 57 | #pageContent { 58 | padding: 20px 40px; 59 | background-color: #fff; 60 | } 61 | #pageDetails { 62 | color: #999; 63 | font-family: Verdana; 64 | font-size: 8pt; 65 | } 66 | 67 | #pageFooter { 68 | color: #666; 69 | font-family: Verdana; 70 | font-size: 8pt; 71 | border-top: 1px solid #999; 72 | padding: 5px; 73 | } 
-------------------------------------------------------------------------------- /example.cfg: -------------------------------------------------------------------------------- 1 | [Corpus] 2 | 3 | # Currently only support one format: file 4 | # In the future: file, folder, lucene 5 | 6 | format = file 7 | path = corpus/example-documents.txt 8 | 9 | ### these both work for unicode encoded corpus files: 10 | # tokenization = [^ ]+ 11 | tokenization = whitespace 12 | 13 | 14 | # ----------------------------------------------------------------------------- 15 | 16 | [TopicModel] 17 | 18 | # Two topic models 19 | # Supported libraries: mallet, stmt 20 | library = mallet 21 | ; library = stmt 22 | 23 | # Path to save topic model outputs 24 | path = output/example-project/topic-model 25 | 26 | # Number of topics to train 27 | num_topics = 20 28 | 29 | # ----------------------------------------------------------------------------- 30 | 31 | [Termite] 32 | 33 | # Currently only support one format: file 34 | # In the future: file, database 35 | format = file 36 | 37 | # Path to save Termite-internal working files 38 | path = output/example-project 39 | 40 | # Number of terms to seriate 41 | number_of_seriated_terms = 400 42 | 43 | # ----------------------------------------------------------------------------- 44 | 45 | [Misc] 46 | 47 | # Miscellaneous program configurations 48 | 49 | ;logging = 10 # Display all debug messages 50 | ;logging = 20 # Display info messages 51 | ;logging = 30 # Display only warnings 52 | ;logging = 40 # Display only errors 53 | -------------------------------------------------------------------------------- /pipeline/train_stmt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | EXPECTED_ARGS=3 4 | if [ $# -lt $EXPECTED_ARGS ] 5 | then 6 | echo "Usage: `basename $0` input-file output-path num-topics" 7 | exit -1 8 | fi 9 | 10 | STMT_JAR=stmt-0.4.0/ 11 | STMT_LIB=pipeline/stmt/ 12 | INPUT=$1 
13 | OUTPUT=$2 14 | TOPICS=$3 15 | ITERS=1000 16 | 17 | echo "--------------------------------------------------------------------------------" 18 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 19 | echo 20 | 21 | echo "java -Xmx2g -jar $STMT_JAR/tmt-0.4.0.jar $STMT_LIB/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS" 22 | java -Xmx2g -jar $STMT_JAR/tmt-0.4.0.jar $STMT_LIB/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS 23 | 24 | echo "Mark file iteration as 'final-iters'..." 25 | ln -s `printf '%05d' $ITERS`/ $OUTPUT/final-iters 26 | 27 | echo "Unpack topic-term distribution..." 28 | gunzip -c $OUTPUT/final-iters/topic-term-distributions.csv.gz > $OUTPUT/topic-term-distributions.csv 29 | 30 | echo "Generate topic-index (list of topics)..." 31 | $STMT_LIB/generate-topic-index.py $OUTPUT $TOPICS 32 | 33 | echo "Copy term-index (list of terms)..." 34 | cp $OUTPUT/final-iters/term-index.txt $OUTPUT/term-index.txt 35 | 36 | echo "Extract doc-index (list of documents)..." 37 | $STMT_LIB/extract-doc-index.py $OUTPUT 38 | 39 | echo "Extract list of term frequencies..." 40 | $STMT_LIB/extract-term-freqs.py $OUTPUT 41 | 42 | echo "--------------------------------------------------------------------------------" 43 | -------------------------------------------------------------------------------- /pipeline/stmt/lda-learn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check for proper number of command line args. 4 | EXPECTED_ARGS=6 5 | if [ $# -lt $EXPECTED_ARGS ] 6 | then 7 | echo "Usage: `basename $0` input-file output-path iters topics term-smoothing topic-smoothing" 8 | exit -1 9 | fi 10 | 11 | STMT_PATH=stmt-0.4.0 # do not assign to PATH: that clobbers the shell's command search path 12 | INPUT=$1 13 | OUTPUT=$2 14 | ITERS=$3 15 | TOPICS=$4 16 | TERM_SMOOTHING=$5 17 | TOPIC_SMOOTHING=$6 18 | 19 | 20 | echo "Training [ $INPUT ] --> [ $OUTPUT ]..." 
21 | echo "java -Xmx2g -jar $STMT_PATH/tmt-0.4.0.jar $STMT_PATH/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS $TERM_SMOOTHING $TOPIC_SMOOTHING" 22 | java -Xmx2g -jar $STMT_PATH/tmt-0.4.0.jar $STMT_PATH/lda-learn.scala $INPUT $OUTPUT $TOPICS $ITERS $TERM_SMOOTHING $TOPIC_SMOOTHING 23 | 24 | 25 | 26 | 27 | #echo "Generate summary page..." 28 | #stmt/summarize.py $OUTPUT stmt 29 | 30 | echo "Mark file iteration as 'final-iters'..." 31 | ln -s `printf '%05d' $ITERS`/ $OUTPUT/final-iters 32 | 33 | echo "Unpack topic-term distribution..." 34 | gunzip -c $OUTPUT/final-iters/topic-term-distributions.csv.gz > $OUTPUT/topic-term-distributions.csv 35 | 36 | 37 | 38 | echo "Generate topic-index (list of topics)..." 39 | $STMT_PATH/generate-topic-index.py $OUTPUT $TOPICS 40 | 41 | echo "Copy term-index (list of terms)..." 42 | cp $OUTPUT/final-iters/term-index.txt $OUTPUT/term-index.txt 43 | 44 | echo "Extract doc-index (list of documents)..." 45 | $STMT_PATH/extract-doc-index.py $OUTPUT 46 | 47 | echo "Extract list of term frequencies..." 48 | $STMT_PATH/extract-term-freqs.py $OUTPUT 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Leland Stanford Junior University 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 
11 | * Neither the name of the Leland Stanford Junior University nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /client-src/SeriatedTermTopicProbabilityModel.js: -------------------------------------------------------------------------------- 1 | /* 2 | SeriatedTermTopicProbabilityModel.js 3 | 4 | Currently: Reads in input file to get seriated terms, topics, term information 5 | (e.g. saliency), and matrix of similarity values. 6 | 7 | 8 | Designed to take in a subset of the full list of terms, topics, and matrix. 
9 | */ 10 | 11 | var SeriatedTermTopicProbabilityModel = Backbone.Model.extend({ 12 | defaults : { 13 | "matrix" : null, 14 | "termIndex" : null, 15 | "topicIndex" : null, 16 | "sparseMatrix" : null // currently null 17 | }, 18 | url : "data/seriated-parameters.json", 19 | initialize : function() { 20 | this.parentModel = null; 21 | } 22 | }); 23 | 24 | /** 25 | * Initialize seriated's parent model 26 | * 27 | * @private 28 | */ 29 | SeriatedTermTopicProbabilityModel.prototype.initModel = function ( fullModel ) { 30 | this.parentModel = filteredModel; 31 | }; 32 | 33 | /** 34 | * Loads matrix, termIndex, and topicIndex from the model's "url" 35 | * and triggers a loaded event that the next model (child model) listens to. 36 | * (This function is called after the state model loaded event is fired) 37 | * 38 | * @param { string } the location of datafile to load values from 39 | * @return { void } 40 | */ 41 | SeriatedTermTopicProbabilityModel.prototype.load = function () { 42 | var successHandler = function( model, response, options ) 43 | { 44 | this.trigger("loaded:seriated"); 45 | 46 | }.bind(this); 47 | var errorHandler = function( model, xhr, options ) { }.bind(this); 48 | this.fetch({ 49 | add : true, 50 | success : successHandler, 51 | error : errorHandler 52 | }); 53 | }; -------------------------------------------------------------------------------- /pipeline/utf8_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Modified from 'The Python Standard Library' 6 | 13.1. 
csv — CSV File Reading and Writing 7 | http://docs.python.org/2/library/csv.html 8 | """ 9 | 10 | import csv, codecs, cStringIO 11 | 12 | class UTF8Recoder: 13 | """ 14 | Iterator that reads an encoded stream and reencodes the input to UTF-8 15 | """ 16 | def __init__(self, f, encoding): 17 | self.reader = codecs.getreader(encoding)(f) 18 | 19 | def __iter__(self): 20 | return self 21 | 22 | def next(self): 23 | return self.reader.next().encode("utf-8") 24 | 25 | class UnicodeReader: 26 | """ 27 | A CSV reader which will iterate over lines in the CSV file "f", 28 | which is encoded in the given encoding. 29 | """ 30 | 31 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", delimiter="\t", **kwds): 32 | f = UTF8Recoder(f, encoding) 33 | self.reader = csv.reader(f, dialect=dialect, delimiter=delimiter, **kwds) 34 | 35 | def next(self): 36 | row = self.reader.next() 37 | return [unicode(s, "utf-8") for s in row] 38 | 39 | def __iter__(self): 40 | return self 41 | 42 | class UnicodeWriter: 43 | """ 44 | A CSV writer which will write rows to CSV file "f", 45 | which is encoded in the given encoding. 46 | """ 47 | 48 | def __init__(self, f, dialect=csv.excel, encoding="utf-8", delimiter="\t", **kwds): 49 | # Redirect output to a queue 50 | self.queue = cStringIO.StringIO() 51 | self.writer = csv.writer(self.queue, dialect=dialect, delimiter=delimiter, **kwds) 52 | self.stream = f 53 | self.encoder = codecs.getincrementalencoder(encoding)() 54 | 55 | def writerow(self, row): 56 | self.writer.writerow([s.encode("utf-8") for s in row]) 57 | # Fetch UTF-8 output from the queue ... 58 | data = self.queue.getvalue() 59 | data = data.decode("utf-8", "ignore") 60 | # ... 
and reencode it into the target encoding 61 | data = self.encoder.encode(data) 62 | # write to the target stream 63 | self.stream.write(data) 64 | # empty queue 65 | self.queue.truncate(0) 66 | 67 | def writerows(self, rows): 68 | for row in rows: 69 | self.writerow(row) -------------------------------------------------------------------------------- /pipeline/stmt/generate-label-term-distributions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import re 6 | 7 | parser = argparse.ArgumentParser( description = 'Generate label-term-distributions.csv from topic-term-distributions.csv.' ) 8 | parser.add_argument( 'path', type = str, help = 'Path of STMT model output' ) 9 | args = parser.parse_args() 10 | path = args.path 11 | 12 | ################################################################################ 13 | 14 | # Get topics 15 | topics = [] 16 | f = '{}/final-iters/topic-index.txt'.format( path ) 17 | for line in open( f ).read().splitlines() : 18 | topics.append( line ) 19 | 20 | # Get labels (Skip BACKGROUND) 21 | labels = [] 22 | f = '{}/final-iters/label-index.txt'.format( path ) 23 | for line in open( f ).read().splitlines() : 24 | if ( line != 'BACKGROUND' ) : 25 | labels.append( line ) 26 | 27 | ################################################################################ 28 | 29 | # Match labels and topics 30 | match = [] 31 | for i in range( len( topics ) ) : 32 | topic = topics[i] 33 | match.append( -1 ) 34 | 35 | for j in range( len( labels ) ) : 36 | label = labels[j] 37 | m = re.match( r'{} \- \d+'.format( re.escape(label) ), topic ) 38 | if m is not None: 39 | match[i] = j 40 | 41 | if ( match[i] == -1 ) : 42 | match[i] = len(labels) 43 | labels.append( "Topic{:02d}".format( len(labels)+1 ) ) 44 | 45 | #print labels 46 | #print match 47 | 48 | # Merge rows of TOPIC-term distributions 49 | tally = [] 50 | for label in labels: 
51 | tally.append( [] ) 52 | 53 | f = '{}/topic-term-distributions.csv'.format( path ) 54 | lines = open( f ).read().splitlines() 55 | assert( len(lines) == len(topics) ) 56 | for i in range( len( topics ) ) : 57 | values = lines[i].split( ',' ) 58 | for j in range( len( values ) ) : 59 | values[j] = float( values[j] ) 60 | target = match[i] 61 | if ( len( tally[target] ) == 0 ) : 62 | tally[target] = values 63 | else : 64 | for j in range( len( values ) ) : 65 | tally[target][j] += values[j] 66 | 67 | ################################################################################ 68 | 69 | # Output topics 70 | f = '{}/topic-index.txt'.format( path ) 71 | w = open( f, 'w' ) 72 | for topic in topics : 73 | w.write( topic + '\n' ) 74 | w.close() 75 | 76 | # Output labels 77 | f = '{}/label-index.txt'.format( path ) 78 | w = open( f, 'w' ) 79 | for label in labels : 80 | w.write( label + '\n' ) 81 | w.close() 82 | 83 | # Output LABEL-term distributions 84 | f = '{}/label-term-distributions.csv'.format( path ) 85 | w = open( f, 'w' ) 86 | for values in tally : 87 | for j in range( len( values ) ) : 88 | values[j] = str( values[j] ) 89 | w.write( ','.join( values ) + '\n' ) 90 | w.close() 91 | 92 | -------------------------------------------------------------------------------- /client-src/ViewParameters.js: -------------------------------------------------------------------------------- 1 | /* 2 | ViewParameters.js 3 | 4 | This file contains some final parameters for the view elements. 
5 | 6 | Parameters include: 7 | -defaults for different objects 8 | -functions to assign colors to events 9 | -functions to generate consistent class tags for objects based on term or topic 10 | */ 11 | //===================================================================================== 12 | // VIEW PARAMS 13 | //===================================================================================== 14 | var THRESHHOLD = 0.01; 15 | 16 | var HIGHLIGHT = "red"; 17 | var DEFAULT = "default"; 18 | var DESELECT = "deselect"; 19 | 20 | var colorNames = ["orange", "blue", "green", "purple", "brown", "pink"]; 21 | var colorObjs = []; 22 | 23 | /** 24 | * Initializes the color objects to all free or according to the given object 25 | * 26 | * @param { list } list of used colors (should be initialized with usage:true) 27 | */ 28 | function initColorObjects( loadObj ) { 29 | if(loadObj === null){ 30 | for( var index = 0; index < colorNames.length; index++ ) { 31 | colorObjs.push({color: colorNames[index], usage: false}); 32 | } 33 | } 34 | else{ 35 | // load some initial usage from passed object (from state) 36 | } 37 | } 38 | /** 39 | * Returns the first free color if any. 
Marks returned color as used if not DEFAULT 40 | */ 41 | function getColor() { 42 | var color = DEFAULT; 43 | for( var index = 0; index < colorObjs.length; index++ ){ 44 | if( !(colorObjs[index].usage) ){ 45 | color = colorObjs[index].color; 46 | colorObjs[index].usage = true; 47 | break; 48 | } 49 | } 50 | return color; 51 | } 52 | /** 53 | * Marks the given color as usage:false if that color name exists 54 | * 55 | * @param { string } name of color to be freed 56 | */ 57 | function freeColor( color ) { 58 | if( color !== DEFAULT ){ 59 | for( var index = 0; index < colorObjs.length; index++ ){ 60 | if( color === colorObjs[index].color){ 61 | colorObjs[index].usage = false; 62 | break; 63 | } 64 | } 65 | } 66 | }; 67 | function claimColor( color ){ 68 | if( color !== DEFAULT ){ 69 | for( var index = 0; index < colorObjs.length; index++ ){ 70 | if( color === colorObjs[index].color){ 71 | colorObjs[index].usage = true; 72 | break; 73 | } 74 | } 75 | } 76 | }; 77 | 78 | /** 79 | * consistent d3 class labeling helper functions 80 | * 81 | * @param { string, int } term or topic to use in classname 82 | * @return { string } class name based on input 83 | */ 84 | function getTopicClassTag( topic ){ 85 | return "__topic_" + sanitize(topic); 86 | } 87 | function getTermClassTag( term ){ 88 | return "__term_" + sanitize(term); 89 | } 90 | function sanitize( text ){ 91 | // Need to account for non-alphanumeric characters 92 | // Return a unique identifier for any input string 93 | return text.replace( /[^A-Za-z0-9]/g, "_" ); 94 | } 95 | /** end class labeling helper functions **/ 96 | -------------------------------------------------------------------------------- /pipeline/stmt/lda-learn.scala: -------------------------------------------------------------------------------- 1 | // tells Scala where to find the TMT classes 2 | import scalanlp.io._; 3 | import scalanlp.stage._; 4 | import scalanlp.stage.text._; 5 | import scalanlp.text.tokenize._; 6 | import 
scalanlp.pipes.Pipes.global._; 7 | 8 | import edu.stanford.nlp.tmt.stage._; 9 | import edu.stanford.nlp.tmt.model.lda._; 10 | 11 | 12 | 13 | if ( args.length < 2 ) 14 | { 15 | System.err.println( "Arguments: inputFile outputPath [numTopics] [numIters] [termSmoothing] [topicSmoothing]" ); 16 | System.err.println( " inputFile: tab-delimited file containing the training corpus" ); 17 | System.err.println( " (first column = docID, second column = text)" ); 18 | System.err.println( " outputPath: path for saving output model data" ); 19 | System.err.println( " numOfTopics: number of topics to train [default=20]" ); 20 | System.err.println( " maxIters: number of iterations to execute [default=1000]" ); 21 | System.err.println( " termSmoothing: [default=0.01]" ); 22 | System.err.println( "topicSmoothing: [default=0.01]" ); 23 | System.exit( -1 ); 24 | } 25 | 26 | 27 | val inputFile = args(0); 28 | val outputPath = args(1); 29 | val indexColumn = 1; 30 | val textColumn = 2; 31 | 32 | 33 | val numOfTopics = if ( args.length > 2 ) { args(2).toInt } else { 20 }; 34 | val maxIters = if ( args.length > 3 ) { args(3).toInt } else { 1000 }; 35 | val termSmoothing = if ( args.length > 4 ) { args(4).toDouble } else { 0.01 }; 36 | val topicSmoothing = if ( args.length > 5 ) { args(5).toDouble } else { 0.01 }; 37 | 38 | System.err.println( "LDA Learning Parameters..." 
); 39 | System.err.println( " inputFile = " + inputFile ); 40 | System.err.println( " outputPath = " + outputPath ); 41 | System.err.println( " numOfTopics = " + numOfTopics ); 42 | System.err.println( " maxIters = " + maxIters ); 43 | System.err.println( " termSmoothing = " + termSmoothing ); 44 | System.err.println( "topicSmoothing = " + topicSmoothing ); 45 | System.err.println(); 46 | 47 | 48 | val alphabetsOnly = { 49 | RegexSearchTokenizer( "[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*" ) ~> // keep tokens with alphabets 50 | CaseFolder() ~> // fold to lower case 51 | StopWordFilter( "en" ) // remove common English words 52 | } 53 | 54 | System.err.println( "Loading source text..." ); 55 | val source = TSVFile( inputFile ) ~> IDColumn( indexColumn ); 56 | val text = source ~> Column( textColumn ) ~> TokenizeWith( alphabetsOnly ) ~> TermCounter(); 57 | 58 | 59 | System.err.println( "Defining dataset and model..." ); 60 | val dataset = LDADataset( text ); 61 | 62 | 63 | val modelParams = LDAModelParams( numTopics=numOfTopics, dataset=dataset, topicSmoothing=topicSmoothing, termSmoothing=termSmoothing ); 64 | val modelPath = file( outputPath ); 65 | 66 | System.err.println( "Learning LDA topics..." ); 67 | val model = TrainCVB0LDA( modelParams, dataset, output=modelPath, maxIterations=maxIters ); 68 | val perDocTopicDistributions = InferCVB0DocumentTopicDistributions( model, dataset ); 69 | 70 | System.err.println( "Writing term counts to disk..." ); 71 | val termCounts = text.meta[ TermCounts ]; 72 | CSVFile( file( outputPath + "/term-counts.csv" ) ).write( 73 | { 74 | for ( term <- termCounts.index.iterator ) yield ( term, termCounts.getTF( term ), termCounts.getDF( term ) ) 75 | } 76 | ); 77 | 78 | //System.err.println( "Writing topics per doc..." 
) 79 | //CSVFile( file( outputPath + "/topics-per-doc.csv" ) ).write( perDocTopicDistributions ); 80 | -------------------------------------------------------------------------------- /pipeline/io_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import os 6 | from utf8_utils import UnicodeReader, UnicodeWriter 7 | 8 | def CheckAndMakeDirs( path ): 9 | if not os.path.exists( path ): 10 | os.makedirs( path ) 11 | 12 | def ReadAsList( filename ): 13 | """ 14 | Return a list of values. 15 | Each value corresponds to a line of the input file. 16 | """ 17 | data = [] 18 | with open( filename, 'r' ) as f: 19 | lines = f.read().decode( 'utf-8' ).splitlines() 20 | for line in lines: 21 | data.append( line ) 22 | return data 23 | 24 | def ReadAsVector( filename ): 25 | vector = [] 26 | with open( filename, 'r' ) as f: 27 | lines = f.read().decode( 'utf-8' ).splitlines() 28 | for line in lines: 29 | vector.append( float( line ) ) 30 | return vector 31 | 32 | def ReadAsMatrix( filename ): 33 | matrix = [] 34 | with open( filename, 'r' ) as f: 35 | lines = UnicodeReader( f ) 36 | for line in lines: 37 | matrix.append( map( float, line ) ) 38 | return matrix 39 | 40 | def ReadAsSparseVector( filename ): 41 | vector = {} 42 | with open( filename, 'r' ) as f: 43 | lines = UnicodeReader( f ) 44 | for ( key, value ) in lines: 45 | vector[ key ] = float( value ) 46 | return vector 47 | 48 | def ReadAsSparseMatrix( filename ): 49 | matrix = {} 50 | with open( filename, 'r' ) as f: 51 | lines = UnicodeReader( f ) 52 | for ( aKey, bKey, value ) in lines: 53 | matrix[ (aKey, bKey) ] = float( value ) 54 | return matrix 55 | 56 | def ReadAsJson( filename ): 57 | """ 58 | Expect a dict of values. 59 | Write dict as-is to disk as a JSON object. 
60 | """ 61 | data = None 62 | with open( filename, 'r' ) as f: 63 | data = json.load( f, encoding = 'utf-8' ) 64 | return data 65 | 66 | def WriteAsList( data, filename ): 67 | with open( filename, 'w' ) as f: 68 | for element in data: 69 | f.write( element.encode( 'utf-8' ) + '\n' ) 70 | 71 | def WriteAsVector( vector, filename ): 72 | with open( filename, 'w' ) as f: 73 | for element in vector: 74 | f.write( str( vector ) + '\n' ) 75 | 76 | def WriteAsMatrix( matrix, filename ): 77 | with open( filename, 'w' ) as f: 78 | writer = UnicodeWriter( f ) 79 | for row in matrix: 80 | writer.writerow( map( str, row ) ) 81 | 82 | def WriteAsSparseVector( vector, filename ): 83 | """ 84 | Expect a sparse vector (dict) of values. 85 | Generate a tab-delimited file, with 2 columns. 86 | Write key as the 1st column; write cell value as the 2nd column. 87 | """ 88 | sortedKeys = sorted( vector.keys(), key = lambda key : -vector[key] ) 89 | with open( filename, 'w' ) as f: 90 | writer = UnicodeWriter( f ) 91 | for key in sortedKeys: 92 | writer.writerow( [ key, str( vector[key] ) ] ) 93 | 94 | def WriteAsSparseMatrix( matrix, filename ): 95 | """ 96 | Expect a sparse matrix (two-level dict) of values. 97 | Generate a tab-delimited file, with 3 columns. 98 | Write two keys as the 1st and 2nd columns; write cell value as the 3rd column. 99 | """ 100 | sortedKeys = sorted( matrix.keys(), key = lambda key : -matrix[key] ) 101 | with open( filename, 'w' ) as f: 102 | writer = UnicodeWriter( f ) 103 | for ( aKey, bKey ) in sortedKeys: 104 | writer.writerow( [ aKey, bKey, str( matrix[ (aKey, bKey) ] ) ] ) 105 | 106 | def WriteAsJson( data, filename ): 107 | """ 108 | Expect a dict of values. 109 | Write dict as-is to disk as a JSON object. 110 | """ 111 | with open( filename, 'w' ) as f: 112 | json.dump( data, f, encoding = 'utf-8', indent = 2, sort_keys = True ) 113 | 114 | def WriteAsTabDelimited( data, filename, fields ): 115 | """ 116 | Expect a list of dict values. 
117 | Take in a list of output fields. 118 | Write specified fields to disk, as a tab-delimited file (with header row). 119 | """ 120 | with open( filename, 'w' ) as f: 121 | writer = UnicodeWriter( f ) 122 | writer.writerow( fields ) 123 | for element in data: 124 | values = [] 125 | for field in fields: 126 | if not type( element[field] ) is unicode: 127 | values.append( str( element[field] ) ) 128 | else: 129 | values.append( element[field] ) 130 | writer.writerow( values ) 131 | -------------------------------------------------------------------------------- /client-src/InteractionObjects.css: -------------------------------------------------------------------------------- 1 | /* matrix color objects */ 2 | line.normal { 3 | stroke: #808080; 4 | stroke-opacity: 0.25; 5 | stroke-width: 0.5px; 6 | } 7 | line.blue{ 8 | stroke: #1f77b4; 9 | stroke-opacity: 1; 10 | stroke-width: 0.5px; 11 | } 12 | line.orange { 13 | stroke: #ff7f0e; 14 | stroke-opacity: 1; 15 | stroke-width: 0.5px; 16 | } 17 | line.green { 18 | stroke: #2ca02c; 19 | stroke-opacity: 1; 20 | stroke-width: 0.5px; 21 | } 22 | line.purple { 23 | stroke: #9467bd; 24 | stroke-opacity: 1; 25 | stroke-width: 0.5px; 26 | } 27 | line.brown { 28 | stroke: #8c564b; 29 | stroke-opacity: 1; 30 | stroke-width: 0.5px; 31 | } 32 | line.pink { 33 | stroke: #e377c2; 34 | stroke-opacity: 1; 35 | stroke-width: 0.5px; 36 | } 37 | line.red { 38 | stroke: #933 ; 39 | stroke-opacity: 1 ; 40 | stroke-width: 0.5px ; 41 | } 42 | 43 | text { 44 | user-select: none; 45 | -webkit-user-select: none; 46 | -moz-user-select: none; 47 | font-family: Verdana; 48 | font-size: 10px; 49 | } 50 | text.topLabel { 51 | text-anchor: start; 52 | cursor: pointer; 53 | } 54 | text.leftLabel { 55 | text-anchor: end; 56 | } 57 | text.normal { 58 | fill: #808080; 59 | stroke: #808080; 60 | stroke-opacity: 0; 61 | stroke-width: 0px; 62 | } 63 | text.blue { 64 | fill: #1f77b4; 65 | stroke: #1f77b4; 66 | stroke-opacity: 1; 67 | stroke-width: 0.5px; 68 
| } 69 | text.orange { 70 | fill: #ff7f0e; 71 | stroke: #ff7f0e; 72 | stroke-opacity: 1; 73 | stroke-width: 0.5px; 74 | } 75 | text.green { 76 | fill: #2ca02c; 77 | stroke: #2ca02c; 78 | stroke-opacity: 1; 79 | stroke-width: 0.5px; 80 | } 81 | text.purple { 82 | fill: #9467bd; 83 | stroke: #9467bd; 84 | stroke-opacity: 1; 85 | stroke-width: 0.5px; 86 | } 87 | text.brown { 88 | fill: #8c564b; 89 | stroke: #8c564b; 90 | stroke-opacity: 1; 91 | stroke-width: 0.5px; 92 | } 93 | text.pink { 94 | fill: #e377c2; 95 | stroke: #e377c2; 96 | stroke-opacity: 1; 97 | stroke-width: 0.5px; 98 | } 99 | text.red { 100 | fill: #933 ; 101 | stroke: #933 ; 102 | stroke-opacity: 1 ; 103 | stroke-width: 0.5px ; 104 | } 105 | 106 | circle.normal { 107 | fill: #808080; 108 | fill-opacity: 0.4; 109 | stroke: #808080; 110 | stroke-opacity: 0.8; 111 | } 112 | circle.blue { 113 | fill: #1f77b4; 114 | fill-opacity: 0.5; 115 | stroke: #1f77b4; 116 | stroke-opacity: 1; 117 | stroke-width: 0.5px; 118 | } 119 | circle.orange { 120 | fill: #ff7f0e; 121 | fill-opacity: 0.5; 122 | stroke: #ff7f0e; 123 | stroke-opacity: 1; 124 | stroke-width: 0.5px; 125 | } 126 | circle.green { 127 | fill: #2ca02c; 128 | fill-opacity: 0.5; 129 | stroke: #2ca02c; 130 | stroke-opacity: 1; 131 | stroke-width: 0.5px; 132 | } 133 | circle.purple { 134 | fill: #9467bd; 135 | fill-opacity: 0.5; 136 | stroke: #9467bd; 137 | stroke-opacity: 1; 138 | stroke-width: 0.5px; 139 | } 140 | circle.brown { 141 | fill: #8c564b; 142 | fill-opacity: 0.5; 143 | stroke: #8c564b; 144 | stroke-opacity: 1; 145 | stroke-width: 0.5px; 146 | } 147 | circle.pink { 148 | fill: #e377c2; 149 | fill-opacity: 0.5; 150 | stroke: #e377c2; 151 | stroke-opacity: 1; 152 | stroke-width: 0.5px; 153 | } 154 | circle.red { 155 | fill: #933 ; 156 | fill-opacity: 0.5 ; 157 | stroke: #933 ; 158 | stroke-opacity: 1 ; 159 | stroke-width: 0.5px ; 160 | } 161 | /* histogram color objects */ 162 | text.termLabel{ 163 | text-anchor: end; 164 | } 165 | text.HISTnormal 
{ 166 | fill: #808080; 167 | stroke: #808080; 168 | stroke-opacity: 0; 169 | stroke-width: 0px; 170 | } 171 | text.HISTorange { 172 | fill: #ff7f0e; 173 | stroke: #ff7f0e; 174 | stroke-opacity: 1; 175 | stroke-width: 0.5px; 176 | } 177 | text.HISTred { 178 | fill: #933 ; 179 | stroke: #933 ; 180 | stroke-opacity: 1 ; 181 | stroke-width: 0.5px ; 182 | } 183 | 184 | line.termFreqBar { 185 | stroke: #808080; 186 | stroke-opacity: 0.4; 187 | stroke-width: 5px; 188 | } 189 | line.HISTnormal { 190 | stroke: #000; 191 | stroke-opacity: 0; 192 | stroke-width: 5px; 193 | } 194 | 195 | .HISTblue { 196 | fill: #1f77b4; 197 | stroke: #1f77b4; 198 | stroke-opacity: 1; 199 | stroke-width: 5px; 200 | } 201 | .HISTorange { 202 | fill: #ff7f0e; 203 | stroke: #ff7f0e; 204 | stroke-opacity: 1; 205 | stroke-width: 5px; 206 | } 207 | .HISTgreen { 208 | fill: #2ca02c; 209 | stroke: #2ca02c; 210 | stroke-opacity: 1; 211 | stroke-width: 5px; 212 | } 213 | 214 | .HISTpurple { 215 | fill: #9467bd; 216 | stroke: #9467bd; 217 | stroke-opacity: 1; 218 | stroke-width: 5px; 219 | } 220 | .HISTbrown { 221 | fill: #8c564b; 222 | stroke: #8c564b; 223 | stroke-opacity: 1; 224 | stroke-width: 5px; 225 | } 226 | .HISTpink { 227 | fill: #e377c2; 228 | stroke: #e377c2; 229 | stroke-opacity: 1; 230 | stroke-width: 5px; 231 | } 232 | .HISTred { 233 | fill: #933; 234 | stroke: #933; 235 | stroke-opacity: 1; 236 | stroke-width: 5px; 237 | } 238 | line.HISTred { 239 | fill: #933 ; 240 | stroke: #933 ; 241 | stroke-opacity: 1 ; 242 | stroke-width: 5px ; 243 | } -------------------------------------------------------------------------------- /pipeline/import_mallet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | from utf8_utils import UnicodeReader 10 | from api_utils import ModelAPI 11 | 12 | class ImportMallet( object ): 13 | 14 | 
""" 15 | Copies mallet file formats into Termite internal format. 16 | """ 17 | 18 | # Files generated by Mallet 19 | TOPIC_WORD_WEIGHTS = 'topic-word-weights.txt' 20 | 21 | def __init__( self, logging_level ): 22 | self.logger = logging.getLogger( 'ImportMallet' ) 23 | self.logger.setLevel( logging_level ) 24 | handler = logging.StreamHandler( sys.stderr ) 25 | handler.setLevel( logging_level ) 26 | self.logger.addHandler( handler ) 27 | 28 | def execute( self, model_library, model_path, data_path ): 29 | 30 | assert model_library is not None 31 | assert model_library == 'mallet' 32 | assert model_path is not None 33 | assert data_path is not None 34 | 35 | self.logger.info( '--------------------------------------------------------------------------------' ) 36 | self.logger.info( 'Importing a Mallet model...' ) 37 | self.logger.info( ' topic model = %s (%s)', model_path, model_library ) 38 | self.logger.info( ' output = %s', data_path ) 39 | 40 | self.logger.info( 'Connecting to data...' ) 41 | self.model = ModelAPI( data_path ) 42 | 43 | self.logger.info( 'Reading "%s" from Mallet...', ImportMallet.TOPIC_WORD_WEIGHTS ) 44 | self.extractTopicWordWeights( model_path ) 45 | 46 | self.logger.info( 'Writing data to disk...' 
# Method of the ImportMallet class (flattened listing).
def extractTopicWordWeights( self, model_path ):
    """
    Read Mallet's 'topic-word-weights.txt' (rows of topic, word, value)
    and store a dense term-by-topic matrix, plus term and topic indexes,
    on self.model.
    """
    data = {}
    words = []
    topics = []

    # Read in content of file (sparse matrix representation)
    filename = '{}/{}'.format( model_path, ImportMallet.TOPIC_WORD_WEIGHTS )
    with open( filename, 'r' ) as f:
        lines = UnicodeReader( f )
        for ( topic, word, value ) in lines:
            topic = int( topic )
            if topic not in data:
                data[ topic ] = {}
            data[ topic ][ word ] = float( value )
            words.append( word )
            topics.append( topic )

    # Get list of terms and topic indexes
    term_index = sorted( frozenset( words ) )
    topic_index = sorted( frozenset( topics ) )

    # Build dense matrix representation.
    # ROBUSTNESS FIX: use .get() with a 0.0 default so a (topic, term)
    # pair that is absent from the sparse input no longer raises KeyError.
    # (Mallet usually emits every pair, but this must not be assumed.)
    matrix = []
    for term in term_index :
        matrix.append( [ data[ topic ].get( term, 0.0 ) for topic in topic_index ] )

    # Generate topic labels
    topic_str_index = [ 'Topic {}'.format(d) for d in topic_index ]

    self.model.term_topic_matrix = matrix
    self.model.term_index = term_index
    self.model.topic_index = topic_str_index
) 94 | args = parser.parse_args() 95 | 96 | model_library = None 97 | model_path = None 98 | data_path = None 99 | logging_level = 20 100 | 101 | # Read in default values from the configuration file 102 | config = ConfigParser.RawConfigParser() 103 | config.read( args.config_file ) 104 | model_library = config.get( 'TopicModel', 'library' ) 105 | model_path = config.get( 'TopicModel', 'path' ) 106 | data_path = config.get( 'Termite', 'path' ) 107 | if config.has_section( 'Misc' ): 108 | if config.has_option( 'Misc', 'logging' ): 109 | logging_level = config.getint( 'Misc', 'logging' ) 110 | 111 | # Read in user-specifiec values from the program arguments 112 | if args.model_library is not None: 113 | model_library = args.model_library 114 | if args.model_path is not None: 115 | model_path = args.model_path 116 | if args.data_path is not None: 117 | data_path = args.data_path 118 | if args.logging is not None: 119 | logging_level = args.logging 120 | 121 | ImportMallet( logging_level ).execute( model_library, model_path, data_path ) 122 | 123 | if __name__ == '__main__': 124 | main() -------------------------------------------------------------------------------- /client-src/UserControlViews.js: -------------------------------------------------------------------------------- 1 | // Expects to be bound to the state model 2 | var TotalTermsView = Backbone.View.extend({ 3 | el : 'div.TotalTermsView', 4 | render : function() 5 | { 6 | d3.select(this.el).text( this.model.get("totalTerms") ); 7 | } 8 | }); 9 | 10 | // Affinity Number Terms 11 | // Need to bound to the state model 12 | var AffinityNumTermsView = Backbone.View.extend({ 13 | el : 'div.AffinityNumTermsView', 14 | render : function() 15 | { 16 | d3.select(this.el).text( this.model.get("numAffinityTerms") ); 17 | } 18 | }); 19 | 20 | // Expects to be bound to the state model 21 | var AffinityNumTermsSlider = Backbone.View.extend({ 22 | el : 'input.AffinityNumTermsSlider', 23 | events : { 24 | 'change' : 
// Salient-terms counter (read-only display); expects to be bound to the state model.
var SalientNumTermsView = Backbone.View.extend({
	el : 'div.SalientNumTermsView',
	render : function() {
		d3.select( this.el ).text( this.model.get("numSalientTerms") );
	}
});

// Slider controlling the salient-term count; expects to be bound to the state model.
// Pushes slider changes into the model and mirrors model changes back into the slider.
var SalientNumTermsSlider = Backbone.View.extend({
	el : 'input.SalientNumTermsSlider',
	events : {
		'change' : function(e) {
			this.model.set("numSalientTerms", parseInt(e.target.value));
		}
	},
	initialize : function() {
		var view = this;
		view.model.on( "change:numSalientTerms", function() {
			d3.select(view.el)[0][0].value = view.model.get("numSalientTerms");
		}, view);
	}
});

// User-defined terms that were found in the model; expects to be bound to the state model.
var FoundTermsView = Backbone.View.extend({
	el : 'div.FoundTermsView',
	render : function() {
		d3.select( this.el ).text( this.model.get("foundTerms") );
	}
});

// User-defined terms that were NOT found; expects to be bound to the state model.
// The prefix label and the list are shown only when there is something to report.
var UnfoundTermsView = Backbone.View.extend({
	el : 'div.UnfoundTermsView',
	prefix : 'div.UnfoundTermsPrefix',
	render : function() {
		var unfound = this.model.get("unfoundTerms");
		if ( unfound === "" ) {
			d3.select( this.prefix ).style("visibility", "hidden");
			d3.select( this.el ).style("visibility", "hidden");
		} else {
			d3.select( this.prefix ).style("visibility", "visible");
			d3.select( this.el ).style("visibility", "visible").text( unfound );
		}
	}
});
Backbone.View.extend({ 89 | el: 'input.UserDefinedTermsBox', 90 | events : { 91 | 'keyup' : function(e) { 92 | this.model.setVisibleTerms(e.target.value); 93 | } 94 | }, 95 | initialize : function() { 96 | this.model.on( "change:visibleTerms", function(value) { 97 | d3.select(this.el)[0][0].value = this.model.get("visibleTerms").join(", "); 98 | }, this); 99 | } 100 | }); 101 | 102 | // Expects to be bound to the state model 103 | var AddTopTwenty = Backbone.View.extend({ 104 | el: 'input.TopTwentyAddition', 105 | events : { 106 | 'change' : function(e) { 107 | this.model.set("addTopTwenty", e.target.checked); 108 | } 109 | }, 110 | initialize : function() { 111 | this.model.on( "change:addTopTwenty", function(value) { 112 | d3.select(this.el)[0][0].checked = this.model.get("addTopTwenty"); 113 | }, this); 114 | } 115 | }); 116 | 117 | // Expects to be bound to the state model 118 | var SortDescription = Backbone.View.extend({ 119 | el: 'div.SortDescription', 120 | render : function() 121 | { 122 | var sort = this.model.get("sortType"); 123 | var topic = this.model.get("doubleClickTopic"); 124 | var output = ""; 125 | if( sort === "" ) 126 | output = "default"; 127 | else if (sort === "asc") 128 | output = "ascending on topic #" + topic; 129 | else 130 | output = "descending on topic #" + topic; 131 | d3.select(this.el).text( output ); 132 | }, 133 | initialize : function() { 134 | // TODO: call render's function? 
135 | this.model.on( "change:sortType change:doubleClickTopic", function(value) { 136 | var sort = this.model.get("sortType"); 137 | var topic = this.model.get("doubleClickTopic"); 138 | var output = ""; 139 | if( sort === "" ) 140 | output = "default"; 141 | else if (sort === "asc") 142 | output = "ascending on topic #" + topic; 143 | else 144 | output = "descending on topic #" + topic; 145 | d3.select(this.el).text( output ); 146 | }, this); 147 | } 148 | }); 149 | 150 | // Expects to be bound to the state model 151 | var ClearAllButton = Backbone.View.extend({ 152 | el: 'button.clearAll', 153 | events : { 154 | 'click' : function(e) { 155 | this.model.clearAllSelectedTopics(); 156 | } 157 | } 158 | }); 159 | 160 | // Expects to be bound to the state model 161 | var ClearSortButton = Backbone.View.extend({ 162 | el: 'button.clearSort', 163 | events : { 164 | 'click' : function(e) { 165 | this.model.clearSorting(); 166 | } 167 | } 168 | }); 169 | -------------------------------------------------------------------------------- /pipeline/tokenize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import sys 6 | import argparse 7 | import logging 8 | import ConfigParser 9 | from api_utils import DocumentsAPI, TokensAPI 10 | 11 | class Tokenize( object ): 12 | 13 | """ 14 | Takes in the input corpus doc and writes it out as a list of tokens. 15 | 16 | Currently, supports only single document corpus with one document per line of format: 17 | doc_iddocument_content 18 | (Two fields delimited by tab.) 19 | 20 | Support for multiple files, directory(ies), and Lucene considered for future releases. 
21 | """ 22 | 23 | WHITESPACE_TOKENIZATION = r'[^ ]+' 24 | ALPHANUMERIC_TOKENIZATION = r'[0-9A-Za-z_]*[A-Za-z_]+[0-9A-Za-z_]*' 25 | ALPHA_TOKENIZATION = r'[A-Za-z_]+' 26 | UNICODE_TOKENIZATION = r'[\w]+' 27 | DEFAULT_TOKENIZATION = ALPHA_TOKENIZATION 28 | 29 | def __init__( self, logging_level ): 30 | self.logger = logging.getLogger( 'Tokenize' ) 31 | self.logger.setLevel( logging_level ) 32 | handler = logging.StreamHandler( sys.stderr ) 33 | handler.setLevel( logging_level ) 34 | self.logger.addHandler( handler ) 35 | 36 | def execute( self, corpus_format, corpus_path, data_path, tokenization ): 37 | assert corpus_format is not None 38 | assert corpus_path is not None 39 | assert data_path is not None 40 | if tokenization is None: 41 | tokenization = Tokenize.DEFAULT_TOKENIZATION 42 | elif tokenization == 'unicode': 43 | tokenization = Tokenize.UNICODE_TOKENIZATION 44 | elif tokenization == 'whitespace': 45 | tokenization = Tokenize.WHITESPACE_TOKENIZATION 46 | elif tokenization == 'alpha': 47 | tokenization = Tokenize.ALPHA_TOKENIZATION 48 | elif tokenization == 'alphanumeric': 49 | tokenization = Tokenize.ALPHANUMERIC_TOKENIZATION 50 | 51 | self.logger.info( '--------------------------------------------------------------------------------' ) 52 | self.logger.info( 'Tokenizing source corpus...' ) 53 | self.logger.info( ' corpus_path = %s (%s)', corpus_path, corpus_format ) 54 | self.logger.info( ' data_path = %s', data_path ) 55 | self.logger.info( ' tokenization = %s', tokenization ) 56 | 57 | self.logger.info( 'Connecting to data...' ) 58 | self.documents = DocumentsAPI( corpus_format, corpus_path ) 59 | self.tokens = TokensAPI( data_path ) 60 | 61 | self.logger.info( 'Reading from disk...' ) 62 | self.documents.read() 63 | 64 | self.logger.info( 'Tokenizing...' ) 65 | self.TokenizeDocuments( re.compile( tokenization, re.UNICODE ) ) 66 | 67 | self.logger.info( 'Writing to disk...' 
# Methods of the Tokenize class (flattened listing).

def TokenizeDocuments( self, tokenizer ):
    """Tokenize every document in self.documents.data and store each token
    list in self.tokens.data, keyed by document ID."""
    for docID, docContent in self.documents.data.iteritems():
        self.tokens.data[ docID ] = self.TokenizeDocument( docContent, tokenizer )

def TokenizeDocument( self, text, tokenizer ):
    """Return the lower-cased tokens of 'text' matched by 'tokenizer'
    (a compiled regex or pattern string), in order of appearance."""
    return [ token.lower() for token in re.findall( tokenizer, text ) ]
) 93 | args = parser.parse_args() 94 | 95 | # Declare parameters 96 | corpus_format = None 97 | corpus_path = None 98 | tokenization = None 99 | data_path = None 100 | logging_level = 20 101 | 102 | # Read in default values from the configuration file 103 | if args.config_file is not None: 104 | config = ConfigParser.RawConfigParser() 105 | config.read( args.config_file ) 106 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'format' ): 107 | corpus_format = config.get( 'Corpus', 'format' ) 108 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'path' ): 109 | corpus_path = config.get( 'Corpus', 'path' ) 110 | if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'tokenization' ): 111 | tokenization = config.get( 'Corpus', 'tokenization' ) 112 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 113 | data_path = config.get( 'Termite', 'path' ) 114 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 115 | logging_level = config.getint( 'Misc', 'logging' ) 116 | 117 | # Read in user-specifiec values from the program arguments 118 | if args.corpus_format is not None: 119 | corpus_format = args.corpus_format 120 | if args.corpus_path is not None: 121 | corpus_path = args.corpus_path 122 | if args.tokenization is not None: 123 | tokenization = args.tokenization 124 | if args.data_path is not None: 125 | data_path = args.data_path 126 | if args.logging is not None: 127 | logging_level = args.logging 128 | 129 | Tokenize( logging_level ).execute( corpus_format, corpus_path, data_path, tokenization ) 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /client-src/QueryString.js: -------------------------------------------------------------------------------- 1 | function QueryString() 2 | { 3 | this.parameters = []; 4 | } 5 | 6 | QueryString.prototype.parameters = function() 7 | { 8 | 
// Register a scalar query-string parameter.
// 'identifier' defaults to 'name'; 'decoder' defaults to "str"; 'encoder'
// defaults to the supplied decoder, or "str" when neither is given.
QueryString.prototype.addValueParameter = function( name, identifier, decoder, encoder )
{
	var id = ( identifier === undefined || identifier === null ) ? name : identifier;
	var enc = ( encoder === undefined || encoder === null )
		? ( ( decoder === undefined || decoder === null ) ? "str" : decoder )
		: encoder;
	var dec = ( decoder === undefined || decoder === null ) ? "str" : decoder;

	this.parameters.push( {
		'name' : name,
		'identifier' : id,
		'isArray' : false,
		'decoder' : this.valueDecoder( dec ),
		'encoder' : this.valueEncoder( enc )
	} );
	return this;
}
// Register an array-valued query-string parameter; same defaulting rules
// as addValueParameter, but decode/encode element-wise.
QueryString.prototype.addArrayParameter = function( name, identifier, decoder, encoder )
{
	var id = ( identifier === undefined || identifier === null ) ? name : identifier;
	var enc = ( encoder === undefined || encoder === null )
		? ( ( decoder === undefined || decoder === null ) ? "str" : decoder )
		: encoder;
	var dec = ( decoder === undefined || decoder === null ) ? "str" : decoder;

	this.parameters.push( {
		'name' : name,
		'identifier' : id,
		'isArray' : true,
		'decoder' : this.arrayDecoder( dec ),
		'encoder' : this.arrayEncoder( enc )
	} );
	return this;
}
// Resolve 'decoder' into a function: functions pass through; the strings
// "int" and "float" map to the matching parser; any other string yields
// the identity; anything else yields null.
QueryString.prototype.valueDecoder = function( decoder )
{
	if ( typeof decoder == "function" )
		return decoder;
	if ( typeof decoder != "string" )
		return null;
	switch ( decoder )
	{
		case "int"   : return function(d) { return parseInt(d,10) };
		case "float" : return function(d) { return parseFloat(d) };
		default      : return function(d) { return d };
	}
}
72 | return encoder; 73 | if ( typeof encoder == "string" ) 74 | { 75 | return function(d) { return String(d) }; 76 | } 77 | return null; 78 | } 79 | QueryString.prototype.arrayDecoder = function( decoder ) 80 | { 81 | if ( typeof decoder == "function" ) 82 | return decoder; 83 | var g = function(values) 84 | { 85 | var f = this.valueDecoder(decoder); 86 | var states = []; 87 | values.forEach( function(d) { states.push( f(d) ) } ); 88 | return states; 89 | } 90 | return g.bind(this); 91 | } 92 | QueryString.prototype.arrayEncoder = function( encoder ) 93 | { 94 | if ( typeof encoder == "function" ) 95 | return encoder; 96 | var g = function(states) 97 | { 98 | var f = this.valueEncoder(encoder); 99 | var values = []; 100 | states.forEach( function(d) { values.push( f(d) ) } ); 101 | return values; 102 | } 103 | return g.bind(this); 104 | } 105 | 106 | QueryString.prototype.read = function( states ) 107 | { 108 | if ( states === undefined || states === null ) 109 | states = {}; 110 | for ( var i in this.parameters ) 111 | { 112 | var p = this.parameters[i]; 113 | if ( p.isArray ) 114 | { 115 | var values = this.getValues( p.identifier ); 116 | if ( values.length > 0 ) 117 | states[p.name] = p.decoder( values ); 118 | } 119 | else 120 | { 121 | var value = this.getValue( p.identifier ); 122 | if ( value != null ) 123 | states[p.name] = p.decoder( value ); 124 | } 125 | } 126 | return states; 127 | } 128 | QueryString.prototype.write = function( states, replaceBrowserHistoryEntry, pageStates, pageTitle ) 129 | { 130 | if ( replaceBrowserHistoryEntry === undefined || typeof replaceBrowserHistoryEntry != "boolean" ) 131 | replaceBrowserHistoryEntry = false; 132 | if ( pageStates === undefined ) 133 | pageStates = null; 134 | if ( pageTitle === undefined ) 135 | pageTitle = null; 136 | 137 | var s = []; 138 | for ( var i in this.parameters ) 139 | { 140 | var p = this.parameters[i]; 141 | if ( p.name in states ) 142 | { 143 | if ( p.isArray ) 144 | { 145 | var values = 
p.encoder( states[p.name] ); 146 | for ( var j in values ) 147 | if ( values[j].length > 0 ) 148 | s.push( p.identifier + "=" + escape( values[j] ) ); 149 | } 150 | else 151 | { 152 | var value = p.encoder( states[p.name] ); 153 | if ( value.length > 0 ) 154 | s.push( p.identifier + "=" + escape( value ) ); 155 | } 156 | } 157 | } 158 | 159 | var protocol = window.location.protocol; 160 | var server = window.location.host; 161 | var path = window.location.pathname; 162 | var pageURL = protocol + '//' + server + path + ( s.length > 0 ? "?" + s.join( "&" ) : "" ); 163 | 164 | if ( replaceBrowserHistoryEntry ) 165 | history.replaceState( pageStates, pageTitle, pageURL ); 166 | else 167 | history.pushState( pageStates, pageTitle, pageURL ); 168 | } 169 | 170 | QueryString.prototype.getValue = function( key ) 171 | { 172 | var regex = this.getKeyRegex( key ); 173 | var match = regex.exec( window.location.href ); 174 | if ( match === null ) 175 | return null; 176 | else 177 | return unescape( match[1] ); 178 | } 179 | QueryString.prototype.getValues = function( key ) 180 | { 181 | var regex = this.getKeyRegex( key ); 182 | var matches = window.location.href.match( regex ); 183 | if ( matches === null ) 184 | return []; 185 | else 186 | { 187 | for ( var i = 0; i < matches.length; i ++ ) 188 | { 189 | var regex = this.getKeyRegex( key ); 190 | var match = regex.exec( matches[i] ); 191 | matches[i] = unescape( match[1] ); 192 | } 193 | return matches; 194 | } 195 | } 196 | QueryString.prototype.getKeyRegex = function( key ) 197 | { 198 | return new RegExp( "[\\?&]" + key + "=([^&]*)", "g" ); 199 | } 200 | 201 | -------------------------------------------------------------------------------- /pipeline/compute_saliency.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import math 10 | from api_utils 
import ModelAPI, SaliencyAPI 11 | 12 | class ComputeSaliency( object ): 13 | """ 14 | Distinctiveness and saliency. 15 | 16 | Compute term distinctiveness and term saliency, based on 17 | the term probability distributions associated with a set of 18 | latent topics. 19 | 20 | Input is term-topic probability distribution, stored in 3 separate files: 21 | 'term-topic-matrix.txt' contains the entries of the matrix. 22 | 'term-index.txt' contains the terms corresponding to the rows of the matrix. 23 | 'topic-index.txt' contains the topic labels corresponding to the columns of the matrix. 24 | 25 | Output is a list of term distinctiveness and saliency values, 26 | in two duplicate formats, a tab-delimited file and a JSON object: 27 | 'term-info.txt' 28 | 'term-info.json' 29 | 30 | An auxiliary output is a list topic weights (i.e., the number of 31 | tokens in the corpus assigned to each latent topic) in two 32 | duplicate formats, a tab-delimited file and a JSON object: 33 | 'topic-info.txt' 34 | 'topic-info.json' 35 | """ 36 | 37 | def __init__( self, logging_level ): 38 | self.logger = logging.getLogger( 'ComputeSaliency' ) 39 | self.logger.setLevel( logging_level ) 40 | handler = logging.StreamHandler( sys.stderr ) 41 | handler.setLevel( logging_level ) 42 | self.logger.addHandler( handler ) 43 | 44 | def execute( self, data_path ): 45 | 46 | assert data_path is not None 47 | 48 | self.logger.info( '--------------------------------------------------------------------------------' ) 49 | self.logger.info( 'Computing term saliency...' ) 50 | self.logger.info( ' data_path = %s', data_path ) 51 | 52 | self.logger.info( 'Connecting to data...' ) 53 | self.model = ModelAPI( data_path ) 54 | self.saliency = SaliencyAPI( data_path ) 55 | 56 | self.logger.info( 'Reading data from disk...' ) 57 | self.model.read() 58 | 59 | self.logger.info( 'Computing...' 
) 60 | self.computeTopicInfo() 61 | self.computeTermInfo() 62 | self.rankResults() 63 | 64 | self.logger.info( 'Writing data to disk...' ) 65 | self.saliency.write() 66 | 67 | self.logger.info( '--------------------------------------------------------------------------------' ) 68 | 69 | def computeTopicInfo( self ): 70 | topic_weights = [ sum(x) for x in zip( *self.model.term_topic_matrix ) ] 71 | topic_info = [] 72 | for i in range(self.model.topic_count): 73 | topic_info.append( { 74 | 'topic' : self.model.topic_index[i], 75 | 'weight' : topic_weights[i] 76 | } ) 77 | 78 | self.saliency.topic_info = topic_info 79 | 80 | def computeTermInfo( self ): 81 | """Iterate over the list of terms. Compute frequency, distinctiveness, saliency.""" 82 | 83 | topic_marginal = self.getNormalized( [ d['weight'] for d in self.saliency.topic_info ] ) 84 | term_info = [] 85 | for i in range(self.model.term_count): 86 | term = self.model.term_index[i] 87 | counts = self.model.term_topic_matrix[i] 88 | frequency = sum( counts ) 89 | probs = self.getNormalized( counts ) 90 | distinctiveness = self.getKLDivergence( probs, topic_marginal ) 91 | saliency = frequency * distinctiveness 92 | term_info.append( { 93 | 'term' : term, 94 | 'saliency' : saliency, 95 | 'frequency' : frequency, 96 | 'distinctiveness' : distinctiveness, 97 | 'rank' : None, 98 | 'visibility' : 'default' 99 | } ) 100 | self.saliency.term_info = term_info 101 | 102 | def getNormalized( self, counts ): 103 | """Rescale a list of counts, so they represent a proper probability distribution.""" 104 | tally = sum( counts ) 105 | if tally == 0: 106 | probs = [ d for d in counts ] 107 | else: 108 | probs = [ d / tally for d in counts ] 109 | return probs 110 | 111 | def getKLDivergence( self, P, Q ): 112 | """Compute KL-divergence from P to Q""" 113 | divergence = 0 114 | assert len(P) == len(Q) 115 | for i in range(len(P)): 116 | p = P[i] 117 | q = Q[i] 118 | assert p >= 0 119 | assert q >= 0 120 | if p > 0: 121 | 
divergence += p * math.log( p / q ) 122 | return divergence 123 | 124 | def rankResults( self ): 125 | """Sort topics by decreasing weight. Sort term frequencies by decreasing saliency.""" 126 | self.saliency.topic_info = sorted( self.saliency.topic_info, key = lambda topic_weight : -topic_weight['weight'] ) 127 | self.saliency.term_info = sorted( self.saliency.term_info, key = lambda term_freq : -term_freq['saliency'] ) 128 | for i, element in enumerate( self.saliency.term_info ): 129 | element['rank'] = i 130 | 131 | #-------------------------------------------------------------------------------# 132 | 133 | def main(): 134 | parser = argparse.ArgumentParser( description = 'Compute term saliency for TermiteVis.' ) 135 | parser.add_argument( 'config_file', type = str, default = None , help = 'Path of Termite configuration file.' ) 136 | parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' ) 137 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' 
 )
	args = parser.parse_args()

	# Defaults; logging level 20 == logging.INFO.
	data_path = None
	logging_level = 20

	# Read in default values from the configuration file
	if args.config_file is not None:
		config = ConfigParser.RawConfigParser()
		config.read( args.config_file )
		if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ):
			data_path = config.get( 'Termite', 'path' )
		if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ):
			logging_level = config.getint( 'Misc', 'logging' )

	# Read in user-specified values from the program arguments
	# (command-line flags override the configuration file).
	if args.data_path is not None:
		data_path = args.data_path
	if args.logging is not None:
		logging_level = args.logging

	ComputeSaliency( logging_level ).execute( data_path )

if __name__ == '__main__':
	main()

-------------------------------------------------------------------------------- /pipeline/prepare_data_for_client.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse
import ConfigParser  # Python 2 stdlib; renamed to configparser in Python 3
import logging

from api_utils import ModelAPI, SaliencyAPI, SeriationAPI, ClientAPI

class PrepareDataForClient( object ):
	"""
	Reformats data necessary for client to run.

	Extracts a subset of the complete term list and term-topic matrix and writes
	the subset to a separate file. Also, generates JSON file that merges/packages term
	information with the actual term.

	Input is term-topic probability distribution and term information, stored in 4 files:
	    'term-topic-matrix.txt' contains the entries of the matrix.
	    'term-index.txt' contains the terms corresponding to the rows of the matrix.
	    'topic-index.txt' contains the topic labels corresponding to the columns of the matrix.
23 | 'term-info.txt' contains information about individual terms. 24 | 25 | Output is a subset of terms and matrix, as well as the term subset's information. 26 | Number of files created or copied: 5 27 | 'submatrix-term-index.txt' 28 | 'submatrix-topic-index.txt' 29 | 'submatrix-term-topic.txt' 30 | 'term-info.json' 31 | 'term-info.txt' 32 | """ 33 | 34 | def __init__( self, logging_level ): 35 | self.logger = logging.getLogger( 'PrepareDataForClient' ) 36 | self.logger.setLevel( logging_level ) 37 | handler = logging.StreamHandler( sys.stderr ) 38 | handler.setLevel( logging_level ) 39 | self.logger.addHandler( handler ) 40 | 41 | def execute( self, data_path ): 42 | 43 | assert data_path is not None 44 | 45 | self.logger.info( '--------------------------------------------------------------------------------' ) 46 | self.logger.info( 'Preparing data for client...' ) 47 | self.logger.info( ' data_path = %s', data_path ) 48 | 49 | self.logger.info( 'Connecting to data...' ) 50 | self.model = ModelAPI( data_path ) 51 | self.saliency = SaliencyAPI( data_path ) 52 | self.seriation = SeriationAPI( data_path ) 53 | self.client = ClientAPI( data_path ) 54 | 55 | self.logger.info( 'Reading data from disk...' ) 56 | self.model.read() 57 | self.saliency.read() 58 | self.seriation.read() 59 | 60 | self.logger.info( 'Preparing parameters for seriated matrix...' ) 61 | self.prepareSeriatedParameters() 62 | 63 | self.logger.info( 'Preparing parameters for filtered matrix...' ) 64 | self.prepareFilteredParameters() 65 | 66 | self.logger.info( 'Preparing global term freqs...' ) 67 | self. prepareGlobalTermFreqs() 68 | 69 | self.logger.info( 'Writing data to disk...' 
) 70 | self.client.write() 71 | 72 | def prepareSeriatedParameters( self ): 73 | topic_index = self.model.topic_index 74 | term_index = self.model.term_index 75 | term_topic_matrix = self.model.term_topic_matrix 76 | term_ordering = self.seriation.term_ordering 77 | term_topic_submatrix = [] 78 | term_subindex = [] 79 | for term in term_ordering: 80 | if term in term_index: 81 | index = term_index.index( term ) 82 | term_topic_submatrix.append( term_topic_matrix[ index ] ) 83 | term_subindex.append( term ) 84 | else: 85 | self.logger.info( 'ERROR: Term (%s) does not appear in the list of seriated terms', term ) 86 | 87 | self.client.seriated_parameters = { 88 | 'termIndex' : term_subindex, 89 | 'topicIndex' : topic_index, 90 | 'matrix' : term_topic_submatrix 91 | } 92 | 93 | def prepareFilteredParameters( self ): 94 | term_rank_map = { term: value for value, term in enumerate( self.seriation.term_iter_index ) } 95 | term_order_map = { term: value for value, term in enumerate( self.seriation.term_ordering ) } 96 | term_saliency_map = { d['term']: d['saliency'] for d in self.saliency.term_info } 97 | term_distinctiveness_map = { d['term'] : d['distinctiveness'] for d in self.saliency.term_info } 98 | 99 | self.client.filtered_parameters = { 100 | 'termRankMap' : term_rank_map, 101 | 'termOrderMap' : term_order_map, 102 | 'termSaliencyMap' : term_saliency_map, 103 | 'termDistinctivenessMap' : term_distinctiveness_map 104 | } 105 | 106 | def prepareGlobalTermFreqs( self ): 107 | topic_index = self.model.topic_index 108 | term_index = self.model.term_index 109 | term_topic_matrix = self.model.term_topic_matrix 110 | term_ordering = self.seriation.term_ordering 111 | term_topic_submatrix = [] 112 | term_subindex = [] 113 | for term in term_ordering: 114 | if term in term_index: 115 | index = term_index.index( term ) 116 | term_topic_submatrix.append( term_topic_matrix[ index ] ) 117 | term_subindex.append( term ) 118 | else: 119 | self.logger.info( 'ERROR: Term (%s) does 
not appear in the list of seriated terms', term ) 120 | 121 | term_freqs = { d['term']: d['frequency'] for d in self.saliency.term_info } 122 | 123 | self.client.global_term_freqs = { 124 | 'termIndex' : term_subindex, 125 | 'topicIndex' : topic_index, 126 | 'matrix' : term_topic_submatrix, 127 | 'termFreqMap' : term_freqs 128 | } 129 | 130 | def main(): 131 | parser = argparse.ArgumentParser( description = 'Prepare data for client.' ) 132 | parser.add_argument( 'config_file', type = str, default = None , help = 'Path of Termite configuration file.' ) 133 | parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' ) 134 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' ) 135 | args = parser.parse_args() 136 | 137 | args = parser.parse_args() 138 | 139 | data_path = None 140 | logging_level = 20 141 | 142 | # Read in default values from the configuration file 143 | if args.config_file is not None: 144 | config = ConfigParser.RawConfigParser() 145 | config.read( args.config_file ) 146 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 147 | data_path = config.get( 'Termite', 'path' ) 148 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 149 | logging_level = config.getint( 'Misc', 'logging' ) 150 | 151 | # Read in user-specifiec values from the program arguments 152 | if args.data_path is not None: 153 | data_path = args.data_path 154 | if args.logging is not None: 155 | logging_level = args.logging 156 | 157 | PrepareDataForClient( logging_level ).execute( data_path ) 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Termite Set-Up Script 4 | # 5 | # Run once to 6 | # - download necessary library files 7 | # - minify 
client javascript files
#

# Working directories for downloaded libraries and generated client assets.
LIBRARY=lib/
STMT=stmt-0.4.0/
CLIENT_SRC=client-src/
CLIENT_LIB=client-lib/

if [ ! -d $LIBRARY ]
then
	echo
	echo "Creating a library folder: $LIBRARY"
	mkdir $LIBRARY
fi

if [ ! -d $CLIENT_LIB ]
then
	echo
	echo "Creating the client template folder: $CLIENT_LIB"
	mkdir $CLIENT_LIB
fi

#------------------------------------------------------------------------------#
# D3 Visualization Javascript Library
# Source + minified build are extracted from the release zip; the license is
# kept alongside the other third-party licenses in $LIBRARY.

echo
echo "Downloading D3 javascript library..."
curl --insecure --location https://github.com/mbostock/d3/releases/download/v3.4.1/d3.v3.zip > $LIBRARY/d3.v3.zip

echo
echo "Uncompressing D3 javascript library..."
unzip $LIBRARY/d3.v3.zip d3.v3.js -d $CLIENT_SRC
unzip $LIBRARY/d3.v3.zip d3.v3.min.js -d $CLIENT_LIB

echo
echo "Extracting D3 license..."
unzip $LIBRARY/d3.v3.zip LICENSE -d $LIBRARY
mv $LIBRARY/LICENSE $LIBRARY/LICENSE-d3

#------------------------------------------------------------------------------#
# jQuery Javascript Library
# jQuery ships pre-minified; the GitHub archive is fetched only for its license.

echo
echo "Downloading jQuery javascript library..."
curl --insecure --location http://code.jquery.com/jquery-1.9.1.js > $CLIENT_SRC/jquery.js
curl --insecure --location http://code.jquery.com/jquery-1.9.1.min.js > $CLIENT_LIB/jquery.min.js

echo
echo "Downloading jQuery GitHub archive..."
curl --insecure --location http://github.com/jquery/jquery/archive/master.zip > $LIBRARY/jquery.zip

echo
echo "Extracting jQuery license..."
unzip $LIBRARY/jquery.zip jquery-master/MIT-LICENSE.txt -d $LIBRARY
mv $LIBRARY/jquery-master/MIT-LICENSE.txt $LIBRARY/LICENSE-jquery
rmdir $LIBRARY/jquery-master

#------------------------------------------------------------------------------#
# Underscore Javascript Library

echo
echo "Downloading Underscore GitHub archive..."
curl --insecure --location http://github.com/documentcloud/underscore/archive/master.zip > $LIBRARY/underscore.zip

echo
echo "Uncompressing Underscore javascript library..."
unzip $LIBRARY/underscore.zip underscore-master/underscore.js -d $LIBRARY
unzip $LIBRARY/underscore.zip underscore-master/underscore-min.js -d $LIBRARY
mv $LIBRARY/underscore-master/underscore.js $CLIENT_SRC/underscore.js
mv $LIBRARY/underscore-master/underscore-min.js $CLIENT_LIB/underscore.min.js

echo
echo "Extracting Underscore license..."
unzip $LIBRARY/underscore.zip underscore-master/LICENSE -d $LIBRARY
mv $LIBRARY/underscore-master/LICENSE $LIBRARY/LICENSE-underscore
rmdir $LIBRARY/underscore-master

#------------------------------------------------------------------------------#
# Backbone Javascript Library

echo
echo "Downloading Backbone GitHub archive..."
curl --insecure --location http://github.com/documentcloud/backbone/archive/master.zip > $LIBRARY/backbone.zip

echo
echo "Uncompressing Backbone javascript library..."
unzip $LIBRARY/backbone.zip backbone-master/backbone.js -d $LIBRARY
unzip $LIBRARY/backbone.zip backbone-master/backbone-min.js -d $LIBRARY
mv $LIBRARY/backbone-master/backbone.js $CLIENT_SRC/backbone.js
mv $LIBRARY/backbone-master/backbone-min.js $CLIENT_LIB/backbone.min.js

echo
echo "Extracting Backbone license..."
unzip $LIBRARY/backbone.zip backbone-master/LICENSE -d $LIBRARY
mv $LIBRARY/backbone-master/LICENSE $LIBRARY/LICENSE-backbone
rmdir $LIBRARY/backbone-master

#------------------------------------------------------------------------------#
# Mallet (topic modeling library)

echo
echo "Downloading MALLET (MAchine Learning for LanguagE Toolkit)..."
curl --insecure --location http://mallet.cs.umass.edu/dist/mallet-2.0.7.tar.gz > $LIBRARY/mallet-2.0.7.tar.gz

echo
echo "Uncompressing MALLET..."
tar -zxvf $LIBRARY/mallet-2.0.7.tar.gz mallet-2.0.7

echo
echo "Extracting MALLET License..."
cp mallet-2.0.7/LICENSE $LIBRARY/LICENSE-mallet

#------------------------------------------------------------------------------#
# Stanford Topic Modeling Toolkit
# The jar goes into its own versioned folder ($STMT) because the pipeline
# scripts invoke it by that path; only the source zip lands in $LIBRARY.

echo
echo "Downloading STMT (Stanford Topic Modeling Toolkit)..."
if [ ! -d $STMT ]
then
	echo
	echo "Creating a folder for STMT: $STMT"
	mkdir $STMT
fi
curl --insecure --location http://nlp.stanford.edu/software/tmt/tmt-0.4/tmt-0.4.0.jar > $STMT/tmt-0.4.0.jar
curl --insecure --location http://nlp.stanford.edu/software/tmt/tmt-0.4/tmt-0.4.0-src.zip > $LIBRARY/tmt-0.4.0-src.zip

echo
echo "Extracting STMT License..."
unzip $LIBRARY/tmt-0.4.0-src.zip LICENSE -d $LIBRARY
cp $LIBRARY/LICENSE $LIBRARY/LICENSE-stmt

#------------------------------------------------------------------------------#
# Google closure compiler for Javascript

echo
echo "Downloading Google Closure Compiler..."
curl --insecure --location http://dl.google.com/closure-compiler/compiler-latest.zip > $LIBRARY/compiler-latest.zip

echo
echo "Uncompressing Google Closure Compiler..."
unzip $LIBRARY/compiler-latest.zip compiler.jar -d $LIBRARY
mv $LIBRARY/compiler.jar $LIBRARY/closure-compiler.jar

echo
echo "Extracting Google Closure Compiler License..."
unzip $LIBRARY/compiler-latest.zip COPYING -d $LIBRARY
cp $LIBRARY/COPYING $LIBRARY/LICENSE-closure-compiler

#------------------------------------------------------------------------------#
# Slider for Firefox

echo
echo "Minifying html5slider.js"
java -jar $LIBRARY/closure-compiler.jar --js=$CLIENT_SRC/html5slider.js --js_output_file=$CLIENT_LIB/html5slider.min.js

#------------------------------------------------------------------------------#
# Minify javascript files
# One closure-compiler invocation per client source file; output lands in
# $CLIENT_LIB next to the third-party minified builds.

echo
echo "Minifying javascript files..."

for JS_FILE in FullTermTopicProbabilityModel SeriatedTermTopicProbabilityModel FilteredTermTopicProbabilityModel TermFrequencyModel TermTopicMatrixView TermFrequencyView ViewParameters StateModel UserControlViews QueryString
do
	echo "    Minifying $JS_FILE"
	java -jar $LIBRARY/closure-compiler.jar --js=$CLIENT_SRC/$JS_FILE.js --js_output_file=$CLIENT_LIB/$JS_FILE.min.js
done

-------------------------------------------------------------------------------- /pipeline/import_stmt.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import argparse
import ConfigParser  # Python 2 stdlib; renamed to configparser in Python 3
import logging

from utf8_utils import UnicodeReader
from api_utils import ModelAPI

class ImportStmt( object ):

	"""
	Copies STMT file formats into Termite internal format.
16 | """ 17 | 18 | # Files generated by STMT 19 | TERM_INDEX = 'term-index.txt' 20 | TOPIC_INDEX = 'topic-index.txt' 21 | DOCUMENT_INDEX = 'doc-index.txt' 22 | TOPIC_TERM = 'topic-term-distributions.csv' 23 | DOCUMENT_TOPIC = 'document-topic-distributions.csv' 24 | 25 | def __init__( self, logging_level ): 26 | self.logger = logging.getLogger( 'ImportStmt' ) 27 | self.logger.setLevel( logging_level ) 28 | handler = logging.StreamHandler( sys.stderr ) 29 | handler.setLevel( logging_level ) 30 | self.logger.addHandler( handler ) 31 | 32 | def execute( self, model_library, model_path, data_path ): 33 | 34 | assert model_library is not None 35 | assert model_library == 'stmt' 36 | assert model_path is not None 37 | assert data_path is not None 38 | 39 | self.logger.info( '--------------------------------------------------------------------------------' ) 40 | self.logger.info( 'Importing an STMT model...' ) 41 | self.logger.info( ' topic model = %s (%s)', model_path, model_library ) 42 | self.logger.info( ' output = %s', data_path ) 43 | 44 | self.logger.info( 'Connecting to data...' 
) 45 | self.model = ModelAPI( data_path ) 46 | 47 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TERM_INDEX ) 48 | self.model.term_index = self.readAsList( model_path, ImportStmt.TERM_INDEX ) 49 | self.model.term_count = len(self.model.term_index) 50 | 51 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_INDEX ) 52 | self.model.topic_index = self.readAsList( model_path, ImportStmt.TOPIC_INDEX ) 53 | self.model.topic_count = len(self.model.topic_index) 54 | 55 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_INDEX ) 56 | self.model.document_index = self.readAsList( model_path, ImportStmt.DOCUMENT_INDEX ) 57 | self.model.document_count = len(self.model.document_index) 58 | 59 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.TOPIC_TERM ) 60 | self.topic_term_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.TOPIC_TERM ) 61 | 62 | self.logger.info( 'Reading "%s" from STMT output...', ImportStmt.DOCUMENT_TOPIC ) 63 | self.document_topic_counts = self.readCsvAsMatrixStr( model_path, ImportStmt.DOCUMENT_TOPIC ) 64 | 65 | self.logger.info( 'Extracting term-topic matrix...' ) 66 | self.extractTermTopicMatrix() 67 | 68 | self.logger.info( 'Extracting document-topic matrix...' ) 69 | self.extractDocumentTopicMatrix() 70 | 71 | self.logger.info( 'Writing data to disk...' ) 72 | self.model.write() 73 | 74 | def readAsList( self, model_path, filename ): 75 | data = [] 76 | filename = '{}/{}'.format( model_path, filename ) 77 | with open( filename, 'r' ) as f: 78 | data = f.read().decode( 'utf-8' ).splitlines() 79 | return data 80 | 81 | # Need for STMT, which generates a mixed-string-float document-topic-distributions.csv file 82 | def readCsvAsMatrixStr( self, model_path, filename ): 83 | """ 84 | Return a matrix (list of list) of string values. 85 | Each row corresponds to a line of the input file. 86 | Each cell (in a row) corresponds to a comma-separated value (in each line). 
87 | """ 88 | data = [] 89 | filename = '{}/{}'.format( model_path, filename ) 90 | with open( filename, 'r' ) as f: 91 | lines = UnicodeReader( f, delimiter = ',' ) 92 | data = [ d for d in lines ] 93 | return data 94 | 95 | def extractDocumentTopicMatrix( self ): 96 | """ 97 | Extract document-topic matrix. 98 | Probability distributions are stored from the 2nd column onward in the document-topic distributions. 99 | """ 100 | matrix = [] 101 | for line in self.document_topic_counts: 102 | matrix.append( map( float, line[1:self.model.topic_count+1] ) ) 103 | self.model.document_topic_matrix = matrix 104 | 105 | def extractTermTopicMatrix( self ): 106 | """ 107 | Extract term-topic matrix. 108 | Transpose the input topic-term distributions. 109 | Ensure all values are greater than or equal to 0. 110 | """ 111 | matrix = [ [0] * self.model.topic_count ] * self.model.term_count 112 | for j, line in enumerate( self.topic_term_counts ): 113 | for i, value in enumerate(line): 114 | matrix[i][j] = max( 0, float(value) ) 115 | self.model.term_topic_matrix = matrix 116 | 117 | def main(): 118 | parser = argparse.ArgumentParser( description = 'Import results from STMT (Stanford Topic-Modeling Toolbox) into Termite.' ) 119 | parser.add_argument( 'config_file' , type = str, default = None , help = 'Path of Termite configuration file.' 
) 120 | parser.add_argument( '--topic-model-library', type = str, dest = 'model_library', help = 'Override topic model format' ) 121 | parser.add_argument( '--topic-model-path' , type = str, dest = 'model_path' , help = 'Override topic model path' ) 122 | parser.add_argument( '--data-path' , type = str, dest = 'data_path' , help = 'Override data path' ) 123 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level' ) 124 | args = parser.parse_args() 125 | 126 | model_library = None 127 | model_path = None 128 | data_path = None 129 | logging_level = 20 130 | 131 | # Read in default values from the configuration file 132 | config = ConfigParser.RawConfigParser() 133 | config.read( args.config_file ) 134 | model_library = config.get( 'TopicModel', 'library' ) 135 | model_path = config.get( 'TopicModel', 'path' ) 136 | data_path = config.get( 'Termite', 'path' ) 137 | if config.has_section( 'Misc' ): 138 | if config.has_option( 'Misc', 'logging' ): 139 | logging_level = config.getint( 'Misc', 'logging' ) 140 | 141 | # Read in user-specifiec values from the program arguments 142 | if args.model_library is not None: 143 | model_library = args.model_library 144 | if args.model_path is not None: 145 | model_path = args.model_path 146 | if args.data_path is not None: 147 | data_path = args.data_path 148 | if args.logging is not None: 149 | logging_level = args.logging 150 | 151 | ImportStmt( logging_level ).execute( model_library, model_path, data_path ) 152 | 153 | if __name__ == '__main__': 154 | main() 155 | -------------------------------------------------------------------------------- /client-src/TermFrequencyModel.js: -------------------------------------------------------------------------------- 1 | /* 2 | TermFrequencyModel.js 3 | This model processes and packages data for the term frequency view. 
4 | 5 | Initialization: 6 | Load term frequency from 'data/global-term-freqs.json' 7 | 8 | Data Update: 9 | Listens to FilteredTermTopicProbabilityModel (events: ) 10 | 11 | Details: 12 | -------- 13 | Pulls data from FilteredTermTopicProbilityModel. The model loads some parameters from 14 | the url. On updates, the model receives a list 15 | of terms and generates a list of item(term, frequency) (same order as the term list 16 | received as input). 17 | */ 18 | var TermFrequencyModel = Backbone.Model.extend({ 19 | defaults : { 20 | "termIndex" : null, 21 | "totalTermFreqs": {}, 22 | "topicalFreqMatrix": [], 23 | "colorList": [], 24 | "selectedTopics": {} 25 | }, 26 | url : "data/global-term-freqs.json", 27 | initialize : function() { 28 | this.parentModel = null; 29 | this.stateModel = null; 30 | 31 | // original data 32 | this.originalMatrix = null; 33 | this.originalTopicIndex = null; 34 | this.originalTermIndex = null; 35 | 36 | // mappings 37 | this.termFreqMap = null; 38 | 39 | // iteractions 40 | // TODO: (later) clean up these. 
Definitely don't need all of these variables 41 | this.selectedTopics = {}; 42 | this.colorList = []; 43 | this.colorToTopic = {}; 44 | this.topicalFreqs = null; 45 | } 46 | }); 47 | 48 | /** 49 | * Initialize Term Frequency Model's parent and state model 50 | * 51 | * @private 52 | */ 53 | TermFrequencyModel.prototype.initModels = function( parent, state ){ 54 | this.parentModel = parent; 55 | this.stateModel = state; 56 | }; 57 | 58 | /** 59 | * Initialize all topics' selection status to null (called once by load) 60 | * 61 | * @private 62 | */ 63 | TermFrequencyModel.prototype.defaultSelection = function(){ 64 | var topicIndex = this.parentModel.get("topicIndex"); 65 | for( var i = 0; i < topicIndex.length; i++ ){ 66 | this.selectedTopics[i] = null; 67 | } 68 | this.set("selectedTopics", this.selectedTopics); 69 | }; 70 | 71 | /** 72 | * Loads matrix, termIndex, topicIndex, and term to frequency mapping from the model's "url" 73 | * and triggers a loaded event that the next model (child model) listens to. Also, pulls 74 | * any selected topics from state model and processes them. 
75 | * (This function is called after the filtered model loaded event is fired) 76 | * 77 | * @param { string } the location of datafile to load values from 78 | * @return { void } 79 | */ 80 | TermFrequencyModel.prototype.load = function(){ 81 | var successHandler = function( model, response, options ) 82 | { 83 | this.set("termIndex", this.parentModel.get("termIndex")); 84 | 85 | this.originalMatrix = response.matrix; 86 | this.originalTopicIndex = response.topicIndex; 87 | this.originalTermIndex = response.termIndex; 88 | 89 | this.termFreqMap = response.termFreqMap; 90 | this.defaultSelection(); 91 | this.getTotalTermFreqs(); 92 | 93 | // process selected topics from the saved state 94 | var coloredTopics = this.stateModel.get("selectedTopics"); 95 | var colorList = []; 96 | for( var obj in coloredTopics){ 97 | claimColor( coloredTopics[obj] ); 98 | colorList.push({"topic":obj, "color":coloredTopics[obj]}); 99 | } 100 | colorList.sort(function(a, b) {return colorNames.indexOf(a.color) - colorNames.indexOf(b.color)}); 101 | for( var i = 0; i < colorList.length; i++){ 102 | this.selectTopic({"topic": colorList[i].topic, "color": colorList[i].color} ); 103 | } 104 | 105 | // signal completion 106 | this.trigger("loaded:freqModel"); 107 | 108 | }.bind(this); 109 | var errorHandler = function( model, xhr, options ) { }.bind(this); 110 | this.fetch({ 111 | add : false, 112 | success : successHandler, 113 | error : errorHandler 114 | }); 115 | }; 116 | 117 | /** 118 | * Calls appropriate functions to update based on data change(s) 119 | */ 120 | TermFrequencyModel.prototype.update = function(){ 121 | this.generateTopicalMatrix( true ); 122 | this.getTotalTermFreqs(); 123 | this.set("termIndex", this.parentModel.get("termIndex")); 124 | }; 125 | 126 | /** 127 | * Finds total frequency for each term in termIndex 128 | * 129 | * @private 130 | */ 131 | TermFrequencyModel.prototype.getTotalTermFreqs = function(){ 132 | var frequencies = {}; 133 | var terms = 
this.parentModel.get("termIndex"); 134 | for( var i = 0; i < terms.length; i++){ 135 | frequencies[terms[i]] = this.termFreqMap[terms[i]]; 136 | } 137 | this.set("totalTermFreqs", frequencies); 138 | }; 139 | 140 | /** 141 | * Finds frequency / topic for each term in termIndex and each topic in selectedTopics 142 | * 143 | * @private 144 | */ 145 | TermFrequencyModel.prototype.generateTopicalMatrix = function( keepQuiet ) { 146 | var frequencies = []; 147 | var terms = this.parentModel.get("termIndex"); 148 | for( var index = 0; index < this.colorList.length; index++){ 149 | var tempList = []; 150 | var topic = this.colorToTopic[this.colorList[index]]; 151 | for( var i = 0; i < terms.length; i++){ 152 | var termIndex = this.originalTermIndex.indexOf(terms[i]); 153 | tempList.push(this.originalMatrix[termIndex][topic]); 154 | } 155 | frequencies.push(tempList); 156 | } 157 | this.topicalFreqs = frequencies; 158 | this.set("topicalFreqMatrix", frequencies, {silent: keepQuiet}); 159 | this.set("colorList", this.colorList); 160 | this.set("selectedTopics", this.selectedTopics); 161 | return frequencies; 162 | }; 163 | 164 | /** 165 | * Called by term frequency view. 
Returns frequency / topic for every term in termIndex 166 | * 167 | * @this { TermFrequencyModel } 168 | * @param { int } target topic index 169 | * @return { array } list of topical frequencies in termIndex ordering 170 | */ 171 | TermFrequencyModel.prototype.getTopicalsForTopic = function( topic ) { 172 | var frequencies = []; 173 | var terms = this.get("termIndex"); 174 | for( var i = 0; i < terms.length; i++){ 175 | var termIndex = this.originalTermIndex.indexOf(terms[i]); 176 | frequencies.push(this.originalMatrix[termIndex][topic]); 177 | } 178 | return frequencies; 179 | }; 180 | 181 | // interactions 182 | /** 183 | * Behavior when topic is selected 184 | * 185 | * @this { TermFrequencyModel } 186 | * @param { object } topic: target topic index, color: associated color 187 | * @return { void } 188 | */ 189 | TermFrequencyModel.prototype.selectTopic = function( obj ) { 190 | var topic = obj.topic; 191 | var color = obj.color; 192 | var topicIndex = this.parentModel.get("topicIndex"); 193 | if( topic !== null){ 194 | 195 | // if color is DEFAULT, the event can be treated as a deselect 196 | if( color === DEFAULT) { 197 | if(this.selectedTopics[topic] !== null){ 198 | var index = this.colorList.indexOf(this.selectedTopics[topic]); 199 | this.colorList.splice(index,1); 200 | this.selectedTopics[topic] = null; 201 | delete this.colorToTopic[color]; 202 | 203 | } else { 204 | return; 205 | } 206 | } 207 | // only add if this topic wasn't added previously 208 | else if(this.selectedTopics[topic] === null) { 209 | this.selectedTopics[topic] = color; 210 | this.colorList.push(color); 211 | this.colorToTopic[color] = topic; 212 | } 213 | 214 | // recompute the topical matrix 215 | this.generateTopicalMatrix( false ); 216 | } 217 | }; 218 | -------------------------------------------------------------------------------- /client-src/StateModel.js: -------------------------------------------------------------------------------- 1 | var StateModel = 
/*
 StateModel: Backbone model holding the user-visible state of the
 visualization — slider values, user-entered terms, topic selections,
 sorting, and highlighting.  View models are attached via initModel().
*/
var StateModel = Backbone.Model.extend({
	defaults : {
		"numAffinityTerms" : 25,
		"numSalientTerms" : 0,
		"visibleTerms" : [],
		"totalTerms" : 25,
		"foundTerms" : "",
		"unfoundTerms" : "",
		"sortType": "",
		"addTopTwenty": false,
		"highlightedTerm" : "",
		"highlightedTopic" : null,
		"selectedTopics" : {},
		"doubleClickTopic": null,
		"selectedTopicsStr": "" // serialized form used by load/save state
	},
	initialize : function() {
		this.matrixView = null;
		this.termFreqView = null;
	}
});

/**
 * Attach the two view models this state model coordinates.
 *
 * @private
 */
StateModel.prototype.initModel = function ( matrix, histogram ){
	this.matrixView = matrix;
	this.termFreqView = histogram;
};

// User Defined Terms
/**
 * Update the user-control feedback strings.
 *
 * @this {state model}
 * @param { array } termList list of terms
 * @param { boolean } keepQuiet whether or not the event should be silent
 */
StateModel.prototype.setFoundTerms = function( termList, keepQuiet ) {
	this.set( "foundTerms", termList.join(", "), {silent: keepQuiet});
};
StateModel.prototype.setUnfoundTerms = function( termList, keepQuiet ){
	var feedback = ( termList.length > 0 && termList[0] != "" ) ? termList.join(", ") : "";
	this.set( "unfoundTerms", feedback, {silent: keepQuiet});
};
StateModel.prototype.setVisibleTerms = function ( userSpecifiedVisibleTerms ) {
	// accept space, comma, or semicolon separated input
	this.set( "visibleTerms", userSpecifiedVisibleTerms.split(/[ ,;]+/g) );
};
/** end user defined control feedback **/


/**
 * Handles selecting topics using click event. Uses functions freeColor and
 * getColor that are defined in ViewParameters.  Fires "color:topic" so views
 * can recolor themselves.
 *
 * @this {state model}
 * @param { int } topicIndex index of clicked topic
 */
StateModel.prototype.selectTopic = function( topicIndex ) {
	var selected = this.get("selectedTopics");
	var color = DEFAULT;
	if( topicIndex in selected ) {
		// already selected: release its color and drop the selection
		freeColor( selected[topicIndex] );
		delete selected[topicIndex];
	} else {
		// newly selected: grab the next available color
		color = getColor();
		selected[topicIndex] = color;
	}
	// fire event to signify topic coloring may have changed
	this.trigger("color:topic", { "topic":topicIndex, "color": color } );
};
/**
 * Clears all topic selections (currently inefficiently implemented)
 */
StateModel.prototype.clearAllSelectedTopics = function() {
	console.log("clear all topics");
	var selected = this.get("selectedTopics");
	for( var t in selected ){
		freeColor( selected[t] );
		delete selected[t];
		this.trigger("color:topic", {"topic":t, "color":DEFAULT} );
	}
};
/** end topic selection code **/

/**
 * Returns the next sort direction for a double-clicked topic label,
 * cycling desc -> asc -> none on repeated clicks of the same topic.
 *
 * @this {state model}
 * @param { int } topicIndex index of double clicked topic
 */
StateModel.prototype.getSortType = function ( topicIndex ){
	var cycle = ["desc", "asc", ""];
	if( this.get("doubleClickTopic") !== topicIndex )
		return cycle[0];
	return cycle[ (cycle.indexOf(this.get("sortType")) + 1) % cycle.length ];
};
/**
 * Record a double-click on a topic label: advances the sort cycle
 * (desc -> asc -> none) and stores which topic drives sorting.
 *
 * @this {state model}
 * @param { int } topicIndex index of double clicked topic
 */
StateModel.prototype.setDoubleClickTopic = function ( topicIndex ){
	var type = this.getSortType(topicIndex);
	if( type === "")
		this.set( "doubleClickTopic", null);
	else
		this.set( "doubleClickTopic", topicIndex);
	this.set( "sortType", type);
};
StateModel.prototype.clearSorting = function(){
	this.set( "doubleClickTopic", null);
	this.set( "sortType", "");
};
/** end double click event code **/

/**
 * Handles highlighting events triggered by mouseover and mouseout
 *
 * @param { string } term target term
 */
StateModel.prototype.setHighlightedTerm = function( term ) {
	this.set("highlightedTerm", term );
};
/**
 * @param { int } topic index of target topic
 */
StateModel.prototype.setHighlightedTopic = function( topic ) {
	this.set("highlightedTopic", topic );
};
/** end highlight event code **/


/**
 * load from query string including decoding some values
 *
 * @this {state model}
 */
StateModel.prototype.loadStatesFromQueryString = function() {

	// Parses "#topic:<i>#color:<c>#topic:<j>#color:<d>..." into the
	// selectedTopics map.
	var decodeString = function( str ){
		var topicLabel = "#topic:";
		var colorLabel = "#color:";

		// extract color and topic pairs
		while( str.length > 0) {
			var topicIndex = str.indexOf(topicLabel);
			var colorIndex = str.indexOf(colorLabel);

			// BUG FIX: the original looped forever on malformed input —
			// when either label was missing, str was never shortened.
			if( topicIndex < 0 || colorIndex < 0 )
				break;

			var topic = parseInt( str.substring(topicIndex + topicLabel.length, colorIndex), 10 );
			var color = null;
			var nextPair = str.indexOf(topicLabel, colorIndex + colorLabel.length);
			if( nextPair >= 0 ){ // there's another pair
				color = str.substring(colorIndex + colorLabel.length, nextPair);
				str = str.substring(nextPair);
			} else { // no more pairs; get rid of trailing characters
				color = str.substring(colorIndex + colorLabel.length).replace( /[^A-Za-z0-9]/g, "" );
				str = "";
			}
			this.get("selectedTopics")[topic] = color;
		}
	}.bind(this);

	var qs = new QueryString();
	qs.addValueParameter( 'numAffinityTerms', 'na', 'int' );
	qs.addValueParameter( 'numSalientTerms', 'ns', 'int' );
	qs.addArrayParameter( 'visibleTerms', 't' );
	qs.addValueParameter( 'sortType', 'st', 'str');
	qs.addValueParameter( 'doubleClickTopic', 'dct', 'int');
	qs.addValueParameter( 'addTopTwenty', 'att', 'str');
	qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str');

	var states = qs.read();
	for ( var key in states ){
		if(key === "doubleClickTopic" && states[key] === -1){
			this.set(key, null); // -1 encodes "no sorting topic"
		}
		else if( key === "selectedTopicsStr" && states[key] !== ""){
			// decode string
			decodeString( states[key] );
			this.set(key, states[key]);
		}
		else if( key === "addTopTwenty"){
			// strip URL artifacts before comparing against "false"
			this.set(key, states[key].replace( /[^A-Za-z0-9]/g, "" ) !== "false");
		}
		else
			this.set( key, states[key] );
	}

	this.trigger( "loaded:states" );
	this.trigger( "sending:colors", this.get("selectedTopics"));
};

/**
 * save current state to query string
 *
 * @this {state model}
 */
StateModel.prototype.saveStatesToQueryString = function() {
	var qs = new QueryString();
	qs.addValueParameter( 'numAffinityTerms', 'na', 'int' );
	qs.addValueParameter( 'numSalientTerms', 'ns', 'int' );
	qs.addArrayParameter( 'visibleTerms', 't' );
	qs.addValueParameter( 'sortType', 'st', 'str');
	qs.addValueParameter( 'doubleClickTopic', 'dct', 'int');
	qs.addValueParameter( 'addTopTwenty', 'att', 'str');

	// encode selected topic/color pairs as "#topic:<i>#color:<c>..."
	var selectedTopics = this.get("selectedTopics");
	var strVersion = "";
	for( var i in selectedTopics){
		if(selectedTopics[i] !== DEFAULT)
			strVersion += "#topic:" + i + "#color:" + selectedTopics[i];
	}
	this.set("selectedTopicsStr", strVersion);
	qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str');

	var keys = [ 'numAffinityTerms', 'numSalientTerms', 'visibleTerms', 'sortType', 'doubleClickTopic', 'addTopTwenty', 'selectedTopicsStr' ];
	var states = {};
	for ( var i in keys )
	{
		var key = keys[i];
		// null doubleClickTopic is stored as -1 (int parameter)
		states[key] = (key === "doubleClickTopic" && this.get(key) === null) ? -1 : this.get(key);
	}

	qs.write( states );
};
if(selectedTopics[i] !== DEFAULT) 226 | strVersion += "#topic:" + i + "#color:" + selectedTopics[i]; 227 | } 228 | this.set("selectedTopicsStr", strVersion); 229 | qs.addValueParameter( 'selectedTopicsStr', 'tc', 'str'); 230 | 231 | var keys = [ 'numAffinityTerms', 'numSalientTerms', 'visibleTerms', 'sortType', 'doubleClickTopic', 'addTopTwenty', 'selectedTopicsStr' ]; 232 | var states = {}; 233 | for ( var i in keys ) 234 | { 235 | var key = keys[i]; 236 | if(key === "doubleClickTopic" && this.get(key) === null){ 237 | states[key] = -1; 238 | } 239 | else 240 | states[key] = this.get(key); 241 | } 242 | 243 | qs.write( states ); 244 | }; -------------------------------------------------------------------------------- /README.old: -------------------------------------------------------------------------------- 1 | README for Termite, a topic model visualization tool. 2 | 3 | --------------- 4 | INFORMATION 5 | --------------- 6 | Termite is a visualization tool for inspecting the output of statistical 7 | topic models based on the techniques described in the following publication: 8 | 9 | Termite: Visualization Techniques for Assessing Textual Topic Models 10 | Jason Chuang, Christopher D. Manning, Jeffrey Heer 11 | Computer Science Dept, Stanford University 12 | http://vis.stanford.edu/papers/termite 13 | 14 | This tool is developed by: 15 | * Jason Chuang 16 | * Ashley Jin 17 | 18 | and is distributed under the BSD-3 license. 19 | 20 | ----------- 21 | LICENSE 22 | ----------- 23 | Copyright (c) 2013, Leland Stanford Junior University 24 | All rights reserved. 25 | 26 | Redistribution and use in source and binary forms, with or without 27 | modification, are permitted provided that the following conditions are met: 28 | * Redistributions of source code must retain the above copyright 29 | notice, this list of conditions and the following disclaimer. 
 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
 * Neither the name of Leland Stanford Junior University nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL LELAND STANFORD JUNIOR UNIVERSITY BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

----------------------
ADDITIONAL CREDITS
----------------------
Termite requires the use of the following libraries and tools.
We thank their respective authors for developing and distributing the tools.

Mallet: Machine learning for language toolkit
  Project website: http://mallet.cs.umass.edu
  Developed by Andrew McCallum, et al.
  Distributed under a CPL license: lib/LICENSE-mallet

STMT: Stanford topic modeling toolbox
  Project website: http://nlp.stanford.edu/software/tmt
  Developed by Daniel Ramage, et al.
  Distributed under a GNU license: lib/LICENSE-stmt

D3 javascript visualization library
  Project website: http://d3js.org
  Developed by Mike Bostock, et al.
67 | Distributed under a BSD license: lib/LICENSE-d3 68 | 69 | Google closure javascript compiler 70 | Project website: https://developers.google.com/closure/compiler/ 71 | Developed by Google engineers 72 | Distributed under an Apache license: lib/LICENSE-closure-compiler 73 | 74 | Backbone 75 | Project website: http://backbonejs.org 76 | Developed by Jeremy Ashkenas, DocumentCloud Inc. 77 | Distributed under a MIT license: lib/LICENSE-backbone 78 | 79 | Underscore 80 | Project website: http://underscorejs.org 81 | Developed by Jeremy Ashkenas, DocumentCloud Inc. 82 | Distributed under a MIT license: lib/LICENSE-underscore 83 | 84 | jQuery 85 | Project website: http://jquery.com 86 | Developed by the jQuery Foundation 87 | Distributed under a MIT license: lib/LICENSE-jquery 88 | 89 | html5slider 90 | Project website: https://github.com/fryn/html5slider 91 | Developed by Frank Yan 92 | Distributed under an MIT license: http://opensource.org/licenses/MIT 93 | 94 | ------------------ 95 | ONE-TIME SETUP 96 | ------------------ 97 | Run the setup script to fetch the following tools, libraries, and prepare all javascript 98 | files needed by Termite. This script only needs to be run once when Termite is first 99 | downloaded onto a new machine. 100 | >> ./setup.sh 101 | 102 | Libraries fetched include: 103 | * mallet 104 | * stmt 105 | * closure-compiler.js 106 | * d3.v3.js 107 | * jquery.js 108 | * underscore.js 109 | * backbone.js 110 | * html5slider.js 111 | 112 | The script creates a minified version of all javascript files. 113 | 114 | ----------------------- 115 | BUILD A TOPIC MODEL 116 | ----------------------- 117 | Building a topic model in Termite requires running a single python script. The script reads 118 | in an input text corpus, and produces an output folder whose content can be inspected 119 | using a web browser, described in the next section. 120 | 121 | Customize configuration file with the following information. 
A sample configuration 122 | file can be found in 'example.cfg' 123 | [Corpus] path to text corpus 124 | [TopicModel] directory for holding topic model outputs 125 | [TopicModel] number of topics to train 126 | [TopicModel] topic model (either mallet or stmt) 127 | [Termite] number of terms to seriate 128 | [Termite] path to save Termite-internal working files 129 | 130 | Process the text corpus, and build a topic model by running the execution script. 131 | Execution time will vary depending on the size of the corpus. 132 | >> ./execute.py 133 | 134 | The execution script calls in order: 135 | 1. pipeline/tokenize.py Tokenize the text corpus 136 | 2. pipeline/train_mallet.py Train a topic model using MALLET 137 | pipeline/train_stmt.py or STMT. 138 | 3. pipeline/compute_saliency.py Compute term saliency 139 | 4. pipeline/compute_similarity.py Compute term similarity 140 | 5. pipeline/compute_seriation.py: Seriates terms 141 | 6. pipeline/prepare_data_for_client.py: Generates datafiles for client 142 | 7. pipeline/prepare_vis_for_client.py: Copies necessary scripts for client 143 | 144 | ---------------------------- 145 | VIEW TOPIC MODEL OUTPUTS 146 | ---------------------------- 147 | You are now ready to visualize the topic model outputs! Termite's output can be viewed in 148 | a web browser. To view the files locally (on your own computer), you need to set up a local 149 | web server. Alternatively, you may copy the output folder to a web server to publish the results. 150 | 151 | Termite outputs are stored in the 'public_html' subfolder within the output directory. 152 | 153 | To set up a local webserver: 154 | 1. Change into output directory (specified in the configuration file) 155 | >> cd /public_html 156 | 2. Start a local server using python 157 | >> ./web.sh 158 | 3. Open http://localhost:8888 in a modern web browser (Chrome, Safari, Firefox, or Opera) 159 | to view a visualization of the model outputs. 
160 | 161 | To publish the results on a webserver: 162 | 1. Copy public_html directory to your remote server. 163 | 164 | ----------------------------- 165 | TOPIC MODEL VISUALIZATION 166 | ----------------------------- 167 | The visualization should consist of a matrix view with a frequency bar view to the right. 168 | The top right contains user controls, two slider bars and one input text box. User 169 | controls are explained in more detail in the 'VISUALIZATION USE CASES' section. 170 | 171 | When the mouse is placed over terms, topics, circles, or bars in the matrix or frequency 172 | bar view, the associated term, topic, bar, and circles will be highlighted in both views. 173 | 174 | --------------------------- 175 | VISUALIZATION USE CASES 176 | --------------------------- 177 | The Termite visualization tool has a set of user controls in the top right section of the 178 | webpage. The user may specify 179 | * the number of terms with highest affinity to show 180 | * the number of terms with highest saliency to show 181 | * always display specific terms 182 | * always display 20 most frequent terms belonging to selected topics 183 | * click on a topic to select/color the topic 184 | * mouse over a topic/term to highlight the topic/term 185 | (Note: the highest affinity term set and highest saliency term set may contain overlapping words) 186 | -------------------------------------------------------------------------------- /execute.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import time 10 | import os 11 | from pipeline.tokenize import Tokenize 12 | from pipeline.import_mallet import ImportMallet 13 | from pipeline.import_stmt import ImportStmt 14 | from pipeline.compute_saliency import ComputeSaliency 15 | from pipeline.compute_similarity import ComputeSimilarity 16 | from 
class Execute( object ):

    """
    Runs the entire data processing pipeline and sets up the client.

    Executes the data processing scripts in order:
      1. tokenize.py: Tokenize corpus
      2. train_stmt.sh / train_mallet.sh: Train model
      3. compute_saliency.py: Compute term saliency
      4. compute_similarity.py: Compute term similarity
      5. compute_seriation.py: Seriates terms
      6. prepare_data_for_client.py: Generates datafiles for client
      7. prepare_vis_for_client.sh: Copies necessary scripts for client

    Input is a configuration file specifying the target corpus and the
    destination directory.  Creates multiple directories that store files
    from each stage of the pipeline, among them the public_html directory
    that holds all client files.
    """

    # Used when both the configuration file and the command line omit num_topics.
    DEFAULT_NUM_TOPICS = 25

    def __init__( self, logging_level ):
        self.logger = logging.getLogger( 'Execute' )
        self.logger.setLevel( logging_level )
        # Avoid stacking a duplicate stderr handler (and duplicated log
        # lines) when Execute is instantiated more than once per process.
        if not self.logger.handlers:
            handler = logging.StreamHandler( sys.stderr )
            handler.setLevel( logging_level )
            self.logger.addHandler( handler )

    def _log_time( self ):
        """Log the current wall-clock time; pipeline stages can be slow."""
        self.logger.info( 'Current time = {}'.format( time.ctime() ) )

    def _run_command( self, command ):
        """Run a shell command, logging a warning if it exits non-zero."""
        status = os.system( command )
        if status != 0:
            # The original ignored the exit status entirely: a failed
            # training run silently corrupted the later pipeline stages.
            self.logger.warning( 'Command exited with status %s: %s', status, command )

    def execute( self, corpus_format, corpus_path, tokenization, model_library, model_path, data_path, num_topics, number_of_seriated_terms ):
        """
        Run the full pipeline: tokenize, train (stmt or mallet), compute
        saliency/similarity/seriation, and prepare the client data files.
        Raises AssertionError when a required argument is missing.
        """
        assert corpus_format is not None
        assert corpus_path is not None
        assert model_library is not None
        assert model_library == 'stmt' or model_library == 'mallet'
        assert model_path is not None
        assert data_path is not None
        if num_topics is None:
            num_topics = Execute.DEFAULT_NUM_TOPICS
        assert number_of_seriated_terms is not None

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Tokenizing source corpus...' )
        self.logger.info( '    corpus_path = %s (%s)', corpus_path, corpus_format )
        self.logger.info( '    model_path = %s (%s)', model_path, model_library )
        self.logger.info( '    data_path = %s', data_path )
        self.logger.info( '    num_topics = %d', num_topics )
        self.logger.info( '    number_of_seriated_terms = %s', number_of_seriated_terms )
        self.logger.info( '--------------------------------------------------------------------------------' )
        self._log_time()

        Tokenize( self.logger.level ).execute( corpus_format, corpus_path, data_path, tokenization )
        self._log_time()

        tokens_path = data_path + '/tokens/tokens.txt'
        if model_library == 'stmt':
            self._run_command( 'pipeline/train_stmt.sh {} {} {}'.format( tokens_path, model_path, num_topics ) )
            ImportStmt( self.logger.level ).execute( model_library, model_path, data_path )
        if model_library == 'mallet':
            self._run_command( 'pipeline/train_mallet.sh {} {} {}'.format( tokens_path, model_path, num_topics ) )
            ImportMallet( self.logger.level ).execute( model_library, model_path, data_path )
        self._log_time()

        ComputeSaliency( self.logger.level ).execute( data_path )
        self._log_time()

        ComputeSimilarity( self.logger.level ).execute( data_path )
        self._log_time()

        ComputeSeriation( self.logger.level ).execute( data_path, number_of_seriated_terms )
        self._log_time()

        PrepareDataForClient( self.logger.level ).execute( data_path )
        self._log_time()

        self._run_command( 'pipeline/prepare_vis_for_client.sh {}'.format( data_path ) )
        self._log_time()
def main():
    """Parse CLI arguments and the configuration file, then run the pipeline.

    CLI options override the corresponding configuration-file values.
    """
    parser = argparse.ArgumentParser( description = 'Prepare data for Termite.' )
    parser.add_argument( 'config_file'     , type = str, help = 'Termite configuration file.' )
    parser.add_argument( '--corpus-format' , type = str, dest = 'corpus_format', help = 'Override corpus format in the config file.' )
    parser.add_argument( '--corpus-path'   , type = str, dest = 'corpus_path'  , help = 'Override corpus path in the config file.' )
    parser.add_argument( '--model-library' , type = str, dest = 'model_library', help = 'Override model library in the config file.' )
    parser.add_argument( '--model-path'    , type = str, dest = 'model_path'   , help = 'Override model path in the config file.' )
    # BUG FIX: the flag was misspelled '--num-topcis'.  The correct spelling
    # is now primary; the old spelling is kept as an alias for compatibility.
    parser.add_argument( '--num-topics', '--num-topcis', type = int, dest = 'num_topics', help = 'Override number of topics in the config file.' )
    parser.add_argument( '--data-path'     , type = str, dest = 'data_path'    , help = 'Override data path in the config file.' )
    parser.add_argument( '--number-of-seriated-terms', type = int, dest = 'number_of_seriated_terms', help = 'Override the number of terms to seriate.' )
    parser.add_argument( '--logging'       , type = int, dest = 'logging'      , help = 'Override logging level specified in config file.' )
    args = parser.parse_args()

    corpus_format = None
    corpus_path = None
    # BUG FIX: tokenization was previously unbound (NameError at the
    # Execute(...) call) when the config file had no Corpus/tokenization
    # option, since there is no CLI override for it.
    tokenization = None
    model_library = None
    model_path = None
    data_path = None
    num_topics = None
    number_of_seriated_terms = None
    logging_level = 20

    # Read in default values from the configuration file
    config = ConfigParser.RawConfigParser()
    config.read( args.config_file )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'format' ):
        corpus_format = config.get( 'Corpus', 'format' )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'path' ):
        corpus_path = config.get( 'Corpus', 'path' )
    if config.has_section( 'Corpus' ) and config.has_option( 'Corpus', 'tokenization' ):
        tokenization = config.get( 'Corpus', 'tokenization' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'library' ):
        model_library = config.get( 'TopicModel', 'library' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'path' ):
        model_path = config.get( 'TopicModel', 'path' )
    if config.has_section( 'TopicModel' ) and config.has_option( 'TopicModel', 'num_topics' ):
        num_topics = config.getint( 'TopicModel', 'num_topics' )
    if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ):
        data_path = config.get( 'Termite', 'path' )
    if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'number_of_seriated_terms' ):
        number_of_seriated_terms = config.getint( 'Termite', 'number_of_seriated_terms' )
    if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ):
        logging_level = config.getint( 'Misc', 'logging' )

    # Read in user-specified values from the program arguments
    if args.corpus_format is not None:
        corpus_format = args.corpus_format
    if args.corpus_path is not None:
        corpus_path = args.corpus_path
    if args.model_library is not None:
        model_library = args.model_library
    if args.model_path is not None:
        model_path = args.model_path
    if args.num_topics is not None:
        num_topics = args.num_topics
    if args.data_path is not None:
        data_path = args.data_path
    if args.number_of_seriated_terms is not None:
        number_of_seriated_terms = args.number_of_seriated_terms
    if args.logging is not None:
        logging_level = args.logging

    Execute( logging_level ).execute( corpus_format, corpus_path, tokenization, model_library, model_path, data_path, num_topics, number_of_seriated_terms )
class DocumentsAPI( object ):
    """Loads the raw corpus: a file of docID<TAB>docContent rows, UTF-8 encoded."""

    # Only flat-file corpora are supported at present.
    ACCEPTABLE_FORMATS = frozenset( [ 'file' ] )

    def __init__( self, format, path ):
        assert format in DocumentsAPI.ACCEPTABLE_FORMATS
        self.format = format
        self.path = path
        # BUG FIX: was initialized as a list although read() fills a dict;
        # start with an empty dict so the attribute type is stable.
        self.data = {}

    def read( self ):
        """Read the corpus file into self.data as { docID : docContent }."""
        self.data = {}
        with open( self.path, 'r' ) as f:
            lines = f.read().decode( 'utf-8', 'ignore' ).splitlines()
        for line in lines:
            # Split only on the first tab so document text may itself
            # contain tabs (the original raised ValueError in that case).
            docID, docContent = line.split( '\t', 1 )
            self.data[ docID ] = docContent
class TokensAPI( object ):
    """Reads and writes the tokenized corpus, one tab-delimited row per document."""

    SUBFOLDER = 'tokens'
    TOKENS = 'tokens.txt'

    def __init__( self, path ):
        # All token files live in <path>/tokens/
        self.path = '{}/{}/'.format( path, TokensAPI.SUBFOLDER )
        self.data = {}

    def read( self ):
        """Load docID -> token-list mappings from tokens.txt."""
        self.data = {}
        filename = self.path + TokensAPI.TOKENS
        with open( filename, 'r' ) as f:
            for ( docID, docTokens ) in UnicodeReader( f ):
                self.data[ docID ] = docTokens.split( ' ' )

    def write( self ):
        """Write docID -> token-list mappings, creating the folder if needed."""
        CheckAndMakeDirs( self.path )
        filename = self.path + TokensAPI.TOKENS
        with open( filename, 'w' ) as f:
            writer = UnicodeWriter( f )
            for ( docID, docTokens ) in self.data.iteritems():
                writer.writerow( [ docID, ' '.join( docTokens ) ] )

class ModelAPI( object ):
    """Reads, verifies, and writes the raw topic model: topic/term indices plus
    the term-by-topic matrix."""

    SUBFOLDER = 'model'
    TOPIC_INDEX = 'topic-index.txt'
    TERM_INDEX = 'term-index.txt'
    TERM_TOPIC_MATRIX = 'term-topic-matrix.txt'

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, ModelAPI.SUBFOLDER )
        self.topic_index = []        # one label per topic
        self.term_index = []         # one string per term
        self.topic_count = 0
        self.term_count = 0
        self.term_topic_matrix = []  # term_count rows x topic_count columns

    def read( self ):
        """Load indices and matrix from disk, then sanity-check their shapes."""
        self.topic_index = ReadAsList( self.path + ModelAPI.TOPIC_INDEX )
        self.term_index = ReadAsList( self.path + ModelAPI.TERM_INDEX )
        self.term_topic_matrix = ReadAsMatrix( self.path + ModelAPI.TERM_TOPIC_MATRIX )
        self.verify()

    def verify( self ):
        """Recompute counts and assert the matrix dimensions match the indices."""
        self.topic_count = len( self.topic_index )
        self.term_count = len( self.term_index )
        assert len( self.term_topic_matrix ) == self.term_count
        for row in self.term_topic_matrix:
            assert len( row ) == self.topic_count

    def write( self ):
        """Verify, then persist indices and matrix, creating the folder if needed."""
        self.verify()
        CheckAndMakeDirs( self.path )
        WriteAsList( self.topic_index, self.path + ModelAPI.TOPIC_INDEX )
        WriteAsList( self.term_index, self.path + ModelAPI.TERM_INDEX )
        WriteAsMatrix( self.term_topic_matrix, self.path + ModelAPI.TERM_TOPIC_MATRIX )
class SaliencyAPI( object ):
    """Reads and writes per-term and per-topic saliency data (JSON + tab-delimited)."""

    SUBFOLDER = 'saliency'
    TOPIC_WEIGHTS = 'topic-info.json'
    TOPIC_WEIGHTS_TXT = 'topic-info.txt'
    # BUG FIX: the two *_FIELDS constants previously had swapped contents
    # relative to their names (the term fields were stored under
    # TOPIC_WEIGHTS_FIELDS and vice versa).  Both the values and their uses
    # in write() are swapped here, so the files produced are unchanged.
    TOPIC_WEIGHTS_FIELDS = [ 'topic', 'weight' ]
    TERM_SALIENCY = 'term-info.json'
    TERM_SALIENCY_TXT = 'term-info.txt'
    TERM_SALIENCY_FIELDS = [ 'term', 'saliency', 'frequency', 'distinctiveness', 'rank', 'visibility' ]

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, SaliencyAPI.SUBFOLDER )
        self.term_info = {}    # per-term saliency statistics
        self.topic_info = {}   # per-topic weights

    def read( self ):
        """Load term and topic info from their JSON files."""
        self.term_info = ReadAsJson( self.path + SaliencyAPI.TERM_SALIENCY )
        self.topic_info = ReadAsJson( self.path + SaliencyAPI.TOPIC_WEIGHTS )

    def write( self ):
        """Write term and topic info as JSON plus tab-delimited text."""
        CheckAndMakeDirs( self.path )
        WriteAsJson( self.term_info, self.path + SaliencyAPI.TERM_SALIENCY )
        WriteAsTabDelimited( self.term_info, self.path + SaliencyAPI.TERM_SALIENCY_TXT, SaliencyAPI.TERM_SALIENCY_FIELDS )
        WriteAsJson( self.topic_info, self.path + SaliencyAPI.TOPIC_WEIGHTS )
        WriteAsTabDelimited( self.topic_info, self.path + SaliencyAPI.TOPIC_WEIGHTS_TXT, SaliencyAPI.TOPIC_WEIGHTS_FIELDS )

class SimilarityAPI( object ):
    """Reads and writes term similarity statistics.

    Only the combined G2 matrix is currently read and written; the other
    intermediate statistics (occurrence/co-occurrence counts and the
    per-source G2 matrices) are computed upstream but not persisted here.
    """

    SUBFOLDER = 'similarity'
    DOCUMENT_OCCURRENCE = 'document-occurrence.txt'
    DOCUMENT_COOCCURRENCE = 'document-cooccurrence.txt'
    WINDOW_OCCURRENCE = 'window-occurrence.txt'
    WINDOW_COOCCURRENCE = 'window-cooccurrence.txt'
    UNIGRAM_COUNTS = 'unigram-counts.txt'
    BIGRAM_COUNTS = 'bigram-counts.txt'
    DOCUMENT_G2 = 'document-g2.txt'
    WINDOW_G2 = 'window-g2.txt'
    # NOTE: constant name is a historical typo for COLLOCATION_G2;
    # kept unchanged for compatibility with any external references.
    COLLOCATAPIN_G2 = 'collocation-g2.txt'
    COMBINED_G2 = 'combined-g2.txt'

    def __init__( self, path ):
        self.path = '{}/{}/'.format( path, SimilarityAPI.SUBFOLDER )
        self.document_occurrence = {}
        self.document_cooccurrence = {}
        self.window_occurrence = {}
        self.window_cooccurrence = {}
        self.unigram_counts = {}
        self.bigram_counts = {}
        self.document_g2 = {}
        self.window_g2 = {}
        # BUG FIX: this attribute was misspelled 'collcation_g2', so the
        # read/write code referencing 'collocation_g2' would have hit an
        # uninitialized attribute.
        self.collocation_g2 = {}
        self.combined_g2 = {}

    def read( self ):
        """Load the combined G2 matrix (the only statistic persisted)."""
        self.combined_g2 = ReadAsSparseMatrix( self.path + SimilarityAPI.COMBINED_G2 )

    def write( self ):
        """Write the combined G2 matrix, creating the folder if needed."""
        CheckAndMakeDirs( self.path )
        WriteAsSparseMatrix( self.combined_g2, self.path + SimilarityAPI.COMBINED_G2 )
SimilarityAPI.COMBINED_G2 ) 163 | 164 | class SeriationAPI( object ): 165 | SUBFOLDER = 'seriation' 166 | TERM_ORDERING = 'term-ordering.txt' 167 | TERM_ITER_INDEX = 'term-iter-index.txt' 168 | 169 | def __init__( self, path ): 170 | self.path = '{}/{}/'.format( path, SeriationAPI.SUBFOLDER ) 171 | self.term_ordering = [] 172 | self.term_iter_index = [] 173 | 174 | def read( self ): 175 | self.term_ordering = ReadAsList( self.path + SeriationAPI.TERM_ORDERING ) 176 | self.term_iter_index = ReadAsList( self.path + SeriationAPI.TERM_ITER_INDEX ) 177 | 178 | def write( self ): 179 | CheckAndMakeDirs( self.path ) 180 | WriteAsList( self.term_ordering, self.path + SeriationAPI.TERM_ORDERING ) 181 | WriteAsList( self.term_iter_index, self.path + SeriationAPI.TERM_ITER_INDEX ) 182 | 183 | class ClientAPI( object ): 184 | SUBFOLDER = 'public_html/data' 185 | SERIATED_PARAMETERS = 'seriated-parameters.json' 186 | FILTERED_PARAMETERS = 'filtered-parameters.json' 187 | GLOBAL_TERM_FREQS = 'global-term-freqs.json' 188 | 189 | def __init__( self, path ): 190 | self.path = '{}/{}/'.format( path, ClientAPI.SUBFOLDER ) 191 | self.seriated_parameters = {} 192 | self.filtered_parameters = {} 193 | self.global_term_freqs = {} 194 | 195 | def read( self ): 196 | self.seriated_parameters = ReadAsJson( self.path + ClientAPI.SERIATED_PARAMETERS ) 197 | self.filtered_parameters = ReadAsJson( self.path + ClientAPI.FILTERED_PARAMETERS ) 198 | self.global_term_freqs = ReadAsJson( self.path + ClientAPI.GLOBAL_TERM_FREQS ) 199 | 200 | def write( self ): 201 | CheckAndMakeDirs( self.path ) 202 | WriteAsJson( self.seriated_parameters, self.path + ClientAPI.SERIATED_PARAMETERS ) 203 | WriteAsJson( self.filtered_parameters, self.path + ClientAPI.FILTERED_PARAMETERS ) 204 | WriteAsJson( self.global_term_freqs, self.path + ClientAPI.GLOBAL_TERM_FREQS ) 205 | -------------------------------------------------------------------------------- /client-src/html5slider.js: 
--------------------------------------------------------------------------------
/*
html5slider - a JS implementation of <input type=range> for Firefox 16 and up
https://github.com/fryn/html5slider

Copyright (c) 2010-2012 Frank Yan, <http://frankyan.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// NOTE(review): vendored third-party polyfill (MIT).  Only comments were
// added during review; the executable code is unchanged.

(function() {

// test for native support
var test = document.createElement('input');
try {
  test.type = 'range';
  if (test.type == 'range')
    return;
} catch (e) {
  return;
}

// test for required property support
// (Gecko-only: relies on -moz-appearance, mozSetImageElement, MutationObserver)
test.style.background = 'linear-gradient(red, red)';
if (!test.style.backgroundImage || !('MozAppearance' in test.style) ||
    !document.mozSetImageElement || !this.MutationObserver)
  return;

var scale;
var isMac = navigator.platform == 'MacIntel';
// platform-specific thumb metrics used for hit-testing and sizing
var thumb = {
  radius: isMac ? 9 : 6,
  width: isMac ? 22 : 12,
  height: isMac ? 16 : 20
};
// the track is drawn entirely with a CSS gradient background
var track = 'linear-gradient(transparent ' + (isMac ?
  '6px, #999 6px, #999 7px, #ccc 8px, #bbb 9px, #bbb 10px, transparent 10px' :
  '9px, #999 9px, #bbb 10px, #fff 11px, transparent 11px') +
  ', transparent)';
var styles = {
  'min-width': thumb.width + 'px',
  'min-height': thumb.height + 'px',
  'max-height': thumb.height + 'px',
  padding: '0 0 ' + (isMac ? '2px' : '1px'),
  border: 0,
  'border-radius': 0,
  cursor: 'default',
  'text-indent': '-999999px' // -moz-user-select: none; breaks mouse capture
};
// MutationObserver config: watch only the attributes that affect rendering
var options = {
  attributes: true,
  attributeFilter: ['min', 'max', 'step', 'value']
};
var forEach = Array.prototype.forEach;
var onChange = document.createEvent('HTMLEvents');
onChange.initEvent('change', true, false);

if (document.readyState == 'loading')
  document.addEventListener('DOMContentLoaded', initialize, true);
else
  initialize();

function initialize() {
  // create initial sliders
  forEach.call(document.querySelectorAll('input[type=range]'), transform);
  // create sliders on-the-fly
  new MutationObserver(function(mutations) {
    mutations.forEach(function(mutation) {
      if (mutation.addedNodes)
        forEach.call(mutation.addedNodes, function(node) {
          check(node);
          if (node.childElementCount)
            forEach.call(node.querySelectorAll('input'), check);
        });
    });
  }).observe(document, { childList: true, subtree: true });
}

// transform an input if it is declared type=range but fell back to type=text
function check(input) {
  if (input.localName == 'input' && input.type != 'range' &&
      input.getAttribute('type') == 'range')
    transform(input);
}

// turn one <input> into a functional slider (styling, value emulation,
// attribute syncing, and mouse/keyboard interaction)
function transform(slider) {

  var isValueSet, areAttrsSet, isChanged, isClick, prevValue, rawValue, prevX;
  var min, max, step, range, value = slider.value;

  // lazily create shared slider affordance
  if (!scale) {
    scale = document.body.appendChild(document.createElement('hr'));
    style(scale, {
      '-moz-appearance': isMac ? 'scale-horizontal' : 'scalethumb-horizontal',
      display: 'block',
      visibility: 'visible',
      opacity: 1,
      position: 'fixed',
      top: '-999999px'
    });
    // registers the off-screen element as the -moz-element() thumb image
    document.mozSetImageElement('__sliderthumb__', scale);
  }

  // reimplement value and type properties
  // (the getter/setter pair is deliberately re-installed inside setValue
  // because the `delete`/assign dance clears the accessors)
  var getValue = function() { return '' + value; };
  var setValue = function setValue(val) {
    value = '' + val;
    isValueSet = true;
    draw();
    delete slider.value;
    slider.value = value;
    slider.__defineGetter__('value', getValue);
    slider.__defineSetter__('value', setValue);
  };
  slider.__defineGetter__('value', getValue);
  slider.__defineSetter__('value', setValue);
  slider.__defineGetter__('type', function() { return 'range'; });

  // sync properties with attributes
  ['min', 'max', 'step'].forEach(function(prop) {
    if (slider.hasAttribute(prop))
      areAttrsSet = true;
    slider.__defineGetter__(prop, function() {
      return this.hasAttribute(prop) ? this.getAttribute(prop) : '';
    });
    slider.__defineSetter__(prop, function(val) {
      val === null ? this.removeAttribute(prop) : this.setAttribute(prop, val);
    });
  });

  // initialize slider
  slider.readOnly = true;
  style(slider, styles);
  update();

  new MutationObserver(function(mutations) {
    mutations.forEach(function(mutation) {
      if (mutation.attributeName != 'value') {
        update();
        areAttrsSet = true;
      }
      // note that value attribute only sets initial value
      else if (!isValueSet) {
        value = slider.getAttribute('value');
        draw();
      }
    });
  }).observe(slider, options);

  slider.addEventListener('mousedown', onDragStart, true);
  slider.addEventListener('keydown', onKeyDown, true);
  slider.addEventListener('focus', onFocus, true);
  slider.addEventListener('blur', onBlur, true);

  function onDragStart(e) {
    isClick = true;
    setTimeout(function() { isClick = false; }, 0);
    if (e.button || !range)
      return;
    var width = parseFloat(getComputedStyle(this, 0).width);
    var multiplier = (width - thumb.width) / range;
    if (!multiplier)
      return;
    // distance between click and center of thumb
    var dev = e.clientX - this.getBoundingClientRect().left - thumb.width / 2 -
              (value - min) * multiplier;
    // if click was not on thumb, move thumb to click location
    if (Math.abs(dev) > thumb.radius) {
      isChanged = true;
      // `-= -x` (rather than `+= x`) forces numeric coercion of the value
      this.value -= -dev / multiplier;
    }
    rawValue = value;
    prevX = e.clientX;
    this.addEventListener('mousemove', onDrag, true);
    this.addEventListener('mouseup', onDragEnd, true);
  }

  function onDrag(e) {
    var width = parseFloat(getComputedStyle(this, 0).width);
    var multiplier = (width - thumb.width) / range;
    if (!multiplier)
      return;
    // track the un-snapped value so dragging accumulates sub-step motion
    rawValue += (e.clientX - prevX) / multiplier;
    prevX = e.clientX;
    isChanged = true;
    this.value = rawValue;
  }

  function onDragEnd() {
    this.removeEventListener('mousemove', onDrag, true);
    this.removeEventListener('mouseup', onDragEnd, true);
  }

  function onKeyDown(e) {
    if (e.keyCode > 36 && e.keyCode < 41) { // 37-40: left, up, right, down
      onFocus.call(this);
      isChanged = true;
      this.value = value + (e.keyCode == 38 || e.keyCode == 39 ? step : -step);
    }
  }

  function onFocus() {
    if (!isClick)
      this.style.boxShadow = !isMac ? '0 0 0 2px #fb0' :
        'inset 0 0 20px rgba(0,127,255,.1), 0 0 1px rgba(0,127,255,.4)';
  }

  function onBlur() {
    this.style.boxShadow = '';
  }

  // determines whether value is valid number in attribute form
  function isAttrNum(value) {
    return !isNaN(value) && +value == parseFloat(value);
  }

  // validates min, max, and step attributes and redraws
  function update() {
    min = isAttrNum(slider.min) ? +slider.min : 0;
    max = isAttrNum(slider.max) ? +slider.max : 100;
    if (max < min)
      max = min > 100 ? min : 100;
    step = isAttrNum(slider.step) && slider.step > 0 ? +slider.step : 1;
    range = max - min;
    draw(true);
  }

  // recalculates value property
  function calc() {
    if (!isValueSet && !areAttrsSet)
      value = slider.getAttribute('value');
    if (!isAttrNum(value))
      value = (min + max) / 2;; // (stray second semicolon kept verbatim)
    // snap to step intervals (WebKit sometimes does not - bug?)
    value = Math.round((value - min) / step) * step + min;
    if (value < min)
      value = min;
    else if (value > max)
      value = min + ~~(range / step) * step;
  }

  // renders slider using CSS background ;)
  function draw(attrsModified) {
    calc();
    // fire 'change' only when user interaction produced a new value
    if (isChanged && value != prevValue)
      slider.dispatchEvent(onChange);
    isChanged = false;
    if (!attrsModified && value == prevValue)
      return;
    prevValue = value;
    var position = range ? (value - min) / range * 100 : 0;
    var bg = '-moz-element(#__sliderthumb__) ' + position + '% no-repeat, ';
    style(slider, { background: bg + track });
  }

}

// apply a map of CSS properties with !important priority
function style(element, styles) {
  for (var prop in styles)
    element.style.setProperty(prop, styles[prop], 'important');
}

})();
--------------------------------------------------------------------------------
/client-src/FilteredTermTopicProbabilityModel.js:
--------------------------------------------------------------------------------
/*
FilteredTermTopicProbabilityModel.js

This model is responsible for modifying data based on user inputs/controls
Current user control changes:
	-number of terms to show based on BEA choice order
	-number of terms to show based on saliency score (desc order)
	-specific terms to always show in the list of terms
	-whether or not to add top "twenty" terms of selected topics
	-sorting

Details:
--------
Pulls data from SeriatedTermTopicProbabilityModel on initialize.
Afterwards, this model is called when the user controls on the website are changed.
At that time, the new "user defined" state is passed to the update function.
17 | */ 18 | 19 | var FilteredTermTopicProbabilityModel = Backbone.Model.extend({ 20 | defaults : { 21 | "matrix" : null, 22 | "termIndex" : null, 23 | "topicIndex" : null, 24 | "sparseMatrix" : null 25 | }, 26 | url : "data/filtered-parameters.json", 27 | initialize : function() { 28 | this.stateModel = null; 29 | this.parentModel = null; 30 | 31 | // mappings 32 | this.termRankMap = null; 33 | this.termOrderMap = null; 34 | this.rowIndexMap = null; 35 | this.termDistinctivenessMap = null; 36 | this.termSaliencyList = []; 37 | 38 | // interaction related variables 39 | this.selectedTopics = {}; 40 | this.visibleTopTerms = {}; 41 | } 42 | }); 43 | 44 | /** 45 | * Initialize filtered's parent and state model 46 | * 47 | * @private 48 | */ 49 | FilteredTermTopicProbabilityModel.prototype.initModel = function( model, state ){ 50 | this.parentModel = model; 51 | this.stateModel = state; 52 | }; 53 | 54 | /** 55 | * Initialize all topics' selection status to false (called once by load) 56 | * 57 | * @private 58 | */ 59 | FilteredTermTopicProbabilityModel.prototype.defaultSelection = function(){ 60 | var topicIndex = this.parentModel.get("topicIndex"); 61 | for( var i = 0; i < topicIndex.length; i++ ){ 62 | this.selectedTopics[i] = false; 63 | } 64 | }; 65 | 66 | /** 67 | * Loads various mappings from the model's "url" 68 | * and triggers a loaded event that the next model (child model) listens to. 
69 | * (This function is called after the seriated model loaded event is fired) 70 | * 71 | * @param { string } the location of datafile to load values from 72 | * @return { void } 73 | */ 74 | FilteredTermTopicProbabilityModel.prototype.load = function() { 75 | var initRowIndexMap = function( termIndex ){ 76 | this.rowIndexMap = {}; 77 | for ( var i = 0; i < termIndex.length; i++ ){ 78 | this.rowIndexMap[termIndex[i]] = i; 79 | } 80 | }.bind(this); 81 | 82 | var initTermSaliencyList = function( saliencyMap ){ 83 | termSaliencyList = []; 84 | tempList = []; 85 | for ( var term in saliencyMap ){ 86 | tempList.push([term, saliencyMap[term]]); 87 | } 88 | tempList.sort(function(a, b) {return b[1] - a[1]}); 89 | for( var i = 0; i < tempList.length; i++ ){ 90 | this.termSaliencyList.push(tempList[i][0]); 91 | } 92 | }.bind(this); 93 | 94 | var successHandler = function( model, response, options ) 95 | { 96 | var keepQuiet = false; 97 | this.termRankMap = response.termRankMap; 98 | this.termOrderMap = response.termOrderMap; 99 | this.termDistinctivenessMap = response.termDistinctivenessMap; 100 | initRowIndexMap( this.parentModel.get("termIndex") ); 101 | initTermSaliencyList( response.termSaliencyMap ); 102 | 103 | this.initTopTermLists(); 104 | this.defaultSelection(); 105 | this.filter( keepQuiet ); 106 | 107 | var coloredTopics = this.stateModel.get("selectedTopics"); 108 | for( var obj in coloredTopics){ 109 | claimColor( coloredTopics[obj] ); 110 | this.selectTopic({"topic": obj, "color": coloredTopics[obj]} ); 111 | } 112 | 113 | this.trigger('loaded:filtered'); 114 | 115 | }.bind(this); 116 | var errorHandler = function( model, xhr, options ) { }.bind(this); 117 | this.fetch({ 118 | add : false, 119 | success : successHandler, 120 | error : errorHandler 121 | }); 122 | }; 123 | 124 | /** 125 | * Generates list of top twenty terms per topic in original topicIndex (called in load) 126 | * 127 | * @private 128 | */ 129 | 
FilteredTermTopicProbabilityModel.prototype.initTopTermLists = function() { 130 | var termIndex = this.parentModel.get("termIndex"); 131 | var topicIndex = this.parentModel.get("topicIndex"); 132 | 133 | var colFirstMatrix = generateColumnFirst(this.parentModel.get("matrix")); 134 | 135 | var termsPerTopic = 20; 136 | this.topTermLists = {}; 137 | for( var i = 0; i < topicIndex.length; i++){ 138 | this.topTermLists[i] = []; 139 | 140 | // get term freqs for this topic 141 | var topicalFrequencies = colFirstMatrix[i]; 142 | 143 | // sort the terms by topical frequency 144 | var indices = new Array(termIndex.length); 145 | for(var j = 0; j < termIndex.length; j++) 146 | indices[j] = j; 147 | indices.sort(function (a, b) { return topicalFrequencies[a] < topicalFrequencies[b] ? 1 : topicalFrequencies[a] > topicalFrequencies[b] ? -1 : 0; }); 148 | 149 | // take the top 20 (unless there are fewer than 20) 150 | var count = 0; 151 | while(count < 20 && indices[count] > THRESHHOLD){ 152 | this.topTermLists[i].push(termIndex[indices[count]]); 153 | count++; 154 | } 155 | } 156 | }; 157 | 158 | /** 159 | * Calls appropriate functions to update based on data change(s) 160 | */ 161 | FilteredTermTopicProbabilityModel.prototype.update = function( obj ) 162 | { 163 | this.filter( false ); 164 | }; 165 | 166 | /** 167 | * adds top twenty term list of selected topics to the visibleTopTerms list 168 | * 169 | * @private 170 | */ 171 | FilteredTermTopicProbabilityModel.prototype.addTopTerms = function() { 172 | for( var obj in this.selectedTopics){ 173 | if(this.selectedTopics[obj]) 174 | this.visibleTopTerms[obj] = this.topTermLists[obj]; 175 | } 176 | }; 177 | 178 | /** 179 | * Refreshes the termIndex and ordering based on user changes 180 | * 181 | * @param { boolean } determines whether certain "set"s should trigger change events 182 | * @return { void } 183 | */ 184 | FilteredTermTopicProbabilityModel.prototype.filter = function( keepQuiet ) { 185 | var original_submatrix = 
this.parentModel.get("matrix"); 186 | var original_termIndex = this.parentModel.get("termIndex"); 187 | var original_topicIndex = this.parentModel.get("topicIndex"); 188 | 189 | var userDefinedTerms = this.stateModel.get("visibleTerms").slice(0); 190 | if(this.stateModel.get("addTopTwenty")) 191 | this.addTopTerms(); 192 | else 193 | this.visibleTopTerms = {}; 194 | 195 | var affinityLimit = this.stateModel.get("numAffinityTerms"); 196 | var saliencyLimit = this.stateModel.get("numSalientTerms"); 197 | 198 | var foundTerms = []; 199 | var subset = []; 200 | // choose terms to keep 201 | var chooseTerm = function( term ){ 202 | if( userDefinedTerms.indexOf( term ) >= 0 ){ 203 | foundTerms.push(term); 204 | return true; 205 | } 206 | if( this.termRankMap[term] < affinityLimit ){ 207 | return true; 208 | } 209 | if( this.termSaliencyList.indexOf( term ) >= 0 && this.termSaliencyList.indexOf( term ) < saliencyLimit ){ 210 | return true; 211 | } 212 | for(var topicNo in this.visibleTopTerms){ 213 | if( this.visibleTopTerms[topicNo].indexOf( term ) >= 0 ) 214 | return true; 215 | } 216 | return false; 217 | }.bind(this); 218 | 219 | // sort the terms 220 | var sortType = this.stateModel.get("sortType"); 221 | for ( var i = 0; i < original_termIndex.length; i++ ){ 222 | var term = original_termIndex[i]; 223 | if( chooseTerm( term ) ){ 224 | if(sortType === "") 225 | subset.push( [term, this.termOrderMap[ term ]] ); 226 | else if( sortType === "desc") { 227 | var topic = this.stateModel.get("doubleClickTopic"); 228 | subset.push( [term, 1 / (original_submatrix[this.rowIndexMap[term]][topic]*this.termDistinctivenessMap[term])]); 229 | } 230 | else if( sortType === "asc") { 231 | var topic = this.stateModel.get("doubleClickTopic"); 232 | subset.push( [term, original_submatrix[this.rowIndexMap[term]][topic]*this.termDistinctivenessMap[term]]); 233 | } 234 | } 235 | } 236 | // find out which user defined terms were found in the dataset 237 | for( var i = 0; i < 
foundTerms.length; i++){ 238 | userDefinedTerms.splice(userDefinedTerms.indexOf(foundTerms[i]),1); 239 | } 240 | subset.sort(function(a, b) {return a[1] - b[1]}); 241 | 242 | // update model and state attributes 243 | matrix = []; 244 | termIndex = [] 245 | for(var j = 0; j < subset.length; j++){ 246 | var term = subset[j][0]; 247 | termIndex.push(term); 248 | matrix.push(original_submatrix[this.rowIndexMap[term]]); 249 | } 250 | this.set("topicIndex", original_topicIndex, { silent: keepQuiet } ); 251 | this.set("termIndex", termIndex, { silent: keepQuiet } ); 252 | this.set("matrix", matrix, { silent: keepQuiet} ); 253 | this.set("sparseMatrix", generateSparseMatrix.bind(this)(), {silent: keepQuiet}); 254 | 255 | this.stateModel.setFoundTerms(foundTerms, keepQuiet); 256 | this.stateModel.setUnfoundTerms(userDefinedTerms, keepQuiet); 257 | this.stateModel.set("totalTerms", termIndex.length); 258 | }; 259 | 260 | /** 261 | * Behavior when topic is selected 262 | * 263 | * @this { FilteredTermTopicProbabilityModel } 264 | * @param { object } topic: target topic index, color: associated color 265 | * @return { void } 266 | */ 267 | FilteredTermTopicProbabilityModel.prototype.selectTopic = function( obj ) { 268 | var topic = obj.topic; 269 | var colorClass = obj.color; 270 | var topicIndex = this.parentModel.get("topicIndex"); 271 | if( topic !== null){ 272 | 273 | // if color is DEFAULT, the event can be treated as a deselect 274 | if( colorClass === DEFAULT){ 275 | if(this.selectedTopics[topic]){ 276 | delete this.visibleTopTerms[topic]; 277 | this.selectedTopics[topic] = false; 278 | this.filter( false ); 279 | } 280 | return; 281 | } 282 | 283 | // only add if this topic wasn't added previously 284 | if(this.selectedTopics[topic] === false) { 285 | this.selectedTopics[topic] = true; 286 | this.filter( false ); 287 | } 288 | } 289 | }; -------------------------------------------------------------------------------- /pipeline/compute_similarity.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import math 10 | import itertools 11 | from api_utils import TokensAPI, SimilarityAPI 12 | 13 | class ComputeSimilarity( object ): 14 | """ 15 | Similarity measures. 16 | 17 | Compute term similarity based on co-occurrence and 18 | collocation likelihoods. 19 | """ 20 | 21 | DEFAULT_SLIDING_WINDOW_SIZE = 10 22 | MAX_FREQ = 100.0 23 | 24 | def __init__( self, logging_level ): 25 | self.logger = logging.getLogger( 'ComputeSimilarity' ) 26 | self.logger.setLevel( logging_level ) 27 | handler = logging.StreamHandler( sys.stderr ) 28 | handler.setLevel( logging_level ) 29 | self.logger.addHandler( handler ) 30 | 31 | def execute( self, data_path, sliding_window_size = None ): 32 | 33 | assert data_path is not None 34 | if sliding_window_size is None: 35 | sliding_window_size = ComputeSimilarity.DEFAULT_SLIDING_WINDOW_SIZE 36 | 37 | self.logger.info( '--------------------------------------------------------------------------------' ) 38 | self.logger.info( 'Computing term similarity...' ) 39 | self.logger.info( ' data_path = %s', data_path ) 40 | self.logger.info( ' sliding_window_size = %d', sliding_window_size ) 41 | 42 | self.logger.info( 'Connecting to data...' ) 43 | self.tokens = TokensAPI( data_path ) 44 | self.similarity = SimilarityAPI( data_path ) 45 | 46 | self.logger.info( 'Reading data from disk...' ) 47 | self.tokens.read() 48 | 49 | self.logger.info( 'Computing document co-occurrence...' ) 50 | self.computeDocumentCooccurrence() 51 | 52 | self.logger.info( 'Computing sliding-window co-occurrence...' ) 53 | self.computeSlidingWindowCooccurrence( sliding_window_size ) 54 | 55 | self.logger.info( 'Counting total number of tokens, unigrams, and bigrams in the corpus...' 
) 56 | self.computeTokenCounts() 57 | 58 | self.logger.info( 'Computing document co-occurrence likelihood...' ) 59 | self.similarity.document_g2 = self.getG2Stats( self.document_count, self.similarity.document_occurrence, self.similarity.document_cooccurrence ) 60 | 61 | self.logger.info( 'Computing sliding-window co-occurrence likelihood...' ) 62 | self.similarity.window_g2 = self.getG2Stats( self.window_count, self.similarity.window_occurrence, self.similarity.window_cooccurrence ) 63 | 64 | self.logger.info( 'Computing collocation likelihood...' ) 65 | self.similarity.collocation_g2 = self.getG2Stats( self.token_count, self.similarity.unigram_counts, self.similarity.bigram_counts ) 66 | 67 | self.combineSimilarityMatrices() 68 | 69 | self.logger.info( 'Writing data to disk...' ) 70 | self.similarity.write() 71 | 72 | self.logger.info( '--------------------------------------------------------------------------------' ) 73 | 74 | def incrementCount( self, occurrence, key ): 75 | if key not in occurrence: 76 | occurrence[ key ] = 1 77 | else: 78 | occurrence[ key ] += 1 79 | 80 | def computeDocumentCooccurrence( self ): 81 | document_count = 0 82 | occurrence = {} 83 | cooccurrence = {} 84 | for docID, docTokens in self.tokens.data.iteritems(): 85 | self.logger.debug( ' %s (%d tokens)', docID, len(docTokens) ) 86 | tokenSet = frozenset(docTokens) 87 | document_count += 1 88 | for token in tokenSet: 89 | self.incrementCount( occurrence, token ) 90 | for aToken in tokenSet: 91 | for bToken in tokenSet: 92 | if aToken < bToken: 93 | self.incrementCount( cooccurrence, (aToken, bToken) ) 94 | 95 | self.document_count = document_count 96 | self.similarity.document_occurrence = occurrence 97 | self.similarity.document_cooccurrence = cooccurrence 98 | 99 | def computeSlidingWindowCooccurrence( self, sliding_window_size ): 100 | window_count = 0 101 | occurrence = {} 102 | cooccurrence = {} 103 | for docID, docTokens in self.tokens.data.iteritems(): 104 | allWindowTokens = 
self.getSlidingWindowTokens( docTokens, sliding_window_size ) 105 | self.logger.debug( ' %s (%d tokens, %d windows)', docID, len(docTokens), len(allWindowTokens) ) 106 | for windowTokens in allWindowTokens: 107 | tokenSet = frozenset(windowTokens) 108 | window_count += 1 109 | for token in tokenSet: 110 | self.incrementCount( occurrence, token ) 111 | for aToken in tokenSet: 112 | for bToken in tokenSet: 113 | if aToken < bToken: 114 | self.incrementCount( cooccurrence, (aToken, bToken) ) 115 | 116 | self.window_count = window_count 117 | self.similarity.window_occurrence = occurrence 118 | self.similarity.window_cooccurrence = cooccurrence 119 | 120 | def getSlidingWindowTokens( self, tokens, sliding_window_size ): 121 | allWindows = [] 122 | aIndex = 0 - sliding_window_size 123 | bIndex = len(tokens) + sliding_window_size 124 | for index in range( aIndex, bIndex ): 125 | a = max( 0 , index - sliding_window_size ) 126 | b = min( len(tokens) , index + sliding_window_size ) 127 | allWindows.append( tokens[a:b] ) 128 | return allWindows 129 | 130 | def computeTokenCounts( self ): 131 | token_count = sum( len(docTokens) for docTokens in self.tokens.data.itervalues() ) 132 | 133 | unigram_counts = {} 134 | for docTokens in self.tokens.data.itervalues(): 135 | for token in docTokens: 136 | self.incrementCount( unigram_counts, token ) 137 | 138 | bigram_counts = {} 139 | for docTokens in self.tokens.data.itervalues(): 140 | prevToken = None 141 | for currToken in docTokens: 142 | if prevToken is not None: 143 | self.incrementCount( bigram_counts, (prevToken, currToken) ) 144 | prevToken = currToken 145 | 146 | self.token_count = token_count 147 | self.similarity.unigram_counts = unigram_counts 148 | self.similarity.bigram_counts = bigram_counts 149 | 150 | def getBinomial( self, B_given_A, any_given_A, B_given_notA, any_given_notA ): 151 | assert B_given_A >= 0 152 | assert B_given_notA >= 0 153 | assert any_given_A >= B_given_A 154 | assert any_given_notA >= 
B_given_notA 155 | 156 | a = float( B_given_A ) 157 | b = float( B_given_notA ) 158 | c = float( any_given_A ) 159 | d = float( any_given_notA ) 160 | E1 = c * ( a + b ) / ( c + d ) 161 | E2 = d * ( a + b ) / ( c + d ) 162 | 163 | g2a = 0 164 | g2b = 0 165 | if a > 0: 166 | g2a = a * math.log( a / E1 ) 167 | if b > 0: 168 | g2b = b * math.log( b / E2 ) 169 | return 2 * ( g2a + g2b ) 170 | 171 | def getG2( self, freq_all, freq_ab, freq_a, freq_b ): 172 | assert freq_all >= freq_a 173 | assert freq_all >= freq_b 174 | assert freq_a >= freq_ab 175 | assert freq_b >= freq_ab 176 | assert freq_all >= 0 177 | assert freq_ab >= 0 178 | assert freq_a >= 0 179 | assert freq_b >= 0 180 | 181 | B_given_A = freq_ab 182 | B_given_notA = freq_b - freq_ab 183 | any_given_A = freq_a 184 | any_given_notA = freq_all - freq_a 185 | 186 | return self.getBinomial( B_given_A, any_given_A, B_given_notA, any_given_notA ) 187 | 188 | def getG2Stats( self, max_count, occurrence, cooccurrence ): 189 | g2_stats = {} 190 | freq_all = max_count 191 | for ( firstToken, secondToken ) in cooccurrence: 192 | freq_a = occurrence[ firstToken ] 193 | freq_b = occurrence[ secondToken ] 194 | freq_ab = cooccurrence[ (firstToken, secondToken) ] 195 | 196 | scale = ComputeSimilarity.MAX_FREQ / freq_all 197 | rescaled_freq_all = freq_all * scale 198 | rescaled_freq_a = freq_a * scale 199 | rescaled_freq_b = freq_b * scale 200 | rescaled_freq_ab = freq_ab * scale 201 | if rescaled_freq_a > 1.0 and rescaled_freq_b > 1.0: 202 | g2_stats[ (firstToken, secondToken) ] = self.getG2( freq_all, freq_ab, freq_a, freq_b ) 203 | return g2_stats 204 | 205 | def combineSimilarityMatrices( self ): 206 | self.logger.info( 'Combining similarity matrices...' 
) 207 | self.similarity.combined_g2 = {} 208 | 209 | keys_queued = [] 210 | for key in self.similarity.document_g2: 211 | ( firstToken, secondToken ) = key 212 | otherKey = ( secondToken, firstToken ) 213 | keys_queued.append( key ) 214 | keys_queued.append( otherKey ) 215 | for key in self.similarity.window_g2: 216 | ( firstToken, secondToken ) = key 217 | otherKey = ( secondToken, firstToken ) 218 | keys_queued.append( key ) 219 | keys_queued.append( otherKey ) 220 | for key in self.similarity.collocation_g2: 221 | keys_queued.append( key ) 222 | 223 | keys_processed = {} 224 | for key in keys_queued: 225 | keys_processed[ key ] = False 226 | 227 | for key in keys_queued: 228 | if not keys_processed[ key ]: 229 | keys_processed[ key ] = True 230 | 231 | ( firstToken, secondToken ) = key 232 | if firstToken < secondToken: 233 | orderedKey = key 234 | else: 235 | orderedKey = ( secondToken, firstToken ) 236 | score = 0.0 237 | if orderedKey in self.similarity.document_g2: 238 | score += self.similarity.document_g2[ orderedKey ] 239 | if orderedKey in self.similarity.window_g2: 240 | score += self.similarity.window_g2[ orderedKey ] 241 | if key in self.similarity.collocation_g2: 242 | score += self.similarity.collocation_g2[ key ] 243 | if score > 0.0: 244 | self.similarity.combined_g2[ key ] = score 245 | 246 | #-------------------------------------------------------------------------------# 247 | 248 | def main(): 249 | parser = argparse.ArgumentParser( description = 'Compute term similarity for TermiteVis.' ) 250 | parser.add_argument( 'config_file' , type = str, default = None , help = 'Path of Termite configuration file.' ) 251 | parser.add_argument( '--data-path' , type = str, dest = 'data_path' , help = 'Override data path.' ) 252 | parser.add_argument( '--sliding-window-size', type = int, dest = 'sliding_window_size', help = 'Override sliding window size.' 
) 253 | parser.add_argument( '--logging' , type = int, dest = 'logging' , help = 'Override logging level.' ) 254 | args = parser.parse_args() 255 | 256 | data_path = None 257 | sliding_window_size = None 258 | logging_level = 20 259 | 260 | # Read in default values from the configuration file 261 | if args.config_file is not None: 262 | config = ConfigParser.RawConfigParser() 263 | config.read( args.config_file ) 264 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'path' ): 265 | data_path = config.get( 'Termite', 'path' ) 266 | if config.has_section( 'Termite' ) and config.has_option( 'Termite', 'sliding_window_size' ): 267 | sliding_window_size = config.get( 'Termite', 'sliding_window_size' ) 268 | if config.has_section( 'Misc' ) and config.has_option( 'Misc', 'logging' ): 269 | logging_level = config.getint( 'Misc', 'logging' ) 270 | 271 | # Read in user-specifiec values from the program arguments 272 | if args.data_path is not None: 273 | data_path = args.data_path 274 | if args.sliding_window_size is not None: 275 | sliding_window_size = args.sliding_window_size 276 | if args.logging is not None: 277 | logging_level = args.logging 278 | 279 | ComputeSimilarity( logging_level ).execute( data_path, sliding_window_size ) 280 | 281 | if __name__ == '__main__': 282 | main() 283 | -------------------------------------------------------------------------------- /pipeline/compute_seriation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import ConfigParser 7 | import logging 8 | 9 | import time 10 | from operator import itemgetter 11 | from api_utils import SaliencyAPI, SimilarityAPI, SeriationAPI 12 | 13 | class ComputeSeriation( object ): 14 | """Seriation algorithm. 15 | 16 | Re-order words to improve promote the legibility of multi-word 17 | phrases and reveal the clustering of related terms. 
class ComputeSeriation( object ):
    """Seriation algorithm.

    Re-orders words to promote the legibility of multi-word phrases and to
    reveal the clustering of related terms.

    As output, the algorithm produces a list of seriated terms and its 'ranking'
    (i.e., the iteration in which a term was seriated).

    Changes in this revision:
      * getEnergyChange(): fixed a key mismatch — the post-bond membership test
        checked (next_term, candidate) but then indexed (candidate, next_term).
        combined_g2 is not guaranteed symmetric (collocation scores are stored
        in one direction only), so the old code could raise KeyError or silently
        drop a real bond. Both test and lookup now use (candidate, next_term).
      * Debug `print` statements replaced by self.logger.debug(), consistent
        with the logging already used elsewhere in this class.
    """

    # Default number of terms to seriate when the caller supplies none.
    DEFAULT_NUM_SERIATED_TERMS = 100

    def __init__( self, logging_level ):
        """Set up a dedicated logger that writes to stderr at `logging_level`."""
        self.logger = logging.getLogger( 'ComputeSeriation' )
        self.logger.setLevel( logging_level )
        handler = logging.StreamHandler( sys.stderr )
        handler.setLevel( logging_level )
        self.logger.addHandler( handler )

    def execute( self, data_path, numSeriatedTerms = None ):
        """Run the full seriation pass.

        Reads saliency and similarity data from `data_path`, seriates up to
        `numSeriatedTerms` terms, and writes the resulting ordering back to
        disk via SeriationAPI.

        Raises AssertionError if data_path is None.
        """
        assert data_path is not None
        if numSeriatedTerms is None:
            numSeriatedTerms = ComputeSeriation.DEFAULT_NUM_SERIATED_TERMS

        self.logger.info( '--------------------------------------------------------------------------------' )
        self.logger.info( 'Computing term seriation...' )
        self.logger.info( ' data_path = %s', data_path )
        self.logger.info( ' number_of_seriated_terms = %d', numSeriatedTerms )

        self.logger.info( 'Connecting to data...' )
        self.saliency = SaliencyAPI( data_path )
        self.similarity = SimilarityAPI( data_path )
        self.seriation = SeriationAPI( data_path )

        self.logger.info( 'Reading data from disk...' )
        self.saliency.read()
        self.similarity.read()

        self.logger.info( 'Reshaping saliency data...' )
        self.reshape()

        self.logger.info( 'Computing seriation...' )
        self.compute( numSeriatedTerms )

        self.logger.info( 'Writing data to disk...' )
        self.seriation.write()

        self.logger.info( '--------------------------------------------------------------------------------' )

    def reshape( self ):
        """Index the saliency records by term for O(1) lookups during seriation."""
        # Candidate pool size: only terms ranked within the current ordering
        # length plus this many are considered in each iteration.
        self.candidateSize = 100
        self.orderedTermList = []
        self.termSaliency = {}
        self.termFreqs = {}
        self.termDistinct = {}
        self.termRank = {}
        self.termVisibility = {}
        for element in self.saliency.term_info:
            term = element['term']
            self.orderedTermList.append( term )
            self.termSaliency[term] = element['saliency']
            self.termFreqs[term] = element['frequency']
            self.termDistinct[term] = element['distinctiveness']
            self.termRank[term] = element['rank']
            self.termVisibility[term] = element['visibility']

    def compute( self, numSeriatedTerms ):
        """Greedily seriate `numSeriatedTerms` terms.

        Each iteration picks the (term, position) pair with the largest energy
        gain and inserts it into the growing ordering. Results are stored in
        self.seriation.term_ordering (the ordering) and
        self.seriation.term_iter_index (the iteration each term was added).
        """
        # Elicit from user (1) the number of terms to output and (2) a list of
        # terms that should be included in the output... set in init (i.e. read
        # from config file).
        start_time = time.time()
        candidateTerms = self.orderedTermList
        self.seriation.term_ordering = []
        self.seriation.term_iter_index = []
        # buffers[i] caches the bond broken when inserting at position i.
        self.buffers = [0,0]

        preBest = []
        postBest = []

        for iteration in range(numSeriatedTerms):
            self.logger.debug( "Iteration no. %s", iteration )

            addedTerm = 0
            if len(self.seriation.term_iter_index) > 0:
                addedTerm = self.seriation.term_iter_index[-1]
            # Best-energy bookkeeping is initialized once the first term exists.
            if iteration == 1:
                (preBest, postBest) = self.initBestEnergies(addedTerm, candidateTerms)
            (preBest, postBest, self.bestEnergies) = self.getBestEnergies(preBest, postBest, addedTerm)
            (candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers) = self.iterate_eff(candidateTerms, self.seriation.term_ordering, self.seriation.term_iter_index, self.buffers, self.bestEnergies, iteration)

        self.logger.debug( "---------------" )
        seriation_time = time.time() - start_time
        self.logger.debug("seriation time: " + str(seriation_time))

    #-------------------------------------------------------------------------------#
    # Helper Functions

    def initBestEnergies(self, firstTerm, candidateTerms):
        """For every candidate, record its bond with `firstTerm` in both directions.

        Returns (preBest, postBest): lists of (candidate, score) where preBest
        scores the bond (candidate, firstTerm) and postBest (firstTerm, candidate).
        Missing pairs score 0.
        """
        preBest = []
        postBest = []
        for candidate in candidateTerms:
            pre_score = 0
            post_score = 0

            # preBest: candidate placed immediately before firstTerm
            if (candidate, firstTerm) in self.similarity.combined_g2:
                pre_score = self.similarity.combined_g2[(candidate, firstTerm)]
            # postBest: candidate placed immediately after firstTerm
            if (firstTerm, candidate) in self.similarity.combined_g2:
                post_score = self.similarity.combined_g2[(firstTerm, candidate)]

            preBest.append((candidate, pre_score))
            postBest.append((candidate, post_score))

        return (preBest, postBest)

    def getBestEnergies(self, preBest, postBest, addedTerm):
        """Refresh per-candidate best bonds after `addedTerm` joined the ordering.

        Removes addedTerm from the candidate bookkeeping and returns
        (preBest, postBest, bestEnergies) where bestEnergies is the summed
        pre+post best score per candidate, sorted descending.
        """
        if addedTerm == 0:
            # Nothing has been seriated yet; no energies to update.
            return (preBest, postBest, [])

        term_order = [x[0] for x in preBest]
        # compare candidate terms' bests against the newly added term
        remove_index = -1
        for existingIndex in range(len(preBest)):
            term = term_order[existingIndex]
            if term == addedTerm:
                remove_index = existingIndex

            # check pre energies
            if (term, addedTerm) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(term, addedTerm)] > preBest[existingIndex][1]:
                    preBest[existingIndex] = (term, self.similarity.combined_g2[(term, addedTerm)])
            # check post energies
            if (addedTerm, term) in self.similarity.combined_g2:
                if self.similarity.combined_g2[(addedTerm, term)] > postBest[existingIndex][1]:
                    postBest[existingIndex] = (term, self.similarity.combined_g2[(addedTerm, term)])

        # remove the added term's preBest and postBest scores
        if remove_index != -1:
            del preBest[remove_index]
            del postBest[remove_index]

        # create and sort the bestEnergies list
        energyMax = [sum(pair) for pair in zip([x[1] for x in preBest], [y[1] for y in postBest])]
        bestEnergies = zip([x[0] for x in preBest], energyMax)

        return (preBest, postBest, sorted(bestEnergies, key=itemgetter(1), reverse=True))

    def iterate_eff( self, candidateTerms, term_ordering, term_iter_index, buffers, bestEnergies, iteration_no ):
        """Perform one greedy insertion, scanning candidates in best-energy order.

        Uses the sorted bestEnergies bound to terminate the candidate scan early
        once no remaining candidate can beat the best energy change found so far.
        Returns the updated (candidateTerms, term_ordering, term_iter_index, buffers).
        """
        maxEnergyChange = 0.0
        maxTerm = ""
        maxPosition = 0

        if len(bestEnergies) != 0:
            bestEnergy_terms = [x[0] for x in bestEnergies]
        else:
            bestEnergy_terms = candidateTerms

        breakout_counter = 0
        for candidate_index in range(len(bestEnergy_terms)):
            breakout_counter += 1
            candidate = bestEnergy_terms[candidate_index]
            # Try the candidate at every insertion position (0..len inclusive).
            for position in range(len(term_ordering)+1):
                current_buffer = buffers[position]
                candidateRank = self.termRank[candidate]
                # Skip terms ranked far below the current candidate window.
                if candidateRank <= (len(term_ordering) + self.candidateSize):
                    current_energy_change = self.getEnergyChange(candidate, position, term_ordering, current_buffer, iteration_no)
                    if current_energy_change > maxEnergyChange:
                        maxEnergyChange = current_energy_change
                        maxTerm = candidate
                        maxPosition = position
            # check for early termination: remaining candidates are sorted by
            # best achievable energy, so none can beat the current max.
            if candidate_index < len(bestEnergy_terms)-1 and len(bestEnergies) != 0:
                if maxEnergyChange >= (2*(bestEnergies[candidate_index][1] + current_buffer)):
                    self.logger.debug( "#-------- breaking out early ---------#" )
                    self.logger.debug( "candidates checked: %s", breakout_counter )
                    break

        self.logger.debug( "change in energy: %s", maxEnergyChange )
        self.logger.debug( "maxTerm: %s", maxTerm )
        self.logger.debug( "maxPosition: %s", maxPosition )

        candidateTerms.remove(maxTerm)

        # update buffers: record the bond values broken/created by the insertion
        buf_score = 0
        if len(term_ordering) == 0:
            buffers = buffers
        elif maxPosition >= len(term_ordering):
            # appended at the end
            if (term_ordering[-1], maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(term_ordering[-1], maxTerm)]
            buffers.insert(len(buffers)-1, buf_score)
        elif maxPosition == 0:
            # prepended at the front
            if (maxTerm, term_ordering[0]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[0])]
            buffers.insert(1, buf_score)
        else:
            # inserted in the middle: replace the split buffer, add a new one
            if (term_ordering[maxPosition-1], maxTerm) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(term_ordering[maxPosition-1], maxTerm)]
            buffers[maxPosition] = buf_score

            buf_score = 0
            if (maxTerm, term_ordering[maxPosition]) in self.similarity.combined_g2:
                buf_score = self.similarity.combined_g2[(maxTerm, term_ordering[maxPosition])]
            buffers.insert(maxPosition+1, buf_score)

        # update term ordering and ranking
        if maxPosition >= len(term_ordering):
            term_ordering.append(maxTerm)
        else:
            term_ordering.insert(maxPosition, maxTerm)
        term_iter_index.append(maxTerm)

        return (candidateTerms, term_ordering, term_iter_index, buffers)

    def getEnergyChange(self, candidateTerm, position, term_list, currentBuffer, iteration_no):
        """Score inserting `candidateTerm` at `position` in `term_list`.

        Returns 2 * (bond-to-predecessor + bond-to-successor - broken-buffer).
        On the very first iteration, falls back to a frequency*saliency prior
        since no ordering exists yet.
        """
        prevBond = 0.0
        postBond = 0.0

        # first iteration only
        if iteration_no == 0:
            current_freq = 0.0
            current_saliency = 0.0

            if candidateTerm in self.termFreqs:
                current_freq = self.termFreqs[candidateTerm]
            if candidateTerm in self.termSaliency:
                current_saliency = self.termSaliency[candidateTerm]
            return 0.001 * current_freq * current_saliency

        # get previous term
        if position > 0:
            prev_term = term_list[position-1]
            if (prev_term, candidateTerm) in self.similarity.combined_g2:
                prevBond = self.similarity.combined_g2[(prev_term, candidateTerm)]

        # get next term
        if position < len(term_list):
            next_term = term_list[position]
            # BUG FIX: test and index the SAME key. The original checked
            # (next_term, candidateTerm) but indexed (candidateTerm, next_term);
            # combined_g2 is not guaranteed symmetric, so that could KeyError
            # or miss a real bond.
            if (candidateTerm, next_term) in self.similarity.combined_g2:
                postBond = self.similarity.combined_g2[(candidateTerm, next_term)]

        return 2*(prevBond + postBond - currentBuffer)
def main():
    """Command-line entry point for the seriation pipeline stage."""
    parser = argparse.ArgumentParser( description = 'Compute term seriation for TermiteVis.' )
    parser.add_argument( 'config_file', type = str, default = None, help = 'Path of Termite configuration file.' )
    parser.add_argument( '--data-path', type = str, dest = 'data_path', help = 'Override data path.' )
    parser.add_argument( '--number-of-seriated-terms', type = int, dest = 'number_of_seriated_terms', help = 'Override the number of terms to seriate.' )
    parser.add_argument( '--logging', type = int, dest = 'logging', help = 'Override logging level.' )
    args = parser.parse_args()

    # Hard-coded defaults (logging level 20 == INFO).
    data_path = None
    number_of_seriated_terms = None
    logging_level = 20

    # Layer 1: values from the configuration file, when one is given.
    if args.config_file is not None:
        config = ConfigParser.RawConfigParser()
        config.read( args.config_file )
        has = lambda section, option: config.has_section( section ) and config.has_option( section, option )
        if has( 'Termite', 'path' ):
            data_path = config.get( 'Termite', 'path' )
        if has( 'Termite', 'number_of_seriated_terms' ):
            number_of_seriated_terms = config.getint( 'Termite', 'number_of_seriated_terms' )
        if has( 'Misc', 'logging' ):
            logging_level = config.getint( 'Misc', 'logging' )

    # Layer 2: explicit command-line flags take precedence over the file.
    if args.data_path is not None:
        data_path = args.data_path
    if args.number_of_seriated_terms is not None:
        number_of_seriated_terms = args.number_of_seriated_terms
    if args.logging is not None:
        logging_level = args.logging

    ComputeSeriation( logging_level ).execute( data_path, number_of_seriated_terms )

if __name__ == '__main__':
    main()
190 |
191 | 248 |
249 | 250 | 251 | 252 | 253 | 254 |
255 |
256 |

This visualization shows the topical distribution of words in a corpus.

257 |

The area of a circle is proportional to a word's frequency in a topic.

258 |
259 |
260 | 263 |
264 |
/*
	TermFrequencyView.js

	This view is responsible for generating the term frequency view.

	Details:
	--------
	Receives list of terms and associated frequencies from TermFrequencyModel.

	Additionally, uses parameters defined in ViewParameters.js.
*/

// Styling defaults for term-label text.
var TERMFREQ_TEXT_DEFAULT = {
	FILL_COLOR: "#808080",
	STROKE_OPACITY: 0,
	FILL_OPACITY: 1
};
// Styling defaults for the horizontal term-frequency bars.
var TERMFREQ_BAR_DEFAULT = {
	STROKE_COLOR: "#808080",
	STROKE_WIDTH: 5,
	STROKE_OPACITY: 0.4
};
// Vertical packing (row height in px) for histogram rows.
// NOTE(review): packing() always returns 12 and ignores the DENSE/LOOSE
// constants — presumably adaptive packing was disabled; confirm before reuse.
var HISTOGRAM_ENCODING_PARAMETERS = {
	NUM_TOPICS : 0,
	setNumTopics : function(numTopics) { this.NUM_TOPICS = numTopics; },
	DENSE_NUM_TOPICS: 50,
	LOOSE_NUM_TOPICS: 20,
	DENSE_PACKING: 12,
	LOOSE_PACKING: 18,
	packing : function()
	{
		return 12;
	}
};
// Padding and size constants for the histogram container.
// NOTE(review): identifier is misspelled ("HISTORGRAM") but is referenced
// under exactly this name throughout the file; do not rename in isolation.
var HISTORGRAM_CONTAINER_PADDING = {
	left_separation: 10,
	top: 60,
	left: 130,
	right: 20,
	bottom: 60,
	width: 150,
	fullWidth : function() { return this.left + this.right + this.width },
	fullHeight : function( numTopics, numTerms ) { return this.top + this.bottom + HISTOGRAM_ENCODING_PARAMETERS.packing() * numTerms }
};

var TermFrequencyView = Backbone.View.extend({
	initialize : function() {
		// Model supplying termIndex / frequency data (set via initModel).
		this.parentModel = null;

		// encoders
		this.ys = null;          // linear scale: term row index -> y position
		this.line_length = null; // linear scale: frequency -> bar length (px)

		// svg layers
		this.svg = null;
		this.svgTermLabelLayer = null;
		this.svgTermBarLayer = null;
		this.overlayLayer = null;
		this.overlayLineLayer = null;
		this.svgTopicalBarLayer = null;
		this.svgTermHighlightLayer = null;

		// interaction variables
		this.highlightedTerm = null;
		this.highlightedTopic = null;

		this.selectedTopics = [];
		this.colorClassPrefix = "HIST";
		this.normalColor = "normal";

		// bar highlighting
		this.totalOffsets = [];   // per-term sum of stacked topical bar heights
		this.prevHighlightColor = this.normalColor;
		this.useOffset = false;   // true while highlighting a non-selected topic
	}
});

/**
 * Initialize Term Frequency View's parent model
 *
 * NOTE(review): the `state` argument is accepted but unused here.
 *
 * @private
 */
TermFrequencyView.prototype.initModel = function( model, state ){
	this.parentModel = model;
};

/**
 * Initialize/render histogram view's elements for the first time
 *
 * @private
 */
TermFrequencyView.prototype.load = function(){
	this.renderInit();
	this.renderUpdate();
};

/**
 * Updates the view (public encapsulation used in index.html)
 */
TermFrequencyView.prototype.update = function() {
	this.renderUpdate();
};

/**
 * Transforms the topical frequency matrix into a form appropriate for d3 stacked bars.
 * Also refreshes this.totalOffsets with the per-term stacked totals,
 * which the topic-highlight code uses to offset its hover bar.
 *
 * @private
 */
TermFrequencyView.prototype.prepareStackedBars = function() {
	var matrix = this.parentModel.get("topicalFreqMatrix");
	if( matrix.length === 0)
		return [];

	// Re-shape each row into the {x, y} records d3.layout.stack expects.
	var remapped = matrix.map( function(layer){
		return layer.map( function(d, j) { return { x : j, y : d }; } );
	});

	var stackedTransformer = d3.layout.stack();
	var stackedData = stackedTransformer(remapped);

	// update totalOffsets (for highlighting use)
	this.totalOffsets = [];
	if(stackedData.length > 0){
		for( var j = 0; j < stackedData[0].length; j++){
			var sum = 0.0;
			for( var i = 0; i < stackedData.length; i++){
				sum += stackedData[i][j].y;
			}
			this.totalOffsets[j] = sum;
		}
	}
	return stackedData;
};
/**
 * Initialize histogram view's elements
 *	-svg layers
 *	-encoders
 *	-etc.
 *
 * @private
 */
TermFrequencyView.prototype.renderInit = function() {
	var termIndex = this.parentModel.get("termIndex");
	var termFreq = this.parentModel.get("totalTermFreqs");

	// Compute encoders
	this.ys = d3.scale.linear();

	// Bar length is scaled so the most frequent term spans the full width.
	var maxFreq = 0.0;
	for( var i = 0; i < termIndex.length; i++ ) {
		if(termFreq[termIndex[i]] > maxFreq)
			maxFreq = termFreq[termIndex[i]];
	}
	this.line_length = d3.scale.linear().domain([0, maxFreq]).range( [ 0, HISTORGRAM_CONTAINER_PADDING.width ] );

	// init svg layers (each translated into the padded drawing area)
	var container = d3.select(this.el);
	this.svg = container.append( "svg:svg" )
		.style( "cursor", "default" )
		.style( "width", HISTORGRAM_CONTAINER_PADDING.fullWidth() + "px" )
	this.svgTermLabelLayer = this.svg.append( "svg:g" )
		.attr( "class", "termLabelLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTermBarLayer = this.svg.append( "svg:g" )
		.attr( "class", "termBarLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.overlayLayer = this.svg.append( "svg:g" )
		.attr( "class", "overlayLayer")
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTopicalBarLayer = this.svg.append( "svg:g" )
		.attr( "class", "topicalBarLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
	this.svgTermHighlightLayer = this.svg.append( "svg:g" )
		.attr( "class", "termHighlightLayer" )
		.attr( "transform", "translate(" + HISTORGRAM_CONTAINER_PADDING.left + "," + HISTORGRAM_CONTAINER_PADDING.top + ")" );
};

/**
 * Update histogram view's elements based on parent model's termIndex and term frequencies
 *
 * Each section below follows the d3 v3 data-join pattern:
 * exit().remove(), then enter().append() with static attributes,
 * then a re-selection applying the data-dependent attributes.
 *
 * @private
 */
TermFrequencyView.prototype.renderUpdate = function() {
	var termIndex = this.parentModel.get("termIndex");
	var termFreq = this.parentModel.get("totalTermFreqs");

	this.svg
		.style( "height", HISTORGRAM_CONTAINER_PADDING.fullHeight( HISTOGRAM_ENCODING_PARAMETERS.NUM_TOPICS, termIndex.length ) + "px" )

	this.ys.domain( [ 0, termIndex.length ] )
		.range( [ 0, termIndex.length * HISTOGRAM_ENCODING_PARAMETERS.packing()] );

	// term labels (left-hand column of words)
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex ).exit().remove();
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex ).enter().append( "svg:text" )
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this))
		.attr( "x", -HISTORGRAM_CONTAINER_PADDING.left_separation )
		.attr( "y", 3 )
	this.svgTermLabelLayer.selectAll( "text" ).data( termIndex )
		.attr( "class", function(d) { return ["termLabel", "HISTnormal", getTermClassTag(d)].join(" ") })
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this))
		.text( function(d) { return d } );

	// total-frequency bars (one grey line per term)
	this.svgTermBarLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTermBarLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
	this.svgTermBarLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["termFreqBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )
		.attr( "x2", function(d) { return this.line_length(termFreq[d]) }.bind(this) )

	// stacked per-topic overlay segments
	var stackedData = this.prepareStackedBars();
	var colors = this.parentModel.get("colorList");

	this.overlayLayer.selectAll( "g" ).data(stackedData).exit().remove();
	this.overlayLayer.selectAll( "g" ).data(stackedData).enter().append("svg:g")
	this.gLayer = this.overlayLayer.selectAll( "g" ).data(stackedData)
		.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) )

	// one line segment per (topic layer, term): offset by the stacked y0
	this.gLayer.selectAll("line").data(function(d) {return d;}).exit().remove();
	this.gLayer.selectAll("line").data(function(d, i) { return d;}).enter().append("svg:line")
		.attr("y1", 0)
		.attr("y2", 0)
	this.gLayer.selectAll("line").data(function(d, i) { return d;})
		.attr("class", function(d,i){return ["line", getTermClassTag(termIndex[i])].join(" ") })
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr("x1", function(d){ return this.line_length(d.y0)}.bind(this) )
		.attr("x2", function(d){return this.line_length(d.y0) + this.line_length(d.y)}.bind(this) )

	// topical hover bars (start collapsed; highlight() stretches them)
	this.svgTopicalBarLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTopicalBarLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
		.attr( "x2", this.line_length(0) )
	this.svgTopicalBarLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["topicalFreqBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )

	// per-term highlight bars (full-length, toggled via CSS class)
	this.svgTermHighlightLayer.selectAll("line").data(termIndex).exit().remove();
	this.svgTermHighlightLayer.selectAll("line").data(termIndex).enter().append("svg:line")
		.on( "mouseout", function() { this.trigger( "mouseout:term", "" ) }.bind(this) )
		.attr( "y1", 0 )
		.attr( "y2", 0 )
		.attr( "x1", this.line_length(0) )
		.style( "fill" , "none")
	this.svgTermHighlightLayer.selectAll("line").data(termIndex)
		.attr( "transform", function(d,i) { return "translate(0," + this.ys(i+0.5) + ")" }.bind(this) )
		.attr( "class", function(d,i) { return ["termHighlightBar", getTermClassTag(d)].join(" ") })
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this) )
		.attr( "x2", function(d) { return this.line_length(termFreq[d]) }.bind(this) )
};

// interactions
/**
 * Calls appropriate functions to deal with topic highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { int } value is the target topic
 * @return { void }
 */
TermFrequencyView.prototype.onHighlightTopicChanged = function( model, value ) {
	var topic = value;
	if(topic === null)
		this.unhighlight( false, true );
	else
		this.highlight( null, topic );
};
/**
 * Calls appropriate functions to deal with term highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { string } value is the target term
 * @return { void }
 */
TermFrequencyView.prototype.onHighlightTermChanged = function( model, value ) {
	var term = value;
	if(term === "")
		this.unhighlight( true, false );
	else
		this.highlight( term, null );
};
/**
 * Unhighlights elements based on term and/or topic
 *
 * `term` and `topic` are booleans here (which target kind to clear);
 * the actual term/topic cleared is the one remembered in
 * this.highlightedTerm / this.highlightedTopic.
 *
 * @private
 */
TermFrequencyView.prototype.unhighlight = function( term, topic ) {
	// unhighlight term
	if( term ){
		term = this.highlightedTerm;
		this.highlightedTerm = null;
		this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, false)

		this.svgTermHighlightLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, false)
	}

	// unhighlight topic
	if( topic ){
		topic = this.highlightedTopic;
		var termIndex = this.parentModel.get("termIndex");
		var topicals = this.parentModel.getTopicalsForTopic(topic);
		this.highlightedTopic = null;

		// clear labels (and hover bars) for every term visible in this topic
		for( var i = 0; i < termIndex.length; i++){
			var term = termIndex[i];
			if( topicals[i]> THRESHHOLD ){
				this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
					.classed(this.colorClassPrefix + HIGHLIGHT, false)

				if( this.useOffset ){
					// make highlight bars invisible by collapsing them to zero length
					this.svgTopicalBarLayer.selectAll("." + getTermClassTag(term))
						.classed(this.colorClassPrefix + HIGHLIGHT, false)
						.attr( "x2", this.line_length(0))
						.attr( "x1", this.line_length(0));
				}
			}
		}

		// reset layers to their per-topic selection colors
		var colors = this.parentModel.get("colorList");
		this.gLayer = this.overlayLayer.selectAll( "g" )
			.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) );

		// reset variables
		this.prevHighlightColor = this.normalColor;
		this.useOffset = false;
	}
};
/**
 * Highlights elements based on term and/or topic
 *
 * Exactly one of `term` / `topic` is expected to be non-null.
 *
 * @private
 */
TermFrequencyView.prototype.highlight = function( term, topic ) {
	// highlight term
	if( term !== null ){
		this.highlightedTerm = term;
		this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, true)

		this.svgTermHighlightLayer.selectAll("." + getTermClassTag(term))
			.classed(this.colorClassPrefix + HIGHLIGHT, true)
	}
	// highlight topic
	else if( topic !== null ){
		var termIndex = this.parentModel.get("termIndex");
		var topicals = this.parentModel.getTopicalsForTopic(topic);
		this.highlightedTopic = topic;

		// highlight labels
		var stackedData = this.prepareStackedBars();
		var selectedTopics = this.parentModel.get("selectedTopics");
		var colors = this.parentModel.get("colorList").slice();

		// decide how to "highlight" bars
		if( selectedTopics[topic] !== null){
			// previously selected topic: recolor its existing overlay layer
			this.prevHighlightColor = selectedTopics[topic];
			colors[colors.indexOf(selectedTopics[topic])] = HIGHLIGHT;
			this.gLayer = this.overlayLayer.selectAll( "g" )
				.attr("class", function(d,i) { return ["overlayGroup", this.colorClassPrefix + colors[i]].join(" ") }.bind(this) );
		} else {
			// unselected topic: draw an extra bar offset past the stacked total
			this.useOffset = true;
		}
		for( var i = 0; i < termIndex.length; i++){
			var term = termIndex[i];
			if( topicals[i]> THRESHHOLD ){
				this.svgTermLabelLayer.selectAll("." + getTermClassTag(term))
					.classed(this.colorClassPrefix + HIGHLIGHT, true)

				// highlight bars
				if( this.useOffset ) {
					// use the offset computed by prepareStackedBars()
					var offset = 0;
					if( this.totalOffsets.length > 0)
						offset = this.totalOffsets[i];

					this.svgTopicalBarLayer.selectAll("." + getTermClassTag(term))
						.classed(this.colorClassPrefix + HIGHLIGHT, true)
						.attr( "x2", this.line_length(offset + topicals[i]))
						.attr( "x1", this.line_length(offset));
				}
			}
		}
	}
};
/*
	TermTopicMatrixView.js

	This view is responsible for generating the term:topic similarity matrix.

	Details:
	--------
	Pulls list of ordered terms, topics, and similarity values from
	FilteredTermTopicProbabilityModel.

	Additionally, uses parameters defined in ViewParameters.js.
*/
// Padding and overall-size helpers for the matrix container.
var MATRIX_CONTAINER_PADDING = {
	left_separation: 8,
	top_separation: 5,
	left: 110,
	right: 20,
	top: 60,
	bottom: 60,
	fullWidth : function( numTopics ) { return this.left + this.right + MATRIX_ENCODING_PARAMETERS.packing() * numTopics },
	fullHeight : function( numTopics, numTerms ) { return this.top + this.bottom + MATRIX_ENCODING_PARAMETERS.packing() * numTerms }
};

// Visual-encoding parameters for the matrix of circles.
// NOTE(review): as in TermFrequencyView, packing() always returns 12 and
// ignores the DENSE/LOOSE constants.
var MATRIX_ENCODING_PARAMETERS = {
	NUM_TOPICS : 0,
	NUM_TERMS : 0,
	MATRIX : null,
	setNumTopics : function(numTopics) { this.NUM_TOPICS = numTopics; },
	setNumTerms : function(numTerms) { this.NUM_TERMS = numTerms; },
	setMatrix : function(matrix) { this.MATRIX = matrix; },
	DENSE_NUM_TOPICS: 50,
	LOOSE_NUM_TOPICS: 20,
	DENSE_PACKING: 12,
	LOOSE_PACKING: 18,
	packing : function()
	{
		return 12;
	},
	TARGET_PIXEL_DENSITY : 0.20,
	// Compute a radius scale factor so the circles' total area covers
	// TARGET_PIXEL_DENSITY of the full matrix area.
	radius : function( sparseMatrix, numTopics, numTerms ) // matrix view
	{
		var totalCirclePixels = 0.0;
		for ( var i in sparseMatrix )
			totalCirclePixels += sparseMatrix[i].value * Math.PI;
		// Add up # pixels: prob * Math.PI;
		var totalMatrixPixels = numTopics * numTerms * this.packing() * this.packing();

		var targetPixels = ( totalMatrixPixels * this.TARGET_PIXEL_DENSITY );
		var observedPixels = totalCirclePixels;
		var areaScale = targetPixels / observedPixels;
		var radiusScale = Math.sqrt( areaScale );

		// NOTE(review): this recomputation is unused — its result is never read.
		var totalCirclePixels = 0.0;
		for ( var i in sparseMatrix )
			totalCirclePixels += radiusScale * radiusScale * ( sparseMatrix[i].value ) * Math.PI;

		return radiusScale;
	}
};

var TermTopicMatrixView = Backbone.View.extend({
	initialize : function() {
		// Model supplying termIndex / topicIndex / sparseMatrix (set via initModel).
		this.parentModel = null;

		// encodings
		this.xs = null;  // linear scale: topic column -> x position
		this.ys = null;  // linear scale: term row -> y position
		this.rs = null;  // sqrt scale: probability -> circle radius

		// svg layers
		this.svg = null;
		this.xGridlineLayer = null;
		this.yGridlineLayer = null;
		this.matrixLayer = null;
		this.leftLabelLayer = null;
		this.topLabelLayer = null;

		// interaction variables
		this.selectedTopics = [];  // per-topic selection color class
		this.normalColor = "normal";

		this.highlightedTerm = null;
		this.highlightedTopic = null;

		// selection colors restored from saved state, if any
		this.receivedColors = null;

	}
});
/**
 * Initialize matrix view's parent model
 *
 * @private
 */
TermTopicMatrixView.prototype.initModel = function( model ) {
	this.parentModel = model;
};

/**
 * Receives information about selected topics that were restored from saved state
 *
 */
TermTopicMatrixView.prototype.receiveSelectedTopics = function( obj ){
	this.receivedColors = obj;
};

/**
 * Initialize/render matrix view's elements for the first time,
 * then re-apply any topic selections restored from saved state.
 *
 * @private
 */
TermTopicMatrixView.prototype.load = function(){
	this.renderInit();
	this.renderUpdate();

	for( var obj in this.selectedTopics ){
		this.selectTopic(obj, this.selectedTopics[obj]);
	}
};

/**
 * Initialize all topics' selection color to DEFAULT (used by renderInit only)
 *
 * @private
 */
TermTopicMatrixView.prototype.defaultSelection = function(){
	var topicIndex = this.parentModel.get("topicIndex");
	for( var i = 0; i < topicIndex.length; i++ ){
		this.selectedTopics[i] = this.normalColor;
		// Restored state, when present, overrides the default color.
		if( this.receivedColors !== null && this.receivedColors[i] !== undefined){
			this.selectedTopics[i] = this.receivedColors[i];
		}
	}
};

/**
 * Initialize matrix view's elements
 *	-svg layers
 *	-encoders
 *	-etc.
 *
 * @private
 */
TermTopicMatrixView.prototype.renderInit = function(){
	var matrix = this.parentModel.get("sparseMatrix");
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	this.defaultSelection();

	this.xs = d3.scale.linear();
	this.ys = d3.scale.linear();

	// sqrt scale so circle AREA (not radius) is proportional to value
	this.rs = d3.scale.sqrt()
		.domain( [ 0, 1 ] )
		.range( [ 0, MATRIX_ENCODING_PARAMETERS.radius( matrix, topicIndex.length, termIndex.length ) ] );

	var container = d3.select( this.el );
	this.svg = container.append( "svg:svg" )

	this.initMatrixView();
	this.initTopLabelView();
	this.initLeftLabelView();
};

/**
 * Update matrix view's elements based on parent model's termIndex, topicIndex, and matrix
 *
 * @private
 */
TermTopicMatrixView.prototype.renderUpdate = function(){
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	this.xs
		.domain( [ 0, topicIndex.length ] )
		.range( [ MATRIX_CONTAINER_PADDING.left, MATRIX_CONTAINER_PADDING.left + topicIndex.length * MATRIX_ENCODING_PARAMETERS.packing() ] );
	this.ys
		.domain( [ 0, termIndex.length ] )
		.range( [ MATRIX_CONTAINER_PADDING.top, MATRIX_CONTAINER_PADDING.top + termIndex.length * MATRIX_ENCODING_PARAMETERS.packing() ] );
	this.svg
		.style( "width", MATRIX_CONTAINER_PADDING.fullWidth( topicIndex.length ) + "px" )
		.style( "height", MATRIX_CONTAINER_PADDING.fullHeight( topicIndex.length, termIndex.length ) + "px" )

	this.updateMatrixView();
	this.updateTopLabelView();
	this.updateLeftLabelView();
};

/**
 * Init and update functions for each layer
 *
 * @private
 */
TermTopicMatrixView.prototype.initMatrixView = function(){
	this.xGridlineLayer = this.svg.append( "svg:g" ).attr( "class", "xGridlineLayer" );
	this.yGridlineLayer = this.svg.append( "svg:g" ).attr( "class", "yGridlineLayer" );
	this.matrixLayer = this.svg.append( "svg:g" ).attr( "class", "matrixLayer" );
};
TermTopicMatrixView.prototype.updateMatrixView = function(){
	var matrix = this.parentModel.get("sparseMatrix");
	var termIndex = this.parentModel.get("termIndex");
	var topicIndex = this.parentModel.get("topicIndex");

	// circles: one per non-zero term/topic entry (d3 data-join pattern)
	this.matrixLayer.selectAll( "circle" ).data( matrix ).exit().remove();
	this.matrixLayer.selectAll( "circle" ).data( matrix ).enter().append( "svg:circle" )
		.on( "mouseout", function() { this.trigger( "mouseout:term", ""); this.trigger( "mouseout:topic", null); }.bind(this) )
	this.matrixLayer.selectAll( "circle" ).data( matrix )
		.attr( "class", function(d) { return [ "matrixElement", this.selectedTopics[d.topicIndex], getTopicClassTag(d.topicName), getTermClassTag(d.term) ].join(" ") }.bind(this))
		.on( "mouseover", function(d) { this.trigger( "mouseover:term", d.term); this.trigger( "mouseover:topic", d.topicIndex); }.bind(this) )
		.on( "click", function (d) { this.trigger( "click:topic", d.topicIndex ) }.bind(this))
		.attr( "cx", function(d) { return this.xs(d.topicIndex+0.5) }.bind(this) )
		.attr( "cy", function(d) { return this.ys(d.termIndex+0.5) }.bind(this) )
		.attr( "r", function(d) { return this.rs(d.value) }.bind(this) )

	// horizontal gridlines (one per term row)
	this.xGridlineLayer.selectAll( "line" ).data( termIndex ).exit().remove();
	this.xGridlineLayer.selectAll( "line" ).data( termIndex ).enter().append( "svg:line" )
		.attr( "x1", this.xs(0.5) )
	this.xGridlineLayer.selectAll( "line" ).data( termIndex )
		.attr( "class", function(d) { return [ "verticalLine", this.normalColor, getTermClassTag(d) ].join(" ") }.bind(this))
		.attr( "x2", this.xs(topicIndex.length-0.5) )
		.attr( "y1", function(d,i) { return this.ys(i+0.5) }.bind(this) )
		.attr( "y2", function(d,i) { return this.ys(i+0.5) }.bind(this) )

	// vertical gridlines (one per topic column, colored by selection state)
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex ).exit().remove();
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex ).enter().append( "svg:line" )
		.attr( "y1", this.ys(0.5) )
	this.yGridlineLayer.selectAll( "line" ).data( topicIndex )
		.attr( "class", function(d, i) { return [ "verticalLine", this.selectedTopics[i], getTopicClassTag(d)].join(" ") }.bind(this))
		.attr( "x1", function(d,i){ return this.xs(i+0.5) }.bind(this) )
		.attr( "x2", function(d,i){ return this.xs(i+0.5) }.bind(this) )
		.attr( "y2", this.ys(termIndex.length-0.5) )
};
TermTopicMatrixView.prototype.initTopLabelView = function(){
	this.topLabelLayer = this.svg.append( "svg:g" )
		.attr( "class", "topLabelLayer" );
};
}.bind(this)) 247 | .on( "mouseover", function(d, i) { this.trigger( "mouseover:topic", i ) }.bind(this)) 248 | .attr( "transform", function(d,i) { return "translate(" + this.xs(i+0.5) + "," + (this.ys(0)-MATRIX_CONTAINER_PADDING.top_separation) + ") rotate(270)" }.bind(this) ) 249 | .text( function(d) { return d } ) 250 | .on( "click", function(d, i) { 251 | dblclickTimer = setTimeout(function(){ clickWork(d, i)}, 200); 252 | }) 253 | .on( "dblclick", function(d, i){ 254 | clearTimeout(dblclickTimer); 255 | dblclickTimer = null; 256 | this.trigger( "doubleClick:topic", i) 257 | }.bind(this)) 258 | 259 | var clickWork = function(d, i) { 260 | if(dblclickTimer === null) 261 | return; 262 | else { 263 | this.trigger( "click:topic", i) 264 | } 265 | }.bind(this); 266 | }; 267 | TermTopicMatrixView.prototype.initLeftLabelView = function(){ 268 | this.leftLabelLayer = this.svg.append( "svg:g" ) 269 | .attr( "class", "leftLabelLayer" ); 270 | }; 271 | TermTopicMatrixView.prototype.updateLeftLabelView = function(){ 272 | var termIndex = this.parentModel.get("termIndex"); 273 | 274 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ).exit().remove(); 275 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ).enter().append( "svg:text" ) 276 | .on( "mouseout", function() { this.trigger( "mouseout:term", "") }.bind(this)) 277 | .attr( "y", 3 ) 278 | this.leftLabelLayer.selectAll( "text" ).data( termIndex ) 279 | .attr( "class", function(d) { return ["leftLabel", this.normalColor, getTermClassTag(d)].join(" ") }.bind(this)) 280 | .on( "mouseover", function(d) { this.trigger( "mouseover:term", d ) }.bind(this)) 281 | .attr( "transform", function(d,i) { return "translate(" + (this.xs(0)-MATRIX_CONTAINER_PADDING.left_separation) + "," + this.ys(i+0.5) + ")" }.bind(this) ) 282 | .text( function(d) { return d } ) 283 | }; 284 | /** end init and update functions **/ 285 | 286 | /** 287 | * Updates the view (public encapsulation used in index.html) 288 | */ 289 | 
TermTopicMatrixView.prototype.update = function() {
    this.renderUpdate();
};


// Interactions
/**
 * Calls appropriate functions to deal with term highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { string } value is the target term ("" means "no term selected")
 * @return { void }
 */
TermTopicMatrixView.prototype.onSelectionTermChanged = function( model, value ) {
    var term = value;
    if(term === "")
        this.unhighlight( true, false );
    else
        this.highlight( term, null );
};
/**
 * Calls appropriate functions to deal with topic highlight event elements
 *
 * @param { model } model is passed but unused
 * @param { int } value is the target topic index (null means "no topic")
 * @return { void }
 */
TermTopicMatrixView.prototype.onSelectionTopicChanged = function( model, value ) {
    var topic = value;
    if(topic === null)
        this.unhighlight( false, true );
    else
        this.highlight( null, topic );
};

/**
 * Highlights elements based on term and/or topic
 *
 * @param { string|null } term   term to highlight, or null to skip terms
 * @param { int|null }    topic  topic index to highlight, or null to skip topics
 * @private
 */
TermTopicMatrixView.prototype.highlight = function( term, topic ) {
    if( term !== null ){
        this.highlightedTerm = term;
        this.svg.selectAll("." + getTermClassTag(term))
            .classed(HIGHLIGHT, true)
    }

    if( topic !== null ){
        var topicIndex = this.parentModel.get("topicIndex");
        var termIndex = this.parentModel.get("termIndex");
        var matrix = this.parentModel.get("matrix");

        this.highlightedTopic = topic;
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topic]))
            .classed(HIGHLIGHT, true)

        // Also highlight the left-hand labels of terms that are prominent
        // in this topic (value above the display threshold).
        // NOTE: loop variable renamed so it no longer shadows the "term"
        // parameter of this function.
        for( var i = 0; i < termIndex.length; i++){
            var rowTerm = termIndex[i];
            if( matrix[i][topic] > THRESHHOLD ){
                this.leftLabelLayer.selectAll("." + getTermClassTag(rowTerm))
                    .classed(HIGHLIGHT, false || true)
            }
        }
    }
};
/**
 * Unhighlights elements based on term and/or topic
 *
 * @param { boolean } term   when true, clear the current term highlight
 * @param { boolean } topic  when true, clear the current topic highlight
 * @private
 */
TermTopicMatrixView.prototype.unhighlight = function( term, topic ) {
    if( term && this.highlightedTerm !== null){
        this.svg.selectAll("." + getTermClassTag(this.highlightedTerm))
            .classed(HIGHLIGHT, false)

        this.highlightedTerm = null;
    }

    // BUGFIX: the guard previously read "this.hightlightedTopic" (typo),
    // which is always undefined, so the branch ran even when no topic was
    // highlighted and selected elements by the class tag of "undefined".
    if( topic && this.highlightedTopic !== null){
        var topicIndex = this.parentModel.get("topicIndex");
        var termIndex = this.parentModel.get("termIndex");
        var matrix = this.parentModel.get("matrix");

        var topicNo = this.highlightedTopic;
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topicNo]))
            .classed(HIGHLIGHT, false)

        // Clear the label highlights set by highlight() for this topic.
        for( var i = 0; i < termIndex.length; i++){
            var rowTerm = termIndex[i];
            if( matrix[i][topicNo] > THRESHHOLD ){
                this.leftLabelLayer.selectAll("." + getTermClassTag(rowTerm))
                    .classed(HIGHLIGHT, false)
            }
        }

        this.highlightedTopic = null;
    }
};

/**
 * Calls appropriate functions to deal with topic selection event elements
 *
 * @param { object } obj contains both target topic index and associated color
 * @return { void }
 */
TermTopicMatrixView.prototype.clickTopic = function( obj ){
    this.selectTopic(obj.topic, obj.color);
};
/**
 * topic selection behavior: swaps the topic's current color class for the
 * requested one on every element tagged with that topic, and records it.
 *
 * @param { int|null } topic       topic index to recolor (null is a no-op)
 * @param { string }   colorClass  CSS color class; DEFAULT maps to normalColor
 * @private
 */
TermTopicMatrixView.prototype.selectTopic = function( topic, colorClass ) {
    var topicIndex = this.parentModel.get("topicIndex");
    if( topic !== null){

        if( colorClass === DEFAULT)
            colorClass = this.normalColor;

        var oldColor = this.selectedTopics[topic];

        // set new color: remove the old color class, add the new one
        this.svg.selectAll("." + getTopicClassTag(topicIndex[topic]))
            .classed(oldColor, false)
            .classed(colorClass, true)

        this.selectedTopics[topic] = colorClass;
    }
};