├── lib
    └── indri_jni.dll.win64
├── src
    ├── main
    │   ├── resources
    │   │   └── public
    │   │   │   ├── chalk-bar.png
    │   │   │   ├── chalk-left-end.png
    │   │   │   ├── chalk-right-end.png
    │   │   │   ├── scripts
    │   │   │       ├── index.js
    │   │   │       └── query.js
    │   │   │   └── stylesheets
    │   │   │       └── index.css
    │   ├── java
    │   │   ├── edu
    │   │   │   └── uncc
    │   │   │   │   └── cs
    │   │   │   │       └── watsonsim
    │   │   │   │           ├── scripts
    │   │   │   │               ├── package-info.java
    │   │   │   │               └── WiktionaryParser.java
    │   │   │   │           ├── datapreparation
    │   │   │   │               └── KingJamesBible.java
    │   │   │   │           ├── scorers
    │   │   │   │               ├── Merge.java
    │   │   │   │               ├── Scorer.java
    │   │   │   │               ├── PassageCount.java
    │   │   │   │               ├── AnswerLength.java
    │   │   │   │               ├── QuestionID.java
    │   │   │   │               ├── LuceneEcho.java
    │   │   │   │               ├── AnswerInPassage.java
    │   │   │   │               ├── LATMentions.java
    │   │   │   │               ├── package-info.java
    │   │   │   │               ├── WShalabyScorer.java
    │   │   │   │               ├── AnswerInQuestionScorer.java
    │   │   │   │               ├── PassageQuestionLengthRatio.java
    │   │   │   │               ├── Correct.java
    │   │   │   │               ├── TopPOS.java
    │   │   │   │               ├── QAKeywordMatch.java
    │   │   │   │               ├── QPKeywordMatch.java
    │   │   │   │               ├── AnswerScorer.java
    │   │   │   │               ├── WordProximity.java
    │   │   │   │               ├── EntropyTest.java
    │   │   │   │               ├── GloveAnswerQuestionContext.java
    │   │   │   │               ├── SkipBigram.java
    │   │   │   │               ├── PercentWordsInCommon.java
    │   │   │   │               ├── WPPageViews.java
    │   │   │   │               ├── GloveAnswerQuestionContextTest.java
    │   │   │   │               ├── CommonConstituents.java
    │   │   │   │               ├── NGram.java
    │   │   │   │               ├── DateMatches.java
    │   │   │   │               ├── Entropy.java
    │   │   │   │               ├── NamedEntityRecognizerScorer.java
    │   │   │   │               ├── PassageTermMatch.java
    │   │   │   │               ├── AnswerPOS.java
    │   │   │   │               ├── LATCheck.java
    │   │   │   │               ├── ElliotMerschScorer.java
    │   │   │   │               ├── StephensonOpenNLPScorer.java
    │   │   │   │               ├── PassageScorer.java
    │   │   │   │               ├── JM_Scorer.java
    │   │   │   │               └── PassageScorerOpenNLPAda.java
    │   │   │   │           ├── QType.java
    │   │   │   │           ├── index
    │   │   │   │               ├── Segment.java
    │   │   │   │               ├── Indri.java
    │   │   │   │               ├── Lucene.java
    │   │   │   │               └── Bigrams.java
    │   │   │   │           ├── nlp
    │   │   │   │               ├── Weighted.java
    │   │   │   │               ├── RelatednessTest.java
    │   │   │   │               ├── DenseVectorsTest.java
    │   │   │   │               ├── Redirects.java
    │   │   │   │               ├── ApproxStringIntMapTest.java
    │   │   │   │               ├── ApproxStringIntMap.java
    │   │   │   │               ├── StringStackTest.java
    │   │   │   │               ├── DenseVectors.java
    │   │   │   │               ├── StringStack.java
    │   │   │   │               └── ClueType.java
    │   │   │   │           ├── researchers
    │   │   │   │               ├── Normalize.java
    │   │   │   │               ├── package-info.java
    │   │   │   │               ├── HyphenTrimmer.java
    │   │   │   │               ├── PassageRetrieval.java
    │   │   │   │               ├── MergeByText.java
    │   │   │   │               ├── AnswerTrimming.java
    │   │   │   │               ├── RedirectSynonyms.java
    │   │   │   │               ├── MergeByCommonSupport.java
    │   │   │   │               ├── StrictFilters.java
    │   │   │   │               ├── PersonRecognition.java
    │   │   │   │               ├── MergeAnswers.java
    │   │   │   │               ├── URLExpander.java
    │   │   │   │               ├── StatsDump.java
    │   │   │   │               ├── Researcher.java
    │   │   │   │               ├── WekaTee.java
    │   │   │   │               └── TagLAT.java
    │   │   │   │           ├── KVTest.java
    │   │   │   │           ├── search
    │   │   │   │               ├── CachingSearcher.java
    │   │   │   │               ├── LucenePassageSearcher.java
    │   │   │   │               ├── MeanDVSearchTest.java
    │   │   │   │               ├── LuceneSearcher.java
    │   │   │   │               ├── IndriSearcher.java
    │   │   │   │               ├── Searcher.java
    │   │   │   │               ├── BingSearcher.java
    │   │   │   │               └── Anagrams.java
    │   │   │   │           ├── DBQuestionSource.java
    │   │   │   │           ├── WebFrontend.java
    │   │   │   │           ├── Log.java
    │   │   │   │           ├── KV.java
    │   │   │   │           ├── Question.java
    │   │   │   │           ├── Database.java
    │   │   │   │           ├── WatsonSim.java
    │   │   │   │           ├── Configuration.java
    │   │   │   │           └── Passage.java
    │   │   └── privatedata
    │   │   │   └── UserSpecificConstants.java.sample
    │   ├── parse.rules
    │   ├── parse.pl
    │   └── scala
    │   │   └── scripts
    │   │       └── BigramBigramIndexer.scala
    └── test
    │   └── java
    │       └── edu
    │           └── uncc
    │               └── cs
    │                   └── watsonsim
    │                       ├── QuestionResultsScorerTest.java
    │                       ├── AnswerMergeTest.java
    │                       ├── StringUtilsTest.java
    │                       ├── ReindexEdgesTest.java
    │                       ├── TypeDetectionTest.java
    │                       ├── DateMatchesTest.java
    │                       ├── CoreNLPSentenceSimilarityTest.java
    │                       └── QClassDetectionTest.java
├── .travis.yml
├── .gitignore
├── scripts
    ├── populate_semantic_graph.py
    ├── gensim
    │   ├── intro1.py
    │   ├── scatter.py
    │   ├── digestion.py
    │   ├── import_glove.py
    │   ├── vstore.py
    │   ├── analogy.py
    │   └── intro-1level.py
    ├── import_trec.py
    ├── convert_arff_to_leveldb.py
    ├── create.sql
    └── svm_graph.py
├── config.properties.sample
├── get_started.sh
└── get_started.py


/lib/indri_jni.dll.win64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/lib/indri_jni.dll.win64


--------------------------------------------------------------------------------
/src/main/resources/public/chalk-bar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-bar.png


--------------------------------------------------------------------------------
/src/main/resources/public/chalk-left-end.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-left-end.png


--------------------------------------------------------------------------------
/src/main/resources/public/chalk-right-end.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-right-end.png


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: java
 2 | jdk:
 3 |   - openjdk7
 4 | 
 5 | install:
 6 |   - TERM=dumb gradle -Ptarget assemble
 7 | 
 8 | script:
 9 |     - TERM=dumb gradle -Ptarget --info check
10 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scripts/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Scripts intended to ease development with watsonsim
3 |  */
4 | /**
5 |  * @author Sean Gallagher
6 |  *
7 |  */
8 | package edu.uncc.cs.watsonsim.scripts;


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/datapreparation/KingJamesBible.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.datapreparation;
 2 | 
 3 | public class KingJamesBible {
 4 | 	
 5 | 	public static void main(String[] args) {
 6 | 		
 7 | 	}
 8 | 
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/Merge.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | /**
 4 |  * These are the ways to merge a score.
 5 |  * @author Sean
 6 |  *
 7 |  */
 8 | public enum Merge {
 9 | 	Mean, Or, Min, Max, Sum
10 | }
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | build/
 3 | .gradle/
 4 | *.jar
 5 | # Package Files #
 6 | *.jar
 7 | *.war
 8 | *.ear
 9 | .classpath
10 | .project
11 | .settings/*
12 | bin/*
13 | data/*
14 | lib/indri_jni.dll
15 | lib/indri_jni.so
16 | src/main/java/privatedata/UserSpecificConstants.java
17 | /bin
18 | config.properties
19 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/Scorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Question;
 7 | 
 8 | public interface Scorer {
 9 | 	public void scoreQuestion(Question q, List<Answer> answers);
10 | }
11 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/QType.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | /**
 4 |  * Enum representing the QType of the Question
 5 |  * 
 6 |  * @author Ken Overholt
 7 |  *
 8 |  */
 9 | public enum QType {
10 | 	FACTOID,
11 | 	FITB,
12 | 	COMMON_BONDS,
13 | 	BEFORE_AND_AFTER,
14 | 	ANAGRAM,
15 | 	QUOTATION
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PassageCount.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | import edu.uncc.cs.watsonsim.Answer;
 3 | import edu.uncc.cs.watsonsim.Question;
 4 | 
 5 | 
 6 | /**
 7 |  * @author Sean Gallagher
 8 |  */
 9 | public class PassageCount extends AnswerScorer {
10 | 	public double scoreAnswer(Question q, Answer a) {
11 | 		return a.passages.size();
12 | 	}
13 | }
14 | 
15 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/index/Segment.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.index;
 2 | 
 3 | import java.io.Closeable;
 4 | import java.util.function.Consumer;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Passage;
 7 | 
 8 | /**
 9 |  * A Segment is a part of the Indexing pipeline
10 |  * It is just the union of Closeable and Consumer
11 |  */
12 | public interface Segment extends Closeable, Consumer<Passage> {
13 | 
14 | }
15 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/Weighted.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | /**
 4 |  * Simple immutable wrapper to express weight or probability
 5 |  * @author Sean Gallagher
 6 |  *
 7 |  * @param <T>
 8 |  */
 9 | public class Weighted<T> {
10 | 	public final T item;
11 | 	public final double weight;
12 | 	public Weighted(T item, double weight) {
13 | 		this.item = item;
14 | 		this.weight = weight;
15 | 	}
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/parse.rules:
--------------------------------------------------------------------------------
 1 | [(?noun_r urn:sent:type ?type_r) <-
 2 |     (?type_r urn:sent:nsubj ?noun_r),
 3 |     (?type_r urn:sent:det ?det),
 4 |     (?type_r urn:sent:cop ?cop)
 5 |     // (?type_r urn:sent:tag urn:sent:NN)
 6 |     // (?type_r urn:sent:idx ?type_idx)
 7 |     // (?noun_r urn:sent:idx ?noun_idx)
 8 | ]
 9 | [(?noun_r urn:sent:type ?type_other),
10 |  (?type_other urn:sent:conj_and ?type_r) ->
11 |     (?noun_r urn:sent:type ?type_r)
12 | ]
13 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerLength.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Question;
 5 | 
 6 | /**
 7 |  * Return the length of the candidate text in chars.
 8 |  * @author Sean Gallagher
 9 |  */
10 | public class AnswerLength extends AnswerScorer {
11 | 	
12 | 	public double scoreAnswer(Question q, Answer a) {
13 | 		return a.text.length();
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/Normalize.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Question;
 7 | import edu.uncc.cs.watsonsim.Score;
 8 | 
 9 | public class Normalize extends Researcher {
10 | 
11 | 	@Override
12 | 	public List<Answer> question(Question q, List<Answer> candidates) {
13 | 		return Score.normalizeGroup(candidates);
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/package-info.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Question transformation functions intended to improve later scoring.
 3 |  * Researchers can modify the question in unspecified ways, and thus the order
 4 |  * of execution of researchers matters.
 5 |  * Try to pick a more structured way of modifying the question if applicable.
 6 |  * For example, scoring should use a Scorer.
 7 |  * 
 8 |  * @author Sean Gallagher
 9 |  */
10 | package edu.uncc.cs.watsonsim.researchers;


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/QuestionID.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Question;
 5 | 
 6 | /**
 7 |  * A bogus scorer whose purpose is to collate answers to the same question
 8 |  * @author Sean Gallagher
 9 |  */
10 | public class QuestionID extends AnswerScorer {
11 | 
12 | 	@Override
13 | 	public double scoreAnswer(Question q, Answer a) {
14 | 		return q.text.hashCode();
15 | 	}
16 | 
17 | }
18 | 


--------------------------------------------------------------------------------
/scripts/populate_semantic_graph.py:
--------------------------------------------------------------------------------
 1 | import sqlite3
 2 | import leveldb
 3 | ldb = leveldb.LevelDB("data/edges-leveldb")
 4 | sdb = sqlite3.connect("sources.db")
 5 | block=[]
 6 | for k, v in ldb.RangeIter():
 7 |     block.append(k.decode("utf8").split("\t", 2) + [int(v.decode("utf8"))])
 8 |     if len(block) > 1000000:
 9 |         s = sdb.executemany("INSERT INTO semantic_graph(source, tag, target, count) VALUES (?, ?, ?, ?);", block);
10 |         print('.', end='')
11 |         block=[]
12 |         sdb.commit()


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/LuceneEcho.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Passage;
 5 | import edu.uncc.cs.watsonsim.Phrase;
 6 | 
 7 | /**
 8 |  * Take advantage of the Scorer dimension reduction for Lucene passages
 9 |  */
10 | public class LuceneEcho extends PassageScorer {
11 | 
12 | 	@Override
13 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
14 | 		return p.scores.get("LUCENE_SCORE");
15 | 	}
16 | 	
17 | }
18 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerInPassage.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | /*
 4 |  * Author: Chris Stephenson
 5 |  * later rewritten by Sean
 6 |  */
 7 | 
 8 | import edu.uncc.cs.watsonsim.Answer;
 9 | import edu.uncc.cs.watsonsim.Passage;
10 | import edu.uncc.cs.watsonsim.Phrase;
11 | 
12 | /**
13 |  * Flags passages whose text contains the candidate answer text.
14 |  */
15 | public class AnswerInPassage extends PassageScorer {
16 | 	/**
17 | 	 * @param q the question phrase (unused)
18 | 	 * @param a the candidate answer whose text is searched for
19 | 	 * @param p the passage whose text is searched
20 | 	 * @return 1 if the passage text contains the answer text, otherwise 0
21 | 	 */
22 | 	@Override
23 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
24 | 		if (p.text.contains(a.text)) {
25 | 			return 1;
26 | 		}
27 | 		return 0;
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/LATMentions.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Question;
 5 | import edu.uncc.cs.watsonsim.scorers.AnswerScorer;
 6 | 
 7 | /**
 8 |  * Return how many unique LAT's there are for an answer. 
 9 |  * @author Sean
10 |  *
11 |  */
12 | public class LATMentions extends AnswerScorer {
13 | 	
14 | 	@Override
15 | 	public double scoreAnswer(Question q, Answer a) {
16 | 		return a.lexical_types.size();
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/package-info.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Text analyzers, for differentiating passages to improve ranking.
 3 |  * 
 4 |  * Scorers measure some aspect of the answer or passage, possibly in relation
 5 |  * to the question. Every scorer must return a primitive double.
 6 |  * <p>
 7 |  * Remember that the purpose of a scorer is not to provide a perfect rank on
 8 |  * its own, only to differentiate "good" and "bad" passages in some meaningful
 9 |  * way. As such, the scale and sign are not very important.
10 |  */
11 | package edu.uncc.cs.watsonsim.scorers;


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/HyphenTrimmer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Phrase;
 5 | 
 6 | 
 7 | /** Trim any text from before the hyphen in the candidate text of an answer */
 8 | public class HyphenTrimmer extends Researcher {
 9 | 	
10 | 	public Answer answer(Phrase q, Answer a) {
11 | 		String[] improved_answer_parts = a.text.split("[-:(|]");
12 | 		
13 | 		if (improved_answer_parts.length>0) {
14 | 			return a.withText(improved_answer_parts[0].trim());
15 | 		}
16 | 		return a;
17 | 	}
18 | 
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/parse.pl:
--------------------------------------------------------------------------------
 1 | is_type_statement(A, B) :-
 2 |     (nsubj(A, B), cop(B, _), det(B, _));
 3 |     (nsubj(B, A), cop(B, _), det(B, _)).
 4 | 
 5 | /* A plain type */
 6 | type_a(Name, Type) :-
 7 |     is_type_statement(Name, Type),
 8 |     det(Type, _).
 9 | 
10 | /* A type with subject conjunctions */
11 | type_b(Name, Type) :-
12 |     type_a(Name, Type) ;
13 |     (type_a(AnotherName, Type),
14 |      conj_and(AnotherName, Name)).
15 | 
16 | /* A type with subject or type conjunctions */
17 | type_c(Name, Type) :-
18 |     type_b(Name, Type) ;
19 |     (type_b(Name, AnotherType),
20 |      conj_and(AnotherType, Type)).
21 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/WShalabyScorer.java:
--------------------------------------------------------------------------------
 1 | /**
 2 | *
 3 | * @author Walid Shalaby
 4 | */ 
 5 | 
 6 | package edu.uncc.cs.watsonsim.scorers;
 7 | 
 8 | import edu.uncc.cs.watsonsim.Answer;
 9 | import edu.uncc.cs.watsonsim.Passage;
10 | import edu.uncc.cs.watsonsim.Phrase;
11 | 
12 | public class WShalabyScorer extends PassageScorer {
13 | 
14 | 	/**
15 | 	 * Currently a placeholder: every passage scores 0.0 regardless of input.
16 | 	 * NOTE(review): earlier notes here described detecting whether the question
17 | 	 * matches the answer (to ease machine learning) and a TODO about not
18 | 	 * reassigning for every passage; no such logic is present in this method.
19 | 	 *
20 | 	 * @param q the question phrase (unused)
21 | 	 * @param a the candidate answer (unused)
22 | 	 * @param p the passage (unused)
23 | 	 * @return always 0.0
24 | 	 */
25 | 	@Override
26 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
27 | 		return 0.0;
28 | 	}
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/resources/public/scripts/index.js:
--------------------------------------------------------------------------------
 1 | /*$(function() {
 2 |     $("#search").ajaxForm({
 3 |     beforeSubmit: function() {
 4 |         $("#note").text("Asking learned grand-masters for insight.");
 5 |         return true; 
 6 |     },
 7 |     success: function(response) {
 8 |         $("#note").empty();
 9 |         $("#results").empty();
10 |         response.answers.forEach(function(item) {
11 |             var x = $("<li>"+item.title+"</li>");
12 |             x[0].style.background = "linear-gradient(#4FA5C2 " + 100 * item.score + ", #C8DAE0 " + 100 * item.score + ")";
13 |             $("#results").append(x);
14 |         });
15 |     }});
16 | });*/
17 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerInQuestionScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Question;
 5 | 
 6 | /**
 7 |  * Returns 1.0 if the answer text is found in the question and 0.0 otherwise
 8 |  * @author Ken Overholt
 9 |  *
10 |  */
11 | public class AnswerInQuestionScorer extends AnswerScorer {
12 | 	
13 | 	@Override
14 | 	public double scoreAnswer(Question q, Answer a) {
15 | 		String qtext = q.text.toLowerCase();
16 | 		String atext = a.text.toLowerCase();
17 | 		
18 | 		if (qtext.contains(atext))
19 | 			return 1.0;
20 | 		else
21 | 			return 0.0;		
22 | 	}
23 | 	
24 | }
25 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/QuestionResultsScorerTest.java:
--------------------------------------------------------------------------------
 1 | /**
 2 | *
 3 | * @author Walid Shalaby
 4 | */
 5 | 
 6 | package edu.uncc.cs.watsonsim;
 7 | import edu.uncc.cs.watsonsim.researchers.CombineScores;
 8 | 
 9 | 
10 | public class QuestionResultsScorerTest {
11 | 	
12 | 	public static void main(String[] args) {
13 | 		try {
14 | 			CombineScores q = new CombineScores();
15 | 			System.out.println("scoring: {indri-rank=1, indri-score=-1.582, lucene-rank=1, lucene-score=7.215, google-rank=1} ==> " + 
16 | 					q.score(new double[]{1,-1.582,1,7.215,1}));			
17 | 		} catch (Exception e) {
18 | 			// TODO Auto-generated catch block
19 | 			e.printStackTrace();
20 | 		}
21 | 		
22 | 	}
23 | 	
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/KVTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Before;
 6 | import org.junit.Test;
 7 | 
 8 | /**
 9 |  * Tests for the static conversion helpers on {@link KV}.
10 |  */
11 | public class KVTest {
12 | 
13 | 	@Before
14 | 	public void setUp() throws Exception {
15 | 	}
16 | 
17 | 	@Test
18 | 	public void testGet() {
19 | 		fail("Not yet implemented");
20 | 	}
21 | 
22 | 	/**
23 | 	 * Check both conversion directions against a known pair of values.
24 | 	 * The expected bytes are the little-endian IEEE 754 encodings of the
25 | 	 * two floats (12.0f is 0x41400000, 0.99f is 0x3F7D70A4).
26 | 	 */
27 | 	@Test
28 | 	public void testAsVectorAsBytes() {
29 | 		float[] f = {(float) 12.0, (float) 0.99};
30 | 		byte[] b = {0, 0, 64, 65, -92, 112, 125, 63};
31 | 		// Compare whole arrays so that a wrong conversion actually fails the test
32 | 		assertArrayEquals(b, KV.asBytes(f));
33 | 		assertArrayEquals(f, KV.asVector(b), 0.0f);
34 | 	}
35 | 
36 | 	@Test
37 | 	public void testQuickGetOrCompute() {
38 | 		fail("Not yet implemented");
39 | 	}
40 | 
41 | }
42 | 


--------------------------------------------------------------------------------
/config.properties.sample:
--------------------------------------------------------------------------------
 1 | # This is a .properties file. There is no need for quotes but see Wikipedia
 2 | # for a list of things that need to be escaped, and the precise syntax
 3 | # Most of what you need in this file should be straightforward though
 4 | lucene_index = data/v1.5/lucene_index
 5 | jena_lucene_index = data/rdf/lucene
 6 | indri_index = data/v1.5/indri_index
 7 | indri_enabled = false
 8 | terrier_index = OPTIONAL UNTIL TERRIER IS IMPLEMENTED
 9 | 
10 | bing_api_key = FILL ME IN
11 | 
12 | # The following are optional until the Google search is finished
13 | google_app_name = FILL ME IN
14 | google_api_key = FILL ME IN
15 | google_custom_search_id = FILL ME IN
16 | 
17 | # Setup your SQL database (you may need to edit this)
18 | jdbc_connection_string = jdbc:sqlite:data/watsonsim.db
19 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PassageQuestionLengthRatio.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | /* 
 4 |  * @author Wlodek
 5 |  */
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | 
11 | /**
12 |  * Scores a passage by how long its text is relative to the question text.
13 |  */
14 | public class PassageQuestionLengthRatio extends PassageScorer {
15 | 	
16 | 	/**
17 | 	 * Divide the passage length by the question length, both in chars.
18 | 	 *
19 | 	 * @param q the question phrase; its text length is the denominator
20 | 	 * @param a the candidate answer (unused)
21 | 	 * @param p the passage; its text length is the numerator
22 | 	 * @return the length ratio, or 0.0 when the question text is empty
23 | 	 */
24 | 	@Override
25 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
26 | 		int pl = p.text.length();
27 | 		int ql = q.text.length();
28 | 		// An empty question has no meaningful ratio (and int division would throw)
29 | 		if (ql == 0) {
30 | 			return 0.0;
31 | 		}
32 | 		// Cast before dividing: int/int would truncate the ratio to a whole number
33 | 		return (double) pl / ql;
34 | 	}
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/privatedata/UserSpecificConstants.java.sample:
--------------------------------------------------------------------------------
 1 | package privatedata;
 2 | 
 3 | public class UserSpecificConstants {
 4 | 	// Constants
 5 | 	public static final String googleApplicationName = "";
 6 | 	public static final String googleAPIKey = ""; //Google provided API key
 7 | 	public static final String googleCustomSearchID = "";
 8 | 	
 9 | 	public static final String indriIndex = "data/indri_index";
10 | 	public static final String luceneIndex = "data/lucene_index";
11 | 	public static final String bingAPIKey = "aaaaaaaaaaa/aaaaaaaaaaa/aaaaaaaaaaaaaaaaaaa";
12 | 	public static final String luceneSearchField = "text";
13 | 	public static final String indriResultsFilter = "#filrej(list.title #combine(%s))"; 
14 | 	public static final String luceneResultsFilter = " NOT title:*\\:*" + " NOT title:list*";
15 | }
16 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/Correct.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Environment;
 5 | import edu.uncc.cs.watsonsim.Question;
 6 | import edu.uncc.cs.watsonsim.nlp.Relatedness;
 7 | 
 8 | /**
 9 |  * Scorer that produces the target attribute for Machine Learning.
10 |  */
11 | public class Correct extends AnswerScorer {
12 | 	private final Relatedness syn;
13 | 	public Correct(Environment env) {
14 | 		syn = new Relatedness(env);
15 | 	}
16 | 	/**
17 | 	 * Generate the target attribute for Machine Learning.
18 | 	 * @param q the question; its correct_answer may be null
19 | 	 * @param a the candidate answer to judge
20 | 	 * @return correctness: 0.0 -> incorrect (also when the question has no
21 | 	 *         correct_answer), 1.0 -> correct, i.e. {@code syn.implies}
22 | 	 *         accepted the pair of correct answer and candidate
23 | 	 */
24 | 	@Override
25 | 	public double scoreAnswer(Question q, Answer a) {
26 | 		if (q.correct_answer == null) {
27 | 			return 0;
28 | 		} else {
29 | 			return syn.implies(q.correct_answer, a) ? 1 : 0;
30 | 		}
31 |         
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/scripts/gensim/intro1.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 3 | from gensim import corpora, models, similarities
 4 | corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
 5 |            [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
 6 |            [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
 7 |            [(0, 1.0), (4, 2.0), (7, 1.0)],
 8 |            [(3, 1.0), (5, 1.0), (6, 1.0)],
 9 |            [(9, 1.0)],
10 |            [(9, 1.0), (10, 1.0)],
11 |            [(9, 1.0), (10, 1.0), (11, 1.0)],
12 |            [(8, 1.0), (10, 1.0), (11, 1.0)]]
13 | 
14 | tfidf = models.TfidfModel(corpus)
15 | index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
16 | 
17 | vec = [(0, 1), (4, 1)]
18 | sims = index[tfidf[vec]]
19 | print(list(enumerate(sims)))
20 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/TopPOS.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import org.apache.log4j.Logger;
 4 | 
 5 | import edu.stanford.nlp.trees.Tree;
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | 
 9 | /**
10 |  * Simple hashed POS-tag, mod 100 and scaled to between 0 and 1.
11 |  */
12 | public class TopPOS extends AnswerScorer {
13 | 	private final Logger log = Logger.getLogger(getClass());
14 | 
15 | 	public double scoreAnswer(Question q, Answer a) {
16 | 		for (Tree tree : a.getTrees()) {
17 | 			for (Tree child : tree.children()) {
18 | 				log.debug(a.text + " is a " + child.label().value() + " : " + (child.label().value().hashCode() % 100) / 100.0);
19 | 				return (child.label().value().hashCode() % 10) / 10.0;	
20 | 			}
21 | 		}
22 | 		return 0.0;
23 | 	}
24 | }
25 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/CachingSearcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.util.List;
 4 | import com.google.gson.reflect.TypeToken;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Environment;
 7 | import edu.uncc.cs.watsonsim.Passage;
 8 | 
 9 | /**
10 |  * Wraps another Searcher and routes every query through
11 |  * env.computeIfAbsent, keyed by the engine name and the query text.
12 |  */
13 | public class CachingSearcher extends Searcher {
14 | 	private final Searcher searcher;
15 | 	private final String engine_name;
16 | 
17 | 	/**
18 | 	 * @param env			environment passed on to the superclass
19 | 	 * @param searcher		searcher that is asked when computeIfAbsent invokes the supplier
20 | 	 * @param engine_name	label that becomes part of every key
21 | 	 */
22 | 	public CachingSearcher(Environment env, Searcher searcher, String engine_name) {
23 | 		super(env);
24 | 		this.engine_name = engine_name;
25 | 		this.searcher = searcher;
26 | 	}
27 | 	
28 | 	/**
29 | 	 * @param query	the query text
30 | 	 * @return	whatever env.computeIfAbsent yields for this engine and query
31 | 	 */
32 | 	public List<Passage> query(String query) {
33 | 		final String key = "search:" + engine_name + ":" + query;
34 | 		return env.computeIfAbsent(
35 | 				key,
36 | 				k -> searcher.query(query),
37 | 				new TypeToken<List<Passage>>(){}.getType());
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/RelatednessTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Before;
 6 | import org.junit.Test;
 7 | 
 8 | import edu.uncc.cs.watsonsim.Environment;
 9 | 
10 | public class RelatednessTest {
11 | 	private Relatedness rel;
12 | 	@Before
13 | 	public void setUp() throws Exception {
14 | 		rel = new Relatedness(new Environment());
15 | 	}
16 | 
17 | 	@Test
18 | 	public void testViaWikiLinks() {
19 | 		fail("Not yet implemented");
20 | 	}
21 | 
22 | 	@Test
23 | 	public void testMatchViaSearch() {
24 | 		fail("Not yet implemented");
25 | 	}
26 | 
27 | 	@Test
28 | 	public void testMatchViaLevenshtein() {
29 | 		fail("Not yet implemented");
30 | 	}
31 | 
32 | 	@Test
33 | 	public void testImplies() {
34 | 		fail("Not yet implemented");
35 | 	}
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/QAKeywordMatch.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Question;
 7 | import edu.uncc.cs.watsonsim.StringUtils;
 8 | 
 9 | /*Author : Ricky Sanders
10 |  * 
11 |  * Checks the Question against the answer to remove
12 |  * answers that closely match the question
13 |  * 
14 |  */
15 | 
16 | public class QAKeywordMatch extends AnswerScorer {
17 | 	public double scoreAnswer(Question q, Answer a){
18 | 		List<String> questionTextArray = StringUtils.tokenize(q.text);
19 | 		List<String> answerTextArray = StringUtils.tokenize(a.text);
20 | 		int count = 0;
21 | 		for (String word : questionTextArray)
22 | 			if (answerTextArray.contains(word))
23 | 				count += 1;
24 | 		return (count / (double)questionTextArray.size());
25 | 	}
26 | }
27 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/AnswerMergeTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | /**
 6 |  * Test for matching answers whose titles are similar. All assertions are
 7 |  * currently commented out, so the single test passes without checking anything.
 8 |  */
 9 | public class AnswerMergeTest {
10 | 
11 | 	@Test
12 | 	public void testMatches() {
13 | 		// Results are equal if their titles are similar. This uses match_subset.
14 | 		// NOTE(review): the assertions below are disabled; presumably the
15 | 		// Answer constructor or matches() they call has changed -- confirm
16 | 		// before re-enabling them.
17 | 		/*assertTrue(
18 | 			new Answer("engine", "duck duck", "text", "reference", 0, 0).matches(
19 | 			new Answer("engine", "duck duck goose", "text", "reference", 0, 0)));
20 | 		
21 | 		assertFalse(
22 | 			new Answer("engine", "duck duck goose", "text", "reference", 0, 0).matches(
23 | 			new Answer("engine", "duck duck", "text", "reference", 0, 0)));
24 | 
25 | 		assertTrue(
26 | 			new Answer("engine", "sitting on a fence", "text", "reference", 0, 0).matches(
27 | 			new Answer("engine", "Pete and repeat were sitting on a fence", "text", "reference", 0, 0)));
28 | 			*/
29 | 	}
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/DBQuestionSource.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | import java.sql.PreparedStatement;
 3 | import java.sql.ResultSet;
 4 | import java.sql.SQLException;
 5 | import java.util.ArrayList;
 6 | 
 7 | /**
 8 |  * A list of questions read from the "questions" database table.
 9 |  */
10 | public class DBQuestionSource extends ArrayList<Question> {
11 | 	private static final long serialVersionUID = 1L;
12 | 	
13 | 	/** Run an arbitrary query on the database to get questions.
14 | 	 * 
15 | 	 * @param env			environment whose database is queried
16 | 	 * @param conditions	SQL text appended verbatim after the table name.
17 | 	 * 						It is concatenated into the statement, so it must
18 | 	 * 						never be built from untrusted input.
19 | 	 * @throws SQLException	if the statement cannot be prepared or executed
20 | 	 */
21 | 	public DBQuestionSource(Environment env, String conditions) throws SQLException {
22 | 		PreparedStatement query = env.db.prep("select question, answer, category from questions "
23 | 				+ conditions + ";");
24 | 		// Release the cursor as soon as the rows are copied, even on failure.
25 | 		// NOTE(review): the statement itself is left open in case
26 | 		// env.db.prep() caches and reuses it -- confirm.
27 | 		try (ResultSet rows = query.executeQuery()) {
28 | 			read_results(rows);
29 | 		}
30 | 	}
31 | 	
32 | 	/**
33 | 	 * Append one Question per remaining row of the result set.
34 | 	 * The result set is not closed here; that is left to the caller.
35 | 	 * 
36 | 	 * @param sql	rows with "question", "answer" and "category" columns
37 | 	 * @throws SQLException	if reading a row fails
38 | 	 */
39 | 	public void read_results(ResultSet sql) throws SQLException {
40 | 		while(sql.next()){
41 | 			Question q = Question.known(
42 | 					sql.getString("question"),
43 | 					sql.getString("answer"),
44 | 					sql.getString("category")
45 | 				);
46 | 			add(q);
47 | 		}
48 | 	}
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/QPKeywordMatch.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Passage;
 7 | import edu.uncc.cs.watsonsim.Phrase;
 8 | import edu.uncc.cs.watsonsim.StringUtils;
 9 | 
10 | /*Author : Jacob Medd, Jagan Vujjini
11 |  * 
12 |  * Just Modified Jacob Medd's Scorer to ignore Stop Words.
13 |  * Will be adding the Stemmed Words Functionality.
14 |  *
15 |  * 
16 |  * Later modified. It seems that:
17 |  *   (% word in common) / (mean distance between common words)
18 |  * is a constant.
19 |  * 
20 |  * So just use one of them, and the % in common is easiest.
21 |  */
22 | 
23 | public class QPKeywordMatch extends PassageScorer {
24 | 	
25 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
26 | 			List<String> questionTextArray = StringUtils.tokenize(q.text);
27 | 			int count = 0;
28 | 			for (String word : questionTextArray)
29 | 				if (p.getTokens().contains(word))
30 | 					count += 1;
31 | 			return (count / (double)questionTextArray.size());
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Question;
 7 | import edu.uncc.cs.watsonsim.Score;
 8 | 
 9 | public abstract class AnswerScorer implements Scorer {
10 | 	String name;
11 | 	{
12 | 		name = this.getClass().getSimpleName().replaceAll("([a-z])([A-Z]+)", "$1_$2").toUpperCase();
13 | 		Score.register(name, 0.0, Merge.Sum);
14 | 	}
15 | 	/**
16 | 	 * By default, score every answer to a question.
17 | 	 * Remember to call scoreAnswer if you override this.
18 | 	 * @param q		Question
19 | 	 */
20 | 	@Override
21 | 	public void scoreQuestion(Question q, List<Answer> answers) {
22 | 		for (Answer a : answers)
23 | 			a.score(name, scoreAnswer(q, a));		
24 | 	}
25 | 	
26 | 	/**
27 | 	 * Override this method with your scorer implementation.
28 | 	 * @param q		Question
29 | 	 * @param a		Answer
30 | 	 * @return	The score for this answer, or NaN if not applicable.
31 | 	 */
32 | 	public double scoreAnswer(Question q, Answer a) {
33 | 		return 0.0;
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/scripts/gensim/scatter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from matplotlib import pyplot as plt
 3 | from sklearn.decomposition import PCA, KernelPCA
 4 | from sklearn.manifold import Isomap, TSNE
 5 | from analogy import Analogy
 6 | from vstore import VStore
 7 | 
 8 | a = Analogy(VStore("vectors.lmdb", "big-glove"))
 9 | 
10 | buf = ""
11 | linebuf = raw_input("Please enter some words to plot, or empty for a canned list: ")
12 | while linebuf:
13 | 	buf += linebuf + " "
14 | 	linebuf = raw_input("... ")
15 | 
16 | 
17 | labels = buf.split() \
18 |     or "doctor nurse politician senator lawyer barrister defend accuse heal treat cure elect vote".split() 
19 | 
20 | vs = [a.w(x) for x in labels if a.w(x) is not None  ]
21 | flatplot = TSNE(2)
22 | ps = flatplot.fit_transform(vs)
23 | 
24 | plt.title("Reduced vector space model")
25 | plt.xlabel("First Principal Component")
26 | plt.ylabel("Second Principal Component")
27 | plt.scatter(ps[:, 0], ps[:, 1])
28 | for (x, y), label in zip(ps, labels):
29 |     print "plotting %f, %f, %s" %(x, y, label)
30 |     plt.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points')
31 | 
32 | plt.show()
33 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/StringUtilsTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | import edu.uncc.cs.watsonsim.StringUtils;
 8 | 
 9 | /**
10 |  * Tests for StringUtils.matchSubset and StringUtils.canonicalize.
11 |  */
12 | public class StringUtilsTest {
13 | 
14 | 	@Test
15 | 	public void test_match_subset() {
16 | 		assertTrue(StringUtils.matchSubset("cat toy", "cat toy"));
17 | 		
18 | 		assertTrue(StringUtils.matchSubset("thundering applause", "resounding, thundering applause"));
19 | 		
20 | 		assertTrue(StringUtils.matchSubset("What is for dinner, mother?", "What, is mother for dinner?"));
21 | 	}
22 | 	
23 | 	@Test
24 | 	public void test_filter_relevant() {
25 | 		// assertEquals takes (expected, actual); keeping that order makes the
26 | 		// failure message report the right value as "expected".
27 | 		assertEquals("cat toy", StringUtils.canonicalize("cat toy"));
28 | 		assertEquals("resounding thundering applause", StringUtils.canonicalize("resounding, thundering applause"));
29 | 		assertEquals("what dinner mother", StringUtils.canonicalize("What is for dinner, mother?"));
30 | 		assertEquals("i am walaby", StringUtils.canonicalize("I am a walaby")); // This is more documentation than test
31 | 		assertEquals("i am walaby", StringUtils.canonicalize("I\n\t am   a walaby~!@#$%^&*()_+`-={}[]:\";\'<>?,./"));
32 | 	}
33 | 
34 | }
35 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/DenseVectorsTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.Optional;
 6 | 
 7 | import org.junit.Before;
 8 | import org.junit.Test;
 9 | import static edu.uncc.cs.watsonsim.nlp.DenseVectors.*;
10 | 
11 | /**
12 |  * Tests for DenseVectors.sim on looked-up word vectors and on hand-built
13 |  * arrays.
14 |  */
15 | public class DenseVectorsTest {
16 | 
17 | 	@Before
18 | 	public void setUp() throws Exception {
19 | 	}
20 | 
21 | 	@Test
22 | 	public void testSim() {
23 | 		// assertEquals takes (expected, actual, delta); keeping that order
24 | 		// makes the failure message report the right value as "expected".
25 | 		assertEquals(0.54, sim(vectorFor("diabetes"), vectorFor("retinopathy")), 0.01);
26 | 		assertEquals(0.78, sim(vectorFor("diabetes"), vectorFor("diabetic")), 0.01);
27 | 		// Junk words, the empty word and the zero vector are all expected to score 0
28 | 		assertEquals(0.00, sim(vectorFor("(*&(*&^(*&^"), vectorFor("diabetic")), 0.01);
29 | 		assertEquals(0.00, sim(vectorFor("diabetes"), vectorFor("")), 0.01);
30 | 		assertEquals(0.00, sim(vectorFor("diabetes"), Optional.of(new float[300])), 0.01);
31 | 		
32 | 		// X and Y lie on different axes; Z lies halfway between them
33 | 		float[] X = new float[300]; X[0] = 0.5f;
34 | 		float[] Y = new float[300]; Y[1] = 0.5f;
35 | 		float[] Z = new float[300]; Z[0] = 0.5f; Z[1] = 0.5f;
36 | 		
37 | 		assertEquals(0.0, sim(X, Y), 0.01);
38 | 		assertEquals(0.707, sim(X, Z), 0.01);
39 | 		assertEquals(1.0, sim(X, X), 0.01);
40 | 	}
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/WordProximity.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.Arrays;
 4 | import java.util.HashSet;
 5 | import java.util.List;
 6 | import java.util.Set;
 7 | 
 8 | import edu.uncc.cs.watsonsim.Answer;
 9 | import edu.uncc.cs.watsonsim.Passage;
10 | import edu.uncc.cs.watsonsim.Phrase;
11 | import edu.uncc.cs.watsonsim.Question;
12 | 
13 | /**
14 |  * Scores a passage by how closely together the question's words occur in
15 |  * it: the logs of the gaps between successive question words are added up.
16 |  */
17 | public class WordProximity extends PassageScorer {
18 | 	// Words of the question most recently given to scoreQuestion
19 | 	Set<String> q_words = new HashSet<String>();
20 | 	
21 | 	/**
22 | 	 * Remember the words of this question, then score as usual.
23 | 	 */
24 | 	@Override
25 | 	public void scoreQuestion(Question q, List<Answer> answers) {
26 | 		String[] words = q.text.split("\\W+");
27 | 		q_words.clear();
28 | 		q_words.addAll(Arrays.asList(words));
29 | 		super.scoreQuestion(q, answers);
30 | 	}
31 | 	
32 | 	/**
33 | 	 * @param q	not used; the words remembered by scoreQuestion are used instead
34 | 	 * @param a	not used
35 | 	 * @param p	passage whose text is scanned word by word
36 | 	 * @return	the sum of log(gap) over every question word found, where gap
37 | 	 * 			counts the words since the previous hit (starting from 1)
38 | 	 */
39 | 	@Override
40 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
41 | 		double gap = 1;
42 | 		double log_gap_total = 0;
43 | 		
44 | 		String[] passage_words = p.text.split("\\W+");
45 | 		for (int i = 0; i < passage_words.length; i++) {
46 | 			if (!q_words.contains(passage_words[i])) {
47 | 				gap++;
48 | 				continue;
49 | 			}
50 | 			log_gap_total += Math.log(gap);
51 | 			gap = 1;
52 | 		}
53 | 
54 | 		// This result is given as log(interval). Does that matter?
55 | 		return log_gap_total;
56 | 	}
57 | 	
58 | }
59 | 


--------------------------------------------------------------------------------
/scripts/gensim/digestion.py:
--------------------------------------------------------------------------------
 1 | from gensim import corpora
 2 | import re
 3 | 
 4 | def filter_alnum(text):
 5 |     return re.findall("\w+", text)
 6 | 
 7 | def cbow_dict(source):
 8 |     return corpora.Dictionary([[w] for w in open(source).read().lower().split()])
 9 | 
10 | def line_dict(source):
11 |     return corpora.Dictionary([filter_alnum(l) for l in open(source)])
12 | 
13 | class CBOWCorpus(object):
14 |     def __init__(self, source, dictionary):
15 |         self.dictionary = dictionary
16 |         self.words = [ w for w in open(source).read().split() if w not in stoplist]
17 | 
18 |     def __len__(self): # this is O(n)
19 |         return len(self.words)-4
20 | 
21 |     def __iter__(self):
22 |         for i in xrange(len(self.words)-4):
23 |             yield self.dictionary.doc2bow(self.words[i:i+4])
24 | 
25 | class LineCorpus(object):
26 |     def __init__(self, source, dictionary):
27 |         self.source = source
28 |         self.dictionary = dictionary
29 | 
30 |     def __len__(self):
31 |         i=0
32 |         for line in open(self.source):
33 |             i += 1
34 |         return i
35 | 
36 |     def __iter__(self):
37 |         for line in open(self.source):
38 |             yield self.dictionary.doc2bow(filter_alnum(line))
39 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/WebFrontend.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | import static spark.Spark.*;
 3 | 
 4 | import java.util.List;
 5 | 
 6 | import spark.*;
 7 | 
 8 | /**
 9 |  * Small web front end: serves the static files under "public" and answers
10 |  * GET /ask?query=... with the pipeline's answers as JSON.
11 |  */
12 | public class WebFrontend {
13 | 
14 | 	public static void main(String[] args) {
15 | 		Spark.staticFileLocation("public");
16 | 		//externalStaticFileLocation("public");
17 | 		get("/ask", (Request request, Response response) -> {
18 | 			Question question = new Question(request.queryParams("query"));
19 | 			List<Answer> answers = new DefaultPipeline().ask(question);
20 | 
21 | 			// Throw whole summaries of the data at the client.
22 | 			// The separator is written before every element but the first, so
23 | 			// an empty answer list yields "[]" instead of a substring error.
24 | 			StringBuilder output = new StringBuilder();
25 | 			String separator = "";
26 | 			for (Answer r : answers) {
27 | 				output.append(separator).append(r.toJSON());
28 | 				separator = ",";
29 | 			}
30 | 
31 | 			response.type("application/json");
32 | 			// Two objects are opened, so two must be closed for valid JSON.
33 | 			return String.format("{\"id\": {\"answers\": [%s]}}", output);
34 | 		});
35 | 	}
36 | 
37 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/EntropyTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.Arrays;
 6 | import java.util.List;
 7 | 
 8 | import org.junit.Before;
 9 | import org.junit.Test;
10 | 
11 | import edu.uncc.cs.watsonsim.Environment;
12 | 
13 | /**
14 |  * Checks that Entropy ranks unusual wording above everyday wording.
15 |  */
16 | public class EntropyTest {
17 | 	Entropy e;
18 | 
19 | 	@Before
20 | 	public void setUp() throws Exception {
21 | 		e = new Entropy(new Environment());
22 | 	}
23 | 
24 | 	@Test
25 | 	public void testGetEntropy() {
26 | 		// Two rare words against two very common ones
27 | 		List<String> rareWords = Arrays.asList("zucchini", "sepals");
28 | 		List<String> commonWords = Arrays.asList("the", "of");
29 | 		assertTrue(e.entropy(rareWords) > e.entropy(commonWords));
30 | 		
31 | 		// An ornate paragraph against a simple one
32 | 		String ornate = "Subverting Randall’s editor’s admiral intentions, "
33 | 				+ "alternative enjoyment ensues composing complete "
34 | 				+ "paragraphs entirely shunning Randall’s thousand "
35 | 				+ "commonest dictionary terms. Bombastic prose "
36 | 				+ "frequently results.";
37 | 		String plain = "See spot run. Spot runs fast. Spot and Joey play in the "
38 | 				+ "park.";
39 | 		List<String> ornateWords = Arrays.asList(ornate.split(" "));
40 | 		List<String> plainWords = Arrays.asList(plain.split(" "));
41 | 		assertTrue(e.entropy(ornateWords) > e.entropy(plainWords));
42 | 	}
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/ReindexEdgesTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | import static edu.stanford.nlp.util.Triple.makeTriple;
 8 | import edu.uncc.cs.watsonsim.index.Edges;
 9 | import static java.util.Arrays.asList;
10 | 
11 | /**
12 |  * Tests the (head, relation, dependent) triples produced by
13 |  * Edges.generateEdges.
14 |  */
15 | public class ReindexEdgesTest {
16 | 
17 | 	/** A one-sentence phrase must yield exactly these three edges, in order. */
18 | 	@Test
19 | 	public void testSimpleExample() {
20 | 		final Phrase sentence = new Phrase("This is an example.");
21 | 		final Object expectedEdges = asList(
22 | 				makeTriple("example","nsubj","This"),
23 | 				makeTriple("example","cop","is"),
24 | 				makeTriple("example","det","an"));
25 | 
26 | 		assertEquals(expectedEdges, Edges.generateEdges(sentence));
27 | 	}
28 | 	
29 | 	/** A two-sentence phrase must contain at least these edges. */
30 | 	@Test
31 | 	public void testExtraLinks() {
32 | 		final Phrase passage = new Phrase("Donald Duck is a cool cartoon character. "
33 | 				+ "He sounds really funny.");
34 | 		System.out.println(Edges.generateEdges(passage));
35 | 		
36 | 		final boolean hasAllExpected = Edges.generateEdges(passage).containsAll(asList(
37 | 				makeTriple("Donald Duck","_isa","cartoon character"),
38 | 				makeTriple("Donald Duck","_gender","MALE"),
39 | 				makeTriple("Donald Duck","_animate","ANIMATE"),
40 | 				makeTriple("Donald Duck","_number","SINGULAR"),
41 | 				makeTriple("sound","nsubj","Donald Duck")
42 | 				));
43 | 		assertTrue(hasAllExpected);
44 | 	}
45 | 
46 | }
47 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/GloveAnswerQuestionContext.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | import java.util.stream.Collectors;
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Phrase;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | import edu.uncc.cs.watsonsim.nlp.DenseVectors;
 9 | 
10 | /**
11 |  * Scores the overall context similarity of an answer and its question.
12 |  * Each side is reduced to the mean of the vectors of its tokens, and the
13 |  * two means are compared (a crude approach that suits short questions and
14 |  * answers best).
15 |  */
16 | public class GloveAnswerQuestionContext extends AnswerScorer {
17 | 	
18 | 	/**
19 | 	 * @param q		Question whose simple tokens are looked up
20 | 	 * @param a		Answer whose simple tokens are looked up
21 | 	 * @return	DenseVectors.sim of the answer's mean vector and the
22 | 	 * 			question's mean vector; tokens without a vector are skipped.
23 | 	 */
24 | 	@Override
25 | 	public double scoreAnswer(Question q, Answer a) {
26 | 		List<float[]> questionVectors = q.memo(Phrase.simpleTokens)
27 | 				.stream()
28 | 				.map(DenseVectors::vectorFor)
29 | 				.filter(java.util.Optional::isPresent)
30 | 				.map(java.util.Optional::get)
31 | 				.collect(Collectors.toList());
32 | 		List<float[]> answerVectors = a.memo(Phrase.simpleTokens)
33 | 				.stream()
34 | 				.map(DenseVectors::vectorFor)
35 | 				.filter(java.util.Optional::isPresent)
36 | 				.map(java.util.Optional::get)
37 | 				.collect(Collectors.toList());
38 | 		
39 | 		return DenseVectors.sim(
40 | 				DenseVectors.mean(answerVectors),
41 | 				DenseVectors.mean(questionVectors));
42 | 	}
43 | 	
44 | }
45 | 


--------------------------------------------------------------------------------
/scripts/gensim/import_glove.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # Loads GloVe data into a VStore
 3 | import sys
 4 | import argparse
 5 | import numpy
 6 | from vstore import VStore
 7 | 
 8 | parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB")
 9 | parser.add_argument("--name", action="store", type=str, default="glove",
10 | 	help="name of the database into which to load the vectors")
11 | parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb",
12 | 	help="shared database filename")
13 | parser.add_argument("--merge", action="store_true",
14 | 	help="merge the new dataset, rather than replacing")
15 | parser.add_argument("source", type=file,
16 | 	help="uncompressed GloVe dataset")
17 | args = parser.parse_args()
18 | 
19 | # Invert control in order to use one transaction
20 | table = VStore(args.dbfile, args.name)
21 | table.drop()
22 | def loader():
23 | 	for loaded, line in enumerate(args.source):
24 | 		line = line.split()
25 | 		name = line.pop(0)
26 | 		## Tokenization errors can cause a word to be too long for lmdb
27 | 		if len(name) > 100:
28 | 			continue
29 | 		if loaded % 10000 == 0:
30 | 			print "Loaded {} rows".format(loaded)
31 | 
32 | 		yield name, numpy.array(line, dtype=numpy.float32)
33 | table.load(loader())
34 | print "Finished loading"
35 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/SkipBigram.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.HashSet;
 4 | import java.util.List;
 5 | import java.util.Set;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | import edu.uncc.cs.watsonsim.StringUtils;
11 | 
12 | /**
13 |  * @author Sean Gallagher
14 |  *
15 |  */
16 | 
17 | public class SkipBigram extends PassageScorer {
18 | 	
19 | 	/**
20 | 	 * Count the bigrams and skip-bigrams that the answer and passage share.
21 | 	 * 
22 | 	 * @param q	not used
23 | 	 * @param a	candidate answer; its text is tokenized
24 | 	 * @param p	passage; its tokens are used
25 | 	 * @return	the number of distinct (skip-)bigrams present in both
26 | 	 */
27 | 	@Override
28 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
29 | 		
30 | 		// Jane Austen
31 | 		Set<String> a_set = generateBigrams(StringUtils.tokenize(a.text));
32 | 		
33 | 		// Romantic novelist Jane Austen once wrote -the- book Emma.
34 | 		Set<String> p_set = generateBigrams(p.getTokens());
35 | 		
36 | 		a_set.retainAll(p_set);
37 | 		
38 | 		return a_set.size();
39 | 	}
40 | 	
41 | 	/**
42 | 	 * Collect every pair of adjacent terms and every pair of terms that are
43 | 	 * one word apart. The two terms are joined with a space so that
44 | 	 * different pairs cannot run together into the same string.
45 | 	 */
46 | 	private Set<String> generateBigrams(List<String> terms) {
47 | 		Set<String> bigrams = new HashSet<>();
48 | 		for (int ti=0; ti<terms.size()-1; ti++) {
49 | 			// First the bigram
50 | 			bigrams.add(terms.get(ti) + " " + terms.get(ti+1));
51 | 			if (ti < terms.size()-2) {
52 | 				// Then the skip bigram, if we are more than one word from
53 | 				// the end: pair this term with the one after the next.
54 | 				bigrams.add(terms.get(ti) + " " + terms.get(ti+2));
55 | 			}
56 | 		}
57 | 		return bigrams;
58 | 	}
59 | 
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PercentWordsInCommon.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Passage;
 5 | import edu.uncc.cs.watsonsim.Phrase;
 6 | 
 7 | //Score is Percent of words in common / the average distance between the words
 8 | public class PercentWordsInCommon extends PassageScorer {
 9 | 	
10 | 	/**
11 | 	 * @param q	phrase whose text is split on single spaces
12 | 	 * @param a	not used
13 | 	 * @param p	passage whose text is searched for each question word
14 | 	 * @return	(matches / question length) / (span of matches / matches);
15 | 	 * 			just the first factor when only one word matches, and 0 when
16 | 	 * 			nothing matches or the question text is empty
17 | 	 */
18 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
19 | 		String[] questionTextArray = q.text.split(" ");
20 | 		String passageText = p.text;
21 | 		int firstMatch = -1;
22 | 		int lastMatch = -1;
23 | 		int count = 0;
24 | 		for (int i = 0; i < questionTextArray.length; i++)
25 | 		{
26 | 			if (passageText.contains(questionTextArray[i]))
27 | 			{
28 | 				if (firstMatch == -1)
29 | 					firstMatch = i;
30 | 				lastMatch = i;
31 | 				count += 1;
32 | 			}
33 | 		}
34 | 		// Nothing matched, or there is no question text to divide by
35 | 		if (count == 0 || q.text.isEmpty())
36 | 			return 0;
37 | 		
38 | 		// NOTE(review): this divides by the number of characters in the
39 | 		// question, not the number of words -- confirm that is intended.
40 | 		double fraction = count / ((double)q.text.length());
41 | 		
42 | 		// The gaps between successive matches add up to last - first.
43 | 		int distanceSum = lastMatch - firstMatch;
44 | 		// A single match has no gaps; dividing by a zero mean distance would
45 | 		// give an infinite score, so leave the fraction unscaled instead.
46 | 		if (distanceSum == 0)
47 | 			return fraction;
48 | 		return fraction/((double)distanceSum/count);
49 | 	}
50 | 
51 | }
52 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/PassageRetrieval.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.List;
 4 | import java.util.regex.Matcher;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Environment;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Question;
10 | import edu.uncc.cs.watsonsim.search.*;
11 | 
12 | /**
13 |  * Search for documents having relevance to both the question and a candidate
14 |  * answer.
15 |  */
16 | public class PassageRetrieval extends Researcher {
17 | 	private final Searcher[] searchers;
18 | 	
19 | 	public PassageRetrieval(Environment env, Searcher... searchers) {
20 | 		this.searchers = searchers;
21 | 	}
22 | 	
23 | 	
24 | 	@Override
25 | 	public List<Answer> question(Question q, List<Answer> answers) {
26 | 		
27 | 		int total_passages = answers.stream().mapToInt(a -> {
28 | 			// Query every engine
29 | 			int count = 0;
30 | 	    	for (Searcher s : searchers) {
31 | 	    		List<Passage> passages = s.query(
32 | 	    				q.text + " " + Matcher.quoteReplacement(a.text));
33 | 	    		a.passages.addAll(passages);
34 | 	    		count += passages.size();
35 | 	    	}
36 | 	    	return count;
37 | 		}).sum();
38 | 	    	
39 | 		
40 | 		q.log.info("Found " + total_passages + " supporting passages.");
41 | 		return answers;
42 | 	}
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/WPPageViews.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.sql.ResultSet;
 4 | import java.sql.SQLException;
 5 | import java.util.HashMap;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Environment;
 9 | import edu.uncc.cs.watsonsim.Question;
10 | import edu.uncc.cs.watsonsim.nlp.ApproxStringIntMap;
11 | 
12 | public class WPPageViews extends AnswerScorer {
13 | 	private static ApproxStringIntMap pageviews = new ApproxStringIntMap(null);
14 | 	
15 | 	public WPPageViews(Environment env) {
16 | 		load(env);
17 | 	}
18 | 	
19 | 	private static synchronized void load(Environment env) {
20 | 		if (pageviews.isEmpty()) {
21 | 			int collisions = 0;
22 | 			try {
23 | 				ResultSet res = env.db.prep(
24 | 						"SELECT title, page_views FROM page_views;")
25 | 						.executeQuery();
26 | 				while (res.next()) {
27 | 					collisions += pageviews.containsKey(res.getString(1).toLowerCase()) ? 1 : 0;
28 | 					pageviews.put(res.getString(1).toLowerCase(), res.getInt(2));
29 | 				}
30 | 			} catch (SQLException e) {
31 | 				// at worst give 0s
32 | 				e.printStackTrace();
33 | 			}
34 | 			System.out.println("Loaded view data about " + pageviews.size() + " pages "
35 | 					+ "(" + collisions + " collisions)");
36 | 		}
37 | 	}
38 | 
39 | 	@Override
40 | 	public double scoreAnswer(Question q, Answer a) {
41 | 		return pageviews.get(a.toString().toLowerCase());
42 | 	}
43 | 	
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/GloveAnswerQuestionContextTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Before;
 6 | import org.junit.Test;
 7 | 
 8 | import edu.uncc.cs.watsonsim.Answer;
 9 | import edu.uncc.cs.watsonsim.Question;
10 | 
11 | /**
12 |  * Checks GloveAnswerQuestionContext scores for single words, for a full
13 |  * question, and for junk or empty input.
14 |  */
15 | public class GloveAnswerQuestionContextTest {
16 | 
17 | 	@Before
18 | 	public void setUp() throws Exception {
19 | 	}
20 | 
21 | 	@Test
22 | 	public void testScoreAnswer() {
23 | 		GloveAnswerQuestionContext scorer = new GloveAnswerQuestionContext();
24 | 		// assertEquals takes (expected, actual, delta); keeping that order
25 | 		// makes the failure message report the right value as "expected".
26 | 		assertEquals(0.73, scorer.scoreAnswer(new Question("frog"), new Answer("toad")), 0.01);
27 | 		assertEquals(0.23, scorer.scoreAnswer(new Question("frog"), new Answer("maple")), 0.01);
28 | 		assertEquals(0.26, scorer.scoreAnswer(
29 | 				new Question("Who was Marilyn Monroe's second husband?"),
30 | 				new Answer("Joe Dimaggio")), 0.01);
31 | 		// Junk tokens in the question are expected to leave the score unchanged
32 | 		assertEquals(0.26, scorer.scoreAnswer(
33 | 				new Question("Who was Marilyn Monroe's ^&^*()(*&$%^% 7868769987 jhgkjhgbnvbnuyr second husband?"),
34 | 				new Answer("Joe Dimaggio")), 0.01);
35 | 		assertEquals(0.71, scorer.scoreAnswer(
36 | 				new Question("Who was Marilyn Monroe's second husband?"),
37 | 				new Answer("husband")), 0.01);
38 | 		// All-junk and empty input are expected to score 0
39 | 		assertEquals(0.0, scorer.scoreAnswer(
40 | 				new Question("34986 **(&)(*& uiuytiuytiuyti"),
41 | 				new Answer("iuyoiuyoiuyhjjkhg")), 0.01);
42 | 		assertEquals(0.0, scorer.scoreAnswer(
43 | 				new Question("democracy"),
44 | 				new Answer("")), 0.01);
45 | 	}
46 | 
47 | }
48 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/TypeDetectionTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.ArrayList;
 6 | import java.util.List;
 7 | 
 8 | import org.junit.Test;
 9 | 
10 | import edu.uncc.cs.watsonsim.Environment;
11 | import edu.uncc.cs.watsonsim.nlp.DBPediaCandidateType;
12 | 
13 | public class TypeDetectionTest {
14 | 
15 | 	/**
16 | 	 * Check to see if the types received for a given input are sane.
17 | 	 * 
18 | 	 * This is not stubbed because this is only a client wrapper; there would
19 | 	 * be nothing left after stubbing. So expect it to fail if you do not have
20 | 	 * the DBPedia database setup.
21 | 	 */
22 | 	@Test
23 | 	public void test() {
24 | 		testHasAll("New York", new String[]{"city", "municipality", "place"});
25 | 		testHasAll("tab", new String[]{"beverage", "food"});
26 | 	}
27 | 
28 | 
29 | 	/**
30 | 	 * Assert that every target is among the types found for the source.
31 | 	 * If the lookup throws a RuntimeException, the problem is reported on
32 | 	 * stderr and the check is skipped rather than failed.
33 | 	 * 
34 | 	 * @param source	text whose types are looked up
35 | 	 * @param targets	types that must all be present
36 | 	 */
37 | 	public void testHasAll(String source, String[] targets) {
38 | 		List<String> types = new ArrayList<>();
39 | 		try {
40 | 			Environment env = new Environment();
41 | 			types = new DBPediaCandidateType(env).viaDBPedia(source);
42 | 		} catch (RuntimeException e) {
43 | 			// If this goes wrong, it probably just means we are disconnected
44 | 			System.err.println("Failed to connect to SPARQL endpoint for answer "
45 | 					+ "type detection. Perhaps you are disconnected?");
46 | 			System.err.println(e.getMessage());
47 | 			// Printing getStackTrace() only shows the array's identity;
48 | 			// printStackTrace() writes the actual frames to stderr.
49 | 			e.printStackTrace();
50 | 			return;
51 | 		}
52 | 		
53 | 		for (String target : targets)
54 | 			assertTrue(types.contains(target));
55 | 	}
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/CommonConstituents.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.HashSet;
 4 | 
 5 | import edu.stanford.nlp.trees.Tree;
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Passage;
 8 | import edu.uncc.cs.watsonsim.Phrase;
 9 | 
10 | /* @author Wlodek
11 |  * @author Sean Gallagher
12 |  * 
13 |  * Create a score based on how many parse trees the question, candidate answer
14 |  * and passage have in common.
15 |  * 
16 |  * This scorer can be very slow.
17 |  */
18 | 
19 | public class CommonConstituents extends PassageScorer {
20 | 	/**
21 | 	 * Score the similarity of two sentences according to
22 | 	 * sum([ len(x) | x of X, y of Y, if x == y ])
23 | 	 * where X and Y are the sets of subtrees of the parses of s1 and s2.  
24 | 	 * @param x
25 | 	 * @param y
26 | 	 * @return
27 | 	 */
28 | 	public static double getCommonSubtreeCount(Phrase t1, Phrase t2) {
29 | 		
30 | 		HashSet<String> t1_subtrees = new HashSet<>();
31 | 		HashSet<String> t2_subtrees = new HashSet<>();
32 | 		for (Tree x : t1.getTrees()) t1_subtrees.add(x.toString());
33 | 		for (Tree y : t2.getTrees()) t2_subtrees.add(y.toString());
34 | 		t1_subtrees.retainAll(t2_subtrees);
35 | 		
36 | 		// x.getLeaves().size() may also be a good idea.
37 | 		// I don't have any intuition for which may be better.
38 | 		return t1_subtrees.size();
39 | 	}
40 | 		
41 | 
42 | 	/** Generate a simple score based on scorePhrases.
43 | 	 * 
44 | 	 */
45 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
46 | 		return getCommonSubtreeCount(p, new Phrase(a.text));
47 | 	}
48 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/scripts/import_trec.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # Before you start, run:
 3 | # -> pip install clint
 4 | import argparse
 5 | import sqlite3
 6 | from clint.textui import progress
 7 | import multiprocessing
 8 | from multiprocessing.pool import Pool
 9 | from lxml.etree import HTML
10 | parser = argparse.ArgumentParser(description="Import TREC data into sqlite3")
11 | parser.add_argument("-t", "--table", default="documents", help="SQL table to dump into")
12 | parser.add_argument("db", help="SQLite database")
13 | parser.add_argument("source", help="Source tag [e.g. wikipedia,wikiquotes,shakespeare ...]")
14 | parser.add_argument("trec", nargs="+", help="Input TREC files")
15 | args = parser.parse_args()
16 | 
17 | db = sqlite3.connect(args.db)
18 | db.executescript("""
19 |   pragma journal_mode = WAL;
20 |   pragma synchronous = OFF;""")
21 | 
22 | 
23 | for i, fname in progress.bar(enumerate(args.trec), "Importing TREC data..", 50, expected_size=len(args.trec)):
24 |   with open(fname) as f:
25 |     b = HTML(f.read()).findall("*doc")
26 |     entries = [
27 |       [d.findtext("docno"), d.findtext("title"), d.findtext("text")]
28 |       for d in b]
29 | 
30 |   db.executemany("insert or replace into %s (docno, title, text, source) values (?,?,?,'%s');" %(args.table, args.source), entries)
31 |   if not (i % 250):
32 |       db.execute("insert into search_{table}(search_{table}) values ('merge=200,8');".format(table=args.table)) # Clean search trees a bit
33 |       db.commit()
34 | 
35 | # Clean the tree the last time. 
36 | #db.execute("insert into search_{table}(search_{table}) values ('optimize');".format(table=args.table))
37 | db.commit()
38 | db.close()
39 | 


--------------------------------------------------------------------------------
/scripts/convert_arff_to_leveldb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | # This is a crazy hack to convert Weka's arff into caffe's leveldb
 4 | import leveldb
 5 | import caffe_pb2
 6 | import struct
 7 | 
 8 | def read(filename):
 9 |     fl = open(filename)
10 |     label_index = 0
11 |     label_found = False
12 |     line = fl.readline()
13 |     while line:
14 |         if line.startswith("@data"):
15 |             break
16 |         elif not label_found and line.startswith("@attribute"):
17 |             if line.split()[1] == "CORRECT":
18 |                 label_found = True
19 |             else:
20 |                 label_index += 1
21 |     
22 |     for line in fl:
23 |         if line.strip():
24 |             l = [float(x.replace("?", "NaN")) for x in line.split(',')]
25 |             label = l.pop(label_index)
26 |             yield (l, label)
27 | 
def transform(prev):
    """Serialize (features, label) pairs as caffe Datum strings.

    Yields one serialized Datum per input pair. After the input is
    exhausted, a final Datum holding the per-column totals is written to
    "watson_mean.binaryproto" in the current directory.

    prev -- iterable of (features, label) pairs; each feature list must
            have exactly d.width entries for struct.pack to succeed
    """
    d = caffe_pb2.Datum()
    d.channels = 1
    d.height = 1
    d.width = 2064
    # Pack format derived from the width so the two cannot drift apart.
    pack_format = "%dd" % d.width
    totals = [0] * d.width
    for entry, label in prev:
        totals = [t+e for t, e in zip(totals, entry)]
        d.data = struct.pack(pack_format, *entry)
        d.label = label
        yield d.SerializeToString()
        
    # NOTE(review): despite the file name, these are column sums, not means;
    # confirm what the consumer of this file expects.
    d.data = struct.pack(pack_format, *totals)
    # Serialized protobuf is binary data: write it in binary mode and close
    # the handle deterministically.
    with open("watson_mean.binaryproto", "wb") as mean_file:
        mean_file.write(d.SerializeToString())
42 |     
43 | 
def write(filename, prev):
    """Store each entry of prev in a new LevelDB database.

    Keys are the zero-based position of the entry, as a decimal string
    left-padded with zeros to at least five characters. The database is
    created if missing and opening fails if it already exists.
    """
    store = leveldb.LevelDB(filename=filename, create_if_missing=True, error_if_exists=True)
    position = 0
    for record in prev:
        store.Put(str(position).zfill(5), record)
        position += 1
48 | 
49 | 
if __name__ == "__main__":
    # Usage: convert_arff_to_leveldb.py <input arff file> <output leveldb path>
    # argv[1] is read as the ARFF input; argv[2] is the LevelDB to create.
    import sys
    write(sys.argv[2], transform(read(sys.argv[1])))


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/DateMatchesTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | import edu.uncc.cs.watsonsim.scorers.DateMatches;
 8 | 
 9 | public class DateMatchesTest {
10 | 
11 | 	@Test
12 | 	public void test() {
13 | 		assertTrue(DateMatches.maybeYear("2005"));
14 | 		assertTrue(DateMatches.maybeYear("05"));
15 | 		assertFalse(DateMatches.maybeYear("-12"));
16 | 		assertFalse(DateMatches.maybeYear("Fall"));
17 | 		
18 | 		assertTrue(DateMatches.maybeMonth("March"));
19 | 		assertTrue(DateMatches.maybeMonth("Mar"));
20 | 		assertTrue(DateMatches.maybeMonth("03"));
21 | 		assertTrue(DateMatches.maybeMonth("3"));
22 | 		
23 | 		assertTrue(DateMatches.maybeDay("2"));
24 | 		assertTrue(DateMatches.maybeDay("12"));
25 | 		assertFalse(DateMatches.maybeDay("123"));
26 | 		
27 | 		assertTrue(DateMatches.maybeDate("04/05/1992"));
28 | 		assertTrue(DateMatches.maybeDate("04-05-1992"));
29 | 		assertTrue(DateMatches.maybeDate("04 05 1992"));
30 | 		assertTrue(DateMatches.maybeDate("05 1992"));
31 | 		assertTrue(DateMatches.maybeDate("05-1992"));
32 | 		assertTrue(DateMatches.maybeDate("05/1992"));
33 | 		assertTrue(DateMatches.maybeDate("May 1992"));
34 | 		assertTrue(DateMatches.maybeDate("04 May"));
35 | 		assertTrue(DateMatches.maybeDate("May 04"));
36 | 		assertTrue(DateMatches.maybeDate("May 4, 1992"));
37 | 		assertTrue(DateMatches.maybeDate("1992, 04 May"));
38 | 		assertFalse(DateMatches.maybeDate("99181919728"));
39 | 		assertFalse(DateMatches.maybeDate("1010 1010 0101 0001"));
40 | 		assertFalse(DateMatches.maybeDate("Mayday Mayday"));
41 | 		assertTrue(DateMatches.maybeDate("12 June 19283")); // still 12 June
42 | 	}
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/NGram.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @author Varsha Devadas
 3 |  */
 4 | 
 5 | package edu.uncc.cs.watsonsim.scorers;
 6 | 
 7 | import java.util.ArrayList;
 8 | import java.util.List;
 9 | 
10 | import edu.uncc.cs.watsonsim.Answer;
11 | import edu.uncc.cs.watsonsim.Passage;
12 | import edu.uncc.cs.watsonsim.Phrase;
13 | import edu.uncc.cs.watsonsim.StringUtils;
14 | 
15 | public class NGram extends PassageScorer {
16 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
17 | 		// Jane Austen
18 | 		List<String> a_set = generateNgrams(3, StringUtils.tokenize(a.text));
19 | 		
20 | 		// Romantic novelist Jane Austen once wrote -the- book Emma.
21 | 		List<String> p_set = generateNgrams(3, p.getTokens());
22 | 		
23 | 		a_set.retainAll(p_set);
24 | 		return a_set.size();
25 | 			
26 | 	}
27 | 	public static List<String> generateNgrams(int n, List<String> words) {
28 |         List<String> ngrams = new ArrayList<String>();
29 |         for (int i = 0; i < words.size() - n + 1; i++)
30 |             ngrams.add(concat(words, i, i+n));
31 |         return ngrams;
32 | 	}
33 | 	
34 | 	public static String concat(List<String> words, int start, int end) {
35 |         StringBuilder sb = new StringBuilder();
36 |         for (int i = start; i < end; i++)
37 |             sb.append((i > start ? " " : "") + words.get(i));
38 |         return sb.toString();
39 |     }
40 | 	
41 | 	/*public static void main(String[] args) {
42 | 		
43 |     		Question question = Pipeline.ask("Who wrote Emma?");
44 |     		Answer r = question.get(0);
45 |     		NGram ngram = new NGram();
46 |     		
47 | 	        double result = ngram.scorePassage(question, r, r.passages.get(0));
48 | 	        
49 | 	        System.out.println(result);
50 |     	}*/
51 | 		
52 | 	
53 | 	
54 | }
55 | 
56 | 
57 | 	
58 | 
59 | 


--------------------------------------------------------------------------------
/scripts/gensim/vstore.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #  Provides a nice wrapper for GloVe data and other already-processed vectors
 3 | import lmdb
 4 | import numpy
 5 | numpy.set_printoptions(threshold=20)
 6 | 
 7 | class VStore(object):
 8 | 	_allenvs = {}
 9 | 	def __init__(self, filename, name):
10 | 		''' Create a lmdb-backed VStore using a cached environment '''
11 | 		if filename not in self._allenvs:
12 | 			self._allenvs[filename] = lmdb.Environment(filename,
13 | 				map_size=100<<30,
14 | 				max_dbs=100)
15 | 		self._env = self._allenvs[filename]
16 | 		self._db = self._env.open_db(name);
17 | 
18 | 	def _txn(self, write=False):
19 | 		''' Convenience method for making a transaction '''
20 | 		return self._env.begin(self._db, write=write)
21 | 
22 | 	def get(self, name, default=None):
23 | 		''' Get a vector by name '''
24 | 		with self._txn() as txn:
25 | 			r = txn.get(name)
26 | 			if r is None:
27 | 				return default
28 | 			else:
29 | 				return numpy.frombuffer(r, dtype=numpy.float32)
30 | 
31 | 	def drop(self):
32 | 		'''Drop everything in a database'''
33 | 		with self._txn(write=True) as txn:
34 | 			txn.drop(self._db, delete=False)
35 | 
36 | 	def put(self, name, value):
37 | 		''' Put a vector into the entry for name '''
38 | 		with self._txn(write=True) as txn:
39 | 			txn.put(name, numpy.getbuffer(value))
40 | 
41 | 	def read(self):
42 | 		''' Get all the vectors from the database '''
43 | 		with self._txn(write=False) as txn:
44 | 			for key, value in txn.cursor():
45 | 				yield (key, numpy.frombuffer(value, dtype=numpy.float32))
46 | 
47 | 	def load(self, gen):
48 | 		''' Put() into the database many (name, vector) pairs '''
49 | 		with self._txn(write=True) as txn:
50 | 			try:
51 | 				for name, value in gen:
52 | 					txn.put(name, numpy.getbuffer(value))
53 | 			except lmdb.BadValsizeError as e:
54 | 				print name, value.shape, value
55 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/DateMatches.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.regex.Matcher;
 4 | import java.util.regex.Pattern;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | import edu.uncc.cs.watsonsim.nlp.ClueType;
 9 | 
10 | /**
11 |  * Check if: the question needs a date, and the answer is one 
12 |  * @author Sean
13 |  */
14 | public class DateMatches extends AnswerScorer {
15 | 	public static boolean maybeMonth(String in) {
16 | 		return in.matches("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\.?\\w*"
17 | 				+ "|\\d{1,2}");
18 | 	}
19 | 	
20 | 	public static boolean maybeYear(String in) {
21 | 		return in.matches("\\d{2,4}");
22 | 	}
23 | 	
24 | 	public static boolean maybeDay(String in) {
25 | 		return in.matches("\\d{1,2}[a-z]*");
26 | 	}
27 | 	
28 | 	public static boolean maybeDate(String in) {
29 | 		boolean years=false, months=false, days=false;
30 | 		Matcher m = Pattern
31 | 				.compile("([^- ,/]+)[- ,/]+([^- ,/]+)([- ,/]+[^- ,/]+)?")
32 | 				.matcher(in);
33 | 		if (m.find()) {
34 | 			for (int group=0; group<m.groupCount(); group++) {
35 | 				years |= maybeYear(m.group(group));
36 | 				months |= maybeMonth(m.group(group));
37 | 				days |= maybeDay(m.group(group));
38 | 			}
39 | 		}
40 | 		return (months && days)  // year optional
41 | 				|| (years && months) // day optional
42 | 				|| years; // Only year. Strange, but TREC does this.
43 | 	}
44 | 	
45 | 	public double scoreAnswer(Question q, Answer a) {	
46 | 		boolean matches = false;
47 | 		switch (q.memo(ClueType::fromClue).toLowerCase()) {
48 | 		case "year": matches = maybeYear(a.text);
49 | 		case "date":
50 | 		case "day": matches = maybeDate(a.text);
51 | 		}
52 | 
53 | 		if (matches) a.log(this, "The date matches the question's format.");
54 | 
55 | 		return matches ? 1 : 0;
56 | 	}
57 | 	
58 | }
59 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/Redirects.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import java.sql.ResultSet;
 4 | import java.sql.SQLException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | import java.util.Optional;
 8 | 
 9 | import edu.uncc.cs.watsonsim.Environment;
10 | 
11 | public class Redirects {
12 | 	
13 | 	// Here's a cute trick: we don't know the redirect sources or targets, but
14 | 	// we can refer the target to itself and then match on whether the sources
15 | 	// and targets refer (as targets) to the same hash
16 | 	private static ApproxStringIntMap redirects = new ApproxStringIntMap(null);
17 | 
18 | 	public Redirects(Environment env) {
19 | 		load(env);
20 | 	}
21 | 	
22 | 	private static synchronized void load(Environment env) {
23 | 		if (redirects.isEmpty()) {
24 | 			int collisions = 0;
25 | 			try {
26 | 				ResultSet rs = env.db.prep(
27 | 						"SELECT source, target FROM wiki_redirects;"
28 | 						).executeQuery();
29 | 				/* There is a trick here. Ordering by target means we don't
30 | 				 * need to check for the ID of the target each time, if it is
31 | 				 * different, then it will be the next in order.
32 | 				 */
33 | 				while (rs.next()) {
34 | 					collisions += redirects.containsKey(rs.getString(1)) ? 1 : 0;
35 | 					redirects.put(rs.getString(1), rs.getString(2).hashCode());
36 | 					redirects.put(rs.getString(2), rs.getString(2).hashCode());
37 | 				}
38 | 			} catch (SQLException e) {
39 | 				// Leave the table blank and give 0's
40 | 				e.printStackTrace();
41 | 			}
42 | 			redirects.put("mammalia", "mammal".hashCode());
43 | 			System.out.println("Loaded " + redirects.size() + " redirects "
44 | 					+ "(" + collisions + " collisions)");
45 | 		}
46 | 	}
47 | 
48 | 	public boolean matches(String a, String b) {
49 | 		int a_redir = redirects.get(a);
50 | 		int b_redir = redirects.get(b);
51 | 		return a_redir != 0 && a_redir == b_redir;
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/index/Indri.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.index;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Collections;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Passage;
 7 | import lemurproject.indri.IndexEnvironment;
 8 | 
 9 | public class Indri implements Segment {
10 | 	private final IndexEnvironment index;
11 | 	public Indri(String path) {
12 | 		// Only initialize the query environment and index once
13 | 		index = new IndexEnvironment();
14 | 		
15 | 		/* Setup Indri */
16 | 		try {
17 | 			// open means to append
18 | 			// create means to replace
19 | 			// TODO: ask the user
20 | 			index.create(path);
21 | 			//index.open(path);
22 | 			index.setMemory(1<<30);
23 | 			index.setIndexedFields(new String[]{"text"});
24 | 			index.setStoreDocs(false);
25 | 		} catch (Exception e) {
26 | 			e.printStackTrace();
27 | 			throw new RuntimeException("Can't create Indri index."
28 | 					+ " Please check that you entered the right path in "
29 | 					+ "config.properties");
30 | 		}
31 | 	}
32 | 
33 |     @Override
34 |     public void accept(Passage p) {
35 |     	String trecdoc = "<DOC>\n<DOCNO>\n"
36 |     			+ p.reference
37 |     			+ "</DOCNO>\n<TEXT>\n"
38 |     			+ p.text
39 |     			+ "</TEXT>\n</DOC>\n";
40 |     	synchronized (index) {
41 |     		try {
42 | 				index.addString(trecdoc, "trectext", Collections.emptyMap());
43 | 			} catch (Exception e) {
44 | 				// Sadly, Indri throws everything and functions throw nothing
45 | 				// so we simply wrap what could be anything into a
46 | 				// stop-the-world runtime exception.
47 | 				e.printStackTrace();
48 | 				throw new RuntimeException(e);
49 | 			}
50 |     	}
51 |     }
52 |     
53 | 	@Override
54 | 	public void close() throws IOException {
55 | 		try {
56 | 			index.close();
57 | 		} catch (Exception e) {
58 | 			e.printStackTrace();
59 | 			// Cheat and say it's IO. It probably is anyway.
60 | 			throw new IOException(e);
61 | 		}
62 | 	}
63 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/Entropy.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.sql.ResultSet;
 4 | import java.sql.SQLException;
 5 | import java.util.HashMap;
 6 | import java.util.Map;
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Environment;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | import edu.uncc.cs.watsonsim.Question;
11 | import edu.uncc.cs.watsonsim.nlp.ApproxStringIntMap;
12 | import edu.uncc.cs.watsonsim.nlp.StringStack;
13 | 
14 | public class Entropy extends AnswerScorer {
15 | 	// This is a custom approach for about a 10-fold reduction in memory
16 | 	private static final double mult = 2<<20;
17 | 	private static ApproxStringIntMap dict = new ApproxStringIntMap(new StringStack());
18 | 	
19 | 	public Entropy(Environment env) {
20 | 		load(env);
21 | 	}
22 | 	
23 | 	private static synchronized void load(Environment env) {
24 | 		if (dict.isEmpty()) {
25 | 			int collisions = 0;
26 | 			try {
27 | 				ResultSet rs = env.db.prep("SELECT word, p FROM entropy;").executeQuery();
28 | 				while (rs.next()) {
29 | 					collisions += dict.containsKey(rs.getString(1)) ? 1 : 0;
30 | 					// This mult is to put enough of the double's precision in
31 | 					// the int. p is logarithmic so overflow is not a problem.
32 | 					dict.put(rs.getString(1), (int)(rs.getDouble(2)*mult));
33 | 				}
34 | 			} catch (SQLException e) {
35 | 				// Leave the table blank and give 0's
36 | 				e.printStackTrace();
37 | 			}
38 | 			System.out.println("Loaded " + dict.size() + " words' entropy "
39 | 					+ "(" + collisions + " collisions)");
40 | 		}
41 | 	}
42 | 	
43 | 	protected double entropy(Iterable<String> targets) {
44 | 		double ent = 0;
45 | 		for (String target: targets) {
46 | 			ent += dict.get(target) / mult;
47 | 		}
48 | 		return ent;
49 | 	}
50 | 
51 | 	@Override
52 | 	public double scoreAnswer(Question q, Answer a) {
53 | 		return entropy(a.memo(Phrase.tokens));
54 | 	}
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/LucenePassageSearcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.ArrayList;
 5 | import java.util.List;
 6 | import java.util.Collections;
 7 | 
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.search.IndexSearcher;
10 | import org.apache.lucene.search.ScoreDoc;
11 | import edu.uncc.cs.watsonsim.Environment;
12 | import edu.uncc.cs.watsonsim.Passage;
13 | import edu.uncc.cs.watsonsim.Score;
14 | import edu.uncc.cs.watsonsim.scorers.Merge;
15 | 
16 | /**
17 |  * @author Phani Rahul
18 |  */
19 | public class LucenePassageSearcher extends Searcher {
20 | 	private final IndexSearcher lucene;
21 | 	private final Environment env;
22 | 	
23 | 	public LucenePassageSearcher(Environment env) {
24 | 		super(env);
25 | 		this.lucene = env.lucene;
26 | 		this.env = env;
27 | 		Score.register("LUCENE_SCORE", -1, Merge.Mean);
28 | 		Score.register("LUCENE_RANK", -1, Merge.Mean);
29 | 	}
30 | 	
31 | 	public List<Passage> query(String question_text) {
32 | 		List<Passage> results = new ArrayList<>();
33 | 		try {
34 | 			ScoreDoc[] hits = env.simpleLuceneQuery(question_text, MAX_RESULTS);
35 | 			// This isn't range based because we need the rank
36 | 			for (int i=0; i < hits.length; i++) {
37 | 				Document doc = lucene.doc(hits[i].doc, Collections.singleton("docno"));
38 | 				results.add(new edu.uncc.cs.watsonsim.Passage(
39 | 						"lucene", 			// Engine
40 | 						"",	// Title
41 | 						"", // Text
42 | 						doc.get("docno"))   // Reference
43 | 						.score("LUCENE_RANK", (double) i)           // Rank
44 | 						.score("LUCENE_SCORE", (double) hits[i].score)	// Source
45 | 						);
46 | 			}
47 | 		} catch (IOException e) {
48 | 			System.out.println("Failed to query Lucene. Is the index in the correct location?");
49 | 			e.printStackTrace();
50 | 		}
51 | 		
52 | 		// Fill any missing full text from sources
53 | 		return fillFromSources(results);
54 | 	}
55 | 
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/MergeByText.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Environment;
 8 | import edu.uncc.cs.watsonsim.Question;
 9 | import edu.uncc.cs.watsonsim.nlp.Relatedness;
10 | 
11 | public class MergeByText extends Researcher {
12 | 	private final Relatedness syn;
13 | 	/**
14 | 	 * Create a new merger using shared environment resources.
15 | 	 * @param env
16 | 	 */
17 | 	public MergeByText(Environment env) {
18 | 		syn = new Relatedness(env);
19 | 	}
20 | 	
21 | 	@Override
22 | 	/** Call merge on any two answers with the same title */
23 | 	public List<Answer> question(Question q, List<Answer> answers) {
24 | 		List<List<Answer>> answer_blocks = new ArrayList<>();
25 | 		// Arrange the answers into blocks
26 | 		each_answer:
27 | 		for (Answer original : answers) {
28 | 			for (List<Answer> block : answer_blocks) {
29 | 				for (Answer example : block) {
30 | 					// Look through the examples in this topic
31 | 					// If it matches, choose to put it in this block and quit.
32 | 					if (syn.matchViaLevenshtein(original.text, example.text)) {
33 | 						block.add(original);
34 | 						continue each_answer;
35 | 					}
36 | 				}
37 | 			}
38 | 			
39 | 			// Make a new topic for this answer
40 | 			List<Answer> new_block = new ArrayList<>();
41 | 			new_block.add(original);
42 | 			answer_blocks.add(new_block);
43 | 		}
44 | 
45 | 		// Merge the blocks
46 | 		List<Answer> new_answers = new ArrayList<>();
47 | 		for (List<Answer> block : answer_blocks) {
48 | 			if (block.size() > 1) {
49 | 				new_answers.add(Answer.merge(block));
50 | 			} else {
51 | 				new_answers.add(block.get(0));
52 | 			}
53 | 		}
54 | 		
55 | 		log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by surface similarity).");
56 | 		return new_answers;
57 | 	}
58 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/AnswerTrimming.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | 
 9 | /**
10 |  * @author Suresh Appana
11 |  *
12 |  */
public class AnswerTrimming extends Researcher {
	/**
	 * Strip from each answer any run of its words that also appears in the
	 * question, and return the trimmed answers in the same order.
	 * 
	 * The answer is split on single spaces. Every contiguous run of words is
	 * tried, for each start position from the longest run to the shortest.
	 * A run found in the question (ignoring case) is removed from the answer
	 * text, which is then tidied and re-split before scanning continues.
	 */
	@Override
	public List<Answer> question(Question question, List<Answer> answers) {
        List<Answer> answers_updated = new ArrayList<>();
        for(Answer ans : answers) {
        	String text = ans.text;
        	//System.out.println(text);
        	String[] answer_array = ans.text.split(" ");
        	int answer_array_length = answer_array.length;
        	
        	
        	
        	// j is the first word of the run, i the last (inclusive).
        	for (int j = 0; j < answer_array_length; j++) {
				for (int i = answer_array_length - 1; i >= j; i--) {
					// Rebuild words j..i joined by single spaces.
					StringBuilder sb = new StringBuilder();
					for (int k = j; k <= i; k++) {
						// System.out.println("i=" + i + ", j=" + j + ", k");
						sb.append(answer_array[k]);
						if (k != i)
							sb.append(" ");
					}
					// The containment test ignores case, but the replace
					// below is exact; the run was built from the current
					// text, so it is present verbatim.
					if (sb.length() > 0 && question.text.toLowerCase().contains(sb.toString().toLowerCase())) {
						// Remove every occurrence of the run, collapse
						// repeated spaces, then drop one leading and one
						// trailing character that is not a letter, digit
						// or '|'.
						text = text
								.replace(sb.toString(), "")
								.trim()
								.replaceAll(" +", " ")
								.replaceAll("^([^a-z|A-Z|0-9])( )*", "")
								.replaceAll("()*([^a-z|A-Z|0-9])$", "")
								.trim();
						// Restart the scan over the shortened text.
						// NOTE(review): the loop's own i-- still runs after
						// this reset, so the next run tried at j=0 ends one
						// word short of the full text - confirm intended.
						answer_array = text.split(" ");
						answer_array_length = answer_array.length;
						i = answer_array_length - 1;
						j = 0;
					}
				}
			}
        	answers_updated.add( ans.withText(text));
        }
        
        //for(int i=0;i<answers.size();i++)
        //	System.out.println(answers.get(i).text+"//"+answers_updated.get(i).text);
        
        return answers_updated;
        //answers = late_researchers.pull(question, answers_updated); 
	}
}
59 | 


--------------------------------------------------------------------------------
/scripts/gensim/analogy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from vstore import VStore
 3 | from argparse import ArgumentParser
 4 | import numpy as np
 5 | import code
 6 | from sklearn.decomposition import PCA
 7 | 
 8 | class Analogy(object):
 9 | 	def __init__(self, model):
10 | 		self.model = model
11 | 		self.pca = None
12 | 	
13 | 	def w(self, word):
14 | 		'''Get the vector for a word - a short alias'''
15 | 		r = self.model.get(word)
16 | 		if r is None:
17 | 			print "Model {} has no vector for {}.".format(self.model, word)
18 | 		return r
19 | 	
20 | 	def pcaw(self, word):
21 | 		if self.pca is None: return None
22 | 		return self.pca.transform(self.w(word))
23 | 	
24 | 	def context(self, words, components=100):
25 | 		self.pca = PCA(components)
26 | 		self.pca.fit([self.w(x) for x in words])
27 | 		
28 | 	@staticmethod
29 | 	def sim(left, right):
30 | 		'''Compare two dense vectors using cosine similarity'''
31 | 		if left is not None and right is not None:
32 | 			return (
33 | 				np.sum(left*right) /
34 | 				(np.sqrt(np.sum(left**2)) * np.sqrt(np.sum(right**2)))
35 | 			)
36 | 		return None
37 | 
if __name__ == '__main__':
	# Open the requested model, then drop into an interactive prompt with
	# the Analogy helpers bound to short names.
	np.set_printoptions(threshold=20)

	parser = ArgumentParser(description="Perform simple algebra on words")
	parser.add_argument("--dbfile",
		default="vectors.lmdb",
		help="use this database file to get vectors")
	parser.add_argument("model", action="store",
 		help="compare using this model database (e.g. glove)")
	args = parser.parse_args()
	model = VStore(args.dbfile, args.model)

	analogy = Analogy(model)
	w = analogy.w
	pcaw = analogy.pcaw
	context = analogy.context
	sim = analogy.sim
	# One print of the joined banner, so no extra names leak into the
	# namespace handed to the prompt below.
	print("\n".join([
		"What follows is a python prompt.",
		"w('elicidate') --> vector for `elicidate`",
		"w('mogrify') + w('frobnicate') --> vector sum",
		"    same for -, *, /, **, etc as usual for numpy",
		"sim(w('republican'), w('democrat')) -> society in a 32bit float",
		"    (actually a simple cosine similarity)",
	]))
	code.interact(local=vars())
62 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/RedirectSynonyms.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.sql.PreparedStatement;
 4 | import java.sql.ResultSet;
 5 | import java.sql.SQLException;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | 
 9 | import org.apache.commons.lang3.StringEscapeUtils;
10 | import edu.uncc.cs.watsonsim.Answer;
11 | import edu.uncc.cs.watsonsim.Database;
12 | import edu.uncc.cs.watsonsim.Environment;
13 | import edu.uncc.cs.watsonsim.Question;
14 | import edu.uncc.cs.watsonsim.Score;
15 | import edu.uncc.cs.watsonsim.scorers.Merge;
16 | 
17 | /**
18 |  * Create a bunch of new answers with the same passages based on "synonyms"
19 |  * made from Wikipedia redirects.
20 |  * 
21 |  * @author Sean
22 |  */
23 | public class RedirectSynonyms extends Researcher {
24 | 	private final Database db;
25 | 	private final PreparedStatement s;
26 | 	
27 | 	public RedirectSynonyms(Environment env) {
28 | 		db = env.db;
29 | 		s = db.prep("SELECT source from wiki_redirects where target = ?;");
30 | 		Score.register("IS_WIKI_REDIRECT", 0.0, Merge.Min);
31 | 	}
32 | 
33 | 	@Override
34 | 	public List<Answer> question(Question q, List<Answer> answers) {
35 | 		// For logging 
36 | 		int synonym_count = 0;
37 | 		List<Answer> new_answers = new ArrayList<Answer>();
38 | 		for (Answer a : answers) {
39 | 			try {
40 | 				s.setString(1, a.text);
41 | 				ResultSet results = s.executeQuery();
42 | 				while (results.next()) {
43 | 					synonym_count++;
44 | 					Answer new_answer = new Answer(
45 | 							new ArrayList<>(a.passages),
46 | 							a.scores.clone(),
47 | 							StringEscapeUtils.unescapeXml(results.getString("source")));
48 | 					a.scores.put("IS_WIKI_REDIRECT", 1.0);
49 | 					new_answers.add(new_answer);
50 | 				}
51 | 			} catch (SQLException e) {
52 | 				// Just don't make any synonyms.
53 | 				return answers;
54 | 			}
55 | 		}
56 | 		
57 | 		log.info("Found " + synonym_count + " synonyms for " + answers.size() +
58 | 				" candidate answers using Wikipedia redirects.");
59 | 		return new_answers;
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/index/Lucene.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.index;
 2 | 
 3 | import java.io.IOException;
 4 | import java.nio.file.Path;
 5 | 
 6 | import org.apache.lucene.analysis.Analyzer;
 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.document.Field;
10 | import org.apache.lucene.document.StoredField;
11 | import org.apache.lucene.document.TextField;
12 | import org.apache.lucene.index.IndexWriter;
13 | import org.apache.lucene.index.IndexWriterConfig;
14 | import org.apache.lucene.search.similarities.BM25Similarity;
15 | import org.apache.lucene.store.Directory;
16 | import org.apache.lucene.store.FSDirectory;
17 | 
18 | import edu.uncc.cs.watsonsim.Passage;
19 | 
20 | public class Lucene implements Segment {
21 | 	private final IndexWriter index;
22 | 	public Lucene(Path path) throws IOException {
23 | 		/* Setup Lucene */
24 |         Directory dir = FSDirectory.open(path);
25 |         // here we are using a standard analyzer, there are a lot of analyzers available to our use.
26 |         Analyzer analyzer = new StandardAnalyzer();
27 |         IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
28 |         //this mode by default overwrites the previous index, not a very good option in real usage
29 |         iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
30 |         iwc.setSimilarity(new BM25Similarity());
31 |         index = new IndexWriter(dir, iwc);
32 | 	}
33 | 	
34 |     public void accept(Passage p){
35 | 		// Index with Lucene
36 |         Document doc = new Document();
37 |         doc.add(new TextField("title", p.title, Field.Store.NO));
38 |         doc.add(new TextField("text", p.text, Field.Store.YES));
39 |         doc.add(new StoredField("docno", p.reference));
40 |         try {
41 | 			index.addDocument(doc);
42 | 		} catch (IOException e) {
43 | 			// TODO Auto-generated catch block
44 | 			e.printStackTrace();
45 | 		}
46 |     }
47 | 
48 | 	@Override
49 | 	public void close() throws IOException {
50 | 		index.close();
51 | 	}
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/MergeByCommonSupport.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.HashSet;
 5 | import java.util.List;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Question;
10 | 
11 | public class MergeByCommonSupport extends Researcher {
12 | 	
13 | 	@Override
14 | 	/** Call merge on any two answers, where the answers have more passages in common than different*/
15 | 	public List<Answer> question(Question q, List<Answer> answers) {
16 | 		List<List<Answer>> answer_blocks = new ArrayList<>();
17 | 		each_answer:
18 | 		for (Answer original : answers) {
19 | 			HashSet<Passage> o_passages = new HashSet<>();
20 | 			o_passages.addAll(original.passages);
21 | 			
22 | 			for (List<Answer> block : answer_blocks) {
23 | 				for (Answer example : block) {
24 | 
25 | 					HashSet<Passage> e_passages = new HashSet<>();
26 | 					e_passages.addAll(example.passages);
27 | 					int example_cardinality = e_passages.size();
28 | 					e_passages.retainAll(o_passages);
29 | 					
30 | 					double percent_common = e_passages.size() /
31 | 							(example_cardinality + o_passages.size() - e_passages.size() + 0.01);
32 | 					
33 | 					if ( percent_common > 0.5 ) {
34 | 						// If the intersection > half the union, then merge the questions
35 | 						block.add(original);
36 | 						continue each_answer;
37 | 					}
38 | 				}
39 | 			}
40 | 			
41 | 			// Make a new topic for this answer
42 | 			List<Answer> new_block = new ArrayList<>();
43 | 			new_block.add(original);
44 | 			answer_blocks.add(new_block);
45 | 		}
46 | 
47 | 		// Merge the blocks
48 | 		List<Answer> new_answers = new ArrayList<>();
49 | 		for (List<Answer> block : answer_blocks) {
50 | 			if (block.size() > 1) {
51 | 				new_answers.add(Answer.merge(block));
52 | 			} else {
53 | 				new_answers.add(block.get(0));
54 | 			}
55 | 		}
56 | 		
57 | 		log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by common passages).");
58 | 		return new_answers;
59 | 	}
60 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/NamedEntityRecognizerScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.io.FileInputStream;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | import edu.uncc.cs.watsonsim.StringUtils;
11 | import opennlp.tools.namefind.NameFinderME;
12 | import opennlp.tools.namefind.TokenNameFinderModel;
13 | import opennlp.tools.tokenize.SimpleTokenizer;
14 | import opennlp.tools.util.Span;
15 | 
16 | /**
17 |  * This scorer will return the number of named entities matched in a given
18 |  * question
19 |  * 
20 |  * @author Jonathan Shuman
21 |  * 
22 |  */
23 | public class NamedEntityRecognizerScorer extends PassageScorer {
24 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
25 | 
26 | 		// Jane Austen
27 | 		String c_t = StringUtils.join(p.text, " ");
28 | 
29 | 		// Romantic novelist Jane Austen once wrote -the- book Emma.
30 | 		String q_t = q.text;
31 | 
32 | 		return numberOfNamedPersonEntities(q_t, c_t);
33 | 
34 | 	}
35 | 
36 | 	private double numberOfNamedPersonEntities(String q_t, String c_t) {
37 | 		InputStream modelIn = null;
38 | 		double retVal = 0;
39 | 		try {
40 | 			modelIn = new FileInputStream("data/en-ner-person.bin");
41 | 			TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
42 | 			NameFinderME nameFinder = new NameFinderME(model);
43 | 			String[] c_words = SimpleTokenizer.INSTANCE.tokenize(c_t);
44 | 			String[] q_words = SimpleTokenizer.INSTANCE.tokenize(q_t);
45 | 			Span[] c_tokens = nameFinder.find(c_words);
46 | 			
47 | 			for (Span cS : c_tokens) {
48 | 				for (String q_word : q_words)
49 | 					if ((c_words[cS.getStart()]).contains(q_word))
50 | 						retVal++;
51 | 			}
52 | 		} catch (IOException e) {
53 | 			e.printStackTrace();
54 | 			return Double.NaN;
55 | 		} finally {
56 | 			if (modelIn != null) {
57 | 				try {
58 | 					modelIn.close();
59 | 				} catch (IOException e) {
60 | 					return Double.NaN;
61 | 				}
62 | 
63 | 			}
64 | 		}
65 | 		return retVal;
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PassageTermMatch.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Passage;
 7 | import edu.uncc.cs.watsonsim.Phrase;
 8 | import edu.uncc.cs.watsonsim.StringUtils;
 9 | 
10 | /**
11 |  * The Passage Term match scorer is designed, simply, to count the number of times
12 |  * a term appears in the text.
13 |  * 
14 |  * "This assigns a score by
15 |  *	matching question terms to passage terms, regardless
16 |  *	of grammatical relationship or word order."
17 |  *
18 |  * It returns a number which is equal to the number of occurrences
19 |  * @author Jonathan Shuman
20 |  *
21 |  */
22 | public class PassageTermMatch extends PassageScorer { 
23 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
24 | 		
25 | 		// Jane Austen
26 | 		String c_t = StringUtils.join(p.text, " ");
27 | 		
28 | 		// Romantic novelist Jane Austen once wrote -the- book Emma.
29 | 		String q_t = q.text;
30 | 		
31 | 		return generateNumberTerms(q_t, c_t);
32 | 		
33 | 	}
34 | 	
35 | 	/**
36 | 	 * @param queryText The text of the query to search passages
37 | 	 * @param passageText The text of the passage
38 | 	 * @return Number of occurrences of words in query in the passage
39 | 	 */
40 | 	private int generateNumberTerms(String queryText, String passageText) {
41 | 		/*
42 | 		 * We will first separate the text of the query and passage into terms.
43 | 		 * Note: The parameters are assumed to have stopwords removed.
44 | 		 */
45 | 		List<String> qTerms = StringUtils.tokenize(queryText);
46 | 		List<String> pTerms = StringUtils.tokenize(passageText);
47 | 		
48 | 		// Join the passage back together with stop words removed. 
49 | 		// We will use the StringUtils function to remove the words.
50 | 		String passageStopsRemoved = StringUtils.join(pTerms, " ");
51 | 		
52 | 		int matches = 0;
53 | 		//Scan through each of the terms to get its number of occurances in the passage text.
54 | 		for (String term : qTerms) {
55 | 			// First the bigram
56 | 			matches += StringUtils.countMatches(passageStopsRemoved, term);
57 | 		}
58 | 		return matches;
59 | 	}
60 | 
61 | }
62 | 


--------------------------------------------------------------------------------
/src/main/resources/public/scripts/query.js:
--------------------------------------------------------------------------------
 1 | function write_log(text) {
 2 | 	$("#console").append($("<li>").text(text));
 3 | 	$("#console").animate({ scrollTop: $("#console").prop("scrollHeight") }, "slow");
 4 | }
 5 | 
 6 | 
 7 | angular.module('queryApp', [])
 8 |   .controller('QueryController', function($scope) {
 9 |     var queryDetail = this;
10 |     queryDetail.answers = [
11 | 		{text: "Some sample data", score: 0.8976,
12 | 			evidence: [{source: "moomoo", note: "this is an example"}, {source: "akjshkjd", note: "another example"}],
13 | 			scores: {ANSWER_RANK: 0.8172, ANSWER_SCORE: 0.8162, LAT_CHECK: 0.99, CORR: 0.1},
14 | 			passages: [{
15 | 				title: "moomoo",
16 | 				text: "this is an example",
17 | 				reference: "wp-full-8272-18"},
18 | 				{title: "akjshkjd",
19 | 				text: "another example",
20 | 				reference: "wp-full-8272-18"}]
21 | 		},
22 | 		{text: "Moo! bar bax", score: 0.4926, evidence: [{source: "moomoo", note: "this is an example"}]},
23 | 		{text: "Another example", score: 0.207, evidence: [{source: "moomoo", note: "this is an example"}]}
24 | 	];
25 | 	queryDetail.note = "Ask any natural language question to have it answered!";
26 | 	
27 | 	queryDetail.handle_message = function(event) {
28 | 		// Handle incoming messages
29 | 		console.log(event.data);
30 | 		var content = JSON.parse(event.data);
31 | 		switch (content.flag) { // flag
32 | 		case "log":
33 | 			write_log(content.message);
34 | 			break;
35 | 		case "result":
36 | 			queryDetail.answers = content.message;
37 | 			$("#console").slideUp();
38 | 			queryDetail.note = "";
39 | 			break;
40 | 		}
41 | 	};
42 | 	queryDetail.begin = function () {
43 | 		// Clean the screen
44 | 		$("#console li").remove();
45 | 		$("#console").slideDown();
46 | 		
47 | 		// Open a channel
48 | 		var query_channel = new WebSocket("ws://localhost:8887/asklive");
49 | 		query_channel.onopen = function (event) {
50 | 			// Ask the question
51 | 			query_channel.send("ask:" + $("#search [name=query]").val());
52 | 			write_log("Sending query...");
53 | 		};
54 | 		query_channel.onmessage = function(e) {
55 | 			queryDetail.handle_message(e);
56 | 			$scope.$apply();
57 | 		};
58 | 		//event.preventDefault();
59 | 	};
60 |   });
61 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/MeanDVSearchTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.Arrays;
 6 | import java.util.List;
 7 | 
 8 | import org.junit.Before;
 9 | import org.junit.Test;
10 | 
11 | import edu.uncc.cs.watsonsim.Environment;
12 | import edu.uncc.cs.watsonsim.Passage;
13 | import static org.fusesource.lmdbjni.Constants.*;
14 | 
15 | public class MeanDVSearchTest {
16 | 
17 | 	MeanDVSearch mds;
18 | 	@Before
19 | 	public void setUp() throws Exception {
20 | 	}
21 | 
22 | 	@Test
23 | 	public void test() {
24 | 		mds = new MeanDVSearch(new Environment());
25 | 		List<Passage> frogstuff = mds.query("frog");
26 | 		assertTrue(frogstuff.size() > 0);
27 | 		assertTrue(frogstuff.get(0).title.contains("frog"));
28 | 	}
29 | 	
30 | 	@Test
31 | 	public void testBubble() {
32 | 		double[] sims = new double[5];
33 | 		byte[][] names = new byte[5][];
34 | 		byte[] name_e = bytes("e");
35 | 		byte[] name_f = bytes("f");
36 | 		byte[] name_g = bytes("g");
37 | 		
38 | 		sims[0]=0.8; sims[1]=0.5; sims[2]=0.0; sims[3]=-1;
39 | 		names[0]=bytes("a"); names[1]=bytes("b"); names[2]=bytes("c"); names[3]=bytes("d");
40 | 		
41 | 		MeanDVSearch.bubble(sims, names, 0.9, name_e, 4);
42 | 		assertEquals(0.9, sims[0], 0.01);
43 | 		assertEquals(0.8, sims[1], 0.01);
44 | 		assertEquals(0.5, sims[2], 0.01);
45 | 		assertEquals(0.0, sims[3], 0.01);
46 | 		assertEquals(name_e, names[0]);
47 | 		//----------------------------------------------------------------
48 | 		
49 | 		MeanDVSearch.bubble(sims, names, 0.1, name_f, 4);
50 | 		assertEquals(0.9, sims[0], 0.01);
51 | 		assertEquals(0.8, sims[1], 0.01);
52 | 		assertEquals(0.5, sims[2], 0.01);
53 | 		assertEquals(0.1, sims[3], 0.01);
54 | 		assertEquals(name_f, names[3]);
55 | 		//----------------------------------------------------------------
56 | 		
57 | 		MeanDVSearch.bubble(sims, names, 0.5, name_g, 4);
58 | 		assertEquals(0.9, sims[0], 0.01);
59 | 		assertEquals(0.8, sims[1], 0.01);
60 | 		assertEquals(0.5, sims[2], 0.01);
61 | 		assertEquals(0.5, sims[3], 0.01);
62 | 		assertEquals(name_g, names[3]);
63 | 		//----------------------------------------------------------------
64 | 	}
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/Log.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import java.util.function.Consumer;
 4 | 
 5 | /**
 6 |  * Wrapper logger
 7 |  * 
 8 |  * Loggers already allow many modules to log to many places.
 9 |  * But we need each module to log to some (but not all) places. So basically,
10 |  * want to pass around a fancy many-to-many channel. 
11 |  * 
12 |  * @author Sean
13 |  *
14 |  */
15 | public class Log {
16 | 	private Consumer<String> listener;
17 | 	private final Log parent;
18 | 	private final Class<?> speaker;
19 | 	private final long start;
20 | 	
21 | 	private enum Level {ERROR, WARNING, INFO, DEBUG};
22 | 	
23 | 	public static final Log NIL = new Log(Object.class, x->{});
24 | 	
25 | 	// Start a root logger
26 | 	public Log(Object speaker, Consumer<String> listener) {
27 | 		this.parent = null;
28 | 		this.speaker = speaker.getClass();
29 | 		this.start = System.currentTimeMillis();
30 | 		this.listener = listener;
31 | 	}
32 | 	
33 | 	// Start a child logger
34 | 	private Log(Object speaker, Log parent) {
35 | 		this.parent = parent;
36 | 		this.speaker = speaker.getClass();
37 | 		this.start = parent.start;
38 | 	}
39 | 	
40 | 	/**
41 | 	 * Make a new writable subchannel.
42 | 	 */
43 | 	public Log kid(Class<?> speaker) {
44 | 		return new Log(speaker, this);
45 | 	}
46 | 	
47 | 	public void setListener(Consumer<String> listener) {
48 | 		this.listener = listener;
49 | 	}
50 | 	
51 | 	/**
52 | 	 * Push some notifications. Listeners may lose interest.
53 | 	 */
54 | 	private void push(String content, Level level) {
55 | 		if (listener != null) {
56 | 			listener.accept(String.format("%.2f [%s %s] %s",
57 | 					(System.currentTimeMillis()-start) / 1000.0,
58 | 					level.name(),
59 | 					speaker.getSimpleName(),
60 | 					content));
61 | 		} else if (parent != null) {
62 | 			parent.push(content, level);
63 | 		}
64 | 	}
65 | 	
66 | 	public void error(String message) {
67 | 		push(message, Level.ERROR);
68 | 	}
69 | 	
70 | 	public void warn(String message) {
71 | 		push(message, Level.WARNING);
72 | 	}
73 | 	
74 | 	public void info(String message) {
75 | 		push(message, Level.INFO);
76 | 	}
77 | 	
78 | 	public void debug(String message) {
79 | 		push(message, Level.DEBUG);
80 | 	}
81 | }
82 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerPOS.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.stanford.nlp.ling.IndexedWord;
 4 | import edu.stanford.nlp.semgraph.SemanticGraph;
 5 | import edu.stanford.nlp.semgraph.SemanticGraphEdge;
 6 | import edu.stanford.nlp.trees.GrammaticalRelation;
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Question;
 9 | 
10 | /**
11 |  * 
12 |  * @author Yeshvant
13 |  *
14 |  */
15 | public class AnswerPOS extends AnswerScorer {
16 | 
17 | 	public AnswerPOS() {
18 | 	}
19 | 
20 | 	public double scoreAnswer(Question q, Answer a) {
21 | 		
22 | 		for (SemanticGraph graph : a.getGraphs()) {
23 | 
24 | 			if(!graph.getRoots().isEmpty())
25 | 			{
26 | 			if (graph.getFirstRoot().tag().contains("NN")) {
27 | 				for (SemanticGraphEdge edge : graph.edgeIterable()) {
28 | 
29 | 					IndexedWord a1 = edge.getDependent();
30 | 					IndexedWord a2 = edge.getGovernor();
31 | 
32 | 					if (a1.tag().contains("NN")) {
33 | 						return 1.0;
34 | 					}
35 | 					if (a2.tag().contains("NN")) {
36 | 						return 1.0;
37 | 					}
38 | 
39 | 				}
40 | 
41 | 			}
42 | 		   }
43 | 		  }
44 | 		return 0.0;
45 | 		}
46 | 
47 | 		
48 | 
49 | 	public static void main(String args[]) {
50 | 		Answer a = new Answer("For luck Kate will only knock on this wood");
51 | 		// System.err.println(a.graphs.size());
52 | 		// System.out.println("hello");
53 | 		double score = 0;
54 | 		for (SemanticGraph graph : a.getGraphs()) {
55 | 
56 | 			if (graph.getFirstRoot().tag().contains("NN")) {
57 | 				for (SemanticGraphEdge edge : graph.edgeIterable()) {
58 | 
59 | 					 GrammaticalRelation rel = edge.getRelation(); 
60 | 					IndexedWord a1 = edge.getDependent();
61 | 					IndexedWord a2 = edge.getGovernor();
62 | 
63 | 					// System.out.println(a1.originalText()+"Tag: "+a1.tag());
64 | 					// System.out.println(a2.originalText()+" Tag: "+a2.tag()+" "+rel.getShortName()+" Relation to "+a1.originalText()+" Tag: "+a1.tag());
65 | 					if (a1.tag().contains("NN")) {
66 | 						score = 1.0;
67 | 						// return
68 | 
69 | 					}
70 | 					if (a2.tag().contains("NN")) {
71 | 						score = 1.0;
72 | 						// return
73 | 
74 | 					}
75 | 
76 | 				}
77 | 
78 | 			}
79 | 		}
80 | 
81 | 	}
82 | 
83 | }
84 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/KV.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.fusesource.lmdbjni.Constants.bytes;
 4 | 
 5 | import java.nio.ByteBuffer;
 6 | import java.nio.ByteOrder;
 7 | import java.nio.FloatBuffer;
 8 | import java.util.Optional;
 9 | import java.util.function.Function;
10 | 
11 | import org.fusesource.lmdbjni.Constants;
12 | import org.fusesource.lmdbjni.Env;
13 | import org.fusesource.lmdbjni.Transaction;
14 | 
15 | public class KV {
16 | 	public Env db = new Env();
17 | 	public KV() {
18 | 		db.open("data/lmdb", org.fusesource.lmdbjni.Constants.CREATE);
19 | 	}
20 | 	
21 | 	/**
22 | 	 * Get a byte array from the database just as it was stored.
23 | 	 * @param table		Which table to retrieve it from
24 | 	 * @param key		Which key you want
25 | 	 * @return			byte[]
26 | 	 */
27 | 	public Optional<byte[]> get(String table, String key) {
28 | 		return Optional.ofNullable(db.openDatabase(table).get(bytes(key)));
29 | 	}
30 | 	
31 | 	/**
32 | 	 * Basically just does ((float[]) bytes) which is moderately complex.
33 | 	 * @param bytes
34 | 	 * @return
35 | 	 */
36 | 	public static float[] asVector(byte[] bytes) {
37 | 		FloatBuffer fb = FloatBuffer.allocate((bytes.length + 3) / 4);
38 | 		fb.put(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer());
39 | 		return fb.array();
40 | 	}
41 | 	
42 | 	/**
43 | 	 * Basically just does ((byte[]) floats) which is moderately complex.
44 | 	 * @param bytes
45 | 	 * @return
46 | 	 */
47 | 	public static byte[] asBytes(float[] floats) {
48 | 		ByteBuffer bb = ByteBuffer.wrap(new byte[floats.length*4]).order(ByteOrder.LITTLE_ENDIAN);
49 | 		bb.asFloatBuffer().put(floats);
50 | 		return bb.array();
51 | 	}
52 | 	
53 | 	
54 | 	/**
55 | 	 * Non-atomically update an entry or return it.
56 | 	 * This is used for cases reading is common (getting a fast path
57 | 	 * with only a read lock) but writing is not (and might be run twice).
58 | 	 */
59 | 	public String quickGetOrCompute(String table, String key, Function<String, String> comp) {
60 | 		return get(table, key).map(Constants::string).orElseGet(() -> {
61 | 			try (Transaction tx = db.createWriteTransaction()){
62 | 				String o = comp.apply(key);
63 | 				db.openDatabase(tx, table, 0).put(bytes(key), bytes(o));
64 | 				return o;
65 | 			}
66 | 		});
67 | 	}
68 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/StrictFilters.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | 
 9 | public class StrictFilters extends Researcher {
10 | 	/**
11 | 	 * Perform several strict filters relating mostly to game rules.
12 | 	 * 
13 | 	 * 1: Remove J! Archive since it has actual answers.
14 | 	 * 2: Remove "List of *" because that's not the format of an answer.
15 | 	 * 3: Remove any answer inside the question because they don't give the
16 | 	 *    answers away in the questions (at least not in a string-match way)
17 | 	 * 4: Remove ultra-long answers because J! never wants a 3-minute speech
18 | 	 * 5: Remove answers not in Latin text
19 | 	 */
20 | 	public List<Answer> question(Question q, List<Answer> answers) {
21 | 		List<Answer> new_answers = new ArrayList<>();
22 | 		for (Answer a : answers) {
23 | 			
24 | 			// J! Archive has answers
25 | 			if (a.text.contains("J! Archive")) {}
26 | 			
27 | 			// "List of" is a bad sign 
28 | 			else if (a.text.contains("List of")) {}
29 | 			
30 | 			// Is the answer in the question?
31 | 			else if (almostContains(q.text, a.text)) {}
32 | 			
33 | 			// Is it too long?
34 | 			// The longest real answer in our sample of about 40,000 is:
35 | 			// How much wood would a woodchuck chuck if a woodchuck could chuck wood?
36 | 			// and it's 70 characters long. So cut there.
37 | 			else if (a.getTokens().isEmpty() || a.text.length() > 70) {}
38 | 			
39 | 			// Is over half of it non-Latin text?
40 | 			else if (a.text.replaceAll("[^A-Za-z0-9 ]", "").length() * 2 < a.text.length()) {}
41 | 			
42 | 			// Does it look like a web address?
43 | 			else if (a.text.matches("^(http://)?([A-Za-z]+\\.)?[A-Za-z]+\\.(com|net|org|co\\.[A-Za-z]{2})$")) {}
44 | 			
45 | 			else {
46 | 				new_answers.add(a);
47 | 			}
48 | 		}
49 | 		
50 | 		log.info("Eliminated " + (answers.size() - new_answers.size()) + " invalid answers");
51 | 		return new_answers;
52 | 	}
53 | 	
54 | 	/**
55 | 	 * Check if the question text (left) almost contains the answer text
56 | 	 * (right).
57 | 	 */
58 | 	public boolean almostContains(String left, String right) {
59 | 		// TODO: more stopword removal, etc.
60 | 		return left.toLowerCase().contains(right.toLowerCase());
61 | 	}
62 | }
63 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/CoreNLPSentenceSimilarityTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.ArrayList;
 6 | 
 7 | import org.junit.Test;
 8 | 
 9 | import edu.stanford.nlp.trees.Tree;
10 | import edu.uncc.cs.watsonsim.Phrase;
11 | import edu.uncc.cs.watsonsim.nlp.Trees;
12 | import edu.uncc.cs.watsonsim.scorers.CommonConstituents;
13 | 
14 | public class CoreNLPSentenceSimilarityTest {
15 | 
16 | 	@Test
17 | 	public void testParseToTree() {
18 | 		
19 | 		// Empty case
20 | 		assertEquals(new ArrayList<>(), Trees.parse(""));
21 | 		// Simple case
22 | 		assertEquals(Tree.valueOf("(ROOT (NP (NN Example)))"), Trees.parse("Example").get(0));
23 | 		// Challenging case
24 | 		// fails: "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo."
25 | 		// succeeds, or at least it looks generally right to me:
26 | 		assertEquals(Tree.valueOf("(ROOT (S (NP (NNP Niel) (NNP Armstrong)) "
27 | 				+ "(VP (VBD was) (NP (DT the) (JJ first) (NN man)"
28 | 				+ "(S (VP (TO to) (VP (VB walk) "
29 | 				+ "(PP (IN on) (NP (DT the) (NN moon)))))))) (. .)))"),
30 | 				Trees.parse("Niel Armstrong was the first man to walk on the moon.").get(0));
31 | 		
32 | 		assertEquals(
33 | 				Tree.valueOf("(ROOT (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ tall))) (. .)))"),
34 | 				Trees.parse("I am tall. You are short.").get(0));
35 | 		assertEquals(
36 | 				Tree.valueOf("(ROOT (S (NP (PRP You)) (VP (VBP are) (ADJP (JJ short))) (. .)))"),
37 | 				Trees.parse("I am tall. You are short.").get(1));
38 | 		
39 | 	}
40 | 
41 | 	@Test
42 | 	public void testScorePhrases() {
43 | 		CommonConstituents scorer = new CommonConstituents();
44 | 		
45 | 		
46 | 		// These are in large part to make sure that it does not accidentally change.
47 | 		/*assertEquals(
48 | 				1.0,
49 | 				scorer.getCommonSubtreeCount(
50 | 					new Phrase("this"),
51 | 					new Phrase("this")),
52 | 				0.01
53 | 		);*/
54 | 		assertEquals(
55 | 				6.0,
56 | 				scorer.getCommonSubtreeCount(
57 | 					new Phrase("My goat knows the bowling score."),
58 | 					new Phrase("Michael rowed the boat ashore.")),
59 | 				0.01
60 | 		);
61 | 		assertEquals(
62 | 				12.0,
63 | 				scorer.getCommonSubtreeCount(
64 | 					new Phrase("A tisket. A tasket. A green and yellow basket."),
65 | 					new Phrase("A tisket, a tasket, what color is my basket?")),
66 | 				0.01
67 | 		);
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/src/main/scala/scripts/BigramBigramIndexer.scala:
--------------------------------------------------------------------------------
 1 | package scripts;
 2 | 
 3 | import java.io.File;
 4 | import java.sql.ResultSet;
 5 | import java.sql.SQLException;
 6 | import java.util.ArrayList;
 7 | import java.util.regex.Matcher;
 8 | import java.util.regex.Pattern;
 9 | 
10 | import lemurproject.indri.IndexEnvironment;
11 | import lemurproject.indri.ParsedDocument;
12 | import lemurproject.indri.ParsedDocument.TermExtent;
13 | 
14 | import org.apache.lucene.analysis.Analyzer;
15 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
16 | import org.apache.lucene.document.Document;
17 | import org.apache.lucene.document.Field;
18 | import org.apache.lucene.document.StoredField;
19 | import org.apache.lucene.document.TextField;
20 | import org.apache.lucene.index.IndexWriter;
21 | import org.apache.lucene.index.IndexWriterConfig;
22 | import org.apache.lucene.store.Directory;
23 | import org.apache.lucene.store.FSDirectory;
24 | import org.apache.lucene.util.Version;
25 | 
26 | import privatedata.UserSpecificConstants;
27 | import uncc2014watsonsim.Passage;
28 | import uncc2014watsonsim.Database;
29 | 
/**
 * This is an experimental bigram-bigram association indexer.
 * The point here is to find the most relevant relations between every pair of
 * bigrams, according to the pairwise entropy.
 * 
 * The whole thing is designed to run in (maybe 3 GB) memory using bit
 * twiddling and primitive arrays for efficiency, hash tables and dynamic
 * programming for time complexity, a cache eviction policy for
 * memory complexity, and some distributional tweaks for fairness.
 * 
 * This is not exactly tried and true software.
 */
object BigramBigramIndexer {
  val db = new Database();
  
  def main(args: Array[String]): Unit = {
    println("Hello!")
  }
  
  /**
   * Fetch rows from the database, extract the text, and tokenize it.
   * Each element holds the pieces of one row's text, split on every
   * character that is not an ASCII letter.
   */
  def getRowText() : Stream[Array[String]] = {
    val rows = db.prep("SELECT reference, title, text FROM "
        + "meta INNER JOIN content ON meta.id=content.id "
        + "WHERE source != 'wp-full' and source != 'wiktionary-01'"
        + " ORDER BY title;").executeQuery();
    
    new Iterator[Array[String]] {
      // The cursor moves at most once per element, so asking hasNext
      // several times in a row does not skip rows.
      private var peeked = false
      private var available = false

      def hasNext: Boolean = {
        if (!peeked) {
          available = rows.next()
          peeked = true
        }
        available
      }

      def next(): Array[String] = {
        if (!hasNext) throw new NoSuchElementException("no more rows")
        peeked = false
        // Read the text column; column 1 is the reference, not the text
        rows.getString("text").split("[^a-zA-Z]")
      }
    }.toStream
  }
}


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/Question.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import org.apache.log4j.Logger;
 4 | 
 5 | /**
 6 |  * An immutable natural language phrase intended to be evaluated as a question
 7 |  * or clue.
 8 |  * 
 9 |  * Available annotators (there may be more, these just get you started)
10 |  * ClueType.fromClue
11 |  * QClassDetection.detectType
12 |  * 
13 |  * @author Sean
14 |  */
15 | public class Question extends Phrase {
16 | 	public final Answer correct_answer;
17 |     private final String category;
18 |     private final QType type;
19 |     
20 |     /**
21 |      * Construct a new question for analysis.
22 |      * @param question   The natural language clue
23 |      * @param correct_answer  The target answer, if available (or null)
24 |      * @param category  The category of the problem, also natural language
25 |      */
26 |     public Question(String question, Answer correct_answer, String category) {
27 |     	super(question);
28 |     	this.correct_answer = correct_answer;
29 |     	this.category = category;
30 |         this.type = QClassDetection.detectType(this);
31 |         this.memo(QClassDetection::detectType);
32 |         Logger log = Logger.getLogger(getClass());
33 |         log.info("Looks like a " + type.toString().toLowerCase() + " question");
34 |     }
35 |     
36 |     /**
37 |      * Create a simple question without bells and whistles
38 |      */
39 |     public Question(String question) {
40 |     	this(question, null, "");
41 |     }
42 |     
43 | 	/**
44 |      * Create a question from a clue and a hint about it's category
45 |      */
46 |     public Question(String question, String category) {
47 |         this(question, null, category);
48 |     }
49 | 
50 |     /**
51 |      * Create a question with a clue and plain string answer but no category
52 |      */
53 |     public static Question known(String question, String answer) {
54 |         return known(question, answer, "");
55 |     }
56 | 
57 |     /**
58 |      * Create a question with a clue, a plain string answer, and category
59 |      */
60 |     public static Question known(String question, String answer, String category) {
61 |         return new Question(question,
62 |         		new Answer("answer", answer, answer, ""),
63 | 				category);
64 |     }
65 | 
66 |     public String getCategory() {
67 |         return category;
68 |     }
69 | 
70 |     public QType getType() {
71 |         return type;
72 |     }   
73 | }
74 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/LATCheck.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import org.apache.log4j.Logger;
 4 | 
 5 | import edu.uncc.cs.watsonsim.Answer;
 6 | import edu.uncc.cs.watsonsim.Environment;
 7 | import edu.uncc.cs.watsonsim.Question;
 8 | import edu.uncc.cs.watsonsim.nlp.ClueType;
 9 | import edu.uncc.cs.watsonsim.nlp.Relatedness;
10 | import edu.uncc.cs.watsonsim.scorers.AnswerScorer;
11 | 
12 | /**
13 |  * Check if the question LAT matches one of the answer LATs
14 |  * @author Sean
15 |  *
16 |  */
17 | public class LATCheck extends AnswerScorer {
18 | 	private final Relatedness syn;
19 | 	private final Logger log = Logger.getLogger(getClass());
20 | 	
21 | 	/**
22 | 	 * Create a new LATCheck using a shared environment
23 | 	 */
24 | 	public LATCheck(Environment env) {
25 | 		syn = new Relatedness(env);
26 | 	}
27 | 	
28 | 	@Override
29 | 	public double scoreAnswer(Question q, Answer a) {
30 | 		/*
31 | 		 * There are several options here of how to determine synonyms.
32 | 		 * 
33 | 		 * Synonym generation approaches:
34 | 		 * 1) Given a label, find the article titles.
35 | 		 * 2)*Given an article title, find the labels.
36 | 		 * 3) Given a label, find the other labels sharing an article title.
37 | 		 * 4) Given a label, find the main article, and all the links to that main article.
38 | 		 * 5) Given two labels, combine the weights of common article titles.
39 | 		 * 
40 | 		 * Synonym checking approaches:
41 | 		 * 1)*Synonymize Q's, check against A's
42 | 		 * 2) Synonymize A's, check against Q's
43 | 		 * 3) Synonymize both, combine common results
44 | 		 * 
45 | 		 * Right now, we are using (G2, C1).
46 | 		 */
47 | 		/*if (!q.simple_lat.isEmpty()) {
48 | 			List<Weighted<String>> question_synonyms = syn.viaWikiLinks(new String[]{q.simple_lat});
49 | 			question_synonyms.add(new Weighted<String>(q.simple_lat, 1000.0));
50 | 			for (Weighted<String> synonym : question_synonyms) {
51 | 				for (String candidate_type : a.lexical_types) {
52 | 					if (syn.matchViaLevenshtein(synonym.item, candidate_type)) {
53 | 						log.info(a.text + " is a " + synonym.item
54 | 								+ " which is  " + q.simple_lat
55 | 								+ " (weight " + Math.log(synonym.weight) + ")");
56 | 						return Math.log(synonym.weight);
57 | 					}
58 | 				}
59 | 			}
60 | 		}*/
61 | 		for (String lextype : a.lexical_types) {
62 | 			if (syn.matchViaSearch(q.memo(ClueType::fromClue), lextype))
63 | 				return 1.0;
64 | 		}
65 | 		return -1.0;
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/ElliotMerschScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import edu.uncc.cs.watsonsim.Answer;
 4 | import edu.uncc.cs.watsonsim.Passage;
 5 | import edu.uncc.cs.watsonsim.Phrase;
 6 | 
 7 | public class ElliotMerschScorer extends PassageScorer{
 8 | 	
 9 | 	public double Scorer (Phrase q, Answer a, Passage p){
10 | 		
11 | 				String Qraw = q.text;
12 | 				String Ptext = p.text;
13 | 				String Ptitle = p.title;
14 | 				
15 | 				//test variables
16 | 				//String Qraw = "What is the tallest building?";
17 | 				//String Ptext = "The world's tallest man-made structure is the 829.8 m (2,722 ft) tall Burj Khalifa in Dubai, United Arab Emirates. The building gained the official title of Tallest Building in the World at its opening on January 4, 2010.";
18 | 				//String Ptitle = "List of tallest buildings and structures in the world";
19 | 				
20 | 				double score = 0;
21 | 				
22 | 				String[] Qsplit = Qraw.split(" ");
23 | 				String[] PtitleSplit = Ptitle.split(" ");
24 | 				String[] PtextSplit = Ptext.split(" ");
25 | 				
26 | 				//check passage title
27 | 				for (int i=0; i<Qsplit.length; i++){
28 | 					String currQWord = Qsplit[i];
29 | 					
30 | 					for (int x=0; x<PtitleSplit.length; x++){
31 | 						String currTWord = PtitleSplit[x];
32 | 						
33 | 						if (currQWord.toLowerCase().contains(currTWord.toLowerCase())){
34 | 							score++;
35 | 							
36 | 						}
37 | 					}
38 | 				}
39 | 				
40 | 				System.out.println("Occurences in title: " + score);
41 | 
42 | 				
43 | 				double textOccur = 0;
44 | 				//check passage text
45 | 				for (int i=0; i<Qsplit.length; i++){
46 | 					String currQWord = Qsplit[i];
47 | 					
48 | 					for (int x=0; x<PtextSplit.length; x++){
49 | 						String currPWord = PtextSplit[x];
50 | 						
51 | 						if (currQWord.toLowerCase().contains(currPWord.toLowerCase())){
52 | 							textOccur++;
53 | 							
54 | 						}
55 | 					}
56 | 				}
57 | 				
58 | 				System.out.println("Occurences in text: " + textOccur);
59 | 				
60 | 				//title occurences worth more than text occurences
61 | 				score = score*2;
62 | 				
63 | 				double totalLength = Qsplit.length + PtextSplit.length;
64 | 				double finalscore = 0;
65 | 				finalscore = (score + textOccur)/totalLength;
66 | 				finalscore = finalscore * 100;
67 | 				System.out.println("Score:");
68 | 				System.out.println(finalscore + " / 100");
69 | 		
70 | 				return finalscore;
71 | 
72 | 	}
73 | 	
74 | 	
75 | }
76 | 


--------------------------------------------------------------------------------
/scripts/create.sql:
--------------------------------------------------------------------------------
/* The first two settings (journal_mode, synchronous) result in a 2x to 50x
 * speedup for SQLite.
 * If you are concerned, you can use synchronous = NORMAL
 * Remember that btrfs does not actually obey fsync so this has less of an
 * impact with btrfs than others and it will seem pretty fast either way.
 * foreign_keys = ON turns on enforcement of the FOREIGN KEY clauses below.
 */
PRAGMA journal_mode = WAL;
PRAGMA synchronous = OFF;
PRAGMA foreign_keys = ON;

/*
 * meta and content are separate because the content is very large and makes
 * routine changes slower otherwise.
 * The two tables share the same id: content.id references meta.id.
 */
CREATE TABLE meta (
    id INTEGER PRIMARY KEY,
    title TEXT,
    source TEXT,
    reference TEXT,
    pageviews INTEGER
);

CREATE TABLE content (
    id INTEGER PRIMARY KEY,
    text TEXT,
    FOREIGN KEY(id) REFERENCES meta(id)
);

-- Lookup indexes on meta by source and by title.
CREATE INDEX meta_source ON meta(source);
CREATE INDEX meta_title ON meta(title);

-- Maps an alternative title (source_title) to the meta row it points at.
CREATE TABLE redirects (
    target_id INTEGER,
    source_title TEXT,
    FOREIGN KEY(target_id) REFERENCES meta(id)
);
CREATE INDEX redirects_id ON redirects(target_id);

-- Used for an experimental scorer. (PhraseTokens)
-- Should it be moved to another DB?
-- relate_words holds one row per distinct name with a count;
-- relate_links holds one counted row per (source, dest) pair of words.
CREATE TABLE relate_words(
    id INTEGER PRIMARY KEY,
    name TEXT UNIQUE,
    count INTEGER
);
CREATE TABLE relate_links(
    id INTEGER PRIMARY KEY,
    source INTEGER,
    dest INTEGER,
    count INTEGER,
    FOREIGN KEY(source) REFERENCES relate_words(id),
    FOREIGN KEY(dest) REFERENCES relate_words(id),
    UNIQUE(source, dest)
);

-- merged from questions.db
-- NOTE(review): results.question presumably refers to questions.rowid, but no
-- FOREIGN KEY is declared -- confirm against the code that fills these tables.
CREATE TABLE results (question int, rank int, score double, engine text, title text, fulltext text, correct boolean, reference text);
CREATE TABLE questions (rowid int primary key, question text, answer text, category text);
CREATE INDEX results_fkey_question ON  results(question);
-- Cached search results, keyed by the query text and the engine that ran it.
-- NOTE(review): in SQLite, CURRENT_TIMESTAMP produces a 'YYYY-MM-DD HH:MM:SS'
-- text value, so created_on likely stores text despite being declared
-- INTEGER -- confirm before comparing it numerically.
CREATE TABLE cache (
    query TEXT,
    engine TEXT,
    title TEXT,
    fulltext TEXT,
    reference TEXT,
    id BIGINT,
    created_on INTEGER DEFAULT CURRENT_TIMESTAMP);
CREATE INDEX cache_query ON cache(query);
-- Named score values attached to a passage via passage_id.
CREATE TABLE cache_scores(passage_id int, name text, value float);
CREATE INDEX cache_scores_passage_id ON cache_scores(passage_id);
CREATE INDEX cache_query_engine ON cache(query, engine);
CREATE INDEX cache_timestamp ON cache(created_on);
72 | 


--------------------------------------------------------------------------------
/scripts/svm_graph.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import time
 3 | import math
 4 | from sklearn.svm import SVC
 5 | wout = np.load("wek2.npy")[:10000]
 6 | # X = input features
 7 | # y = output gold-standard prediction
 8 | # q = question id (for collation)
 9 | X, y, q = np.delete(wout, [8, 46], axis=1), wout[:, 8], wout[:, 46]
10 | # border between test and training data
11 | border = len(y) * 2/3
12 | 
13 | # Spacing of the parameters we are trying to visualize (C and gamma)
14 | base = 10 ** (1/10.)
15 | exp_range = range(-60, 61)
16 | 
def svc(params):
    """Train and evaluate one SVC for a (C, gamma) pair.

    Trains on rows [:border] of the module-level X / y and evaluates on rows
    [border:].

    params: a (C, gamma) tuple, passed as a single argument so the function
        can be mapped over a list of pairs.

    Returns [C, gamma, accu, mean_bits, accu_by_q, train_time, test_time].
    """
    C, gamma = params
    y_test = y[border:]
    q_test = q[border:].astype(int)

    s = SVC(C=C, gamma=gamma, probability=True)
    start = time.time()
    s.fit(X[:border], y[:border])
    train_time = time.time() - start
    # NOTE(review): column 0 of predict_proba is the probability of the first
    # entry of s.classes_ -- confirm that is the "correct answer" class.
    pred = s.predict_proba(X[border:])[:, 0]
    test_time = (time.time() - start) - train_time

    # This is the literal is-it-the-right-answer  binary score.
    # This measure is what we try to maximize but its relation to question
    # accuracy is complicated
    # Compare against the test labels only (pred covers just the test rows)
    # and use a mean so the result is a fraction, not an integer quotient.
    accu = np.mean((pred > 0.5) == y_test)

    ###  This is the actual question prediction error, in bits
    # First, find the probabilities
    pred_y = pred * y_test  # These are the probabilities for right answers
    pred_y = pred_y[pred_y.nonzero()]   # the same, stripped of 0's
    mean_bits = np.mean(-np.log(pred_y) / np.log(2))  # measured in mean bits

    ### This is the literal accuracy - it gets complicated
    # Sort the answers by probability, ascending (only getting the indices),
    # so the most confident answer of each question is assigned last.
    confidence_order = np.argsort(pred)
    # This indexing trick always takes the last assignment for each index
    # This will hold the index of the best answer for each question;
    # -1 marks question ids that have no row in the test split.
    best_answer = np.full(np.max(q_test) + 1, -1, dtype=int)
    # confidence_order indexes the test rows, so q must be the test slice too.
    best_answer[q_test[confidence_order]] = confidence_order
    best_answer = best_answer[best_answer >= 0]
    # Take the average correctness of the best answer
    accu_by_q = y_test[best_answer].mean()

    return [C, gamma, accu, mean_bits, accu_by_q, train_time, test_time]
47 | 
48 | import code
49 | 
def multi():
    """Evaluate svc over the full (C, gamma) grid in a 40-process pool.

    Each result row is printed and appended, tab-separated, to
    "svmresults-largeimage-smallset.log" as soon as it is available
    (rows arrive in completion order, not grid order).
    """
    from multiprocessing import Pool
    ins = [(base**i, base**j) for i in exp_range for j in exp_range]
    p = Pool(40)
    try:
        with open("svmresults-largeimage-smallset.log", "w") as o:
            for row in p.imap_unordered(svc, ins):
                line = '\t'.join(map(str, row))
                print(line)
                o.write(line + '\n')
    finally:
        # Always release the worker processes, including after an error or
        # Ctrl-C; by the time the loop ends normally every result is consumed.
        p.terminate()
        p.join()
58 | 
59 | code.interact(local=vars())
60 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/PersonRecognition.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.io.FileInputStream;
 4 | import java.io.IOException;
 5 | import java.io.InputStream;
 6 | import java.util.List;
 7 | import java.util.logging.Level;
 8 | import java.util.logging.Logger;
 9 | 
10 | import edu.uncc.cs.watsonsim.Answer;
11 | import edu.uncc.cs.watsonsim.Phrase;
12 | import edu.uncc.cs.watsonsim.QType;
13 | import edu.uncc.cs.watsonsim.Question;
14 | import opennlp.tools.namefind.NameFinderME;
15 | import opennlp.tools.namefind.TokenNameFinderModel;
16 | import opennlp.tools.util.Span;
17 | 
18 | /**
19 |  *
20 |  * @author Phani Rahul
21 |  */
22 | public class PersonRecognition extends Researcher {
23 | 
24 |     private static TokenNameFinderModel model = null;
25 |     private static NameFinderME nameFinder = null;
26 |     private boolean enabled=true;
27 | 
28 |     public PersonRecognition() {
29 |         InputStream is;
30 | 		try {
31 | 			is = new FileInputStream("data/en-ner-person.bin");
32 | 	        model = new TokenNameFinderModel(is);
33 | 		} catch (IOException e) {
34 | 			e.printStackTrace();
35 | 			System.err.println("Missing NLP model data. Deactivating NameRecognitionResearcher.");
36 | 			enabled = false;
37 | 		}
38 |         nameFinder = null;
39 |         try {
40 |             nameFinder = new NameFinderME(model);
41 |         } catch (Exception ex) {
42 |             Logger.getLogger(PersonRecognition.class.getName()).log(Level.SEVERE, null, ex);
43 |         }
44 |     }
45 | 
46 | 	@Override
47 |     public List<Answer> question(Question q, List<Answer> answers) {
48 |     	if (q.getType() == QType.FITB && enabled){
49 |     		answers = super.question(q, answers);
50 |     	}
51 |     	return answers;
52 |     }
53 | 
54 |     @Override
55 |     public Answer answer(Phrase q, Answer answer) {
56 |         Span nameSpans[] = null;
57 |         String[] sentence = null;
58 |         sentence = answer.text.split("[,'()  ]+");
59 | 
60 |         nameSpans = nameFinder.find(sentence);
61 |         nameFinder.clearAdaptiveData();
62 | 
63 |         StringBuilder ret = new StringBuilder();
64 |         for (Span s : nameSpans) {
65 | 
66 |             for (int i = s.getStart(); i < s.getEnd(); i++) {
67 |                 ret.append(sentence[i]);
68 |                 ret.append(" ");
69 |             }
70 |         }
71 |         if (!ret.toString().isEmpty()){
72 |         	return answer.withText(ret.toString());
73 |         }
74 |         return answer;	
75 |     }
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/ApproxStringIntMapTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.Iterator;
 6 | 
 7 | import org.apache.commons.lang3.tuple.Pair;
 8 | import org.junit.Before;
 9 | import org.junit.Test;
10 | 
11 | public class ApproxStringIntMapTest {
12 | 	ApproxStringIntMap asim;
13 | 	@Before
14 | 	public void setUp() {
15 | 		asim = new ApproxStringIntMap(new StringStack("moo", "far"));
16 | 	}
17 | 
18 | 	@Test
19 | 	public void testSize() {
20 | 		assertEquals(0, asim.size());
21 | 		asim.put("moo", 1);
22 | 		assertEquals(1, asim.size());
23 | 	}
24 | 
25 | 	@Test
26 | 	public void testIsEmpty() {
27 | 		assertTrue(asim.isEmpty());
28 | 		asim.put("moo", 1);
29 | 		assertFalse(asim.isEmpty());
30 | 	}
31 | 
32 | 	@Test
33 | 	public void testContainsKey() {
34 | 		assertFalse(asim.containsKey("moo"));
35 | 		asim.put("moo", 1);
36 | 		assertTrue(asim.containsKey("moo"));
37 | 		assertFalse(asim.containsKey("far"));
38 | 		asim.put("erk", 7);
39 | 		assertTrue(asim.containsKey("moo"));
40 | 		assertTrue(asim.containsKey("erk"));
41 | 		assertFalse(asim.containsKey("far"));
42 | 	}
43 | 
44 | 	@Test
45 | 	public void testGetPut() {
46 | 		assertEquals(0, asim.get("moo")); // ! Keep this in mind!
47 | 		asim.put("far", 1);
48 | 		assertEquals(0, asim.get("moo"));
49 | 		assertEquals(1, asim.get("far"));
50 | 		asim.put("erk",  2);
51 | 		assertEquals(0, asim.get("moo"));
52 | 		assertEquals(2, asim.get("erk"));
53 | 	}
54 | 	
55 | 	@Test
56 | 	public void testAddTo() {
57 | 		assertEquals(0, asim.get("moo"));
58 | 		asim.addTo("moo", 4);
59 | 		assertEquals(4, asim.get("moo"));
60 | 		asim.addTo("moo", 4);
61 | 		assertEquals(8, asim.get("moo"));
62 | 	}
63 | 
64 | 	@Test
65 | 	public void testRemove() {
66 | 		asim.put("moo", 1);
67 | 		asim.put("far", 2);
68 | 		assertTrue(asim.containsKey("far"));
69 | 		asim.remove("far");
70 | 		assertFalse(asim.containsKey("far"));
71 | 	}
72 | 
73 | 	@Test
74 | 	public void testClear() {
75 | 		asim.put("moo", 1);
76 | 		asim.put("far", 2);
77 | 		assertTrue(asim.containsKey("far"));
78 | 		asim.clear();
79 | 		assertFalse(asim.containsKey("far"));
80 | 	}
81 | 
82 | 	@Test
83 | 	public void testIterator() {
84 | 		asim.put("moo", 1);
85 | 		asim.put("far", 2);
86 | 		Iterator<Pair<String, Integer>> pairs = asim.iterator();
87 | 		assertTrue(pairs.hasNext());
88 | 		assertEquals(Pair.of("moo", 1), pairs.next());
89 | 		assertTrue(pairs.hasNext());
90 | 		assertEquals(Pair.of("far", 2), pairs.next());
91 | 		assertFalse(pairs.hasNext());
92 | 	}
93 | 
94 | }
95 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/MergeAnswers.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.HashSet;
 5 | import java.util.List;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Question;
 9 | 
10 | /*Author : Ricky Sanders
11 |  * 
12 |  * Compares answer to answer to merge those that have 3 or more words in common
13 |  * Currently keeps the longest answer
14 |  * 
15 |  * WORK IN PROGRESS
16 |  */
17 | 
18 | public class MergeAnswers extends Researcher{
19 | 	@Override
20 | 		/** Call merge on any two similar answers */
21 | 		public List<Answer> question(Question q, List<Answer> answers) {
22 | 		List<List<Answer>> answer_blocks = new ArrayList<>();
23 | 
24 | 		// Arrange the answers into blocks
25 | 		each_answer:
26 | 		for (Answer original : answers) {
27 |             HashSet<String> original_terms = new HashSet<String>();
28 |             original_terms.addAll(original.getTokens());
29 |             //return reference_terms.containsAll(StringUtils.tokenize(reference));
30 | 			for (List<Answer> block : answer_blocks) {
31 | 				for (Answer example : block) {
32 | 		            HashSet<String> example_terms = new HashSet<String>();
33 | 		            example_terms.addAll(example.getTokens());
34 | 					// Look through the examples in this topic
35 | 					// If it matches, choose to put it in this block and quit.
36 | 		            
37 | 		            int sizeExample = example_terms.size();
38 | 		            
39 | 		            example_terms.retainAll(original_terms);
40 | 		            int count = example_terms.size();
41 | 		            
42 | 		            double percentCorrect = count/(sizeExample + 0.01);
43 | 
44 | 					/** Merge by word count of 3 only */
45 | 					
46 | 					if (count >= 3 || percentCorrect >= 0.5) {
47 | 						original.log(this, "It restates %s", original);
48 | 						block.add(original);
49 | 						continue each_answer;
50 | 					}
51 | 
52 | 				}
53 | 			}
54 | 				
55 | 			// Make a new topic for this answer
56 | 			List<Answer> new_block = new ArrayList<>();
57 | 			new_block.add(original);
58 | 			answer_blocks.add(new_block);
59 | 		}
60 | 
61 | 		// Merge the blocks
62 | 		List<Answer> new_answers = new ArrayList<>();
63 | 		for (List<Answer> block : answer_blocks) {
64 | 			if (block.size() > 1) {
65 | 				new_answers.add(Answer.merge(block));
66 | 			} else {
67 | 				new_answers.add(block.get(0));
68 | 			}
69 | 		}
70 | 		
71 | 		log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by word similarity).");
72 | 		return new_answers;
73 | 	}
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/URLExpander.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.io.ByteArrayInputStream;
 4 | import java.io.InputStreamReader;
 5 | import com.google.gson.reflect.TypeToken;
 6 | 
 7 | import crawlercommons.fetcher.BaseFetchException;
 8 | import crawlercommons.fetcher.http.SimpleHttpFetcher;
 9 | import crawlercommons.fetcher.http.UserAgent;
10 | import de.l3s.boilerpipe.BoilerpipeProcessingException;
11 | import de.l3s.boilerpipe.extractors.ArticleExtractor;
12 | import edu.uncc.cs.watsonsim.Answer;
13 | import edu.uncc.cs.watsonsim.Environment;
14 | import edu.uncc.cs.watsonsim.Passage;
15 | import edu.uncc.cs.watsonsim.Phrase;
16 | 
17 | 
18 | /** Fill in the full text of an answer from it's URL, if it has one */
19 | public class URLExpander extends Researcher {
20 | 	private SimpleHttpFetcher fetcher;
21 | 			
22 | 	private Environment env;
23 | 	
24 | 	public URLExpander(Environment env) {
25 | 		this.env = env;
26 | 		fetcher = new SimpleHttpFetcher(3,
27 | 				new UserAgent(
28 | 						"Watsonsim QA engine (bot)",
29 | 						"stgallag@gmail.com",
30 | 						"http://github.com/SeanTater/uncc2014watsonsim",
31 | 						"Mozilla/5.0",
32 | 						"10 May 2015"));
33 | 
34 | 		//fetcher.setConnectionTimeout(2000);
35 | 		//fetcher.setSocketTimeout(2000);
36 | 		fetcher.setMaxRetryCount(1);
37 | 	}
38 | 	
39 | 	/**
40 | 	 * Get a page from the Internet and clean it.
41 | 	 */
42 | 	private String fetch(String key) {
43 | 		try {
44 | 			byte[] payload = fetcher.fetch(key.substring(4)).getContent();
45 | 			InputStreamReader isr = new InputStreamReader(
46 | 					new ByteArrayInputStream(payload));
47 | 			return ArticleExtractor.INSTANCE.getText(isr);
48 | 		} catch (BaseFetchException | BoilerpipeProcessingException e) {
49 | 			// TODO Auto-generated catch block
50 | 			System.err.println("Can't connect to " + key);
51 | 			return "";
52 | 		}
53 | 	}
54 | 	
55 | 	public Answer answer(Phrase q, Answer a) {
56 | 		a.passages.replaceAll( p -> {
57 | 			if (p.reference.startsWith("http") && p.reference.contains(".htm")) {
58 | 				/* This is roundabout because I really want to avoid
59 | 				 * committing to a character set. (So I don't use String.)
60 | 				 */
61 | 				// Download
62 | 				String payload = env.computeIfAbsent("url:"+p.reference,
63 | 						this::fetch,
64 | 						new TypeToken<String>(){}.getType());
65 | 				if (!payload.isEmpty()) {
66 | 					// Parse
67 | 					p = new Passage(
68 | 							"live-url",
69 | 							p.title,
70 | 							payload,
71 | 							p.reference);
72 | 					a.log(this, "Filled in passage from %s", p.reference);
73 | 				}
74 | 			}
75 | 			return p;
76 | 		});
77 | 		return a;
78 | 	}
79 | }
80 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/StephensonOpenNLPScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.FileNotFoundException;
 6 | import java.io.IOException;
 7 | import java.io.InputStream;
 8 | import opennlp.tools.parser.ParserModel;
 9 | import opennlp.tools.postag.POSModel;
10 | import opennlp.tools.postag.POSTaggerME;
11 | import opennlp.tools.sentdetect.SentenceDetectorME;
12 | import opennlp.tools.sentdetect.SentenceModel;
13 | import opennlp.tools.util.InvalidFormatException;
14 | 
15 | /*
16 |  * Author: Chris Stephenson
17 |  */
18 | 
19 | public class StephensonOpenNLPScorer {
20 |   private boolean modelsAreInitialized=false;	
21 | 	public  String modelsPath="data/"; //models directory
22 | 	private File parserMFile; 
23 | 	private File sentDetectorMFile;
24 | 	private File chunkerMFile;
25 | 	private File posMFile;
26 | 
27 | 	public SentenceModel sentenceModel; //sentence detection model 
28 | 	public ParserModel parserModel; //parsing model
29 | 	public POSTaggerME tagger;
30 | 	
31 | 	
32 | 	public void init() throws InvalidFormatException{
33 | 		File modelsDir = new File(this.modelsPath);
34 | 
35 | 		this.parserMFile = new File(modelsDir, "en-parser-chunking.bin");
36 | 		this.sentDetectorMFile = new File(modelsDir, "en-sent.bin");
37 | 		this.chunkerMFile=new File(modelsDir,"en-chunker.bin");
38 | 		this.posMFile = new File(modelsDir,"en-pos-maxent.bin");
39 | 
40 | 		InputStream sentModelIn = null;
41 | 		FileInputStream parserStream;
42 | 		try {
43 | 			//for finding sentences
44 | 			sentModelIn = new FileInputStream(sentDetectorMFile);
45 | 			this.sentenceModel = new SentenceModel(sentModelIn);
46 | 			//for finding POS
47 | 			FileInputStream posModelStream = new FileInputStream(posMFile);
48 | 			POSModel model = new POSModel(posModelStream);
49 | 			this.tagger = new POSTaggerME(model);
50 | 			//for parsing
51 | 			parserStream = new FileInputStream(parserMFile);
52 | 			this.parserModel = new ParserModel(parserStream);
53 | 		} catch (FileNotFoundException e2) {
54 | 			// TODO Auto-generated catch block
55 | 			e2.printStackTrace();
56 | 		} catch (IOException e) {
57 | 			// TODO Auto-generated catch block
58 | 			e.printStackTrace();
59 | 		}
60 | 		this.modelsAreInitialized=true;
61 | 	}
62 | 	
63 | 	public void testSentDetector(String testSents) throws InvalidFormatException{
64 | 		init();
65 | 		SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
66 | 		String[] sentences = sentenceDetector.sentDetect(testSents);
67 | 		for (int i=0;i<sentences.length; i++)
68 | 			System.err.println("sent: "+sentences[i]);
69 | 	}
70 | 	
71 | 	
72 | }
73 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PassageScorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | 
 4 | import java.util.Arrays;
 5 | import java.util.List;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | import edu.uncc.cs.watsonsim.Question;
11 | import edu.uncc.cs.watsonsim.Score;
12 | 
13 | /** Scorers apply scores in parallel to:
14 |  *  - Answers
15 |  *  - Passages
16 |  *  By default, a score is NaN.
17 |  *  Scorers are expected to run in parallel. Try to avoid side effects.
18 |  *  Otherwise use "synchronized".
19 |  */
20 | public abstract class PassageScorer implements Scorer { 
21 | 	// This is a constructor-less hack to give Researchers a convenient name
22 | 	// It is used for assigning scores.
23 | 	String name;
24 | 	private String max_name, min_name, median_name, mean_name;
25 | 	{
26 | 		name = this.getClass().getSimpleName().replaceAll("([a-z])([A-Z]+)", "$1_$2").toUpperCase();
27 | 		max_name = name+"_MAX";
28 | 		min_name = name+"_MIN";
29 | 		mean_name = name+"_MEAN";
30 | 		median_name = name+"_MEDIAN";
31 | 		Score.register(max_name, -1, Merge.Mean);
32 | 		Score.register(min_name, -1, Merge.Mean);
33 | 		Score.register(mean_name, -1, Merge.Mean);
34 | 		Score.register(median_name, -1, Merge.Mean);
35 | 	}
36 | 
37 | 	/** Default implementation of research for a question.
38 | 	 * Calls research_answer for every Answer, collecting the mean, median, max
39 | 	 * and min of the results.
40 | 	 * Override this if you need more power.
41 | 	 * @param q		Question
42 | 	 */
43 | 	public void scoreQuestion(Question q, List<Answer> answers) {
44 | 		for (Answer a : answers) {
45 | 			double sum = 0.0;
46 | 			final int p_count = a.passages.size();
47 | 			if (p_count > 0) {
48 | 				double[] scores = new double[p_count];
49 | 				for (int pi=0; pi<p_count; pi++) {
50 | 					Passage p = a.passages.get(pi);
51 | 					scores[pi] = scorePassage(q, a, p); 
52 | 					sum += scores[pi];
53 | 					p.score(name, scores[pi]);
54 | 				}
55 | 				Arrays.sort(scores);
56 | 				a.score(max_name, scores[0]);
57 | 				a.score(min_name, scores[p_count - 1]);
58 | 				a.score(mean_name, sum/p_count);
59 | 				a.score(median_name, scores[p_count / 2]);
60 | 			}
61 | 		}
62 | 	}
63 | 	
64 | 	/** Default implementation for researching a passage.
65 | 	 * Does nothing by default. You don't need to override this if you don't
66 | 	 * use it.
67 | 	 * 
68 | 	 * @param q		Input Question, varies slowest
69 | 	 * @param a		Input Answer, varies medium
70 | 	 * @param p		Input Passage, varies fastest
71 | 	 */
72 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
73 | 		return Double.NaN;
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/ApproxStringIntMap.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import java.util.Iterator;
 4 | 
 5 | import org.apache.commons.lang3.tuple.Pair;
 6 | import org.apache.lucene.codecs.bloom.MurmurHash2;
 7 | 
 8 | import com.carrotsearch.hppc.IntIntOpenHashMap;
 9 | 
10 | /**
11 |  * A memory-efficient String-int map: only stores hash->int relations,
12 |  * and later when you iterate it guesses the hash->String relation using a
13 |  * dictionary.
14 |  * @author Sean
15 |  */
16 | public class ApproxStringIntMap implements Iterable<Pair<String, Integer>> {
17 | 	StringStack dict;
18 | 	IntIntOpenHashMap map = new IntIntOpenHashMap();
19 | 	
20 | 	private int hash(String x) {
21 | 		byte[] b = x.getBytes();
22 | 		return MurmurHash2.hash(b, 0, 0, b.length);
23 | 	}
24 | 	
25 | 	/** Create an approximate String-int map using a shared dictionary */
26 | 	public ApproxStringIntMap(StringStack dictionary) {
27 | 		dict = dictionary;
28 | 	}
29 | 
30 | 	public int size() {
31 | 		return map.size();
32 | 	}
33 | 
34 | 	public boolean isEmpty() {
35 | 		return size() == 0;
36 | 	}
37 | 
38 | 	public boolean containsKey(String key) {
39 | 		return map.containsKey(hash(key));
40 | 	}
41 | 
42 | 	public int get(String key) {
43 | 		return map.get(hash(key));
44 | 	}
45 | 
46 | 	public int put(String key, int value) {
47 | 		return map.put(hash(key), value);
48 | 	}
49 | 	
50 | 	public int addTo(String key, int amount) {
51 | 		return map.addTo(hash(key), amount);
52 | 	}
53 | 
54 | 	public int remove(String key) {
55 | 		return map.remove(hash(key));
56 | 	}
57 | 
58 | 	public void clear() {
59 | 		map.clear();
60 | 	}
61 | 
62 | 	/**
63 | 	 * Iterate the entries in this map - linear in complexity to the vocabulary
64 | 	 * size!
65 | 	 */
66 | 	public Iterator<Pair<String, Integer>> iterator() {
67 | 		return new StringIntMapIterator(this);
68 | 	}
69 | 	
70 | 	private class StringIntMapIterator implements Iterator<Pair<String, Integer>> {
71 | 		private final Iterator<String> dictiter;
72 | 		private Pair<String, Integer> next_item;
73 | 		private ApproxStringIntMap asim;
74 | 		StringIntMapIterator(ApproxStringIntMap asim) {
75 | 			this.dictiter = asim.dict.iterator();
76 | 			this.asim = asim;
77 | 		}
78 | 		
79 | 		@Override
80 | 		public boolean hasNext() {
81 | 			while (next_item == null && dictiter.hasNext()) {
82 | 				String key = dictiter.next();
83 | 				if (asim.containsKey(key))
84 | 					next_item = Pair.of(key, asim.get(key));
85 | 			}
86 | 			return next_item != null;
87 | 		}
88 | 
89 | 		@Override
90 | 		public Pair<String, Integer> next() {
91 | 			Pair<String,Integer> item = next_item;
92 | 			next_item = null;
93 | 			return item;
94 | 		}
95 | 		
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/index/Bigrams.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.index;
 2 | 
 3 | import java.io.IOException;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Paths;
 6 | import java.nio.file.StandardOpenOption;
 7 | import java.util.concurrent.ConcurrentHashMap;
 8 | import java.util.stream.Stream;
 9 | 
10 | import org.apache.log4j.Logger;
11 | 
12 | import edu.stanford.nlp.util.IterableIterator;
13 | import edu.uncc.cs.watsonsim.Passage;
14 | 
15 | /**
16 |  * Count the bigrams in all passages for entropy based scorers
17 |  * @author Sean Gallaghers
18 |  */
19 | public class Bigrams implements Segment {
20 | 	private ConcurrentHashMap<String, Integer> unigrams = new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50);
21 | 	private ConcurrentHashMap<String, Integer> bigrams = new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50);
22 | 	private final Logger log = Logger.getLogger(getClass());
23 | 	
24 | 	public Bigrams() {
25 | 	}
26 | 
27 | 	@Override
28 | 	public void close() throws IOException {
29 | 		flush();
30 | 	}
31 | 	
32 | 	public void flush() throws IOException {
33 | 		// Make space-separated lines
34 | 		Stream<String> lines = unigrams.entrySet().stream()
35 | 				.map((pair) ->
36 | 					pair.getKey() + " " + pair.getValue());
37 | 		unigrams= new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50);
38 | 		Files.write(
39 | 				Paths.get("/mnt/NCDS/sean", "unigrams"),
40 | 				new IterableIterator<String>(lines.iterator()),
41 | 				StandardOpenOption.CREATE,
42 | 				StandardOpenOption.WRITE,
43 | 				StandardOpenOption.APPEND);
44 | 		// Make space-separated lines
45 | 		lines = bigrams.entrySet().stream()
46 | 				.map((pair) ->
47 | 					pair.getKey() + " " + pair.getValue());
48 | 		bigrams =new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50);
49 | 		Files.write(
50 | 				Paths.get("/mnt/NCDS/sean", "bigrams"),
51 | 				new IterableIterator<String>(lines.iterator()),
52 | 				StandardOpenOption.CREATE,
53 | 				StandardOpenOption.WRITE,
54 | 				StandardOpenOption.APPEND);
55 | 	}
56 | 
57 | 	@Override
58 | 	public void accept(Passage t) {
59 | 		if (!t.getTokens().isEmpty()) {
60 | 			unigrams.merge(t.getTokens().get(0), 1, (a, b) -> a+b); 
61 | 		}
62 | 		for (int i=0; i < t.getTokens().size() - 1; i++) {
63 | 			String key = t.getTokens().get(i) + " " + t.getTokens().get(i+1);
64 | 			bigrams.merge(key, 1, (a, b) -> a+b);
65 | 			unigrams.merge(t.getTokens().get(i+1), 1, (a, b) -> a+b);
66 | 		}
67 | 		// Try to keep it from absorbing all available memory
68 | 		if (unigrams.size() > 1_000_000
69 | 				|| bigrams.size() > 1_000_000) {
70 | 			try {
71 | 				flush();
72 | 			} catch (IOException failed_flush) {
73 | 				log.error(failed_flush);
74 | 			}
75 | 		}
76 | 	}
77 | 
78 | }
79 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/StringStackTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import java.util.Iterator;
 6 | 
 7 | import org.junit.Test;
 8 | 
 9 | /** Unit tests for the public operations of StringStack. */
10 | public class StringStackTest {
11 | 
12 | 	/** size() counts the strings given to the constructor. */
13 | 	@Test
14 | 	public void testSize() {
15 | 		StringStack none = new StringStack();
16 | 		StringStack one = new StringStack("moo");
17 | 		StringStack two = new StringStack("foo", "bar");
18 | 		assertEquals(0, none.size());
19 | 		assertEquals(1, one.size());
20 | 		assertEquals(2, two.size());
21 | 	}
22 | 
23 | 	/** Only a stack without strings is empty. */
24 | 	@Test
25 | 	public void testIsEmpty() {
26 | 		StringStack none = new StringStack();
27 | 		StringStack one = new StringStack("moo");
28 | 		assertTrue(none.isEmpty());
29 | 		assertFalse(one.isEmpty());
30 | 	}
31 | 
32 | 	/** contains() finds stored strings and nothing else. */
33 | 	@Test
34 | 	public void testContains() {
35 | 		StringStack none = new StringStack();
36 | 		StringStack one = new StringStack("moo");
37 | 		StringStack two = new StringStack("foo", "moo");
38 | 		assertFalse(none.contains("moo"));
39 | 		assertTrue(one.contains("moo"));
40 | 		assertFalse(one.contains("foobar"));
41 | 		assertTrue(two.contains("foo"));
42 | 	}
43 | 
44 | 	/** add() grows the stack by one each time, duplicates included. */
45 | 	@Test
46 | 	public void testAdd() {
47 | 		StringStack stack = new StringStack();
48 | 		assertEquals(0, stack.size());
49 | 
50 | 		stack.add("moo");
51 | 		assertEquals(1, stack.size());
52 | 		assertFalse(stack.contains("erk"));
53 | 		assertTrue(stack.contains("moo"));
54 | 
55 | 		stack.add("moo");
56 | 		assertEquals(2, stack.size());
57 | 
58 | 		stack.add("erk");
59 | 		assertEquals(3, stack.size());
60 | 		assertTrue(stack.contains("erk"));
61 | 		assertTrue(stack.contains("moo"));
62 | 	}
63 | 
64 | 	/** clear() leaves nothing behind. */
65 | 	@Test
66 | 	public void testClear() {
67 | 		StringStack stack = new StringStack("moo");
68 | 		stack.clear();
69 | 		assertEquals(0, stack.size());
70 | 		assertFalse(stack.contains("moo"));
71 | 	}
72 | 
73 | 	/** get() answers null for indexes outside the stack. */
74 | 	@Test
75 | 	public void testGet() {
76 | 		StringStack stack = new StringStack("moo", "far");
77 | 		assertNull(stack.get(-1));
78 | 		assertEquals("moo", stack.get(0));
79 | 		assertEquals("far", stack.get(1));
80 | 		assertNull(stack.get(2));
81 | 	}
82 | 
83 | 	/** indexOf() answers -1 for absent strings and for null. */
84 | 	@Test
85 | 	public void testIndexOf() {
86 | 		StringStack stack = new StringStack("moo", "far");
87 | 		assertEquals(0, stack.indexOf("moo"));
88 | 		assertEquals(1, stack.indexOf("far"));
89 | 		assertEquals(-1, stack.indexOf("erk"));
90 | 		assertEquals(-1, stack.indexOf(null));
91 | 	}
92 | 
93 | 	/** Each iterator() call starts a fresh, in-order walk. */
94 | 	@Test
95 | 	public void testIterator() {
96 | 		StringStack stack = new StringStack("moo", "far");
97 | 		assertYieldsMooThenFar(stack.iterator());
98 | 		// Check that it's repeatable
99 | 		assertYieldsMooThenFar(stack.iterator());
100 | 	}
101 | 
102 | 	/** Expect exactly "moo", then "far", then the end of the iteration. */
103 | 	private static void assertYieldsMooThenFar(Iterator<String> walk) {
104 | 		assertTrue(walk.hasNext());
105 | 		assertEquals("moo", walk.next());
106 | 		assertTrue(walk.hasNext());
107 | 		assertEquals("far", walk.next());
108 | 		assertFalse(walk.hasNext());
109 | 	}
110 | 
111 | }
95 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/StatsDump.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.io.IOException;
 4 | import java.nio.file.Files;
 5 | import java.nio.file.Path;
 6 | import java.nio.file.Paths;
 7 | 
 8 | import static java.nio.file.StandardOpenOption.*;
 9 | 
10 | import java.nio.charset.Charset;
11 | import java.sql.Timestamp;
12 | import java.util.List;
13 | import edu.uncc.cs.watsonsim.Answer;
14 | import edu.uncc.cs.watsonsim.Environment;
15 | import edu.uncc.cs.watsonsim.Question;
16 | import edu.uncc.cs.watsonsim.Score;
17 | 
18 | import org.json.simple.*;
19 | 
20 | /**
21 |  * Researcher that appends one JSON record per question (with its answers and
22 |  * their scores) to the file data/run_log_&lt;run id&gt;. The answers are returned
23 |  * unchanged.
24 |  */
25 | public class StatsDump extends Researcher {
26 | 	// NOTE(review): jrun is filled in by the constructor but nothing in this
27 | 	// class writes it out or adds to jquestions - confirm whether it is needed.
28 | 	private JSONObject jrun = new JSONObject();
29 | 	private JSONArray jquestions = new JSONArray();
30 | 	private final Path logfile;
31 | 	
32 | 	/**
33 | 	 * Start a new run log.
34 | 	 * @param run_id  names the log file and is stored as the run's timestamp
35 | 	 * @param env  not used by this constructor
36 | 	 */
37 | 	@SuppressWarnings("unchecked")
38 | 	public StatsDump(Timestamp run_id, Environment env) {
39 | 		this.logfile = Paths.get("data/run_log_"+run_id.toString());
40 | 		
41 | 		jrun.put("timestamp", run_id.toString());
42 | 		jrun.put("questions", jquestions);
43 | 	}
44 | 	
45 | 	/**
46 | 	 * Append a question with its answers and scores to the run log.
47 | 	 * Failing to write the log never fails the pipeline; it is only reported
48 | 	 * on standard error.
49 | 	 * @return  the answers, untouched
50 | 	 */
51 | 	@SuppressWarnings("unchecked")
52 | 	@Override
53 | 	public synchronized List<Answer> question(Question q, List<Answer> answers) {
54 | 		JSONObject jquestion = new JSONObject();
55 | 		jquestion.put("text", q.text);
56 | 		jquestion.put("category", q.getCategory());
57 | 		jquestion.put("graphs", q.getGraphs().toString());
58 | 		jquestion.put("trees", q.getTrees().toString());
59 | 		jquestion.put("tokens", q.getTokens().toString());
60 | 		// defaults
61 | 		jquestion.put("correct", false);
62 | 		jquestion.put("rank", -1);
63 | 		
64 | 		JSONArray janswers = new JSONArray();
65 | 		jquestion.put("answers", janswers);
66 | 		
67 | 		// Walk from the last answer to the first, so that the "rank" written
68 | 		// last (and therefore kept) is the smallest index of a correct answer.
69 | 		for (int rank=answers.size()-1; rank>=0; rank--) {
70 | 			Answer a = answers.get(rank);
71 | 			JSONObject ja = new JSONObject();
72 | 			janswers.add(ja);
73 | 			
74 | 			ja.put("text", a.text);
75 | 			ja.put("evidence", a.explain());
76 | 			boolean correct = a.scores.get("CORRECT") > 0.99;
77 | 			ja.put("correct", correct);
78 | 			
79 | 			// Convenience attributes
80 | 			if (rank==0)
81 | 				jquestion.put("correct", correct);
82 | 			if (correct)
83 | 				jquestion.put("rank", rank);
84 | 			
85 | 			// The "scores" object used to be left empty because the scores
86 | 			// were copied into the answer object instead. Fill it now, and
87 | 			// keep the flat copy so existing readers of the log still work.
88 | 			java.util.Map<?, ?> scores = Score.asMap(a.scores);
89 | 			JSONObject jscores = new JSONObject(); 
90 | 			jscores.putAll(scores);
91 | 			ja.put("scores", jscores);
92 | 			ja.putAll(scores);
93 | 		}
94 | 		// One record per line; without the separator the file is one long run
95 | 		// of back-to-back JSON objects.
96 | 		byte[] record = (jquestion.toJSONString() + "\n").getBytes(Charset.forName("UTF-8"));
97 | 		try {
98 | 			Files.write(logfile, record, APPEND, CREATE);
99 | 		} catch (IOException e) {
100 | 			// Skip the question, but say so instead of losing it silently
101 | 			System.err.println("StatsDump: could not append to " + logfile + ": " + e);
102 | 		}
103 | 		return answers;
104 | 	}
105 | 
106 | }
85 | 


--------------------------------------------------------------------------------
/scripts/gensim/intro-1level.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 4 | from gensim import corpora, models, similarities
 5 | 
 6 | # remove common words and tokenize
 7 | stoplist = set('for a of the and to in'.split())
 8 | 
 9 | ### Create the corpus out of the documents
10 | if os.path.exists('word8-lines.short.corpus.mm'):
11 |     dictionary = corpora.Dictionary.load("word8-lines.short.dict")
12 |     corpus = corpora.MmCorpus('word8-lines.short.corpus.mm')
13 |     lsi = models.LsiModel.load('word8-lines.short.lsimodel')
14 |     index = similarities.MatrixSimilarity.load("word8-lines.short.matsim")
15 | else:
16 |     # collect statistics about all tokens
17 |     dictionary = corpora.Dictionary(line.lower().split() for line in open('word8-lines.short'))
18 |     # remove stop words and words that appear only once
19 |     stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
20 |              if stopword in dictionary.token2id]
21 |     once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
22 |     dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
23 |     dictionary.compactify() # remove gaps in id sequence after words that were removed
24 |     dictionary.save('word8-lines.short.dict')
25 |     print(dictionary)
26 | 
27 |     ### Preprocessing
28 |     class MyCorpus(object):
29 |         def __len__(self):
30 |             i=0
31 |             for line in open("word8-lines.short"):
32 |                 i += 1
33 |             return i
34 | 
35 |     	def __iter__(self):
36 |     		for line in open('word8-lines.short'):
37 |     			# assume there's one document per line, tokens separated by whitespace
38 |     			yield dictionary.doc2bow(line.lower().split())
39 | 
40 |     corpus = MyCorpus()
41 |     corpora.MmCorpus.serialize('word8-lines.short.corpus.mm', corpus) # store to disk, for later use
42 | 
43 |     ### Creating the index
44 |     tfidf = models.TfidfModel(corpus)
45 |     corpus_tfidf = tfidf[corpus]
46 |     lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300) # initialize an LSI transformation
47 |     lsi.save('word8-lines.short.lsimodel')
48 |     #corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
49 | 
50 |     index = similarities.MatrixSimilarity(lsi[corpus], num_features=300)
51 |     index.save('word8-lines.short.matsim')
52 | 
53 | ## Get a query
54 | query = raw_input("Search: ")
55 | while query:
56 |     vec = dictionary.doc2bow(query.lower().split())
57 | 
58 |     sims = index[lsi[vec]]
59 |     print(sorted(list(enumerate(sims)), key=lambda x: -x[1])[:20])
60 |     query = raw_input("Search: ")
61 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/JM_Scorer.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scorers;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.IOException;
 6 | 
 7 | import edu.uncc.cs.watsonsim.Answer;
 8 | import edu.uncc.cs.watsonsim.Passage;
 9 | import edu.uncc.cs.watsonsim.Phrase;
10 | import opennlp.tools.cmdline.parser.ParserTool;
11 | import opennlp.tools.parser.Parse;
12 | import opennlp.tools.parser.Parser;
13 | import opennlp.tools.parser.ParserFactory;
14 | import opennlp.tools.parser.ParserModel;
15 | import opennlp.tools.postag.POSModel;
16 | import opennlp.tools.postag.POSTaggerME;
17 | import opennlp.tools.tokenize.Tokenizer;
18 | import opennlp.tools.tokenize.TokenizerME;
19 | import opennlp.tools.tokenize.TokenizerModel;
20 | import opennlp.tools.util.InvalidFormatException;
21 | 
22 | /**
23 |  * Passage scorer that counts the parse-tree nodes a question and a passage
24 |  * have in common, using an OpenNLP parser model read from "en-parser.bin".
25 |  */
26 | public class JM_Scorer extends PassageScorer{
27 | 	/** Parser model, read from disk on first use and reused afterwards. */
28 | 	private static ParserModel parserModel;
29 | 
30 | 	/** Load "en-parser.bin" once, closing the stream whether or not loading succeeds. */
31 | 	private static synchronized ParserModel loadParserModel() throws IOException {
32 | 		if (parserModel == null) {
33 | 			try (FileInputStream in = new FileInputStream(new File("en-parser.bin"))) {
34 | 				parserModel = new ParserModel(in);
35 | 			}
36 | 		}
37 | 		return parserModel;
38 | 	}
39 | 
40 | 	/**
41 | 	 * Count the positions at which two parse trees carry the same label and
42 | 	 * cover the same text, descending into children pairwise.
43 | 	 * @return  the number of matching node pairs; 0 if either tree is null
44 | 	 */
45 | 	public double matchChildren(Parse pa1, Parse pa2) {
46 | 		// Check for null before touching the nodes
47 | 		if (pa1 == null || pa2 == null) {
48 | 			return 0;
49 | 		}
50 | 		double matchFound = 0;
51 | 		String p1NodeLabel = pa1.getLabel();
52 | 		if (p1NodeLabel != null && p1NodeLabel.equals(pa2.getLabel())) {
53 | 			if (pa1.getCoveredText().equals(pa2.getCoveredText())) {
54 | 				matchFound = 1;
55 | 			}
56 | 		}
57 | 		
58 | 		// Pair children up only as far as both nodes have them; indexing
59 | 		// [0] and [1] unconditionally overran the array on every leaf.
60 | 		Parse[] children1 = pa1.getChildren();
61 | 		Parse[] children2 = pa2.getChildren();
62 | 		int shared = Math.min(children1.length, children2.length);
63 | 		for (int i = 0; i < shared; i++) {
64 | 			matchFound += matchChildren(children1[i], children2[i]);
65 | 		}
66 | 		return matchFound;
67 | 	}
68 | 	
69 | 	/**
70 | 	 * A simple scorer based on the number of matches between the parse of
71 | 	 * the question and the parse of the passage.
72 | 	 * @param ca  candidate answer; the score is 0 unless the passage contains it
73 | 	 * @param q  question text
74 | 	 * @param passage  passage text
75 | 	 * @param verbose  currently unused
76 | 	 * @throws IOException if the parser model cannot be read
77 | 	 */
78 | 	public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{
79 | 		if (!passage.contains(ca)) {
80 | 			return 0;
81 | 		}
82 | 		// A new Parser per call on top of the shared model
83 | 		Parser parser = ParserFactory.create(loadParserModel());
84 | 		Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
85 | 		// This used to parse q a second time, comparing the question to itself
86 | 		Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);
87 | 		
88 | 		double score = 0;
89 | 		int pairs = Math.min(questionParse.length, passageParse.length);
90 | 		for (int i = 0; i < pairs; i++) {
91 | 			score += matchChildren(questionParse[i], passageParse[i]);
92 | 		}
93 | 		return score;
94 | 	}
95 | 	
96 | 	/**
97 | 	 * Record the structure score on the passage under the name "JM_Scorer".
98 | 	 * @return  always NaN; the score itself is stored through p.score()
99 | 	 */
100 | 	public double scorePassage(Phrase q, Answer a, Passage p) {
101 | 		try {
102 | 			// Argument order is (candidate answer, question, passage); the
103 | 			// question and answer texts used to be passed the wrong way round.
104 | 			p.score("JM_Scorer", scoreStructure(a.text, q.text, p.text, false));
105 | 		} catch (InvalidFormatException e) {
106 | 			e.printStackTrace();
107 | 		} catch (IOException e) {
108 | 			e.printStackTrace();
109 | 		}
110 | 		return Double.NaN;
111 | 	}
112 | }
73 | 


--------------------------------------------------------------------------------
/src/main/resources/public/stylesheets/index.css:
--------------------------------------------------------------------------------
  1 | /* ---- Navigation bar ---- */
  2 | 
  3 | .navbar {
  4 |     margin-bottom: 0;
  5 | }
  6 | 
  7 | .navbar-brand {
  8 |     font-family: 'Alegreya Sans', sans-serif;
  9 |     color: white;
 10 |     font-size: 225%;
 11 |     vertical-align: middle;
 12 | }
 13 | 
 14 | /* The small tag next to the brand, nudged into place and tilted */
 15 | .navbar-brand small {
 16 |     position: relative;
 17 |     top: 4px;
 18 |     right: 5px;
 19 |     display: inline-block;
 20 |     transform: rotate(-20deg);
 21 |     color: #ccc;
 22 |     font-size: 65%;
 23 | }
 24 | 
 25 | /* ---- Page and board ---- */
 26 | 
 27 | body, .jumbotron {
 28 |     background: rgb(245, 235, 207);
 29 | }
 30 | 
 31 | /* Declared after the rule above, so this background is the one the jumbotron gets */
 32 | .jumbotron {
 33 |     font-family: 'Schoolbell', cursive;
 34 |     font-size: x-large;
 35 |     color: white;
 36 |     background: #9b9;
 37 |     border: 8px solid #ddd;
 38 |     border-bottom-width: 15px;
 39 |     margin-top: 1em;
 40 |     padding: 0.25em 1em;
 41 |     min-height: 10em;
 42 | }
 43 | 
 44 | /* ---- Result list ---- */
 45 | 
 46 | #results {
 47 |     list-style-type: none;
 48 | }
 49 | 
 50 | #results .answer-text {
 51 |     font-family: 'Schoolbell', cursive;
 52 |     font-size: x-large;
 53 | }
 54 | 
 55 | #results .answer {
 56 |     display: block;
 57 |     margin-bottom: 0.2em;
 58 | }
 59 | 
 60 | #results .answer-bar {
 61 |     /* A chalk background: two end caps over a repeating bar */
 62 |     display: inline-block;
 63 |     margin-right: 1em;
 64 |     padding: 0.15em 0;
 65 |     background-color: #f7f7f7;
 66 |     background:
 67 |         url(/chalk-left-end.png) left center no-repeat,
 68 |         url(/chalk-bar.png) left center repeat-x,
 69 |         url(/chalk-right-end.png) right center no-repeat;
 70 |     background-size: 10px, 220px, 10px;
 71 | }
 72 | 
 73 | .answer-details {
 74 |     font-size: medium;
 75 |     font-family: sans-serif;
 76 | }
 77 | 
 78 | .answer-details .panel {
 79 |     color: initial;
 80 | }
 81 | 
 82 | /* ---- Console output ---- */
 83 | 
 84 | #console, .console {
 85 |     list-style-type: none;
 86 |     background-color: black;
 87 |     color: #ddd;
 88 |     font-family: monospace;
 89 | }
 90 | 
 91 | /* Not displayed by default */
 92 | #console {
 93 |     display: none;
 94 |     height: 20em;
 95 |     overflow-y: scroll;
 96 | }
 97 | 
 98 | /* ---- Columns of same height ---- */
 99 | 
100 | .row-full-height {
101 |     height: 100%;
102 | }
103 | 
104 | .col-full-height {
105 |     height: 100%;
106 |     vertical-align: middle;
107 | }
108 | 
109 | .row-same-height {
110 |     display: table;
111 |     width: 100%;
112 |     /* fix overflow */
113 |     table-layout: fixed;
114 | }
115 | 
116 | .col-xs-height {
117 |     display: table-cell;
118 |     float: none !important;
119 | }
120 | 
121 | /* The same table-cell treatment, enabled from each width breakpoint up */
122 | @media (min-width: 768px) {
123 |     .col-sm-height {
124 |         display: table-cell;
125 |         float: none !important;
126 |     }
127 | }
128 | 
129 | @media (min-width: 992px) {
130 |     .col-md-height {
131 |         display: table-cell;
132 |         float: none !important;
133 |     }
134 | }
135 | 
136 | @media (min-width: 1200px) {
137 |     .col-lg-height {
138 |         display: table-cell;
139 |         float: none !important;
140 |     }
141 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/Researcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.researchers;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import edu.uncc.cs.watsonsim.Answer;
 7 | import edu.uncc.cs.watsonsim.Log;
 8 | import edu.uncc.cs.watsonsim.Phrase;
 9 | import edu.uncc.cs.watsonsim.Question;
10 | 
11 | /**
12 |  * One stage of the research pipeline. Researchers may modify questions and
13 |  * are guaranteed to run sequentially. They return answers rather than
14 |  * doubles because scoring is not their job; one that does need to score can
15 |  * call score() itself, though a Scorer (which is parallelizable) is usually
16 |  * the better fit for that.
17 |  */
18 | abstract public class Researcher {
19 | 	/** The empty researcher: hands the answers back untouched. */
20 | 	public static final Researcher NIL = new Researcher() {
21 | 		@Override
22 | 		public List<Answer> pull(Question q, List<Answer> answers) {
23 | 			return answers;
24 | 		}
25 | 	};
26 | 	
27 | 	/** The stage that runs before this one in the research chain. */
28 | 	protected Researcher chain = NIL;
29 | 	
30 | 	/**
31 | 	 * Where this stage reports to the user. The app serves several users at
32 | 	 * once, so every pipeline has to know where its results belong.
33 | 	 */
34 | 	protected Log log = Log.NIL;
35 | 	
36 | 	/**
37 | 	 * Link stages into a pipeline. A question is then "pulled" through all
38 | 	 * of them by calling pull() on the stage this method returns.
39 | 	 * 
40 | 	 * @param output  parent log; each stage receives a child of it
41 | 	 * @param segments  the stages in running order; their chain and log
42 | 	 *                  fields are overwritten
43 | 	 * @return  the last stage, or NIL when no stages are given
44 | 	 */
45 | 	public static Researcher pipe(Log output, Researcher... segments) {
46 | 		Researcher tail = NIL;
47 | 		for (int i = 0; i < segments.length; i++) {
48 | 			Researcher stage = segments[i];
49 | 			stage.chain = tail;
50 | 			stage.log = output.kid(stage.getClass());
51 | 			tail = stage;
52 | 		}
53 | 		return tail;
54 | 	}
55 | 	
56 | 	/**
57 | 	 * Run every earlier stage of the chain on the candidates, then this one.
58 | 	 */
59 | 	public List<Answer> pull(Question q, List<Answer> candidates) {
60 | 		List<Answer> upstream = chain.pull(q, candidates);
61 | 		return question(q, upstream);
62 | 	}
63 | 
64 | 	/**
65 | 	 * Research a whole question. The default applies answer() to each
66 | 	 * candidate in turn and collects the results in a new list; override it
67 | 	 * when a stage needs to see all candidates at once.
68 | 	 * @param q  the question being researched
69 | 	 * @param candidates  the answers found so far
70 | 	 * @return  one researched answer per candidate, in the same order
71 | 	 */
72 | 	public List<Answer> question(Question q, List<Answer> candidates) {
73 | 		List<Answer> researched = new ArrayList<>();
74 | 		for (Answer candidate : candidates) {
75 | 			Answer result = answer(q, candidate);
76 | 			researched.add(result);
77 | 		}
78 | 		return researched;
79 | 	}
80 | 	
81 | 	/**
82 | 	 * Research a single answer. The default returns the answer as it came
83 | 	 * in, so stages that work on whole questions can ignore this method.
84 | 	 * @param q  the question (or phrase) the answer belongs to
85 | 	 * @param a  the answer to research
86 | 	 * @return  the answer to pass on; by default {@code a} itself
87 | 	 */
88 | 	public Answer answer(Phrase q, Answer a) {
89 | 		return a;
90 | 	}
91 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/DenseVectors.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.nlp;
 2 | 
 3 | import java.util.List;
 4 | import java.util.Optional;
 5 | import java.util.stream.Stream;
 6 | 
 7 | import edu.uncc.cs.watsonsim.KV;
 8 | 
 9 | /**
10 |  * Helpers for N-dimensional word vectors looked up in the "big-glove" table
11 |  * of the key-value store.
12 |  */
13 | public class DenseVectors {
14 | 	/** Length of every vector handled here. */
15 | 	public static final int N = 300;
16 | 	private static final KV kv = new KV();
17 | 	
18 | 	/**
19 | 	 * Possibly get a vector context for a word (otherwise an empty Optional)
20 | 	 * @param word		The word in question; null and "" give an empty Optional
21 | 	 * @return			A Optional<float[]> for that word, or Optional.empty()
22 | 	 */
23 | 	public static Optional<float[]> vectorFor(String word) {
24 | 		if (word == null || word.isEmpty()) {
25 | 			return Optional.empty();
26 | 		} else {
27 | 			return kv.get("big-glove", word).map(KV::asVector);
28 | 		}
29 | 	}
30 | 	
31 | 	/**
32 | 	 * Find the cosine similarity of two vectors.
33 | 	 * @return  the cosine of the angle between the vectors, or 0 when either
34 | 	 *          of them has zero length (the cosine is undefined there)
35 | 	 */
36 | 	public static double sim(float[] left, float[] right) {
37 | 		/*
38 | 		 *         A.T * B
39 | 		 * -----------------------
40 | 		 * sqrt(A.T*A) sqrt(B.T*B)
41 | 		 */
42 | 		assert left.length == N;
43 | 		assert right.length == N;
44 | 		double ab = 0.0, aa = 0.0, bb = 0.0;
45 | 		// With assertions off, fall back to the components both vectors have
46 | 		for (int i=0; i<Math.min(left.length, right.length); i++) {
47 | 			ab += left [i] * right[i];
48 | 			aa += left [i] * left [i];
49 | 			bb += right[i] * right[i];
50 | 		}
51 | 		if (aa == 0.0 || bb == 0.0) return 0;
52 | 		else return ab / (Math.sqrt(aa) * Math.sqrt(bb));
53 | 	}
54 | 	
55 | 	/**
56 | 	 * Tiny wrapper around sim(float[], float[]) for optional-word situations.
57 | 	 * This is pessimistic, saying that if we have never seen a word before, it
58 | 	 * is probably unrelated to everyone: a missing vector gives 0.
59 | 	 */
60 | 	public static double sim(Optional<float[]> left, Optional<float[]> right) {
61 | 		if (left.isPresent() && right.isPresent())
62 | 			return sim(left.get(), right.get());
63 | 		else
64 | 			return 0.0;
65 | 	}
66 | 	
67 | 	/**
68 | 	 * Average some vectors, as a multi-word model. This is not very meaningful
69 | 	 * and may do strange things for the semantics. (e.g. we plan to do better)
70 | 	 * @return  the component-wise mean; all zeros for an empty list
71 | 	 */
72 | 	public static float[] mean(List<float[]> vecs) {
73 | 		float[] mean = new float[N];
74 | 		int count = 0;
75 | 		for (float[] vec: vecs) {
76 | 			for (int i=0; i<N; i++) mean[i] += vec[i];
77 | 			count++;
78 | 		}
79 | 		if (count>0) for (int i=0; i<N; i++) mean[i] /= count;
80 | 		return mean;
81 | 	}
82 | 	
83 | 	/**
84 | 	 * Multiply many vectors, as a multi-word model. It can be better than mean
85 | 	 * but it's still not a syntactic parse. Works in log space: each component
86 | 	 * of the result is the mean of log|x| over the vectors, so a component
87 | 	 * that is 0 in any vector comes out as negative infinity.
88 | 	 * @return  the component-wise mean log magnitude; all zeros for an empty list
89 | 	 */
90 | 	public static float[] logproduct(List<float[]> vecs) {
91 | 		float[] logprod = new float[N];
92 | 		int count = 0;
93 | 		for (float[] vec: vecs) {
94 | 			for (int i=0; i<N; i++) logprod[i] += Math.log(Math.abs(vec[i]));
95 | 			count++;
96 | 		}
97 | 		// Same guard as mean(): dividing by a zero count turned every
98 | 		// component into NaN for an empty list.
99 | 		if (count>0) for (int i=0; i<N; i++) logprod[i] /= count;
100 | 		return logprod;
101 | 	}
102 | }
90 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/StringStack.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim.nlp;
  2 | 
  3 | import static java.nio.charset.StandardCharsets.UTF_8;
  4 | import java.util.Iterator;
  5 | 
  6 | import com.carrotsearch.hppc.ByteArrayList;
  7 | import com.carrotsearch.hppc.IntArrayList;
  8 | 
  9 | /**
 10 |  * An append-only, compact string stack. All strings are stored back to back
 11 |  * as UTF-8 in one byte list, with a second list marking where each begins.
 12 |  */
 13 | public class StringStack implements Iterable<String> {
 14 | 	/** start_byte[i] = x --> word i starts at block[x], ends at block[start_byte[i+1]]
 15 | 	 * The last element is where the free space begins. */
 16 | 	IntArrayList start_byte = IntArrayList.from(0);
 17 | 	ByteArrayList block = new ByteArrayList();
 18 | 	
 19 | 	/** Create a string stack from some existing strings */
 20 | 	public StringStack(String... xs) {
 21 | 		for (String x: xs)
 22 | 			add(x);
 23 | 	}
 24 | 	
 25 | 	/** Create a string stack from some existing strings */
 26 | 	public StringStack(Iterable<String> xs) {
 27 | 		for (String x: xs)
 28 | 			add(x);
 29 | 	}
 30 | 
 31 | 	/** How many strings are inside? */
 32 | 	public int size() {
 33 | 		// start_byte always holds one more entry than there are strings
 34 | 		return start_byte.size() - 1;
 35 | 	}
 36 | 
 37 | 	/** Is it free of strings? */
 38 | 	public boolean isEmpty() {
 39 | 		return size() == 0;
 40 | 	}
 41 | 
 42 | 	/** Does this contain string x? (O(n) - and expensive)*/
 43 | 	public boolean contains(String o) {
 44 | 		for (String x: this) {
 45 | 			if (x.equals(o)) return true;
 46 | 		}
 47 | 		return false;
 48 | 	}
 49 | 
 50 | 	/**
 51 | 	 * Add a string to the end.
 52 | 	 * @return  always true
 53 | 	 */
 54 | 	public boolean add(String e) {
 55 | 		block.add(e.getBytes(UTF_8));
 56 | 		start_byte.add(block.size());
 57 | 		return true;
 58 | 		
 59 | 	}
 60 | 
 61 | 	/** Remove all contents */
 62 | 	public void clear() {
 63 | 		start_byte.clear();
 64 | 		start_byte.add(0);
 65 | 		block.clear();
 66 | 	}
 67 | 
 68 | 	/**
 69 | 	 * Get a string by index.
 70 | 	 * @return  the string, or null when the index is out of range
 71 | 	 */
 72 | 	public String get(int index) {
 73 | 		if (0 <= index && index + 1 < start_byte.size()) {
 74 | 			int offset = start_byte.get(index);
 75 | 			int length = start_byte.get(index+1) - offset;
 76 | 			// Decode with the charset add() encoded with; the platform
 77 | 			// default garbles non-ASCII text wherever it is not UTF-8.
 78 | 			return new String(block.buffer, offset, length, UTF_8);
 79 | 		} else {
 80 | 			return null;
 81 | 		}
 82 | 	}
 83 | 
 84 | 	/**
 85 | 	 * Find string x (O(n) - and expensive)
 86 | 	 * @return  the index of the first match, or -1 if there is none
 87 | 	 */
 88 | 	public int indexOf(String o) {
 89 | 		int i = 0;
 90 | 		for (String x: this) {
 91 | 			if (x.equals(o)) return i;
 92 | 			else i++;
 93 | 		}
 94 | 		return -1;
 95 | 	}
 96 | 
 97 | 	/** Iterate the strings in the order they were added */
 98 | 	public Iterator<String> iterator() {
 99 | 		return new StringListIterator(this);
100 | 	}
101 | 	
102 | 	/** Walks a StringStack by index, from the first string to the last. */
103 | 	private class StringListIterator implements Iterator<String> {
104 | 		private int index = 0;
105 | 		private final StringStack sl;
106 | 		
107 | 		public StringListIterator(StringStack sl) {
108 | 			this.sl = sl;
109 | 		}
110 | 		
111 | 		@Override
112 | 		public boolean hasNext() {
113 | 			return index < sl.size();
114 | 		}
115 | 
116 | 		/** @throws java.util.NoSuchElementException past the last string */
117 | 		@Override
118 | 		public String next() {
119 | 			// get() answers null out of range; an Iterator has to throw
120 | 			if (!hasNext())
121 | 				throw new java.util.NoSuchElementException();
122 | 			return sl.get(index++);
123 | 		}
124 | 		
125 | 	}
126 | 
127 | }
108 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scripts/WiktionaryParser.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.scripts;
 2 | import java.io.BufferedReader;
 3 | import java.io.BufferedWriter;
 4 | import java.io.FileReader;
 5 | import java.io.FileWriter;
 6 | import java.io.IOException;
 7 | 
 8 | 
 9 | public class WiktionaryParser {
10 | 	public static void main (String[] args) throws IOException{
11 | 	String title = "";
12 | 	String def = "";
13 | 	int defNum = 0;
14 |     try(BufferedReader br = new BufferedReader(new FileReader("Test2.xml"))) {
15 |         String line = br.readLine();
16 |         FileWriter fstream = new FileWriter("out.txt");
17 |         BufferedWriter out = new BufferedWriter(fstream);
18 |         while (line != null) {
19 |         	if(line.contains("<page>")){
20 |         		defNum++;
21 |         		out.newLine();
22 |         		out.newLine();
23 |         		line = br.readLine();
24 |         		outerloop:
25 |         		while ((line.contains("</page>")) != true){
26 |         			
27 |         			if (line.contains("<title>") && (line.contains("Wiktionary") == false)){
28 |     					out.write("____________________________________");
29 |     					out.newLine();
30 |     					out.newLine();
31 |         				title = line;
32 |         				out.write("<DOC>");
33 |         				out.newLine();
34 |         				out.write("<TITLE>");
35 |         				title = title.replaceAll("<title>", "").replaceAll("</title>", "");
36 |         				title = title.trim();
37 |         				out.write(title);
38 |         				out.write("</TITLE>");
39 |         				out.newLine();
40 |         				out.write("<TEXT>");
41 |         				}else if(line.contains("<title>") && (line.contains("Wiktionary") == true)){
42 |         					defNum = 0;
43 |         				break outerloop;	
44 |         				}
45 |         			if (line.contains("# ")){
46 |         				def = line;
47 |         				def = def.replace("[", "");
48 |         				def = def.replace("]", "");
49 |         				def = def.replace("{", "");
50 |         				def = def.replace("}", "");
51 |         				out.write(def);
52 |         				out.newLine();
53 | 
54 |         			}
55 |         			if (line.contains("===Etymology===")){
56 |         				line = br.readLine();
57 |         				while(line.contains("===") != true){
58 |         					if(line.contains("*")){
59 |         					line = line.replace("[", "");
60 |         					line = line.replace("]", "");
61 |         					line = line.replace("{", "");
62 |         					line = line.replace("}", "");
63 |         					out.write(line);
64 |         					out.newLine();
65 | 
66 |         					}
67 |         					
68 |         					line = br.readLine();
69 |         				}
70 |         				out.newLine();
71 |         				
72 |         			}
73 |         				
74 |         				line = br.readLine();
75 |         				
76 |     				}
77 | 
78 |         		out.write("</TEXT>");
79 |         		out.newLine();
80 |         		out.write("</DOC>");
81 |         		}  
82 |         	line = br.readLine();
83 |         	
84 |         	}
85 |         System.out.println(defNum + " definitions exported to out.txt");
86 | 		out.close();
87 |     	}
88 |     
89 | 	
90 | 
91 | 	}
92 | }
93 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/Database.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import java.nio.FloatBuffer;
 4 | import java.sql.Array;
 5 | import java.sql.Connection;
 6 | import java.sql.DriverManager;
 7 | import java.sql.PreparedStatement;
 8 | import java.sql.ResultSet;
 9 | import java.sql.SQLException;
10 | 
11 | 
12 | public class Database {
13 | 	private static Connection conn;
14 | 	
15 | 	public Database(Configuration env) {
16 | 		try {
17 | 			//Class.forName("org.sqlite.JDBC");
18 | 		    //Properties props = new Properties();
19 | 		    //props.put("busy_timeout", "30000");
20 | 			//conn = DriverManager.getConnection("jdbc:sqlite:/mnt/NCDS/sean/06Jan2014.3.watsonsim.db", props);
21 | 			
22 | 			// JDBC's SQLite uses autocommit (So commit() is redundant)
23 | 			// Furthermore, close() is a no-op as long as the results are commit()'d
24 | 			 
25 | 			
26 | 			//Class.forName("org.postgresql.Driver");
27 | 			if (conn == null) {
28 | 				conn = DriverManager.getConnection(env.getConfOrDie("jdbc_connection_string"));
29 | 				if (backend().startsWith("SQLite")) {
30 | 					//conn.createStatement().execute("PRAGMA journal_mode = WAL;");
31 | 					//conn.createStatement().execute("PRAGMA busy_timeout = 30000;");
32 | 					//conn.createStatement().execute("PRAGMA synchronous = OFF;");
33 | 				}
34 | 			}
35 | 			//conn.createStatement().execute("PRAGMA busy_timeout = 30000;");
36 | 			//System.err.println(conn.getClass().getName());
37 | 
38 | 		} catch (SQLException e2) {
39 | 			e2.printStackTrace();
40 | 			throw new RuntimeException("Can't run without a database.");
41 | 		}
42 | 	}
43 | 	
44 | 	/** Simple wrapper for creating an SQL statement */
45 | 	public PreparedStatement prep(String sql) {
46 | 		PreparedStatement ps;
47 | 		try {
48 | 			ps = conn.prepareStatement(sql);
49 | 			ps.setFetchSize(100);
50 | 		} catch (SQLException e) {
51 | 			e.printStackTrace();
52 | 			throw new RuntimeException("Can't prepare an SQL statement \"" + sql + "\"");
53 | 		}
54 | 		return ps;
55 | 	}
56 | 	
57 | 	public void commit() {
58 | 		try {
59 | 			if (!conn.getAutoCommit()) {
60 | 				conn.commit();
61 | 			}
62 | 		} catch (SQLException e) {
63 | 			e.printStackTrace();
64 | 		}
65 | 	}
66 | 	
67 | 
68 | 	/**
69 | 	 * This is a convenience method for getting the first item after executing
70 | 	 * a prepared statement.
71 | 	 * 
72 | 	 * This is useful for statements ending in "RETURNING __;"
73 | 	 * 
74 | 	 * @param ps  The statement to run 
75 | 	 * @return  The ResultSet, moved forward one result
76 | 	 * @throws SQLException
77 | 	 */
78 | 	public ResultSet then(PreparedStatement ps) throws SQLException {
79 | 		ResultSet rs = ps.executeQuery();
80 | 		rs.next();
81 | 		return rs;
82 | 	}
83 | 
84 | 	/**
85 | 	 * A simple delegate for creating Postgres arrays
86 | 	 */
87 | 	public Array createArrayOf(String typeName, Object[] elements) {
88 | 		try {
89 | 			return conn.createArrayOf(typeName, elements);
90 | 		} catch (SQLException e) {
91 | 			e.printStackTrace();
92 | 			throw new RuntimeException("Can't create an SQL array from \"" + elements + "\"");
93 | 		}
94 | 	}
95 | 	
96 | 	public String backend() {
97 | 		return conn.getClass().getSimpleName();
98 | 	}
99 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/LuceneSearcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.ArrayList;
 5 | import java.util.List;
 6 | 
 7 | import org.apache.lucene.document.Document;
 8 | import org.apache.lucene.index.Term;
 9 | import org.apache.lucene.search.BooleanClause;
10 | import org.apache.lucene.search.BooleanQuery;
11 | import org.apache.lucene.search.IndexSearcher;
12 | import org.apache.lucene.search.PhraseQuery;
13 | import org.apache.lucene.search.ScoreDoc;
14 | import org.apache.lucene.search.TermQuery;
15 | 
16 | import edu.uncc.cs.watsonsim.Environment;
17 | import edu.uncc.cs.watsonsim.Passage;
18 | import edu.uncc.cs.watsonsim.Question;
19 | import edu.uncc.cs.watsonsim.Score;
20 | import edu.uncc.cs.watsonsim.scorers.Merge;
21 | 
22 | /**
23 |  * @author Phani Rahul
24 |  */
25 | public class LuceneSearcher extends Searcher {
26 | 	private final IndexSearcher lucene;
27 | 	
28 | 	public LuceneSearcher(Environment env) {
29 | 		super(env);
30 | 		lucene = env.lucene;
31 | 		Score.register("LUCENE_ANSWER_RANK", -1, Merge.Mean);
32 | 		Score.register("LUCENE_ANSWER_SCORE", -1, Merge.Mean);
33 | 		Score.register("LUCENE_ANSWER_PRESENT", 0.0, Merge.Sum);
34 | 	}
35 | 	
36 | 	/**
37 | 	 * Create a Lucene query using the bigrams in the given text
38 | 	 * @param text
39 | 	 */
40 | 	public BooleanQuery queryFromSkipBigrams(String text) {
41 | 		BooleanQuery q = new BooleanQuery();
42 | 		String prev_word = null;
43 | 		for (String word : text.split("\\W+")) {
44 | 			if (prev_word != null) {
45 | 				PhraseQuery pq = new PhraseQuery();
46 | 				pq.setSlop(1);
47 | 				pq.add(new Term("text", prev_word));
48 | 				pq.add(new Term("text", word));
49 | 				q.add(pq, BooleanClause.Occur.SHOULD);
50 | 			}
51 | 			q.add(new TermQuery(new Term("text", word)), BooleanClause.Occur.SHOULD);
52 | 			prev_word = word;
53 | 		}
54 | 		return q;
55 | 	}
56 | 	
57 | 	
58 | 	public List<Passage> query(Question question) {
59 | 		List<Passage> results = new ArrayList<>();
60 | 		try {
61 | 			//ScoreDoc[] hits = env.simpleLuceneQuery(question.text, MAX_RESULTS);
62 | 			ScoreDoc[] hits = lucene.search(
63 | 					queryFromSkipBigrams(
64 | 							question.text
65 | 							+ " "
66 | 							+ question.getCategory()),
67 | 					MAX_RESULTS).scoreDocs;
68 | 			// This isn't range based because we need the rank
69 | 			for (int i=0; i < hits.length; i++) {
70 | 				ScoreDoc s = hits[i];
71 | 				Document doc = lucene.doc(s.doc);
72 | 				results.add(new edu.uncc.cs.watsonsim.Passage(
73 | 						"lucene", 			// Engine
74 | 						"",	// Title - filled in by shared db
75 | 						"", // Text - filled in by shared db
76 | 						doc.get("docno"))   // Reference
77 | 						.score("LUCENE_ANSWER_RANK", (double) i)        // Rank
78 | 						.score("LUCENE_ANSWER_SCORE", (double) s.score)	// Source
79 | 						.score("LUCENE_ANSWER_PRESENT", 1.0)
80 | 						);
81 | 			}
82 | 		} catch (IOException e) {
83 | 			System.out.println("Failed to query Lucene. Is the index in the correct location?");
84 | 			e.printStackTrace();
85 | 		}
86 | 		
87 | 		// Fill any missing full text from sources
88 | 		return fillFromSources(results);
89 | 	}
90 | 
91 | }
92 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/WatsonSim.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.IOException;
 5 | import java.io.InputStreamReader;
 6 | import java.util.List;
 7 | 
 8 | import org.apache.log4j.BasicConfigurator;
 9 | import org.apache.log4j.Level;
10 | import org.apache.log4j.Logger;
11 | 
12 | 
13 | public class WatsonSim {
14 |     public static void main(String[] args) throws Exception {
15 | 
16 |         // Read a command from the console
17 |         System.out.print("Watsonsim CLI\n"
18 |         		+ "Enter any natural language question to have it answered.\n"
19 |         		+ "(Keep in mind phrasing it like Jeopardy! improves results.)\n"
20 |         		+ "Place the correct answer after a | to check an answer.\n"
21 |         		+ ">>> ");
22 | 
23 | 	    BasicConfigurator.configure();
24 | 	    Logger.getRootLogger().setLevel(Level.INFO);
25 |         prompt();
26 |     }
27 |     
28 |     private static void listAnswers(List<Answer> answers, int max) {
29 | 		for (int i=0; i<answers.size() && i < max; i++) {
30 |         	Answer answer = answers.get(i);
31 |         	System.out.println(String.format("%2d: %s", i, answer.toLongString()));
32 |         }
33 |         if (answers.size() > max) {
34 |         	System.out.println((answers.size() - max)
35 |         			+ " additional candidates are hidden.");
36 |         }
37 |     }
38 |     
39 |     private static Question readQuestion(String command) {
40 | 		if (command.contains("|")) {
41 | 			String[] parts = command.split("\\|");
42 | 			return Question.known(parts[0].trim(), parts[1].trim());
43 | 		} else {
44 |     		return new Question(command);	
45 | 		}
46 |     }
47 |     
48 |     private static void prompt() throws IOException {
49 |     	BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
50 | 	    String command = br.readLine();
51 | 	    // Defensively scroll the console so that the next error doesn't
52 | 	    // clobber the user's text.
53 | 	    System.out.println();
54 | 	    DefaultPipeline pipe = new DefaultPipeline();
55 | 	    
56 |     	while (!command.isEmpty()) {
57 |     		Question question = readQuestion(command);
58 |     		List<Answer> answers = pipe.ask(question);
59 | 	        
60 | 	        // Print out a simple one-line summary of each answer
61 | 	        listAnswers(answers, 10);
62 | 	
63 | 	        do {
64 | 		        // Read in the next command from the console
65 | 		        System.out.println("Enter \"...\" to see the hidden candidates,\n"
66 | 		        		+ "an answer index to see an explanation,\n"
67 | 		        		+ "a question to search again, or enter to quit\n>>> ");
68 | 		        command = br.readLine();
69 | 	        	if (StringUtils.isNumeric(command)) {
70 | 	        		// Explain
71 | 		        	Answer a = answers.get(Integer.parseInt(command));
72 | 	        		System.out.println("Explanation for " + a);
73 | 	        		System.out.println(a.explain());
74 | 	        	} else if (command.equals("...")) {
75 | 	        		// List all
76 | 	        		listAnswers(answers, 1000);
77 | 	        	} else {
78 | 	        		// Done with this question
79 | 	        		break;
80 | 	        	}
81 | 	        } while (true);
82 |     	}
83 |     }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/IndriSearcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import org.apache.log4j.Logger;
 7 | 
 8 | import edu.uncc.cs.watsonsim.Environment;
 9 | import edu.uncc.cs.watsonsim.Passage;
10 | import edu.uncc.cs.watsonsim.Question;
11 | import edu.uncc.cs.watsonsim.Score;
12 | import edu.uncc.cs.watsonsim.StringUtils;
13 | import edu.uncc.cs.watsonsim.scorers.Merge;
14 | import lemurproject.indri.QueryAnnotation;
15 | import lemurproject.indri.QueryEnvironment;
16 | import lemurproject.indri.ScoredExtentResult;
17 | 
18 | /**
19 |  *
20 |  * @author Phani Rahul
21 |  */
22 | public class IndriSearcher extends Searcher {
23 | 	private final QueryEnvironment q = new QueryEnvironment();
24 | 	private boolean enabled = true;
25 | 	private final Logger log = Logger.getLogger(getClass());
26 | 	private final boolean strict;
27 | 	
28 | 	/**
29 | 	 * Setup the Indri Query Environment.
30 | 	 * The "indri_index" property is the Indri index path
31 | 	 * @param config  The configuration Properties
32 | 	 */
33 | 	public IndriSearcher(Environment env, boolean strict) {
34 | 		super(env);
35 | 		this.strict = strict;
36 | 		if (env.getConfOrDie("indri_enabled") == "false") {
37 | 			enabled = false;
38 | 		} else {
39 | 			try {
40 | 				q.addIndex(env.getConfOrDie("indri_index"));
41 | 			} catch (Exception e) {
42 | 				System.out.println("Setting up the Indri index failed."
43 | 						+ " Is the index in the correct location?"
44 | 						+ " Is indri_jni included?");
45 | 				e.printStackTrace();
46 | 				enabled=false;
47 | 			}
48 | 		}
49 | 		Score.register("INDRI_ANSWER_SCORE", -1, Merge.Mean);
50 | 		Score.register("INDRI_ANSWER_RANK", -1, Merge.Mean);
51 | 		Score.register("INDRI_ANSWER_PRESENT", 0.0, Merge.Sum);
52 | 	}
53 | 	
54 | 	public List<Passage> query(Question question){
55 | 		if (!enabled) return new ArrayList<>();
56 | 		// Develop the query
57 | 		String query = q.reformulateQuery(StringUtils.sanitize(
58 |         		question.getCategory() + " " + question.text
59 |         ));
60 | 		if (strict) query = query.replaceAll("#combine", "#uw");
61 | 		log.info("Executing query " + query);
62 | 		
63 | 		ScoredExtentResult[] ser;
64 | 		QueryAnnotation aq;
65 | 		// Fetch all titles, texts
66 | 		String[] docnos;
67 | 		try {
68 | 			aq = q.runAnnotatedQuery(query, MAX_RESULTS);
69 | 			ser = aq.getResults();
70 | 			docnos = q.documentMetadata(ser, "docno");
71 | 		} catch (Exception e) {
72 | 			// If any other step fails, give a more general message but don't die.
73 | 			System.out.println("Querying Indri failed. Is the index in the correct location? Is indri_jni included?");
74 | 			e.printStackTrace();
75 | 			return new ArrayList<>();
76 | 		}
77 | 
78 | 		// Compile them into a uniform format
79 | 		List<Passage> results = new ArrayList<Passage>();
80 | 		for (int i=0; i<ser.length; i++) {
81 | 	    	results.add(new Passage(
82 |     			"indri",         	// Engine
83 |     			"",			        // Title
84 |     			"",                 // Full Text
85 | 				docnos[i])          // Reference
86 | 			.score("INDRI_ANSWER_RANK", (double) i)
87 | 			.score("INDRI_ANSWER_SCORE", ser[i].score)
88 | 			.score("INDRI_ANSWER_PRESENT", 1.0));
89 | 		}
90 | 		return fillFromSources(results);
91 | 	}
92 | 	
93 | }
94 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/Searcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.sql.PreparedStatement;
 4 | import java.sql.ResultSet;
 5 | import java.sql.SQLException;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | 
 9 | import edu.uncc.cs.watsonsim.Database;
10 | import edu.uncc.cs.watsonsim.Environment;
11 | import edu.uncc.cs.watsonsim.Passage;
12 | import edu.uncc.cs.watsonsim.Question;
13 | 
14 | /*
15 |  * This interface might change; Please be ready to accommodate the changes.
16 |  * This interface should be implemented by local search engines like 
17 |  * Indri and Lucene, when querying them. Basically, it retrieves the basic data
18 |  * from the queried result set.
19 |  */
20 | 
21 | /**
22 |  *
23 |  * @author Phani Rahul
24 |  */
25 | public abstract class Searcher {
26 | 	protected final Database db;
27 | 	protected final Environment env;
28 | 	public Searcher(Environment env) {
29 | 		this.env = env;
30 | 		db = env.db;
31 | 	}
32 | 
33 |     /**
34 |      * Runs the <i>query</i>, populating a list of ResultSets
35 |      * 
36 |      * For each ResultSet:
37 |      * <p>1: Gets the score of the document from the search result. For different
38 |      * search engines, the scoring methods are different. If the document is 
39 |      * in TREC text format or TREC web format, every {@literal<DOC></DOC>} should be
40 |      * considered as a separate document.
41 |      * <p>2: Gets the title of the document.
42 |      * <p>3: Gets the full text of the document.
43 |      *
44 |      * @param query
45 |      * @throws Exception 
46 |      */
47 |     
48 | 	public List<Passage> query(String query) {
49 | 		return new ArrayList<>();
50 | 	};
51 | 	public List<Passage> query(Question q) {
52 | 		return query(q.text);
53 | 	};
54 | 
55 |     /**
56 |      * How many results should Lucene and Indri return?
57 |      * This is also how many passages the scorers should expect.
58 |      */
59 | 
60 |     public final static int MAX_RESULTS = 10;
61 |     
62 |     
63 |     /** Fill in the missing titles and full texts from Answers using the
64 |      * sources from the relational database.
65 |      *  
66 |      * This is a no-op if the sources database is missing.
67 |      */
68 |     List<Passage> fillFromSources(List<Passage> passages) {
69 |     	List<Passage> results = new ArrayList<>();
70 |     	PreparedStatement fetcher = db.prep("SELECT title, text FROM sources WHERE reference=? or id=?;");
71 | 
72 |     	for (Passage p: passages) {
73 |     		ResultSet doc_row;
74 |     		try {
75 | 				fetcher.setString(1, p.reference);
76 | 				fetcher.setString(2, p.reference);
77 | 				doc_row = fetcher.executeQuery();
78 | 				if (doc_row.next()
79 | 						&& doc_row.getString("title") != null
80 | 						&& doc_row.getString("text") != null) {
81 | 					Passage np = new Passage(
82 | 							p.engine_name,
83 | 							doc_row.getString("title"),
84 | 							doc_row.getString("text"),
85 | 							p.reference
86 | 							);
87 | 					np.scores = p.scores.clone();
88 |                     results.add(np);
89 | 				}
90 | 			} catch (SQLException e) {
91 | 				e.printStackTrace();
92 | 				throw new RuntimeException("Failed to execute sources search. "
93 | 						+ "Missing document? docno:"+p.reference);
94 | 			}
95 |     	}
96 |     	return results;
97 |     }
98 | }
99 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/Configuration.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import java.io.File;
 4 | import java.io.FileInputStream;
 5 | import java.io.FileNotFoundException;
 6 | import java.io.IOException;
 7 | import java.io.InputStreamReader;
 8 | import java.io.Reader;
 9 | import java.util.Collections;
10 | import java.util.HashMap;
11 | import java.util.Map;
12 | import java.util.Properties;
13 | 
/**
 * Read-only view of the settings in config.properties.
 *
 * The file is looked for in the data directory and then in the current
 * working directory; when both exist, the one read last (the working
 * directory) is the one that is used.
 */
public class Configuration {

	protected final String data_path = "data/";
	public final Map<String, String> config;

	/**
	 * Load config.properties.
	 *
	 * @throws RuntimeException if the data directory is missing, or if no
	 *         config.properties could be read
	 */
	public Configuration() {
		/*
		 * Normally, wrapping a IOException with a RuntimeException is bad
		 * but if you cannot find a configuration file many bad things will
		 * happen, and basically every useful feature will fail. So you might
		 * as well just quit here.
		 */
		try {
			// Check the data path
			File f = new File(data_path);
			if (!(f.exists() && f.isDirectory())) {
				throw new IOException(data_path + " should be a directory.");
			}
			
			// Read the configuration
			Properties props = null;
			for (String prefix : new String[]{this.data_path, ""}) {
				try (Reader s = new InputStreamReader(
						new FileInputStream(prefix + "config.properties"), "UTF-8")){
					// Make it, then link it if it works.
					Properties _local_props = new Properties();
					_local_props.load(s);
					props = _local_props;
				} catch (FileNotFoundException e) {
					// This is only an error if none are found.
				}
			}
			// If it didn't link, all the reads failed.
			if (props == null) {
				throw new IOException("Failed to read config.properties in either "
						+ this.data_path
						+ " or "
						+ System.getProperty("user.dir") // CWD
						+ " You can create one by making a copy of"
						+ " config.properties.sample. Check the README as well.");
			}
			// Now make properties immutable. Copying entry by entry keeps the
			// map properly typed, with no raw cast from Properties.
			Map<String, String> m = new HashMap<>();
			for (String name : props.stringPropertyNames()) {
				m.put(name, props.getProperty(name));
			}
			this.config = Collections.unmodifiableMap(m);
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Convenience method for getting a setting.
	 * @param key The key that must exist in the properties
	 * @return The non-null String value, or else throw a RuntimeException.
	 */
	public String getConfOrDie(String key) {
		String value = config.get(key);
		if (value == null) throw new RuntimeException("Required key (" + key + ") missing from configuration file.");
		return value;
	}

	/**
	 * Get the path to a resource, ensuring it exists.
	 * This is mostly to give helpful errors and fail fast if you missed a
	 * step setting up.
	 * @param resource The relative path of the resource without leading /
	 * @return The path of the resource inside the data directory
	 */
	public String pathMustExist(String resource) {
		// Let File join the two parts: data_path already ends in a
		// separator, so concatenating another one produced "data//...".
		File target = new File(data_path, resource);
		String path = target.getPath();
		if (!target.exists()) {
			throw new RuntimeException("The data directory is missing the"
					+ " expected resource: " + path);
		}
		return path;
	}

}


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/Passage.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim;
  2 | 
  3 | import org.json.simple.JSONObject;
  4 | 
  5 | import org.apache.commons.lang3.StringEscapeUtils;
  6 | 
  7 | 
  8 | public class Passage extends Phrase {
  9 | 	// Stored Fields
 10 | 	public final String reference;
 11 | 	public final String engine_name;
 12 | 	public final String title;
 13 | 	
 14 | 	// Mutable
 15 |     public Score scores = Score.empty();
 16 |     
 17 |     /**
 18 |      * Create a new Passage
 19 |      * 
 20 |      * @param engine_name  A simple lowercase string
 21 |      * @param title
 22 |      * @param text
 23 |      * @param reference   Specific to the engine, or a URL, for later lookup
 24 |      */
 25 | 	public Passage(String engine_name, String title, String text, String reference) {
 26 | 		super(text);
 27 | 		if (engine_name == null)
 28 | 			throw new NullPointerException("Engine name cannot be null.");
 29 | 		if (title == null)
 30 | 			throw new NullPointerException("Title cannot be null.");
 31 | 		if (reference == null)
 32 | 			throw new NullPointerException("Reference cannot be null.");
 33 | 		
 34 | 		this.reference = reference;
 35 | 		this.engine_name = engine_name;
 36 | 		this.title = StringEscapeUtils.unescapeXml(title);
 37 | 	}
 38 | 	
 39 | 	// Copy constructor
 40 | 	public Passage(Passage original) {
 41 | 		this(original.engine_name, original.title, original.text, original.reference);
 42 | 		scores = original.scores.clone();
 43 | 	}
 44 |     
 45 |     /** Set the value of this Score for this passage, returning the passage.
 46 |      * 
 47 |      * The intended use is something like this:
 48 |      * new Passage(.......).score("SKIP_BIGRAM", 9.45).score("NGRAM", -1.2)
 49 |      * @param name
 50 |      * @param value
 51 |      */
 52 |     public Passage score(String name, double value) {
 53 |     	scores.put(name, value);
 54 |     	return this;
 55 |     }
 56 |     
 57 |     /** Return a JSON object with the same fields */
 58 |     public JSONObject toJSON() {
 59 | 		JSONObject jo = new JSONObject();
 60 | 		jo.put("text", text);
 61 | 		jo.put("title", title);
 62 | 		jo.put("reference", reference);
 63 | 		jo.put("engine_name", engine_name);
 64 | 		return jo;
 65 |     }
 66 | 	
 67 | 	/******************************************************
 68 | 	 * 
 69 | 	 * Autogenerated hashcode() and equals() follow
 70 | 	 * 
 71 | 	 ******************************************************/
 72 | 
 73 | 	@Override
 74 | 	public int hashCode() {
 75 | 		final int prime = 31;
 76 | 		int result = 1;
 77 | 		result = prime * result + getTokens().hashCode();
 78 | 		result = prime * result + engine_name.hashCode();
 79 | 		result = prime * result + reference.hashCode();
 80 | 		result = prime * result + text.hashCode();
 81 | 		result = prime * result + title.hashCode();
 82 | 		return result;
 83 | 	}
 84 | 
 85 | 	@Override
 86 | 	public boolean equals(Object obj) {
 87 | 		if (this == obj)
 88 | 			return true;
 89 | 		if (obj == null)
 90 | 			return false;
 91 | 		if (getClass() != obj.getClass())
 92 | 			return false;
 93 | 		Passage other = (Passage) obj;
 94 | 		if (!getTokens().equals(other.getTokens()))
 95 | 			return false;
 96 | 		else if (!engine_name.equals(other.engine_name))
 97 | 			return false;
 98 | 		else if (!reference.equals(other.reference))
 99 | 			return false;
100 | 		else if (!text.equals(other.text))
101 | 			return false;
102 | 		else if (!title.equals(other.title))
103 | 			return false;
104 | 		return true;
105 | 	}
106 | }


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/WekaTee.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim.researchers;
  2 | 
  3 | import java.io.File;
  4 | import java.io.IOException;
  5 | import java.sql.Timestamp;
  6 | import java.util.ArrayList;
  7 | import java.util.Collection;
  8 | import java.util.List;
  9 | 
 10 | import edu.uncc.cs.watsonsim.Answer;
 11 | import edu.uncc.cs.watsonsim.Question;
 12 | import edu.uncc.cs.watsonsim.Score;
 13 | import weka.core.Attribute;
 14 | import weka.core.FastVector;
 15 | import weka.core.Instance;
 16 | import weka.core.Instances;
 17 | import weka.core.converters.ArffSaver;
 18 | import weka.core.converters.Saver;
 19 | 
 20 | 
 21 | /** Pipe Answer scores to an ARFF file for Weka */
 22 | public class WekaTee extends Researcher {
 23 | 	private final static List<Score> dataset = new ArrayList<>();
 24 | 	private static ArffSaver saver;
 25 | 	private static int saved_schema_version = -1;
 26 | 	
 27 | 	
 28 | 	// Make every run unique, but overwrite between questions
 29 | 	// This way, you still get /something/ if you interrupt it
 30 | 	private final Timestamp start_time;
 31 | 	/**
 32 | 	 * Dump the training data to an ARFF file marked by the given timestamp
 33 | 	 * @param start_time
 34 | 	 */
 35 | 	public WekaTee(Timestamp start_time) {
 36 | 		this.start_time = start_time;
 37 | 	}
 38 | 
 39 | 	@Override
 40 | 	/**
 41 | 	 * Multithreaded counterpart to dump, which is synchronized
 42 | 	 */
 43 | 	public List<Answer> question(Question q, List<Answer> answers) {
 44 | 		List<Score> new_entries = new ArrayList<>();
 45 | 		for (Answer a : answers) {
 46 | 			new_entries.add(a.scores.clone());
 47 | 		}
 48 | 		
 49 | 		
 50 | 		dump(new_entries, start_time);
 51 | 		return answers;
 52 | 	}
 53 | 	
 54 | 	/** File-writing serialized counterpart to question()
 55 | 	 * 
 56 | 	 * @param new_entries	The new arrays to dump
 57 | 	 * @param start_time	The timestamp of the file to dump to
 58 | 	 */
 59 | 	private static synchronized void dump(List<Score> new_entries, Timestamp start_time) {
 60 | 		dataset.addAll(new_entries);
 61 | 		
 62 | 		Collection<String> names = Score.latestSchema();
 63 | 		try {
 64 | 			if (names.size() != saved_schema_version) {
 65 | 				dump_from_scratch(names, start_time);
 66 | 			} else {
 67 | 				// Only do a few quick updates
 68 | 				for (Score row : new_entries)
 69 | 					saver.writeIncremental(new Instance(1.0, row.getEach(names)));
 70 | 			}
 71 | 			// There are synchronization issues otherwise.
 72 | 			saver.getWriter().flush();
 73 | 		} catch (IOException e) {
 74 | 			e.printStackTrace();
 75 | 			throw new RuntimeException("Failed to write Weka Log!");
 76 | 		}
 77 | 	}
 78 | 	
 79 | 	/**
 80 | 	 *  When the score changes, rewrite the file.
 81 | 	 *  This is really rare in practice, so don't bother optimizing it.
 82 | 	 */
 83 | 	private static void dump_from_scratch(Collection<String> names, Timestamp start_time) throws IOException {
 84 | 		saved_schema_version = names.size();
 85 | 		
 86 | 		FastVector attributes = new FastVector();
 87 | 		// Answer score names
 88 | 		for (String name: names)
 89 | 			attributes.addElement(new Attribute(name));
 90 | 		Instances data = new Instances("Watsonsim captured question stream", attributes, 0);
 91 | 		
 92 | 		// Save the results to a file
 93 | 		saver = new ArffSaver();
 94 | 		saver.setStructure(data);
 95 | 		saver.setRetrieval(Saver.INCREMENTAL);
 96 | 		saver.setFile(new File("data/weka-log." + start_time + ".arff"));
 97 | 		for (Score row : dataset)
 98 | 			saver.writeIncremental(new Instance(1.0, row.getEach(names)));
 99 | 	}
100 | 
101 | }
102 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/BingSearcher.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim.search;
 2 | 
 3 | import java.io.IOException;
 4 | import java.net.URI;
 5 | import java.net.URISyntaxException;
 6 | import java.util.ArrayList;
 7 | import java.util.List;
 8 | 
 9 | import org.apache.http.client.fluent.*;
10 | import org.apache.http.client.utils.URIBuilder;
11 | import org.apache.log4j.Logger;
12 | import org.jsoup.Jsoup;
13 | import org.jsoup.nodes.Document;
14 | import org.jsoup.nodes.Element;
15 | 
16 | import edu.uncc.cs.watsonsim.Environment;
17 | import edu.uncc.cs.watsonsim.Passage;
18 | import edu.uncc.cs.watsonsim.Score;
19 | import edu.uncc.cs.watsonsim.scorers.Merge;
20 | 
21 | /**
22 |  * Internet-enabled Searcher for Bing.
23 |  * 
24 |  * You will need a Bing api key, which you can (as of the time of this writing)
25 |  * get from <a href="http://datamarket.azure.com">Microsoft</a>
26 |  * 
27 |  * Bing gives around 5000 queries per month, which means that in most cases for
28 |  * sustained development you will need to use CachingSearcher.
29 |  * 
30 |  * @see CachingSearcher
31 |  * @see privatedata.bingAPIKey
32 |  * @author Sean Gallagher
33 |  * @author Stephen Stanton
34 |  * @author D Haval
35 |  */
36 | public class BingSearcher extends Searcher {
37 | 	private final String key;
38 | 	private final Logger log = Logger.getLogger(getClass());
39 | 	public BingSearcher(Environment env) {
40 | 		super(env);
41 | 		Score.register("BING_ANSWER_RANK", -1, Merge.Mean);
42 | 		Score.register("BING_ANSWER_PRESENT", 0.0, Merge.Sum);
43 | 		key = env.getConfOrDie("bing_api_key");
44 | 	}
45 | 	
46 | 	public List<Passage> query(String query) {
47 | 		
48 | 	    URI uri = URI.create(""); // A bogus workaround for "may not have been initialized"
49 | 		try {
50 | 			uri = new URIBuilder()
51 | 				.setScheme("https")
52 | 				.setHost("api.datamarket.azure.com")
53 | 				.setPath("/Data.ashx/Bing/Search/v1/Web")
54 | 				.addParameter("Query", String.format("'%s'", query)).build(); // Should we place it in quotes?
55 | 				//.addParameter("$top", "50")
56 | 				//.addParameter("$format", "Atom").build();
57 | 		} catch (URISyntaxException e1) {
58 | 			/* This bogus block is required by Java,
59 | 			 * but strictly speaking new URIBuilder() can't actually throw
60 | 			 * this error because it has no input (so there can be no syntax
61 | 			 * error). Hence, this block is unreachable.
62 | 			 */
63 | 			e1.printStackTrace();
64 | 		}
65 | 
66 | 	    List<Passage> results = new ArrayList<Passage>();
67 | 	    try {
68 | 	    	String resp = Executor
69 | 	    		.newInstance()
70 | 				.auth(key, key)
71 | 	    		.execute(Request.Get(uri))
72 | 	    		.returnContent().asString();
73 | 	    	
74 | 	    	Document doc = Jsoup.parse(resp);
75 | 	    	List<Element> elements = doc.select("entry");
76 | 	    	// Perhaps limit to MAX_RESULTS?
77 | 		    for (int i=0; i < elements.size(); i++) {
78 | 		    	Element e = elements.get(i);
79 | 		    	
80 | 	    		results.add(new Passage(
81 |         			"bing",         	// Engine
82 |         			e.select("d|Title").text(),	        // Title
83 |         			e.select("d|Description").text(), // Full Text
84 |         			e.select("d|Url").text())          // Reference
85 |     				.score("BING_ANSWER_RANK", (double) i) // Score
86 |     				.score("BING_ANSWER_PRESENT", 1.0)
87 |     			);
88 | 	    	}
89 | 		    log.info("Retrieved " + elements.size() + " candidates from Bing.");
90 | 	    } catch (IOException e) {
91 | 	    	log.error("Error while searching with Bing. Ignoring. Details follow.");
92 | 	        log.error(e.getMessage());
93 | 	    }
94 | 	    return results;
95 | 	}
96 | }
97 | 


--------------------------------------------------------------------------------
/get_started.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Options ######################################################################
  4 | DATA_URL="https://dl.dropboxusercontent.com/u/92563044/watsonsim/data-snapshot.tar.gz"
  5 | GRADLE_URL="https://services.gradle.org/distributions/gradle-2.2.1-bin.zip"
  6 | PGBACKUP_URL="https://dl.dropboxusercontent.com/u/92563044/watsonsim/data-snapshot.pgdump"
  7 | 
  8 | ################################################################################
  9 | 
 10 | GRADLE_TARGET=`basename "$DATA_URL"`
 11 | DATA_TARGET=`basename "$DATA_URL"`
 12 | PGBACKUP_TARGET=`basename "$PGBACKUP_TARGET"`
 13 | 
 14 | install_postgres() {
 15 |   if lsb_release -a | grep -q "Ubuntu 14.10"
 16 |   then
 17 |     echo "Detected Ubuntu 14.10."
 18 |     echo "Installing dependencies for starting."
 19 |     sudo apt-get update
 20 |     sudo apt-get install postgresql-9.4
 21 |   else
 22 |     cat <<END
 23 |       This script hasn't been tested with your distribution.
 24 |       Please make sure the following are installed:
 25 |       PostgreSQL Server 9.3+
 26 | END
 27 |   fi
 28 | }
 29 | 
 30 | load_gradle() {
 31 |   # Download Gradle
 32 |   wget "$GRADLE_URL"
 33 |   unzip "$GRADLE_TARGET"
 34 | }
 35 | 
 36 | load_data() {
 37 |   DELETE=$1
 38 |   echo "Downloading archives (varies, maybe about 75GB). It will take a while."
 39 |   wget "$DATA_URL"
 40 |   echo "Decompressing data archive"
 41 |   if tar -Jxvf "$DATA_TARGET" && test $DELETE
 42 |   then
 43 |     rm "$PGBACKUP_TARGET"
 44 |   fi
 45 | }
 46 | 
 47 | restore_postgres() {
 48 |   DELETE=$1
 49 |   if $DELETE && pg_restore $2 <$PGBACKUP_TARGET
 50 |   then
 51 |     rm $PGBACKUP_TARGET
 52 |   fi
 53 | }
 54 | 
 55 | read_bool() {
 56 |   echo "$1 [Y/n]: "
 57 |   read out
 58 |   if echo $out | egrep -qi '[yt]'
 59 |   then
 60 |     return "true"
 61 |   else
 62 |     return "false"
 63 |   fi
 64 | }
 65 | 
 66 | #### main() ####################################################################
 67 | 
 68 | cat <<END
 69 |   This install script installs Watsonsim and its associated data.
 70 |   To do this, it:
 71 |     Installs PostgreSQL server using local repositories
 72 |     Downloads, compiles, installs:
 73 |       Indri, libSVM, Eclipse and Gradle
 74 |     Downloads Java dependencies using Gradle
 75 |     Makes an eclipse project
 76 |     Downloads indexes (30GB+ download, 50GB+ on disk)
 77 |     Downloads a database (unknown download, 70GB+ on disk)
 78 |     
 79 |   This install script is designed for Ubuntu and Fedora Linux.
 80 |   If you have the right dependencies, you can probably run it on other
 81 |   distributions as well. It probably won't handle others (e.g. cygwin).
 82 | END
 83 | 
 84 | read_bool "Do you want to continue?" || exit 0
 85 | 
 86 | # Ask all the questions FIRST
 87 | LOAD_GRADLE=`read_bool "Download Gradle?"`
 88 | INSTALL_POSTGRES=`read_bool "Install Postgres?"`
 89 | LOAD_POSTGRES=`read_bool "Download Database?"`
 90 | RESTORE_POSTGRES=`read_bool "Restore Database (overwrites contents)?"`
 91 | if $RESTORE_POSTGRES
 92 | then
 93 |   pg_restore --help
 94 |   cat <<END
 95 |   There are many options for restoring a database backup.
 96 |   For example, consider:
 97 |     -U username    -h host    -p port   -d database
 98 | END
 99 |   echo "The filename will be filled in automatically (as $PGBACKUP_TARGET)."
100 |   echo -n "Type in your options: pg_restore "
101 |   read PGBACKUP_OPTS
102 | fi
103 | LOAD_DATA=`read_bool "Download Indexes?"`
104 | DELETE_AFTER=`read_bool "Delete downloaded archives after uncompressing?"`
105 | 
106 | $LOAD_GRADLE && load_gradle
107 | $INSTALL_POSTGRES && install_postgres
108 | $LOAD_POSTGRES && wget "$PGBACKUP_URL"
109 | $RESTORE_POSTGRES && restore_postgres $DELETE_AFTER "$PGBACKUP_OPTS"
110 | $LOAD_DATA && load_data $DELETE_AFTER
111 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/researchers/TagLAT.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim.researchers;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Arrays;
  5 | import java.util.List;
  6 | 
  7 | import edu.stanford.nlp.util.Pair;
  8 | import edu.uncc.cs.watsonsim.Answer;
  9 | import edu.uncc.cs.watsonsim.Environment;
 10 | import edu.uncc.cs.watsonsim.Passage;
 11 | import edu.uncc.cs.watsonsim.Phrase;
 12 | import edu.uncc.cs.watsonsim.Question;
 13 | import edu.uncc.cs.watsonsim.nlp.ClueType;
 14 | import edu.uncc.cs.watsonsim.nlp.DBPediaCandidateType;
 15 | import edu.uncc.cs.watsonsim.nlp.SupportCandidateType;
 16 | import edu.uncc.cs.watsonsim.nlp.Relatedness;
 17 | 
 18 | 
 19 | public class TagLAT extends Researcher {
 20 | 	private final DBPediaCandidateType dbpedia;
 21 | 	private final Relatedness syn;
 22 | 	
 23 | 	public TagLAT(Environment env) {
 24 | 		dbpedia = new DBPediaCandidateType(env);
 25 | 		syn = new Relatedness(env);
 26 | 	}
 27 | 	
 28 | 	public List<Answer> pull(Question q, List<Answer> answers) {
 29 | 		return pull(q, answers, 0);
 30 | 	}
 31 | 	
 32 | 	public List<Answer> pull(Question q, List<Answer> answers, int depth) {
 33 | 		return question(q, chain.pull(q, answers), depth);
 34 | 	}
 35 | 	
 36 | 	
 37 | 	/**
 38 | 	 * Find the possible lexical types of a candidate, and label the answer.
 39 | 	 */
 40 | 	public List<Answer> question(Question q, List<Answer> answers, int depth) {
 41 | 		int have_any_types = 0;
 42 | 		
 43 | 		int dbpedia_types = 0;
 44 | 		int support_types = 0;
 45 | 		
 46 | 		List<Answer> suggestions = new ArrayList<>();
 47 | 		
 48 | 		for (Answer a: answers) {
 49 | 			
 50 | 			// Handle DBPedia types
 51 | 			
 52 | 			a.lexical_types = dbpedia.viaDBPedia(a.text);
 53 | 			for (String type: a.lexical_types) {
 54 | 				a.log(this, "DBPedia says it's a %s", type);
 55 | 			}
 56 | 			if (a.lexical_types.isEmpty())
 57 | 				a.log(this, "DBPedia has no type information for it.");
 58 | 			dbpedia_types += a.lexical_types.size(); 
 59 | 			
 60 | 			// Handle Support types
 61 | 			
 62 | 			for (Passage p: a.passages) {
 63 | 				List<Pair<String, String>> types = p.memo(SupportCandidateType::extract);
 64 | 				for (Pair<String, String> name_and_type : types) {
 65 | 					Phrase name = new Phrase(name_and_type.first);
 66 | 					Phrase type = new Phrase(name_and_type.second);
 67 | 					if (syn.implies(a, name)) {
 68 | 						a.log(this, "Passage %s says it's a %s.", p.reference, type);
 69 | 						a.lexical_types.add(type.text);
 70 | 						support_types++;
 71 | 					} else if (syn.implies(type, new Phrase(q.memo(ClueType::fromClue)))) {
 72 | 						Answer suggestion = new Answer(name.text);
 73 | 						suggestion.lexical_types = Arrays.asList(type.text);
 74 | 						suggestion.log(this, "Found it's a %s, while reading about %s in %s", type, a, p.reference);
 75 | 						if (!(suggestions.contains(suggestion)
 76 | 								|| answers.contains(suggestion))) {
 77 | 							log.info("Suggesting " + name);
 78 | 							suggestions.add(suggestion);
 79 | 						}
 80 | 						
 81 | 					}
 82 | 				}
 83 | 			}
 84 | 			if (!a.lexical_types.isEmpty()) have_any_types++;
 85 | 		}
 86 | 		
 87 | 		// This is the chain magic:
 88 | 		// We can pull the new suggestions through the pipeline and merge them!
 89 | 		List<Answer> new_answers = new ArrayList<>();
 90 | 		if (!suggestions.isEmpty() && depth < 3)
 91 | 			new_answers.addAll(pull(q, suggestions, depth+1));
 92 | 		new_answers.addAll(answers);
 93 | 		
 94 | 
 95 | 		//System.out.println(text + " could be any of " + types);
 96 | 		log.info("Found " + (dbpedia_types+support_types) + " types for "
 97 | 				+ have_any_types + " candidates. ("+ support_types +" by reading) "
 98 | 				+ (answers.size() - have_any_types) + " candidates are unknown.");
 99 | 		return new_answers;
100 | 	}
101 | 
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/nlp/ClueType.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim.nlp;
  2 | 
  3 | import java.util.Arrays;
  4 | import java.util.Collections;
  5 | import java.util.List;
  6 | 
  7 | import org.apache.commons.lang3.ObjectUtils;
  8 | 
  9 | import static edu.uncc.cs.watsonsim.nlp.Trees.concat;
 10 | import edu.stanford.nlp.trees.Tree;
 11 | import edu.uncc.cs.watsonsim.Configuration;
 12 | import edu.uncc.cs.watsonsim.Phrase;
 13 | 
 14 | /**
 15 |  * Detect the LAT as the noun in closest proximity to a determiner.
 16 |  */
 17 | public class ClueType {
 18 | 	
 19 | 	public ClueType(Configuration env) {
 20 | 	}
 21 | 	
 22 | 	/**
 23 | 	 * Intermediate results from LAT detection
 24 | 	 */
 25 | 	private static final class Analysis {
 26 | 		public final Tree dt, nn;	// Determiner, Noun// This is from worst to best! That way -1 is the worse-than-worst;
 27 | 		private static final List<String> DT_RANK = Arrays.asList(new String[]{
 28 | 				"those", "that", "these", "which", "what", "this"
 29 | 		});
 30 | 		public Analysis(Tree d, Tree n){
 31 | 			dt = d; nn = n;
 32 | 		}
 33 | 
 34 | 		/**
 35 | 		 * Case insensitively rank the LAT's by a predefined order
 36 | 		 */
 37 | 		public int rank() {
 38 | 			if (dt == null) return -1;
 39 | 			return DT_RANK.indexOf(concat(dt).toLowerCase());
 40 | 		}
 41 | 		
 42 | 		public boolean ok() {
 43 | 			return dt != null && nn != null;
 44 | 		}
 45 | 	}
 46 | 
 47 | 	/**
 48 | 	 * Merge two partial LAT analyses.
 49 | 	 * 1) Favor complete analyses over fragments
 50 | 	 * 2) Favor specific determiners in a specific order
 51 | 	 * @return a new immutable partial LAT analysis  
 52 | 	 */
 53 | 	private static Analysis merge(Analysis a, Analysis b) {
 54 | 		if (a.ok() && b.ok()) 	return (a.rank() < b.rank()) ? b : a;
 55 | 		else if (a.ok())		return a;
 56 | 		else if (b.ok()) 		return b; 			
 57 | 		else {
 58 | 			// Neither are viable. Merge them.
 59 | 			return new Analysis(
 60 | 					ObjectUtils.firstNonNull(a.dt, b.dt),
 61 | 					ObjectUtils.firstNonNull(a.nn, b.nn));
 62 | 		}
 63 | 	}
 64 | 	
 65 | 	
 66 | 	/**
 67 | 	 * A very simple LAT detector. It wants the lowest subtree with both a determiner and a noun
 68 | 	 */
 69 | 	private static Analysis detectPart(Tree t) {
 70 | 		switch (t.value()) {
 71 | 		case "WDT":
 72 | 		case "DT": return new Analysis(t, null);
 73 | 		case "NN":
 74 | 		case "NNS": return new Analysis(null, t);
 75 | 		default:
 76 | 			Analysis l = new Analysis((Tree) null, null);
 77 | 			// The last noun tends to be the most general
 78 | 			List<Tree> kids = t.getChildrenAsList();
 79 | 			Collections.reverse(kids);
 80 | 			for (Tree kid : kids)
 81 | 				l = merge(l, detectPart(kid));
 82 | 			return l;
 83 | 		}
 84 | 		
 85 | 	}
 86 | 	/**
 87 | 	 * Detect the LAT using a simple rule-based approach
 88 | 	 * @return The most general single-word noun LAT
 89 | 	 */
 90 | 	public static String fromClue(Phrase phrase) {
 91 | 		for (Tree t : phrase.getTrees()) {
 92 | 			Analysis lat = detectPart(t);
 93 | 			if (lat.ok() && lat.rank() >= 0) {
 94 | 				String latname = concat(lat.nn);
 95 | 				phrase.log.info("Target lexical type: " + latname); 
 96 | 				return latname;
 97 | 			} else {
 98 | 				phrase.log.info("Unknown target lexical type.");
 99 | 				return "";
100 | 			}
101 | 		}
102 | 		return "";
103 | 	}
104 | 	
105 | 	/**
106 | 	 * Detect the LAT using a simple rule-based approach
107 | 	 * This is a thin wrapper for use as a string
108 | 	 * @return The most general single-word noun LAT
109 | 	 */
110 | 	public static String fromClue(String text) {
111 | 		Phrase p = new Phrase(text);
112 | 		for (Tree t : p.getTrees()) {
113 | 			Analysis lat = detectPart(t);
114 | 			if (lat.ok() && lat.rank() >= 0) {
115 | 				return concat(lat.nn).toLowerCase();
116 | 			}
117 | 		}
118 | 		return "";
119 | 	}
120 | 	
121 | 	
122 | }
123 | 
124 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/scorers/PassageScorerOpenNLPAda.java:
--------------------------------------------------------------------------------
 1 | 
 2 | /*
 3 | * @author Adarsh
 4 | */
 5 | package edu.uncc.cs.watsonsim.scorers;
 6 | 
 7 | import java.io.IOException;
 8 | import java.util.HashMap;
 9 | import java.util.Map;
10 | 
11 | import edu.uncc.cs.watsonsim.researchers.OpenNlpTests;
12 | import opennlp.tools.parser.Parse;
13 | import opennlp.tools.util.InvalidFormatException;
14 | 
15 | public class PassageScorerOpenNLPAda {
16 | 		
17 | 	OpenNlpTests t = new OpenNlpTests();
18 | 	public double compareParseType(Parse[] pa1, Parse[] pa2, boolean verbose){
19 | 		double numMatches=0;
20 | 		Map<String, String> key1 = new HashMap<String, String>();
21 | 		for (int i=0;i<pa1.length;i++){
22 | 			key1.put(pa1[i].getType(),"y");
23 | 			//pa1h.put(key[0],"y");
24 | 		}
25 | 		for (int j=0;j<pa2.length;j++){
26 | 			String key2=pa2[j].getType();
27 | 			if (key1.containsKey(key2)){ 
28 | 				numMatches++;
29 | 				if (verbose) System.out.println("\n");
30 | 				pa2[j].show();
31 | 				if (verbose) System.out.println("type: "+pa2[j].getType());
32 | 			}
33 | 		}
34 | 		if (verbose) System.out.println("numTypeMatches "+numMatches);
35 | 		return numMatches;
36 | 	}
37 | 	
38 | 	
39 | 	public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{
40 | 		double score1=0, score2=0;
41 | 		Parse[] caParse = t.parsePassageText(ca);
42 | 		Parse[] qParse = t.parsePassageText(q);
43 | 		Parse[] pasParse = t.parsePassageText(passage);
44 | 		Parse[] caParseCh = t.getAllChildren(caParse);
45 | 		Parse[] qParseCh = t.getAllChildren(qParse);
46 | 		Parse[] pasParseCh = t.getAllChildren(pasParse);
47 | 		score1=this.compareParseType(qParseCh, pasParseCh,verbose);
48 | 		score2=this.compareParseType(caParseCh, pasParseCh,verbose);
49 | 		return score1*score2;
50 | 	}
51 | 
52 | 	//normalized scorer. 
53 | 	public double scoreStructureNorm(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{
54 | 		double score1=0, score2=0;
55 | 		//OnlpParserTest pt= new OnlpParserTest();
56 | 		Parse[] caParse = t.parsePassageText(ca);
57 | 		Parse[] qParse = t.parsePassageText(q);
58 | 		Parse[] pasParse = t.parsePassageText(passage);
59 | 		Parse[] caParseCh = t.getAllChildren(caParse);
60 | 		Parse[] qParseCh = t.getAllChildren(qParse);
61 | 		Parse[] pasParseCh = t.getAllChildren(pasParse);
62 | 		score1=this.compareParseType(qParseCh, pasParseCh,verbose);
63 | 		score2=this.compareParseType(caParseCh, pasParseCh,verbose);
64 | 		return score1*score2/passage.length();
65 | 	}
66 | 
67 | 
68 | public static void main(String[] args) throws IOException{
69 | 	String ca="Jane Austen"; 
70 | 	String qq="Jane Austen wrote Emma";
71 | 	String passage="Jane Austen was very modest about her own genius.[7] She once famously described her work as "+
72 | 			"the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " +
73 | 			"as produces little effect after much labor [7]. " +
74 | 			"Jane Austen wrote Emma."+
75 | 			"When she was a girl she wrote stories. Her works were printed only after much revision. " +
76 | 			"Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " +
77 | 			"Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " +
78 | 			"Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " +
79 | 			"a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " +
80 | 			"She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. " +
81 | 			"She had been working on a new novel, Sanditon, but she died before she could finish it.";
82 | 	PassageScorerOpenNLPAda ps = new PassageScorerOpenNLPAda(); 
83 | 	System.out.println();
84 | 	System.out.println("NormalizedScore: "+ps.scoreStructureNorm(ca,qq, passage,true));
85 | 	System.out.println("Raw Score: "+ps.scoreStructure(ca,qq, passage,true));
86 | }
87 | }
88 | 


--------------------------------------------------------------------------------
/get_started.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | ################################################################################
 4 | GRADLE_URL = "https://services.gradle.org/distributions/gradle-2.2.1-bin.zip"
 5 | ################################################################################
 6 | # Needs requests, wget
 7 | ################################################################################
 8 | import platform
 9 | import requests
10 | from   setuptools import setup, Command
11 | import shutil
12 | from   subprocess import call, check_call
13 | import sys
14 | import tarfile
15 | import urllib2
16 | import zipfile
class Download(Command):
    """setuptools command (``download``) that fetches Gradle and installs
    PostgreSQL for Watsonsim.

    The command currently prints a notice and exits before doing any work;
    the steps after that exit in ``run`` are kept ready for when it is
    enabled.
    """

    description = "Setup the Watsonsim question answering system."
    # Flags are declared here because setuptools parses the command line
    # itself and rejects options the command does not declare.
    user_options = [
        ("no-postgres", None,
         "Don't install postgresql server (which would be from the repository)."),
        ("no-gradle", None,
         "Don't download and install gradle."),
    ]
    boolean_options = ["no-postgres", "no-gradle"]

    def initialize_options(self):
        """Set the defaults: install everything."""
        self.no_postgres = False
        self.no_gradle = False

    def finalize_options(self):
        """Nothing to validate; required by the Command interface."""
        pass

    @staticmethod
    def unpack(ar, delete):
        """ Unpack an archive into the current directory.

        ar     -- path of a tar (optionally compressed) or zip archive
        delete -- remove the archive afterwards if it was unpacked
        """
        import os
        print("Unpacking %s" % ar)
        if ar.endswith(("tar", ".tar.gz", ".tgz", ".tar.bz2")):
            # tarfile.open detects the compression by itself
            archive = tarfile.open(ar)
        elif ar.endswith("zip"):
            archive = zipfile.ZipFile(ar, "r")
        else:
            print("Could not recognize file format of %s. Aborting unpack." % ar)
            return # Skip the possible delete
        try:
            archive.extractall()
        finally:
            archive.close()
        if delete:
            os.remove(ar)

    @staticmethod
    def installPostgres():
        """ Install PostgreSQL with the distribution's package manager.

        If the package manager fails, a shell is opened so the user can
        finish the installation by hand.
        """
        import os
        from subprocess import CalledProcessError
        if platform.system() != "Linux":
            print("Can only install Postgres on Linux (yet).")
            return
        # Compare case-insensitively: the capitalisation of the reported
        # distribution name is not consistent.
        dist = platform.dist()[0].lower()
        try:
            # NOTE(review): the package name "postgres-9.3" is kept from the
            # original; confirm it exists in the target repositories.
            if dist == "fedora":
                check_call("sudo yum install postgres-9.3".split())
            elif dist == "ubuntu":
                check_call("sudo apt-get install postgres-9.3".split())
            else:
                print("Don't know how to install Postgres on %r; please install it manually." % dist)
        except CalledProcessError as e:
            print(e)
            print("Opening a shell to allow you to install and setup Postgres manually.")
            print('Use "exit 1" to abort installation')
            check_call(os.environ.get("SHELL", "sh"))

    @staticmethod
    def ask(prompt):
        """ Ask a yes/no question; an empty answer counts as no. """
        answer = raw_input(prompt + " | ").strip()
        return answer[:1].lower() in ('y', 't')

    def run(self):
        """ Entry point called by setuptools for the ``download`` command. """
        import os

        print("This script is not ready yet, refer to the homepage for installation instructions.")
        sys.exit(1)

        # Everything below is unreachable until the exit above is removed.
        import wget
        if not Download.ask("Are you sure you want to start? It may take many hours and 150+ GB of disk space. "):
            sys.exit(1)

        # The theory here is to do the smallest tasks first.
        if not self.no_gradle:
            # Less than 5 minutes
            wget.download(GRADLE_URL)
            Download.unpack(os.path.basename(GRADLE_URL), True)
        if not self.no_postgres:
            # Maybe about 5 minutes
            Download.installPostgres()

        #http://apache.osuosl.org/jena/binaries/jena-fuseki-1.1.1-distribution.tar.gz
        #java -cp jena-fuseki-1.1.1/fuseki-server.jar tdb.tdbloader --tdb=jena-lucene.ttl *.owl *.nt
        #java -cp jena-fuseki-1.1.1/fuseki-server.jar jena.textindexer --desc=../jena-lucene.ttl
82 | 
# Package metadata. `python get_started.py download` runs the Download command.
setup(
    name="Watsonsim Question Answering System",
    version="0.5",
    url="http://github.com/SeanTater/uncc2014watsonsim",
    author="Sean Gallagher",
    author_email="stgallag@gmail.com",
    cmdclass={"download": Download},
    # Needed by this script itself
    setup_requires=['wget>=2.2', 'requests>=2.2.1'],
    # Needed by the installed system
    install_requires=['psycopg2>=2.4.5'],
)
98 | 


--------------------------------------------------------------------------------
/src/test/java/edu/uncc/cs/watsonsim/QClassDetectionTest.java:
--------------------------------------------------------------------------------
 1 | package edu.uncc.cs.watsonsim;
 2 | 
 3 | import static org.junit.Assert.*;
 4 | 
 5 | import org.junit.Test;
 6 | 
 7 | import edu.uncc.cs.watsonsim.QType;
 8 | import edu.uncc.cs.watsonsim.Question;
 9 | 
10 | public class QClassDetectionTest {
11 | 
12 | 	@Test
13 | 	public void test() {
14 | 		Question[] questions = {
15 | 				Question.known("In book 1, \"The ___ Hat\"","sorting","HARRY POTTER & TME CHAPTER TITLES"),
16 | 				Question.known("In book 4, \"The ___ World Cup\"","quidditch","HARRY POTTER & TME CHAPTER TITLES"),
17 | 				Question.known("In book 6, her \"helping hand\"","Hermione","HARRY POTTER & TME CHAPTER TITLES"),
18 | 				Question.known("Plant a Smooch on Yours Truly, Katharine","Kiss Me, Kate","MUSICALS BY ANY OTHER NAME"),
19 | 				Question.known("The 2 digits that give James Bond license to kill","0","BY THE NUMBERS"),
20 | 				Question.known("Of 1.8, 2.5, or 3.7 hours per home, the average time PBS is viewed each week in U.S.","1.8","PBS"),
21 | 				Question.known("Gary glitter: \"Rock and Roll _____ _____\"","Part 2","LET'S ROCK!"),
22 | 				Question.known("Simple Abundance by Sarah Ban Breathnach has this many messages for women, one for each day in 1996","366","IN THE BOOKSTORE"),
23 | 				//TODO: fix bug: the constructor for these incorrectly use the second parameter is an answer; however that
24 | 				//      functionality is used by another method (JSONQuestionSource(Reader)).
25 | 				Question.known("Kimono, caftan, bath-","FASHIONABLE COMMON BONDS"),
26 | 				Question.known("Ontario,Havasu,Baikal","COMMON BONDS"),
27 | 				Question.known("Trash, a boyfriend you're sick of, goods or securities sold below costs","COMMON BONDS"),
28 | 				Question.known("green crested, collared, anole","Beastly Common Bonds"),
29 | 				Question.known("Later jailed for fraud, Australian Alan Bond became a national hero for financing the 1983 capture of this sailing trophy","UNCOMMON BONDS"),
30 | 				Question.known("Rolled, steelcut, Scotch","EDIBLE COMMON BONDS"),
31 | 				Question.known("Nursery rhyme waterspout crawler who's a Marvel crime fighter","BEFORE & AFTER"),
32 | 				Question.known("Nursery rhyme waterspout crawler who's a Marvel crime fighter","Before & After"),
33 | 				Question.known("This man succeeded John Carver as governor of Plymouth Colony in 1621 & served for 31 of the next 35 years", "AMERICA BEFORE THE REVOLUTION"),
34 | 				Question.known("John Milton epic about Gertrude Stein's Parisian expatriate Yanks who were born starting in 1965", "BEFORE, DURING, & AFTER"),
35 | 				Question.known("Gray", "INDIANAGRAMS"),
36 | 				Question.known("The king is dead, long \"lives\" the king", "  MUSICAL ANAGRAMS"),
37 | 				Question.known("Anthem ender: BEHAVE HOME FORT", "ANAGRAMS"),
38 | 				Question.known("Lose", "SCRAMBLED FISH"),
39 | 				Question.known("", ""),
40 | 				Question.known("He not only wrote & directed \"Little Johnny Jones\", he also played the title role", "QUOTATION")
41 | 			};
42 | 		
43 | 			QType[] labels = {
44 | 				QType.FITB,
45 | 				QType.FITB,
46 | 				QType.FACTOID,
47 | 				QType.FACTOID,
48 | 				QType.FACTOID,
49 | 				QType.FACTOID,
50 | 				QType.FITB,
51 | 				QType.FACTOID,
52 | 				QType.COMMON_BONDS,
53 | 				QType.COMMON_BONDS,
54 | 				QType.COMMON_BONDS,
55 | 				QType.COMMON_BONDS,
56 | 				QType.COMMON_BONDS,
57 | 				QType.COMMON_BONDS,
58 | 				QType.BEFORE_AND_AFTER,
59 | 				QType.BEFORE_AND_AFTER,
60 | 				QType.BEFORE_AND_AFTER,
61 | 				QType.BEFORE_AND_AFTER,
62 | 				QType.ANAGRAM,
63 | 				QType.ANAGRAM,
64 | 				QType.ANAGRAM,
65 | 				QType.ANAGRAM,
66 | 				QType.FACTOID,
67 | 				QType.QUOTATION
68 | 			};
69 | 			
70 | 			int missed = 0;
71 | 			for (int i=0; i<questions.length; i++) {
72 | 				try {
73 | 					assertEquals(labels[i], questions[i].getType());
74 | 				} catch (java.lang.AssertionError ae) {
75 | 					System.out.println("Failed to correctly categorize " + questions[i].text + " as " + labels[i] + "; incorrect type: " + questions[i].getType());
76 | 					missed++;
77 | 				}
78 | 			}
79 | 			assertTrue(missed * 4 < questions.length);
80 | 		}
81 | }
82 | 


--------------------------------------------------------------------------------
/src/main/java/edu/uncc/cs/watsonsim/search/Anagrams.java:
--------------------------------------------------------------------------------
  1 | package edu.uncc.cs.watsonsim.search;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.IOException;
  5 | import java.io.InputStreamReader;
  6 | import java.nio.file.Files;
  7 | import java.nio.file.Paths;
  8 | import java.util.ArrayList;
  9 | import java.util.Arrays;
 10 | import java.util.HashMap;
 11 | import java.util.List;
 12 | import java.util.Map;
 13 | import java.util.regex.Matcher;
 14 | import java.util.regex.Pattern;
 15 | 
 16 | import edu.uncc.cs.watsonsim.Environment;
 17 | import edu.uncc.cs.watsonsim.Log;
 18 | import edu.uncc.cs.watsonsim.Passage;
 19 | import edu.uncc.cs.watsonsim.Score;
 20 | import edu.uncc.cs.watsonsim.scorers.Merge;
 21 | 
 22 | public class Anagrams extends Searcher {
 23 | 
 24 | 	private final Map<String, List<String>> mp = new HashMap<>();
 25 | 	private Log log;
 26 | 
 27 | 	public Anagrams(Environment env)  {
 28 | 		super(env);
 29 | 		log = env.log.kid(getClass());
 30 | 		try
 31 | 		{
 32 | 		for (String line : Files.readAllLines(Paths.get("data", "words"))) {
 33 | 			// condition of different anagram questions:
 34 | 			// usually anagram questions are coming for word coming after :
 35 | 			// regular expression for searching if a : is coming in the question
 36 | 
 37 | 			char[] charArray = line.toLowerCase().toCharArray();
 38 | 			Arrays.sort(charArray);
 39 | 			String source = String.valueOf(charArray);
 40 | 			List<String> targets = mp.get(source);
 41 | 			if (targets == null) {
 42 | 				targets = new ArrayList<>();
 43 | 				mp.put(source, targets);
 44 | 			}
 45 | 			targets.add(line);
 46 | 		}
 47 | 		}
 48 | 		catch(IOException e)
 49 | 		{
 50 | 			e.printStackTrace();
 51 | 		}
 52 | 		Score.register("IS_ONLY_ANAGRAM", 0.0, Merge.Min);
 53 | 	}
 54 | 
 55 | 	public static void main(String args[]) throws IOException {
 56 | 		Anagrams ta = new Anagrams(new Environment());
 57 | 		System.out.println("Enter the Jeopardy Anagram Question:");
 58 | 		BufferedReader br2 = new BufferedReader(
 59 | 				new InputStreamReader(System.in));
 60 | 		String question = br2.readLine();
 61 | 		ta.query(question);
 62 | 	}
 63 | 
 64 | 	public static List<String> search_key(String keys,Map<String, List<String>> mp) 
 65 | 	{
 66 | 		char[] charArray = keys.toLowerCase().toCharArray();
 67 | 		Arrays.sort(charArray);
 68 | 		// String searchKey = String.valueOf(charArray);
 69 | 		List<String> entries = mp.get(String.valueOf(charArray));
 70 | 		if (entries == null)
 71 | 		{
 72 | 			entries = new ArrayList<>();
 73 | 		}
 74 | 		entries.remove(keys);
 75 | 		return entries;
 76 | 	}
 77 | 
 78 | 	@Override
 79 | 	public List<Passage> query(String query) {
 80 | 		// Some anagrams come in a very clear syntax:
 81 | 		//    either in quotes, or after a colon. Find them.
 82 | 		Matcher matcher = Pattern.compile("\"([A-z ]+)\"|: ([A-z ]+)")
 83 | 				.matcher(query);
 84 | 		
 85 | 		List<String> entries = new ArrayList<>();
 86 | 		if (matcher.find() && matcher.group(1) != null) {
 87 | 			// Good news. We found a quoted string to generate anagrams from.
 88 | 			entries.addAll(search_key(matcher.group(1), mp));
 89 | 			if (!entries.isEmpty()) {
 90 | 				log.info("Found " + entries.size()
 91 | 						+ " quoted anagrams");	
 92 | 			}
 93 | 		} else {
 94 | 			// Bad news. We have to guess all the words.
 95 | 			String[] words = query.split(" ");
 96 | 			if (words.length <= 2) {
 97 | 				// When there are so few words, the whole question is likely 
 98 | 				// an anagram. For example, "Nuke Air" -> "Ukariane"
 99 | 				entries.addAll(search_key(query.replace(" ", ""), mp));
100 | 			} else {
101 | 				// Otherwise, consider each word separately.
102 | 				for (String word : words) {
103 | 					entries.addAll(search_key(word, mp));
104 | 				}
105 | 			}
106 | 		}
107 | 		
108 | 		entries.removeAll(Arrays.asList("Si","shit","Ni"));
109 | 		
110 | 		List<Passage> results = new ArrayList<>();
111 | 		for (String text : entries) {
112 | 			results.add(new edu.uncc.cs.watsonsim.Passage("lucene", // Engine
113 | 					text, // Title
114 | 					text, // Text
115 | 					"anagram:" + text).score("IS_ONLY_ANAGRAM", 1.0));
116 | 
117 | 		}
118 | 		
119 | 		
120 | 		
121 | 		return results;
122 | 	}
123 | }
124 | 


--------------------------------------------------------------------------------