├── lib └── indri_jni.dll.win64 ├── src ├── main │ ├── resources │ │ └── public │ │ │ ├── chalk-bar.png │ │ │ ├── chalk-left-end.png │ │ │ ├── chalk-right-end.png │ │ │ ├── scripts │ │ │ ├── index.js │ │ │ └── query.js │ │ │ └── stylesheets │ │ │ └── index.css │ ├── java │ │ ├── edu │ │ │ └── uncc │ │ │ │ └── cs │ │ │ │ └── watsonsim │ │ │ │ ├── scripts │ │ │ │ ├── package-info.java │ │ │ │ └── WiktionaryParser.java │ │ │ │ ├── datapreparation │ │ │ │ └── KingJamesBible.java │ │ │ │ ├── scorers │ │ │ │ ├── Merge.java │ │ │ │ ├── Scorer.java │ │ │ │ ├── PassageCount.java │ │ │ │ ├── AnswerLength.java │ │ │ │ ├── QuestionID.java │ │ │ │ ├── LuceneEcho.java │ │ │ │ ├── AnswerInPassage.java │ │ │ │ ├── LATMentions.java │ │ │ │ ├── package-info.java │ │ │ │ ├── WShalabyScorer.java │ │ │ │ ├── AnswerInQuestionScorer.java │ │ │ │ ├── PassageQuestionLengthRatio.java │ │ │ │ ├── Correct.java │ │ │ │ ├── TopPOS.java │ │ │ │ ├── QAKeywordMatch.java │ │ │ │ ├── QPKeywordMatch.java │ │ │ │ ├── AnswerScorer.java │ │ │ │ ├── WordProximity.java │ │ │ │ ├── EntropyTest.java │ │ │ │ ├── GloveAnswerQuestionContext.java │ │ │ │ ├── SkipBigram.java │ │ │ │ ├── PercentWordsInCommon.java │ │ │ │ ├── WPPageViews.java │ │ │ │ ├── GloveAnswerQuestionContextTest.java │ │ │ │ ├── CommonConstituents.java │ │ │ │ ├── NGram.java │ │ │ │ ├── DateMatches.java │ │ │ │ ├── Entropy.java │ │ │ │ ├── NamedEntityRecognizerScorer.java │ │ │ │ ├── PassageTermMatch.java │ │ │ │ ├── AnswerPOS.java │ │ │ │ ├── LATCheck.java │ │ │ │ ├── ElliotMerschScorer.java │ │ │ │ ├── StephensonOpenNLPScorer.java │ │ │ │ ├── PassageScorer.java │ │ │ │ ├── JM_Scorer.java │ │ │ │ └── PassageScorerOpenNLPAda.java │ │ │ │ ├── QType.java │ │ │ │ ├── index │ │ │ │ ├── Segment.java │ │ │ │ ├── Indri.java │ │ │ │ ├── Lucene.java │ │ │ │ └── Bigrams.java │ │ │ │ ├── nlp │ │ │ │ ├── Weighted.java │ │ │ │ ├── RelatednessTest.java │ │ │ │ ├── DenseVectorsTest.java │ │ │ │ ├── Redirects.java │ │ │ │ ├── ApproxStringIntMapTest.java │ │ 
│ │ ├── ApproxStringIntMap.java │ │ │ │ ├── StringStackTest.java │ │ │ │ ├── DenseVectors.java │ │ │ │ ├── StringStack.java │ │ │ │ └── ClueType.java │ │ │ │ ├── researchers │ │ │ │ ├── Normalize.java │ │ │ │ ├── package-info.java │ │ │ │ ├── HyphenTrimmer.java │ │ │ │ ├── PassageRetrieval.java │ │ │ │ ├── MergeByText.java │ │ │ │ ├── AnswerTrimming.java │ │ │ │ ├── RedirectSynonyms.java │ │ │ │ ├── MergeByCommonSupport.java │ │ │ │ ├── StrictFilters.java │ │ │ │ ├── PersonRecognition.java │ │ │ │ ├── MergeAnswers.java │ │ │ │ ├── URLExpander.java │ │ │ │ ├── StatsDump.java │ │ │ │ ├── Researcher.java │ │ │ │ ├── WekaTee.java │ │ │ │ └── TagLAT.java │ │ │ │ ├── KVTest.java │ │ │ │ ├── search │ │ │ │ ├── CachingSearcher.java │ │ │ │ ├── LucenePassageSearcher.java │ │ │ │ ├── MeanDVSearchTest.java │ │ │ │ ├── LuceneSearcher.java │ │ │ │ ├── IndriSearcher.java │ │ │ │ ├── Searcher.java │ │ │ │ ├── BingSearcher.java │ │ │ │ └── Anagrams.java │ │ │ │ ├── DBQuestionSource.java │ │ │ │ ├── WebFrontend.java │ │ │ │ ├── Log.java │ │ │ │ ├── KV.java │ │ │ │ ├── Question.java │ │ │ │ ├── Database.java │ │ │ │ ├── WatsonSim.java │ │ │ │ ├── Configuration.java │ │ │ │ └── Passage.java │ │ └── privatedata │ │ │ └── UserSpecificConstants.java.sample │ ├── parse.rules │ ├── parse.pl │ └── scala │ │ └── scripts │ │ └── BigramBigramIndexer.scala └── test │ └── java │ └── edu │ └── uncc │ └── cs │ └── watsonsim │ ├── QuestionResultsScorerTest.java │ ├── AnswerMergeTest.java │ ├── StringUtilsTest.java │ ├── ReindexEdgesTest.java │ ├── TypeDetectionTest.java │ ├── DateMatchesTest.java │ ├── CoreNLPSentenceSimilarityTest.java │ └── QClassDetectionTest.java ├── .travis.yml ├── .gitignore ├── scripts ├── populate_semantic_graph.py ├── gensim │ ├── intro1.py │ ├── scatter.py │ ├── digestion.py │ ├── import_glove.py │ ├── vstore.py │ ├── analogy.py │ └── intro-1level.py ├── import_trec.py ├── convert_arff_to_leveldb.py ├── create.sql └── svm_graph.py ├── config.properties.sample ├── 
get_started.sh └── get_started.py /lib/indri_jni.dll.win64: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/lib/indri_jni.dll.win64 -------------------------------------------------------------------------------- /src/main/resources/public/chalk-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-bar.png -------------------------------------------------------------------------------- /src/main/resources/public/chalk-left-end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-left-end.png -------------------------------------------------------------------------------- /src/main/resources/public/chalk-right-end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanTater/uncc2014watsonsim/HEAD/src/main/resources/public/chalk-right-end.png -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - openjdk7 4 | 5 | install: 6 | - TERM=dumb gradle -Ptarget assemble 7 | 8 | script: 9 | - TERM=dumb gradle -Ptarget --info check 10 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scripts/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Scripts intended to ease development with watsonsim 3 | */ 4 | /** 5 | * @author Sean Gallagher 6 | * 7 | */ 8 | package edu.uncc.cs.watsonsim.scripts; 
-------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/datapreparation/KingJamesBible.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.datapreparation; 2 | 3 | public class KingJamesBible { 4 | 5 | public static void main(String[] args) { 6 | 7 | } 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/Merge.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | /** 4 | * These are the ways to merge a score. 5 | * @author Sean 6 | * 7 | */ 8 | public enum Merge { 9 | Mean, Or, Min, Max, Sum 10 | } 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | build/ 3 | .gradle/ 4 | *.jar 5 | # Package Files # 6 | *.jar 7 | *.war 8 | *.ear 9 | .classpath 10 | .project 11 | .settings/* 12 | bin/* 13 | data/* 14 | lib/indri_jni.dll 15 | lib/indri_jni.so 16 | src/main/java/privatedata/UserSpecificConstants.java 17 | /bin 18 | config.properties 19 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/Scorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Question; 7 | 8 | public interface Scorer { 9 | public void scoreQuestion(Question q, List answers); 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/QType.java: -------------------------------------------------------------------------------- 1 | 
package edu.uncc.cs.watsonsim; 2 | 3 | /** 4 | * Enum representing the QType of the Question 5 | * 6 | * @author Ken Overholt 7 | * 8 | */ 9 | public enum QType { 10 | FACTOID, 11 | FITB, 12 | COMMON_BONDS, 13 | BEFORE_AND_AFTER, 14 | ANAGRAM, 15 | QUOTATION 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/PassageCount.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | import edu.uncc.cs.watsonsim.Answer; 3 | import edu.uncc.cs.watsonsim.Question; 4 | 5 | 6 | /** 7 | * @author Sean Gallagher 8 | */ 9 | public class PassageCount extends AnswerScorer { 10 | public double scoreAnswer(Question q, Answer a) { 11 | return a.passages.size(); 12 | } 13 | } 14 | 15 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/index/Segment.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.index; 2 | 3 | import java.io.Closeable; 4 | import java.util.function.Consumer; 5 | 6 | import edu.uncc.cs.watsonsim.Passage; 7 | 8 | /** 9 | * A Segment is a part of the Indexing pipeline 10 | * It is just the union of Closeable and Consumer 11 | */ 12 | public interface Segment extends Closeable, Consumer { 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/Weighted.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | /** 4 | * Simple immutable wrapper to express weight or probability 5 | * @author Sean Gallagher 6 | * 7 | * @param 8 | */ 9 | public class Weighted { 10 | public final T item; 11 | public final double weight; 12 | public Weighted(T item, double weight) { 13 | this.item = item; 14 | this.weight = weight; 15 | } 
16 | } 17 | -------------------------------------------------------------------------------- /src/main/parse.rules: -------------------------------------------------------------------------------- 1 | [(?noun_r urn:sent:type ?type_r) <- 2 | (?type_r urn:sent:nsubj ?noun_r), 3 | (?type_r urn:sent:det ?det), 4 | (?type_r urn:sent:cop ?cop) 5 | // (?type_r urn:sent:tag urn:sent:NN) 6 | // (?type_r urn:sent:idx ?type_idx) 7 | // (?noun_r urn:sent:idx ?noun_idx) 8 | ] 9 | [(?noun_r urn:sent:type ?type_other), 10 | (?type_other urn:sent:conj_and ?type_r) -> 11 | (?noun_r urn:sent:type ?type_r) 12 | ] 13 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerLength.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Question; 5 | 6 | /** 7 | * Return the length of the candidate text in chars. 
8 | * @author Sean Gallagher 9 | */ 10 | public class AnswerLength extends AnswerScorer { 11 | 12 | public double scoreAnswer(Question q, Answer a) { 13 | return a.text.length(); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/Normalize.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Question; 7 | import edu.uncc.cs.watsonsim.Score; 8 | 9 | public class Normalize extends Researcher { 10 | 11 | @Override 12 | public List question(Question q, List candidates) { 13 | return Score.normalizeGroup(candidates); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Question transformation functions intended to improve later scoring. 3 | * Researchers can modify the question in unspecified ways, and thus the order 4 | * of execution of researchers matters. 5 | * Try to pick a more structured way of modifying the question if applicable. 6 | * For example, scoring should use a Scorer. 
7 | * 8 | * @author Sean Gallagher 9 | */ 10 | package edu.uncc.cs.watsonsim.researchers; -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/QuestionID.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Question; 5 | 6 | /** 7 | * A bogus scorer whose purpose is to collate answers to the same question 8 | * @author Sean Gallagher 9 | */ 10 | public class QuestionID extends AnswerScorer { 11 | 12 | @Override 13 | public double scoreAnswer(Question q, Answer a) { 14 | return q.text.hashCode(); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /scripts/populate_semantic_graph.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import leveldb 3 | ldb = leveldb.LevelDB("data/edges-leveldb") 4 | sdb = sqlite3.connect("sources.db") 5 | block=[] 6 | for k, v in ldb.RangeIter(): 7 | block.append(k.decode("utf8").split("\t", 2) + [int(v.decode("utf8"))]) 8 | if len(block) > 1000000: 9 | s = sdb.executemany("INSERT INTO semantic_graph(source, tag, target, count) VALUES (?, ?, ?, ?);", block); 10 | print('.', end='') 11 | block=[] 12 | sdb.commit() -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/LuceneEcho.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Passage; 5 | import edu.uncc.cs.watsonsim.Phrase; 6 | 7 | /** 8 | * Take advantage of the Scorer dimension reduction for Lucene passages 9 | */ 10 | public class LuceneEcho extends PassageScorer { 11 | 12 | @Override 13 | public double scorePassage(Phrase q, 
Answer a, Passage p) { 14 | return p.scores.get("LUCENE_SCORE"); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerInPassage.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | /* 4 | * Author: Chris Stephenson 5 | * later rewritten by Sean 6 | */ 7 | 8 | import edu.uncc.cs.watsonsim.Answer; 9 | import edu.uncc.cs.watsonsim.Passage; 10 | import edu.uncc.cs.watsonsim.Phrase; 11 | 12 | public class AnswerInPassage extends PassageScorer { 13 | @Override 14 | public double scorePassage(Phrase q, Answer a, Passage p) 15 | { 16 | return p.text.contains(a.text) ? 17 | 1 : 0; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/LATMentions.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Question; 5 | import edu.uncc.cs.watsonsim.scorers.AnswerScorer; 6 | 7 | /** 8 | * Return how many unique LAT's there are for an answer. 9 | * @author Sean 10 | * 11 | */ 12 | public class LATMentions extends AnswerScorer { 13 | 14 | @Override 15 | public double scoreAnswer(Question q, Answer a) { 16 | return a.lexical_types.size(); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Text analyzers, for differentiating passages to improve ranking. 3 | * 4 | * Scorers measure some aspect of the answer or passage, possibly in relation 5 | * to the question. Every scorer must return a primitive double. 6 | *

7 | * Remember that the purpose of a scorer is not to provide a perfect rank on 8 | * its own, only to differentiate "good" and "bad" passages in some meaningful 9 | * way. As such, the scale and sign are not very important. 10 | */ 11 | package edu.uncc.cs.watsonsim.scorers; -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/HyphenTrimmer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Phrase; 5 | 6 | 7 | /** Trim any text from before the hyphen in the candidate text of an answer */ 8 | public class HyphenTrimmer extends Researcher { 9 | 10 | public Answer answer(Phrase q, Answer a) { 11 | String[] improved_answer_parts = a.text.split("[-:(|]"); 12 | 13 | if (improved_answer_parts.length>0) { 14 | return a.withText(improved_answer_parts[0].trim()); 15 | } 16 | return a; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/parse.pl: -------------------------------------------------------------------------------- 1 | is_type_statement(A, B) :- 2 | (nsubj(A, B), cop(B, _), det(B, _)); 3 | (nsubj(B, A), cop(B, _), det(B, _)). 4 | 5 | /* A plain type */ 6 | type_a(Name, Type) :- 7 | is_type_statement(Name, Type), 8 | det(Type, _). 9 | 10 | /* A type with subject conjunctions */ 11 | type_b(Name, Type) :- 12 | type_a(Name, Type) ; 13 | (type_a(AnotherName, Type), 14 | conj_and(AnotherName, Name)). 15 | 16 | /* A type with subject or type conjunctions */ 17 | type_c(Name, Type) :- 18 | type_b(Name, Type) ; 19 | (type_b(Name, AnotherType), 20 | conj_and(AnotherType, Type)).
21 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/WShalabyScorer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * @author Walid Shalaby 4 | */ 5 | 6 | package edu.uncc.cs.watsonsim.scorers; 7 | 8 | import edu.uncc.cs.watsonsim.Answer; 9 | import edu.uncc.cs.watsonsim.Passage; 10 | import edu.uncc.cs.watsonsim.Phrase; 11 | 12 | public class WShalabyScorer extends PassageScorer { 13 | 14 | @Override 15 | /** Detect if the question matches the answer, score it appropriately 16 | * This is to ease machine learning*/ 17 | // TODO: Don't reassign for every passage 18 | public double scorePassage(Phrase q, Answer a, Passage p) { 19 | return 0.0; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/resources/public/scripts/index.js: -------------------------------------------------------------------------------- 1 | /*$(function() { 2 | $("#search").ajaxForm({ 3 | beforeSubmit: function() { 4 | $("#note").text("Asking learned grand-masters for insight."); 5 | return true; 6 | }, 7 | success: function(response) { 8 | $("#note").empty(); 9 | $("#results").empty(); 10 | response.answers.forEach(function(item) { 11 | var x = $("

  • "+item.title+"
  • "); 12 | x[0].style.background = "linear-gradient(#4FA5C2 " + 100 * item.score + ", #C8DAE0 " + 100 * item.score + ")"; 13 | $("#results").append(x); 14 | }); 15 | }}); 16 | });*/ 17 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerInQuestionScorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Question; 5 | 6 | /** 7 | * Returns 1.0 if the answer text is found in the question and 0.0 otherwise 8 | * @author Ken Overholt 9 | * 10 | */ 11 | public class AnswerInQuestionScorer extends AnswerScorer { 12 | 13 | @Override 14 | public double scoreAnswer(Question q, Answer a) { 15 | String qtext = q.text.toLowerCase(); 16 | String atext = a.text.toLowerCase(); 17 | 18 | if (qtext.contains(atext)) 19 | return 1.0; 20 | else 21 | return 0.0; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/QuestionResultsScorerTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * @author Walid Shalaby 4 | */ 5 | 6 | package edu.uncc.cs.watsonsim; 7 | import edu.uncc.cs.watsonsim.researchers.CombineScores; 8 | 9 | 10 | public class QuestionResultsScorerTest { 11 | 12 | public static void main(String[] args) { 13 | try { 14 | CombineScores q = new CombineScores(); 15 | System.out.println("scoring: {indri-rank=1, indri-score=-1.582, lucene-rank=1, lucene-score=7.215, google-rank=1} ==> " + 16 | q.score(new double[]{1,-1.582,1,7.215,1})); 17 | } catch (Exception e) { 18 | // TODO Auto-generated catch block 19 | e.printStackTrace(); 20 | } 21 | 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/KVTest.java: 
-------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | public class KVTest { 9 | 10 | @Before 11 | public void setUp() throws Exception { 12 | } 13 | 14 | @Test 15 | public void testGet() { 16 | fail("Not yet implemented"); 17 | } 18 | 19 | @Test 20 | public void testAsVectorAsBytes() { 21 | float[] f = {(float) 12.0, (float) 0.99}; 22 | byte[] b = {0, 0, 64, 65, -92, 112, 125, 63}; 23 | for (int i=0; i<8; i++) b[i] = KV.asBytes(f)[i]; 24 | for (int i=0; i<2; i++) f[i] = KV.asVector(b)[i]; 25 | } 26 | 27 | @Test 28 | public void testQuickGetOrCompute() { 29 | fail("Not yet implemented"); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /config.properties.sample: -------------------------------------------------------------------------------- 1 | # This is a .properties file. 
There is no need for quotes but see Wikipedia 2 | # for a list of things that need to be escaped, and the precise syntax 3 | # Most of what you need in this file should be straightforward though 4 | lucene_index = data/v1.5/lucene_index 5 | jena_lucene_index = data/rdf/lucene 6 | indri_index = data/v1.5/indri_index 7 | indri_enabled = false 8 | terrier_index = OPTIONAL UNTIL TERRIER IS IMPLEMENTED 9 | 10 | bing_api_key = FILL ME IN 11 | 12 | # The following are optional until the Google search is finished 13 | google_app_name = FILL ME IN 14 | google_api_key = FILL ME IN 15 | google_custom_search_id = FILL ME IN 16 | 17 | # Set up your SQL database (you may need to edit this) 18 | jdbc_connection_string = jdbc:sqlite:data/watsonsim.db 19 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/PassageQuestionLengthRatio.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | /* 4 | * @author Wlodek 5 | */ 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Phrase; 10 | 11 | public class PassageQuestionLengthRatio extends PassageScorer { 12 | 13 | public double scorePassage(Phrase q, Answer a, Passage p) { 14 | String qs = q.text; 15 | //String qst= q.text; //processes question, stopwords, punctuation removed 16 | //String as= a.candidate_text; 17 | //String ps=p.text; // text is guaranteed to have content 18 | //ps.tokenize(); 19 | 20 | int pl = p.text.length(); 21 | int ql = qs.length(); 22 | double sc=pl/ql; 23 | return sc; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/privatedata/UserSpecificConstants.java.sample: -------------------------------------------------------------------------------- 1 | package privatedata; 2 | 3 | public class UserSpecificConstants { 4 | //
Constants 5 | public static final String googleApplicationName = ""; 6 | public static final String googleAPIKey = ""; //Google provided API key 7 | public static final String googleCustomSearchID = ""; 8 | 9 | public static final String indriIndex = "data/indri_index"; 10 | public static final String luceneIndex = "data/lucene_index"; 11 | public static final String bingAPIKey = "aaaaaaaaaaa/aaaaaaaaaaa/aaaaaaaaaaaaaaaaaaa"; 12 | public static final String luceneSearchField = "text"; 13 | public static final String indriResultsFilter = "#filrej(list.title #combine(%s))"; 14 | public static final String luceneResultsFilter = " NOT title:*\\:*" + " NOT title:list*"; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/Correct.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Environment; 5 | import edu.uncc.cs.watsonsim.Question; 6 | import edu.uncc.cs.watsonsim.nlp.Relatedness; 7 | 8 | public class Correct extends AnswerScorer { 9 | private final Relatedness syn; 10 | public Correct(Environment env) { 11 | syn = new Relatedness(env); 12 | } 13 | @Override 14 | /** 15 | * Generate the target attribute for Machine Learning. 16 | * @returns correctness 0.0 -> incorrect, 1.0 -> correct 17 | * */ 18 | public double scoreAnswer(Question q, Answer a) { 19 | if (q.correct_answer == null) { 20 | return 0; 21 | } else { 22 | return syn.implies(q.correct_answer, a) ? 
1 : 0; 23 | } 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /scripts/gensim/intro1.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 3 | from gensim import corpora, models, similarities 4 | corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)], 5 | [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)], 6 | [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)], 7 | [(0, 1.0), (4, 2.0), (7, 1.0)], 8 | [(3, 1.0), (5, 1.0), (6, 1.0)], 9 | [(9, 1.0)], 10 | [(9, 1.0), (10, 1.0)], 11 | [(9, 1.0), (10, 1.0), (11, 1.0)], 12 | [(8, 1.0), (10, 1.0), (11, 1.0)]] 13 | 14 | tfidf = models.TfidfModel(corpus) 15 | index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12) 16 | 17 | vec = [(0, 1), (4, 1)] 18 | sims = index[tfidf[vec]] 19 | print(list(enumerate(sims))) 20 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/TopPOS.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import org.apache.log4j.Logger; 4 | 5 | import edu.stanford.nlp.trees.Tree; 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Question; 8 | 9 | /** 10 | * Simple hashed POS-tag, mod 100 and scaled to between 0 and 1. 
11 | */ 12 | public class TopPOS extends AnswerScorer { 13 | private final Logger log = Logger.getLogger(getClass()); 14 | 15 | public double scoreAnswer(Question q, Answer a) { 16 | for (Tree tree : a.getTrees()) { 17 | for (Tree child : tree.children()) { 18 | log.debug(a.text + " is a " + child.label().value() + " : " + (child.label().value().hashCode() % 100) / 100.0); 19 | return (child.label().value().hashCode() % 10) / 10.0; 20 | } 21 | } 22 | return 0.0; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/CachingSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.util.List; 4 | import com.google.gson.reflect.TypeToken; 5 | 6 | import edu.uncc.cs.watsonsim.Environment; 7 | import edu.uncc.cs.watsonsim.Passage; 8 | 9 | public class CachingSearcher extends Searcher { 10 | private final Searcher searcher; 11 | private final String engine_name; 12 | 13 | public CachingSearcher(Environment env, Searcher searcher, String engine_name) { 14 | super(env); 15 | this.searcher = searcher; 16 | this.engine_name = engine_name; 17 | } 18 | 19 | public List query(String query) { 20 | return env.computeIfAbsent( 21 | "search:" + engine_name +":"+ query, 22 | k -> searcher.query(query), 23 | new TypeToken>(){}.getType() 24 | ); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/RelatednessTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import edu.uncc.cs.watsonsim.Environment; 9 | 10 | public class RelatednessTest { 11 | private Relatedness rel; 12 | @Before 13 | public void setUp() throws Exception { 14 
| rel = new Relatedness(new Environment()); 15 | } 16 | 17 | @Test 18 | public void testViaWikiLinks() { 19 | fail("Not yet implemented"); 20 | } 21 | 22 | @Test 23 | public void testMatchViaSearch() { 24 | fail("Not yet implemented"); 25 | } 26 | 27 | @Test 28 | public void testMatchViaLevenshtein() { 29 | fail("Not yet implemented"); 30 | } 31 | 32 | @Test 33 | public void testImplies() { 34 | fail("Not yet implemented"); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/QAKeywordMatch.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Question; 7 | import edu.uncc.cs.watsonsim.StringUtils; 8 | 9 | /*Author : Ricky Sanders 10 | * 11 | * Checks the Question against the answer to remove 12 | * answers that closely match the question 13 | * 14 | */ 15 | 16 | public class QAKeywordMatch extends AnswerScorer { 17 | public double scoreAnswer(Question q, Answer a){ 18 | List questionTextArray = StringUtils.tokenize(q.text); 19 | List answerTextArray = StringUtils.tokenize(a.text); 20 | int count = 0; 21 | for (String word : questionTextArray) 22 | if (answerTextArray.contains(word)) 23 | count += 1; 24 | return (count / (double)questionTextArray.size()); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/AnswerMergeTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import org.junit.Test; 4 | 5 | public class AnswerMergeTest { 6 | 7 | @Test 8 | public void testMatches() { 9 | // Results are equal if their titles are similar. This uses match_subset. 
10 | /*assertTrue( 11 | new Answer("engine", "duck duck", "text", "reference", 0, 0).matches( 12 | new Answer("engine", "duck duck goose", "text", "reference", 0, 0))); 13 | 14 | assertFalse( 15 | new Answer("engine", "duck duck goose", "text", "reference", 0, 0).matches( 16 | new Answer("engine", "duck duck", "text", "reference", 0, 0))); 17 | 18 | assertTrue( 19 | new Answer("engine", "sitting on a fence", "text", "reference", 0, 0).matches( 20 | new Answer("engine", "Pete and repeat were sitting on a fence", "text", "reference", 0, 0))); 21 | */ 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/DBQuestionSource.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | import java.sql.PreparedStatement; 3 | import java.sql.ResultSet; 4 | import java.sql.SQLException; 5 | import java.util.ArrayList; 6 | 7 | public class DBQuestionSource extends ArrayList { 8 | private static final long serialVersionUID = 1L; 9 | 10 | /** Run an arbitrary query on the database to get questions. 
11 | */ 12 | public DBQuestionSource(Environment env, String conditions) throws SQLException { 13 | // Get a list of questions, ordered so that it is consistent 14 | PreparedStatement query = env.db.prep("select question, answer, category from questions " 15 | + conditions + ";"); 16 | read_results(query.executeQuery()); 17 | } 18 | 19 | public void read_results(ResultSet sql) throws SQLException { 20 | while(sql.next()){ 21 | Question q = Question.known( 22 | sql.getString("question"), 23 | sql.getString("answer"), 24 | sql.getString("category") 25 | ); 26 | add(q); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/QPKeywordMatch.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Passage; 7 | import edu.uncc.cs.watsonsim.Phrase; 8 | import edu.uncc.cs.watsonsim.StringUtils; 9 | 10 | /*Author : Jacob Medd, Jagan Vujjini 11 | * 12 | * Just Modified Jacob Medd's Scorer to ignore Stop Words. 13 | * Will be adding the Stemmed Words Functionality. 14 | * 15 | * 16 | * Later modified. It seems that: 17 | * (% word in common) / (mean distance between common words) 18 | * is a constant. 19 | * 20 | * So just use one of them, and the % in common is easiest. 
21 | */ 22 | 23 | public class QPKeywordMatch extends PassageScorer { 24 | 25 | public double scorePassage(Phrase q, Answer a, Passage p) { 26 | List questionTextArray = StringUtils.tokenize(q.text); 27 | int count = 0; 28 | for (String word : questionTextArray) 29 | if (p.getTokens().contains(word)) 30 | count += 1; 31 | return (count / (double)questionTextArray.size()); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/AnswerScorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Question; 7 | import edu.uncc.cs.watsonsim.Score; 8 | 9 | public abstract class AnswerScorer implements Scorer { 10 | String name; 11 | { 12 | name = this.getClass().getSimpleName().replaceAll("([a-z])([A-Z]+)", "$1_$2").toUpperCase(); 13 | Score.register(name, 0.0, Merge.Sum); 14 | } 15 | /** 16 | * By default, score every answer to a question. 17 | * Remember to call scoreAnswer if you override this. 18 | * @param q Question 19 | */ 20 | @Override 21 | public void scoreQuestion(Question q, List answers) { 22 | for (Answer a : answers) 23 | a.score(name, scoreAnswer(q, a)); 24 | } 25 | 26 | /** 27 | * Override this method with your scorer implementation. 28 | * @param q Question 29 | * @param a Answer 30 | * @return The score for this answer, or NaN if not applicable. 
31 | */ 32 | public double scoreAnswer(Question q, Answer a) { 33 | return 0.0; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scripts/gensim/scatter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from matplotlib import pyplot as plt 3 | from sklearn.decomposition import PCA, KernelPCA 4 | from sklearn.manifold import Isomap, TSNE 5 | from analogy import Analogy 6 | from vstore import VStore 7 | 8 | a = Analogy(VStore("vectors.lmdb", "big-glove")) 9 | 10 | buf = "" 11 | linebuf = raw_input("Please enter some words to plot, or empty for a canned list: ") 12 | while linebuf: 13 | buf += linebuf + " " 14 | linebuf = raw_input("... ") 15 | 16 | 17 | labels = buf.split() \ 18 | or "doctor nurse politician senator lawyer barrister defend accuse heal treat cure elect vote".split() 19 | 20 | vs = [a.w(x) for x in labels if a.w(x) is not None ] 21 | flatplot = TSNE(2) 22 | ps = flatplot.fit_transform(vs) 23 | 24 | plt.title("Reduced vector space model") 25 | plt.xlabel("First Principal Component") 26 | plt.ylabel("Second Principal Component") 27 | plt.scatter(ps[:, 0], ps[:, 1]) 28 | for (x, y), label in zip(ps, labels): 29 | print "plotting %f, %f, %s" %(x, y, label) 30 | plt.annotate(label, xy = (x, y), xytext = (0, 0), textcoords = 'offset points') 31 | 32 | plt.show() 33 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/StringUtilsTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | import edu.uncc.cs.watsonsim.StringUtils; 8 | 9 | public class StringUtilsTest { 10 | 11 | @Test 12 | public void test_match_subset() { 13 | assertTrue(StringUtils.matchSubset("cat toy", "cat toy")); 14 | 15 | 
assertTrue(StringUtils.matchSubset("thundering applause", "resounding, thundering applause")); 16 | 17 | assertTrue(StringUtils.matchSubset("What is for dinner, mother?", "What, is mother for dinner?")); 18 | } 19 | 20 | @Test 21 | public void test_filter_relevant() { 22 | assertEquals(StringUtils.canonicalize("cat toy"), "cat toy"); 23 | assertEquals(StringUtils.canonicalize("resounding, thundering applause"), "resounding thundering applause"); 24 | assertEquals(StringUtils.canonicalize("What is for dinner, mother?"), "what dinner mother"); 25 | assertEquals(StringUtils.canonicalize("I am a walaby"), "i am walaby"); // This is more documentation than test 26 | assertEquals(StringUtils.canonicalize("I\n\t am a walaby~!@#$%^&*()_+`-={}[]:\";\'<>?,./"), "i am walaby"); 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/DenseVectorsTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.Optional; 6 | 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | import static edu.uncc.cs.watsonsim.nlp.DenseVectors.*; 10 | 11 | public class DenseVectorsTest { 12 | 13 | @Before 14 | public void setUp() throws Exception { 15 | } 16 | 17 | @Test 18 | public void testSim() { 19 | assertEquals(sim(vectorFor("diabetes"), vectorFor("retinopathy")), 0.54, 0.01); 20 | assertEquals(sim(vectorFor("diabetes"), vectorFor("diabetic")), 0.78, 0.01); 21 | assertEquals(sim(vectorFor("(*&(*&^(*&^"), vectorFor("diabetic")), 0.00, 0.01); 22 | assertEquals(sim(vectorFor("diabetes"), vectorFor("")), 0.00, 0.01); 23 | assertEquals(sim(vectorFor("diabetes"), Optional.of(new float[300])), 0.00, 0.01); 24 | 25 | float[] X = new float[300]; X[0] = (float) 0.5; 26 | float[] Y = new float[300]; Y[1] = (float) 0.5; 27 | float[] Z = new float[300]; Z[0] = (float) 0.5; 
Z[1] = (float) 0.5; 28 | 29 | assertEquals(sim(X, Y), 0.0, 0.01); 30 | assertEquals(sim(X, Z), 0.707, 0.01); 31 | assertEquals(sim(X, X), 1.0, 0.01); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/WordProximity.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | import java.util.Set; 7 | 8 | import edu.uncc.cs.watsonsim.Answer; 9 | import edu.uncc.cs.watsonsim.Passage; 10 | import edu.uncc.cs.watsonsim.Phrase; 11 | import edu.uncc.cs.watsonsim.Question; 12 | 13 | public class WordProximity extends PassageScorer { 14 | Set q_words = new HashSet(); 15 | 16 | @Override 17 | public void scoreQuestion(Question q, List answers) { 18 | q_words.clear(); 19 | q_words.addAll(Arrays.asList(q.text.split("\\W+"))); 20 | super.scoreQuestion(q, answers); 21 | } 22 | 23 | @Override 24 | public double scorePassage(Phrase q, Answer a, Passage p) { 25 | double distance = 1; 26 | double average_log_distance = 0; 27 | 28 | for (String w : p.text.split("\\W+")) { 29 | if (q_words.contains(w)) { 30 | average_log_distance += Math.log(distance); 31 | distance = 1; 32 | } else { 33 | distance++; 34 | } 35 | } 36 | 37 | // This result is given as log(interval). Does that matter? 
38 | return average_log_distance; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /scripts/gensim/digestion.py: -------------------------------------------------------------------------------- 1 | from gensim import corpora 2 | import re 3 | 4 | def filter_alnum(text): 5 | return re.findall("\w+", text) 6 | 7 | def cbow_dict(source): 8 | return corpora.Dictionary([[w] for w in open(source).read().lower().split()]) 9 | 10 | def line_dict(source): 11 | return corpora.Dictionary([filter_alnum(l) for l in open(source)]) 12 | 13 | class CBOWCorpus(object): 14 | def __init__(self, source, dictionary): 15 | self.dictionary = dictionary 16 | self.words = [ w for w in open(source).read().split() if w not in stoplist] 17 | 18 | def __len__(self): # this is O(n) 19 | return len(self.words)-4 20 | 21 | def __iter__(self): 22 | for i in xrange(len(self.words)-4): 23 | yield self.dictionary.doc2bow(self.words[i:i+4]) 24 | 25 | class LineCorpus(object): 26 | def __init__(self, source, dictionary): 27 | self.source = source 28 | self.dictionary = dictionary 29 | 30 | def __len__(self): 31 | i=0 32 | for line in open(self.source): 33 | i += 1 34 | return i 35 | 36 | def __iter__(self): 37 | for line in open(self.source): 38 | yield self.dictionary.doc2bow(filter_alnum(line)) 39 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/WebFrontend.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | import static spark.Spark.*; 3 | 4 | import java.util.List; 5 | 6 | import spark.*; 7 | 8 | public class WebFrontend { 9 | 10 | public static void main(String[] args) { 11 | Spark.staticFileLocation("public"); 12 | //externalStaticFileLocation("public"); 13 | get("/ask", (Request request, Response response) -> { 14 | Question question = new Question(request.queryParams("query")); 15 | /* 16 | 
OutputStream st = response.raw().getOutputStream(); 17 | Logger.getRootLogger().addAppender( 18 | new WriterAppender( 19 | new SimpleLayout(), 20 | st));*/ 21 | List answers = new DefaultPipeline().ask(question); 22 | 23 | StringBuilder output = new StringBuilder(); 24 | // Throw whole summaries of the data at the client 25 | for (Answer r: answers) { 26 | output.append(r.toJSON()); 27 | output.append(','); 28 | } 29 | 30 | 31 | response.type("application/json"); 32 | return String.format("{\"id\": {\"answers\": [%s]}", output.substring(0, output.length() - 1)); 33 | }); 34 | 35 | 36 | 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/EntropyTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import edu.uncc.cs.watsonsim.Environment; 12 | 13 | public class EntropyTest { 14 | Entropy e; 15 | 16 | @Before 17 | public void setUp() throws Exception { 18 | Environment env = new Environment(); 19 | e = new Entropy(env); 20 | } 21 | 22 | @Test 23 | public void testGetEntropy() { 24 | assertTrue( 25 | e.entropy(Arrays.asList("zucchini", "sepals")) 26 | > e.entropy(Arrays.asList("the", "of"))); 27 | 28 | String w1 = "Subverting Randall’s editor’s admiral intentions, " 29 | + "alternative enjoyment ensues composing complete " 30 | + "paragraphs entirely shunning Randall’s thousand " 31 | + "commonest dictionary terms. Bombastic prose " 32 | + "frequently results."; 33 | List ws1 = Arrays.asList(w1.split(" ")); 34 | String w2 = "See spot run. Spot runs fast. 
Spot and Joey play in the " 35 | + "park."; 36 | List ws2 = Arrays.asList(w2.split(" ")); 37 | assertTrue(e.entropy(ws1) > e.entropy(ws2)); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/ReindexEdgesTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | import static edu.stanford.nlp.util.Triple.makeTriple; 8 | import edu.uncc.cs.watsonsim.index.Edges; 9 | import static java.util.Arrays.asList; 10 | 11 | public class ReindexEdgesTest { 12 | 13 | @Test 14 | public void testSimpleExample() { 15 | Phrase p = new Phrase("This is an example."); 16 | 17 | assertEquals(asList( 18 | makeTriple("example","nsubj","This"), 19 | makeTriple("example","cop","is"), 20 | makeTriple("example","det","an")), 21 | Edges.generateEdges(p)); 22 | } 23 | 24 | @Test 25 | public void testExtraLinks() { 26 | Phrase p = new Phrase("Donald Duck is a cool cartoon character. 
" 27 | + "He sounds really funny."); 28 | System.out.println(Edges.generateEdges(p)); 29 | 30 | assertTrue(Edges.generateEdges(p).containsAll(asList( 31 | makeTriple("Donald Duck","_isa","cartoon character"), 32 | makeTriple("Donald Duck","_gender","MALE"), 33 | makeTriple("Donald Duck","_animate","ANIMATE"), 34 | makeTriple("Donald Duck","_number","SINGULAR"), 35 | makeTriple("sound","nsubj","Donald Duck") 36 | ))); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/GloveAnswerQuestionContext.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | import java.util.stream.Collectors; 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Phrase; 7 | import edu.uncc.cs.watsonsim.Question; 8 | import edu.uncc.cs.watsonsim.nlp.DenseVectors; 9 | 10 | /** 11 | * Returns the total context similarity between the answer and question. 
12 | * The algorithm it uses is simply the mean of the word vectors (not really a 13 | * great solution, better with short questions / answers) 14 | */ 15 | public class GloveAnswerQuestionContext extends AnswerScorer { 16 | 17 | @Override 18 | public double scoreAnswer(Question q, Answer a) { 19 | List qtokens = q.memo(Phrase.simpleTokens) 20 | .stream().map(DenseVectors::vectorFor) 21 | .filter(v -> v.isPresent()) 22 | .map(v ->v.get()) 23 | .collect(Collectors.toList()); 24 | List atokens = a.memo(Phrase.simpleTokens) 25 | .stream().map(DenseVectors::vectorFor) 26 | .filter(v -> v.isPresent()) 27 | .map(v -> v.get()) 28 | .collect(Collectors.toList()); 29 | 30 | return DenseVectors.sim(DenseVectors.mean(atokens), DenseVectors.mean(qtokens)); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /scripts/gensim/import_glove.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Loads GloVe data into a VStore 3 | import sys 4 | import argparse 5 | import numpy 6 | from vstore import VStore 7 | 8 | parser = argparse.ArgumentParser(description="Load GloVe vectors into LMDB") 9 | parser.add_argument("--name", action="store", type=str, default="glove", 10 | help="name of the database into which to load the vectors") 11 | parser.add_argument("--dbfile", action="store", type=str, default="vectors.lmdb", 12 | help="shared database filename") 13 | parser.add_argument("--merge", action="store_true", 14 | help="merge the new dataset, rather than replacing") 15 | parser.add_argument("source", type=file, 16 | help="uncompressed GloVe dataset") 17 | args = parser.parse_args() 18 | 19 | # Invert control in order to use one transaction 20 | table = VStore(args.dbfile, args.name) 21 | table.drop() 22 | def loader(): 23 | for loaded, line in enumerate(args.source): 24 | line = line.split() 25 | name = line.pop(0) 26 | ## Tokenization errors can cause a word to be too 
long for lmdb 27 | if len(name) > 100: 28 | continue 29 | if loaded % 10000 == 0: 30 | print "Loaded {} rows".format(loaded) 31 | 32 | yield name, numpy.array(line, dtype=numpy.float32) 33 | table.load(loader()) 34 | print "Finished loading" 35 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/SkipBigram.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.HashSet; 4 | import java.util.List; 5 | import java.util.Set; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Phrase; 10 | import edu.uncc.cs.watsonsim.StringUtils; 11 | 12 | /** 13 | * @author Sean Gallagher 14 | * 15 | */ 16 | 17 | public class SkipBigram extends PassageScorer { 18 | 19 | public double scorePassage(Phrase q, Answer a, Passage p) { 20 | 21 | // Jane Austen 22 | Set a_set = generateBigrams(StringUtils.tokenize(a.text)); 23 | 24 | // Romantic novelist Jane Austen once wrote -the- book Emma. 
25 | Set p_set = generateBigrams(p.getTokens()); 26 | 27 | a_set.retainAll(p_set); 28 | 29 | return a_set.size(); 30 | } 31 | 32 | private Set generateBigrams(List terms) { 33 | Set bigrams = new HashSet<>(); 34 | for (int ti=0; ti 0) 38 | return (count/((double)q.text.length()))/((double)distanceSum/count); 39 | else 40 | return 0; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/PassageRetrieval.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.List; 4 | import java.util.regex.Matcher; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Environment; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Question; 10 | import edu.uncc.cs.watsonsim.search.*; 11 | 12 | /** 13 | * Search for documents having relevance to both the question and a candidate 14 | * answer. 15 | */ 16 | public class PassageRetrieval extends Researcher { 17 | private final Searcher[] searchers; 18 | 19 | public PassageRetrieval(Environment env, Searcher... 
searchers) { 20 | this.searchers = searchers; 21 | } 22 | 23 | 24 | @Override 25 | public List question(Question q, List answers) { 26 | 27 | int total_passages = answers.stream().mapToInt(a -> { 28 | // Query every engine 29 | int count = 0; 30 | for (Searcher s : searchers) { 31 | List passages = s.query( 32 | q.text + " " + Matcher.quoteReplacement(a.text)); 33 | a.passages.addAll(passages); 34 | count += passages.size(); 35 | } 36 | return count; 37 | }).sum(); 38 | 39 | 40 | q.log.info("Found " + total_passages + " supporting passages."); 41 | return answers; 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/WPPageViews.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.sql.ResultSet; 4 | import java.sql.SQLException; 5 | import java.util.HashMap; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Environment; 9 | import edu.uncc.cs.watsonsim.Question; 10 | import edu.uncc.cs.watsonsim.nlp.ApproxStringIntMap; 11 | 12 | public class WPPageViews extends AnswerScorer { 13 | private static ApproxStringIntMap pageviews = new ApproxStringIntMap(null); 14 | 15 | public WPPageViews(Environment env) { 16 | load(env); 17 | } 18 | 19 | private static synchronized void load(Environment env) { 20 | if (pageviews.isEmpty()) { 21 | int collisions = 0; 22 | try { 23 | ResultSet res = env.db.prep( 24 | "SELECT title, page_views FROM page_views;") 25 | .executeQuery(); 26 | while (res.next()) { 27 | collisions += pageviews.containsKey(res.getString(1).toLowerCase()) ? 
1 : 0; 28 | pageviews.put(res.getString(1).toLowerCase(), res.getInt(2)); 29 | } 30 | } catch (SQLException e) { 31 | // at worst give 0s 32 | e.printStackTrace(); 33 | } 34 | System.out.println("Loaded view data about " + pageviews.size() + " pages " 35 | + "(" + collisions + " collisions)"); 36 | } 37 | } 38 | 39 | @Override 40 | public double scoreAnswer(Question q, Answer a) { 41 | return pageviews.get(a.toString().toLowerCase()); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/GloveAnswerQuestionContextTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import edu.uncc.cs.watsonsim.Answer; 9 | import edu.uncc.cs.watsonsim.Question; 10 | 11 | public class GloveAnswerQuestionContextTest { 12 | 13 | @Before 14 | public void setUp() throws Exception { 15 | } 16 | 17 | @Test 18 | public void testScoreAnswer() { 19 | GloveAnswerQuestionContext scorer = new GloveAnswerQuestionContext(); 20 | assertEquals(scorer.scoreAnswer(new Question("frog"), new Answer("toad")), 0.73, 0.01); 21 | assertEquals(scorer.scoreAnswer(new Question("frog"), new Answer("maple")), 0.23, 0.01); 22 | assertEquals(scorer.scoreAnswer( 23 | new Question("Who was Marilyn Monroe's second husband?"), 24 | new Answer("Joe Dimaggio")), 0.26, 0.01); 25 | assertEquals(scorer.scoreAnswer( 26 | new Question("Who was Marilyn Monroe's ^&^*()(*&$%^% 7868769987 jhgkjhgbnvbnuyr second husband?"), 27 | new Answer("Joe Dimaggio")), 0.26, 0.01); 28 | assertEquals(scorer.scoreAnswer( 29 | new Question("Who was Marilyn Monroe's second husband?"), 30 | new Answer("husband")), 0.71, 0.01); 31 | assertEquals(scorer.scoreAnswer( 32 | new Question("34986 **(&)(*& uiuytiuytiuyti"), 33 | new Answer("iuyoiuyoiuyhjjkhg")), 0.0, 
0.01); 34 | assertEquals(scorer.scoreAnswer( 35 | new Question("democracy"), 36 | new Answer("")), 0.0, 0.01); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/TypeDetectionTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import org.junit.Test; 9 | 10 | import edu.uncc.cs.watsonsim.Environment; 11 | import edu.uncc.cs.watsonsim.nlp.DBPediaCandidateType; 12 | 13 | public class TypeDetectionTest { 14 | 15 | @Test 16 | /** 17 | * Check to see if the types received for a given input are sane. 18 | * 19 | * This is not stubbed because this is only a client wrapper; there would 20 | * be nothing left after stubbing. So expect it to fail if you do not have 21 | * the DBPedia database setup. 22 | */ 23 | public void test() { 24 | testHasAll("New York", new String[]{"city", "municipality", "place"}); 25 | testHasAll("tab", new String[]{"beverage", "food"}); 26 | } 27 | 28 | 29 | public void testHasAll(String source, String[] targets) { 30 | List types = new ArrayList<>(); 31 | try { 32 | Environment env = new Environment(); 33 | types = new DBPediaCandidateType(env).viaDBPedia(source); 34 | } catch (RuntimeException e) { 35 | // If this goes wrong, it probably just means we are disconnected 36 | System.err.println("Failed to connect to SPARQL endpoint for answer " 37 | + "type detection. 
Perhaps you are disconnected?"); 38 | System.err.println(e.getMessage()); 39 | System.err.println(e.getStackTrace()); 40 | return; 41 | } 42 | 43 | for (String target : targets) 44 | assertTrue(types.contains(target)); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/CommonConstituents.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.HashSet; 4 | 5 | import edu.stanford.nlp.trees.Tree; 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Passage; 8 | import edu.uncc.cs.watsonsim.Phrase; 9 | 10 | /* @author Wlodek 11 | * @author Sean Gallagher 12 | * 13 | * Create a score based on how many parse trees the question, candidate answer 14 | * and passage have in common. 15 | * 16 | * This scorer can be very slow. 17 | */ 18 | 19 | public class CommonConstituents extends PassageScorer { 20 | /** 21 | * Score the similarity of two sentences according to 22 | * sum([ len(x) | x of X, y of Y, if x == y ]) 23 | * where X and Y are the sets of subtrees of the parses of s1 and s2. 24 | * @param x 25 | * @param y 26 | * @return 27 | */ 28 | public static double getCommonSubtreeCount(Phrase t1, Phrase t2) { 29 | 30 | HashSet t1_subtrees = new HashSet<>(); 31 | HashSet t2_subtrees = new HashSet<>(); 32 | for (Tree x : t1.getTrees()) t1_subtrees.add(x.toString()); 33 | for (Tree y : t2.getTrees()) t2_subtrees.add(y.toString()); 34 | t1_subtrees.retainAll(t2_subtrees); 35 | 36 | // x.getLeaves().size() may also be a good idea. 37 | // I don't have any intuition for which may be better. 38 | return t1_subtrees.size(); 39 | } 40 | 41 | 42 | /** Generate a simple score based on scorePhrases. 
43 | * 44 | */ 45 | public double scorePassage(Phrase q, Answer a, Passage p) { 46 | return getCommonSubtreeCount(p, new Phrase(a.text)); 47 | } 48 | } 49 | 50 | -------------------------------------------------------------------------------- /scripts/import_trec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Before you start, run: 3 | # -> pip install clint 4 | import argparse 5 | import sqlite3 6 | from clint.textui import progress 7 | import multiprocessing 8 | from multiprocessing.pool import Pool 9 | from lxml.etree import HTML 10 | parser = argparse.ArgumentParser(description="Import TREC data into sqlite3") 11 | parser.add_argument("-t", "--table", default="documents", help="SQL table to dump into") 12 | parser.add_argument("db", help="SQLite database") 13 | parser.add_argument("source", help="Source tag [e.g. wikipedia,wikiquotes,shakespeare ...]") 14 | parser.add_argument("trec", nargs="+", help="Input TREC files") 15 | args = parser.parse_args() 16 | 17 | db = sqlite3.connect(args.db) 18 | db.executescript(""" 19 | pragma journal_mode = WAL; 20 | pragma synchronous = OFF;""") 21 | 22 | 23 | for i, fname in progress.bar(enumerate(args.trec), "Importing TREC data..", 50, expected_size=len(args.trec)): 24 | with open(fname) as f: 25 | b = HTML(f.read()).findall("*doc") 26 | entries = [ 27 | [d.findtext("docno"), d.findtext("title"), d.findtext("text")] 28 | for d in b] 29 | 30 | db.executemany("insert or replace into %s (docno, title, text, source) values (?,?,?,'%s');" %(args.table, args.source), entries) 31 | if not (i % 250): 32 | db.execute("insert into search_{table}(search_{table}) values ('merge=200,8');".format(table=args.table)) # Clean search trees a bit 33 | db.commit() 34 | 35 | # Clean the tree the last time. 
#!/usr/bin/python

# This is a crazy hack to convert Weka's arff into caffe's leveldb
import leveldb
import caffe_pb2
import struct

def read(filename):
    """Yield (features, label) for every data row of an ARFF file.

    The header is scanned for the position of the attribute named CORRECT;
    that column is removed from each row and returned as the label. Missing
    values ("?") are read as NaN.

    Raises ValueError if the header declares no CORRECT attribute.
    """
    label_index = 0
    label_found = False
    with open(filename) as fl:
        # Header: count the attributes that precede CORRECT, stop at @data.
        # (The old `while line:` loop never read another line, so it spun
        # forever unless the very first line was "@data".)
        for line in fl:
            if line.startswith("@data"):
                break
            elif not label_found and line.startswith("@attribute"):
                if line.split()[1] == "CORRECT":
                    label_found = True
                else:
                    label_index += 1
        if not label_found:
            raise ValueError("no CORRECT attribute declared in %s" % filename)

        # Data section: the same iterator continues after the @data line.
        for line in fl:
            if line.strip():
                l = [float(x.replace("?", "NaN")) for x in line.split(',')]
                label = l.pop(label_index)
                yield (l, label)

def transform(prev, width=2064):
    """Serialize each (features, label) pair as a caffe Datum string.

    `width` is the number of features per row (2064 by default, as before).
    After the input is exhausted, the per-column totals are written to
    watson_mean.binaryproto in the current directory.
    """
    d = caffe_pb2.Datum()
    d.channels = 1
    d.height = 1
    d.width = width
    fmt = "%dd" % width
    totals = [0] * width
    for entry, label in prev:
        totals = [t+e for t, e in zip(totals, entry)]
        d.data = struct.pack(fmt, *entry)
        # read() parses every column as float; the label is stored as an integer.
        d.label = int(label)
        yield d.SerializeToString()

    # NOTE(review): despite the file name these are column sums, not means
    # (nothing divides by the row count) -- confirm what the consumer expects.
    d.data = struct.pack(fmt, *totals)
    with open("watson_mean.binaryproto", "wb") as out:
        out.write(d.SerializeToString())


def write(filename, prev):
    """Store each serialized entry in a new LevelDB under a zero-padded row number."""
    ldb = leveldb.LevelDB(filename=filename, create_if_missing=True, error_if_exists=True)
    for key, entry in enumerate(prev):
        ldb.Put(str(key).zfill(5), entry)


if __name__ == "__main__":
    import sys
    write(sys.argv[2], transform(read(sys.argv[1])))
1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | import edu.uncc.cs.watsonsim.scorers.DateMatches; 8 | 9 | public class DateMatchesTest { 10 | 11 | @Test 12 | public void test() { 13 | assertTrue(DateMatches.maybeYear("2005")); 14 | assertTrue(DateMatches.maybeYear("05")); 15 | assertFalse(DateMatches.maybeYear("-12")); 16 | assertFalse(DateMatches.maybeYear("Fall")); 17 | 18 | assertTrue(DateMatches.maybeMonth("March")); 19 | assertTrue(DateMatches.maybeMonth("Mar")); 20 | assertTrue(DateMatches.maybeMonth("03")); 21 | assertTrue(DateMatches.maybeMonth("3")); 22 | 23 | assertTrue(DateMatches.maybeDay("2")); 24 | assertTrue(DateMatches.maybeDay("12")); 25 | assertFalse(DateMatches.maybeDay("123")); 26 | 27 | assertTrue(DateMatches.maybeDate("04/05/1992")); 28 | assertTrue(DateMatches.maybeDate("04-05-1992")); 29 | assertTrue(DateMatches.maybeDate("04 05 1992")); 30 | assertTrue(DateMatches.maybeDate("05 1992")); 31 | assertTrue(DateMatches.maybeDate("05-1992")); 32 | assertTrue(DateMatches.maybeDate("05/1992")); 33 | assertTrue(DateMatches.maybeDate("May 1992")); 34 | assertTrue(DateMatches.maybeDate("04 May")); 35 | assertTrue(DateMatches.maybeDate("May 04")); 36 | assertTrue(DateMatches.maybeDate("May 4, 1992")); 37 | assertTrue(DateMatches.maybeDate("1992, 04 May")); 38 | assertFalse(DateMatches.maybeDate("99181919728")); 39 | assertFalse(DateMatches.maybeDate("1010 1010 0101 0001")); 40 | assertFalse(DateMatches.maybeDate("Mayday Mayday")); 41 | assertTrue(DateMatches.maybeDate("12 June 19283")); // still 12 June 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/NGram.java: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Varsha Devadas 3 | */ 4 | 5 | package edu.uncc.cs.watsonsim.scorers; 6 | 7 | import java.util.ArrayList; 8 | import 
java.util.List; 9 | 10 | import edu.uncc.cs.watsonsim.Answer; 11 | import edu.uncc.cs.watsonsim.Passage; 12 | import edu.uncc.cs.watsonsim.Phrase; 13 | import edu.uncc.cs.watsonsim.StringUtils; 14 | 15 | public class NGram extends PassageScorer { 16 | public double scorePassage(Phrase q, Answer a, Passage p) { 17 | // Jane Austen 18 | List a_set = generateNgrams(3, StringUtils.tokenize(a.text)); 19 | 20 | // Romantic novelist Jane Austen once wrote -the- book Emma. 21 | List p_set = generateNgrams(3, p.getTokens()); 22 | 23 | a_set.retainAll(p_set); 24 | return a_set.size(); 25 | 26 | } 27 | public static List generateNgrams(int n, List words) { 28 | List ngrams = new ArrayList(); 29 | for (int i = 0; i < words.size() - n + 1; i++) 30 | ngrams.add(concat(words, i, i+n)); 31 | return ngrams; 32 | } 33 | 34 | public static String concat(List words, int start, int end) { 35 | StringBuilder sb = new StringBuilder(); 36 | for (int i = start; i < end; i++) 37 | sb.append((i > start ? " " : "") + words.get(i)); 38 | return sb.toString(); 39 | } 40 | 41 | /*public static void main(String[] args) { 42 | 43 | Question question = Pipeline.ask("Who wrote Emma?"); 44 | Answer r = question.get(0); 45 | NGram ngram = new NGram(); 46 | 47 | double result = ngram.scorePassage(question, r, r.passages.get(0)); 48 | 49 | System.out.println(result); 50 | }*/ 51 | 52 | 53 | 54 | } 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /scripts/gensim/vstore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Provides a nice wrapper for GloVe data and other already-processed vectors 3 | import lmdb 4 | import numpy 5 | numpy.set_printoptions(threshold=20) 6 | 7 | class VStore(object): 8 | _allenvs = {} 9 | def __init__(self, filename, name): 10 | ''' Create a lmdb-backed VStore using a cached environment ''' 11 | if filename not in self._allenvs: 12 | 
self._allenvs[filename] = lmdb.Environment(filename, 13 | map_size=100<<30, 14 | max_dbs=100) 15 | self._env = self._allenvs[filename] 16 | self._db = self._env.open_db(name); 17 | 18 | def _txn(self, write=False): 19 | ''' Convenience method for making a transaction ''' 20 | return self._env.begin(self._db, write=write) 21 | 22 | def get(self, name, default=None): 23 | ''' Get a vector by name ''' 24 | with self._txn() as txn: 25 | r = txn.get(name) 26 | if r is None: 27 | return default 28 | else: 29 | return numpy.frombuffer(r, dtype=numpy.float32) 30 | 31 | def drop(self): 32 | '''Drop everything in a database''' 33 | with self._txn(write=True) as txn: 34 | txn.drop(self._db, delete=False) 35 | 36 | def put(self, name, value): 37 | ''' Put a vector into the entry for name ''' 38 | with self._txn(write=True) as txn: 39 | txn.put(name, numpy.getbuffer(value)) 40 | 41 | def read(self): 42 | ''' Get all the vectors from the database ''' 43 | with self._txn(write=False) as txn: 44 | for key, value in txn.cursor(): 45 | yield (key, numpy.frombuffer(value, dtype=numpy.float32)) 46 | 47 | def load(self, gen): 48 | ''' Put() into the database many (name, vector) pairs ''' 49 | with self._txn(write=True) as txn: 50 | try: 51 | for name, value in gen: 52 | txn.put(name, numpy.getbuffer(value)) 53 | except lmdb.BadValsizeError as e: 54 | print name, value.shape, value 55 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/DateMatches.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.regex.Matcher; 4 | import java.util.regex.Pattern; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Question; 8 | import edu.uncc.cs.watsonsim.nlp.ClueType; 9 | 10 | /** 11 | * Check if: the question needs a date, and the answer is one 12 | * @author Sean 13 | */ 14 | public class 
DateMatches extends AnswerScorer { 15 | public static boolean maybeMonth(String in) { 16 | return in.matches("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\.?\\w*" 17 | + "|\\d{1,2}"); 18 | } 19 | 20 | public static boolean maybeYear(String in) { 21 | return in.matches("\\d{2,4}"); 22 | } 23 | 24 | public static boolean maybeDay(String in) { 25 | return in.matches("\\d{1,2}[a-z]*"); 26 | } 27 | 28 | public static boolean maybeDate(String in) { 29 | boolean years=false, months=false, days=false; 30 | Matcher m = Pattern 31 | .compile("([^- ,/]+)[- ,/]+([^- ,/]+)([- ,/]+[^- ,/]+)?") 32 | .matcher(in); 33 | if (m.find()) { 34 | for (int group=0; group\n\n" 38 | + p.text 39 | + "\n\n"; 40 | synchronized (index) { 41 | try { 42 | index.addString(trecdoc, "trectext", Collections.emptyMap()); 43 | } catch (Exception e) { 44 | // Sadly, Indri throws everything and functions throw nothing 45 | // so we simply wrap what could be anything into a 46 | // stop-the-world runtime exception. 47 | e.printStackTrace(); 48 | throw new RuntimeException(e); 49 | } 50 | } 51 | } 52 | 53 | @Override 54 | public void close() throws IOException { 55 | try { 56 | index.close(); 57 | } catch (Exception e) { 58 | e.printStackTrace(); 59 | // Cheat and say it's IO. It probably is anyway. 
60 | throw new IOException(e); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/Entropy.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.sql.ResultSet; 4 | import java.sql.SQLException; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Environment; 9 | import edu.uncc.cs.watsonsim.Phrase; 10 | import edu.uncc.cs.watsonsim.Question; 11 | import edu.uncc.cs.watsonsim.nlp.ApproxStringIntMap; 12 | import edu.uncc.cs.watsonsim.nlp.StringStack; 13 | 14 | public class Entropy extends AnswerScorer { 15 | // This is a custom approach for about a 10-fold reduction in memory 16 | private static final double mult = 2<<20; 17 | private static ApproxStringIntMap dict = new ApproxStringIntMap(new StringStack()); 18 | 19 | public Entropy(Environment env) { 20 | load(env); 21 | } 22 | 23 | private static synchronized void load(Environment env) { 24 | if (dict.isEmpty()) { 25 | int collisions = 0; 26 | try { 27 | ResultSet rs = env.db.prep("SELECT word, p FROM entropy;").executeQuery(); 28 | while (rs.next()) { 29 | collisions += dict.containsKey(rs.getString(1)) ? 1 : 0; 30 | // This mult is to put enough of the double's precision in 31 | // the int. p is logarithmic so overflow is not a problem. 
32 | dict.put(rs.getString(1), (int)(rs.getDouble(2)*mult)); 33 | } 34 | } catch (SQLException e) { 35 | // Leave the table blank and give 0's 36 | e.printStackTrace(); 37 | } 38 | System.out.println("Loaded " + dict.size() + " words' entropy " 39 | + "(" + collisions + " collisions)"); 40 | } 41 | } 42 | 43 | protected double entropy(Iterable targets) { 44 | double ent = 0; 45 | for (String target: targets) { 46 | ent += dict.get(target) / mult; 47 | } 48 | return ent; 49 | } 50 | 51 | @Override 52 | public double scoreAnswer(Question q, Answer a) { 53 | return entropy(a.memo(Phrase.tokens)); 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/LucenePassageSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Collections; 7 | 8 | import org.apache.lucene.document.Document; 9 | import org.apache.lucene.search.IndexSearcher; 10 | import org.apache.lucene.search.ScoreDoc; 11 | import edu.uncc.cs.watsonsim.Environment; 12 | import edu.uncc.cs.watsonsim.Passage; 13 | import edu.uncc.cs.watsonsim.Score; 14 | import edu.uncc.cs.watsonsim.scorers.Merge; 15 | 16 | /** 17 | * @author Phani Rahul 18 | */ 19 | public class LucenePassageSearcher extends Searcher { 20 | private final IndexSearcher lucene; 21 | private final Environment env; 22 | 23 | public LucenePassageSearcher(Environment env) { 24 | super(env); 25 | this.lucene = env.lucene; 26 | this.env = env; 27 | Score.register("LUCENE_SCORE", -1, Merge.Mean); 28 | Score.register("LUCENE_RANK", -1, Merge.Mean); 29 | } 30 | 31 | public List query(String question_text) { 32 | List results = new ArrayList<>(); 33 | try { 34 | ScoreDoc[] hits = env.simpleLuceneQuery(question_text, MAX_RESULTS); 35 | // This isn't range based because 
we need the rank 36 | for (int i=0; i < hits.length; i++) { 37 | Document doc = lucene.doc(hits[i].doc, Collections.singleton("docno")); 38 | results.add(new edu.uncc.cs.watsonsim.Passage( 39 | "lucene", // Engine 40 | "", // Title 41 | "", // Text 42 | doc.get("docno")) // Reference 43 | .score("LUCENE_RANK", (double) i) // Rank 44 | .score("LUCENE_SCORE", (double) hits[i].score) // Source 45 | ); 46 | } 47 | } catch (IOException e) { 48 | System.out.println("Failed to query Lucene. Is the index in the correct location?"); 49 | e.printStackTrace(); 50 | } 51 | 52 | // Fill any missing full text from sources 53 | return fillFromSources(results); 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/MergeByText.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Environment; 8 | import edu.uncc.cs.watsonsim.Question; 9 | import edu.uncc.cs.watsonsim.nlp.Relatedness; 10 | 11 | public class MergeByText extends Researcher { 12 | private final Relatedness syn; 13 | /** 14 | * Create a new merger using shared environment resources. 15 | * @param env 16 | */ 17 | public MergeByText(Environment env) { 18 | syn = new Relatedness(env); 19 | } 20 | 21 | @Override 22 | /** Call merge on any two answers with the same title */ 23 | public List question(Question q, List answers) { 24 | List> answer_blocks = new ArrayList<>(); 25 | // Arrange the answers into blocks 26 | each_answer: 27 | for (Answer original : answers) { 28 | for (List block : answer_blocks) { 29 | for (Answer example : block) { 30 | // Look through the examples in this topic 31 | // If it matches, choose to put it in this block and quit. 
32 | if (syn.matchViaLevenshtein(original.text, example.text)) { 33 | block.add(original); 34 | continue each_answer; 35 | } 36 | } 37 | } 38 | 39 | // Make a new topic for this answer 40 | List new_block = new ArrayList<>(); 41 | new_block.add(original); 42 | answer_blocks.add(new_block); 43 | } 44 | 45 | // Merge the blocks 46 | List new_answers = new ArrayList<>(); 47 | for (List block : answer_blocks) { 48 | if (block.size() > 1) { 49 | new_answers.add(Answer.merge(block)); 50 | } else { 51 | new_answers.add(block.get(0)); 52 | } 53 | } 54 | 55 | log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by surface similarity)."); 56 | return new_answers; 57 | } 58 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/AnswerTrimming.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Question; 8 | 9 | /** 10 | * @author Suresh Appana 11 | * 12 | */ 13 | public class AnswerTrimming extends Researcher { 14 | @Override 15 | public List question(Question question, List answers) { 16 | List answers_updated = new ArrayList<>(); 17 | for(Answer ans : answers) { 18 | String text = ans.text; 19 | //System.out.println(text); 20 | String[] answer_array = ans.text.split(" "); 21 | int answer_array_length = answer_array.length; 22 | 23 | 24 | 25 | for (int j = 0; j < answer_array_length; j++) { 26 | for (int i = answer_array_length - 1; i >= j; i--) { 27 | StringBuilder sb = new StringBuilder(); 28 | for (int k = j; k <= i; k++) { 29 | // System.out.println("i=" + i + ", j=" + j + ", k"); 30 | sb.append(answer_array[k]); 31 | if (k != i) 32 | sb.append(" "); 33 | } 34 | if (sb.length() > 0 && 
question.text.toLowerCase().contains(sb.toString().toLowerCase())) { 35 | text = text 36 | .replace(sb.toString(), "") 37 | .trim() 38 | .replaceAll(" +", " ") 39 | .replaceAll("^([^a-z|A-Z|0-9])( )*", "") 40 | .replaceAll("()*([^a-z|A-Z|0-9])$", "") 41 | .trim(); 42 | answer_array = text.split(" "); 43 | answer_array_length = answer_array.length; 44 | i = answer_array_length - 1; 45 | j = 0; 46 | } 47 | } 48 | } 49 | answers_updated.add( ans.withText(text)); 50 | } 51 | 52 | //for(int i=0;i vector for `elicidate`" 57 | print "w('mogrify') + w('frobnicate') --> vector sum" 58 | print " same for -, *, /, **, etc as usual for numpy" 59 | print "sim(w('republican'), w('democrat')) -> society in a 32bit float" 60 | print " (actually a simple cosine similarity)" 61 | code.interact(local=vars()) 62 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/RedirectSynonyms.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.sql.PreparedStatement; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import org.apache.commons.lang3.StringEscapeUtils; 10 | import edu.uncc.cs.watsonsim.Answer; 11 | import edu.uncc.cs.watsonsim.Database; 12 | import edu.uncc.cs.watsonsim.Environment; 13 | import edu.uncc.cs.watsonsim.Question; 14 | import edu.uncc.cs.watsonsim.Score; 15 | import edu.uncc.cs.watsonsim.scorers.Merge; 16 | 17 | /** 18 | * Create a bunch of new answers with the same passages based on "synonyms" 19 | * made from Wikipedia redirects. 
20 | * 21 | * @author Sean 22 | */ 23 | public class RedirectSynonyms extends Researcher { 24 | private final Database db; 25 | private final PreparedStatement s; 26 | 27 | public RedirectSynonyms(Environment env) { 28 | db = env.db; 29 | s = db.prep("SELECT source from wiki_redirects where target = ?;"); 30 | Score.register("IS_WIKI_REDIRECT", 0.0, Merge.Min); 31 | } 32 | 33 | @Override 34 | public List question(Question q, List answers) { 35 | // For logging 36 | int synonym_count = 0; 37 | List new_answers = new ArrayList(); 38 | for (Answer a : answers) { 39 | try { 40 | s.setString(1, a.text); 41 | ResultSet results = s.executeQuery(); 42 | while (results.next()) { 43 | synonym_count++; 44 | Answer new_answer = new Answer( 45 | new ArrayList<>(a.passages), 46 | a.scores.clone(), 47 | StringEscapeUtils.unescapeXml(results.getString("source"))); 48 | a.scores.put("IS_WIKI_REDIRECT", 1.0); 49 | new_answers.add(new_answer); 50 | } 51 | } catch (SQLException e) { 52 | // Just don't make any synonyms. 
53 | return answers; 54 | } 55 | } 56 | 57 | log.info("Found " + synonym_count + " synonyms for " + answers.size() + 58 | " candidate answers using Wikipedia redirects."); 59 | return new_answers; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/index/Lucene.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.index; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Path; 5 | 6 | import org.apache.lucene.analysis.Analyzer; 7 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 8 | import org.apache.lucene.document.Document; 9 | import org.apache.lucene.document.Field; 10 | import org.apache.lucene.document.StoredField; 11 | import org.apache.lucene.document.TextField; 12 | import org.apache.lucene.index.IndexWriter; 13 | import org.apache.lucene.index.IndexWriterConfig; 14 | import org.apache.lucene.search.similarities.BM25Similarity; 15 | import org.apache.lucene.store.Directory; 16 | import org.apache.lucene.store.FSDirectory; 17 | 18 | import edu.uncc.cs.watsonsim.Passage; 19 | 20 | public class Lucene implements Segment { 21 | private final IndexWriter index; 22 | public Lucene(Path path) throws IOException { 23 | /* Setup Lucene */ 24 | Directory dir = FSDirectory.open(path); 25 | // here we are using a standard analyzer, there are a lot of analyzers available to our use. 
26 | Analyzer analyzer = new StandardAnalyzer(); 27 | IndexWriterConfig iwc = new IndexWriterConfig(analyzer); 28 | //this mode by default overwrites the previous index, not a very good option in real usage 29 | iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); 30 | iwc.setSimilarity(new BM25Similarity()); 31 | index = new IndexWriter(dir, iwc); 32 | } 33 | 34 | public void accept(Passage p){ 35 | // Index with Lucene 36 | Document doc = new Document(); 37 | doc.add(new TextField("title", p.title, Field.Store.NO)); 38 | doc.add(new TextField("text", p.text, Field.Store.YES)); 39 | doc.add(new StoredField("docno", p.reference)); 40 | try { 41 | index.addDocument(doc); 42 | } catch (IOException e) { 43 | // TODO Auto-generated catch block 44 | e.printStackTrace(); 45 | } 46 | } 47 | 48 | @Override 49 | public void close() throws IOException { 50 | index.close(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/MergeByCommonSupport.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Question; 10 | 11 | public class MergeByCommonSupport extends Researcher { 12 | 13 | @Override 14 | /** Call merge on any two answers, where the answers have more passages in common than different*/ 15 | public List question(Question q, List answers) { 16 | List> answer_blocks = new ArrayList<>(); 17 | each_answer: 18 | for (Answer original : answers) { 19 | HashSet o_passages = new HashSet<>(); 20 | o_passages.addAll(original.passages); 21 | 22 | for (List block : answer_blocks) { 23 | for (Answer example : block) { 24 | 25 | HashSet e_passages = new HashSet<>(); 26 | 
e_passages.addAll(example.passages); 27 | int example_cardinality = e_passages.size(); 28 | e_passages.retainAll(o_passages); 29 | 30 | double percent_common = e_passages.size() / 31 | (example_cardinality + o_passages.size() - e_passages.size() + 0.01); 32 | 33 | if ( percent_common > 0.5 ) { 34 | // If the intersection > half the union, then merge the questions 35 | block.add(original); 36 | continue each_answer; 37 | } 38 | } 39 | } 40 | 41 | // Make a new topic for this answer 42 | List new_block = new ArrayList<>(); 43 | new_block.add(original); 44 | answer_blocks.add(new_block); 45 | } 46 | 47 | // Merge the blocks 48 | List new_answers = new ArrayList<>(); 49 | for (List block : answer_blocks) { 50 | if (block.size() > 1) { 51 | new_answers.add(Answer.merge(block)); 52 | } else { 53 | new_answers.add(block.get(0)); 54 | } 55 | } 56 | 57 | log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by common passages)."); 58 | return new_answers; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/NamedEntityRecognizerScorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Phrase; 10 | import edu.uncc.cs.watsonsim.StringUtils; 11 | import opennlp.tools.namefind.NameFinderME; 12 | import opennlp.tools.namefind.TokenNameFinderModel; 13 | import opennlp.tools.tokenize.SimpleTokenizer; 14 | import opennlp.tools.util.Span; 15 | 16 | /** 17 | * This scorer will return the number of named entities matched in a given 18 | * question 19 | * 20 | * @author Jonathan Shuman 21 | * 22 | */ 23 | public class NamedEntityRecognizerScorer extends PassageScorer { 
24 | public double scorePassage(Phrase q, Answer a, Passage p) { 25 | 26 | // Jane Austen 27 | String c_t = StringUtils.join(p.text, " "); 28 | 29 | // Romantic novelist Jane Austen once wrote -the- book Emma. 30 | String q_t = q.text; 31 | 32 | return numberOfNamedPersonEntities(q_t, c_t); 33 | 34 | } 35 | 36 | private double numberOfNamedPersonEntities(String q_t, String c_t) { 37 | InputStream modelIn = null; 38 | double retVal = 0; 39 | try { 40 | modelIn = new FileInputStream("data/en-ner-person.bin"); 41 | TokenNameFinderModel model = new TokenNameFinderModel(modelIn); 42 | NameFinderME nameFinder = new NameFinderME(model); 43 | String[] c_words = SimpleTokenizer.INSTANCE.tokenize(c_t); 44 | String[] q_words = SimpleTokenizer.INSTANCE.tokenize(q_t); 45 | Span[] c_tokens = nameFinder.find(c_words); 46 | 47 | for (Span cS : c_tokens) { 48 | for (String q_word : q_words) 49 | if ((c_words[cS.getStart()]).contains(q_word)) 50 | retVal++; 51 | } 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | return Double.NaN; 55 | } finally { 56 | if (modelIn != null) { 57 | try { 58 | modelIn.close(); 59 | } catch (IOException e) { 60 | return Double.NaN; 61 | } 62 | 63 | } 64 | } 65 | return retVal; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/PassageTermMatch.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.util.List; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Passage; 7 | import edu.uncc.cs.watsonsim.Phrase; 8 | import edu.uncc.cs.watsonsim.StringUtils; 9 | 10 | /** 11 | * The Passage Term match scorer is designed, simply, to count the number of times 12 | * a term appears in the text. 
13 | * 14 | * "This assigns a score by 15 | * matching question terms to passage terms, regardless 16 | * of grammatical relationship or word order." 17 | * 18 | * It returns a number which is equal to the number of occurrences 19 | * @author Jonathan Shuman 20 | * 21 | */ 22 | public class PassageTermMatch extends PassageScorer { 23 | public double scorePassage(Phrase q, Answer a, Passage p) { 24 | 25 | // Jane Austen 26 | String c_t = StringUtils.join(p.text, " "); 27 | 28 | // Romantic novelist Jane Austen once wrote -the- book Emma. 29 | String q_t = q.text; 30 | 31 | return generateNumberTerms(q_t, c_t); 32 | 33 | } 34 | 35 | /** 36 | * @param queryText The text of the query to search passages 37 | * @param passageText The text of the passage 38 | * @return Number of occurrences of words in query in the passage 39 | */ 40 | private int generateNumberTerms(String queryText, String passageText) { 41 | /* 42 | * We will first separate the text of the query and passage into terms. 43 | * Note: The parameters are assumed to have stopwords removed. 44 | */ 45 | List qTerms = StringUtils.tokenize(queryText); 46 | List pTerms = StringUtils.tokenize(passageText); 47 | 48 | // Join the passage back together with stop words removed. 49 | // We will use the StringUtils function to remove the words. 50 | String passageStopsRemoved = StringUtils.join(pTerms, " "); 51 | 52 | int matches = 0; 53 | //Scan through each of the terms to get its number of occurances in the passage text. 54 | for (String term : qTerms) { 55 | // First the bigram 56 | matches += StringUtils.countMatches(passageStopsRemoved, term); 57 | } 58 | return matches; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/main/resources/public/scripts/query.js: -------------------------------------------------------------------------------- 1 | function write_log(text) { 2 | $("#console").append($("
  • ").text(text)); 3 | $("#console").animate({ scrollTop: $("#console").prop("scrollHeight") }, "slow"); 4 | } 5 | 6 | 7 | angular.module('queryApp', []) 8 | .controller('QueryController', function($scope) { 9 | var queryDetail = this; 10 | queryDetail.answers = [ 11 | {text: "Some sample data", score: 0.8976, 12 | evidence: [{source: "moomoo", note: "this is an example"}, {source: "akjshkjd", note: "another example"}], 13 | scores: {ANSWER_RANK: 0.8172, ANSWER_SCORE: 0.8162, LAT_CHECK: 0.99, CORR: 0.1}, 14 | passages: [{ 15 | title: "moomoo", 16 | text: "this is an example", 17 | reference: "wp-full-8272-18"}, 18 | {title: "akjshkjd", 19 | text: "another example", 20 | reference: "wp-full-8272-18"}] 21 | }, 22 | {text: "Moo! bar bax", score: 0.4926, evidence: [{source: "moomoo", note: "this is an example"}]}, 23 | {text: "Another example", score: 0.207, evidence: [{source: "moomoo", note: "this is an example"}]} 24 | ]; 25 | queryDetail.note = "Ask any natural language question to have it answered!"; 26 | 27 | queryDetail.handle_message = function(event) { 28 | // Handle incoming messages 29 | console.log(event.data); 30 | var content = JSON.parse(event.data); 31 | switch (content.flag) { // flag 32 | case "log": 33 | write_log(content.message); 34 | break; 35 | case "result": 36 | queryDetail.answers = content.message; 37 | $("#console").slideUp(); 38 | queryDetail.note = ""; 39 | break; 40 | } 41 | }; 42 | queryDetail.begin = function () { 43 | // Clean the screen 44 | $("#console li").remove(); 45 | $("#console").slideDown(); 46 | 47 | // Open a channel 48 | var query_channel = new WebSocket("ws://localhost:8887/asklive"); 49 | query_channel.onopen = function (event) { 50 | // Ask the question 51 | query_channel.send("ask:" + $("#search [name=query]").val()); 52 | write_log("Sending query..."); 53 | }; 54 | query_channel.onmessage = function(e) { 55 | queryDetail.handle_message(e); 56 | $scope.$apply(); 57 | }; 58 | //event.preventDefault(); 59 | }; 60 | }); 
61 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/MeanDVSearchTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import edu.uncc.cs.watsonsim.Environment; 12 | import edu.uncc.cs.watsonsim.Passage; 13 | import static org.fusesource.lmdbjni.Constants.*; 14 | 15 | public class MeanDVSearchTest { 16 | 17 | MeanDVSearch mds; 18 | @Before 19 | public void setUp() throws Exception { 20 | } 21 | 22 | @Test 23 | public void test() { 24 | mds = new MeanDVSearch(new Environment()); 25 | List frogstuff = mds.query("frog"); 26 | assertTrue(frogstuff.size() > 0); 27 | assertTrue(frogstuff.get(0).title.contains("frog")); 28 | } 29 | 30 | @Test 31 | public void testBubble() { 32 | double[] sims = new double[5]; 33 | byte[][] names = new byte[5][]; 34 | byte[] name_e = bytes("e"); 35 | byte[] name_f = bytes("f"); 36 | byte[] name_g = bytes("g"); 37 | 38 | sims[0]=0.8; sims[1]=0.5; sims[2]=0.0; sims[3]=-1; 39 | names[0]=bytes("a"); names[1]=bytes("b"); names[2]=bytes("c"); names[3]=bytes("d"); 40 | 41 | MeanDVSearch.bubble(sims, names, 0.9, name_e, 4); 42 | assertEquals(0.9, sims[0], 0.01); 43 | assertEquals(0.8, sims[1], 0.01); 44 | assertEquals(0.5, sims[2], 0.01); 45 | assertEquals(0.0, sims[3], 0.01); 46 | assertEquals(name_e, names[0]); 47 | //---------------------------------------------------------------- 48 | 49 | MeanDVSearch.bubble(sims, names, 0.1, name_f, 4); 50 | assertEquals(0.9, sims[0], 0.01); 51 | assertEquals(0.8, sims[1], 0.01); 52 | assertEquals(0.5, sims[2], 0.01); 53 | assertEquals(0.1, sims[3], 0.01); 54 | assertEquals(name_f, names[3]); 55 | //---------------------------------------------------------------- 56 | 57 | 
/**
 * Wrapper logger
 *
 * Loggers already allow many modules to log to many places.
 * But we need each module to log to some (but not all) places. So basically,
 * want to pass around a fancy many-to-many channel.
 *
 * A logger either has its own listener or hands its messages to the nearest
 * ancestor that has one. Every message is tagged with the simple class name
 * of the logger it was written to and with the seconds elapsed since the
 * root logger was created.
 *
 * @author Sean
 *
 */
public class Log {
    private Consumer<? super String> listener;
    private final Log parent;
    private final Class<?> speaker;
    private final long start;

    private enum Level {ERROR, WARNING, INFO, DEBUG}

    /** A logger that discards everything written to it. */
    public static final Log NIL = new Log(Object.class, x->{});

    /**
     * Start a root logger.
     *
     * @param speaker  the module this logger speaks for: either an instance
     *                 or the {@code Class} itself
     * @param listener receives every formatted message
     */
    public Log(Object speaker, Consumer<? super String> listener) {
        this.parent = null;
        this.speaker = speakerClass(speaker);
        this.start = System.currentTimeMillis();
        this.listener = listener;
    }

    // Start a child logger; it shares the parent's start time.
    private Log(Object speaker, Log parent) {
        this.parent = parent;
        this.speaker = speakerClass(speaker);
        this.start = parent.start;
    }

    /**
     * Resolve who is speaking. A {@code Class} is used as-is: calling
     * {@code getClass()} on it would give {@code java.lang.Class}, which
     * made every logger created from a class literal report "Class".
     */
    private static Class<?> speakerClass(Object speaker) {
        if (speaker instanceof Class) {
            return (Class<?>) speaker;
        }
        return speaker.getClass();
    }

    /**
     * Make a new writable subchannel.
     *
     * @param speaker the class the subchannel speaks for
     * @return a child logger that forwards to this one until it is given
     *         a listener of its own
     */
    public Log kid(Class<?> speaker) {
        return new Log(speaker, this);
    }

    /**
     * Replace this logger's own listener. With a {@code null} listener the
     * logger forwards to its nearest ancestor that has one.
     */
    public void setListener(Consumer<? super String> listener) {
        this.listener = listener;
    }

    /** Find the listener of this logger or of its nearest ancestor. */
    private Consumer<? super String> nearestListener() {
        for (Log node = this; node != null; node = node.parent) {
            if (node.listener != null) {
                return node.listener;
            }
        }
        return null;
    }

    /**
     * Push some notifications. Listeners may lose interest.
     *
     * The message is formatted here, with this logger's speaker, and only
     * then handed up the chain. Previously the ancestor formatted it, so a
     * child's messages were attributed to the ancestor.
     */
    private void push(String content, Level level) {
        Consumer<? super String> sink = nearestListener();
        if (sink != null) {
            sink.accept(String.format("%.2f [%s %s] %s",
                    (System.currentTimeMillis()-start) / 1000.0,
                    level.name(),
                    speaker.getSimpleName(),
                    content));
        }
    }

    /** Log a message at ERROR level. */
    public void error(String message) {
        push(message, Level.ERROR);
    }

    /** Log a message at WARNING level. */
    public void warn(String message) {
        push(message, Level.WARNING);
    }

    /** Log a message at INFO level. */
    public void info(String message) {
        push(message, Level.INFO);
    }

    /** Log a message at DEBUG level. */
    public void debug(String message) {
        push(message, Level.DEBUG);
    }
}
main(String args[]) { 50 | Answer a = new Answer("For luck Kate will only knock on this wood"); 51 | // System.err.println(a.graphs.size()); 52 | // System.out.println("hello"); 53 | double score = 0; 54 | for (SemanticGraph graph : a.getGraphs()) { 55 | 56 | if (graph.getFirstRoot().tag().contains("NN")) { 57 | for (SemanticGraphEdge edge : graph.edgeIterable()) { 58 | 59 | GrammaticalRelation rel = edge.getRelation(); 60 | IndexedWord a1 = edge.getDependent(); 61 | IndexedWord a2 = edge.getGovernor(); 62 | 63 | // System.out.println(a1.originalText()+"Tag: "+a1.tag()); 64 | // System.out.println(a2.originalText()+" Tag: "+a2.tag()+" "+rel.getShortName()+" Relation to "+a1.originalText()+" Tag: "+a1.tag()); 65 | if (a1.tag().contains("NN")) { 66 | score = 1.0; 67 | // return 68 | 69 | } 70 | if (a2.tag().contains("NN")) { 71 | score = 1.0; 72 | // return 73 | 74 | } 75 | 76 | } 77 | 78 | } 79 | } 80 | 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/KV.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.fusesource.lmdbjni.Constants.bytes; 4 | 5 | import java.nio.ByteBuffer; 6 | import java.nio.ByteOrder; 7 | import java.nio.FloatBuffer; 8 | import java.util.Optional; 9 | import java.util.function.Function; 10 | 11 | import org.fusesource.lmdbjni.Constants; 12 | import org.fusesource.lmdbjni.Env; 13 | import org.fusesource.lmdbjni.Transaction; 14 | 15 | public class KV { 16 | public Env db = new Env(); 17 | public KV() { 18 | db.open("data/lmdb", org.fusesource.lmdbjni.Constants.CREATE); 19 | } 20 | 21 | /** 22 | * Get a byte array from the database just as it was stored. 
23 | * @param table Which table to retrieve it from 24 | * @param key Which key you want 25 | * @return byte[] 26 | */ 27 | public Optional get(String table, String key) { 28 | return Optional.ofNullable(db.openDatabase(table).get(bytes(key))); 29 | } 30 | 31 | /** 32 | * Basically just does ((float[]) bytes) which is moderately complex. 33 | * @param bytes 34 | * @return 35 | */ 36 | public static float[] asVector(byte[] bytes) { 37 | FloatBuffer fb = FloatBuffer.allocate((bytes.length + 3) / 4); 38 | fb.put(ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()); 39 | return fb.array(); 40 | } 41 | 42 | /** 43 | * Basically just does ((byte[]) floats) which is moderately complex. 44 | * @param bytes 45 | * @return 46 | */ 47 | public static byte[] asBytes(float[] floats) { 48 | ByteBuffer bb = ByteBuffer.wrap(new byte[floats.length*4]).order(ByteOrder.LITTLE_ENDIAN); 49 | bb.asFloatBuffer().put(floats); 50 | return bb.array(); 51 | } 52 | 53 | 54 | /** 55 | * Non-atomically update an entry or return it. 56 | * This is used for cases reading is common (getting a fast path 57 | * with only a read lock) but writing is not (and might be run twice). 
58 | */ 59 | public String quickGetOrCompute(String table, String key, Function comp) { 60 | return get(table, key).map(Constants::string).orElseGet(() -> { 61 | try (Transaction tx = db.createWriteTransaction()){ 62 | String o = comp.apply(key); 63 | db.openDatabase(tx, table, 0).put(bytes(key), bytes(o)); 64 | return o; 65 | } 66 | }); 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/StrictFilters.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Question; 8 | 9 | public class StrictFilters extends Researcher { 10 | /** 11 | * Perform several strict filters relating mostly to game rules. 12 | * 13 | * 1: Remove J! Archive since it has actual answers. 14 | * 2: Remove "List of *" because that's not the format of an answer. 15 | * 3: Remove any answer inside the question because they don't give the 16 | * answers away in the questions (at least not in a string-match way) 17 | * 4: Remove ultra-long answers because J! never wants a 3-minute speech 18 | * 5: Remove answers not in Latin text 19 | */ 20 | public List question(Question q, List answers) { 21 | List new_answers = new ArrayList<>(); 22 | for (Answer a : answers) { 23 | 24 | // J! Archive has answers 25 | if (a.text.contains("J! Archive")) {} 26 | 27 | // "List of" is a bad sign 28 | else if (a.text.contains("List of")) {} 29 | 30 | // Is the answer in the question? 31 | else if (almostContains(q.text, a.text)) {} 32 | 33 | // Is it too long? 34 | // The longest real answer in our sample of about 40,000 is: 35 | // How much wood would a woodchuck chuck if a woodchuck could chuck wood? 36 | // and it's 70 characters long. So cut there. 
37 | else if (a.getTokens().isEmpty() || a.text.length() > 70) {} 38 | 39 | // Is over half of it non-Latin text? 40 | else if (a.text.replaceAll("[^A-Za-z0-9 ]", "").length() * 2 < a.text.length()) {} 41 | 42 | // Does it look like a web address? 43 | else if (a.text.matches("^(http://)?([A-Za-z]+\\.)?[A-Za-z]+\\.(com|net|org|co\\.[A-Za-z]{2})$")) {} 44 | 45 | else { 46 | new_answers.add(a); 47 | } 48 | } 49 | 50 | log.info("Eliminated " + (answers.size() - new_answers.size()) + " invalid answers"); 51 | return new_answers; 52 | } 53 | 54 | /** 55 | * Check if the question text (left) almost contains the answer text 56 | * (right). 57 | */ 58 | public boolean almostContains(String left, String right) { 59 | // TODO: more stopword removal, etc. 60 | return left.toLowerCase().contains(right.toLowerCase()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/CoreNLPSentenceSimilarityTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.ArrayList; 6 | 7 | import org.junit.Test; 8 | 9 | import edu.stanford.nlp.trees.Tree; 10 | import edu.uncc.cs.watsonsim.Phrase; 11 | import edu.uncc.cs.watsonsim.nlp.Trees; 12 | import edu.uncc.cs.watsonsim.scorers.CommonConstituents; 13 | 14 | public class CoreNLPSentenceSimilarityTest { 15 | 16 | @Test 17 | public void testParseToTree() { 18 | 19 | // Empty case 20 | assertEquals(new ArrayList<>(), Trees.parse("")); 21 | // Simple case 22 | assertEquals(Tree.valueOf("(ROOT (NP (NN Example)))"), Trees.parse("Example").get(0)); 23 | // Challenging case 24 | // fails: "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo." 
25 | // succeeds, or at least it looks generally right to me: 26 | assertEquals(Tree.valueOf("(ROOT (S (NP (NNP Niel) (NNP Armstrong)) " 27 | + "(VP (VBD was) (NP (DT the) (JJ first) (NN man)" 28 | + "(S (VP (TO to) (VP (VB walk) " 29 | + "(PP (IN on) (NP (DT the) (NN moon)))))))) (. .)))"), 30 | Trees.parse("Niel Armstrong was the first man to walk on the moon.").get(0)); 31 | 32 | assertEquals( 33 | Tree.valueOf("(ROOT (S (NP (PRP I)) (VP (VBP am) (ADJP (JJ tall))) (. .)))"), 34 | Trees.parse("I am tall. You are short.").get(0)); 35 | assertEquals( 36 | Tree.valueOf("(ROOT (S (NP (PRP You)) (VP (VBP are) (ADJP (JJ short))) (. .)))"), 37 | Trees.parse("I am tall. You are short.").get(1)); 38 | 39 | } 40 | 41 | @Test 42 | public void testScorePhrases() { 43 | CommonConstituents scorer = new CommonConstituents(); 44 | 45 | 46 | // These are in large part to make sure that it does not accidentally change. 47 | /*assertEquals( 48 | 1.0, 49 | scorer.getCommonSubtreeCount( 50 | new Phrase("this"), 51 | new Phrase("this")), 52 | 0.01 53 | );*/ 54 | assertEquals( 55 | 6.0, 56 | scorer.getCommonSubtreeCount( 57 | new Phrase("My goat knows the bowling score."), 58 | new Phrase("Michael rowed the boat ashore.")), 59 | 0.01 60 | ); 61 | assertEquals( 62 | 12.0, 63 | scorer.getCommonSubtreeCount( 64 | new Phrase("A tisket. A tasket. 
A green and yellow basket."), 65 | new Phrase("A tisket, a tasket, what color is my basket?")), 66 | 0.01 67 | ); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/scripts/BigramBigramIndexer.scala: -------------------------------------------------------------------------------- 1 | package scripts; 2 | 3 | import java.io.File; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import lemurproject.indri.IndexEnvironment; 11 | import lemurproject.indri.ParsedDocument; 12 | import lemurproject.indri.ParsedDocument.TermExtent; 13 | 14 | import org.apache.lucene.analysis.Analyzer; 15 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 16 | import org.apache.lucene.document.Document; 17 | import org.apache.lucene.document.Field; 18 | import org.apache.lucene.document.StoredField; 19 | import org.apache.lucene.document.TextField; 20 | import org.apache.lucene.index.IndexWriter; 21 | import org.apache.lucene.index.IndexWriterConfig; 22 | import org.apache.lucene.store.Directory; 23 | import org.apache.lucene.store.FSDirectory; 24 | import org.apache.lucene.util.Version; 25 | 26 | import privatedata.UserSpecificConstants; 27 | import uncc2014watsonsim.Passage; 28 | import uncc2014watsonsim.Database; 29 | 30 | /** 31 | * This is an experimental bigram-bigram association indexer. 32 | * The point here is to find the most relevant relations between every pair of 33 | * bigrams, according to the pairwise entropy. 34 | * 35 | * The whole thing is designed to run in (maybe 3 GB) memory using bit 36 | * twiddling and primitive arrays for efficiency, hash tables and dynamic 37 | * programming for time complexity, a cache eviction policy for 38 | * memory complexity, and some distributional tweaks for fairness. 39 | * 40 | * This is not exactly tried and true software. 
41 | */ 42 | object BigramBigramIndexer { 43 | val db = new Database(); 44 | 45 | def main(args: Array[String]) { 46 | println("Hello!") 47 | } 48 | 49 | /** 50 | * Fetch rows from the database, extract the text, and tokenize it. 51 | */ 52 | def getRowText() : Stream[Array[String]] = { 53 | val rows = db.prep("SELECT reference, title, text FROM " 54 | + "meta INNER JOIN content ON meta.id=content.id " 55 | + "WHERE source != 'wp-full' and source != 'wiktionary-01'" 56 | + " ORDER BY title;").executeQuery(); 57 | 58 | new Iterator[Array[String]] { 59 | def hasNext = rows.next() 60 | def next() = rows.getString(1).split("[^a-zA-Z]") 61 | }.toStream 62 | } 63 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/Question.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import org.apache.log4j.Logger; 4 | 5 | /** 6 | * An immutable natural language phrase intended to be evaluated as a question 7 | * or clue. 8 | * 9 | * Available annotators (there may be more, these just get you started) 10 | * ClueType.fromClue 11 | * QClassDetection.detectType 12 | * 13 | * @author Sean 14 | */ 15 | public class Question extends Phrase { 16 | public final Answer correct_answer; 17 | private final String category; 18 | private final QType type; 19 | 20 | /** 21 | * Construct a new question for analysis. 
22 | * @param question The natural language clue 23 | * @param correct_answer The target answer, if available (or null) 24 | * @param category The category of the problem, also natural language 25 | */ 26 | public Question(String question, Answer correct_answer, String category) { 27 | super(question); 28 | this.correct_answer = correct_answer; 29 | this.category = category; 30 | this.type = QClassDetection.detectType(this); 31 | this.memo(QClassDetection::detectType); 32 | Logger log = Logger.getLogger(getClass()); 33 | log.info("Looks like a " + type.toString().toLowerCase() + " question"); 34 | } 35 | 36 | /** 37 | * Create a simple question without bells and whistles 38 | */ 39 | public Question(String question) { 40 | this(question, null, ""); 41 | } 42 | 43 | /** 44 | * Create a question from a clue and a hint about it's category 45 | */ 46 | public Question(String question, String category) { 47 | this(question, null, category); 48 | } 49 | 50 | /** 51 | * Create a question with a clue and plain string answer but no category 52 | */ 53 | public static Question known(String question, String answer) { 54 | return known(question, answer, ""); 55 | } 56 | 57 | /** 58 | * Create a question with a clue, a plain string answer, and category 59 | */ 60 | public static Question known(String question, String answer, String category) { 61 | return new Question(question, 62 | new Answer("answer", answer, answer, ""), 63 | category); 64 | } 65 | 66 | public String getCategory() { 67 | return category; 68 | } 69 | 70 | public QType getType() { 71 | return type; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/LATCheck.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import org.apache.log4j.Logger; 4 | 5 | import edu.uncc.cs.watsonsim.Answer; 6 | import edu.uncc.cs.watsonsim.Environment; 7 | 
import edu.uncc.cs.watsonsim.Question; 8 | import edu.uncc.cs.watsonsim.nlp.ClueType; 9 | import edu.uncc.cs.watsonsim.nlp.Relatedness; 10 | import edu.uncc.cs.watsonsim.scorers.AnswerScorer; 11 | 12 | /** 13 | * Check if the question LAT matches one of the answer LATs 14 | * @author Sean 15 | * 16 | */ 17 | public class LATCheck extends AnswerScorer { 18 | private final Relatedness syn; 19 | private final Logger log = Logger.getLogger(getClass()); 20 | 21 | /** 22 | * Create a new LATCheck using a shared environment 23 | */ 24 | public LATCheck(Environment env) { 25 | syn = new Relatedness(env); 26 | } 27 | 28 | @Override 29 | public double scoreAnswer(Question q, Answer a) { 30 | /* 31 | * There are several options here of how to determine synonyms. 32 | * 33 | * Synonym generation approaches: 34 | * 1) Given a label, find the article titles. 35 | * 2)*Given an article title, find the labels. 36 | * 3) Given a label, find the other labels sharing an article title. 37 | * 4) Given a label, find the main article, and all the links to that main article. 38 | * 5) Given two labels, combine the weights of common article titles. 39 | * 40 | * Synonym checking approaches: 41 | * 1)*Synonymize Q's, check against A's 42 | * 2) Synonymize A's, check against Q's 43 | * 3) Synonymize both, combine common results 44 | * 45 | * Right now, we are using (G2, C1). 
46 | */ 47 | /*if (!q.simple_lat.isEmpty()) { 48 | List> question_synonyms = syn.viaWikiLinks(new String[]{q.simple_lat}); 49 | question_synonyms.add(new Weighted(q.simple_lat, 1000.0)); 50 | for (Weighted synonym : question_synonyms) { 51 | for (String candidate_type : a.lexical_types) { 52 | if (syn.matchViaLevenshtein(synonym.item, candidate_type)) { 53 | log.info(a.text + " is a " + synonym.item 54 | + " which is " + q.simple_lat 55 | + " (weight " + Math.log(synonym.weight) + ")"); 56 | return Math.log(synonym.weight); 57 | } 58 | } 59 | } 60 | }*/ 61 | for (String lextype : a.lexical_types) { 62 | if (syn.matchViaSearch(q.memo(ClueType::fromClue), lextype)) 63 | return 1.0; 64 | } 65 | return -1.0; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/ElliotMerschScorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import edu.uncc.cs.watsonsim.Answer; 4 | import edu.uncc.cs.watsonsim.Passage; 5 | import edu.uncc.cs.watsonsim.Phrase; 6 | 7 | public class ElliotMerschScorer extends PassageScorer{ 8 | 9 | public double Scorer (Phrase q, Answer a, Passage p){ 10 | 11 | String Qraw = q.text; 12 | String Ptext = p.text; 13 | String Ptitle = p.title; 14 | 15 | //test variables 16 | //String Qraw = "What is the tallest building?"; 17 | //String Ptext = "The world's tallest man-made structure is the 829.8 m (2,722 ft) tall Burj Khalifa in Dubai, United Arab Emirates. 
The building gained the official title of Tallest Building in the World at its opening on January 4, 2010."; 18 | //String Ptitle = "List of tallest buildings and structures in the world"; 19 | 20 | double score = 0; 21 | 22 | String[] Qsplit = Qraw.split(" "); 23 | String[] PtitleSplit = Ptitle.split(" "); 24 | String[] PtextSplit = Ptext.split(" "); 25 | 26 | //check passage title 27 | for (int i=0; i 0.5) == y) / len(y) 29 | 30 | ### This is the actual question prediction error, in bits 31 | # First, find the probabilities 32 | pred_y = pred * y[border:] # These are the probabilities for right answers 33 | pred_y = pred_y[pred_y.nonzero()] # the same, stripped of 0's 34 | mean_bits = np.mean(-np.log(pred_y) / np.log(2)) # measured in mean bits 35 | 36 | ### This is the literal accuracy - it gets complicated 37 | # Sort the answers by probability, descending (only getting the indices) 38 | confidence_order = np.argsort(pred) 39 | # This indexing trick always takes the last assignment for each index 40 | # This will hold the index of the best answer for each question 41 | best_answer = np.zeros(np.max(q.astype(int))+1) 42 | best_answer[q[confidence_order].astype(int)] = confidence_order 43 | # Take the average correctness of the best answer 44 | accu_by_q = y[border:][best_answer.astype(int)].mean() 45 | 46 | return [C, gamma, accu, mean_bits, accu_by_q, train_time, test_time] 47 | 48 | import code 49 | 50 | def multi(): 51 | from multiprocessing import Pool 52 | p = Pool(40) 53 | ins = [(base**i, base**j) for i in exp_range for j in exp_range] 54 | with open("svmresults-largeimage-smallset.log", "w") as o: 55 | for row in p.imap_unordered(svc, ins): 56 | print '\t'.join(map(str, row)) 57 | o.write('\t'.join(map(str, row)) + '\n') 58 | 59 | code.interact(local=vars()) 60 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/PersonRecognition.java: 
-------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.util.List; 7 | import java.util.logging.Level; 8 | import java.util.logging.Logger; 9 | 10 | import edu.uncc.cs.watsonsim.Answer; 11 | import edu.uncc.cs.watsonsim.Phrase; 12 | import edu.uncc.cs.watsonsim.QType; 13 | import edu.uncc.cs.watsonsim.Question; 14 | import opennlp.tools.namefind.NameFinderME; 15 | import opennlp.tools.namefind.TokenNameFinderModel; 16 | import opennlp.tools.util.Span; 17 | 18 | /** 19 | * 20 | * @author Phani Rahul 21 | */ 22 | public class PersonRecognition extends Researcher { 23 | 24 | private static TokenNameFinderModel model = null; 25 | private static NameFinderME nameFinder = null; 26 | private boolean enabled=true; 27 | 28 | public PersonRecognition() { 29 | InputStream is; 30 | try { 31 | is = new FileInputStream("data/en-ner-person.bin"); 32 | model = new TokenNameFinderModel(is); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | System.err.println("Missing NLP model data. 
Deactivating NameRecognitionResearcher."); 36 | enabled = false; 37 | } 38 | nameFinder = null; 39 | try { 40 | nameFinder = new NameFinderME(model); 41 | } catch (Exception ex) { 42 | Logger.getLogger(PersonRecognition.class.getName()).log(Level.SEVERE, null, ex); 43 | } 44 | } 45 | 46 | @Override 47 | public List question(Question q, List answers) { 48 | if (q.getType() == QType.FITB && enabled){ 49 | answers = super.question(q, answers); 50 | } 51 | return answers; 52 | } 53 | 54 | @Override 55 | public Answer answer(Phrase q, Answer answer) { 56 | Span nameSpans[] = null; 57 | String[] sentence = null; 58 | sentence = answer.text.split("[,'() ]+"); 59 | 60 | nameSpans = nameFinder.find(sentence); 61 | nameFinder.clearAdaptiveData(); 62 | 63 | StringBuilder ret = new StringBuilder(); 64 | for (Span s : nameSpans) { 65 | 66 | for (int i = s.getStart(); i < s.getEnd(); i++) { 67 | ret.append(sentence[i]); 68 | ret.append(" "); 69 | } 70 | } 71 | if (!ret.toString().isEmpty()){ 72 | return answer.withText(ret.toString()); 73 | } 74 | return answer; 75 | } 76 | 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/ApproxStringIntMapTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.Iterator; 6 | 7 | import org.apache.commons.lang3.tuple.Pair; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | public class ApproxStringIntMapTest { 12 | ApproxStringIntMap asim; 13 | @Before 14 | public void setUp() { 15 | asim = new ApproxStringIntMap(new StringStack("moo", "far")); 16 | } 17 | 18 | @Test 19 | public void testSize() { 20 | assertEquals(0, asim.size()); 21 | asim.put("moo", 1); 22 | assertEquals(1, asim.size()); 23 | } 24 | 25 | @Test 26 | public void testIsEmpty() { 27 | assertTrue(asim.isEmpty()); 28 | asim.put("moo", 1); 29 | 
assertFalse(asim.isEmpty()); 30 | } 31 | 32 | @Test 33 | public void testContainsKey() { 34 | assertFalse(asim.containsKey("moo")); 35 | asim.put("moo", 1); 36 | assertTrue(asim.containsKey("moo")); 37 | assertFalse(asim.containsKey("far")); 38 | asim.put("erk", 7); 39 | assertTrue(asim.containsKey("moo")); 40 | assertTrue(asim.containsKey("erk")); 41 | assertFalse(asim.containsKey("far")); 42 | } 43 | 44 | @Test 45 | public void testGetPut() { 46 | assertEquals(0, asim.get("moo")); // ! Keep this in mind! 47 | asim.put("far", 1); 48 | assertEquals(0, asim.get("moo")); 49 | assertEquals(1, asim.get("far")); 50 | asim.put("erk", 2); 51 | assertEquals(0, asim.get("moo")); 52 | assertEquals(2, asim.get("erk")); 53 | } 54 | 55 | @Test 56 | public void testAddTo() { 57 | assertEquals(0, asim.get("moo")); 58 | asim.addTo("moo", 4); 59 | assertEquals(4, asim.get("moo")); 60 | asim.addTo("moo", 4); 61 | assertEquals(8, asim.get("moo")); 62 | } 63 | 64 | @Test 65 | public void testRemove() { 66 | asim.put("moo", 1); 67 | asim.put("far", 2); 68 | assertTrue(asim.containsKey("far")); 69 | asim.remove("far"); 70 | assertFalse(asim.containsKey("far")); 71 | } 72 | 73 | @Test 74 | public void testClear() { 75 | asim.put("moo", 1); 76 | asim.put("far", 2); 77 | assertTrue(asim.containsKey("far")); 78 | asim.clear(); 79 | assertFalse(asim.containsKey("far")); 80 | } 81 | 82 | @Test 83 | public void testIterator() { 84 | asim.put("moo", 1); 85 | asim.put("far", 2); 86 | Iterator> pairs = asim.iterator(); 87 | assertTrue(pairs.hasNext()); 88 | assertEquals(Pair.of("moo", 1), pairs.next()); 89 | assertTrue(pairs.hasNext()); 90 | assertEquals(Pair.of("far", 2), pairs.next()); 91 | assertFalse(pairs.hasNext()); 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/MergeAnswers.java: -------------------------------------------------------------------------------- 1 | package 
edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Question; 9 | 10 | /*Author : Ricky Sanders 11 | * 12 | * Compares answer to answer to merge those that have 3 or more words in common 13 | * Currently keeps the longest answer 14 | * 15 | * WORK IN PROGRESS 16 | */ 17 | 18 | public class MergeAnswers extends Researcher{ 19 | @Override 20 | /** Call merge on any two similar answers */ 21 | public List question(Question q, List answers) { 22 | List> answer_blocks = new ArrayList<>(); 23 | 24 | // Arrange the answers into blocks 25 | each_answer: 26 | for (Answer original : answers) { 27 | HashSet original_terms = new HashSet(); 28 | original_terms.addAll(original.getTokens()); 29 | //return reference_terms.containsAll(StringUtils.tokenize(reference)); 30 | for (List block : answer_blocks) { 31 | for (Answer example : block) { 32 | HashSet example_terms = new HashSet(); 33 | example_terms.addAll(example.getTokens()); 34 | // Look through the examples in this topic 35 | // If it matches, choose to put it in this block and quit. 
36 | 37 | int sizeExample = example_terms.size(); 38 | 39 | example_terms.retainAll(original_terms); 40 | int count = example_terms.size(); 41 | 42 | double percentCorrect = count/(sizeExample + 0.01); 43 | 44 | /** Merge by word count of 3 only */ 45 | 46 | if (count >= 3 || percentCorrect >= 0.5) { 47 | original.log(this, "It restates %s", original); 48 | block.add(original); 49 | continue each_answer; 50 | } 51 | 52 | } 53 | } 54 | 55 | // Make a new topic for this answer 56 | List new_block = new ArrayList<>(); 57 | new_block.add(original); 58 | answer_blocks.add(new_block); 59 | } 60 | 61 | // Merge the blocks 62 | List new_answers = new ArrayList<>(); 63 | for (List block : answer_blocks) { 64 | if (block.size() > 1) { 65 | new_answers.add(Answer.merge(block)); 66 | } else { 67 | new_answers.add(block.get(0)); 68 | } 69 | } 70 | 71 | log.info("Merged " + answers.size() + " candidates into " + new_answers.size() + " (by word similarity)."); 72 | return new_answers; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/URLExpander.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.InputStreamReader; 5 | import com.google.gson.reflect.TypeToken; 6 | 7 | import crawlercommons.fetcher.BaseFetchException; 8 | import crawlercommons.fetcher.http.SimpleHttpFetcher; 9 | import crawlercommons.fetcher.http.UserAgent; 10 | import de.l3s.boilerpipe.BoilerpipeProcessingException; 11 | import de.l3s.boilerpipe.extractors.ArticleExtractor; 12 | import edu.uncc.cs.watsonsim.Answer; 13 | import edu.uncc.cs.watsonsim.Environment; 14 | import edu.uncc.cs.watsonsim.Passage; 15 | import edu.uncc.cs.watsonsim.Phrase; 16 | 17 | 18 | /** Fill in the full text of an answer from it's URL, if it has one */ 19 | public class URLExpander extends 
Researcher { 20 | private SimpleHttpFetcher fetcher; 21 | 22 | private Environment env; 23 | 24 | public URLExpander(Environment env) { 25 | this.env = env; 26 | fetcher = new SimpleHttpFetcher(3, 27 | new UserAgent( 28 | "Watsonsim QA engine (bot)", 29 | "stgallag@gmail.com", 30 | "http://github.com/SeanTater/uncc2014watsonsim", 31 | "Mozilla/5.0", 32 | "10 May 2015")); 33 | 34 | //fetcher.setConnectionTimeout(2000); 35 | //fetcher.setSocketTimeout(2000); 36 | fetcher.setMaxRetryCount(1); 37 | } 38 | 39 | /** 40 | * Get a page from the Internet and clean it. 41 | */ 42 | private String fetch(String key) { 43 | try { 44 | byte[] payload = fetcher.fetch(key.substring(4)).getContent(); 45 | InputStreamReader isr = new InputStreamReader( 46 | new ByteArrayInputStream(payload)); 47 | return ArticleExtractor.INSTANCE.getText(isr); 48 | } catch (BaseFetchException | BoilerpipeProcessingException e) { 49 | // TODO Auto-generated catch block 50 | System.err.println("Can't connect to " + key); 51 | return ""; 52 | } 53 | } 54 | 55 | public Answer answer(Phrase q, Answer a) { 56 | a.passages.replaceAll( p -> { 57 | if (p.reference.startsWith("http") && p.reference.contains(".htm")) { 58 | /* This is roundabout because I really want to avoid 59 | * committing to a character set. (So I don't use String.) 
60 | */ 61 | // Download 62 | String payload = env.computeIfAbsent("url:"+p.reference, 63 | this::fetch, 64 | new TypeToken(){}.getType()); 65 | if (!payload.isEmpty()) { 66 | // Parse 67 | p = new Passage( 68 | "live-url", 69 | p.title, 70 | payload, 71 | p.reference); 72 | a.log(this, "Filled in passage from %s", p.reference); 73 | } 74 | } 75 | return p; 76 | }); 77 | return a; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/StephensonOpenNLPScorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import opennlp.tools.parser.ParserModel; 9 | import opennlp.tools.postag.POSModel; 10 | import opennlp.tools.postag.POSTaggerME; 11 | import opennlp.tools.sentdetect.SentenceDetectorME; 12 | import opennlp.tools.sentdetect.SentenceModel; 13 | import opennlp.tools.util.InvalidFormatException; 14 | 15 | /* 16 | * Author: Chris Stephenson 17 | */ 18 | 19 | public class StephensonOpenNLPScorer { 20 | private boolean modelsAreInitialized=false; 21 | public String modelsPath="data/"; //models directory 22 | private File parserMFile; 23 | private File sentDetectorMFile; 24 | private File chunkerMFile; 25 | private File posMFile; 26 | 27 | public SentenceModel sentenceModel; //sentence detection model 28 | public ParserModel parserModel; //parsing model 29 | public POSTaggerME tagger; 30 | 31 | 32 | public void init() throws InvalidFormatException{ 33 | File modelsDir = new File(this.modelsPath); 34 | 35 | this.parserMFile = new File(modelsDir, "en-parser-chunking.bin"); 36 | this.sentDetectorMFile = new File(modelsDir, "en-sent.bin"); 37 | this.chunkerMFile=new File(modelsDir,"en-chunker.bin"); 38 | this.posMFile = new 
File(modelsDir,"en-pos-maxent.bin"); 39 | 40 | InputStream sentModelIn = null; 41 | FileInputStream parserStream; 42 | try { 43 | //for finding sentences 44 | sentModelIn = new FileInputStream(sentDetectorMFile); 45 | this.sentenceModel = new SentenceModel(sentModelIn); 46 | //for finding POS 47 | FileInputStream posModelStream = new FileInputStream(posMFile); 48 | POSModel model = new POSModel(posModelStream); 49 | this.tagger = new POSTaggerME(model); 50 | //for parsing 51 | parserStream = new FileInputStream(parserMFile); 52 | this.parserModel = new ParserModel(parserStream); 53 | } catch (FileNotFoundException e2) { 54 | // TODO Auto-generated catch block 55 | e2.printStackTrace(); 56 | } catch (IOException e) { 57 | // TODO Auto-generated catch block 58 | e.printStackTrace(); 59 | } 60 | this.modelsAreInitialized=true; 61 | } 62 | 63 | public void testSentDetector(String testSents) throws InvalidFormatException{ 64 | init(); 65 | SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); 66 | String[] sentences = sentenceDetector.sentDetect(testSents); 67 | for (int i=0;i answers) { 44 | for (Answer a : answers) { 45 | double sum = 0.0; 46 | final int p_count = a.passages.size(); 47 | if (p_count > 0) { 48 | double[] scores = new double[p_count]; 49 | for (int pi=0; piint relations, 12 | * and later when you iterate it guesses the hash->String relation using a 13 | * dictionary. 
14 | * @author Sean 15 | */ 16 | public class ApproxStringIntMap implements Iterable> { 17 | StringStack dict; 18 | IntIntOpenHashMap map = new IntIntOpenHashMap(); 19 | 20 | private int hash(String x) { 21 | byte[] b = x.getBytes(); 22 | return MurmurHash2.hash(b, 0, 0, b.length); 23 | } 24 | 25 | /** Create an approximate String-int map using a shared dictionary */ 26 | public ApproxStringIntMap(StringStack dictionary) { 27 | dict = dictionary; 28 | } 29 | 30 | public int size() { 31 | return map.size(); 32 | } 33 | 34 | public boolean isEmpty() { 35 | return size() == 0; 36 | } 37 | 38 | public boolean containsKey(String key) { 39 | return map.containsKey(hash(key)); 40 | } 41 | 42 | public int get(String key) { 43 | return map.get(hash(key)); 44 | } 45 | 46 | public int put(String key, int value) { 47 | return map.put(hash(key), value); 48 | } 49 | 50 | public int addTo(String key, int amount) { 51 | return map.addTo(hash(key), amount); 52 | } 53 | 54 | public int remove(String key) { 55 | return map.remove(hash(key)); 56 | } 57 | 58 | public void clear() { 59 | map.clear(); 60 | } 61 | 62 | /** 63 | * Iterate the entries in this map - linear in complexity to the vocabulary 64 | * size! 
65 | */ 66 | public Iterator> iterator() { 67 | return new StringIntMapIterator(this); 68 | } 69 | 70 | private class StringIntMapIterator implements Iterator> { 71 | private final Iterator dictiter; 72 | private Pair next_item; 73 | private ApproxStringIntMap asim; 74 | StringIntMapIterator(ApproxStringIntMap asim) { 75 | this.dictiter = asim.dict.iterator(); 76 | this.asim = asim; 77 | } 78 | 79 | @Override 80 | public boolean hasNext() { 81 | while (next_item == null && dictiter.hasNext()) { 82 | String key = dictiter.next(); 83 | if (asim.containsKey(key)) 84 | next_item = Pair.of(key, asim.get(key)); 85 | } 86 | return next_item != null; 87 | } 88 | 89 | @Override 90 | public Pair next() { 91 | Pair item = next_item; 92 | next_item = null; 93 | return item; 94 | } 95 | 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/index/Bigrams.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.index; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.nio.file.StandardOpenOption; 7 | import java.util.concurrent.ConcurrentHashMap; 8 | import java.util.stream.Stream; 9 | 10 | import org.apache.log4j.Logger; 11 | 12 | import edu.stanford.nlp.util.IterableIterator; 13 | import edu.uncc.cs.watsonsim.Passage; 14 | 15 | /** 16 | * Count the bigrams in all passages for entropy based scorers 17 | * @author Sean Gallaghers 18 | */ 19 | public class Bigrams implements Segment { 20 | private ConcurrentHashMap unigrams = new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50); 21 | private ConcurrentHashMap bigrams = new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50); 22 | private final Logger log = Logger.getLogger(getClass()); 23 | 24 | public Bigrams() { 25 | } 26 | 27 | @Override 28 | public void close() throws IOException { 29 | flush(); 30 | } 31 | 32 | public 
void flush() throws IOException { 33 | // Make space-separated lines 34 | Stream lines = unigrams.entrySet().stream() 35 | .map((pair) -> 36 | pair.getKey() + " " + pair.getValue()); 37 | unigrams= new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50); 38 | Files.write( 39 | Paths.get("/mnt/NCDS/sean", "unigrams"), 40 | new IterableIterator(lines.iterator()), 41 | StandardOpenOption.CREATE, 42 | StandardOpenOption.WRITE, 43 | StandardOpenOption.APPEND); 44 | // Make space-separated lines 45 | lines = bigrams.entrySet().stream() 46 | .map((pair) -> 47 | pair.getKey() + " " + pair.getValue()); 48 | bigrams =new ConcurrentHashMap<>(1_000_000, (float) 0.75, 50); 49 | Files.write( 50 | Paths.get("/mnt/NCDS/sean", "bigrams"), 51 | new IterableIterator(lines.iterator()), 52 | StandardOpenOption.CREATE, 53 | StandardOpenOption.WRITE, 54 | StandardOpenOption.APPEND); 55 | } 56 | 57 | @Override 58 | public void accept(Passage t) { 59 | if (!t.getTokens().isEmpty()) { 60 | unigrams.merge(t.getTokens().get(0), 1, (a, b) -> a+b); 61 | } 62 | for (int i=0; i < t.getTokens().size() - 1; i++) { 63 | String key = t.getTokens().get(i) + " " + t.getTokens().get(i+1); 64 | bigrams.merge(key, 1, (a, b) -> a+b); 65 | unigrams.merge(t.getTokens().get(i+1), 1, (a, b) -> a+b); 66 | } 67 | // Try to keep it from absorbing all available memory 68 | if (unigrams.size() > 1_000_000 69 | || bigrams.size() > 1_000_000) { 70 | try { 71 | flush(); 72 | } catch (IOException failed_flush) { 73 | log.error(failed_flush); 74 | } 75 | } 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/StringStackTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import java.util.Iterator; 6 | 7 | import org.junit.Test; 8 | 9 | public class StringStackTest { 10 | 11 | @Test 12 | public void 
testSize() { 13 | assertEquals(0, new StringStack().size()); 14 | assertEquals(1, new StringStack("moo").size()); 15 | assertEquals(2, new StringStack("foo", "bar").size()); 16 | } 17 | 18 | @Test 19 | public void testIsEmpty() { 20 | assertTrue(new StringStack().isEmpty()); 21 | assertFalse(new StringStack("moo").isEmpty()); 22 | } 23 | 24 | @Test 25 | public void testContains() { 26 | assertFalse(new StringStack().contains("moo")); 27 | assertTrue(new StringStack("moo").contains("moo")); 28 | assertFalse(new StringStack("moo").contains("foobar")); 29 | assertTrue(new StringStack("foo", "moo").contains("foo")); 30 | } 31 | 32 | @Test 33 | public void testAdd() { 34 | StringStack ss = new StringStack(); 35 | assertEquals(0, ss.size()); 36 | ss.add("moo"); 37 | assertEquals(1, ss.size()); 38 | assertFalse(ss.contains("erk")); 39 | assertTrue(ss.contains("moo")); 40 | ss.add("moo"); 41 | assertEquals(2, ss.size()); 42 | ss.add("erk"); 43 | assertEquals(3, ss.size()); 44 | assertTrue(ss.contains("erk")); 45 | assertTrue(ss.contains("moo")); 46 | } 47 | 48 | @Test 49 | public void testClear() { 50 | StringStack ss = new StringStack("moo"); 51 | ss.clear(); 52 | assertEquals(0, ss.size()); 53 | assertFalse(ss.contains("moo")); 54 | } 55 | 56 | @Test 57 | public void testGet() { 58 | StringStack ss = new StringStack("moo", "far"); 59 | assertEquals(null, ss.get(-1)); 60 | assertEquals("moo", ss.get(0)); 61 | assertEquals("far", ss.get(1)); 62 | assertEquals(null, ss.get(2)); 63 | 64 | } 65 | 66 | @Test 67 | public void testIndexOf() { 68 | StringStack ss = new StringStack("moo", "far"); 69 | assertEquals(0, ss.indexOf("moo")); 70 | assertEquals(1, ss.indexOf("far")); 71 | assertEquals(-1, ss.indexOf("erk")); 72 | assertEquals(-1, ss.indexOf(null)); 73 | } 74 | 75 | @Test 76 | public void testIterator() { 77 | StringStack ss = new StringStack("moo", "far"); 78 | Iterator iters = ss.iterator(); 79 | assertTrue(iters.hasNext()); 80 | assertEquals("moo", iters.next()); 81 | 
assertTrue(iters.hasNext()); 82 | assertEquals("far", iters.next()); 83 | assertFalse(iters.hasNext()); 84 | 85 | // Check that it's repeatable 86 | iters = ss.iterator(); 87 | assertTrue(iters.hasNext()); 88 | assertEquals("moo", iters.next()); 89 | assertTrue(iters.hasNext()); 90 | assertEquals("far", iters.next()); 91 | assertFalse(iters.hasNext()); 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/StatsDump.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Files; 5 | import java.nio.file.Path; 6 | import java.nio.file.Paths; 7 | 8 | import static java.nio.file.StandardOpenOption.*; 9 | 10 | import java.nio.charset.Charset; 11 | import java.sql.Timestamp; 12 | import java.util.List; 13 | import edu.uncc.cs.watsonsim.Answer; 14 | import edu.uncc.cs.watsonsim.Environment; 15 | import edu.uncc.cs.watsonsim.Question; 16 | import edu.uncc.cs.watsonsim.Score; 17 | 18 | import org.json.simple.*; 19 | 20 | public class StatsDump extends Researcher { 21 | private JSONObject jrun = new JSONObject(); 22 | private JSONArray jquestions = new JSONArray(); 23 | private final Path logfile; 24 | 25 | /** 26 | * Start a new run in the reports tables. 27 | */ 28 | @SuppressWarnings("unchecked") 29 | public StatsDump(Timestamp run_id, Environment env) { 30 | this.logfile = Paths.get("data/run_log_"+run_id.toString()); 31 | 32 | jrun.put("timestamp", run_id.toString()); 33 | jrun.put("questions", jquestions); 34 | } 35 | 36 | /** 37 | * Store a question with its answers and scores in the reports tables. 
38 | */ 39 | @SuppressWarnings("unchecked") 40 | @Override 41 | public synchronized List question(Question q, List answers) { 42 | JSONObject jquestion = new JSONObject(); 43 | jquestion.put("text", q.text); 44 | jquestion.put("category", q.getCategory()); 45 | jquestion.put("graphs", q.getGraphs().toString()); 46 | jquestion.put("trees", q.getTrees().toString()); 47 | jquestion.put("tokens", q.getTokens().toString()); 48 | // defaults 49 | jquestion.put("correct", false); 50 | jquestion.put("rank", -1); 51 | 52 | JSONArray janswers = new JSONArray(); 53 | jquestion.put("answers", janswers); 54 | 55 | for (int rank=answers.size()-1; rank>=0; rank--) { 56 | Answer a = answers.get(rank); 57 | JSONObject ja = new JSONObject(); 58 | janswers.add(ja); 59 | 60 | ja.put("text", a.text); 61 | ja.put("evidence", a.explain()); 62 | boolean correct = a.scores.get("CORRECT") > 0.99; 63 | ja.put("correct", correct); 64 | 65 | // Convenience attributes 66 | if (rank==0) 67 | jquestion.put("correct", correct); 68 | if (correct) 69 | jquestion.put("rank", rank); 70 | 71 | JSONObject jscores = new JSONObject(); 72 | ja.put("scores", jscores); 73 | 74 | ja.putAll(Score.asMap(a.scores)); 75 | } 76 | try { 77 | Files.write(logfile, jquestion.toJSONString().getBytes(Charset.forName("UTF-8")), APPEND, CREATE); 78 | } catch (IOException e) { 79 | // Silently skip writing the question 80 | } 81 | return answers; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /scripts/gensim/intro-1level.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 4 | from gensim import corpora, models, similarities 5 | 6 | # remove common words and tokenize 7 | stoplist = set('for a of the and to in'.split()) 8 | 9 | ### Create the corpus out of the documents 10 | if 
os.path.exists('word8-lines.short.corpus.mm'): 11 | dictionary = corpora.Dictionary.load("word8-lines.short.dict") 12 | corpus = corpora.MmCorpus('word8-lines.short.corpus.mm') 13 | lsi = models.LsiModel.load('word8-lines.short.lsimodel') 14 | index = similarities.MatrixSimilarity.load("word8-lines.short.matsim") 15 | else: 16 | # collect statistics about all tokens 17 | dictionary = corpora.Dictionary(line.lower().split() for line in open('word8-lines.short')) 18 | # remove stop words and words that appear only once 19 | stop_ids = [dictionary.token2id[stopword] for stopword in stoplist 20 | if stopword in dictionary.token2id] 21 | once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1] 22 | dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once 23 | dictionary.compactify() # remove gaps in id sequence after words that were removed 24 | dictionary.save('word8-lines.short.dict') 25 | print(dictionary) 26 | 27 | ### Preprocessing 28 | class MyCorpus(object): 29 | def __len__(self): 30 | i=0 31 | for line in open("word8-lines.short"): 32 | i += 1 33 | return i 34 | 35 | def __iter__(self): 36 | for line in open('word8-lines.short'): 37 | # assume there's one document per line, tokens separated by whitespace 38 | yield dictionary.doc2bow(line.lower().split()) 39 | 40 | corpus = MyCorpus() 41 | corpora.MmCorpus.serialize('word8-lines.short.corpus.mm', corpus) # store to disk, for later use 42 | 43 | ### Creating the index 44 | tfidf = models.TfidfModel(corpus) 45 | corpus_tfidf = tfidf[corpus] 46 | lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300) # initialize an LSI transformation 47 | lsi.save('word8-lines.short.lsimodel') 48 | #corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi 49 | 50 | index = similarities.MatrixSimilarity(lsi[corpus], num_features=300) 51 | index.save('word8-lines.short.matsim') 52 | 53 | ## Get 
a query 54 | query = raw_input("Search: ") 55 | while query: 56 | vec = dictionary.doc2bow(query.lower().split()) 57 | 58 | sims = index[lsi[vec]] 59 | print(sorted(list(enumerate(sims)), key=lambda x: -x[1])[:20]) 60 | query = raw_input("Search: ") 61 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/JM_Scorer.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scorers; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | 7 | import edu.uncc.cs.watsonsim.Answer; 8 | import edu.uncc.cs.watsonsim.Passage; 9 | import edu.uncc.cs.watsonsim.Phrase; 10 | import opennlp.tools.cmdline.parser.ParserTool; 11 | import opennlp.tools.parser.Parse; 12 | import opennlp.tools.parser.Parser; 13 | import opennlp.tools.parser.ParserFactory; 14 | import opennlp.tools.parser.ParserModel; 15 | import opennlp.tools.postag.POSModel; 16 | import opennlp.tools.postag.POSTaggerME; 17 | import opennlp.tools.tokenize.Tokenizer; 18 | import opennlp.tools.tokenize.TokenizerME; 19 | import opennlp.tools.tokenize.TokenizerModel; 20 | import opennlp.tools.util.InvalidFormatException; 21 | 22 | public class JM_Scorer extends PassageScorer{ 23 | public double matchChildren(Parse pa1, Parse pa2) { 24 | String p1NodeLabel = pa1.getLabel(); 25 | String p2NodeLabel = pa2.getLabel(); 26 | Parse[] children1 = pa1.getChildren(); 27 | Parse[] children2 = pa2.getChildren(); 28 | double matchFound = 0; 29 | 30 | if (pa1 == null || pa2 == null) { 31 | return 0; 32 | } 33 | 34 | if (p1NodeLabel.equals(p2NodeLabel)) { 35 | if (pa1.getCoveredText().equals(pa2.getCoveredText())) { 36 | matchFound = 1; 37 | } 38 | } 39 | 40 | return matchFound + matchChildren(children1[0], children2[0]) + matchChildren(children1[1], children2[1]); 41 | } 42 | 43 | //a simple scorer based on the number of matches; requires the first string 
to be in the passage 44 | public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException, IOException{ 45 | POSTaggerME parserModel = new POSTaggerME(new POSModel(new FileInputStream(new File("en-pos-model.bin")))); 46 | Tokenizer tokenizer = new TokenizerME(new TokenizerModel(new FileInputStream(new File("en-token.bin")))); 47 | Parser parser = ParserFactory.create(new ParserModel(new FileInputStream(new File("en-parser.bin")))); 48 | double score = 0; 49 | 50 | Parse[] questionParse = ParserTool.parseLine(q, parser, 1); 51 | Parse[] passageParse = ParserTool.parseLine(q, parser, 1); 52 | 53 | if (passage.contains(ca)) { 54 | for (int i =0; i < questionParse.length; i++) { 55 | score += matchChildren(questionParse[i],passageParse[i]); 56 | } 57 | } 58 | 59 | return score; 60 | } 61 | 62 | public double scorePassage(Phrase q, Answer a, Passage p) { 63 | try { 64 | p.score("JM_Scorer", scoreStructure(q.text, a.text, p.text, false)); 65 | } catch (InvalidFormatException e) { 66 | e.printStackTrace(); 67 | } catch (IOException e) { 68 | e.printStackTrace(); 69 | } 70 | return Double.NaN; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/resources/public/stylesheets/index.css: -------------------------------------------------------------------------------- 1 | .navbar { 2 | margin-bottom: 0; 3 | } 4 | 5 | .navbar-brand { 6 | font-family: 'Alegreya Sans', sans-serif; 7 | color: white; 8 | font-size: 225%; 9 | vertical-align: middle; 10 | } 11 | 12 | .navbar-brand small { 13 | position: relative; 14 | top: 4px; 15 | right: 5px; 16 | display: inline-block; 17 | transform: rotate(-20deg); 18 | color: #ccc; 19 | font-size: 65%; 20 | } 21 | 22 | body, .jumbotron { 23 | background: rgb(245, 235, 207); 24 | } 25 | 26 | .jumbotron { 27 | font-family: 'Schoolbell', cursive; 28 | font-size: x-large; 29 | color: white; 30 | background: #9b9; 31 | border: 8px solid #ddd; 32 
| border-bottom-width: 15px; 33 | margin-top: 1em; 34 | padding: 0.25em 1em; 35 | min-height: 10em; 36 | } 37 | 38 | #results { 39 | list-style-type: none; 40 | } 41 | 42 | #results .answer-text { 43 | font-family: 'Schoolbell', cursive; 44 | font-size: x-large; 45 | } 46 | 47 | #results .answer { 48 | display: block; 49 | margin-bottom: 0.2em; 50 | } 51 | 52 | #results .answer-bar { 53 | /* A chalk background */ 54 | display: inline-block; 55 | margin-right: 1em; 56 | padding: 0.15em 0; 57 | background-color: #f7f7f7; 58 | background: 59 | url(/chalk-left-end.png) left center no-repeat, 60 | url(/chalk-bar.png) left center repeat-x, 61 | url(/chalk-right-end.png) right center no-repeat; 62 | background-size: 10px, 220px, 10px; 63 | } 64 | 65 | .answer-details { 66 | font-size: medium; 67 | font-family: sans-serif; 68 | } 69 | 70 | .answer-details .panel { 71 | color: initial; 72 | } 73 | 74 | #console, .console { 75 | list-style-type: none; 76 | background-color: black; 77 | color: #ddd; 78 | font-family: monospace; 79 | } 80 | 81 | #console { 82 | display: none; 83 | height: 20em; 84 | overflow-y: scroll; 85 | } 86 | 87 | /* columns of same height styles */ 88 | 89 | .row-full-height { 90 | height: 100%; 91 | } 92 | .col-full-height { 93 | height: 100%; 94 | vertical-align: middle; 95 | } 96 | .row-same-height { 97 | display: table; 98 | width: 100%; 99 | /* fix overflow */ 100 | table-layout: fixed; 101 | } 102 | .col-xs-height { 103 | display: table-cell; 104 | float: none !important; 105 | } 106 | 107 | @media (min-width: 768px) { 108 | .col-sm-height { 109 | display: table-cell; 110 | float: none !important; 111 | } 112 | } 113 | @media (min-width: 992px) { 114 | .col-md-height { 115 | display: table-cell; 116 | float: none !important; 117 | } 118 | } 119 | @media (min-width: 1200px) { 120 | .col-lg-height { 121 | display: table-cell; 122 | float: none !important; 123 | } 124 | } -------------------------------------------------------------------------------- 
/src/main/java/edu/uncc/cs/watsonsim/researchers/Researcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.uncc.cs.watsonsim.Answer; 7 | import edu.uncc.cs.watsonsim.Log; 8 | import edu.uncc.cs.watsonsim.Phrase; 9 | import edu.uncc.cs.watsonsim.Question; 10 | 11 | /** Researchers can modify questions and have the guarantee of running 12 | * sequentially. They also do not return double's because they are not expected 13 | * to do scoring. If they do, they can use score() themselves. Consider using 14 | * Scorer instead for that, which is parallelizable. 15 | */ 16 | abstract public class Researcher { 17 | /** 18 | * The empty researcher does nothing. 19 | */ 20 | public static final Researcher NIL = new Researcher() { 21 | public List pull(Question q, List answers){return answers;} 22 | }; 23 | 24 | /** 25 | * The previous item in the research chain 26 | */ 27 | protected Researcher chain = NIL; 28 | 29 | /** 30 | * Output to the user. (This is a multi-user app so each pipeline needs to 31 | * know where to push new results.) 32 | */ 33 | protected Log log = Log.NIL; 34 | 35 | /** 36 | * Join together segments of a (recursive) Researcher pipeline. 37 | * The idea of it is that you can "pull" a question through it by passing 38 | * it to pull() of the last Researcher segment. 39 | * 40 | * @param segments Pipe segments, which will be mutated (for the chain) 41 | * @return The last Researcher in the line 42 | */ 43 | public static Researcher pipe(Log output, Researcher... 
segments) { 44 | Researcher prev = NIL; 45 | for (Researcher link : segments) { 46 | link.chain = prev; 47 | link.log = output.kid(link.getClass()); 48 | prev = link; 49 | } 50 | return prev; 51 | } 52 | 53 | /** 54 | * Wrapper method to pull questions through the research chain 55 | */ 56 | public List pull(Question q, List candidates) { 57 | return question(q, chain.pull(q, candidates)); 58 | } 59 | 60 | /** Default implementation of research for a question. 61 | * Simply calls research_answer for every Answer 62 | * Override this if you need more power. 63 | * @param question 64 | * @throws Exception 65 | */ 66 | public List question(Question q, List candidates) { 67 | List outs = new ArrayList<>(); 68 | for (Answer in : candidates) 69 | outs.add(answer(q, in)); 70 | return outs; 71 | } 72 | 73 | /** Default implementation for researching an answer. 74 | * Does nothing by default. You don't need to override this if you don't 75 | * use it. 76 | * @param q TODO 77 | * @param answer 78 | * 79 | * @return TODO 80 | */ 81 | public Answer answer(Phrase q, Answer a) { 82 | return a; 83 | } 84 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/nlp/DenseVectors.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.nlp; 2 | 3 | import java.util.List; 4 | import java.util.Optional; 5 | import java.util.stream.Stream; 6 | 7 | import edu.uncc.cs.watsonsim.KV; 8 | 9 | public class DenseVectors { 10 | public static final int N = 300; 11 | private static final KV kv = new KV(); 12 | 13 | /** 14 | * Possibly get a vector context for a word (otherwise an empty Optional) 15 | * @param word The word in question 16 | * @return A Optional for that word, or Optional.empty() 17 | */ 18 | public static Optional vectorFor(String word) { 19 | if (word == null || word.isEmpty()) { 20 | return Optional.empty(); 21 | } else { 22 | return 
kv.get("big-glove", word).map(KV::asVector); 23 | } 24 | } 25 | 26 | /** 27 | * Find the cosine similarity of two vectors, which may or may not exist. 28 | * This is pessimistic, saying that if we have never seen a word before, it 29 | * is probably unrelated to everyone 30 | * @return 31 | */ 32 | public static double sim(float[] left, float[] right) { 33 | /* 34 | * A.T * B 35 | * ----------------------- 36 | * sqrt(A.T*A) sqrt(B.T*B) 37 | */ 38 | assert left.length == N; 39 | assert right.length == N; 40 | double ab = 0.0, aa = 0.0, bb = 0.0; 41 | for (int i=0; i left, Optional right) { 54 | if (left.isPresent() && right.isPresent()) 55 | return sim(left.get(), right.get()); 56 | else 57 | return 0.0; 58 | } 59 | 60 | /** 61 | * Average some vectors, as a multi-word model. This is not very meaningful 62 | * and may do strange things for the semantics. (e.g. we plan to do better) 63 | */ 64 | public static float[] mean(List vecs) { 65 | float[] mean = new float[N]; 66 | int count = 0; 67 | for (float[] vec: vecs) { 68 | for (int i=0; i0) for (int i=0; i vecs) { 80 | float[] logprod = new float[N]; 81 | int count = 0; 82 | for (float[] vec: vecs) { 83 | for (int i=0; i { 11 | /** start_byte[i] = x --> word i starts at block[x], ends at block[start_byte[i+1]] 12 | * The last element is where the free space begins. */ 13 | IntArrayList start_byte = IntArrayList.from(0); 14 | ByteArrayList block = new ByteArrayList(); 15 | 16 | /** Create a string stack from some existing strings */ 17 | public StringStack(String... xs) { 18 | for (String x: xs) 19 | add(x); 20 | } 21 | 22 | /** Create a string stack from some existing strings */ 23 | public StringStack(Iterable xs) { 24 | for (String x: xs) 25 | add(x); 26 | } 27 | 28 | /** How many strings are inside? */ 29 | public int size() { 30 | return start_byte.size() - 1; 31 | } 32 | 33 | /** Does it have at least one string? 
*/ 34 | public boolean isEmpty() { 35 | return size() == 0; 36 | } 37 | 38 | /** Does this contain string x? (O(n) - and expensive)*/ 39 | public boolean contains(String o) { 40 | for (String x: this) { 41 | if (x.equals(o)) return true; 42 | } 43 | return false; 44 | } 45 | 46 | /** Add a string */ 47 | public boolean add(String e) { 48 | block.add(e.getBytes(UTF_8)); 49 | start_byte.add(block.size()); 50 | return true; 51 | 52 | } 53 | 54 | /** Remove all contents */ 55 | public void clear() { 56 | start_byte.clear(); 57 | start_byte.add(0); 58 | block.clear(); 59 | } 60 | 61 | /** Get a string by index */ 62 | public String get(int index) { 63 | if (0 <= index && index + 1 < start_byte.size()) { 64 | int offset = start_byte.get(index); 65 | int length = start_byte.get(index+1) - offset; 66 | return new String(block.buffer, offset, length); 67 | } else { 68 | return null; 69 | } 70 | } 71 | 72 | /** Find string x (O(n) - and expensive) */ 73 | public int indexOf(String o) { 74 | int i = 0; 75 | for (String x: this) { 76 | if (x.equals(o)) return i; 77 | else i++; 78 | } 79 | return -1; 80 | } 81 | 82 | /** Iterate a StringList */ 83 | public Iterator iterator() { 84 | return new StringListIterator(this); 85 | } 86 | 87 | private class StringListIterator implements Iterator { 88 | private int index = 0; 89 | private final StringStack sl; 90 | 91 | public StringListIterator(StringStack sl) { 92 | this.sl = sl; 93 | } 94 | 95 | @Override 96 | public boolean hasNext() { 97 | return index < sl.size(); 98 | } 99 | 100 | @Override 101 | public String next() { 102 | return sl.get(index++); 103 | } 104 | 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scripts/WiktionaryParser.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.scripts; 2 | import java.io.BufferedReader; 3 | import java.io.BufferedWriter; 4 | 
import java.io.FileReader; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | 8 | 9 | public class WiktionaryParser { 10 | public static void main (String[] args) throws IOException{ 11 | String title = ""; 12 | String def = ""; 13 | int defNum = 0; 14 | try(BufferedReader br = new BufferedReader(new FileReader("Test2.xml"))) { 15 | String line = br.readLine(); 16 | FileWriter fstream = new FileWriter("out.txt"); 17 | BufferedWriter out = new BufferedWriter(fstream); 18 | while (line != null) { 19 | if(line.contains("")){ 20 | defNum++; 21 | out.newLine(); 22 | out.newLine(); 23 | line = br.readLine(); 24 | outerloop: 25 | while ((line.contains("")) != true){ 26 | 27 | if (line.contains("") && (line.contains("Wiktionary") == false)){ 28 | out.write("____________________________________"); 29 | out.newLine(); 30 | out.newLine(); 31 | title = line; 32 | out.write("<DOC>"); 33 | out.newLine(); 34 | out.write("<TITLE>"); 35 | title = title.replaceAll("<title>", "").replaceAll("", ""); 36 | title = title.trim(); 37 | out.write(title); 38 | out.write(""); 39 | out.newLine(); 40 | out.write(""); 41 | }else if(line.contains("") && (line.contains("Wiktionary") == true)){ 42 | defNum = 0; 43 | break outerloop; 44 | } 45 | if (line.contains("# ")){ 46 | def = line; 47 | def = def.replace("[", ""); 48 | def = def.replace("]", ""); 49 | def = def.replace("{", ""); 50 | def = def.replace("}", ""); 51 | out.write(def); 52 | out.newLine(); 53 | 54 | } 55 | if (line.contains("===Etymology===")){ 56 | line = br.readLine(); 57 | while(line.contains("===") != true){ 58 | if(line.contains("*")){ 59 | line = line.replace("[", ""); 60 | line = line.replace("]", ""); 61 | line = line.replace("{", ""); 62 | line = line.replace("}", ""); 63 | out.write(line); 64 | out.newLine(); 65 | 66 | } 67 | 68 | line = br.readLine(); 69 | } 70 | out.newLine(); 71 | 72 | } 73 | 74 | line = br.readLine(); 75 | 76 | } 77 | 78 | out.write("</TEXT>"); 79 | out.newLine(); 80 | 
out.write("</DOC>"); 81 | } 82 | line = br.readLine(); 83 | 84 | } 85 | System.out.println(defNum + " definitions exported to out.txt"); 86 | out.close(); 87 | } 88 | 89 | 90 | 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/Database.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import java.nio.FloatBuffer; 4 | import java.sql.Array; 5 | import java.sql.Connection; 6 | import java.sql.DriverManager; 7 | import java.sql.PreparedStatement; 8 | import java.sql.ResultSet; 9 | import java.sql.SQLException; 10 | 11 | 12 | public class Database { 13 | private static Connection conn; 14 | 15 | public Database(Configuration env) { 16 | try { 17 | //Class.forName("org.sqlite.JDBC"); 18 | //Properties props = new Properties(); 19 | //props.put("busy_timeout", "30000"); 20 | //conn = DriverManager.getConnection("jdbc:sqlite:/mnt/NCDS/sean/06Jan2014.3.watsonsim.db", props); 21 | 22 | // JDBC's SQLite uses autocommit (So commit() is redundant) 23 | // Furthermore, close() is a no-op as long as the results are commit()'d 24 | 25 | 26 | //Class.forName("org.postgresql.Driver"); 27 | if (conn == null) { 28 | conn = DriverManager.getConnection(env.getConfOrDie("jdbc_connection_string")); 29 | if (backend().startsWith("SQLite")) { 30 | //conn.createStatement().execute("PRAGMA journal_mode = WAL;"); 31 | //conn.createStatement().execute("PRAGMA busy_timeout = 30000;"); 32 | //conn.createStatement().execute("PRAGMA synchronous = OFF;"); 33 | } 34 | } 35 | //conn.createStatement().execute("PRAGMA busy_timeout = 30000;"); 36 | //System.err.println(conn.getClass().getName()); 37 | 38 | } catch (SQLException e2) { 39 | e2.printStackTrace(); 40 | throw new RuntimeException("Can't run without a database."); 41 | } 42 | } 43 | 44 | /** Simple wrapper for creating an SQL statement */ 45 | public PreparedStatement 
prep(String sql) { 46 | PreparedStatement ps; 47 | try { 48 | ps = conn.prepareStatement(sql); 49 | ps.setFetchSize(100); 50 | } catch (SQLException e) { 51 | e.printStackTrace(); 52 | throw new RuntimeException("Can't prepare an SQL statement \"" + sql + "\""); 53 | } 54 | return ps; 55 | } 56 | 57 | public void commit() { 58 | try { 59 | if (!conn.getAutoCommit()) { 60 | conn.commit(); 61 | } 62 | } catch (SQLException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | 68 | /** 69 | * This is a convenience method for getting the first item after executing 70 | * a prepared statement. 71 | * 72 | * This is useful for statements ending in "RETURNING __;" 73 | * 74 | * @param ps The statement to run 75 | * @return The ResultSet, moved forward one result 76 | * @throws SQLException 77 | */ 78 | public ResultSet then(PreparedStatement ps) throws SQLException { 79 | ResultSet rs = ps.executeQuery(); 80 | rs.next(); 81 | return rs; 82 | } 83 | 84 | /** 85 | * A simple delegate for creating Postgres arrays 86 | */ 87 | public Array createArrayOf(String typeName, Object[] elements) { 88 | try { 89 | return conn.createArrayOf(typeName, elements); 90 | } catch (SQLException e) { 91 | e.printStackTrace(); 92 | throw new RuntimeException("Can't create an SQL array from \"" + elements + "\""); 93 | } 94 | } 95 | 96 | public String backend() { 97 | return conn.getClass().getSimpleName(); 98 | } 99 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/LuceneSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.index.Term; 9 | import org.apache.lucene.search.BooleanClause; 10 | import org.apache.lucene.search.BooleanQuery; 11 | import 
org.apache.lucene.search.IndexSearcher; 12 | import org.apache.lucene.search.PhraseQuery; 13 | import org.apache.lucene.search.ScoreDoc; 14 | import org.apache.lucene.search.TermQuery; 15 | 16 | import edu.uncc.cs.watsonsim.Environment; 17 | import edu.uncc.cs.watsonsim.Passage; 18 | import edu.uncc.cs.watsonsim.Question; 19 | import edu.uncc.cs.watsonsim.Score; 20 | import edu.uncc.cs.watsonsim.scorers.Merge; 21 | 22 | /** 23 | * @author Phani Rahul 24 | */ 25 | public class LuceneSearcher extends Searcher { 26 | private final IndexSearcher lucene; 27 | 28 | public LuceneSearcher(Environment env) { 29 | super(env); 30 | lucene = env.lucene; 31 | Score.register("LUCENE_ANSWER_RANK", -1, Merge.Mean); 32 | Score.register("LUCENE_ANSWER_SCORE", -1, Merge.Mean); 33 | Score.register("LUCENE_ANSWER_PRESENT", 0.0, Merge.Sum); 34 | } 35 | 36 | /** 37 | * Create a Lucene query using the bigrams in the given text 38 | * @param text 39 | */ 40 | public BooleanQuery queryFromSkipBigrams(String text) { 41 | BooleanQuery q = new BooleanQuery(); 42 | String prev_word = null; 43 | for (String word : text.split("\\W+")) { 44 | if (prev_word != null) { 45 | PhraseQuery pq = new PhraseQuery(); 46 | pq.setSlop(1); 47 | pq.add(new Term("text", prev_word)); 48 | pq.add(new Term("text", word)); 49 | q.add(pq, BooleanClause.Occur.SHOULD); 50 | } 51 | q.add(new TermQuery(new Term("text", word)), BooleanClause.Occur.SHOULD); 52 | prev_word = word; 53 | } 54 | return q; 55 | } 56 | 57 | 58 | public List<Passage> query(Question question) { 59 | List<Passage> results = new ArrayList<>(); 60 | try { 61 | //ScoreDoc[] hits = env.simpleLuceneQuery(question.text, MAX_RESULTS); 62 | ScoreDoc[] hits = lucene.search( 63 | queryFromSkipBigrams( 64 | question.text 65 | + " " 66 | + question.getCategory()), 67 | MAX_RESULTS).scoreDocs; 68 | // This isn't range based because we need the rank 69 | for (int i=0; i < hits.length; i++) { 70 | ScoreDoc s = hits[i]; 71 | Document doc = lucene.doc(s.doc); 72 | 
results.add(new edu.uncc.cs.watsonsim.Passage( 73 | "lucene", // Engine 74 | "", // Title - filled in by shared db 75 | "", // Text - filled in by shared db 76 | doc.get("docno")) // Reference 77 | .score("LUCENE_ANSWER_RANK", (double) i) // Rank 78 | .score("LUCENE_ANSWER_SCORE", (double) s.score) // Source 79 | .score("LUCENE_ANSWER_PRESENT", 1.0) 80 | ); 81 | } 82 | } catch (IOException e) { 83 | System.out.println("Failed to query Lucene. Is the index in the correct location?"); 84 | e.printStackTrace(); 85 | } 86 | 87 | // Fill any missing full text from sources 88 | return fillFromSources(results); 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/WatsonSim.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.util.List; 7 | 8 | import org.apache.log4j.BasicConfigurator; 9 | import org.apache.log4j.Level; 10 | import org.apache.log4j.Logger; 11 | 12 | 13 | public class WatsonSim { 14 | public static void main(String[] args) throws Exception { 15 | 16 | // Read a command from the console 17 | System.out.print("Watsonsim CLI\n" 18 | + "Enter any natural language question to have it answered.\n" 19 | + "(Keep in mind phrasing it like Jeopardy! 
improves results.)\n" 20 | + "Place the correct answer after a | to check an answer.\n" 21 | + ">>> "); 22 | 23 | BasicConfigurator.configure(); 24 | Logger.getRootLogger().setLevel(Level.INFO); 25 | prompt(); 26 | } 27 | 28 | private static void listAnswers(List<Answer> answers, int max) { 29 | for (int i=0; i<answers.size() && i < max; i++) { 30 | Answer answer = answers.get(i); 31 | System.out.println(String.format("%2d: %s", i, answer.toLongString())); 32 | } 33 | if (answers.size() > max) { 34 | System.out.println((answers.size() - max) 35 | + " additional candidates are hidden."); 36 | } 37 | } 38 | 39 | private static Question readQuestion(String command) { 40 | if (command.contains("|")) { 41 | String[] parts = command.split("\\|"); 42 | return Question.known(parts[0].trim(), parts[1].trim()); 43 | } else { 44 | return new Question(command); 45 | } 46 | } 47 | 48 | private static void prompt() throws IOException { 49 | BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); 50 | String command = br.readLine(); 51 | // Defensively scroll the console so that the next error doesn't 52 | // clobber the user's text. 
53 | System.out.println(); 54 | DefaultPipeline pipe = new DefaultPipeline(); 55 | 56 | while (!command.isEmpty()) { 57 | Question question = readQuestion(command); 58 | List<Answer> answers = pipe.ask(question); 59 | 60 | // Print out a simple one-line summary of each answer 61 | listAnswers(answers, 10); 62 | 63 | do { 64 | // Read in the next command from the console 65 | System.out.println("Enter \"...\" to see the hidden candidates,\n" 66 | + "an answer index to see an explanation,\n" 67 | + "a question to search again, or enter to quit\n>>> "); 68 | command = br.readLine(); 69 | if (StringUtils.isNumeric(command)) { 70 | // Explain 71 | Answer a = answers.get(Integer.parseInt(command)); 72 | System.out.println("Explanation for " + a); 73 | System.out.println(a.explain()); 74 | } else if (command.equals("...")) { 75 | // List all 76 | listAnswers(answers, 1000); 77 | } else { 78 | // Done with this question 79 | break; 80 | } 81 | } while (true); 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/IndriSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import org.apache.log4j.Logger; 7 | 8 | import edu.uncc.cs.watsonsim.Environment; 9 | import edu.uncc.cs.watsonsim.Passage; 10 | import edu.uncc.cs.watsonsim.Question; 11 | import edu.uncc.cs.watsonsim.Score; 12 | import edu.uncc.cs.watsonsim.StringUtils; 13 | import edu.uncc.cs.watsonsim.scorers.Merge; 14 | import lemurproject.indri.QueryAnnotation; 15 | import lemurproject.indri.QueryEnvironment; 16 | import lemurproject.indri.ScoredExtentResult; 17 | 18 | /** 19 | * 20 | * @author Phani Rahul 21 | */ 22 | public class IndriSearcher extends Searcher { 23 | private final QueryEnvironment q = new QueryEnvironment(); 24 | private boolean enabled = true; 25 | 
private final Logger log = Logger.getLogger(getClass()); 26 | private final boolean strict; 27 | 28 | /** 29 | * Setup the Indri Query Environment. 30 | * The "indri_index" property is the Indri index path 31 | * @param config The configuration Properties 32 | */ 33 | public IndriSearcher(Environment env, boolean strict) { 34 | super(env); 35 | this.strict = strict; 36 | if (env.getConfOrDie("indri_enabled") == "false") { 37 | enabled = false; 38 | } else { 39 | try { 40 | q.addIndex(env.getConfOrDie("indri_index")); 41 | } catch (Exception e) { 42 | System.out.println("Setting up the Indri index failed." 43 | + " Is the index in the correct location?" 44 | + " Is indri_jni included?"); 45 | e.printStackTrace(); 46 | enabled=false; 47 | } 48 | } 49 | Score.register("INDRI_ANSWER_SCORE", -1, Merge.Mean); 50 | Score.register("INDRI_ANSWER_RANK", -1, Merge.Mean); 51 | Score.register("INDRI_ANSWER_PRESENT", 0.0, Merge.Sum); 52 | } 53 | 54 | public List<Passage> query(Question question){ 55 | if (!enabled) return new ArrayList<>(); 56 | // Develop the query 57 | String query = q.reformulateQuery(StringUtils.sanitize( 58 | question.getCategory() + " " + question.text 59 | )); 60 | if (strict) query = query.replaceAll("#combine", "#uw"); 61 | log.info("Executing query " + query); 62 | 63 | ScoredExtentResult[] ser; 64 | QueryAnnotation aq; 65 | // Fetch all titles, texts 66 | String[] docnos; 67 | try { 68 | aq = q.runAnnotatedQuery(query, MAX_RESULTS); 69 | ser = aq.getResults(); 70 | docnos = q.documentMetadata(ser, "docno"); 71 | } catch (Exception e) { 72 | // If any other step fails, give a more general message but don't die. 73 | System.out.println("Querying Indri failed. Is the index in the correct location? 
Is indri_jni included?"); 74 | e.printStackTrace(); 75 | return new ArrayList<>(); 76 | } 77 | 78 | // Compile them into a uniform format 79 | List<Passage> results = new ArrayList<Passage>(); 80 | for (int i=0; i<ser.length; i++) { 81 | results.add(new Passage( 82 | "indri", // Engine 83 | "", // Title 84 | "", // Full Text 85 | docnos[i]) // Reference 86 | .score("INDRI_ANSWER_RANK", (double) i) 87 | .score("INDRI_ANSWER_SCORE", ser[i].score) 88 | .score("INDRI_ANSWER_PRESENT", 1.0)); 89 | } 90 | return fillFromSources(results); 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/Searcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.sql.PreparedStatement; 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import edu.uncc.cs.watsonsim.Database; 10 | import edu.uncc.cs.watsonsim.Environment; 11 | import edu.uncc.cs.watsonsim.Passage; 12 | import edu.uncc.cs.watsonsim.Question; 13 | 14 | /* 15 | * This interface might change; Please be ready to accommodate the changes. 16 | * This interface should be implemented by local search engines like 17 | * Indri and Lucene, when querying them. Basically, it retrieves the basic data 18 | * from the queried result set. 19 | */ 20 | 21 | /** 22 | * 23 | * @author Phani Rahul 24 | */ 25 | public abstract class Searcher { 26 | protected final Database db; 27 | protected final Environment env; 28 | public Searcher(Environment env) { 29 | this.env = env; 30 | db = env.db; 31 | } 32 | 33 | /** 34 | * Runs the <i>query</i>, populating a list of ResultSets 35 | * 36 | * For each ResultSet: 37 | * <p>1: Gets the score of the document from the search result. For different 38 | * search engines, the scoring methods are different. 
If the document is 39 | * in TREC text format or TREC web format, every {@literal<DOC></DOC>} should be 40 | * considered as a separate document. 41 | * <p>2: Gets the title of the document. 42 | * <p>3: Gets the full text of the document. 43 | * 44 | * @param query 45 | * @throws Exception 46 | */ 47 | 48 | public List<Passage> query(String query) { 49 | return new ArrayList<>(); 50 | }; 51 | public List<Passage> query(Question q) { 52 | return query(q.text); 53 | }; 54 | 55 | /** 56 | * How many results should Lucene and Indri return? 57 | * This is also how many passages the scorers should expect. 58 | */ 59 | 60 | public final static int MAX_RESULTS = 10; 61 | 62 | 63 | /** Fill in the missing titles and full texts from Answers using the 64 | * sources from the relational database. 65 | * 66 | * This is a no-op if the sources database is missing. 67 | */ 68 | List<Passage> fillFromSources(List<Passage> passages) { 69 | List<Passage> results = new ArrayList<>(); 70 | PreparedStatement fetcher = db.prep("SELECT title, text FROM sources WHERE reference=? or id=?;"); 71 | 72 | for (Passage p: passages) { 73 | ResultSet doc_row; 74 | try { 75 | fetcher.setString(1, p.reference); 76 | fetcher.setString(2, p.reference); 77 | doc_row = fetcher.executeQuery(); 78 | if (doc_row.next() 79 | && doc_row.getString("title") != null 80 | && doc_row.getString("text") != null) { 81 | Passage np = new Passage( 82 | p.engine_name, 83 | doc_row.getString("title"), 84 | doc_row.getString("text"), 85 | p.reference 86 | ); 87 | np.scores = p.scores.clone(); 88 | results.add(np); 89 | } 90 | } catch (SQLException e) { 91 | e.printStackTrace(); 92 | throw new RuntimeException("Failed to execute sources search. " 93 | + "Missing document? 
docno:"+p.reference); 94 | } 95 | } 96 | return results; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/Configuration.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStreamReader; 8 | import java.io.Reader; 9 | import java.util.Collections; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | import java.util.Properties; 13 | 14 | public class Configuration { 15 | 16 | protected final String data_path = "data/"; 17 | public final Map<String, String> config; 18 | 19 | @SuppressWarnings({ "unchecked", "rawtypes" }) // From Properties -> Map 20 | public Configuration() { 21 | /* 22 | * Normally, wrapping a IOException with a RuntimeException is bad 23 | * but if you cannot find a configuration file many bad things will 24 | * happen, and basically every useful feature will fail. So you might 25 | * as well just quit here. 26 | */ 27 | try { 28 | // Check the data path 29 | File f = new File(data_path); 30 | if (!(f.exists() && f.isDirectory())) { 31 | throw new IOException(data_path + " should be a directory."); 32 | } 33 | 34 | // Read the configuration 35 | Properties props = null; 36 | for (String prefix : new String[]{this.data_path, ""}) { 37 | try (Reader s = new InputStreamReader( 38 | new FileInputStream(prefix + "config.properties"), "UTF-8")){ 39 | // Make it, then link it if it works. 40 | Properties _local_props = new Properties(); 41 | _local_props.load(s); 42 | props = _local_props; 43 | } catch (FileNotFoundException e) { 44 | // This is only an error if none are found. 45 | } 46 | } 47 | // If it didn't link, all the reads failed. 
48 | if (props == null) { 49 | throw new IOException("Failed to read config.properties in either " 50 | + this.data_path 51 | + " or " 52 | + System.getProperty("user.dir") // CWD 53 | + " You can create one by making a copy of" 54 | + " config.properties.sample. Check the README as well."); 55 | } 56 | // Now make properties immutable. 57 | Map<Object, Object> m = new HashMap<>(); 58 | m.putAll(props); 59 | this.config = Collections.unmodifiableMap((Map) m); 60 | } catch (IOException e) { 61 | throw new RuntimeException(e); 62 | } 63 | } 64 | 65 | /** 66 | * Convenience method for getting a setting. 67 | * @param config Map from the configuration file (config.properties) 68 | * @param key The key that must exist in the properties 69 | * @return The non-null String value, or else throw a RuntimeException. 70 | */ 71 | public String getConfOrDie(String key) { 72 | String value = config.get(key); 73 | if (value == null) throw new RuntimeException("Required key (" + key + ") missing from configuration file."); 74 | return value; 75 | } 76 | 77 | /** 78 | * Get the path to a resource, ensuring it exists. 79 | * This is mostly to give helpful errors and fail fast if you missed a 80 | * step setting up. 
81 | * @param resource The relative path of the resource without leading / 82 | */ 83 | public String pathMustExist(String resource) { 84 | String path = data_path + File.separator + resource; 85 | if (!new File(path).exists()) { 86 | throw new RuntimeException("The data directory is missing the" 87 | + " expected resource: " + path); 88 | } 89 | return path; 90 | } 91 | 92 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/Passage.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import org.json.simple.JSONObject; 4 | 5 | import org.apache.commons.lang3.StringEscapeUtils; 6 | 7 | 8 | public class Passage extends Phrase { 9 | // Stored Fields 10 | public final String reference; 11 | public final String engine_name; 12 | public final String title; 13 | 14 | // Mutable 15 | public Score scores = Score.empty(); 16 | 17 | /** 18 | * Create a new Passage 19 | * 20 | * @param engine_name A simple lowercase string 21 | * @param title 22 | * @param text 23 | * @param reference Specific to the engine, or a URL, for later lookup 24 | */ 25 | public Passage(String engine_name, String title, String text, String reference) { 26 | super(text); 27 | if (engine_name == null) 28 | throw new NullPointerException("Engine name cannot be null."); 29 | if (title == null) 30 | throw new NullPointerException("Title cannot be null."); 31 | if (reference == null) 32 | throw new NullPointerException("Reference cannot be null."); 33 | 34 | this.reference = reference; 35 | this.engine_name = engine_name; 36 | this.title = StringEscapeUtils.unescapeXml(title); 37 | } 38 | 39 | // Copy constructor 40 | public Passage(Passage original) { 41 | this(original.engine_name, original.title, original.text, original.reference); 42 | scores = original.scores.clone(); 43 | } 44 | 45 | /** Set the value of this Score for this passage, returning the passage. 
46 | * 47 | * The intended use is something like this: 48 | * new Passage(.......).score("SKIP_BIGRAM", 9.45).score("NGRAM", -1.2) 49 | * @param name 50 | * @param value 51 | */ 52 | public Passage score(String name, double value) { 53 | scores.put(name, value); 54 | return this; 55 | } 56 | 57 | /** Return a JSON object with the same fields */ 58 | public JSONObject toJSON() { 59 | JSONObject jo = new JSONObject(); 60 | jo.put("text", text); 61 | jo.put("title", title); 62 | jo.put("reference", reference); 63 | jo.put("engine_name", engine_name); 64 | return jo; 65 | } 66 | 67 | /****************************************************** 68 | * 69 | * Autogenerated hashcode() and equals() follow 70 | * 71 | ******************************************************/ 72 | 73 | @Override 74 | public int hashCode() { 75 | final int prime = 31; 76 | int result = 1; 77 | result = prime * result + getTokens().hashCode(); 78 | result = prime * result + engine_name.hashCode(); 79 | result = prime * result + reference.hashCode(); 80 | result = prime * result + text.hashCode(); 81 | result = prime * result + title.hashCode(); 82 | return result; 83 | } 84 | 85 | @Override 86 | public boolean equals(Object obj) { 87 | if (this == obj) 88 | return true; 89 | if (obj == null) 90 | return false; 91 | if (getClass() != obj.getClass()) 92 | return false; 93 | Passage other = (Passage) obj; 94 | if (!getTokens().equals(other.getTokens())) 95 | return false; 96 | else if (!engine_name.equals(other.engine_name)) 97 | return false; 98 | else if (!reference.equals(other.reference)) 99 | return false; 100 | else if (!text.equals(other.text)) 101 | return false; 102 | else if (!title.equals(other.title)) 103 | return false; 104 | return true; 105 | } 106 | } -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/researchers/WekaTee.java: -------------------------------------------------------------------------------- 1 | package 
edu.uncc.cs.watsonsim.researchers; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.sql.Timestamp; 6 | import java.util.ArrayList; 7 | import java.util.Collection; 8 | import java.util.List; 9 | 10 | import edu.uncc.cs.watsonsim.Answer; 11 | import edu.uncc.cs.watsonsim.Question; 12 | import edu.uncc.cs.watsonsim.Score; 13 | import weka.core.Attribute; 14 | import weka.core.FastVector; 15 | import weka.core.Instance; 16 | import weka.core.Instances; 17 | import weka.core.converters.ArffSaver; 18 | import weka.core.converters.Saver; 19 | 20 | 21 | /** Pipe Answer scores to an ARFF file for Weka */ 22 | public class WekaTee extends Researcher { 23 | private final static List<Score> dataset = new ArrayList<>(); 24 | private static ArffSaver saver; 25 | private static int saved_schema_version = -1; 26 | 27 | 28 | // Make every run unique, but overwrite between questions 29 | // This way, you still get /something/ if you interrupt it 30 | private final Timestamp start_time; 31 | /** 32 | * Dump the training data to an ARFF file marked by the given timestamp 33 | * @param start_time 34 | */ 35 | public WekaTee(Timestamp start_time) { 36 | this.start_time = start_time; 37 | } 38 | 39 | @Override 40 | /** 41 | * Multithreaded counterpart to dump, which is synchronized 42 | */ 43 | public List<Answer> question(Question q, List<Answer> answers) { 44 | List<Score> new_entries = new ArrayList<>(); 45 | for (Answer a : answers) { 46 | new_entries.add(a.scores.clone()); 47 | } 48 | 49 | 50 | dump(new_entries, start_time); 51 | return answers; 52 | } 53 | 54 | /** File-writing serialized counterpart to question() 55 | * 56 | * @param new_entries The new arrays to dump 57 | * @param start_time The timestamp of the file to dump to 58 | */ 59 | private static synchronized void dump(List<Score> new_entries, Timestamp start_time) { 60 | dataset.addAll(new_entries); 61 | 62 | Collection<String> names = Score.latestSchema(); 63 | try { 64 | if (names.size() 
!= saved_schema_version) { 65 | dump_from_scratch(names, start_time); 66 | } else { 67 | // Only do a few quick updates 68 | for (Score row : new_entries) 69 | saver.writeIncremental(new Instance(1.0, row.getEach(names))); 70 | } 71 | // There are synchronization issues otherwise. 72 | saver.getWriter().flush(); 73 | } catch (IOException e) { 74 | e.printStackTrace(); 75 | throw new RuntimeException("Failed to write Weka Log!"); 76 | } 77 | } 78 | 79 | /** 80 | * When the score changes, rewrite the file. 81 | * This is really rare in practice, so don't bother optimizing it. 82 | */ 83 | private static void dump_from_scratch(Collection<String> names, Timestamp start_time) throws IOException { 84 | saved_schema_version = names.size(); 85 | 86 | FastVector attributes = new FastVector(); 87 | // Answer score names 88 | for (String name: names) 89 | attributes.addElement(new Attribute(name)); 90 | Instances data = new Instances("Watsonsim captured question stream", attributes, 0); 91 | 92 | // Save the results to a file 93 | saver = new ArffSaver(); 94 | saver.setStructure(data); 95 | saver.setRetrieval(Saver.INCREMENTAL); 96 | saver.setFile(new File("data/weka-log." 
+ start_time + ".arff")); 97 | for (Score row : dataset) 98 | saver.writeIncremental(new Instance(1.0, row.getEach(names))); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/BingSearcher.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.io.IOException; 4 | import java.net.URI; 5 | import java.net.URISyntaxException; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import org.apache.http.client.fluent.*; 10 | import org.apache.http.client.utils.URIBuilder; 11 | import org.apache.log4j.Logger; 12 | import org.jsoup.Jsoup; 13 | import org.jsoup.nodes.Document; 14 | import org.jsoup.nodes.Element; 15 | 16 | import edu.uncc.cs.watsonsim.Environment; 17 | import edu.uncc.cs.watsonsim.Passage; 18 | import edu.uncc.cs.watsonsim.Score; 19 | import edu.uncc.cs.watsonsim.scorers.Merge; 20 | 21 | /** 22 | * Internet-enabled Searcher for Bing. 23 | * 24 | * You will need a Bing api key, which you can (as of the time of this writing) 25 | * get from <a href="http://datamarket.azure.com">Microsoft</a> 26 | * 27 | * Bing gives around 5000 queries per month, which means that in most cases for 28 | * sustained development you will need to use CachingSearcher. 
29 | * 30 | * @see CachingSearcher 31 | * @see privatedata.bingAPIKey 32 | * @author Sean Gallagher 33 | * @author Stephen Stanton 34 | * @author D Haval 35 | */ 36 | public class BingSearcher extends Searcher { 37 | private final String key; 38 | private final Logger log = Logger.getLogger(getClass()); 39 | public BingSearcher(Environment env) { 40 | super(env); 41 | Score.register("BING_ANSWER_RANK", -1, Merge.Mean); 42 | Score.register("BING_ANSWER_PRESENT", 0.0, Merge.Sum); 43 | key = env.getConfOrDie("bing_api_key"); 44 | } 45 | 46 | public List<Passage> query(String query) { 47 | 48 | URI uri = URI.create(""); // A bogus workaround for "may not have been initialized" 49 | try { 50 | uri = new URIBuilder() 51 | .setScheme("https") 52 | .setHost("api.datamarket.azure.com") 53 | .setPath("/Data.ashx/Bing/Search/v1/Web") 54 | .addParameter("Query", String.format("'%s'", query)).build(); // Should we place it in quotes? 55 | //.addParameter("$top", "50") 56 | //.addParameter("$format", "Atom").build(); 57 | } catch (URISyntaxException e1) { 58 | /* This bogus block is required by Java, 59 | * but strictly speaking new URIBuilder() can't actually throw 60 | * this error because it has no input (so there can be no syntax 61 | * error). Hence, this block is unreachable. 62 | */ 63 | e1.printStackTrace(); 64 | } 65 | 66 | List<Passage> results = new ArrayList<Passage>(); 67 | try { 68 | String resp = Executor 69 | .newInstance() 70 | .auth(key, key) 71 | .execute(Request.Get(uri)) 72 | .returnContent().asString(); 73 | 74 | Document doc = Jsoup.parse(resp); 75 | List<Element> elements = doc.select("entry"); 76 | // Perhaps limit to MAX_RESULTS? 
77 | for (int i=0; i < elements.size(); i++) { 78 | Element e = elements.get(i); 79 | 80 | results.add(new Passage( 81 | "bing", // Engine 82 | e.select("d|Title").text(), // Title 83 | e.select("d|Description").text(), // Full Text 84 | e.select("d|Url").text()) // Reference 85 | .score("BING_ANSWER_RANK", (double) i) // Score 86 | .score("BING_ANSWER_PRESENT", 1.0) 87 | ); 88 | } 89 | log.info("Retrieved " + elements.size() + " candidates from Bing."); 90 | } catch (IOException e) { 91 | log.error("Error while searching with Bing. Ignoring. Details follow."); 92 | log.error(e.getMessage()); 93 | } 94 | return results; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /get_started.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Options ###################################################################### 4 | DATA_URL="https://dl.dropboxusercontent.com/u/92563044/watsonsim/data-snapshot.tar.gz" 5 | GRADLE_URL="https://services.gradle.org/distributions/gradle-2.2.1-bin.zip" 6 | PGBACKUP_URL="https://dl.dropboxusercontent.com/u/92563044/watsonsim/data-snapshot.pgdump" 7 | 8 | ################################################################################ 9 | 10 | GRADLE_TARGET=`basename "$DATA_URL"` 11 | DATA_TARGET=`basename "$DATA_URL"` 12 | PGBACKUP_TARGET=`basename "$PGBACKUP_TARGET"` 13 | 14 | install_postgres() { 15 | if lsb_release -a | grep -q "Ubuntu 14.10" 16 | then 17 | echo "Detected Ubuntu 14.10." 18 | echo "Installing dependencies for starting." 19 | sudo apt-get update 20 | sudo apt-get install postgresql-9.4 21 | else 22 | cat <<END 23 | This script hasn't been tested with your distribution. 
# Ask a yes/no question and report the answer.
#   $1: the question to show
# Prints "true" or "false" on stdout, so callers can capture the answer with
# `read_bool "..."` and later run it as a command, and also returns 0 (yes)
# or 1 (no) so it can be used directly in `read_bool "..." || exit 0`.
# The prompt goes to stderr: a caller capturing stdout must see only the answer.
# An answer counts as "yes" only if it starts with y or t (any case); an
# empty answer counts as "no".
read_bool() {
	local out
	printf '%s [Y/n]: ' "$1" >&2
	read -r out
	if printf '%s\n' "$out" | grep -Eqi '^[[:space:]]*[yt]'
	then
		echo "true"
		return 0
	else
		echo "false"
		return 1
	fi
}
/**
 * Researcher that labels each candidate answer with lexical types, taken
 * from DBPedia and from type statements extracted from the answer's
 * supporting passages. While reading passages it may also come across
 * other names whose type matches the clue; those become new suggested
 * answers, which are pulled through the chain again (up to a depth limit).
 */
public class TagLAT extends Researcher {
    private final DBPediaCandidateType dbpedia;
    private final Relatedness syn;

    public TagLAT(Environment env) {
        dbpedia = new DBPediaCandidateType(env);
        syn = new Relatedness(env);
    }

    /** Entry point: starts the recursion at depth 0. */
    public List<Answer> pull(Question q, List<Answer> answers) {
        return pull(q, answers, 0);
    }

    /**
     * Run the answers through `chain` first, then tag the result.
     * NOTE(review): `chain` is not declared in this class; presumably it is
     * inherited from Researcher - confirm.
     */
    public List<Answer> pull(Question q, List<Answer> answers, int depth) {
        return question(q, chain.pull(q, answers), depth);
    }


    /**
     * Find the possible lexical types of a candidate, and label the answer.
     *
     * @param q The question being answered
     * @param answers Candidates to label; their lexical_types are overwritten
     * @param depth How many times suggestions have already been re-pulled
     * @return Any processed suggestions, followed by the given answers
     */
    public List<Answer> question(Question q, List<Answer> answers, int depth) {
        // Counters used only for the summary log line at the end
        int have_any_types = 0;

        int dbpedia_types = 0;
        int support_types = 0;

        // New candidates discovered while reading passages
        List<Answer> suggestions = new ArrayList<>();

        for (Answer a: answers) {

            // Handle DBPedia types

            a.lexical_types = dbpedia.viaDBPedia(a.text);
            for (String type: a.lexical_types) {
                a.log(this, "DBPedia says it's a %s", type);
            }
            if (a.lexical_types.isEmpty())
                a.log(this, "DBPedia has no type information for it.");
            dbpedia_types += a.lexical_types.size();

            // Handle Support types

            for (Passage p: a.passages) {
                // (name, type) pairs extracted from the passage
                List<Pair<String, String>> types = p.memo(SupportCandidateType::extract);
                for (Pair<String, String> name_and_type : types) {
                    Phrase name = new Phrase(name_and_type.first);
                    Phrase type = new Phrase(name_and_type.second);
                    if (syn.implies(a, name)) {
                        // The passage is talking about this answer: keep the type.
                        a.log(this, "Passage %s says it's a %s.", p.reference, type);
                        // NOTE(review): this relies on the list returned by
                        // viaDBPedia being modifiable - confirm.
                        a.lexical_types.add(type.text);
                        support_types++;
                    } else if (syn.implies(type, new Phrase(q.memo(ClueType::fromClue)))) {
                        // A different name, but its type matches what the clue
                        // asks for: propose the name as a new candidate.
                        Answer suggestion = new Answer(name.text);
                        suggestion.lexical_types = Arrays.asList(type.text);
                        suggestion.log(this, "Found it's a %s, while reading about %s in %s", type, a, p.reference);
                        // Skip candidates already suggested or already present
                        if (!(suggestions.contains(suggestion)
                                || answers.contains(suggestion))) {
                            log.info("Suggesting " + name);
                            suggestions.add(suggestion);
                        }

                    }
                }
            }
            if (!a.lexical_types.isEmpty()) have_any_types++;
        }

        // This is the chain magic:
        // We can pull the new suggestions through the pipeline and merge them!
        List<Answer> new_answers = new ArrayList<>();
        // The depth < 3 test bounds the recursion through pull() -> question()
        if (!suggestions.isEmpty() && depth < 3)
            new_answers.addAll(pull(q, suggestions, depth+1));
        new_answers.addAll(answers);


        //System.out.println(text + " could be any of " + types);
        log.info("Found " + (dbpedia_types+support_types) + " types for "
                + have_any_types + " candidates. ("+ support_types +" by reading) "
                + (answers.size() - have_any_types) + " candidates are unknown.");
        return new_answers;
    }

}
That way -1 is the worse-than-worst; 27 | private static final List<String> DT_RANK = Arrays.asList(new String[]{ 28 | "those", "that", "these", "which", "what", "this" 29 | }); 30 | public Analysis(Tree d, Tree n){ 31 | dt = d; nn = n; 32 | } 33 | 34 | /** 35 | * Case insensitively rank the LAT's by a predefined order 36 | */ 37 | public int rank() { 38 | if (dt == null) return -1; 39 | return DT_RANK.indexOf(concat(dt).toLowerCase()); 40 | } 41 | 42 | public boolean ok() { 43 | return dt != null && nn != null; 44 | } 45 | } 46 | 47 | /** 48 | * Merge two partial LAT analyses. 49 | * 1) Favor complete analyses over fragments 50 | * 2) Favor specific determiners in a specific order 51 | * @return a new immutable partial LAT analysis 52 | */ 53 | private static Analysis merge(Analysis a, Analysis b) { 54 | if (a.ok() && b.ok()) return (a.rank() < b.rank()) ? b : a; 55 | else if (a.ok()) return a; 56 | else if (b.ok()) return b; 57 | else { 58 | // Neither are viable. Merge them. 59 | return new Analysis( 60 | ObjectUtils.firstNonNull(a.dt, b.dt), 61 | ObjectUtils.firstNonNull(a.nn, b.nn)); 62 | } 63 | } 64 | 65 | 66 | /** 67 | * A very simple LAT detector. 
It wants the lowest subtree with both a determiner and a noun 68 | */ 69 | private static Analysis detectPart(Tree t) { 70 | switch (t.value()) { 71 | case "WDT": 72 | case "DT": return new Analysis(t, null); 73 | case "NN": 74 | case "NNS": return new Analysis(null, t); 75 | default: 76 | Analysis l = new Analysis((Tree) null, null); 77 | // The last noun tends to be the most general 78 | List<Tree> kids = t.getChildrenAsList(); 79 | Collections.reverse(kids); 80 | for (Tree kid : kids) 81 | l = merge(l, detectPart(kid)); 82 | return l; 83 | } 84 | 85 | } 86 | /** 87 | * Detect the LAT using a simple rule-based approach 88 | * @return The most general single-word noun LAT 89 | */ 90 | public static String fromClue(Phrase phrase) { 91 | for (Tree t : phrase.getTrees()) { 92 | Analysis lat = detectPart(t); 93 | if (lat.ok() && lat.rank() >= 0) { 94 | String latname = concat(lat.nn); 95 | phrase.log.info("Target lexical type: " + latname); 96 | return latname; 97 | } else { 98 | phrase.log.info("Unknown target lexical type."); 99 | return ""; 100 | } 101 | } 102 | return ""; 103 | } 104 | 105 | /** 106 | * Detect the LAT using a simple rule-based approach 107 | * This is a thin wrapper for use as a string 108 | * @return The most general single-word noun LAT 109 | */ 110 | public static String fromClue(String text) { 111 | Phrase p = new Phrase(text); 112 | for (Tree t : p.getTrees()) { 113 | Analysis lat = detectPart(t); 114 | if (lat.ok() && lat.rank() >= 0) { 115 | return concat(lat.nn).toLowerCase(); 116 | } 117 | } 118 | return ""; 119 | } 120 | 121 | 122 | } 123 | 124 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/scorers/PassageScorerOpenNLPAda.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * @author Adarsh 4 | */ 5 | package edu.uncc.cs.watsonsim.scorers; 6 | 7 | import java.io.IOException; 8 | import java.util.HashMap; 9 | import 
java.util.Map; 10 | 11 | import edu.uncc.cs.watsonsim.researchers.OpenNlpTests; 12 | import opennlp.tools.parser.Parse; 13 | import opennlp.tools.util.InvalidFormatException; 14 | 15 | public class PassageScorerOpenNLPAda { 16 | 17 | OpenNlpTests t = new OpenNlpTests(); 18 | public double compareParseType(Parse[] pa1, Parse[] pa2, boolean verbose){ 19 | double numMatches=0; 20 | Map<String, String> key1 = new HashMap<String, String>(); 21 | for (int i=0;i<pa1.length;i++){ 22 | key1.put(pa1[i].getType(),"y"); 23 | //pa1h.put(key[0],"y"); 24 | } 25 | for (int j=0;j<pa2.length;j++){ 26 | String key2=pa2[j].getType(); 27 | if (key1.containsKey(key2)){ 28 | numMatches++; 29 | if (verbose) System.out.println("\n"); 30 | pa2[j].show(); 31 | if (verbose) System.out.println("type: "+pa2[j].getType()); 32 | } 33 | } 34 | if (verbose) System.out.println("numTypeMatches "+numMatches); 35 | return numMatches; 36 | } 37 | 38 | 39 | public double scoreStructure(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{ 40 | double score1=0, score2=0; 41 | Parse[] caParse = t.parsePassageText(ca); 42 | Parse[] qParse = t.parsePassageText(q); 43 | Parse[] pasParse = t.parsePassageText(passage); 44 | Parse[] caParseCh = t.getAllChildren(caParse); 45 | Parse[] qParseCh = t.getAllChildren(qParse); 46 | Parse[] pasParseCh = t.getAllChildren(pasParse); 47 | score1=this.compareParseType(qParseCh, pasParseCh,verbose); 48 | score2=this.compareParseType(caParseCh, pasParseCh,verbose); 49 | return score1*score2; 50 | } 51 | 52 | //normalized scorer. 
53 | public double scoreStructureNorm(String ca, String q, String passage, boolean verbose) throws InvalidFormatException{ 54 | double score1=0, score2=0; 55 | //OnlpParserTest pt= new OnlpParserTest(); 56 | Parse[] caParse = t.parsePassageText(ca); 57 | Parse[] qParse = t.parsePassageText(q); 58 | Parse[] pasParse = t.parsePassageText(passage); 59 | Parse[] caParseCh = t.getAllChildren(caParse); 60 | Parse[] qParseCh = t.getAllChildren(qParse); 61 | Parse[] pasParseCh = t.getAllChildren(pasParse); 62 | score1=this.compareParseType(qParseCh, pasParseCh,verbose); 63 | score2=this.compareParseType(caParseCh, pasParseCh,verbose); 64 | return score1*score2/passage.length(); 65 | } 66 | 67 | 68 | public static void main(String[] args) throws IOException{ 69 | String ca="Jane Austen"; 70 | String qq="Jane Austen wrote Emma"; 71 | String passage="Jane Austen was very modest about her own genius.[7] She once famously described her work as "+ 72 | "the little bit (two Inches wide) of Ivory, on which I work with so fine a brush, " + 73 | "as produces little effect after much labor [7]. " + 74 | "Jane Austen wrote Emma."+ 75 | "When she was a girl she wrote stories. Her works were printed only after much revision. " + 76 | "Only four of her novels were printed while she was alive. They were Sense and Sensibility (1811), " + 77 | "Pride and Prejudice (1813), Mansfield Park (1814) and Emma (1816). " + 78 | "Two other novels, Northanger Abbey and Persuasion, were printed in 1817 with " + 79 | "a biographical notice by her brother, Henry Austen. Persuasion was written shortly before her death. " + 80 | "She also wrote two earlier works, Lady Susan, and an unfinished novel, The Watsons. 
" + 81 | "She had been working on a new novel, Sanditon, but she died before she could finish it."; 82 | PassageScorerOpenNLPAda ps = new PassageScorerOpenNLPAda(); 83 | System.out.println(); 84 | System.out.println("NormalizedScore: "+ps.scoreStructureNorm(ca,qq, passage,true)); 85 | System.out.println("Raw Score: "+ps.scoreStructure(ca,qq, passage,true)); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /get_started.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | GRADLE_URL = "https://services.gradle.org/distributions/gradle-2.2.1-bin.zip" 5 | ################################################################################ 6 | # Needs requests, wget 7 | ################################################################################ 8 | import platform 9 | import requests 10 | from setuptools import setup, Command 11 | import shutil 12 | from subprocess import call, check_call 13 | import sys 14 | import tarfile 15 | import urllib2 16 | import zipfile 17 | class Download(Command): 18 | def unpack(ar, delete): 19 | """ Unpack a file and delete the original """ 20 | print "Unpacking %s" %ar 21 | if ar.endswith("tar"): 22 | tarfile.open(ar).extractall() 23 | elif ar.endswith("zip"): 24 | zipfile.Zipfile(ar, "r").extractall() 25 | else: 26 | print "Could not recognize file format of %s. Aborting unpack." 
%ar 27 | return # Skip the possible delete 28 | if delete: os.remove(ar) 29 | 30 | def installPostgres(): 31 | if platform.system() == "Linux": 32 | dist = platform.dist()[0] 33 | try: 34 | if dist == "Fedora": 35 | check_call("sudo yum install postgres-9.3".split()) 36 | elif dist == "Ubuntu": 37 | check_call("sudo apt-get install postgres-9.3".split()) 38 | except CalledProcessError as e: 39 | print e 40 | print "Opening a shell to allow you to install and setup Postgres manually." 41 | print 'Use "exit 1" to abort installation' 42 | check_call(os.environ.get("SHELL", "sh")) 43 | else: 44 | print "Can only install Postgres on Linux (yet)." 45 | 46 | def ask(prompt): 47 | return raw_input(prompt + " | ")[0].lower() in ('y', 't') 48 | 49 | def run(): 50 | import argparse 51 | import wget 52 | parser = argparse.ArgumentParser(description="Setup the Watsonsim question answering system.") 53 | parser.add_argument("--no-postgres", 54 | action="store_false", 55 | dest='postgres', 56 | default=True, 57 | help="Don't install postgresql server (which would be from the repository).") 58 | parser.add_argument("--no-gradle", 59 | action="store_false", 60 | dest='gradle', 61 | default=True, 62 | help="Don't download and install gradle.") 63 | args = parser.parse_args() 64 | 65 | print "This script is not ready yet, refer to the homepage for installation instructions." 66 | sys.exit(1) 67 | if not ask("Are you sure you want to start? It may take many hours and 150+ GB of disk space. "): 68 | sys.exit(1) 69 | 70 | # The theory here is to do the smallest tasks first. 
71 | if args.gradle: 72 | # Less than 5 minutes 73 | wget.download(GRADLE_URL) 74 | unpack(os.path.basename(GRADLE_URL), then_delete) 75 | if args.postgres: 76 | # Maybe about 5 minutes 77 | installPostgres() 78 | 79 | #http://apache.osuosl.org/jena/binaries/jena-fuseki-1.1.1-distribution.tar.gz 80 | #java -cp jena-fuseki-1.1.1/fuseki-server.jar tdb.tdbloader --tdb=jena-lucene.ttl *.owl *.nt 81 | #java -cp jena-fuseki-1.1.1/fuseki-server.jar jena.textindexer --desc=../jena-lucene.ttl 82 | 83 | setup( 84 | name="Watsonsim Question Answering System", 85 | version="0.5", 86 | author="Sean Gallagher", 87 | author_email="stgallag@gmail.com", 88 | url="http://github.com/SeanTater/uncc2014watsonsim", 89 | setup_requires = [ 90 | 'wget>=2.2', 91 | 'requests>=2.2.1' 92 | ], 93 | install_requires = [ 94 | 'psycopg2>=2.4.5' 95 | ], 96 | cmdclass={"download": Download} 97 | ) 98 | -------------------------------------------------------------------------------- /src/test/java/edu/uncc/cs/watsonsim/QClassDetectionTest.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.Test; 6 | 7 | import edu.uncc.cs.watsonsim.QType; 8 | import edu.uncc.cs.watsonsim.Question; 9 | 10 | public class QClassDetectionTest { 11 | 12 | @Test 13 | public void test() { 14 | Question[] questions = { 15 | Question.known("In book 1, \"The ___ Hat\"","sorting","HARRY POTTER & TME CHAPTER TITLES"), 16 | Question.known("In book 4, \"The ___ World Cup\"","quidditch","HARRY POTTER & TME CHAPTER TITLES"), 17 | Question.known("In book 6, her \"helping hand\"","Hermione","HARRY POTTER & TME CHAPTER TITLES"), 18 | Question.known("Plant a Smooch on Yours Truly, Katharine","Kiss Me, Kate","MUSICALS BY ANY OTHER NAME"), 19 | Question.known("The 2 digits that give James Bond license to kill","0","BY THE NUMBERS"), 20 | Question.known("Of 1.8, 2.5, or 3.7 hours per home, the average 
time PBS is viewed each week in U.S.","1.8","PBS"), 21 | Question.known("Gary glitter: \"Rock and Roll _____ _____\"","Part 2","LET'S ROCK!"), 22 | Question.known("Simple Abundance by Sarah Ban Breathnach has this many messages for women, one for each day in 1996","366","IN THE BOOKSTORE"), 23 | //TODO: fix bug: the constructor for these incorrectly use the second parameter is an answer; however that 24 | // functionality is used by another method (JSONQuestionSource(Reader)). 25 | Question.known("Kimono, caftan, bath-","FASHIONABLE COMMON BONDS"), 26 | Question.known("Ontario,Havasu,Baikal","COMMON BONDS"), 27 | Question.known("Trash, a boyfriend you're sick of, goods or securities sold below costs","COMMON BONDS"), 28 | Question.known("green crested, collared, anole","Beastly Common Bonds"), 29 | Question.known("Later jailed for fraud, Australian Alan Bond became a national hero for financing the 1983 capture of this sailing trophy","UNCOMMON BONDS"), 30 | Question.known("Rolled, steelcut, Scotch","EDIBLE COMMON BONDS"), 31 | Question.known("Nursery rhyme waterspout crawler who's a Marvel crime fighter","BEFORE & AFTER"), 32 | Question.known("Nursery rhyme waterspout crawler who's a Marvel crime fighter","Before & After"), 33 | Question.known("This man succeeded John Carver as governor of Plymouth Colony in 1621 & served for 31 of the next 35 years", "AMERICA BEFORE THE REVOLUTION"), 34 | Question.known("John Milton epic about Gertrude Stein's Parisian expatriate Yanks who were born starting in 1965", "BEFORE, DURING, & AFTER"), 35 | Question.known("Gray", "INDIANAGRAMS"), 36 | Question.known("The king is dead, long \"lives\" the king", " MUSICAL ANAGRAMS"), 37 | Question.known("Anthem ender: BEHAVE HOME FORT", "ANAGRAMS"), 38 | Question.known("Lose", "SCRAMBLED FISH"), 39 | Question.known("", ""), 40 | Question.known("He not only wrote & directed \"Little Johnny Jones\", he also played the title role", "QUOTATION") 41 | }; 42 | 43 | QType[] labels = { 44 | 
QType.FITB, 45 | QType.FITB, 46 | QType.FACTOID, 47 | QType.FACTOID, 48 | QType.FACTOID, 49 | QType.FACTOID, 50 | QType.FITB, 51 | QType.FACTOID, 52 | QType.COMMON_BONDS, 53 | QType.COMMON_BONDS, 54 | QType.COMMON_BONDS, 55 | QType.COMMON_BONDS, 56 | QType.COMMON_BONDS, 57 | QType.COMMON_BONDS, 58 | QType.BEFORE_AND_AFTER, 59 | QType.BEFORE_AND_AFTER, 60 | QType.BEFORE_AND_AFTER, 61 | QType.BEFORE_AND_AFTER, 62 | QType.ANAGRAM, 63 | QType.ANAGRAM, 64 | QType.ANAGRAM, 65 | QType.ANAGRAM, 66 | QType.FACTOID, 67 | QType.QUOTATION 68 | }; 69 | 70 | int missed = 0; 71 | for (int i=0; i<questions.length; i++) { 72 | try { 73 | assertEquals(labels[i], questions[i].getType()); 74 | } catch (java.lang.AssertionError ae) { 75 | System.out.println("Failed to correctly categorize " + questions[i].text + " as " + labels[i] + "; incorrect type: " + questions[i].getType()); 76 | missed++; 77 | } 78 | } 79 | assertTrue(missed * 4 < questions.length); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/edu/uncc/cs/watsonsim/search/Anagrams.java: -------------------------------------------------------------------------------- 1 | package edu.uncc.cs.watsonsim.search; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.nio.file.Files; 7 | import java.nio.file.Paths; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | import java.util.regex.Matcher; 14 | import java.util.regex.Pattern; 15 | 16 | import edu.uncc.cs.watsonsim.Environment; 17 | import edu.uncc.cs.watsonsim.Log; 18 | import edu.uncc.cs.watsonsim.Passage; 19 | import edu.uncc.cs.watsonsim.Score; 20 | import edu.uncc.cs.watsonsim.scorers.Merge; 21 | 22 | public class Anagrams extends Searcher { 23 | 24 | private final Map<String, List<String>> mp = new HashMap<>(); 25 | private Log log; 26 | 
27 | public Anagrams(Environment env) { 28 | super(env); 29 | log = env.log.kid(getClass()); 30 | try 31 | { 32 | for (String line : Files.readAllLines(Paths.get("data", "words"))) { 33 | // condition of different anagram questions: 34 | // usually anagram questions are coming for word coming after : 35 | // regular expression for searching if a : is coming in the question 36 | 37 | char[] charArray = line.toLowerCase().toCharArray(); 38 | Arrays.sort(charArray); 39 | String source = String.valueOf(charArray); 40 | List<String> targets = mp.get(source); 41 | if (targets == null) { 42 | targets = new ArrayList<>(); 43 | mp.put(source, targets); 44 | } 45 | targets.add(line); 46 | } 47 | } 48 | catch(IOException e) 49 | { 50 | e.printStackTrace(); 51 | } 52 | Score.register("IS_ONLY_ANAGRAM", 0.0, Merge.Min); 53 | } 54 | 55 | public static void main(String args[]) throws IOException { 56 | Anagrams ta = new Anagrams(new Environment()); 57 | System.out.println("Enter the Jeopardy Anagram Question:"); 58 | BufferedReader br2 = new BufferedReader( 59 | new InputStreamReader(System.in)); 60 | String question = br2.readLine(); 61 | ta.query(question); 62 | } 63 | 64 | public static List<String> search_key(String keys,Map<String, List<String>> mp) 65 | { 66 | char[] charArray = keys.toLowerCase().toCharArray(); 67 | Arrays.sort(charArray); 68 | // String searchKey = String.valueOf(charArray); 69 | List<String> entries = mp.get(String.valueOf(charArray)); 70 | if (entries == null) 71 | { 72 | entries = new ArrayList<>(); 73 | } 74 | entries.remove(keys); 75 | return entries; 76 | } 77 | 78 | @Override 79 | public List<Passage> query(String query) { 80 | // Some anagrams come in a very clear syntax: 81 | // either in quotes, or after a colon. Find them. 82 | Matcher matcher = Pattern.compile("\"([A-z ]+)\"|: ([A-z ]+)") 83 | .matcher(query); 84 | 85 | List<String> entries = new ArrayList<>(); 86 | if (matcher.find() && matcher.group(1) != null) { 87 | // Good news. 
We found a quoted string to generate anagrams from. 88 | entries.addAll(search_key(matcher.group(1), mp)); 89 | if (!entries.isEmpty()) { 90 | log.info("Found " + entries.size() 91 | + " quoted anagrams"); 92 | } 93 | } else { 94 | // Bad news. We have to guess all the words. 95 | String[] words = query.split(" "); 96 | if (words.length <= 2) { 97 | // When there are so few words, the whole question is likely 98 | // an anagram. For example, "Nuke Air" -> "Ukariane" 99 | entries.addAll(search_key(query.replace(" ", ""), mp)); 100 | } else { 101 | // Otherwise, consider each word separately. 102 | for (String word : words) { 103 | entries.addAll(search_key(word, mp)); 104 | } 105 | } 106 | } 107 | 108 | entries.removeAll(Arrays.asList("Si","shit","Ni")); 109 | 110 | List<Passage> results = new ArrayList<>(); 111 | for (String text : entries) { 112 | results.add(new edu.uncc.cs.watsonsim.Passage("lucene", // Engine 113 | text, // Title 114 | text, // Text 115 | "anagram:" + text).score("IS_ONLY_ANAGRAM", 1.0)); 116 | 117 | } 118 | 119 | 120 | 121 | return results; 122 | } 123 | } 124 | --------------------------------------------------------------------------------