├── README.md
├── bin
    ├── commons-logging.jar
    ├── ejml-0.23.jar
    ├── joda-time.jar
    ├── jollyday.jar
    ├── jwnl.jar
    ├── libsvm.jar
    ├── processreviews-sept2014.jar
    └── stanford-corenlp-3.3.1.jar
├── commandline-howto.txt
├── exportjar.jardesc
├── jwnl14_file_properties.xml
├── lib
    ├── commons-logging.jar
    ├── ejml-0.23.jar
    ├── jackson-annotations-2.2.3.jar
    ├── jackson-core-2.3.2.jar
    ├── jackson-databind-2.2.3.jar
    ├── joda-time.jar
    ├── jollyday.jar
    ├── jwnl.jar
    ├── libsvm.jar
    └── stanford-corenlp-3.3.1.jar
└── src
    └── ca
        └── carter
            └── thesis
                ├── ProcessReviews.java
                ├── RetrainingThread.java
                ├── ReviewFileReaderFlat.java
                ├── ReviewFileReaderXML.java
                ├── SeedModelCreatorThread.java
                ├── SentenceProcessorThread.java
                ├── TokenPredictorThread.java
                ├── WikipediaParaphraser.java
                ├── WordNetResolver.java
                ├── evaluation
                    └── ResultsSummary.java
                ├── languagemodels
                    ├── DefaultTokenizer.java
                    └── StopWords.java
                ├── ml
                    ├── BinaryPrediction.java
                    ├── ClassWeighting.java
                    ├── FeatureDistance.java
                    ├── FeatureRepository.java
                    ├── FeatureType.java
                    ├── ModelType.java
                    ├── Prediction.java
                    ├── PredictionTokenWithContextPair.java
                    ├── SVMTokenModel.java
                    ├── SVMTokenModelFeature.java
                    ├── SVMTokenModelSentiment.java
                    └── Views.java
                └── model
                    ├── AspectMatchPolicy.java
                    ├── ProductFeatureOpinion.java
                    ├── ProductOpinionFeatureDetail.java
                    ├── ReconciledFeatureOpinion.java
                    ├── Review.java
                    ├── SemanticallyTaggedTokenWithContext.java
                    ├── Sentence.java
                    ├── Sentiment.java
                    ├── SimpleSentence.java
                    ├── Task.java
                    ├── TokenWithContext.java
                    └── phrasetree
                        ├── AbstractPhraseTreePart.java
                        ├── PartOfSentimentStructure.java
                        ├── PartOfSpeech.java
                        ├── PhraseTree.java
                        ├── StringWithTree.java
                        └── TokenLeaf.java


/README.md:
--------------------------------------------------------------------------------
 1 | # Inferring aspect-specific opinion structure in product reviews using co-training
 2 | This project implements an algorithm for aspect-based sentiment analysis using co-training, a semi-supervised machine learning technique that partitions the features into two sufficient and uncorrelated "views" and then self-learns.
 3 | 
 4 | ### Required data sets
 5 | 
 6 | The application uses data provided by third parties.  To use this project, you'll need to download and unzip:
 7 | 
 8 | http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip (Hu and Liu, KDD-2004)
 9 | 
10 | http://metashare.ilsp.gr:8080/repository/browse/semeval-2014-absa-test-data-gold-annotations/b98d11cec18211e38229842b2b6a04d77591d40acd7542b7af823a54fb03a155/ (Ganu et al., 2009)
11 | 
12 | Many thanks to the authors and annotators of these two data sets.
13 | 
14 | ### Executable version
15 | 
16 | The executable version of this project is in the bin directory.
17 | 
18 | Command line arguments can be inspected by running the command:  
19 | `java -jar processreviews-sept2014.jar help`
20 | 
21 | The Stanford CoreNLP models (stanford-corenlp-3.3.1-models.jar) are also required.  Put the models jar in the bin directory if running from the command line, or in the lib directory if using the source code itself.  The jar can be downloaded from http://search.maven.org/#browse%7C304725258
22 | 
23 | ### Source code
24 | 
25 | ProcessReviews.java is the main class.
26 | 
27 | To use the source code, you'll have to edit a couple of things to start:
28 | - in *jwnl14_file_properties.xml*, you'll need to update the *dictionary_path* parameter
29 | - in *ProcessReviews.java*, you'll have to update the *defaultRootDir* variable to point to the required data sets
30 | 
31 | You can use the *exportjar.jardesc* file in Eclipse to package up a new executable jar file.
32 | 
33 | ### Citing
34 | 
35 | If using this code, please cite the paper:
36 | 
37 |     @inproceedings{carter:aspectspecificcotraining,
38 |      author = {Carter, Dave and Inkpen, Diana},
39 |      title = {Inferring Aspect-Specific Opinion Structure in Product Reviews using Co-training},
40 |      booktitle = {Proceedings of CICLing-2015},
41 |      series = {Lecture Notes in Computer Science 9042},
42 |      year = {2015},
43 |      isbn = {978-3-319-18117-2},
44 |      location = {Cairo, Egypt},
45 |      pages = {225--240},
46 |      numpages = {16},
47 |      url = {http://dx.doi.org/10.1007/978-3-319-18117-2_17},
48 |      doi = {10.1007/978-3-319-18117-2_17},
49 |      publisher = {Springer-Verlag},
50 |      address = {Berlin, Heidelberg},
51 |     } 
52 | 
53 | 


--------------------------------------------------------------------------------
/bin/commons-logging.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/commons-logging.jar


--------------------------------------------------------------------------------
/bin/ejml-0.23.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/ejml-0.23.jar


--------------------------------------------------------------------------------
/bin/joda-time.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/joda-time.jar


--------------------------------------------------------------------------------
/bin/jollyday.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/jollyday.jar


--------------------------------------------------------------------------------
/bin/jwnl.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/jwnl.jar


--------------------------------------------------------------------------------
/bin/libsvm.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/libsvm.jar


--------------------------------------------------------------------------------
/bin/processreviews-sept2014.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/processreviews-sept2014.jar


--------------------------------------------------------------------------------
/bin/stanford-corenlp-3.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/stanford-corenlp-3.3.1.jar


--------------------------------------------------------------------------------
/commandline-howto.txt:
--------------------------------------------------------------------------------
  1 | java -jar processreviews.jar > results.txt 2>&1
  2 | 
  3 | SENTENCES, no cotraining
  4 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 6 null 8 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t6cnt8-2500.txt 2>&1 &
  5 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 7 null 9 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t7cnt9-2500.txt 2>&1 &
  6 | 
  7 | SENTENCES, cotraining
  8 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 12 13 8 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t12c13t8-2500.txt 2>&1 &
  9 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 13 12 8 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t13c12t8-2500.txt 2>&1 &
 10 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 14 15 9 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t14c15t9-2500.txt 2>&1 &
 11 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 15 14 9 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t15c14t9-2500.txt 2>&1 &
 12 | 
 13 | 
 14 | TASK2, no cotraining
 15 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 6 null 8 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t6cnt8-2500.txt 2>&1 &
 16 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 7 null 9 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t7cnt9-2500.txt 2>&1 &
 17 | 
 18 | TASK2, cotraining
 19 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500.txt 2>&1
 20 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500.txt 2>&1
 21 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500.txt 2>&1
 22 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500.txt 2>&1
 23 | 
 24 | TASK2, cotraining, trying different thresholds
 25 | 
 26 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055.txt 2>&1 &
 27 | 
 28 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.65 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c065.txt 2>&1 &
 29 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.65 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c065.txt 2>&1 &
 30 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.65 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c065.txt 2>&1 &
 31 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.65 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c065.txt 2>&1 &
 32 | 
 33 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.75 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c075.txt 2>&1 &
 34 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.75 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c075.txt 2>&1 &
 35 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.75 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c075.txt 2>&1 &
 36 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.75 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c075.txt 2>&1 &
 37 | 
 38 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.85 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c085.txt 2>&1 &
 39 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.85 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c085.txt 2>&1 &
 40 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.85 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c085.txt 2>&1 &
 41 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.85 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c085.txt 2>&1 &
 42 | 
 43 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.95 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c095.txt 2>&1 &
 44 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.95 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c095.txt 2>&1 &
 45 | on mbp - java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.95 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c095.txt 2>&1 &
 46 | on mbp - java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.95 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c095.txt 2>&1 &
 47 | 
 48 | TASK2, trying different max number of iterations
 49 | 
 50 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 null 8 2500 EXACT 0.55 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-0coiteration.txt 2>&1 &
 51 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 1 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-1coiteration.txt 2>&1 &
 52 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 2 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-2coiteration.txt 2>&1 &
 53 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 3 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-3coiteration.txt 2>&1 &
 54 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 4 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-4coiteration.txt 2>&1 &
 55 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 5 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-5coiteration.txt 2>&1 &
 56 | 
 57 | 
 58 | TASK2, cotraining baselines
 59 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 null 8 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12cnt8-2500.txt 2>&1 &
 60 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 null 8 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13cnt8-2500.txt 2>&1 &
 61 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 null 9 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14cnt9-2500.txt 2>&1 &
 62 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 null 9 >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15cnt9-2500.txt 2>&1 &
 63 | 
 64 | 
 65 | TASK1, no cotraining
 66 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 6 null 8 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t6cnt8-2500.txt 2>&1 &
 67 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 7 null 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t7cnt9-2500.txt 2>&1 &
 68 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 6 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t6cnt8-2500.txt 2>&1 &
 69 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 7 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t7cnt9-2500.txt 2>&1 &
 70 | 
 71 | TASK1, cotraining
 72 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 13 8 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t12c13t8-2500.txt 2>&1 &
 73 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 12 8 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t13c12t8-2500.txt 2>&1 &
 74 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 15 9 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t14c15t9-2500.txt 2>&1 &
 75 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 14 9 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t15c14t9-2500.txt 2>&1 &
 76 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 13 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t12c13t8-2500.txt 2>&1 &
 77 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 12 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t13c12t8-2500.txt 2>&1 &
 78 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 15 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t14c15t9-2500.txt 2>&1 &
 79 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 14 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t15c14t9-2500.txt 2>&1 &
 80 | 
 81 | TASK1, cotraining baselines
 82 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 null 8 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t12cnt8-2500.txt 2>&1 &
 83 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 null 8 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t13cnt8-2500.txt 2>&1 &
 84 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 null 9 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t14cnt9-2500.txt 2>&1 &
 85 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 null 9 2500 EXACT >   /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t15cnt9-2500.txt 2>&1 &
 86 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t12cnt8-2500.txt 2>&1 &
 87 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t13cnt8-2500.txt 2>&1 &
 88 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t14cnt9-2500.txt 2>&1 &
 89 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t15cnt9-2500.txt 2>&1 &
 90 | 
 91 | 
 92 | TASK1, BINGLIU, CROSS VALIDATION
 93 | 
 94 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 1 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t1cntn-2500.txt 2>&1 &
 95 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 2 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t2cntn-2500.txt 2>&1 &
 96 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 3 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t3cntn-2500.txt 2>&1 &
 97 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 4 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t4cntn-2500.txt 2>&1 &
 98 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 5 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t5cntn-2500.txt 2>&1 &
 99 | 
100 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t1cntn-2500.txt 2>&1 &
101 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t2cntn-2500.txt 2>&1 &
102 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t3cntn-2500.txt 2>&1 &
103 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 4 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t4cntn-2500.txt 2>&1 &
104 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t5cntn-2500.txt 2>&1 &
105 | 
106 | (underway)
107 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 1 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t1cntn-2500-nogen.txt 2>&1 &
108 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 2 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t2cntn-2500-nogen.txt 2>&1 &
109 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 3 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t3cntn-2500-nogen.txt 2>&1 &
110 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 4 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t4cntn-2500-nogen.txt 2>&1 &
111 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 5 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t5cntn-2500-nogen.txt 2>&1 &
112 | 
113 | (underway)
114 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t1cntn-2500-nogen.txt 2>&1 &
115 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t2cntn-2500-nogen.txt 2>&1 &
116 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t3cntn-2500-nogen.txt 2>&1 &
117 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 4 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t4cntn-2500-nogen.txt 2>&1 &
118 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t5cntn-2500-nogen.txt 2>&1 &
119 | 
120 | 
121 | SENTENCES, BINGLIU, CROSS VALIDATION
122 | 
123 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1cntn-2500.txt 2>&1 &
124 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2cntn-2500.txt 2>&1 &
125 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3cntn-2500.txt 2>&1 &
126 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4cntn-2500.txt 2>&1 &
127 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5cntn-2500.txt 2>&1 &
128 | 
129 | (underway Nov 8)
130 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1cntn-2500-nogen.txt 2>&1 &
131 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 2 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t2cntn-2500-nogen.txt 2>&1 &
132 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 3 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t3cntn-2500-nogen.txt 2>&1 &
133 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 4 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t4cntn-2500-nogen.txt 2>&1 &
134 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 5 null null 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t5cntn-2500-nogen.txt 2>&1 &
135 | 
136 | SENTENCES, BINGLIU, DOMAIN ADAPTATION
137 | 
138 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2,3,4,5 null 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2345cnt1-2500.txt 2>&1 &
139 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,3,4,5 null 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1345cnt2-2500.txt 2>&1 &
140 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,4,5 null 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1245cnt3-2500.txt 2>&1 &
141 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,3,5 null 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1235cnt4-2500.txt 2>&1 &
142 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,3,4 null 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1234cnt5-2500.txt 2>&1 &
143 | 
144 | (started)
145 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 2,3,4,5 null 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t2345cnt1-2500-nogen.txt 2>&1 &
146 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,3,4,5 null 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1345cnt2-2500-nogen.txt 2>&1 &
147 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,4,5 null 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1245cnt3-2500-nogen.txt 2>&1 &
148 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,3,5 null 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1235cnt4-2500-nogen.txt 2>&1 &
149 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,3,4 null 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1234cnt5-2500-nogen.txt 2>&1 &
150 | 
151 | 
152 | SENTENCES, BINGLIU, DOMAIN ADAPTATION COTRAINING
153 | 
154 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 3,4,5 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c345t1-2500.txt 2>&1 &
155 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 2,4,5 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c245t1-2500.txt 2>&1 &
156 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 2,3,5 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c235t1-2500.txt 2>&1 &
157 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 2,3,4 1 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c234t1-2500.txt 2>&1 &
158 | 
159 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 3,4,5 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c345t2-2500.txt 2>&1 &
160 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,4,5 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c145t2-2500.txt 2>&1 &
161 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,3,5 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c135t2-2500.txt 2>&1 &
162 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,3,4 2 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c134t2-2500.txt 2>&1 &
163 | 
164 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,4,5 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c245t3-2500.txt 2>&1 &
165 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,4,5 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c145t3-2500.txt 2>&1 &
166 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,2,5 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c125t3-2500.txt 2>&1 &
167 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,2,4 3 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c124t3-2500.txt 2>&1 &
168 | 
169 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,3,5 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c235t4-2500.txt 2>&1 &
170 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,3,5 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c135t4-2500.txt 2>&1 &
171 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,2,5 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c125t4-2500.txt 2>&1 &
172 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,2,3 4 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c123t4-2500.txt 2>&1 &
173 | 
174 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,3,4 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c234t5-2500.txt 2>&1 &
175 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,3,4 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c134t5-2500.txt 2>&1 &
176 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,2,4 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c124t5-2500.txt 2>&1 &
177 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,2,3 5 2500 PARTIAL >  /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c123t5-2500.txt 2>&1 &
178 | 
179 | 
180 | 
181 | 
182 | EC2 stuff:
183 | 
184 | https://console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:
185 | ssh -i ~/.ssh/amazon-ec2-free.pem ec2-user@54.213.228.153
186 | scp -i ~/.ssh/amazon-ec2-free.pem -r ~/Dropbox/EC2\ image/* ec2-user@54.213.248.172:


--------------------------------------------------------------------------------
/exportjar.jardesc:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <jardesc>
 3 |     <jar path="process-reviews/dist/processreviews-nov2014-nogenericremoval.jar"/>
 4 |     <options buildIfNeeded="true" compress="true" descriptionLocation="/process-reviews/exportjar.jardesc" exportErrors="true" exportWarnings="true" includeDirectoryEntries="false" overwrite="true" saveDescription="true" storeRefactorings="false" useSourceFolders="false"/>
 5 |     <storedRefactorings deprecationInfo="true" structuralOnly="false"/>
 6 |     <selectedProjects/>
 7 |     <manifest generateManifest="false" mainClassHandleIdentifier="=process-reviews/src&lt;ca.carter.thesis{ProcessReviews.java[ProcessReviews" manifestLocation="/process-reviews/manifest" manifestVersion="1.0" reuseManifest="false" saveManifest="true" usesManifest="true">
 8 |         <sealing sealJar="false">
 9 |             <packagesToSeal/>
10 |             <packagesToUnSeal/>
11 |         </sealing>
12 |     </manifest>
13 |     <selectedElements exportClassFiles="true" exportJavaFiles="false" exportOutputFolder="false">
14 |         <javaElement handleIdentifier="=process-reviews/src"/>
15 |     </selectedElements>
16 | </jardesc>
17 | 


--------------------------------------------------------------------------------
/jwnl14_file_properties.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <jwnl_properties language="en">
 3 | 	<version publisher="Princeton" number="3.1" language="en"/>
 4 | 	<dictionary class="net.didion.jwnl.dictionary.FileBackedDictionary">
 5 | 		<param name="morphological_processor" value="net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor">
 6 | 			<param name="operations">
 7 | 				<param value="net.didion.jwnl.dictionary.morph.LookupExceptionsOperation"/>
 8 | 				<param value="net.didion.jwnl.dictionary.morph.DetachSuffixesOperation">
 9 | 					<param name="noun" value="|s=|ses=s|xes=x|zes=z|ches=ch|shes=sh|men=man|ies=y|"/>
10 | 					<param name="verb" value="|s=|ies=y|es=e|es=|ed=e|ed=|ing=e|ing=|"/>
11 | 					<param name="adjective" value="|er=|est=|er=e|est=e|"/>
12 |                     <param name="operations">
13 |                         <param value="net.didion.jwnl.dictionary.morph.LookupIndexWordOperation"/>
14 |                         <param value="net.didion.jwnl.dictionary.morph.LookupExceptionsOperation"/>
15 |                     </param>
16 | 				</param>
17 | 				<param value="net.didion.jwnl.dictionary.morph.TokenizerOperation">
18 | 					<param name="delimiters">
19 | 						<param value=" "/>
20 | 						<param value="-"/>
21 | 					</param>
22 | 					<param name="token_operations">
23 |                         <param value="net.didion.jwnl.dictionary.morph.LookupIndexWordOperation"/>
24 | 						<param value="net.didion.jwnl.dictionary.morph.LookupExceptionsOperation"/>
25 | 						<param value="net.didion.jwnl.dictionary.morph.DetachSuffixesOperation">
26 | 							<param name="noun" value="|s=|ses=s|xes=x|zes=z|ches=ch|shes=sh|men=man|ies=y|"/>
27 | 							<param name="verb" value="|s=|ies=y|es=e|es=|ed=e|ed=|ing=e|ing=|"/>
28 | 							<param name="adjective" value="|er=|est=|er=e|est=e|"/>
29 |                             <param name="operations">
30 |                                 <param value="net.didion.jwnl.dictionary.morph.LookupIndexWordOperation"/>
31 |                                 <param value="net.didion.jwnl.dictionary.morph.LookupExceptionsOperation"/>
32 |                             </param>
33 | 						</param>
34 | 					</param>
35 | 				</param>
36 | 			</param>
37 | 		</param>
38 | 		<param name="dictionary_element_factory" value="net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory"/>
39 | 		<param name="file_manager" value="net.didion.jwnl.dictionary.file_manager.FileManagerImpl">
40 | 			<param name="file_type" value="net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile"/>
41 | 			<param name="dictionary_path" value="/Users/davecarter/Dropbox/Thesis data/wordnet31dict"/>
42 | 		</param>
43 | 	</dictionary>
44 | 	<resource class="PrincetonResource"/>
45 | </jwnl_properties>


--------------------------------------------------------------------------------
/lib/commons-logging.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/commons-logging.jar


--------------------------------------------------------------------------------
/lib/ejml-0.23.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/ejml-0.23.jar


--------------------------------------------------------------------------------
/lib/jackson-annotations-2.2.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-annotations-2.2.3.jar


--------------------------------------------------------------------------------
/lib/jackson-core-2.3.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-core-2.3.2.jar


--------------------------------------------------------------------------------
/lib/jackson-databind-2.2.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-databind-2.2.3.jar


--------------------------------------------------------------------------------
/lib/joda-time.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/joda-time.jar


--------------------------------------------------------------------------------
/lib/jollyday.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jollyday.jar


--------------------------------------------------------------------------------
/lib/jwnl.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jwnl.jar


--------------------------------------------------------------------------------
/lib/libsvm.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/libsvm.jar


--------------------------------------------------------------------------------
/lib/stanford-corenlp-3.3.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/stanford-corenlp-3.3.1.jar


--------------------------------------------------------------------------------
/src/ca/carter/thesis/RetrainingThread.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis;
 2 | 
 3 | import ca.carter.thesis.ml.SVMTokenModel;
 4 | 
 5 | public class RetrainingThread extends Thread {
 6 | 
 7 | 	SVMTokenModel model;
 8 | 	
 9 | 	
10 | 	public RetrainingThread(SVMTokenModel model) {
11 | 		super();
12 | 		this.model = model;
13 | 	}
14 | 	
15 | 	public void run() {
16 | 		model.retrain(null);
17 | 	}
18 | }
19 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ReviewFileReaderFlat.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis;
 2 | 
 3 | import java.io.BufferedReader;
 4 | import java.io.File;
 5 | import java.io.FileReader;
 6 | import java.io.IOException;
 7 | import java.util.LinkedList;
 8 | import java.util.Queue;
 9 | 
10 | import ca.carter.thesis.model.SimpleSentence;
11 | 
12 | public class ReviewFileReaderFlat {
13 | 
14 | 	static Queue<SimpleSentence> readReviewFile(File file, Integer limit) throws IOException
15 | 	{
16 | 		Queue<SimpleSentence> output = new LinkedList<SimpleSentence>();
17 | 		
18 | 		BufferedReader br = new BufferedReader(new FileReader(file));
19 | 		String line;
20 | 		int lineNum = 0;
21 | 		while ((line = br.readLine()) != null) {
22 | 			if (line.isEmpty() || line.charAt(0) == '*')
23 | 			{
24 | 				//do nothing
25 | 			}
26 | 			else
27 | 			{
28 | 				output.add(new SimpleSentence(line, true));
29 | 				
30 | 				if (limit != null && lineNum++ >= limit)
31 | 					break;
32 | 			}
33 | 		}
34 | 		br.close();
35 | 		
36 | 		return output;
37 | 	}
38 | 	
39 | }
40 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ReviewFileReaderXML.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis;
  2 | 
  3 | import javax.xml.parsers.DocumentBuilderFactory;
  4 | import javax.xml.parsers.DocumentBuilder;
  5 | 
  6 | import org.w3c.dom.Document;
  7 | import org.w3c.dom.NodeList;
  8 | import org.w3c.dom.Node;
  9 | import org.w3c.dom.Element;
 10 | 
 11 | import ca.carter.thesis.model.ProductFeatureOpinion;
 12 | import ca.carter.thesis.model.Sentence;
 13 | import ca.carter.thesis.model.SimpleSentence;
 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 15 | 
 16 | import java.io.File;
 17 | import java.io.IOException;
 18 | import java.util.ArrayList;
 19 | import java.util.LinkedList;
 20 | import java.util.Queue;
 21 | 
 22 | //mostly borrowed from http://www.mkyong.com/java/how-to-read-xml-file-in-java-dom-parser/
 23 | 
 24 | public class ReviewFileReaderXML {
 25 | 
 26 | 	static Queue<SimpleSentence> readReviewFile(File file, Integer limit) throws IOException
 27 | 	{
 28 | 		StanfordCoreNLP pipeline = Sentence.getDefaultPipeline();
 29 | 		
 30 | 		Queue<SimpleSentence> output = new LinkedList<SimpleSentence>();
 31 | 		
 32 | 		try {
 33 | 			DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
 34 | 			DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
 35 | 			Document doc = dBuilder.parse(file);
 36 | 
 37 | 			//optional, but recommended
 38 | 			//read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work
 39 | 			doc.getDocumentElement().normalize();
 40 | 
 41 | 			//System.out.println("Root element :" + doc.getDocumentElement().getNodeName());
 42 | 
 43 | 			NodeList nList = doc.getElementsByTagName("sentence");
 44 | 
 45 | 			//System.out.println("----------------------------");
 46 | 
 47 | 			final int listLength = nList.getLength();
 48 | 			int sentenceNum = 0;
 49 | 			
 50 | 			for (int temp = 0; temp < Math.min(listLength, (limit == null ? listLength : limit)); temp++) {
 51 | 
 52 | 				Node nNode = nList.item(temp);
 53 | 
 54 | 				//System.out.println("\nCurrent Element :" + nNode.getNodeName());
 55 | 
 56 | 				if (nNode.getNodeType() == Node.ELEMENT_NODE) {
 57 | 
 58 | 					Element eElement = (Element) nNode;
 59 | 
 60 | 					String sentenceText = eElement.getElementsByTagName("text").item(0).getTextContent();
 61 | 					
 62 | 					SimpleSentence sentenceToReturn = new SimpleSentence(sentenceText, false);
 63 | 					
 64 | 					//System.out.println("ID : " + eElement.getAttribute("id"));
 65 | 					
 66 | //					if (sentenceNum == 1521 || sentenceNum == 1522)
 67 | //						System.out.println("Sentence " + temp + ": " + sentenceText); //eElement.getElementsByTagName("text").item(0).getTextContent());
 68 | 					sentenceNum++;
 69 | 					
 70 | 					if (eElement.getElementsByTagName("aspectTerms").getLength() == 0 )
 71 | 					{
 72 | 						//System.out.println("No aspects");
 73 | 					}
 74 | 					else
 75 | 					{
 76 | 						sentenceToReturn.setOpinions(new ArrayList<ProductFeatureOpinion>());
 77 | 						
 78 | 						
 79 | 						NodeList aspectsNodeList =  ((Element) eElement.getElementsByTagName("aspectTerms").item(0) ).getElementsByTagName("aspectTerm");
 80 | 						final int numberOfAspects = aspectsNodeList.getLength();
 81 | 						
 82 | 						//System.out.println(numberOfAspects + " aspects");
 83 | 						
 84 | 						for (int i = 0; i < numberOfAspects; i++)
 85 | 						{
 86 | 							Element aspectElement = (Element) aspectsNodeList.item(i);
 87 | 	
 88 | 							//System.out.println("     Aspect:   " + aspectElement.getAttribute("term"));
 89 | 							//System.out.println("     Polarity: " + aspectElement.getAttribute("polarity"));
 90 | 							//System.out.println("     From: " + aspectElement.getAttribute("from"));
 91 | 							//System.out.println("     To: " + aspectElement.getAttribute("to"));
 92 | 							
 93 | 							String aspect = aspectElement.getAttribute("term");
 94 | 							String polarity = aspectElement.getAttribute("polarity");
 95 | 							int from = Integer.valueOf(aspectElement.getAttribute("from"));
 96 | 							int to = Integer.valueOf(aspectElement.getAttribute("to"));
 97 | 							
 98 | 							if ("conflict".equals(polarity))
 99 | 							{
100 | 								sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, "positive", from, to , pipeline));
101 | 								sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, "negative", from, to , pipeline));
102 | 
103 | 							}
104 | 							else
105 | 							{
106 | 								sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, polarity, from, to , pipeline));
107 | 							}
108 | 						}
109 | 						
110 | 					}
111 | 					
112 | 					output.add(sentenceToReturn);
113 | 				}
114 | 			}
115 | 		} catch (Exception e) {
116 | 			e.printStackTrace();
117 | 		}
118 | 		
119 | 		return output;
120 | 	}
121 | 	
122 | 	public static void main(String[] args) {
123 | 		File xmlFile = new File("/Users/davecarter/Dropbox/Thesis data/Semeval-2014-task4/Restaurants_Train_v2.xml");
124 | 
125 | 		
126 | 		try {
127 | 			Queue<SimpleSentence> sentences = readReviewFile(xmlFile, null);
128 | 			
129 | 			System.out.println("Parsed " + sentences.size() + " sentences.");
130 | 		} catch (IOException e) {
131 | 			// TODO Auto-generated catch block
132 | 			e.printStackTrace();
133 | 		}
134 | 	}
135 | }
136 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/SeedModelCreatorThread.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis;
 2 | 
 3 | import java.io.BufferedWriter;
 4 | import java.io.FileWriter;
 5 | import java.io.IOException;
 6 | import java.io.Writer;
 7 | import java.util.List;
 8 | 
 9 | import ca.carter.thesis.ml.ClassWeighting;
10 | import ca.carter.thesis.ml.ModelType;
11 | import ca.carter.thesis.ml.SVMTokenModel;
12 | import ca.carter.thesis.ml.SVMTokenModelFeature;
13 | import ca.carter.thesis.ml.SVMTokenModelSentiment;
14 | import ca.carter.thesis.model.Task;
15 | import ca.carter.thesis.model.TokenWithContext;
16 | 
17 | public class SeedModelCreatorThread extends Thread {
18 | 
19 | 	private List<SVMTokenModel> models;
20 | 	private String modelFileOutput;
21 | 	private String fileName;
22 | 	private ModelType modelType;
23 | 	private Task task;
24 | 	private List<TokenWithContext> seedTokens;
25 | 	private ClassWeighting classWeighting;
26 | 	private Double c;
27 | 	private Double gamma;
28 | 	private Double epsilon;
29 | 	
30 | 	public SeedModelCreatorThread(List<SVMTokenModel> models, String modelFileOutput, String fileName,
31 | 			ModelType modelType, Task task, List<TokenWithContext> seedTokens, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {
32 | 		super();
33 | 		this.models = models;
34 | 		this.modelFileOutput = modelFileOutput;
35 | 		this.fileName = fileName;
36 | 		this.modelType = modelType;
37 | 		this.task = task;
38 | 		this.seedTokens = seedTokens;
39 | 		this.classWeighting = classWeighting;
40 | 		this.c = c;
41 | 		this.gamma = gamma;
42 | 		this.epsilon = epsilon;
43 | 	}
44 | 
45 | 
46 | 
47 | 	public void run() {
48 | 		Writer[] modelWriter = null;
49 | 		if (modelFileOutput != null)
50 | 		{
51 | 			try {
52 | 				modelWriter = new Writer[2];
53 | 				modelWriter[0] = new BufferedWriter(new FileWriter(modelFileOutput + "view0lexical" + fileName));
54 | 				modelWriter[1] = new BufferedWriter(new FileWriter(modelFileOutput + "view1syntactic" + fileName));				
55 | 			} catch (IOException e) {
56 | 				// TODO Auto-generated catch block
57 | 				e.printStackTrace();
58 | 			}
59 | 		}
60 | 		
61 | 		SVMTokenModel model = null;
62 | 		switch (modelType)
63 | 		{
64 | 		case FEATURE:
65 | 			model = new SVMTokenModelFeature(task, seedTokens, modelWriter, classWeighting, c, gamma, epsilon);
66 | 			break;
67 | 		case SENTIMENT:
68 | 			model = new SVMTokenModelSentiment(task, seedTokens, modelWriter, classWeighting, c, gamma, epsilon);
69 | 			break;
70 | 		}
71 | 		
72 | 		System.out.println(modelType + ": C is " + model.getC(0) + ", gamma is " + model.getGamma(0) + ", and epsilon is " + model.getEpsilon() + "; using " + classWeighting + " weighting policy." );
73 | 		models.add(model);
74 | 
75 | 		
76 | 	}
77 | }
78 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/SentenceProcessorThread.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis;
 2 | 
 3 | import java.util.List;
 4 | import java.util.Queue;
 5 | 
 6 | import ca.carter.thesis.model.ProductFeatureOpinion;
 7 | import ca.carter.thesis.model.Sentence;
 8 | import ca.carter.thesis.model.SimpleSentence;
 9 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
10 | 
11 | public class SentenceProcessorThread extends Thread
12 | {
13 | 	private Queue<SimpleSentence> rawSentences;
14 | 	private List<Sentence> sentences;
15 | 	private String genericName;
16 | 	private String brandName;
17 | 	private int titleLineIgnored = 0;
18 | 	private int sentencesProcessed = 0;
19 | 	
20 | 	protected static Integer lock = 1;
21 | 	
22 | 	public SentenceProcessorThread(Queue<SimpleSentence> rawSentences, List<Sentence> sentences, String genericName, String brandName)
23 | 	{
24 | 		this.sentences = sentences;
25 | 		this.rawSentences = rawSentences;
26 | 		this.genericName = genericName;
27 | 		this.brandName = brandName;
28 | 	}
29 | 	
30 | 	public int getTitleLineIgnored() {
31 | 		return titleLineIgnored;
32 | 	}
33 | 
34 | 	public int getSentencesProcessed() {
35 | 		return sentencesProcessed;
36 | 	}
37 | 
38 | 	public void run() {
39 | 		
40 | 		try {
41 | 			StanfordCoreNLP pipeline = Sentence.getDefaultPipeline();
42 | 			StanfordCoreNLP featurePipeline = ProductFeatureOpinion.getDefaultPipeline();
43 | 
44 | 			SimpleSentence nextLine = null;
45 | 
46 | 			if (rawSentences == null || rawSentences.isEmpty())
47 | 			{
48 | 				System.err.println("Sentence list was null/empty.");
49 | 				return;
50 | 			}
51 | 			
52 | 			synchronized(lock) {
53 | 				nextLine = rawSentences.poll();
54 | 			}
55 | 			
56 | 			while (nextLine != null) {
57 | 				if (nextLine.isNeedsOpinionParsing() == true && nextLine.getSentence().startsWith("[t]"))
58 | 				{
59 | 					titleLineIgnored++;
60 | 					//TODO: this would be useful at some point; but for now, skipping titles
61 | 				}
62 | 				else
63 | 				{
64 | 					try
65 | 					{
66 | 						Sentence sentence = null;
67 | 						if (nextLine.isNeedsOpinionParsing() == true)
68 | 							sentence = new Sentence(nextLine, pipeline, featurePipeline, genericName, brandName, false);
69 | 						else
70 | 							sentence = new Sentence(nextLine, pipeline, featurePipeline, genericName, brandName, false);
71 | 							
72 | 						sentencesProcessed++;
73 | 						synchronized(sentences)
74 | 						{
75 | 							sentences.add(sentence);
76 | 						}
77 | 					}
78 | 					catch (Exception e)
79 | 					{
80 | 						System.err.println("Had trouble parsing " + nextLine);
81 | 						e.printStackTrace();
82 | 					}
83 | 				}
84 | 				synchronized(lock) {
85 | 					nextLine = rawSentences.poll();
86 | 				}
87 | 
88 | 			}
89 | 		} catch (Exception e) {
90 | 			// TODO Auto-generated catch block
91 | 			e.printStackTrace();
92 | 		}
93 | 		
94 | 		System.out.println("Processed " + sentencesProcessed);
95 | 	}
96 | 
97 | }
98 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/TokenPredictorThread.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis;
 2 | 
 3 | import ca.carter.thesis.ml.Prediction;
 4 | import ca.carter.thesis.ml.SVMTokenModel;
 5 | import ca.carter.thesis.model.TokenWithContext;
 6 | 
 7 | public class TokenPredictorThread extends Thread {
 8 | 
 9 | 	private SVMTokenModel model;
10 | 	private TokenWithContext nextCotrainingToken;
11 | 	private Prediction prediction;
12 | 	
13 | 	
14 | 	
15 | 	public TokenPredictorThread(SVMTokenModel model
16 | 			) {
17 | 		super();
18 | 		this.model = model;
19 | 	}
20 | 
21 | 
22 | 
23 | 	public TokenWithContext getNextCotrainingToken() {
24 | 		return nextCotrainingToken;
25 | 	}
26 | 
27 | 
28 | 
29 | 	public void setNextCotrainingToken(TokenWithContext nextCotrainingToken) {
30 | 		this.nextCotrainingToken = nextCotrainingToken;
31 | 	}
32 | 
33 | 
34 | 
35 | 	public Prediction getPrediction() {
36 | 		return prediction;
37 | 	}
38 | 
39 | 
40 | 
41 | 	public void setPrediction(Prediction prediction) {
42 | 		this.prediction = prediction;
43 | 	}
44 | 
45 | 
46 | 
47 | 	public void run() {
48 | 		prediction = model.predict(nextCotrainingToken);
49 | 		if (prediction == null)
50 | 			System.err.println("Null prediction for " + nextCotrainingToken);
51 | 	}
52 | }
53 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/WikipediaParaphraser.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.InputStreamReader;
  5 | import java.net.HttpURLConnection;
  6 | import java.net.URL;
  7 | import java.net.URLEncoder;
  8 | import java.util.ArrayList;
  9 | import java.util.Collections;
 10 | import java.util.HashMap;
 11 | import java.util.List;
 12 | import java.util.Map;
 13 | 
 14 | import com.fasterxml.jackson.databind.JsonNode;
 15 | import com.fasterxml.jackson.databind.ObjectMapper;
 16 | 
 17 | public class WikipediaParaphraser {
 18 | 
 19 | 	//caching requests to be polite to the wikipedia folks
 20 | 	private static final Map<String, List<String>> cache = Collections.synchronizedMap( new HashMap<String, List<String>>() );
 21 | 	private static final boolean debug = false;
 22 | 	
 23 | 	public static List<String> getParaphrases(String string, boolean printDebug)
 24 | 	{
 25 | 		try
 26 | 		{
 27 | 			List<String> cachedCopy = cache.get(string);
 28 | 			if (cachedCopy != null)
 29 | 			{
 30 | 				if (debug)
 31 | 					System.out.println("Cache hit pos.");
 32 | 				return cachedCopy;
 33 | 			}
 34 | 			else if (cache.containsKey(string))
 35 | 			{
 36 | 				if (debug)
 37 | 					System.out.println("Cache hit neg.");
 38 | 				return null;
 39 | 			}
 40 | 			else
 41 | 			{
 42 | 				if (debug)
 43 | 					System.out.println("Cache miss.");
 44 | 			}
 45 | 			
 46 | 			//String jsonFromWikipedia = 
 47 | 			//		"{\"query\":{\"normalized\":[{\"from\":\"picture quality\",\"to\":\"Picture quality\"}],\"pages\":{\"38253269\":{\"pageid\":38253269,\"ns\":0,\"title\":\"Picture quality\",\"revisions\":[{\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"#redirect [[image quality]]\"}]}}}}";
 48 | 					//"{\"query\":{\"normalized\":[{\"from\":\"scroll button\",\"to\":\"Scroll button\"}],\"pages\":{\"-1\":{\"ns\":0,\"title\":\"Scroll button\",\"missing\":\"\"}}}}"
 49 | 			
 50 | 			String jsonFromWikipedia = getTextFromURL("http://en.wikipedia.org/w/api.php?format=json&action=query&titles=" + URLEncoder.encode(string, "UTF-8")  + "&prop=revisions&rvprop=content");
 51 | 			
 52 | 			//http://en.wikipedia.org/w/api.php?format=json&action=query&titles=picture%20quality&prop=revisions&rvprop=content
 53 | 	
 54 | 			ObjectMapper mapper = new ObjectMapper();
 55 | 			JsonNode rootNode = mapper.readTree(jsonFromWikipedia);
 56 | 			
 57 | 			try
 58 | 			{
 59 | 				JsonNode nameNode = rootNode.get("query").get("pages").elements().next().get("revisions").get(0).get("*");
 60 | 				String nodeTitle = nameNode.asText().toLowerCase();
 61 | 				
 62 | 				if (nodeTitle.startsWith("#redirect"))
 63 | 				{
 64 | 					List<String> results = new ArrayList<String>();
 65 | 					if (debug)
 66 | 						System.out.println(nodeTitle);
 67 | 					String trimmedNodeTitle = nodeTitle.substring(nodeTitle.indexOf("[[") + 2, nodeTitle.indexOf("]]")).replace('_', ' ').trim();
 68 | 					if (trimmedNodeTitle.endsWith("(disambiguation)"))
 69 | 						trimmedNodeTitle = trimmedNodeTitle.substring(0, trimmedNodeTitle.indexOf("(disambiguation)")).trim();
 70 | 						
 71 | 					String[] splitBySection = trimmedNodeTitle.split("#");
 72 | 					for (String nextSection : splitBySection)
 73 | 					{
 74 | 						if (!nextSection.equalsIgnoreCase(string))
 75 | 							results.add(nextSection);
 76 | 					}
 77 | 					if (!results.isEmpty())
 78 | 					{
 79 | 						if (printDebug)
 80 | 							System.out.println("Possible paraphrase: " + string + " ==> " + serializeList(results));
 81 | 
 82 | 						cache.put(string, results);
 83 | 						return results;
 84 | 					}
 85 | 					
 86 | 				}
 87 | 
 88 | 			}
 89 | 			catch (Exception e)
 90 | 			{
 91 | 				//do nothing; just a failure
 92 | 			}
 93 | 			
 94 | 			
 95 | 		} catch (Exception e) {
 96 | 			e.printStackTrace();
 97 | 		}
 98 | 
 99 | 		cache.put(string, null);
100 | 		return null;
101 | 	}
102 | 	
103 | 	//almost verbatim from http://stackoverflow.com/questions/1485708/how-do-i-do-a-http-get-in-java
104 | 	private static String getTextFromURL(String urlToRead) {
105 | 	      URL url;
106 | 	      HttpURLConnection conn;
107 | 	      BufferedReader rd;
108 | 	      String line;
109 | 	      String result = "";
110 | 	      try {
111 | 	         url = new URL(urlToRead);
112 | 	         conn = (HttpURLConnection) url.openConnection();
113 | 	         conn.setRequestProperty("User-Agent", "CarterThesis/1.0 (Macintosh; U; Intel Mac OS X 10.9; en-CA; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2");
114 | 	         conn.setRequestMethod("GET");
115 | 	         rd = new BufferedReader(new InputStreamReader(conn.getInputStream()));
116 | 	         while ((line = rd.readLine()) != null) {
117 | 	            result += line;
118 | 	         }
119 | 	         rd.close();
120 | 	      } catch (Exception e) {
121 | 	         e.printStackTrace();
122 | 	      }
123 | 	      return result;
124 | 	   }
125 | 	
126 | 	public static String serializeList(List<String> list)
127 | 	{
128 | 		if (list == null)
129 | 			return null;
130 | 		
131 | 		StringBuilder sb = new StringBuilder();
132 | 		
133 | 		boolean first = true;
134 | 		for (String nextParaphrase : list)
135 | 		{
136 | 			if (!first)
137 | 				sb.append(", ");
138 | 			first = false;
139 | 			sb.append(nextParaphrase);
140 | 		}
141 | 		
142 | 		return sb.toString();
143 | 	}
144 | 	
145 | 	public static void main(String[] args)
146 | 	{
147 | 		String[] testPhrases = {
148 | 				"picture quality",
149 | 				"set up",
150 | 				"rechargable battery", // ==> Rechargeable_battery
151 | 				"auto focus", // ==> Autofocus
152 | 				"picture quality", // ==> image quality
153 | 				"movie", // ==> film
154 | 				"spot metering", // ==> metering mode#spot metering
155 | 				"dvd player", //should have none, but wikipedia will tend to correct the capitalization
156 | 				"video format",
157 | 				"lens cap", // ==> lens cover
158 | 				"lense", // ==> lens
159 | 				"photo", // ==> photograph
160 | 				"white balance", // ==> color balance
161 | 				"uploading"
162 | 		};
163 | 		
164 | 		for (String nextTestPhrase : testPhrases)
165 | 		{
166 | 			System.out.print(nextTestPhrase);
167 | 			System.out.print(" -> ");
168 | 			List<String> paraphrases = getParaphrases(nextTestPhrase, false);
169 | 			if (paraphrases != null)
170 | 			{	
171 | 				System.out.print(serializeList(paraphrases));
172 | 			}
173 | 			System.out.print("\n");
174 | 			
175 | 		}
176 | 	}
177 | }
178 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/WordNetResolver.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis;
  2 | 
  3 | import java.io.File;
  4 | import java.io.FileInputStream;
  5 | import java.io.FileNotFoundException;
  6 | 
  7 | import net.didion.jwnl.JWNL;
  8 | import net.didion.jwnl.JWNLException;
  9 | import net.didion.jwnl.data.IndexWord;
 10 | import net.didion.jwnl.data.POS;
 11 | import net.didion.jwnl.data.Pointer;
 12 | import net.didion.jwnl.data.PointerType;
 13 | import net.didion.jwnl.data.Synset;
 14 | import net.didion.jwnl.data.Word;
 15 | import net.didion.jwnl.dictionary.Dictionary;
 16 | 
 17 | public class WordNetResolver {
 18 | 
 19 | 	private static final String jwnlRoot ="/Users/" + System.getProperty("user.name") + "/Dropbox/Thesis work/workspace/process-reviews/jwnl14_file_properties.xml";
 20 | 
 21 | 	private static boolean initialized = false;
 22 | 	private static Dictionary dict;
 23 | 	
 24 | 	static 
 25 | 	{
 26 | 		
 27 | 		try {
 28 | 			JWNL.initialize(new FileInputStream(new File(jwnlRoot)));
 29 | 			
 30 | 			dict = Dictionary.getInstance();
 31 | 			
 32 | 			initialized = true;
 33 | 
 34 | 		} catch (FileNotFoundException e) {
 35 | 			// TODO Auto-generated catch block
 36 | 			e.printStackTrace();
 37 | 		} catch (JWNLException e) {
 38 | 			// TODO Auto-generated catch block
 39 | 			e.printStackTrace();
 40 | 		}
 41 | 		catch (Throwable t)
 42 | 		{
 43 | 			t.printStackTrace();
 44 | 		}
 45 | 		
 46 | 	}
 47 | 	
 48 | 	public static synchronized String getAttributeForAdjective(String adj)
 49 | 	{
 50 | 		try
 51 | 		{
 52 | 			if (!initialized)
 53 | 			{
 54 | 				while (!initialized)
 55 | 				{
 56 | 					Thread.sleep(1000);
 57 | 				}
 58 | 			}
 59 | 			IndexWord indexWord = null;
 60 | 			try
 61 | 			{
 62 | 				indexWord = dict.lookupIndexWord(POS.ADJECTIVE, adj);
 63 | 			}
 64 | 			catch (java.util.NoSuchElementException e)
 65 | 			{
 66 | 				System.out.println("Could not find adjective " + adj);
 67 | 				e.printStackTrace();
 68 | 			}
 69 | 	
 70 | 				
 71 | 			if (indexWord == null)
 72 | 				return null;
 73 | 			
 74 | 			Synset firstSense = indexWord.getSense(1); //numbering starts at 1, not 0
 75 | 	
 76 | 			if (firstSense == null)
 77 | 				return null;
 78 | 			
 79 | 			Pointer[] attributePointers = firstSense.getPointers(PointerType.ATTRIBUTE); //should be length 0 or 1 in most cases
 80 | 	
 81 | 			if (attributePointers == null || attributePointers.length == 0)
 82 | 			{
 83 | 				if (indexWord.getSenseCount() == 1)
 84 | 				{
 85 | 					//check for stuff like "low-cost", "low-priced" in synset when it is a fairly well-defined word
 86 | 					for (Word word: firstSense.getWords())
 87 | 					{
 88 | 						if (word.getLemma().startsWith("low-") || word.getLemma().startsWith("high-"))
 89 | 						{
 90 | 							String tentativeValue = word.getLemma().substring(word.getLemma().indexOf("-") + 1);
 91 | 							
 92 | 							if (tentativeValue.endsWith("d"))
 93 | 							{
 94 | 								//if it's a past participle we want to return "price", not "priced"
 95 | 								IndexWord coreVerb = dict.lookupIndexWord(POS.VERB, tentativeValue);
 96 | 								if (coreVerb != null)
 97 | 								{
 98 | 									Synset firstVerbSense = coreVerb.getSense(1); //numbering starts at 1, not 0
 99 | 									if (firstVerbSense != null)
100 | 									{
101 | 										String tentativeReplacementVerb = firstVerbSense.getWords()[0].getLemma();
102 | 										if (tentativeReplacementVerb.substring(0, 2).equalsIgnoreCase(tentativeValue.substring(0, 2)))
103 | 											return tentativeReplacementVerb;
104 | 									}
105 | 								}
106 | 
107 | 							}
108 | 
109 | 							return tentativeValue;
110 | 						}
111 | 					}
112 | 					
113 | 				}
114 | 
115 | 				//fall through
116 | 				return null;
117 | 			}
118 | 			
119 | 			Synset attributeSynset = attributePointers[0].getTargetSynset();
120 | 			
121 | 			if (attributeSynset == null || attributeSynset.getWordsSize() == 0)
122 | 				return null;
123 | 			
124 | 			return attributeSynset.getWords()[0].getLemma();
125 | 		}
126 | 		catch (Exception e)
127 | 		{
128 | 			System.out.println("Could not look up " + adj);
129 | 			e.printStackTrace();
130 | 			return null;
131 | 		}
132 | 		
133 | 	}
134 | 	
135 | 	
136 | 	public static boolean isFeatureNearlySynonymous(String word1, String word2)
137 | 	{
138 | 		try
139 | 		{
140 | 			IndexWord indexWord = dict.lookupIndexWord(POS.NOUN, word1);
141 | 			
142 | 			if (indexWord == null)
143 | 				return false;	//TODO: check the reverse?
144 | 			
145 | 			for (Synset nextSense : indexWord.getSenses())
146 | 			{
147 | 				for (Word nextWord : nextSense.getWords())
148 | 				{
149 | 					if (nextWord.getLemma().equalsIgnoreCase(word2))
150 | 						return true;
151 | 				}
152 | 			}
153 | 			
154 | 			return false;
155 | 		}
156 | 		catch (Exception e)
157 | 		{
158 | 			System.out.println("Could not look up " + word1 + " and " + word2);
159 | 			e.printStackTrace();
160 | 			return false;
161 | 		}
162 | 
163 | 	}
164 | 	
165 | 	
166 | 	public static void main(String[] args)
167 | 	{
168 | 		try {
169 | 			String[] testWords = {
170 | 					
171 | 					"small",  //size
172 | 					"large",
173 | 					"loud",
174 | 					"bright",
175 | 					"wide",
176 | 					"full",
177 | 					"empty",
178 | 					"light", 	//weight
179 | 					"easy",  	//ease
180 | 					"big",    	//size
181 | 					"compact", 	//size
182 | 					"useful",	//none
183 | 					"affordable", //price    **doesn't have attribute, but other words in synset are "low-cost" and "low-priced"
184 | 					"pricey",
185 | 					"heavy",	//weight
186 | 					"beautiful",	//beauty
187 | 					"cold",
188 | 					"razor-sharp",
189 | 					"wicked",
190 | 					"fast",
191 | 					
192 | 			};
193 | 			
194 | 			for (String testWord : testWords)
195 | 			{
196 | 				System.out.println(testWord + " --> " + WordNetResolver.getAttributeForAdjective(testWord));
197 | 			}
198 | 			
199 | 			System.out.println(WordNetResolver.isFeatureNearlySynonymous("cost", "price"));
200 | 			System.out.println(WordNetResolver.isFeatureNearlySynonymous("price", "cost"));
201 | 		} catch (Exception e) {
202 | 			// TODO Auto-generated catch block
203 | 			e.printStackTrace();
204 | 		}
205 | 	}
206 | 	
207 | }
208 | 
209 | 
210 | /*
211 |  * Examples that might be correctable with a better heuristic:
212 | * Found temperature for adjective hot while considering heat
213 | * Found volume for adjective loud while considering sound
214 | * Found comfort for adjective comfortable while considering earbud
215 | Found beauty for adjective ugly while considering style
216 | Found difficulty for adjective difficult while considering software
217 | Found comfort for adjective comfortable while considering earbud
218 | Found attractiveness for adjective attractive while considering design
219 | Found comfort for adjective comfortable while considering earpiece
220 | Found clarity for adjective clear while considering sound quality
221 | 
222 | 
223 | */
224 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/evaluation/ResultsSummary.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.evaluation;
  2 | 
  3 | import ca.carter.thesis.ml.ModelType;
  4 | 
  5 | public class ResultsSummary {
  6 | 	int numTested;
  7 | 	int falsePositives;
  8 | 	double truePositives; //is double for cases where we have a partial result (i.e., multiple aspect-sentiment pairs in a sentence)
  9 | 	double falseNegatives;
 10 | 	int trueNegatives;
 11 | 	ModelType modelType;
 12 | 	
 13 | 	public ResultsSummary(int numTested, double truePositives, int trueNegatives, int falsePositives,
 14 | 			 double falseNegatives, ModelType modelType ) {
 15 | 		super();
 16 | 		this.numTested = numTested;
 17 | 		this.falsePositives = falsePositives;
 18 | 		this.truePositives = truePositives;
 19 | 		this.falseNegatives = falseNegatives;
 20 | 		this.trueNegatives = trueNegatives;
 21 | 		this.modelType = modelType;
 22 | 	}
 23 | 
 24 | 
 25 | 	public ModelType getModelType() {
 26 | 		return modelType;
 27 | 	}
 28 | 
 29 | 
 30 | 	public void setModelType(ModelType modelType) {
 31 | 		this.modelType = modelType;
 32 | 	}
 33 | 
 34 | 
 35 | 	public int getNumTested() {
 36 | 		return numTested;
 37 | 	}
 38 | 
 39 | 
 40 | 	public void setNumTested(int numTested) {
 41 | 		this.numTested = numTested;
 42 | 	}
 43 | 
 44 | 
 45 | 	public int getFalsePositives() {
 46 | 		return falsePositives;
 47 | 	}
 48 | 
 49 | 
 50 | 	public void setFalsePositives(int falsePositives) {
 51 | 		this.falsePositives = falsePositives;
 52 | 	}
 53 | 
 54 | 
 55 | 	public double getTruePositives() {
 56 | 		return truePositives;
 57 | 	}
 58 | 
 59 | 
 60 | 	public void setTruePositives(double truePositives) {
 61 | 		this.truePositives = truePositives;
 62 | 	}
 63 | 
 64 | 
 65 | 	public double getFalseNegatives() {
 66 | 		return falseNegatives;
 67 | 	}
 68 | 
 69 | 
 70 | 	public void setFalseNegatives(double falseNegatives) {
 71 | 		this.falseNegatives = falseNegatives;
 72 | 	}
 73 | 
 74 | 
 75 | 	public int getTrueNegatives() {
 76 | 		return trueNegatives;
 77 | 	}
 78 | 
 79 | 
 80 | 	public void setTrueNegatives(int trueNegatives) {
 81 | 		this.trueNegatives = trueNegatives;
 82 | 	}
 83 | 	
 84 | 	public void printOutResults()
 85 | 	{
 86 | 		System.out.println("True positives: "  + truePositives  + " (" + (100 * truePositives  / numTested) + "%)");
 87 | 		System.out.println("True negatives: "  + trueNegatives  + " (" + (100 * trueNegatives  / numTested) + "%)");
 88 | 		System.out.println("False positives: " + falsePositives + " (" + (100 * falsePositives / numTested) + "%)");
 89 | 		System.out.println("False negatives: " + falseNegatives + " (" + (100 * falseNegatives / numTested) + "%)");
 90 | 	
 91 | 		
 92 | 		System.out.println("Precision: " + getPrecision());
 93 | 		System.out.println("Recall/sensitivity: " + getRecall());
 94 | 		System.out.println("Accuracy: " + getAccuracy());
 95 | 		System.out.println("Specificity: " + getSpecificity());
 96 | 		System.out.println("F1 = " + getF1());
 97 | 		System.out.println("Total tested: " + numTested);
 98 | 		
 99 | 		System.out.println(toThreePlaces(getPrecision()) + " & " + toThreePlaces(getRecall())  + " & " + toThreePlaces(getF1())  + " & " + toThreePlaces(getAccuracy()));
100 | 		
101 | 	}
102 | 	
103 | 	public static double toThreePlaces(double num)
104 | 	{
105 | 		return Math.round(num * 1000) / 1000.0;
106 | 	}
107 | 	
108 | 	public double getPrecision() {
109 | 		if (truePositives == 0)
110 | 			return 0.0;
111 | 		
112 | 		return 1.0 * truePositives / (truePositives + falsePositives);
113 | 	}
114 | 	public double getRecall() {
115 | 		if (truePositives == 0)
116 | 			return 0.0;
117 | 
118 | 		return 1.0 * truePositives / (truePositives + falseNegatives);
119 | 
120 | 	}
121 | 	public double getAccuracy() {
122 | 		return 1.0 * (truePositives + trueNegatives) / (numTested);
123 | 
124 | 	}
125 | 	public double getSpecificity() {
126 | 		if (trueNegatives == 0)
127 | 			return 0.0;
128 | 		
129 | 		return 1.0 * trueNegatives / (trueNegatives + falsePositives);
130 | 
131 | 	}
132 | 	public double getF1() {
133 | 		if (truePositives == 0)
134 | 			return 0.0;
135 | 
136 | 		return (2.0 * getPrecision() * getRecall() / (getPrecision() + getRecall()) );
137 | 	}
138 | 
139 | }
140 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/languagemodels/DefaultTokenizer.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.languagemodels;
 2 | 
 3 | import java.util.StringTokenizer;
 4 | 
 5 | public class DefaultTokenizer {
 6 | 	public static StringTokenizer getDefaultTokenizer(String text)
 7 | 	{
 8 | 		return new StringTokenizer(text," \t\n\r\f:,'\"");
 9 | 	}
10 | 
11 | }
12 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/languagemodels/StopWords.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.languagemodels;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Collections;
  5 | import java.util.List;
  6 | 
  7 | public class StopWords {
  8 | 	
  9 | 	//not used; stop words may be part of product names or feature types (though presumably would never be sentiment-bearing words)
 10 | 	
 11 | 	//from MySQL 5.x : https://dev.mysql.com/doc/refman/5.7/en/fulltext-stopwords.html
 12 | 	
 13 | 	private static final List<String> stopWordList = new ArrayList<String>();
 14 | 	
 15 | 	private static final String[] stopWords = 
 16 | 	{ 
 17 | 		"a's",
 18 | 		  "able",
 19 | 		  "about",
 20 | 		  "above",
 21 | 		  "according",
 22 | 		  "accordingly",
 23 | 		  "across",
 24 | 		  "actually",
 25 | 		  "after",
 26 | 		  "afterwards",
 27 | 		  "again",
 28 | 		  "against",
 29 | 		  "ain't",
 30 | 		  "all",
 31 | 		  "allow",
 32 | 		  "allows",
 33 | 		  "almost",
 34 | 		  "alone",
 35 | 		  "along",
 36 | 		  "already",
 37 | 		  "also",
 38 | 		  "although",
 39 | 		  "always",
 40 | 		  "am",
 41 | 		  "among",
 42 | 		  "amongst",
 43 | 		  "an",
 44 | 		  "and",
 45 | 		  "another",
 46 | 		  "any",
 47 | 		  "anybody",
 48 | 		  "anyhow",
 49 | 		  "anyone",
 50 | 		  "anything",
 51 | 		  "anyway",
 52 | 		  "anyways",
 53 | 		  "anywhere",
 54 | 		  "apart",
 55 | 		  "appear",
 56 | 		  "appreciate",
 57 | 		  "appropriate",
 58 | 		  "are",
 59 | 		  "aren't",
 60 | 		  "around",
 61 | 		  "as",
 62 | 		  "aside",
 63 | 		  "ask",
 64 | 		  "asking",
 65 | 		  "associated",
 66 | 		  "at",
 67 | 		  "available",
 68 | 		  "away",
 69 | 		  "awfully",
 70 | 		  "be",
 71 | 		  "became",
 72 | 		  "because",
 73 | 		  "become",
 74 | 		  "becomes",
 75 | 		  "becoming",
 76 | 		  "been",
 77 | 		  "before",
 78 | 		  "beforehand",
 79 | 		  "behind",
 80 | 		  "being",
 81 | 		  "believe",
 82 | 		  "below",
 83 | 		  "beside",
 84 | 		  "besides",
 85 | 		  "best",
 86 | 		  "better",
 87 | 		  "between",
 88 | 		  "beyond",
 89 | 		  "both",
 90 | 		  "brief",
 91 | 		  "but",
 92 | 		  "by",
 93 | 		  "c'mon",
 94 | 		  "c's",
 95 | 		  "came",
 96 | 		  "can",
 97 | 		  "can't",
 98 | 		  "cannot",
 99 | 		  "cant",
100 | 		  "cause",
101 | 		  "causes",
102 | 		  "certain",
103 | 		  "certainly",
104 | 		  "changes",
105 | 		  "clearly",
106 | 		  "co",
107 | 		  "com",
108 | 		  "come",
109 | 		  "comes",
110 | 		  "concerning",
111 | 		  "consequently",
112 | 		  "consider",
113 | 		  "considering",
114 | 		  "contain",
115 | 		  "containing",
116 | 		  "contains",
117 | 		  "corresponding",
118 | 		  "could",
119 | 		  "couldn't",
120 | 		  "course",
121 | 		  "currently",
122 | 		  "definitely",
123 | 		  "described",
124 | 		  "despite",
125 | 		  "did",
126 | 		  "didn't",
127 | 		  "different",
128 | 		  "do",
129 | 		  "does",
130 | 		  "doesn't",
131 | 		  "doing",
132 | 		  "don't",
133 | 		  "done",
134 | 		  "down",
135 | 		  "downwards",
136 | 		  "during",
137 | 		  "each",
138 | 		  "edu",
139 | 		  "eg",
140 | 		  "eight",
141 | 		  "either",
142 | 		  "else",
143 | 		  "elsewhere",
144 | 		  "enough",
145 | 		  "entirely",
146 | 		  "especially",
147 | 		  "et",
148 | 		  "etc",
149 | 		  "even",
150 | 		  "ever",
151 | 		  "every",
152 | 		  "everybody",
153 | 		  "everyone",
154 | 		  "everything",
155 | 		  "everywhere",
156 | 		  "ex",
157 | 		  "exactly",
158 | 		  "example",
159 | 		  "except",
160 | 		  "far",
161 | 		  "few",
162 | 		  "fifth",
163 | 		  "first",
164 | 		  "five",
165 | 		  "followed",
166 | 		  "following",
167 | 		  "follows",
168 | 		  "for",
169 | 		  "former",
170 | 		  "formerly",
171 | 		  "forth",
172 | 		  "four",
173 | 		  "from",
174 | 		  "further",
175 | 		  "furthermore",
176 | 		  "get",
177 | 		  "gets",
178 | 		  "getting",
179 | 		  "given",
180 | 		  "gives",
181 | 		  "go",
182 | 		  "goes",
183 | 		  "going",
184 | 		  "gone",
185 | 		  "got",
186 | 		  "gotten",
187 | 		  "greetings",
188 | 		  "had",
189 | 		  "hadn't",
190 | 		  "happens",
191 | 		  "hardly",
192 | 		  "has",
193 | 		  "hasn't",
194 | 		  "have",
195 | 		  "haven't",
196 | 		  "having",
197 | 		  "he",
198 | 		  "he's",
199 | 		  "hello",
200 | 		  "help",
201 | 		  "hence",
202 | 		  "her",
203 | 		  "here",
204 | 		  "here's",
205 | 		  "hereafter",
206 | 		  "hereby",
207 | 		  "herein",
208 | 		  "hereupon",
209 | 		  "hers",
210 | 		  "herself",
211 | 		  "hi",
212 | 		  "him",
213 | 		  "himself",
214 | 		  "his",
215 | 		  "hither",
216 | 		  "hopefully",
217 | 		  "how",
218 | 		  "howbeit",
219 | 		  "however",
220 | 		  "i'd",
221 | 		  "i'll",
222 | 		  "i'm",
223 | 		  "i've",
224 | 		  "ie",
225 | 		  "if",
226 | 		  "ignored",
227 | 		  "immediate",
228 | 		  "in",
229 | 		  "inasmuch",
230 | 		  "inc",
231 | 		  "indeed",
232 | 		  "indicate",
233 | 		  "indicated",
234 | 		  "indicates",
235 | 		  "inner",
236 | 		  "insofar",
237 | 		  "instead",
238 | 		  "into",
239 | 		  "inward",
240 | 		  "is",
241 | 		  "isn't",
242 | 		  "it",
243 | 		  "it'd",
244 | 		  "it'll",
245 | 		  "it's",
246 | 		  "its",
247 | 		  "itself",
248 | 		  "just",
249 | 		  "keep",
250 | 		  "keeps",
251 | 		  "kept",
252 | 		  "know",
253 | 		  "knows",
254 | 		  "known",
255 | 		  "last",
256 | 		  "lately",
257 | 		  "later",
258 | 		  "latter",
259 | 		  "latterly",
260 | 		  "least",
261 | 		  "less",
262 | 		  "lest",
263 | 		  "let",
264 | 		  "let's",
265 | 		  "like",
266 | 		  "liked",
267 | 		  "likely",
268 | 		  "little",
269 | 		  "look",
270 | 		  "looking",
271 | 		  "looks",
272 | 		  "ltd",
273 | 		  "mainly",
274 | 		  "many",
275 | 		  "may",
276 | 		  "maybe",
277 | 		  "me",
278 | 		  "mean",
279 | 		  "meanwhile",
280 | 		  "merely",
281 | 		  "might",
282 | 		  "more",
283 | 		  "moreover",
284 | 		  "most",
285 | 		  "mostly",
286 | 		  "much",
287 | 		  "must",
288 | 		  "my",
289 | 		  "myself",
290 | 		  "name",
291 | 		  "namely",
292 | 		  "nd",
293 | 		  "near",
294 | 		  "nearly",
295 | 		  "necessary",
296 | 		  "need",
297 | 		  "needs",
298 | 		  "neither",
299 | 		  "never",
300 | 		  "nevertheless",
301 | 		  "new",
302 | 		  "next",
303 | 		  "nine",
304 | 		  "no",
305 | 		  "nobody",
306 | 		  "non",
307 | 		  "none",
308 | 		  "noone",
309 | 		  "nor",
310 | 		  "normally",
311 | 		  "not",
312 | 		  "nothing",
313 | 		  "novel",
314 | 		  "now",
315 | 		  "nowhere",
316 | 		  "obviously",
317 | 		  "of",
318 | 		  "off",
319 | 		  "often",
320 | 		  "oh",
321 | 		  "ok",
322 | 		  "okay",
323 | 		  "old",
324 | 		  "on",
325 | 		  "once",
326 | 		  "one",
327 | 		  "ones",
328 | 		  "only",
329 | 		  "onto",
330 | 		  "or",
331 | 		  "other",
332 | 		  "others",
333 | 		  "otherwise",
334 | 		  "ought",
335 | 		  "our",
336 | 		  "ours",
337 | 		  "ourselves",
338 | 		  "out",
339 | 		  "outside",
340 | 		  "over",
341 | 		  "overall",
342 | 		  "own",
343 | 		  "particular",
344 | 		  "particularly",
345 | 		  "per",
346 | 		  "perhaps",
347 | 		  "placed",
348 | 		  "please",
349 | 		  "plus",
350 | 		  "possible",
351 | 		  "presumably",
352 | 		  "probably",
353 | 		  "provides",
354 | 		  "que",
355 | 		  "quite",
356 | 		  "qv",
357 | 		  "rather",
358 | 		  "rd",
359 | 		  "re",
360 | 		  "really",
361 | 		  "reasonably",
362 | 		  "regarding",
363 | 		  "regardless",
364 | 		  "regards",
365 | 		  "relatively",
366 | 		  "respectively",
367 | 		  "right",
368 | 		  "said",
369 | 		  "same",
370 | 		  "saw",
371 | 		  "say",
372 | 		  "saying",
373 | 		  "says",
374 | 		  "second",
375 | 		  "secondly",
376 | 		  "see",
377 | 		  "seeing",
378 | 		  "seem",
379 | 		  "seemed",
380 | 		  "seeming",
381 | 		  "seems",
382 | 		  "seen",
383 | 		  "self",
384 | 		  "selves",
385 | 		  "sensible",
386 | 		  "sent",
387 | 		  "serious",
388 | 		  "seriously",
389 | 		  "seven",
390 | 		  "several",
391 | 		  "shall",
392 | 		  "she",
393 | 		  "should",
394 | 		  "shouldn't",
395 | 		  "since",
396 | 		  "six",
397 | 		  "so",
398 | 		  "some",
399 | 		  "somebody",
400 | 		  "somehow",
401 | 		  "someone",
402 | 		  "something",
403 | 		  "sometime",
404 | 		  "sometimes",
405 | 		  "somewhat",
406 | 		  "somewhere",
407 | 		  "soon",
408 | 		  "sorry",
409 | 		  "specified",
410 | 		  "specify",
411 | 		  "specifying",
412 | 		  "still",
413 | 		  "sub",
414 | 		  "such",
415 | 		  "sup",
416 | 		  "sure",
417 | 		  "t's",
418 | 		  "take",
419 | 		  "taken",
420 | 		  "tell",
421 | 		  "tends",
422 | 		  "th",
423 | 		  "than",
424 | 		  "thank",
425 | 		  "thanks",
426 | 		  "thanx",
427 | 		  "that",
428 | 		  "that's",
429 | 		  "thats",
430 | 		  "the",
431 | 		  "their",
432 | 		  "theirs",
433 | 		  "them",
434 | 		  "themselves",
435 | 		  "then",
436 | 		  "thence",
437 | 		  "there",
438 | 		  "there's",
439 | 		  "thereafter",
440 | 		  "thereby",
441 | 		  "therefore",
442 | 		  "therein",
443 | 		  "theres",
444 | 		  "thereupon",
445 | 		  "these",
446 | 		  "they",
447 | 		  "they'd",
448 | 		  "they'll",
449 | 		  "they're",
450 | 		  "they've",
451 | 		  "think",
452 | 		  "third",
453 | 		  "this",
454 | 		  "thorough",
455 | 		  "thoroughly",
456 | 		  "those",
457 | 		  "though",
458 | 		  "three",
459 | 		  "through",
460 | 		  "throughout",
461 | 		  "thru",
462 | 		  "thus",
463 | 		  "to",
464 | 		  "together",
465 | 		  "too",
466 | 		  "took",
467 | 		  "toward",
468 | 		  "towards",
469 | 		  "tried",
470 | 		  "tries",
471 | 		  "truly",
472 | 		  "try",
473 | 		  "trying",
474 | 		  "twice",
475 | 		  "two",
476 | 		  "un",
477 | 		  "under",
478 | 		  "unfortunately",
479 | 		  "unless",
480 | 		  "unlikely",
481 | 		  "until",
482 | 		  "unto",
483 | 		  "up",
484 | 		  "upon",
485 | 		  "us",
486 | 		  "use",
487 | 		  "used",
488 | 		  "useful",
489 | 		  "uses",
490 | 		  "using",
491 | 		  "usually",
492 | 		  "value",
493 | 		  "various",
494 | 		  "very",
495 | 		  "via",
496 | 		  "viz",
497 | 		  "vs",
498 | 		  "want",
499 | 		  "wants",
500 | 		  "was",
501 | 		  "wasn't",
502 | 		  "way",
503 | 		  "we",
504 | 		  "we'd",
505 | 		  "we'll",
506 | 		  "we're",
507 | 		  "we've",
508 | 		  "welcome",
509 | 		  "well",
510 | 		  "went",
511 | 		  "were",
512 | 		  "weren't",
513 | 		  "what",
514 | 		  "what's",
515 | 		  "whatever",
516 | 		  "when",
517 | 		  "whence",
518 | 		  "whenever",
519 | 		  "where",
520 | 		  "where's",
521 | 		  "whereafter",
522 | 		  "whereas",
523 | 		  "whereby",
524 | 		  "wherein",
525 | 		  "whereupon",
526 | 		  "wherever",
527 | 		  "whether",
528 | 		  "which",
529 | 		  "while",
530 | 		  "whither",
531 | 		  "who",
532 | 		  "who's",
533 | 		  "whoever",
534 | 		  "whole",
535 | 		  "whom",
536 | 		  "whose",
537 | 		  "why",
538 | 		  "will",
539 | 		  "willing",
540 | 		  "wish",
541 | 		  "with",
542 | 		  "within",
543 | 		  "without",
544 | 		  "won't",
545 | 		  "wonder",
546 | 		  "would",
547 | 		  "wouldn't",
548 | 		  "yes",
549 | 		  "yet",
550 | 		  "you",
551 | 		  "you'd",
552 | 		  "you'll",
553 | 		  "you're",
554 | 		  "you've",
555 | 		  "your",
556 | 		  "yours",
557 | 		  "yourself",
558 | 		  "yourselves",
559 | 		  "zero",
560 | 	};
561 | 	
562 | 	static {
563 | 		for (String nextStopWord : stopWords)
564 | 		{
565 | 			stopWordList.add(nextStopWord);
566 | 		}
567 |         Collections.sort(stopWordList);
568 | 	}
569 | 	
570 | 	public static boolean isStopWord(String token)
571 | 	{
572 | 		if (token == null)
573 | 			return true;
574 | 		else
575 | 		{
576 | 	        int pos = Collections.binarySearch(stopWordList, token.toLowerCase());
577 | 	        return pos >= 0;
578 | 		}
579 | 	}
580 | }
581 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/BinaryPrediction.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
 3 | import java.util.Map;
 4 | 
 5 | public class BinaryPrediction {
 6 | 	private double probability;
 7 | 	private boolean classNumber;
 8 | 	private Map<Integer, Double> classProbabilities;
 9 | 	
10 | 	
11 | 	public BinaryPrediction(boolean classNumber, double probability,
12 | 			Map<Integer, Double> classProbabilities) {
13 | 		super();
14 | 		this.classNumber = classNumber;
15 | 		this.probability = probability;
16 | 		this.classProbabilities = classProbabilities;
17 | 	}
18 | 	public BinaryPrediction(boolean classNumber, double probability) {
19 | 		super();
20 | 		this.probability = probability;
21 | 		this.classNumber = classNumber;
22 | 	}
23 | 	public double getProbability() {
24 | 		return probability;
25 | 	}
26 | 	public void setProbability(double probability) {
27 | 		this.probability = probability;
28 | 	}
29 | 	public boolean getClassNumber() {
30 | 		return classNumber;
31 | 	}
32 | 	public void setClassNumber(boolean classNumber) {
33 | 		this.classNumber = classNumber;
34 | 	}
35 | 	public Map<Integer, Double> getClassProbabilities() {
36 | 		return classProbabilities;
37 | 	}
38 | 	public void setClassProbabilities(Map<Integer, Double> classProbabilities) {
39 | 		this.classProbabilities = classProbabilities;
40 | 	}
41 | 	@Override
42 | 	public String toString() {
43 | 		return classNumber + ", probability " + probability + " " + classProbabilities;
44 | 	}
45 | 	
46 | 	
47 | 	
48 | }
49 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/ClassWeighting.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
/**
 * Strategy for weighting the two classes when training an SVM.
 *
 * As used in SVMTokenModel.retrain, "ratio" below means
 * (number of positive training tokens) / (number of negative training tokens).
 */
public enum ClassWeighting {
	// no per-class weights are configured
	EQUAL,
	// the negative-class weight is multiplied by the ratio
	OVERSIZENEG,
	// the positive-class weight is divided by the ratio
	UNDERSIZEPOS,
	// the positive-class weight is multiplied by the ratio
	OVERSIZEPOS,
	// the negative-class weight is divided by the ratio
	UNDERSIZENEG
}
10 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/FeatureDistance.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.ml;
  2 | 
  3 | public enum FeatureDistance {
  4 | 	SELF	( 0),
  5 | 	BEFORE	(-1),	//also, above
  6 | 	AFTER	( 1),
  7 | 	
  8 | 	//for neighbours only
  9 | 	PLUSONE		(1),
 10 | 	PLUSTWO 	(2),
 11 | 	PLUSTHREE 	(3),
 12 | 	
 13 | 	//for neighbours and parental lineage
 14 | 	MINUSONE	(-1),
 15 | 	MINUSTWO	(-2),
 16 | 	MINUSTHREE	(-3),
 17 | 	
 18 | 	//for parental lineage only
 19 | 	MINUSFOUR	(-4), 
 20 | 	MINUSFIVE	(-5),
 21 | 	MINUSSIX	(-6),
 22 | 	MINUSSEVEN	(-7),
 23 | 	MINUSEIGHT	(-8),
 24 | 	MINUSNINE	(-9),
 25 | 	MINUSTEN	(-10),
 26 | 	MINUSMORE	(-11);
 27 | 
 28 | 	private final int numericInterpretation;
 29 | 	
 30 | 	FeatureDistance(int numericInterpretation)
 31 | 	{
 32 | 		this.numericInterpretation = numericInterpretation;
 33 | 	}
 34 | 	public int getNumericInterpretation()
 35 | 	{
 36 | 		return this.numericInterpretation;
 37 | 	}
 38 | 	public boolean canBeGeneralized()
 39 | 	{
 40 | 		switch(this)
 41 | 		{
 42 | 		case MINUSONE:
 43 | 		case MINUSTWO:
 44 | 		case MINUSTHREE:
 45 | 		case MINUSFOUR:
 46 | 		case MINUSFIVE:
 47 | 		case MINUSSIX:
 48 | 		case MINUSMORE:
 49 | 		case PLUSONE:
 50 | 		case PLUSTWO:
 51 | 		case PLUSTHREE:
 52 | 			return true;
 53 | 		default:
 54 | 			return false;
 55 | 		}
 56 | 	}
 57 | 	
 58 | 	public FeatureDistance getGeneralCase()
 59 | 	{
 60 | 		if (this.numericInterpretation > 0)
 61 | 			return AFTER;
 62 | 		else if (this.numericInterpretation < 0)
 63 | 			return BEFORE;
 64 | 		else
 65 | 			return SELF;
 66 | 	}
 67 | 	
 68 | 	public static FeatureDistance byDistance(int distance)
 69 | 	{
 70 | 		//hard coded for performance; if more needed, change to an immutable map + lookup
 71 | 		switch(distance)
 72 | 		{
 73 | 		case 0:
 74 | 			return SELF;
 75 | 		case 1:
 76 | 			return PLUSONE;
 77 | 		case 2:
 78 | 			return PLUSTWO;
 79 | 		case 3:
 80 | 			return PLUSTHREE;
 81 | 		case -1:
 82 | 			return MINUSONE;
 83 | 		case -2:
 84 | 			return MINUSTWO;
 85 | 		case -3:
 86 | 			return MINUSTHREE;
 87 | 		case -4:
 88 | 			return MINUSFOUR;
 89 | 		case -5:
 90 | 			return MINUSFIVE;
 91 | 		case -6:
 92 | 			return MINUSSIX;
 93 | 		case -7:
 94 | 			return MINUSSEVEN;
 95 | 		case -8:
 96 | 			return MINUSEIGHT;
 97 | 		case -9:
 98 | 			return MINUSNINE;
 99 | 		case -10:
100 | 			return MINUSTEN;
101 | 		default:
102 | 			if (distance < -10)
103 | 				return MINUSMORE;
104 | 			else
105 | 				return null;
106 | 		}
107 | 	}
108 | }
109 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/FeatureRepository.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.ml;
  2 | 
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
  5 | 
  6 | 
  7 | public class FeatureRepository  {
  8 | 
  9 | //	protected List<String> masterClassList = new ArrayList<String>();
 10 | 	protected List<String> masterTokenList = new ArrayList<String>();
 11 | 
 12 | 	//feature list management: add to list if it doesn't exist, else increment the count
 13 | 	protected int getNumberInList(String token, boolean addMissing)
 14 | 	{
 15 | 		//useful for both features and classes
 16 | 		int featureNumber = masterTokenList.indexOf(token);
 17 | 		if (addMissing && featureNumber < 0)
 18 | 		{
 19 | 			masterTokenList.add(token);
 20 | 			featureNumber = masterTokenList.size() - 1;
 21 | 		}
 22 | 		
 23 | 		return featureNumber;
 24 | 	}
 25 | 	
 26 | 	public int getNumberOfFeatures()
 27 | 	{
 28 | 		return masterTokenList.size();
 29 | 	}
 30 | 
 31 | 	public String getNameOfFeature(int key)
 32 | 	{
 33 | 		return masterTokenList.get(key);
 34 | 	}
 35 | 
 36 | 	public FeatureRepository()
 37 | 	{
 38 | 	}
 39 | 	
 40 | 	//default features using raw (non-normalized) word counts
 41 | 	//override this for fancier features
 42 | 	//if we're building a new model, we want to add the missing features; otherwise, if we're merely predicting, we probably don't
 43 | 	/*
 44 | 	@Override
 45 | 	public Map<Integer, Double> getFeaturesForTriple(Triple nextTriple, boolean addMissingFeatures, boolean isTraining) {
 46 | 
 47 | 		//can remove named entities as a step towards coupled training
 48 | 		//String toAnalyze = nextTriple.getPhrase().getString();
 49 | 		String toAnalyze = NamedEntityModelImpl.removeThisModelsFeatures(nextTriple);
 50 | 		
 51 | 		if (toAnalyze == null)
 52 | 			return null;
 53 | 		
 54 | 		StringTokenizer st = DefaultTokenizer.getDefaultTokenizer(toAnalyze);
 55 | 		int numTokens = st.countTokens();
 56 | 		
 57 | 		Map<Integer, Double> tokenCounts = new HashMap<Integer, Double>(); 
 58 | 		
 59 | 		List<String> wordsWithCaps = null;
 60 | 		boolean lastWordHadCap = false;
 61 | 		
 62 | 		for (int i = 0; i < numTokens; i++)
 63 | 		{
 64 | 			//tokenization notes:
 65 | 			//	remove trailing punctuation (so that final words in sentences are folded with non-terminating words
 66 | 			//	if a token contains punctuation (hyphen), record it both with and without
 67 | 			//  if a token contains punctuation (period), record it both with and without for variations in hyphenated names and acronyms (UN versus U.N.)
 68 | 			//	if a series of tokens has capital letters, record it together
 69 | 			List<String> tokenVariations = new ArrayList<String>();
 70 | 			
 71 | 			String nextToken = stripTrailingPunctuation(st.nextToken());
 72 | 			tokenVariations.add(nextToken);
 73 | 
 74 | 			//hyphens
 75 | 			if (nextToken.contains("-"))
 76 | 			{
 77 | 				for (String nextChunk : nextToken.split("-"))
 78 | 				{
 79 | 					tokenVariations.add(nextChunk);
 80 | 				}
 81 | 			}
 82 | 			
 83 | 			//periods and acronyms
 84 | 			if (nextToken.contains("."))
 85 | 				tokenVariations.add(nextToken.replaceAll("\\.", ""));
 86 | 			
 87 | 			//strings of capitalized words (which may be named entities)
 88 | 			boolean isCapitalized = nextToken.matches(".*[A-Z].*");
 89 | 			if (isCapitalized)
 90 | 			{
 91 | 				if (wordsWithCaps == null)
 92 | 					wordsWithCaps = new ArrayList<String>();
 93 | 				wordsWithCaps.add(nextToken);
 94 | 				lastWordHadCap = true;
 95 | 			}
 96 | 			if ((!isCapitalized && lastWordHadCap && wordsWithCaps != null) || (isCapitalized && i == numTokens - 1))
 97 | 			{
 98 | 				int wordsWithCapsSize = wordsWithCaps.size();
 99 | 				if (wordsWithCaps.size() > 1)
100 | 				{
101 | 					//do all combinations: so "UN Security Council" becomes "UN Security" + "Security Council" + "UN Security Council"
102 | 					for (int length = 2; length <= wordsWithCapsSize; length++ )
103 | 					{
104 | 						for (int firstWord = 0; firstWord <= wordsWithCapsSize - length; firstWord++)
105 | 						{
106 | 							//System.out.println("length is " + length + "; firstWord is " + firstWord );
107 | 							StringBuilder sb = new StringBuilder();
108 | 							for (int j = 0; j < length; j++)
109 | 							{
110 | 								if (j > 0)
111 | 									sb.append(" ");
112 | 								sb.append(wordsWithCaps.get(firstWord + j));
113 | 							}
114 | 							String permutation = sb.toString();
115 | 							tokenVariations.add(permutation);
116 | 							if (permutation.contains("-"))
117 | 								tokenVariations.add(permutation.replace('-', ' '));
118 | 							if (permutation.contains("."))
119 | 								tokenVariations.add(permutation.replaceAll("\\.", ""));
120 | 							if (permutation.contains(".") && permutation.contains("-"))
121 | 								tokenVariations.add(permutation.replace('-', ' ').replaceAll("\\.", ""));
122 | 						}
123 | 					}
124 | 				}
125 | 				lastWordHadCap = false;
126 | 				wordsWithCaps = null;
127 | 			}
128 | 			
129 | 			for (String nextTokenVariation : tokenVariations)
130 | 			{
131 | 				addFeature(nextTokenVariation, tokenCounts, addMissingFeatures);
132 | 			}
133 | 		}
134 | 
135 | 		return tokenCounts;
136 | 	}
137 | 	*/
138 | 	
139 | 	/*
140 | 	protected void addFeature(String token, Map<Integer, Double> tokenCounts, boolean addMissingFeatures)
141 | 	{
142 | 		addFeature(token, tokenCounts, addMissingFeatures, 1.0);
143 | 	}
144 | 	
145 | 	protected void addFeature(String token, Map<Integer, Double> tokenCounts, boolean addMissingFeatures, double amountToAddToFeature)
146 | 	{
147 | 		//System.out.println(nextTokenVariation);
148 | 		
149 | 		int featureNumber = getNumberInList(masterTokenList, token, addMissingFeatures);
150 | 		
151 | 		if (addMissingFeatures || featureNumber > 0)  //first half is redundant but add speed; feature number can only be less than zero if addMissingFeatures is false
152 | 		{
153 | 			Double existingCount = tokenCounts.get(featureNumber);
154 | 			if (existingCount == null)
155 | 				tokenCounts.put(featureNumber, amountToAddToFeature);
156 | 			else
157 | 				tokenCounts.put(featureNumber, existingCount + amountToAddToFeature);
158 | 		}
159 | 		
160 | 	}
161 | */
162 | 	
163 | 	/*
164 | 	private String stripTrailingPunctuation(String token)
165 | 	{
166 | 		Matcher m = patEndsWithPunctuation.matcher(token);
167 | 		if (m.find())
168 | 			return token.substring(0, token.length() - 1);
169 | 		else
170 | 			return token;
171 | 	}
172 | 	*/
173 | 	
174 | 	/*
175 | 	@Override
176 | 	public Prediction getCertainPrediction(Triple triple) {
177 | 		return new Prediction(getClassNumber(triple), 1.0);
178 | 	}
179 | 	*/
180 | 
181 | 	
182 | }
183 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/FeatureType.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
public enum FeatureType {
	
	/* The purpose of this class is to identify specific types of features for use in a general-purpose feature store.
	 *
	 * Example of the TokenWithContext bean whose properties these feature types describe:
	 *
	 * TokenWithContext
	 * [token=first, lemma=first,
	 * pos=RB,
	 * previousTokens=[null,null,null], nextTokens=[let,me,say],
	 * parentage=[ADVP], localParentage=[ADVP],
	 * isNamedEntity=true, isCoreferenceHead=true,
	 * flatResolvedCoreference=first,
	 * partOfSentimentStructure=null, --> what we're trying to classify
	 * semanticSpecificRole=advmod, semanticGeneralRole=mod,
	 * semanticIncomingEdge=let-VB-root, //this is just a TokenWithContext
	 * semanticOutgoingEdgesIncludeNegation=false, semanticallyTaggedTokensWithContext=null]
	 */

	// properties of the token itself
	TOKEN,
	LEMMA,
	POS,
	// the same properties taken from neighbouring tokens
	TOKENNEIGHBOUR,
	LEMMANEIGHBOUR,
	POSNEIGHBOUR,
	PARENTAGE,
	LOCALPARENTAGE,
	BOOLEAN, //isNamedEntity, isCoreferenceHead, semanticOutgoingEdgesIncludeNegation
	
	RESOLVEDCOREFERENCE, //TODO: need to somehow incorporate the properties of the resolved coreference
	
	SEMANTICSPECIFICROLE,
	SEMANTICGENERALROLE,
	
	// properties of the token on the incoming semantic edge
	SEMANTICINCOMINGEDGEROLE,
	SEMANTICINCOMINGEDGETOKEN,
	SEMANTICINCOMINGEDGELEMMA,
	SEMANTICINCOMINGEDGEPOS,
	
	// properties of the tokens on outgoing semantic edges
	SEMANTICOUTGOINGEDGEROLE,
	SEMANTICOUTGOINGEDGETOKEN,
	SEMANTICOUTGOINGEDGELEMMA,
	SEMANTICOUTGOINGEDGEPOS

}
46 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/ModelType.java:
--------------------------------------------------------------------------------
1 | package ca.carter.thesis.ml;
2 | 
/**
 * Identifies which kind of token model made (or should use) a prediction.
 * NOTE(review): presumably FEATURE is the product-aspect classifier and
 * SENTIMENT the opinion-word classifier -- confirm against the model subclasses.
 */
public enum ModelType {
	FEATURE,
	SENTIMENT
}
7 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/Prediction.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
 3 | import java.text.DecimalFormat;
 4 | import java.util.Map;
 5 | 
 6 | public class Prediction {
 7 | 	private double probability;
 8 | 	private int classNumber;
 9 | 	private Map<Integer, Double> classProbabilities;
10 | 	
11 | 	
12 | 	public Prediction(int classNumber, double probability,
13 | 			Map<Integer, Double> classProbabilities) {
14 | 		super();
15 | 		this.classNumber = classNumber;
16 | 		this.probability = probability;
17 | 		this.classProbabilities = classProbabilities;
18 | 	}
19 | 	public Prediction(int classNumber, double probability) {
20 | 		super();
21 | 		this.probability = probability;
22 | 		this.classNumber = classNumber;
23 | 	}
24 | 	public double getProbability() {
25 | 		return probability;
26 | 	}
27 | 	public void setProbability(double probability) {
28 | 		this.probability = probability;
29 | 	}
30 | 	public int getClassNumber() {
31 | 		return classNumber;
32 | 	}
33 | 	public void setClassNumber(int classNumber) {
34 | 		this.classNumber = classNumber;
35 | 	}
36 | 	public Map<Integer, Double> getClassProbabilities() {
37 | 		return classProbabilities;
38 | 	}
39 | 	public void setClassProbabilities(Map<Integer, Double> classProbabilities) {
40 | 		this.classProbabilities = classProbabilities;
41 | 	}
42 | 	@Override
43 | 	public String toString() {
44 | 		return classNumber + ", probability " + new DecimalFormat("#.###").format(probability) + " " + classProbabilities;
45 | 	}
46 | 	
47 | 	
48 | 	
49 | }
50 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/PredictionTokenWithContextPair.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
 3 | import ca.carter.thesis.model.TokenWithContext;
 4 | 
 5 | public class PredictionTokenWithContextPair {
 6 | 	private TokenWithContext tokenWithContext;
 7 | 	private Prediction prediction;
 8 | 	
 9 | 	
10 | 	
11 | 	public PredictionTokenWithContextPair(TokenWithContext tokenWithContext,
12 | 			Prediction prediction) {
13 | 		super();
14 | 		this.tokenWithContext = tokenWithContext;
15 | 		this.prediction = prediction;
16 | 	}
17 | 	public TokenWithContext getTokenWithContext() {
18 | 		return tokenWithContext;
19 | 	}
20 | 	public void setTokenWithContext(TokenWithContext tokenWithContext) {
21 | 		this.tokenWithContext = tokenWithContext;
22 | 	}
23 | 	public Prediction getPrediction() {
24 | 		return prediction;
25 | 	}
26 | 	public void setPrediction(Prediction prediction) {
27 | 		this.prediction = prediction;
28 | 	}
29 | 	
30 | 	
31 | }
32 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/SVMTokenModel.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.ml;
  2 | 
  3 | import java.io.IOException;
  4 | import java.io.Writer;
  5 | import java.lang.reflect.InvocationTargetException;
  6 | import java.lang.reflect.Method;
  7 | import java.util.ArrayList;
  8 | import java.util.HashMap;
  9 | import java.util.List;
 10 | import java.util.Map;
 11 | import java.util.Map.Entry;
 12 | import java.util.TreeMap;
 13 | import java.util.Vector;
 14 | 
 15 | import ca.carter.thesis.languagemodels.StopWords;
 16 | import ca.carter.thesis.model.*;
 17 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure;
 18 | import ca.carter.thesis.model.phrasetree.PartOfSpeech;
 19 | import libsvm.svm;
 20 | import libsvm.svm_model;
 21 | import libsvm.svm_node;
 22 | import libsvm.svm_parameter;
 23 | import libsvm.svm_print_interface;
 24 | import libsvm.svm_problem;
 25 | 
 26 | /******
 27 |  * 
 28 |  * @author davecarter
 29 |  *
 30 |  * Co-training enabled model of a vector space split into two views
 31 |  * This is used as the basis for both the sentiment/opinion word classifier and the product aspect classifier
 32 |  * 
 33 |  * The two views are lexical (including part of speech, lemma, etc.) and syntactic, respectively
 34 |  *  
 35 |  */
 36 | 
 37 | public abstract class SVMTokenModel {
 38 | 
	// When true (and useOnlyOneView is false) two views are trained; see numberOfViews.
	protected static final boolean useViews = true;
	// When true only a single view is trained.
	public static final boolean useOnlyOneView = false;
	// NOTE(review): presumably selects the view used when useOnlyOneView is set; not read in the visible part of this class -- confirm.
	public static final Views useOnlyOneViewWhichView = Views.BAGOFWORDS;
	
	// Number of SVMs trained side by side: 2 when views are enabled and not restricted to one, otherwise 1.
	private final int numberOfViews = (useViews && !useOnlyOneView ? 2 : 1);
	// Per-view training problems, rebuilt by retrain.
	private svm_problem[] problemViews;
	// Per-view training parameters, rebuilt by retrain.
	private svm_parameter[] parametersViews;	
	// Per-view trained models; each slot is filled by retrain.
	private svm_model[] svmModelViews = new svm_model[numberOfViews];;
	// The task passed to the constructor.
	private Task task;
	
	// Class weighting strategy; the constructor substitutes EQUAL for null.
	private final ClassWeighting classWeighting;
	// Kernel type handed to libsvm for every view.
	public static final int kernel = svm_parameter.RBF;
		
	// Hyperparameters passed to the constructor; any of them may be null.
	// retrain uses specifiedEpsilon in place of getEpsilon() when it is non-null.
	// NOTE(review): specifiedC and specifiedGamma are not read in the visible code -- presumably consulted by subclasses' getC/getGamma; confirm.
	protected final Double specifiedC;
	protected final Double specifiedGamma;
	protected final Double specifiedEpsilon;
	
	// Training tokens; addTokenForNextTraining appends to this list.
	List<TokenWithContext> tokens;
	
	// Maps feature strings to the integer feature numbers used in the sparse vectors.
	FeatureRepository featureRepository = new FeatureRepository();
		
	// When true, retrain skips tokens that StopWords.isStopWord reports as stop words.
	private static final boolean omitStopWords = false;
	
	// When true, getFeaturesForToken prints each feature name and value as it is extracted.
	private static final boolean debugFeatureNames = false;
 63 | 	
	/**
	 * Stores the configuration, registers every training token with the feature repository and
	 * then runs retrain.
	 *
	 * @param task the task this model belongs to
	 * @param tokens the training tokens; when null an error is printed and nothing is trained
	 * @param writer passed on to retrain; non-null means the training data is written out
	 *               (e.g. for parameter estimation with an external tool) instead of training a model
	 * @param classWeighting class weighting strategy; null is replaced by ClassWeighting.EQUAL
	 * @param c stored as specifiedC; may be null
	 * @param gamma stored as specifiedGamma; may be null
	 * @param epsilon stored as specifiedEpsilon; may be null
	 */
	public SVMTokenModel(Task task, List<TokenWithContext> tokens, Writer[] writer, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {
		this.specifiedC = c;
		this.specifiedGamma = gamma;
		this.specifiedEpsilon = epsilon;
		this.classWeighting = (classWeighting != null) ? classWeighting : ClassWeighting.EQUAL;
		this.task = task;
		this.tokens = tokens;

		if (tokens != null)
		{
			//build master features list
			for (TokenWithContext trainingToken : tokens)
				buildFeaturesForToken(trainingToken);

			retrain(writer);
		}
		else
		{
			System.err.println("No tokens present. Can not create training model.");
		}
	}	
 92 | 	
 93 | 	public void retrain(Writer[] writer)
 94 | 	{
 95 | 		boolean outputToFile = (writer != null);
 96 | 		
 97 | 		svm.svm_set_print_string_function(svm_print_null);
 98 | 		
 99 | 		List<Vector<Double>> xClassesViews = new ArrayList<Vector<Double>>();
100 | 		List<Vector<svm_node[]>> yFeaturesViews = new ArrayList<Vector<svm_node[]>>();
101 | 		
102 | 		for (int viewNum = 0 ; viewNum < numberOfViews; viewNum++)
103 | 		{
104 | 			xClassesViews.add(new Vector<Double>());
105 | 			yFeaturesViews.add(new Vector<svm_node[]>());
106 | 		}
107 | 				
108 | 		int max_index[] = {0 , 0}; //todo: this is views-based; backfill single-view case
109 | 		
110 | 		int numInPositiveClass = 0;
111 | 		int numInNegativeClass = 0;
112 | 		
113 | 		for (TokenWithContext nextTrainingToken : tokens)
114 | 		{
115 | 			if (omitStopWords && StopWords.isStopWord(nextTrainingToken.getToken()))
116 | 				continue;
117 | 			
118 | 			try
119 | 			{
120 | 				//if we added this token in the cotraining phase, it will have a predicted class that we should use instead of the ground truth class, which would be unknown in a semi-supervised case
121 | 				double classNumber;
122 | 				
123 | 				if (nextTrainingToken.getPredictedClass() != null)
124 | 				{
125 | 					classNumber = (nextTrainingToken.getPredictedModel() == this.getModelType() ? nextTrainingToken.getPredictedClass() : 0.0);
126 | 				}
127 | 				else
128 | 					classNumber = getClassForToken(nextTrainingToken);
129 | 				
130 | 				//debugging code only
131 | 				//if (classNumber != 0)
132 | 				//	System.out.println("Positive token for " + this.getModelType() + " " + getFormattedTokenContext(nextTrainingToken) + "\n" + nextTrainingToken.toString() );
133 | 					
134 | 				if (classNumber != 0)
135 | 					numInPositiveClass++;
136 | 				else
137 | 					numInNegativeClass++;
138 | 				
139 | 				if (outputToFile)
140 | 				{
141 | 					for (Writer nextWriter : writer)
142 | 					{
143 | 						nextWriter.write((int) classNumber + " ");
144 | 					}
145 | 				}
146 | 				
147 | 				for (int viewNum = 0 ; viewNum < numberOfViews; viewNum++)
148 | 				{
149 | 					Map<Integer, Double> featureMap = getFeaturesForToken(nextTrainingToken, viewNum);
150 | 					//System.out.println(nextTrainingToken.getToken() + " " + featureMap);	
151 | 					
152 | 					if (featureMap == null)
153 | 						continue;
154 | 					
155 | 					//build into SVM model
156 | 					int m = featureMap.size();
157 | 					svm_node[] x = new svm_node[m];
158 | 					int j = 0;
159 | 					
160 | 					for (Entry<Integer, Double> nextFeature : featureMap.entrySet())
161 | 					{
162 | 						x[j] = new svm_node();
163 | 						x[j].index = nextFeature.getKey();
164 | 						x[j].value = nextFeature.getValue();
165 | 		
166 | 						j++;
167 | 	
168 | 						if (outputToFile)
169 | 							writer[viewNum].write(nextFeature.getKey() + ":" + nextFeature.getValue() + " ");
170 | 					}
171 | 					
172 | 					if (outputToFile)
173 | 						writer[viewNum].write("\n");
174 | 					
175 | 					if(m>0) max_index[viewNum] = Math.max(max_index[viewNum], x[m-1].index);
176 | 	
177 | 					xClassesViews.get(viewNum).addElement(classNumber);
178 | 					yFeaturesViews.get(viewNum).addElement(x);
179 | 				}
180 | 			}
181 | 			catch (IOException e)
182 | 			{
183 | 				
184 | 			}
185 | 		}
186 | 		
187 | 		try {
188 | 			if (outputToFile)
189 | 			{
190 | 				for (Writer nextWriter : writer)
191 | 				{
192 | 					nextWriter.close();
193 | 				}
194 | 				System.out.println("Done writing " + this.getName() + " model files. Will not build classifier model.");
195 | 				return;
196 | 			}
197 | 		} catch (IOException e) {
198 | 			// TODO Auto-generated catch block
199 | 			e.printStackTrace();
200 | 		}
201 | 		
202 | 		//System.out.println( xClasses );
203 | 		//System.out.println( yFeatures );
204 | 
205 | 		problemViews = new svm_problem[numberOfViews];
206 | 		
207 | 		for (int viewNum = 0; viewNum < numberOfViews; viewNum++)
208 | 		{
209 | 			//System.out.println("View " + viewNum);
210 | 			
211 | 			problemViews[viewNum] = new svm_problem();
212 | 			
213 | 			problemViews[viewNum].l = xClassesViews.get(viewNum).size();
214 | 			problemViews[viewNum].x = new svm_node[problemViews[viewNum].l][];
215 | 			for(int i=0;i<problemViews[viewNum].l;i++)
216 | 				problemViews[viewNum].x[i] = yFeaturesViews.get(viewNum).elementAt(i);
217 | 			problemViews[viewNum].y = new double[problemViews[viewNum].l];
218 | 			for(int i=0;i<problemViews[viewNum].l;i++)
219 | 				problemViews[viewNum].y[i] = xClassesViews.get(viewNum).elementAt(i);
220 | 		}
221 | 		
222 | 		parametersViews = new svm_parameter[numberOfViews];
223 | 		
224 | 		for (int viewNum = 0; viewNum < numberOfViews; viewNum++)
225 | 		{
226 | 			svm_parameter parameters =  new svm_parameter();
227 | 			
228 | 			/* the old default values
229 | 			parameters.svm_type = svm_parameter.C_SVC;
230 | 			parameters.kernel_type = svm_parameter.RBF;
231 | 			parameters.degree = 3;
232 | 			//parameters.gamma = 0;	// 1/num_features
233 | 			parameters.gamma = 1 / getTotalNumberOfAvailableFeatures();
234 | 			parameters.coef0 = 0;
235 | 			parameters.nu = 0.5;
236 | 			parameters.cache_size = 100;
237 | 			parameters.C = 1;
238 | 			parameters.eps = epsilon; //1e-3;
239 | 			parameters.p = 0.1;
240 | 			parameters.shrinking = 1;
241 | 			parameters.probability = 1;
242 | 			parameters.nr_weight = 0;
243 | 			parameters.weight_label = new int[0];
244 | 			parameters.weight = new double[0];
245 | 			if(parameters.gamma == 0 && max_index > 0)
246 | 				parameters.gamma = 1.0/max_index;
247 | 			*/
248 | 	
249 | 			parameters.probability = 1;
250 | 			parameters.cache_size = 250;
251 | 			
252 | 			if (specifiedEpsilon != null)
253 | 				parameters.eps = specifiedEpsilon;
254 | 			else
255 | 				parameters.eps = getEpsilon();
256 | 	
257 | 			parameters.kernel_type = kernel;
258 | 			
259 | 			//determined classififier-by-classifier by grid search on initial 20% training data, averaged over the five data sets
260 | 			//parameters.gamma = 1 / (numInPositiveClass + numInNegativeClass); //getGamma(); ...since number of data points changes, so should gamma; and experiment to do parameter estimation found that ideal gamma is usually around 1 / number of data points 
261 | 			parameters.C = getC(viewNum);
262 | 			parameters.gamma = getGamma(viewNum);
263 | 	
264 | 			//weight the C of the classes to accommodate the imbalance; re. Asa Ben-Hur and Jason Weston "A User's Guide to Support Vector Machines", section 7
265 | 			if (classWeighting != ClassWeighting.EQUAL)
266 | 			{
267 | 				parameters.weight_label = new int[2];
268 | 				parameters.weight = new double[2];
269 | 				parameters.weight_label[0] = 0; //negative class
270 | 				parameters.weight_label[1] = 1; //positive class
271 | 	
272 | 				switch (classWeighting)
273 | 				{
274 | 					case OVERSIZENEG:
275 | 						parameters.weight[0] = getC(viewNum) * numInPositiveClass / numInNegativeClass;
276 | 						parameters.weight[1] = getC(viewNum);
277 | 						break;
278 | 					case UNDERSIZENEG:
279 | 						parameters.weight[0] = getC(viewNum) / (numInPositiveClass / numInNegativeClass);
280 | 						parameters.weight[1] = getC(viewNum);
281 | 						break;
282 | 					case OVERSIZEPOS:
283 | 						parameters.weight[0] = getC(viewNum);
284 | 						parameters.weight[1] = getC(viewNum) * (numInPositiveClass / numInNegativeClass);
285 | 						break;
286 | 					case UNDERSIZEPOS:
287 | 						parameters.weight[0] = getC(viewNum);
288 | 						parameters.weight[1] = getC(viewNum) / (numInPositiveClass / numInNegativeClass);
289 | 						break;
290 | 					default:
291 | 				}
292 | 			}
293 | 		
294 | 			parametersViews[viewNum] = parameters;
295 | 
296 | 			String error_msg = svm.svm_check_parameter(problemViews[viewNum],parametersViews[viewNum]);
297 | 
298 | 			/*
299 | 			 * default gamma given no tuning information
300 | 			parameters.C = 1;
301 | 			if(parameters.gamma == 0 && max_index > 0)
302 | 				parameters.gamma = 1.0/max_index;
303 | 			*/
304 | 
305 | 
306 | 			/*
307 | 			System.out.println("svm_type " + parameters.svm_type);
308 | 			System.out.println("kernel_type " + parameters.kernel_type);  //0 is linear (special case of RBF), 2 is RBF
309 | 			System.out.println("degree " + parameters.degree);
310 | 			System.out.println("gamma " + parameters.gamma);
311 | 			System.out.println("coef0 " + parameters.coef0);
312 | 			System.out.println("nu " + parameters.nu);
313 | 			System.out.println("C " + parameters.C);
314 | 			System.out.println("eps " + parameters.eps);
315 | 			System.out.println("p " + parameters.p);
316 | 			System.out.println("shrinking " + parameters.shrinking);
317 | 			System.out.println("probability " + parameters.probability);
318 | 			System.out.println("nr_weight " + parameters.nr_weight);
319 | 			System.out.println("weight_label " + parameters.weight_label);
320 | 			System.out.println("weight " + parameters.weight);
321 | 			*/
322 | 			
323 | 			if(error_msg != null)
324 | 			{
325 | 				System.err.print("ERROR: "+error_msg+"\n");
326 | 				//System.exit(1);
327 | 			}
328 | 		 
329 | 			svmModelViews[viewNum] = svm.svm_train(problemViews[viewNum],parametersViews[viewNum]);
330 | 		}
331 | 		
332 | 	}
333 | 	
334 | 	
335 | 	abstract public double getC(int viewNum);
336 | 	abstract public double getGamma(int viewNum);
337 | 	abstract public double getEpsilon();
338 | 
339 | 	abstract public ModelType getModelType();
340 | 	
341 | 	public void addTokenForNextTraining(TokenWithContext token)
342 | 	{
343 | 		buildFeaturesForToken(token);
344 | 		tokens.add(token);
345 | 	}
346 | 	
347 | 	abstract public Double getClassForToken(TokenWithContext token);
348 | 	//abstract protected Double getExpectedClassNumber(TokenWithContext token);
349 | 	
350 | 	abstract public String getName();
351 | 	
352 | 	private void buildFeaturesForToken(TokenWithContext token)
353 | 	{
354 | 		featureRepository.getNumberInList(token.getToken(), true);
355 | 	}
356 | 
357 | 	//get the features for the token bean in a sparse map
358 | 	private Map<Integer, Double> getFeaturesForToken(TokenWithContext token, int viewNumber)
359 | 	{
360 | 		//by reflection, go get all get____ for Strings and is_____ for booleans
361 | 		//then process lists, etc. manually
362 | 
363 | 		//using TreeMap because it means it will be sorted as it is built; needed for flat file output
364 | 		Map<Integer, Double> localMap = new TreeMap<Integer, Double>();
365 | 
366 | 		// simplest, token only version, is :
367 | 		//	int tokenNumber = featureRepository.getNumberInList(token.getToken(), true);
368 | 		//  localMap.put(tokenNumber, 1.0);
369 | 		
370 | 		for (Method nextMethod : TokenWithContext.class.getDeclaredMethods())
371 | 		{
372 | 			try {
373 | 				String value = null;
374 | 				int featureNumber;
375 | 				
376 | 				//TODO xxx need to arrange this by view number, splitting lexical and syntactic
377 | 				
378 | 				if (nextMethod.getName().startsWith("get") && validForView(nextMethod.getName(), viewNumber) )
379 | 				{
380 | 					if (nextMethod.getName().startsWith("getPredicted") || nextMethod.getName().startsWith("getFormatted"))
381 | 					{
382 | 						//do nothing; this are not real features
383 | 					}	
384 | 					else if (nextMethod.getName().equals("getOpinion"))
385 | 					{
386 | 						//TODO: do nothing, since it is something we are trying to predict
387 | 					}
388 | 					else if (nextMethod.getName().equals("getPartOfSentimentStructure"))
389 | 					{
390 | 						//TODO: should do nothing, since these are something we're trying to predict
391 | 					}
392 | 					else if (nextMethod.getName().equals("getPositionInSentence"))
393 | 					{
394 | 						
395 | 						// TODO: consider putting this in/taking this out; could be valuable or could be a total red herring
396 | 						/*double positionValue = (1.0 * (double) token.getPositionInSentence() / 100);
397 | 						featureNumber = featureRepository.getNumberInList("sentencepos", true);
398 | 						localMap.put(featureNumber, positionValue);*/
399 | 						
400 | 					}
401 | 					else if (nextMethod.getReturnType() == String.class)
402 | 					{
403 | 						value = (String) nextMethod.invoke(token, null);
404 | 						featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true);
405 | 						if (debugFeatureNames)
406 | 							System.out.println(nextMethod.getName() + " -> " + value);
407 | 						localMap.put(featureNumber, 1.0);
408 | 					}
409 | 					else if (nextMethod.getReturnType() == TokenWithContext.class)
410 | 					{
411 | 						TokenWithContext localValue = (TokenWithContext) nextMethod.invoke(token, null);
412 | 						addFeaturesForNearbyTokenWithContext(localMap, localValue, nextMethod.getName(), null, viewNumber);
413 | 					}
414 | 					else if (nextMethod.getReturnType() == PartOfSpeech.class)
415 | 					{
416 | 						PartOfSpeech localValue = (PartOfSpeech) nextMethod.invoke(token, null);
417 | 						if (localValue != null)
418 | 							value = localValue.toString();
419 | 						featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true);
420 | 						if (debugFeatureNames)
421 | 							System.out.println(nextMethod.getName() + " -> " + value);
422 | 						localMap.put(featureNumber, 1.0);
423 | 					}
424 | 					else if (nextMethod.getReturnType() == PartOfSentimentStructure.class)
425 | 					{
426 | 						PartOfSentimentStructure localValue = (PartOfSentimentStructure) nextMethod.invoke(token, null);
427 | 						if (localValue != null)
428 | 							value = localValue.toString();
429 | 						featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true);
430 | 						if (debugFeatureNames)
431 | 							System.out.println(nextMethod.getName() + " -> " + value);
432 | 						localMap.put(featureNumber, 1.0);
433 | 					}
434 | 					else if (nextMethod.getReturnType() == List.class)
435 | 					{
436 | 						final List<?> list = (List<?>) nextMethod.invoke(token, null);
437 | 						
438 | 						final boolean isListBefore = (nextMethod.getName().contains("Previous") || nextMethod.getName().contains("Parentage"));
439 | 						final boolean isListAfter = (nextMethod.getName().contains("Next"));
440 | 						
441 | 						//if (!isListBefore && !isListAfter)
442 | 						//	System.err.println("Not sure if the list for " + nextMethod.getName() + " is before or after the given token, so it'll be hard to figure out how to assign distance relationships.");
443 | 												
444 | 						if (list != null && ! list.isEmpty())
445 | 						{
446 | 							int relativePosition = 0;
447 | 							if (isListBefore)
448 | 								relativePosition = 0 - list.size();
449 | 							else if (isListAfter)
450 | 								relativePosition = 1;
451 | 							else
452 | 								relativePosition = 0;
453 | 
454 | 							//necessary for something like the preceeding token list for the second token in a sentence
455 | 							Object firstNonNullInList = null;
456 | 							for (Object nextInList : list)
457 | 							{
458 | 								if (nextInList != null)
459 | 								{
460 | 									firstNonNullInList = nextInList;
461 | 									break;
462 | 								}
463 | 							}
464 | 												
465 | 							//if the list is useable, check the class type of the first item in the list
466 | 							if (firstNonNullInList == null)
467 | 							{
468 | 								//entire list is null, so there's nothing more interesting to report
469 | 								featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=allnull", true);
470 | 								localMap.put(featureNumber, 1.0);
471 | 								if (debugFeatureNames)
472 | 									System.out.println(nextMethod.getName() + " -> " + value);
473 | 							}
474 | 							else if (firstNonNullInList instanceof TokenWithContext)
475 | 							{
476 | 								for (Object nextItem : list)
477 | 								{
478 | 									final TokenWithContext nextTokenInList = (TokenWithContext) nextItem;
479 | 									FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition);
480 | 									if (featureDistance != FeatureDistance.MINUSMORE)
481 | 										addFeaturesForNearbyTokenWithContext(localMap, nextTokenInList, nextMethod.getName(), featureDistance, viewNumber);
482 | 									if (isListBefore || isListAfter)
483 | 										relativePosition++;
484 | 								}
485 | 							}
486 | 							else if (firstNonNullInList instanceof PartOfSpeech) //&& viewNumber == 0 //despite being POS, is only called from view 1; only in terms of syntactic structure
487 | 							{
488 | 								for (Object nextItem : list)
489 | 								{
490 | 									final PartOfSpeech nextPOS = (PartOfSpeech) nextItem;
491 | 									FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition);
492 | 									if (featureDistance != FeatureDistance.MINUSMORE)
493 | 									{
494 | 										featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + "=" + nextItem.toString(), true);
495 | 										localMap.put(featureNumber, 1.0);
496 | 										if (debugFeatureNames)
497 | 											System.out.println(nextMethod.getName() + " -> " + value);
498 | 									}
499 | 									if (isListBefore || isListAfter)
500 | 										relativePosition++;
501 | 								}
502 | 							}
503 | 							else if (firstNonNullInList instanceof SemanticallyTaggedTokenWithContext) //&& viewNumber == 1  //only called from view 1
504 | 							{
505 | 								for (Object nextItem : list)
506 | 								{
507 | 									//TODO: might be worthwhile to pull in more TokenWithContext features here?
508 | 									final SemanticallyTaggedTokenWithContext nextTokenInList = (SemanticallyTaggedTokenWithContext) nextItem;
509 | 									FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition);
510 | 									if (featureDistance != FeatureDistance.MINUSMORE)
511 | 									{
512 | 										featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".Role=" + nextTokenInList.getSemanticRole(), true);
513 | 										localMap.put(featureNumber, 1.0);
514 | 										if (debugFeatureNames)
515 | 											System.out.println(nextMethod.getName() + featureDistance + ".Role=" + nextTokenInList.getSemanticRole() + " -> " + value);
516 | 										featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".RoleAndToken=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getLemma(), true);
517 | 										localMap.put(featureNumber, 1.0);
518 | 										if (debugFeatureNames)
519 | 											System.out.println(nextMethod.getName() + featureDistance + ".RoleAndToken=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getLemma() + " -> " + value);
520 | 										featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".RoleAndPOS=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getPos(), true);
521 | 										localMap.put(featureNumber, 1.0);
522 | 										if (debugFeatureNames)
523 | 											System.out.println(nextMethod.getName() + featureDistance + ".RoleAndPOS=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getPos() + " -> " + value);
524 | 
525 | 									}
526 | 
527 | 									if (isListBefore || isListAfter)
528 | 										relativePosition++;
529 | 								}
530 | 							}
531 | 							else
532 | 							{
533 | 								System.err.println("Unhandled list type for " + nextMethod.getName() + " / " + firstNonNullInList.getClass().getName());
534 | 							}
535 | 						}
536 | 							
537 | 
538 | 					}
539 | 					else
540 | 					{
541 | 						//should never get called; if so, we need to implement a new type in here
542 | 						System.err.println("Unhandled getter : " + nextMethod.getName() + " " + nextMethod.getReturnType().getName());
543 | 					}
544 | 				}
545 | 				else if (nextMethod.getReturnType() == boolean.class && nextMethod.getName().startsWith("is") && validForView(nextMethod.getName(), viewNumber))
546 | 				{
547 | 					value = ((Boolean) nextMethod.invoke(token, null)).toString();
548 | 					featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true);
549 | 					localMap.put(featureNumber, 1.0);
550 | 					if (debugFeatureNames)
551 | 						System.out.println(nextMethod.getName() + " -> " + value);
552 | 				}
553 | 
554 | 			} catch (IllegalArgumentException e) {
555 | 				e.printStackTrace();
556 | 			} catch (IllegalAccessException e) {
557 | 				e.printStackTrace();
558 | 			} catch (InvocationTargetException e) {
559 | 				e.printStackTrace();
560 | 			}
561 | 		}
562 | 		
563 | 		
564 | 		
565 | 		return localMap;
566 | 	}
567 | 	
568 | 	private boolean validForView(String getterName, int viewNum)
569 | 	{
570 | 		if (!useViews)
571 | 			return true;
572 | 		
573 | 		final String[] validLexicalMethods = {"getToken",
574 | 				"getLemma",
575 | 				"getPos",
576 | 				"getPreviousTokens",
577 | 				"getNextTokens",
578 | 				"isAdjective",
579 | 				"getAttribute",	//WordNet attribute property; i.e., fast -> speed	
580 | 				//"isSemanticOutgoingEdgesIncludeNegation"
581 | 		};
582 | 		final String[] validSyntacticMethods = {"getPositionInSentence",
583 | 				"getPartOfSentimentStructure",
584 | 				"getLocalParentage",
585 | 				"getParentage",
586 | 				"isCoreferenceHead",
587 | 				"getFlatResolvedCoreference",
588 | 				"getImmediateParent",
589 | 				"getPreviousToken",
590 | 				"getNextToken",
591 | 				"getSemanticallyTaggedTokensWithContext",
592 | 				"getSemanticSpecificRole",
593 | 				"getSemanticGeneralRole",
594 | 				"getSemanticIncomingEdge",
595 | 				"isSemanticOutgoingEdgesIncludeNegation",
596 | 				"isNamedEntity"
597 | 		};
598 | 		final String[] validBagOfWordsMethods = {"getToken"};
599 | 		
600 | 		//TODO: lots of String comparisons here against a fixed list; there should be a way to speed this up
601 | 		
602 | 		if (useOnlyOneView)
603 | 		{
604 | 			if (useOnlyOneViewWhichView == Views.LEXICAL) //lexical
605 | 			{
606 | 				for (String nextMethod : validLexicalMethods)
607 | 				{
608 | 					if (nextMethod.equals(getterName))
609 | 						return true;
610 | 				}
611 | 			}
612 | 			else if (useOnlyOneViewWhichView == Views.SYNTACTIC) //syntactic
613 | 			{
614 | 				for (String nextMethod : validSyntacticMethods)
615 | 				{
616 | 					if (nextMethod.equals(getterName))
617 | 						return true;
618 | 				}
619 | 			}
620 | 			else if (useOnlyOneViewWhichView == Views.BAGOFWORDS)
621 | 			{
622 | 				for (String nextMethod : validBagOfWordsMethods)
623 | 				{
624 | 					if (nextMethod.equals(getterName))
625 | 						return true;
626 | 				}
627 | 			}
628 | 		}	
629 | 		else
630 | 		{
631 | 			if (Views.getViewForNumber(viewNum) == Views.LEXICAL) //lexical
632 | 			{
633 | 				for (String nextMethod : validLexicalMethods)
634 | 				{
635 | 					if (nextMethod.equals(getterName))
636 | 						return true;
637 | 				}
638 | 			}
639 | 			else if (Views.getViewForNumber(viewNum) == Views.SYNTACTIC) //syntactic
640 | 			{
641 | 				for (String nextMethod : validSyntacticMethods)
642 | 				{
643 | 					if (nextMethod.equals(getterName))
644 | 						return true;
645 | 				}
646 | 				
647 | 			}
648 | 		}
649 | 
650 | 		return false;
651 | 		
652 | 	}
653 | 	
654 | 	//adapted from LibSVM sample code
655 | 	@SuppressWarnings("unused")
656 | 	public Prediction predict(TokenWithContext token)
657 | 	{	
658 | 		if (omitStopWords && StopWords.isStopWord(token.getToken()))
659 | 			return new Prediction(0, -1, null);	//estimate probability at less than zero, so as to not include stop words in subsequent cotraining models
660 | 
661 | 		Prediction bestPredictionSoFar = null;
662 | 		
663 | 		for (int viewNum = 0; viewNum < numberOfViews; viewNum++)
664 | 		{
665 | 			
666 | 			Map<Integer, Double> features = getFeaturesForToken(token, viewNum);
667 | 		
668 | 			int nr_class=svm.svm_get_nr_class(svmModelViews[viewNum]);
669 | 			double[] prob_estimates = null;
670 | 			int[] labels = null;
671 | 
672 | 			labels=new int[nr_class];
673 | 			svm.svm_get_labels(svmModelViews[viewNum],labels);
674 | 			prob_estimates = new double[nr_class];
675 | 
676 | 			svm_node[] x = new svm_node[features.size()];
677 | 			int i = 0;
678 | 			for (Entry<Integer, Double> nextFeature : features.entrySet())
679 | 			{
680 | 				x[i] = new svm_node();
681 | 				x[i].index = nextFeature.getKey();
682 | 				x[i].value = nextFeature.getValue();
683 | 	
684 | 				i++;
685 | 			}
686 | 
687 | 			double v = svm.svm_predict_probability(svmModelViews[viewNum],x,prob_estimates);
688 | 			Map<Integer, Double> classProbabilities = new HashMap<Integer, Double>();
689 | 			for(int j=0;j<nr_class;j++)
690 | 			{
691 | 				//the classes are not in any particular order in the SVM model, so it would be correct to use j without consulting the class labels in the SVM model
692 | 				classProbabilities.put(labels[j], prob_estimates[j]); 
693 | 			}
694 | 			
695 | 			if (bestPredictionSoFar == null || (classProbabilities.get((int) v ) > bestPredictionSoFar.getProbability())  )
696 | 				bestPredictionSoFar = new Prediction((int) v, classProbabilities.get((int) v), classProbabilities);
697 | 		}
698 | 		
699 | 		return bestPredictionSoFar;
700 | 	}
701 | 	
702 | 	public void addFeaturesForNearbyTokenWithContext(Map<Integer, Double> featureMap, TokenWithContext tokenWithContext, String tokenWithContextGetterName, FeatureDistance featureDistance, int view)
703 | 	{
704 | 		int featureNumber;
705 | 	
706 | 		final String featureNamePrefix;
707 | 		
708 | 		if (featureDistance == null)
709 | 			featureNamePrefix = tokenWithContextGetterName;
710 | 		else
711 | 		{
712 | 			//handle cases where having a word, say, before, is interesting, but the fact that it's one or two or three words before is too specific
713 | 			
714 | 			featureNamePrefix = tokenWithContextGetterName + featureDistance.toString();
715 | 			
716 | 			if (featureDistance.canBeGeneralized())
717 | 			{
718 | 				addFeaturesForNearbyTokenWithContext(featureMap, tokenWithContext, tokenWithContextGetterName, featureDistance.getGeneralCase(), view);
719 | 			}
720 | 		}
721 | 		
722 | 		if (tokenWithContext != null)
723 | 		{
724 | 			//in a TokenWithContext case, we want several features: token, lemma, POS, part of sentiment, etc.
725 | 			
726 | 			if (!useViews || view == 0) //lexical
727 | 			{
728 | 				if (debugFeatureNames)
729 | 					System.out.println(featureNamePrefix + ".getToken");
730 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getToken" + "=" + tokenWithContext.getToken(), true);
731 | 				featureMap.put(featureNumber, 1.0);
732 | 				if (debugFeatureNames)
733 | 					System.out.println(featureNamePrefix + ".getLemma");
734 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getLemma" + "=" + tokenWithContext.getLemma(), true);
735 | 				featureMap.put(featureNumber, 1.0);
736 | 				if (debugFeatureNames)
737 | 					System.out.println(featureNamePrefix + ".getPos");
738 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getPos" + "=" + tokenWithContext.getPos(), true);
739 | 				featureMap.put(featureNumber, 1.0);
740 | 				
741 | 			}
742 | 			if (!useViews || view == 1) //syntactic
743 | 			{
744 | 				//featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getPartOfSentimentStructure" + "=" + tokenWithContext.getPartOfSentimentStructure(), true);
745 | 				//featureMap.put(featureNumber, 1.0);
746 | 				if (debugFeatureNames)
747 | 					System.out.println(featureNamePrefix + ".isCoreferenceHead");
748 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".isCoreferenceHead" + "=" + tokenWithContext.isCoreferenceHead(), true);
749 | 				featureMap.put(featureNumber, 1.0);
750 | 				if (debugFeatureNames)
751 | 					System.out.println(featureNamePrefix + ".getFlatResolvedCoreference");
752 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getFlatResolvedCoreference" + "=" + tokenWithContext.getFlatResolvedCoreference(), true);
753 | 				featureMap.put(featureNumber, 1.0);
754 | 				if (debugFeatureNames)
755 | 					System.out.println(featureNamePrefix + ".isNamedEntity");
756 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".isNamedEntity" + "=" + tokenWithContext.isNamedEntity(), true);
757 | 				featureMap.put(featureNumber, 1.0);
758 | 				if (debugFeatureNames)
759 | 					System.out.println(featureNamePrefix + ".getSemanticGeneralRole");
760 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getSemanticGeneralRole" + "=" + tokenWithContext.getSemanticGeneralRole(), true);
761 | 				featureMap.put(featureNumber, 1.0);
762 | 				if (debugFeatureNames)
763 | 					System.out.println(featureNamePrefix + ".getSemanticSpecificRole");
764 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getSemanticSpecificRole" + "=" + tokenWithContext.getSemanticSpecificRole(), true);
765 | 				featureMap.put(featureNumber, 1.0);
766 | 				if (debugFeatureNames)
767 | 					System.out.println(featureNamePrefix + ".isSemanticOutgoingEdgesIncludeNegation");
768 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".isSemanticOutgoingEdgesIncludeNegation" + "=" + tokenWithContext.isSemanticOutgoingEdgesIncludeNegation(), true);
769 | 				featureMap.put(featureNumber, 1.0);
770 | 				//POS feature; could be argued that this does not belong in this view; on the other hand, it's taken from the parse tree, making it syntactic, so I'll leave it in for now.
771 | 				if (debugFeatureNames)
772 | 					System.out.println(featureNamePrefix + ".getImmediateParent");
773 | 				featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getImmediateParent" + "=" + tokenWithContext.getImmediateParent(), true);
774 | 				featureMap.put(featureNumber, 1.0);
775 | 				
776 | 			}
777 | 			//TODO: could branch out one more here...get semantic roles two away
778 | 		}	
779 | 	}
	// LibSVM print callback whose print(String) discards whatever it is given.
	// NOTE(review): presumably handed to LibSVM to silence its console output; the place where
	// it is installed is not visible in this part of the file -- confirm.
	private static svm_print_interface svm_print_null = new svm_print_interface()
	{
		public void print(String s) {}
	};
784 | 	
785 | 	
786 | 	public static void main(String[] args)
787 | 	{
788 | 		System.out.println("Testing SVM model creation.");
789 | 		
790 | 		//TODO this works very poorly with skewed classes; i.e., removing a bunch of named entity cases makes things work poorly *unless* we revert to predicting binary without probabilities; so weird
791 | 		
792 | 		//Also, LibSVM estimates probabilities using internal 5-fold cross validation, so we need enough data in the test set to allow that to work
793 | 		
794 | 		/*TokenWithContext[] trainingTokens = 
795 | 		{
796 | 				//http://stackoverflow.com/questions/5988574/why-does-svm-predict-and-svm-predict-probability-give-different-results-in-java
797 | 				
798 | 				new TokenWithContext("this", null, null, null, null, null, false),
799 | 				new TokenWithContext("is", null, null, null, null, null, false),
800 | 				new TokenWithContext("a", null, null, null, null, null, false),
801 | 				new TokenWithContext("Sentential", null, null, null, null, null, true),
802 | 				new TokenWithContext("sentence", null, null, null, null, null, false),
803 | 				new TokenWithContext("with", null, null, null, null, null, false),
804 | 				new TokenWithContext("Apple", null, null, null, null, null, true),
805 | 				new TokenWithContext("iPad", null, null, null, null, null, true),
806 | 				new TokenWithContext("features", null, null, null, null, null, false),
807 | 				new TokenWithContext("plus", null, null, null, null, null, false),
808 | 				new TokenWithContext("some", null, null, null, null, null, false),
809 | 				new TokenWithContext("extra", null, null, null, null, null, false),
810 | 				new TokenWithContext("Lucky", null, null, null, null, null, true),
811 | 				new TokenWithContext("Brand", null, null, null, null, null, true),
812 | 				new TokenWithContext("words", null, null, null, null, null, false),
813 | 				new TokenWithContext("thrown", null, null, null, null, null, false),
814 | 				new TokenWithContext("in", null, null, null, null, null, false),
815 | 				new TokenWithContext("for", null, null, null, null, null, false),
816 | 				new TokenWithContext("good", null, null, null, null, null, false),
817 | 				new TokenWithContext("measure", null, null, null, null, null, false),
818 | 				
819 | 				new TokenWithContext("this", null, null, null, null, null, false),
820 | 				new TokenWithContext("is", null, null, null, null, null, false),
821 | 				new TokenWithContext("a", null, null, null, null, null, false),
822 | 				new TokenWithContext("Sentential", null, null, null, null, null, true),
823 | 				new TokenWithContext("sentence", null, null, null, null, null, false),
824 | 				new TokenWithContext("with", null, null, null, null, null, false),
825 | 				new TokenWithContext("Apple", null, null, null, null, null, true),
826 | 				new TokenWithContext("iPad", null, null, null, null, null, true),
827 | 				new TokenWithContext("features", null, null, null, null, null, false),
828 | 				new TokenWithContext("plus", null, null, null, null, null, false),
829 | 				new TokenWithContext("some", null, null, null, null, null, false),
830 | 				new TokenWithContext("extra", null, null, null, null, null, false),
831 | 				new TokenWithContext("Lucky", null, null, null, null, null, true),
832 | 				new TokenWithContext("Brand", null, null, null, null, null, true),
833 | 				new TokenWithContext("words", null, null, null, null, null, false),
834 | 				new TokenWithContext("thrown", null, null, null, null, null, false),
835 | 				new TokenWithContext("in", null, null, null, null, null, false),
836 | 				new TokenWithContext("for", null, null, null, null, null, false),
837 | 				new TokenWithContext("good", null, null, null, null, null, false),
838 | 				
839 | 				new TokenWithContext("good", "good", PartOfSpeech.ADJP, null, null, null, true),
840 | 		};
841 | 		
842 | 		SVMTokenModel model = new SVMTokenModelSentiment(Arrays.asList(trainingTokens), null, ClassWeighting.EQUAL, null, null, null);
843 | 
844 | 		*/
845 | 
846 | 		String[] sentences = {
847 | 				"feature[+2], ##the car 's features are wonderful .",
848 | 				"feature[+2], ##the car has a wonderful set of features .",
849 | 				"feature[+2], ##the camera has a wonderful set of features .",
850 | 				"lens[+2], ##the camera has a great lens .",
851 | 				"grip[+1], ##the camera has a fine grip .",
852 | 				"grip[-1], ##i didn't like the grip on the camera .",
853 | 				
854 | 		};
855 | 		
856 | 		List<TokenWithContext> trainingTokens = new ArrayList<TokenWithContext>();
857 | 		for (String nextSentence : sentences) {
858 | 			Sentence sentence = new Sentence(new SimpleSentence(nextSentence, true), Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false);
859 | 			trainingTokens.addAll(sentence.getTokens());
860 | 		}
861 | 		
862 | 		List<SVMTokenModel> models = new ArrayList<SVMTokenModel>();
863 | 		models.add(new SVMTokenModelSentiment(Task.BINGLIU, trainingTokens, null, ClassWeighting.EQUAL, null, null, null));
864 | 		models.add(new SVMTokenModelFeature(Task.BINGLIU, trainingTokens, null, ClassWeighting.EQUAL, null, null, null));
865 | 
866 | 		Sentence testSentence = new Sentence(new SimpleSentence("shmork[+2], ##the camera has a decent shmork .", true), Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false);
867 | 		//Sentence testSentence = new Sentence("shmork[+2], ##i did n't grok the camera 's features .", Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false);
868 | 
869 | 		for (SVMTokenModel model : models)
870 | 		{
871 | 			System.out.println("Model is " + model.getName());
872 | 			for (TokenWithContext nextToken : trainingTokens)
873 | 			{
874 | 				Prediction prediction = model.predict(nextToken);
875 | 				System.out.println((prediction.getClassNumber() == model.getClassForToken(nextToken) ? "Correct " : "Incorrect ") + nextToken.getToken() + " " + prediction);
876 | 				//System.out.println(nextToken.getToken() + " " + prediction);
877 | 			}
878 | /*			for (TokenWithContext nextToken : testSentence.getTokens())
879 | 			{
880 | 				Prediction prediction = model.predict(nextToken);
881 | 				System.out.println((prediction.getClassNumber() == model.getClassForToken(nextToken) ? "Correct " : "Incorrect ") + nextToken.getToken() + " " + prediction);
882 | 				//System.out.println(nextToken.getToken() + " " + prediction);
883 | 			}
884 | */			System.out.println("-----");
885 | 		}
886 | 
887 | 		
888 | 		for (TokenWithContext nextToken : testSentence.getTokens())
889 | 		{
890 | 
891 | 			
892 | 			Prediction topPrediction = null;
893 | 			ModelType topPredictionModel = null;
894 | 			Double nominalClassForTopPrediction = null;
895 | 			for (SVMTokenModel model : models)
896 | 			{
897 | 				Map<Integer, Double> featuresLexical = model.getFeaturesForToken(nextToken, 0);
898 | 				Map<Integer, Double> featuresSyntactic = model.getFeaturesForToken(nextToken, 1);
899 | 				System.out.println("lexical features: " + featuresLexical.size() + " / syntactic featuers: " + featuresSyntactic.size());
900 | 				
901 | 				Prediction prediction = model.predict(nextToken);
902 | 				if (prediction.getClassNumber() != 0 && (topPrediction == null || prediction.getProbability() > topPrediction.getProbability()))
903 | 				{
904 | 					topPrediction = prediction;
905 | 					topPredictionModel = model.getModelType();
906 | 					nominalClassForTopPrediction = model.getClassForToken(nextToken);
907 | 				}
908 | 				
909 | 			}
910 | 
911 | 			if (topPrediction != null)
912 | 				System.out.print("[ ");
913 | 			System.out.print(nextToken.getToken());
914 | 			if (topPrediction != null)
915 | 				System.out.print(" (" + topPredictionModel.toString() + " " + (topPrediction.getClassNumber() == nominalClassForTopPrediction ? "Correct" : "Incorrect") + ") ]");
916 | 			System.out.print(" ");
917 | 		}
918 | 
919 | 	}
920 | 
921 | 
922 | 	
923 | 
924 | 	
925 | }
926 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/SVMTokenModelFeature.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
 3 | import java.io.Writer;
 4 | import java.util.List;
 5 | 
 6 | import ca.carter.thesis.model.Task;
 7 | import ca.carter.thesis.model.TokenWithContext;
 8 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure;
 9 | 
10 | 
11 | public class SVMTokenModelFeature extends SVMTokenModel {
12 | 
13 | 	//up to March 16th, was using 8 and 0.0078125 with reasonably good results
14 | 	//tried log average of 2.639 and 0.00592 and had terrible results
15 | 	//tried numeric average of 4.1v and 0.00664
16 | 	
17 | 	private static final double svmC = 8; //64; //128; //2; //33.612;  //102.8; //55.72; // 2.63;
18 | 	private static final double svmGamma = 0.0078125; //0.0078125; //6.103515625e-05; //0.0013810679; //0.00012207; //0.0025771639; //0.0048828; //0.0016601563;	//0.0001220703125, 0.0001220703125, 0.0078125, 0.0001220703125, 0.0001220703125
19 |  	private static final double svmEpsilon = 1.0E-3;
20 | 	
21 |  	//for single view
22 | // 	private static final double[] svmCForViews = {13.9288090127, 0.1649384888};
23 | // 	private static final double[] svmGammaForViews = {0.0078125, 0.2871745887};
24 |  	
25 |  	//hand-tuned for views
26 |  	private static final double[] svmCForViews = {1.265625, 39.0625}; // {1.265625, 39.0625} <--bestyet-- {1.265625, 39.0625} <--slightlyworse?-- {2.53125, 19.53125} <--bestyet-- {2.53125, 19.53125} <--disimprovement-- {3.375, 15.625} <--merelyrollingbackto-- {3.375, 15.625} <-?- {4.5, 12.5} <--noticeableimprovement-- {6, 10} <--noticeableimprovement--{10,6} <--disimproves-- {8,8}
27 |  	//private static final double[] svmGammaForViews = {0.0076293945 * 4.5, 0.0078125 * 3}; // {0.0076293945, 0.0078125 * 2} <--bestyet-- {..., ... / 2} <--slightlyworse?-- {0.015258789, 0.0078125} <--bestyet-- {0.0091552732, 0.0078125} <--disimprovement-- {0.0091552732, 0.0078125} <--merelyrollingbackto-- {0.0091552732, 0.0078125} <-?- {0.012207031, 0.0078125} <--noticeableimprovement-- {0.009765625, 0.0078125} <--noticeableimprovement-- {0.0078125, 0.009765625} <--disimproves-- {0.0078125,0.0078125}
28 |  	private static final double[] svmGammaForViews = {0.0076293945 * 4.5, 0.0078125 * 3}; // {0.0076293945, 0.0078125 * 2} <--bestyet-- {..., ... / 2} <--slightlyworse?-- {0.015258789, 0.0078125} <--bestyet-- {0.0091552732, 0.0078125} <--disimprovement-- {0.0091552732, 0.0078125} <--merelyrollingbackto-- {0.0091552732, 0.0078125} <-?- {0.012207031, 0.0078125} <--noticeableimprovement-- {0.009765625, 0.0078125} <--noticeableimprovement-- {0.0078125, 0.009765625} <--disimproves-- {0.0078125,0.0078125}
29 | 
30 |  	//auto-tuned with 80%
31 | // 	private static final double[] svmCForViews = {13.929, 2};
32 | // 	private static final double[] svmGammaForViews = {0.0059207678, 0.0717936472};
33 | 
34 |  	//auto-tuned with 20%
35 | // 	private static final double[] svmCForViews = {13.9288090127, 0.1649384888}; 
36 | // 	private static final double[] svmGammaForViews = {0.0078125, 0.2871745887};
37 | 
38 |  	
39 | 	public SVMTokenModelFeature(Task task, List<TokenWithContext> tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {
40 | 		super(task, tokens, fileToOutput, classWeighting, c, gamma, epsilon);
41 | 	}
42 | 
43 | 	@Override
44 | 	public Double getClassForToken(TokenWithContext token)
45 | 	{	
46 | 		if (token.getPartOfSentimentStructure() == PartOfSentimentStructure.FEATURE)
47 | 			return 1.0;
48 | 		else
49 | 			return 0.0;
50 | 	}
51 | 
52 | 	@Override
53 | 	public String getName() {
54 | 		return "product feature";
55 | 	}
56 | 
57 | 	@Override
58 | 	public double getC(int viewNum) {
59 | 		if (this.specifiedC != null)
60 | 			return this.specifiedC;
61 | 		else
62 | 		{
63 | 			if (useViews)
64 | 				return svmCForViews[viewNum];
65 | 			else
66 | 				return svmC;
67 | 		}
68 | 	}
69 | 
70 | 	@Override
71 | 	public double getGamma(int viewNum) {
72 | 		if (this.specifiedGamma != null)
73 | 			return this.specifiedGamma;
74 | 		else
75 | 		{
76 | 			if (useViews)
77 | 				return svmGammaForViews[viewNum];
78 | 			else
79 | 				return svmGamma;
80 | 		}
81 | 	}
82 | 	
83 | 	@Override
84 | 	public double getEpsilon() {
85 | 		if (this.specifiedEpsilon != null)
86 | 			return specifiedEpsilon;
87 | 		else
88 | 			return svmEpsilon;
89 | 	}
90 | 
91 | 	@Override
92 | 	public ModelType getModelType()
93 | 	{
94 | 		return ModelType.FEATURE;
95 | 	}
96 | 	
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/SVMTokenModelSentiment.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.ml;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileReader;
  6 | import java.io.IOException;
  7 | import java.io.Writer;
  8 | import java.util.ArrayList;
  9 | import java.util.List;
 10 | 
 11 | import ca.carter.thesis.ProcessReviews;
 12 | import ca.carter.thesis.model.Sentiment;
 13 | import ca.carter.thesis.model.Task;
 14 | import ca.carter.thesis.model.TokenWithContext;
 15 | import ca.carter.thesis.model.phrasetree.PartOfSpeech;
 16 | 
 17 | public class SVMTokenModelSentiment extends SVMTokenModel {
 18 | 
 19 | 	//as of March 16th, working OK with 24.251, 0.0011218, 1.0E-4
 20 | 	
 21 | 	private static final double svmC = 24.251; //41.600; //742.4; //73.51;
 22 | 	private static final double svmGamma = 0.0011218; //0.0013672; //0.000048828125;	//3.0517578125e-05 , 0.0001220703125, 3.0517578125e-05, 3.0517578125e-05, 3.0517578125e-05
 23 |  	private static final double svmEpsilon = 1.0E-4;
 24 | 
 25 |  	//tuned for single view
 26 | // 	private static final double[] svmCForViews = {512, 512};
 27 | // 	private static final double[] svmGammaForViews = {0.0001220703, 0.0001220703};
 28 |  	
 29 |  	//hand tuned for two views
 30 |  	private static final double[] svmCForViews = {147.998047, 2.87743798}; // {147.998047, 2.87743798} <--bestyet-- {147.998047, 2.87743798} <--slightlyworse-- {73.9990235, 5.75487596} <--better-- {73.9990235, 5.75487596} <--notmuchchange-- {59.1992188, 7.67316795} <--merelyrollingbackgammato-- {59.1992188, 7.67316795} <-?- {47.359375, 10.2308906} <--improvement-- {37.8875, 13.6411875} <--neglibibleimprovementinPneglibibledecreaseinR-- {30.31, 18.1825} <--improves-- {24.251,24,251}
 31 |  	//private static final double[] svmGammaForViews = {0.0011218 * 3, 0.0068469238 * 15}; // {0.0011218 * 2 , 0.0068469238} <--bestyet-- {0.0011218 / 2 , 0.0068469238} <--slightlyworse-- {0.0011218, 0.0034234619} <--better-- {0.0011218, 0.0027387695} <--notmuchchange-- {0.0011218, 0.0027387695} <--merelyrollingbackgammato-- {0.00084135, 0.0027387695} <-?- {0.0011218, 0.0021910156} <--improvement-- {0.0011218, 0.0010516875} (latter should have been 0.00175) <--neglibibleimprovementinPneglibibledecreaseinR--  {0.0011218, 0.00140225} <--improves-- {0.0011218, 0.0011218}
 32 |  	private static final double[] svmGammaForViews = {0.0011218 * 3, 0.0068469238 * 15}; // {0.0011218 * 2 , 0.0068469238} <--bestyet-- {0.0011218 / 2 , 0.0068469238} <--slightlyworse-- {0.0011218, 0.0034234619} <--better-- {0.0011218, 0.0027387695} <--notmuchchange-- {0.0011218, 0.0027387695} <--merelyrollingbackgammato-- {0.00084135, 0.0027387695} <-?- {0.0011218, 0.0021910156} <--improvement-- {0.0011218, 0.0010516875} (latter should have been 0.00175) <--neglibibleimprovementinPneglibibledecreaseinR--  {0.0011218, 0.00140225} <--improves-- {0.0011218, 0.0011218}
 33 | 
 34 |  	//auto-tuned with 80%
 35 | // 	private static final double[] svmCForViews = {6208.3750564266, 10.5560632862};
 36 | // 	private static final double[] svmGammaForViews = {0.0001610727, 0.0078125};
 37 | 
 38 |  	//auto-tuned with 20%
 39 | // 	private static final double[] svmCForViews = {97.0058602567, 10.5560632862};
 40 | // 	private static final double[] svmGammaForViews = {0.0011217757, 0.0136023526};
 41 | 
 42 |  	
 43 | 	private static final List<String> posWords = new ArrayList<String>();
 44 | 	private static final List<String> negWords = new ArrayList<String>();
 45 | 	
 46 | 	private static final String positiveWordsFile = "/bingliulexicon/positive-words.txt";
 47 | 	private static final String negativeWordsFile = "/bingliulexicon/negative-words.txt";
 48 | 	
 49 | 	private static boolean startedUp = false;
 50 | 
 51 | 	
 52 | 	public SVMTokenModelSentiment(Task task, List<TokenWithContext> tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {
 53 | 		super(task, tokens, fileToOutput, classWeighting, c, gamma, epsilon);
 54 | 	}
 55 | 
 56 | 	private void loadBootstrapping()
 57 | 	{
 58 | 		synchronized(this)
 59 | 		{
 60 | 			if (startedUp)
 61 | 				return;
 62 | 			startedUp = true;
 63 | 		}
 64 | 		
 65 | 		try
 66 | 		{
 67 | 			loadSentimentWordList(new File(ProcessReviews.defaultRootdir + positiveWordsFile), posWords);
 68 | 			loadSentimentWordList(new File(ProcessReviews.defaultRootdir + negativeWordsFile), negWords);
 69 | 			
 70 | 			if (posWords.isEmpty() || negWords.isEmpty())
 71 | 			{
 72 | 				System.err.println("############ Could not bootstrap sentiment-bearing words ##############");
 73 | 				System.exit(-1);
 74 | 			}
 75 | 			
 76 | 			//System.out.print("Bootstrapping sentiment-bearing words with list of " + posWords.size() + " positive words and " + negWords.size() + " negative words.");
 77 | 		}
 78 | 		catch (IOException e)
 79 | 		{
 80 | 			System.err.println("Could not open file containing sentiment words for bootstrapping.");
 81 | 			e.printStackTrace();
 82 | 			System.exit(-1);
 83 | 		}
 84 | 	}
 85 | 	
 86 | 	private void loadSentimentWordList(File file, List<String> list) throws IOException
 87 | 	{
 88 | 		BufferedReader br = new BufferedReader(new FileReader(file));
 89 | 		String line;
 90 | 		while ((line = br.readLine()) != null) {
 91 | 			if (line.isEmpty() || line.charAt(0) == ';')
 92 | 			{
 93 | 				//do nothing
 94 | 			}
 95 | 			else
 96 | 			{
 97 | 				list.add(line);
 98 | 			}
 99 | 		}
100 | 		br.close();
101 | 	}
102 | 	
103 | 	//TODO: switch away from multi-class 
104 | 	
105 | 	
106 | 	//here, unusually, we bootstrap the negative and positive words
107 | 	//TODO: need to test negation features
108 | 	
109 | 	@Override
110 | 	public Double getClassForToken(TokenWithContext token)
111 | 	{	
112 | 		if (!startedUp)
113 | 			loadBootstrapping();
114 | 		
115 | 		String cleanedToken = token.getToken().toLowerCase();
116 | 		String cleanedLemma = token.getLemma().toLowerCase();
117 | 		
118 | 		if (negWords.contains(cleanedToken) || negWords.contains(cleanedLemma))
119 | 			return -1.0;		
120 | 		else if (posWords.contains(cleanedToken) || posWords.contains(cleanedLemma))
121 | 			return 1.0;
122 | 		else
123 | 			return 0.0;
124 | 	}
125 | 	
126 | 	public static Double lookupClassForToken(TokenWithContext token)
127 | 	{
128 | 		if (!startedUp)
129 | 			System.err.println("Cannot look up class for sentiment token. Have not loaded lexicons.");
130 | 		
131 | 		String cleanedToken = token.getToken().toLowerCase();
132 | 		String cleanedLemma = token.getLemma().toLowerCase();
133 | 		
134 | 		if (negWords.contains(cleanedToken) || negWords.contains(cleanedLemma))
135 | 			return -1.0;		
136 | 		else if (posWords.contains(cleanedToken) || posWords.contains(cleanedLemma))
137 | 			return 1.0;
138 | 		else
139 | 			return 0.0;
140 | 
141 | 	}
142 | 
143 | 	public static Sentiment decodeClassNumber(Double classNumber)
144 | 	{
145 | 		if (classNumber == null)
146 | 			return null;
147 | 	
148 | 		switch((int) Math.round(classNumber)  )
149 | 		{
150 | 		case -1:
151 | 			return Sentiment.NEG;
152 | 		case 0:
153 | 			return Sentiment.OBJ;
154 | 		case 1:
155 | 			return Sentiment.POS;
156 | 		default:
157 | 			return null;
158 | 		}
159 | 	}
160 | 	
161 | 	@Override
162 | 	public String getName() {
163 | 		return "sentiment word";
164 | 	}
165 | 	
166 | 	/*
167 | 	public static void main(String[] args)
168 | 	{
169 | 		SVMTokenModelSentiment model = new SVMTokenModelSentiment(null);
170 | 		
171 | 		System.out.println("Negative words: " + model.negWords.size());
172 | 		System.out.println("Positive words: " + model.posWords.size());
173 | 		
174 | 		Sentence testSentence = new Sentence("It has a great big screen but a terrible little tiny shutter release.", Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline());
175 | 
176 | 		for (TokenWithContext nextToken : testSentence.getTokens())
177 | 		{
178 | 			System.out.println(nextToken.getToken() + " " + model.getClassForToken(nextToken) );
179 | 		}
180 | 		
181 | 	}
182 | 	*/
183 | 	
184 | 	@Override
185 | 	public double getC(int viewNum) {
186 | 		if (this.specifiedC != null)
187 | 			return this.specifiedC;
188 | 		else
189 | 		{
190 | 			if (useViews)
191 | 				return svmCForViews[viewNum];
192 | 			else
193 | 				return svmC;
194 | 		}
195 | 	}
196 | 
197 | 	@Override
198 | 	public double getGamma(int viewNum) {
199 | 		if (this.specifiedGamma != null)
200 | 			return this.specifiedGamma;
201 | 		else
202 | 		{
203 | 			if (useViews)
204 | 				return svmGammaForViews[viewNum];
205 | 			else
206 | 				return svmGamma;
207 | 		}
208 | 	}
209 | 	
210 | 	@Override
211 | 	public double getEpsilon() {
212 | 		if (this.specifiedEpsilon != null)
213 | 			return specifiedEpsilon;
214 | 		else
215 | 			return svmEpsilon;
216 | 	}
217 | 	@Override
218 | 	public ModelType getModelType()
219 | 	{
220 | 		return ModelType.SENTIMENT;
221 | 	}
222 | 
223 | 	
224 | 	public static void main(String[] args)
225 | 	{
226 | //		public SVMTokenModelSentiment(Task task, List<TokenWithContext> tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {
227 | 
228 | 		SVMTokenModelSentiment model = new SVMTokenModelSentiment(Task.BINGLIU, null, null, null, null, null, null);
229 | 		
230 | 		System.out.println(model.getClassForToken(new TokenWithContext(1, "horrible", null, PartOfSpeech.JJ, null, null, null, false)));
231 | 		System.out.println(model.getClassForToken(new TokenWithContext(1, "great", null, PartOfSpeech.JJ, null, null, null, false)));
232 | 		System.out.println(model.getClassForToken(new TokenWithContext(1, "uneventful", null, PartOfSpeech.JJ, null, null, null, false)));
233 | 		System.out.println(model.getClassForToken(new TokenWithContext(1, "outstanding", null, PartOfSpeech.JJ, null, null, null, false)));
234 | 		System.out.println(model.getClassForToken(new TokenWithContext(1, "OUTSTANDING", null, PartOfSpeech.JJ, null, null, null, false)));
235 | 
236 | 		
237 | 	}
238 | 	
239 | }
240 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/ml/Views.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.ml;
 2 | 
 3 | public enum Views {
 4 | 	LEXICAL,
 5 | 	SYNTACTIC,
 6 | 	BAGOFWORDS;
 7 | 	
 8 | 	public static Integer getNumberForView(Views view)
 9 | 	{
10 | 		switch (view)
11 | 		{
12 | 		case LEXICAL:
13 | 			return 0;
14 | 		case SYNTACTIC:
15 | 			return 1;
16 | 		case BAGOFWORDS:
17 | 			return 2;
18 | 		default:
19 | 			return null;
20 | 		}
21 | 	}
22 | 
23 | 	public static Views getViewForNumber(int number)
24 | 	{
25 | 		switch (number)
26 | 		{
27 | 		case 0:
28 | 			return LEXICAL;
29 | 		case 1:
30 | 			return SYNTACTIC;
31 | 		case 2:
32 | 			return BAGOFWORDS;
33 | 		default:
34 | 			return null;
35 | 		}
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/AspectMatchPolicy.java:
--------------------------------------------------------------------------------
1 | package ca.carter.thesis.model;
2 | 
/**
 * Selects how aspects are matched.
 * NOTE(review): the exact matching rules live in the code that consumes this enum - confirm there.
 */
public enum AspectMatchPolicy {
	/** Partial matching (presumably an overlap is sufficient - TODO confirm against the caller). */
	PARTIAL,

	/** Exact matching. */
	EXACT
}
7 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/ProductFeatureOpinion.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.model;
  2 | 
  3 | import java.util.Properties;
  4 | 
  5 | import edu.stanford.nlp.ling.CoreLabel;
  6 | import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
  7 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
  8 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
  9 | import edu.stanford.nlp.pipeline.Annotation;
 10 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 11 | import edu.stanford.nlp.util.CoreMap;
 12 | 
 13 | public class ProductFeatureOpinion {
 14 | 	private String rawFeature;
 15 | 	private String feature;
 16 | 	private String lemmatizedFeature;
 17 | 	private int sentimentValue;
 18 | 	private ProductOpinionFeatureDetail detail;	//this is whether the feature is unlisted, etc.; values other than unlisted are probably not useful for my work
 19 | 	//private List<String> paraphrases;	
 20 | 	
 21 | 	private Integer from = null;
 22 | 	private Integer to = null;
 23 | 	
 24 | 	
 25 | 	//for parsing features and listed polarities from Bing Liu data
 26 | 	public ProductFeatureOpinion(String feature, StanfordCoreNLP pipeline) {
 27 | 		super();
 28 | 
 29 | 		this.rawFeature = feature;
 30 | 		
 31 | 		//t-mobile service[+2][u]
 32 | 		
 33 | 		String parts[] = feature.split("[\\[\\]{}]+");
 34 | 		int partsLength = parts.length;
 35 | 
 36 | 		this.feature = parts[0].trim();
 37 | 		
 38 | 		//this.paraphrases = WikipediaParaphraser.getParaphrases(this.feature, true);
 39 | 		
 40 | 		Annotation document = new Annotation(this.feature);
 41 | 		pipeline.annotate(document);
 42 | 
 43 | 		// these are all the sentences in this document
 44 | 		// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
 45 | 		CoreMap sentenceFragment = document.get(SentencesAnnotation.class).get(0);
 46 | 
 47 | 		
 48 | 		
 49 | 		StringBuilder sb = new StringBuilder();
 50 | 		boolean first = true;
 51 | 		for (CoreLabel token: sentenceFragment.get(TokensAnnotation.class)) {
 52 | 			String lemma = token.get(LemmaAnnotation.class);
 53 | 			if (first)
 54 | 				first = false;
 55 | 			else
 56 | 				sb.append(" ");
 57 | 			sb.append(lemma);
 58 | 		}
 59 | 		this.lemmatizedFeature = sb.toString();
 60 | 		
 61 | 		if (parts.length == 1)
 62 | 		{
 63 | 			//this is triggered in a very small number of cases where a given feature is both good and bad in a sentence 
 64 | 			//i.e.,				"look##this thing , while looking pretty cool , is not as sexy as the ipod .",
 65 | 			sentimentValue = 0;
 66 | 		}
 67 | 		else
 68 | 		{
 69 | 			switch (parts[1].charAt(0))
 70 | 			{
 71 | 			case '+':
 72 | 				if (parts[1].length() == 1)
 73 | 					sentimentValue = 1;
 74 | 				else
 75 | 					sentimentValue = Integer.valueOf(parts[1].substring(1));
 76 | 				break;
 77 | 			case '-':
 78 | 				sentimentValue = -1 * Integer.valueOf(parts[1].substring(1));
 79 | 				break;
 80 | 			default:
 81 | 				sentimentValue = Integer.valueOf(parts[1].substring(0));
 82 | 			}
 83 | 		}
 84 | 		
 85 | 		if (partsLength > 2)
 86 | 			detail = ProductOpinionFeatureDetail.byValue(parts[2]);
 87 | 		if (partsLength > 3)
 88 | 		{
 89 | 			if ((parts[2].equalsIgnoreCase("p") && parts[3].equalsIgnoreCase("u")) || (parts[2].equalsIgnoreCase("u") && parts[3].equalsIgnoreCase("p")))
 90 | 				//resolve redundancy in this case
 91 | 				detail = ProductOpinionFeatureDetail.PRONOUN;
 92 | 			else
 93 | 				System.out.println("More than one product opinion feature detail, which is an unusual and/or conflicting occurrence: " + feature);
 94 | 		}
 95 | 		
 96 | 	}
 97 | 	
 98 | 	//for feeding in aspects from XML where the polarity is already defined
 99 | 	public ProductFeatureOpinion(String feature, String polarity, int from, int to, StanfordCoreNLP pipeline) {
100 | 		super();
101 | 
102 | 		this.feature = feature;
103 | 		this.from = from;
104 | 		this.to = to;
105 | 		
106 | 		Annotation document = new Annotation(this.feature);
107 | 		pipeline.annotate(document);
108 | 
109 | 		// these are all the sentences in this document
110 | 		// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
111 | 		CoreMap sentenceFragment = document.get(SentencesAnnotation.class).get(0);
112 | 
113 | 		StringBuilder sb = new StringBuilder();
114 | 		boolean first = true;
115 | 		for (CoreLabel token: sentenceFragment.get(TokensAnnotation.class)) {
116 | 			String lemma = token.get(LemmaAnnotation.class);
117 | 			if (first)
118 | 				first = false;
119 | 			else
120 | 				sb.append(" ");
121 | 			sb.append(lemma);
122 | 		}
123 | 		this.lemmatizedFeature = sb.toString();
124 | 		
125 | 		if ("positive".equals(polarity))
126 | 			sentimentValue = 2;
127 | 		else if ("negative".equals(polarity))
128 | 			sentimentValue = -2;
129 | 		else if ("neutral".equals(polarity))
130 | 			sentimentValue = 0;
131 | 		else
132 | 		{
133 | 			System.out.println("Could not assign polarity " + polarity);
134 | 			sentimentValue = 0;
135 | 		}
136 | 	}
137 | 	public String getFeature() {
138 | 		return feature;
139 | 	}
140 | 	public void setFeature(String feature) {
141 | 		this.feature = feature;
142 | 	}
143 | 	public String getLemmatizedFeature() {
144 | 		return lemmatizedFeature;
145 | 	}
146 | 	public void setLemmatizedFeature(String lemmatizedFeature) {
147 | 		this.lemmatizedFeature = lemmatizedFeature;
148 | 	}
149 | 	public int getSentimentValue() {
150 | 		return sentimentValue;
151 | 	}
152 | 	public void setSentimentValue(int sentimentValue) {
153 | 		this.sentimentValue = sentimentValue;
154 | 	}
155 | 	public Sentiment getSentiment() {
156 | 		if (sentimentValue > 0)
157 | 			return Sentiment.POS;
158 | 		else if (sentimentValue < 0)
159 | 			return Sentiment.NEG;
160 | 		else
161 | 			return Sentiment.OBJ;
162 | 	}
163 | 	public ProductOpinionFeatureDetail getDetail() {
164 | 		return detail;
165 | 	}
166 | 	public void setDetail(ProductOpinionFeatureDetail detail) {
167 | 		this.detail = detail;
168 | 	}
169 | 	public String getRawFeature() {
170 | 		return rawFeature;
171 | 	}
172 | 	public static StanfordCoreNLP getDefaultPipeline()
173 | 	{
174 | 		//System.out.println("Getting default pipeline.");
175 | 		//TODO: stem it
176 | 
177 | 		Properties props = new Properties();
178 | 		props.put("annotators", "tokenize, ssplit, pos, lemma");
179 | 		return new StanfordCoreNLP(props);
180 | 	}
181 | 
182 | 	
183 | 	
184 | 	@Override
185 | 	public String toString() {
186 | 		return "ProductFeatureOpinion [feature=" + feature
187 | 				+ ", lemmatizedFeature=" + lemmatizedFeature + ", sentimentValue="
188 | 				+ sentimentValue + (detail != null ? ", detail=" + detail : ", no extra details") + "]";
189 | 	}
190 | 	public static void main(String[] args)
191 | 	{
192 | 		//String testOpinion = "t-mobile service[+2][u]";
193 | 		String testOpinion = "feature[+2}, ";
194 | 		//String testOpinion = "look";
195 | 		
196 | 		ProductFeatureOpinion test = new ProductFeatureOpinion(testOpinion, ProductFeatureOpinion.getDefaultPipeline());
197 | 		
198 | 		System.out.println(test.toString());
199 | 	}
200 | }
201 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/ProductOpinionFeatureDetail.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model;
 2 | 
 3 | public enum ProductOpinionFeatureDetail {
 4 | 
 5 | 	UNLISTED, 				// ("u"),
 6 | 	PRONOUN,  				//("p"),
 7 | 	SUGGESTION, 			//("s"),
 8 | 	COMPARISONCONTRAST, 	//("cc"),
 9 | 	COMPARISONSAMEBRAND ;	//("cs");
10 | 	
11 | 	/*
12 | 	  [u] : feature not appeared in the sentence.
13 | 	  [p] : feature not appeared in the sentence. Pronoun resolution is needed.
14 | 	  [s] : suggestion or recommendation.
15 | 	  [cc]: comparison with a competing product from a different brand.
16 | 	  [cs]: comparison with a competing product from the same brand.
17 | 	  */
18 | 	
19 | 	//private final String abbrev;
20 | 	//private ProductOpinionFeatureDetail(String abbrev) {
21 |     //    this.abbrev = abbrev;
22 |     //}
23 | 
24 | 	public static ProductOpinionFeatureDetail byValue(String abbrev)
25 | 	{
26 | 		if (abbrev == null | abbrev.isEmpty())
27 | 			return null;
28 | 		
29 | 		switch (abbrev.charAt(0))
30 | 		{
31 | 			case 'u':
32 | 				return UNLISTED;
33 | 			case 'p':
34 | 				return PRONOUN;
35 | 			case 's':
36 | 				return SUGGESTION;
37 | 			case 'c':
38 | 				switch (abbrev.charAt(1))
39 | 				{
40 | 				case 'c':
41 | 					return COMPARISONCONTRAST;
42 | 				case 's':
43 | 					return COMPARISONSAMEBRAND;
44 | 				default:
45 | 					return null;
46 | 				}
47 | 			default:
48 | 				return null;
49 | 		}
50 | 	}
51 | 	
52 | }
53 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/ReconciledFeatureOpinion.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.model;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | import ca.carter.thesis.ml.SVMTokenModelSentiment;
  7 | 
  8 | public class ReconciledFeatureOpinion {
  9 | 	private List<TokenWithContext> tokensWithFeature = new ArrayList<TokenWithContext>();
 10 | 	private List<TokenWithContext> tokensWithSentiment = new ArrayList<TokenWithContext>();
 11 | 	private ProductFeatureOpinion opinion;
 12 | 	private List<Sentiment> sentiment = new ArrayList<Sentiment>();
 13 | 	
 14 | 	
 15 | 	public ReconciledFeatureOpinion(ProductFeatureOpinion opinion) {
 16 | 		super();
 17 | 		this.opinion = opinion;
 18 | 	}
 19 | 	public ProductFeatureOpinion getOpinion() {
 20 | 		return opinion;
 21 | 	}
 22 | 	public void setOpinion(ProductFeatureOpinion opinion) {
 23 | 		this.opinion = opinion;
 24 | 	}
 25 | 	public List<TokenWithContext> getTokensWithFeature() {
 26 | 		return tokensWithFeature;
 27 | 	}
 28 | 	public void setTokensWithFeature(List<TokenWithContext> tokensWithFeature) {
 29 | 		this.tokensWithFeature = tokensWithFeature;
 30 | 	}
 31 | 	public List<TokenWithContext> getTokensWithSentiment() {
 32 | 		return tokensWithSentiment;
 33 | 	}
 34 | 	public void setTokensWithSentiment(List<TokenWithContext> tokensWithSentiment) {
 35 | 		this.tokensWithSentiment = tokensWithSentiment;
 36 | 	}
 37 | 	public List<Sentiment> getSentiment() {
 38 | 		return sentiment;
 39 | 	}
 40 | 	public void setSentiment(ArrayList<Sentiment> sentiment) {
 41 | 		this.sentiment = sentiment;
 42 | 	}
 43 | 	
 44 | 	public void addSentimentToken(TokenWithContext sentiToken)
 45 | 	{
 46 | 		tokensWithSentiment.add(sentiToken);
 47 | 		
 48 | 		//the classifier is very good at deciding whether a word is sentiment-bearing; it is not as good at deciding whether rarely-seen words are positive or negative; so, if it exists in our lexicon, trust that; otherwise, use the prediction; if word is not in lexicon, only use the prediction
 49 | 		Sentiment lexicalizedSentiment = SVMTokenModelSentiment.decodeClassNumber(SVMTokenModelSentiment.lookupClassForToken(sentiToken));
 50 | 		
 51 | 		Sentiment predictedSentiment = null;
 52 | 		if (lexicalizedSentiment != Sentiment.OBJ)
 53 | 		{
 54 | 			//if (predictedSentiment != lexicalizedSentiment)
 55 | 			//	System.out.println("Correcting sentiment for '" + sentiToken.getToken() + "' from " + SVMTokenModelSentiment.decodeClassNumber(sentiToken.getPredictedClass()) + " (predicted) to " + lexicalizedSentiment + " (according to lexicon).");
 56 | 			
 57 | 			predictedSentiment = lexicalizedSentiment;
 58 | 		}
 59 | 		else
 60 | 			predictedSentiment = SVMTokenModelSentiment.decodeClassNumber(sentiToken.getPredictedClass());
 61 | 		
 62 | 		
 63 | 		//System.out.println("Predicted " + sentiToken.getPredictedClass());
 64 | 		
 65 | 		if (sentiToken.isSemanticOutgoingEdgesIncludeNegation() && predictedSentiment == Sentiment.POS)
 66 | 			sentiment.add(Sentiment.NEG);
 67 | 		else if (sentiToken.isSemanticOutgoingEdgesIncludeNegation() && predictedSentiment == Sentiment.NEG)
 68 | 			sentiment.add(Sentiment.POS);
 69 | 		else
 70 | 			sentiment.add(predictedSentiment);
 71 | 
 72 | 	}
 73 | 
 74 | 	public boolean isComplete()
 75 | 	{
 76 | 		if (tokensWithFeature == null || tokensWithFeature.isEmpty())
 77 | 			return false;
 78 | 		if (tokensWithSentiment == null || tokensWithSentiment.isEmpty())
 79 | 			return false;
 80 | 		return true;
 81 | 	}
 82 | 
 83 | 	//use voting among sentiments to see how we did
 84 | 	public boolean isCorrect()
 85 | 	{
 86 | 		//biggest gap among tokens must be no greater than one token (for simplicity of business logic)
 87 | 		if (tokensWithFeature == null || tokensWithFeature.isEmpty())
 88 | 			return false;
 89 | 		if (tokensWithSentiment == null || tokensWithSentiment.isEmpty())
 90 | 			return false;
 91 | 		
 92 | 		//check to see if we got the sentiment polarity correct
 93 | 		int numPos = 0;
 94 | 		int numNeg = 0;
 95 | 		boolean sentimentIsCorrect = false;
 96 | 		for (Sentiment nextSentiment : sentiment)
 97 | 		{
 98 | 			switch (nextSentiment)
 99 | 			{
100 | 			case POS:
101 | 				numPos++;
102 | 				break;
103 | 			case NEG:
104 | 				numNeg++;
105 | 				break;
106 | 			}
107 | 		}
108 | 		if (numPos > numNeg && opinion.getSentiment() == Sentiment.POS)
109 | 			sentimentIsCorrect = true;
110 | 		else if (numNeg > numPos && opinion.getSentiment() == Sentiment.NEG)
111 | 			sentimentIsCorrect = true;
112 | 		else
113 | 			return false;
114 | 			
115 | 		//TODO: maybe assign score based on getting more tokens correct and correctly predicting the feature
116 | 		if (sentimentIsCorrect)
117 | 			return true;
118 | 		
119 | 		return false;
120 | 	}
121 | 	@Override
122 | 	public String toString() {
123 | 		StringBuilder sb = new StringBuilder();
124 | 		
125 | 		sb.append("ReconciledFeatureOpinion [tokensWithFeature=");
126 | 		if (tokensWithFeature == null)
127 | 			sb.append("null");
128 | 		else
129 | 		{
130 | 			boolean first = true;
131 | 			sb.append("[");
132 | 			for (TokenWithContext token : tokensWithFeature)
133 | 			{
134 | 				if (!first)
135 | 					sb.append(", ");
136 | 				first = false;
137 | 				sb.append(token.getToken());
138 | 			}
139 | 			sb.append("]");
140 | 		}
141 | 		sb.append(", tokensWithSentiment=");
142 | 		if (tokensWithSentiment == null)
143 | 			sb.append("null");
144 | 		else
145 | 		{
146 | 			boolean first = true;
147 | 			sb.append("[");
148 | 			for (TokenWithContext token : tokensWithSentiment)
149 | 			{
150 | 				if (!first)
151 | 					sb.append(", ");
152 | 				first = false;
153 | 				sb.append(token.getToken());
154 | 			}
155 | 			sb.append("]");
156 | 		}
157 | 		sb.append(", opinion=").append(opinion);
158 | 
159 | 		sb.append(", sentiment=");
160 | 		if (sentiment == null)
161 | 			sb.append("null");
162 | 		else
163 | 		{
164 | 			boolean first = true;
165 | 			sb.append("[");
166 | 			for (Sentiment nextSentiment : sentiment)
167 | 			{
168 | 				if (!first)
169 | 					sb.append(", ");
170 | 				first = false;
171 | 				sb.append(nextSentiment);
172 | 			}
173 | 			sb.append("]");
174 | 		}
175 | 		sb.append("]");
176 | 				
177 | 		return sb.toString();
178 | 	}
179 | 	
180 | 	/*
181 | 	public static void main(String[] args)
182 | 	{
183 | 		Sentence sentence = new Sentence("screen[+2],sound[+2]##great screen and great sound .");
184 | 
185 | 		ReconciledFeatureOpinion firstOpinion = 
186 | 	
187 | 	}
188 | 	*/
189 | 	
190 | 	
191 | }
192 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/Review.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | public class Review {
 6 | 	String title;
 7 | 	List<String> sentences;
 8 | 	
 9 | 	public Review(String title) {
10 | 		super();
11 | 		this.title = title;
12 | 	}
13 | 	public String getTitle() {
14 | 		return title;
15 | 	}
16 | 	public void setTitle(String title) {
17 | 		this.title = title;
18 | 	}
19 | 	public List<String> getSentences() {
20 | 		return sentences;
21 | 	}
22 | 	public void setSentences(List<String> sentences) {
23 | 		this.sentences = sentences;
24 | 	}
25 | 	
26 | 	
27 | }
28 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/SemanticallyTaggedTokenWithContext.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model;
 2 | 
 3 | public class SemanticallyTaggedTokenWithContext implements Cloneable {
 4 | 	private String semanticRole;
 5 | 	private TokenWithContext tokenWithContext;
 6 | 	
 7 | 	
 8 | 	
 9 | 	public SemanticallyTaggedTokenWithContext(String semanticRole,
10 | 			TokenWithContext tokenWithContext) {
11 | 		super();
12 | 		this.semanticRole = semanticRole;
13 | 		this.tokenWithContext = tokenWithContext;
14 | 	}
15 | 	public String getSemanticRole() {
16 | 		return semanticRole;
17 | 	}
18 | 	public void setSemanticRole(String semanticRole) {
19 | 		this.semanticRole = semanticRole;
20 | 	}
21 | 	public TokenWithContext getTokenWithContext() {
22 | 		return tokenWithContext;
23 | 	}
24 | 	public void setTokenWithContext(TokenWithContext tokenWithContext) {
25 | 		this.tokenWithContext = tokenWithContext;
26 | 	}
27 | 	
28 | 	
29 | 	
30 | 	/* (non-Javadoc)
31 | 	 * @see java.lang.Object#clone()
32 | 	 */
33 | 	@Override
34 | 	protected Object clone() throws CloneNotSupportedException {
35 | 		// TODO Auto-generated method stub
36 | 		SemanticallyTaggedTokenWithContext clone = (SemanticallyTaggedTokenWithContext) super.clone();
37 | 		clone.tokenWithContext = (TokenWithContext) tokenWithContext.clone();
38 | 		return clone;
39 | 	}
40 | 	@Override
41 | 	public String toString() {
42 | 		return "SemanticallyTaggedTokenWithContext [semanticRole="
43 | 				+ semanticRole + ", tokenWithContext=" + tokenWithContext.getToken() + "]";
44 | 	}
45 | 	
46 | 	
47 | }
48 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/Sentiment.java:
--------------------------------------------------------------------------------
1 | package ca.carter.thesis.model;
2 | 
/**
 * Polarity of an opinion or of a sentiment-bearing token.
 */
public enum Sentiment {
	/** Positive. */
	POS,

	/** Objective, i.e. neither positive nor negative. */
	OBJ,

	/** Negative. */
	NEG
}
9 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/SimpleSentence.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | public class SimpleSentence {
 6 | 	private String sentence;
 7 | 	private List<ProductFeatureOpinion> opinions;
 8 | 	private boolean needsOpinionParsing;
 9 | 	
10 | 	public SimpleSentence(String sentence, boolean needsOpinionParsing) {
11 | 		super();
12 | 		this.sentence = sentence;
13 | 		this.opinions = null;
14 | 		this.needsOpinionParsing = needsOpinionParsing;
15 | 	}
16 | 	public String getSentence() {
17 | 		return sentence;
18 | 	}
19 | 	public void setSentence(String sentence) {
20 | 		this.sentence = sentence;
21 | 	}
22 | 	public List<ProductFeatureOpinion> getOpinions() {
23 | 		return opinions;
24 | 	}
25 | 	public void setOpinions(List<ProductFeatureOpinion> opinions) {
26 | 		this.opinions = opinions;
27 | 	}
28 | 	public boolean isNeedsOpinionParsing() {
29 | 		return needsOpinionParsing;
30 | 	}
31 | 	
32 | 	
33 | }
34 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/Task.java:
--------------------------------------------------------------------------------
1 | package ca.carter.thesis.model;
2 | 
/**
 * The tasks a model can be built for.
 * NOTE(review): the descriptions below are inferred from the constant names - confirm against the callers.
 */
public enum Task {
	/** Presumably the Bing Liu review data. */
	BINGLIU,

	/** Presumably SemEval task 4, part 1. */
	SEMEVALTASK4PART1,

	/** Presumably SemEval task 4, part 2. */
	SEMEVALTASK4PART2
}
9 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/TokenWithContext.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.model;
  2 | 
  3 | import java.util.List;
  4 | 
  5 | import ca.carter.thesis.ml.ModelType;
  6 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure;
  7 | import ca.carter.thesis.model.phrasetree.PartOfSpeech;
  8 | 
  9 | public class TokenWithContext implements Cloneable {
 10 | 	private String token;
 11 | 	private String lemma;
 12 | 	private PartOfSpeech pos;
 13 | 	private List<TokenWithContext> previousTokens;
 14 | 	private List<TokenWithContext> nextTokens;
 15 | 	private List<PartOfSpeech> parentage; //the clause hierarchy above this token
 16 | 	private boolean isNamedEntity;
 17 | 	private boolean isCoreferenceHead;
 18 | 	private String flatResolvedCoreference;
 19 | 	private String attribute;					//if adjective, what attribute the adjective describes, according to WordNet
 20 | 	private int positionInSentence;
 21 | 	
 22 | 	//for inside/outside tagging; i.e., named entity, feature, sentiment, null
 23 | 	private PartOfSentimentStructure partOfSentimentStructure;
 24 | 	
 25 | 	//dependency graph features
 26 | 	private String semanticSpecificRole;
 27 | 	private String semanticGeneralRole;
 28 | 	private TokenWithContext semanticIncomingEdge;
 29 | 	private boolean semanticOutgoingEdgesIncludeNegation;
 30 | 	private List<SemanticallyTaggedTokenWithContext> semanticallyTaggedTokensWithContext;
 31 | 	
 32 | 	private ProductFeatureOpinion opinion;
 33 | 	
 34 | 	private ModelType predictedModel = null;
 35 | 	private Double predictedClass = null;
 36 | 	
 37 | 	public TokenWithContext(int positionInSentence, String token, String lemma, PartOfSpeech pos,
 38 | 			List<TokenWithContext> previousTokens,
 39 | 			List<TokenWithContext> nextTokens, List<PartOfSpeech> parentage, boolean isNamedEntity) {
 40 | 		super();
 41 | 		this.positionInSentence = positionInSentence;
 42 | 		this.token = token;
 43 | 		this.lemma = lemma;
 44 | 		this.pos = pos;
 45 | 		this.previousTokens = previousTokens;
 46 | 		this.nextTokens = nextTokens;
 47 | 		this.parentage = parentage;
 48 | 		this.isNamedEntity = isNamedEntity;
 49 | 		
 50 | 	}
 51 | 	
 52 | 
 53 | 	public int getPositionInSentence() {
 54 | 		return positionInSentence;
 55 | 	}
 56 | 	public void setPositionInSentence(int positionInSentence) {
 57 | 		this.positionInSentence = positionInSentence;
 58 | 	}
 59 | 	public String getToken() {
 60 | 		return token;
 61 | 	}
 62 | 	public void setToken(String token) {
 63 | 		this.token = token;
 64 | 	}
 65 | 	public String getLemma() {
 66 | 		return lemma;
 67 | 	}
 68 | 	public void setLemma(String lemma) {
 69 | 		this.lemma = lemma;
 70 | 	}
 71 | 	public PartOfSpeech getPos() {
 72 | 		return pos;
 73 | 	}
 74 | 	public void setPos(PartOfSpeech pos) {
 75 | 		this.pos = pos;
 76 | 	}
 77 | 	public PartOfSentimentStructure getPartOfSentimentStructure() {
 78 | 		return partOfSentimentStructure;
 79 | 	}
 80 | 	public void setPartOfSentimentStructure(
 81 | 			PartOfSentimentStructure partOfSentimentStructure) {
 82 | 		this.partOfSentimentStructure = partOfSentimentStructure;
 83 | 	}
 84 | 	public List<TokenWithContext> getPreviousTokens() {
 85 | 		return previousTokens;
 86 | 	}
 87 | 	public void setPreviousTokens(List<TokenWithContext> previousTokens) {
 88 | 		this.previousTokens = previousTokens;
 89 | 	}
 90 | 	public List<TokenWithContext> getNextTokens() {
 91 | 		return nextTokens;
 92 | 	}
 93 | 	public void setNextTokens(List<TokenWithContext> nextTokens) {
 94 | 		this.nextTokens = nextTokens;
 95 | 	}
 96 | 	/**
 97 | 	 * Tells whether this token carries one of the adjective tags (JJ, JJR or JJS).
 98 | 	 *
 99 | 	 * @return true for an adjective tag; false otherwise, including when no tag is set
100 | 	 */
101 | 	public boolean isAdjective() {
102 | 		return pos == PartOfSpeech.JJ
103 | 				|| pos == PartOfSpeech.JJR
104 | 				|| pos == PartOfSpeech.JJS;
105 | 	}
106 | 	public List<PartOfSpeech> getLocalParentage() {
107 | 		//returns only the portion of the parentage up to the next S or SBAR (whichever is higher); so
108 | 		//[S, VP, NP, SBAR, S, VP, PP, NP, SBAR, S, VP, SBAR, S, VP, PP, NP] becomes
109 | 		//                                             [SBAR, S, VP, PP, NP] instead
110 | 		
111 | 		if (parentage == null)
112 | 			return null;
113 | 		
114 | 		for (int index = parentage.size() - 1; index >= 0; index--)
115 | 		{
116 | 			if (parentage.get(index) == PartOfSpeech.S)
117 | 			{
118 | 				//lookbehind
119 | 				if (index > 0 && parentage.get(index - 1) == PartOfSpeech.SBAR)
120 | 					index--;
121 | 
122 | 				return parentage.subList(index, parentage.size());
123 | 			}
124 | 		}
125 | 			
126 | 		return parentage;
127 | 	}
128 | 	public List<PartOfSpeech> getParentage() {
129 | 		//returns all the clauses of which this is a part; could look like [S, NP] in a simple case, or
130 | 		//[S, VP, NP, SBAR, S, VP, PP, NP, SBAR, S, VP, SBAR, S, VP, PP, NP] in an ugly case
131 | 		return parentage;
132 | 	}
133 | 	public void setParentage(List<PartOfSpeech> parentage) {
134 | 		this.parentage = parentage;
135 | 	}
136 | 	public boolean isCoreferenceHead() {
137 | 		return isCoreferenceHead;
138 | 	}
139 | 	public void setCoreferenceHead(boolean isCoreferenceHead) {
140 | 		this.isCoreferenceHead = isCoreferenceHead;
141 | 	}
142 | 	public String getFlatResolvedCoreference() {
143 | 		return flatResolvedCoreference;
144 | 	}
145 | 	public void setFlatResolvedCoreference(String flatResolvedCoreference) {
146 | 		this.flatResolvedCoreference = flatResolvedCoreference;
147 | 	}
148 | 	public PartOfSpeech getImmediateParent() {
149 | 		if (parentage == null || parentage.isEmpty())
150 | 			return null;
151 | 		else
152 | 			return parentage.get(0);
153 | 	}
154 | 	public TokenWithContext getPreviousToken() {
155 | 		if (previousTokens == null || previousTokens.isEmpty())
156 | 			return null;
157 | 		else
158 | 			return previousTokens.get(previousTokens.size() - 1);
159 | 	}
160 | 	public TokenWithContext getNextToken() {
161 | 		if (nextTokens == null || nextTokens.isEmpty())
162 | 			return null;
163 | 		else
164 | 			return nextTokens.get(0);
165 | 	}
166 | 	
167 | 	
168 | 	//not part of classifier feature set
169 | 	public ModelType getPredictedModel() {
170 | 		return predictedModel;
171 | 	}
172 | 	public void setPredictedModel(ModelType predictedModel) {
173 | 		this.predictedModel = predictedModel;
174 | 	}
175 | 	public Double getPredictedClass() {
176 | 		return predictedClass;
177 | 	}
178 | 	public void setPredictedClass(Double predictedClass) {
179 | 		this.predictedClass = predictedClass;
180 | 	}
181 | 
182 | 
183 | 	public List<SemanticallyTaggedTokenWithContext> getSemanticallyTaggedTokensWithContext() {
184 | 		return semanticallyTaggedTokensWithContext;
185 | 	}
186 | 	public void setSemanticallyTaggedTokensWithContext(
187 | 			List<SemanticallyTaggedTokenWithContext> semanticallyTaggedTokensWithContext) {
188 | 		this.semanticallyTaggedTokensWithContext = semanticallyTaggedTokensWithContext;
189 | 	}
190 | 	public String getSemanticSpecificRole() {
191 | 		return semanticSpecificRole;
192 | 	}
193 | 	public void setSemanticSpecificRole(String semanticSpecificRole) {
194 | 		this.semanticSpecificRole = semanticSpecificRole;
195 | 	}
196 | 	public String getSemanticGeneralRole() {
197 | 		return semanticGeneralRole;
198 | 	}
199 | 	public void setSemanticGeneralRole(String semanticGeneralRole) {
200 | 		this.semanticGeneralRole = semanticGeneralRole;
201 | 	}
202 | 	public TokenWithContext getSemanticIncomingEdge() {
203 | 		return semanticIncomingEdge;
204 | 	}
205 | 	public void setSemanticIncomingEdge(TokenWithContext semanticIncomingEdge) {
206 | 		this.semanticIncomingEdge = semanticIncomingEdge;
207 | 	}
208 | 	public boolean isSemanticOutgoingEdgesIncludeNegation() {
209 | 		return semanticOutgoingEdgesIncludeNegation;
210 | 	}
211 | 	public void setSemanticOutgoingEdgesIncludeNegation(
212 | 			boolean semanticOutgoingEdgesIncludeNegation) {
213 | 		this.semanticOutgoingEdgesIncludeNegation = semanticOutgoingEdgesIncludeNegation;
214 | 	}
215 | 	public boolean isNamedEntity() {
216 | 		return this.isNamedEntity;
217 | 	}
218 | 	public void setNamedEntity(boolean isNamedEntity) {
219 | 		this.isNamedEntity = isNamedEntity;
220 | 	}
221 | 	public String getAttribute() {
222 | 		return attribute;
223 | 	}
224 | 	public void setAttribute(String attribute) {
225 | 		this.attribute = attribute;
226 | 	}
227 | 
228 | 
229 | 	//not part of classifier features
230 | 	public ProductFeatureOpinion getOpinion() {
231 | 		return opinion;
232 | 	}
233 | 	public void setOpinion(ProductFeatureOpinion opinion) {
234 | 		this.opinion = opinion;
235 | 	}
236 | 	
237 | 	
238 | 	/**
239 | 	 * Renders the surface forms of the given tokens as a bracketed, comma-separated list for use in
240 | 	 * toString().
241 | 	 *
242 | 	 * @param list tokens to render; may be null and may contain null entries
243 | 	 * @return "[<null list>]" for a null list, otherwise "[a,b,...]" where a null entry prints as "null"
244 | 	 */
245 | 	private String flattenTokenList(List<TokenWithContext> list) {
246 | 		if (list == null)
247 | 			return "[<null list>]";
248 | 
249 | 		StringBuilder flattened = new StringBuilder("[");
250 | 		String separator = "";
251 | 
252 | 		for (TokenWithContext entry : list) {
253 | 			flattened.append(separator);
254 | 			separator = ",";
255 | 
256 | 			//an entry whose token text is null is appended as the text "null" as well
257 | 			flattened.append(entry == null ? "null" : entry.getToken());
258 | 		}
259 | 
260 | 		flattened.append("]");
261 | 		return flattened.toString();
262 | 	}
263 | 	
264 | 	
265 | 	/**
266 | 	 * Formats this token in its sentence context: the preceding tokens, then this token wrapped in
267 | 	 * underscores, then the following tokens, each followed by a single space.
268 | 	 *
269 | 	 * @return the formatted context, e.g. "the _voice_ quality is "; null entries in either list are skipped
270 | 	 */
271 | 	public String getFormattedTokenContext() {
272 | 		StringBuilder sb = new StringBuilder();
273 | 
274 | 		if (this.getPreviousTokens() != null) {
275 | 			for (TokenWithContext toPrint : this.getPreviousTokens()) {
276 | 				if (toPrint != null)
277 | 					sb.append(toPrint.getToken()).append(" ");
278 | 			}
279 | 		}
280 | 
281 | 		sb.append("_").append(this.getToken()).append("_ ");
282 | 
283 | 		//guard the list that is actually iterated: testing getPreviousTokens() here threw a
284 | 		//NullPointerException when only nextTokens was null, and silently dropped the following
285 | 		//tokens when only previousTokens was null
286 | 		if (this.getNextTokens() != null) {
287 | 			for (TokenWithContext toPrint : this.getNextTokens()) {
288 | 				if (toPrint != null)
289 | 					sb.append(toPrint.getToken()).append(" ");
290 | 			}
291 | 		}
292 | 
293 | 		return sb.toString();
294 | 	}
295 | 	
296 | 	
297 | 	
298 | 
299 | 
300 | 	/**
301 | 	 * Creates a copy via super.clone() and then clears the copy's references to other tokens and to
302 | 	 * the parentage list, so the copy carries only this token's own attributes.
303 | 	 *
304 | 	 * @return the copy, with previousTokens, nextTokens, parentage, semanticIncomingEdge and
305 | 	 *         semanticallyTaggedTokensWithContext all set to null
306 | 	 * @throws CloneNotSupportedException if super.clone() throws it
307 | 	 */
308 | 	@Override
309 | 	protected Object clone() throws CloneNotSupportedException {
310 | 		TokenWithContext copy = (TokenWithContext) super.clone();
311 | 
312 | 		copy.previousTokens = null;
313 | 		copy.nextTokens = null;
314 | 		copy.parentage = null;
315 | 		copy.semanticIncomingEdge = null;
316 | 		copy.semanticallyTaggedTokensWithContext = null;
317 | 
318 | 		return copy;
319 | 	}
320 | 
321 | 	
322 | 	@Override
323 | 	public String toString() {
324 | 		
325 | 		return "TokenWithContext [token=" + token + ", lemma=" + lemma
326 | 				+ ", pos=" + pos + ", previousTokens=" + flattenTokenList(previousTokens)
327 | 				+ ", nextTokens=" + flattenTokenList(nextTokens) + ", parentage=" + parentage
328 | 				+ ", localParentage=" + getLocalParentage()
329 | 				+ ", isNamedEntity=" + isNamedEntity
330 | 				+ ", isCoreferenceHead=" + isCoreferenceHead
331 | 				+ ", flatResolvedCoreference=" + flatResolvedCoreference
332 | 				+ ", partOfSentimentStructure=" + partOfSentimentStructure
333 | 				+ ", semanticSpecificRole=" + semanticSpecificRole
334 | 				+ ", semanticGeneralRole=" + semanticGeneralRole
335 | 				+ ", semanticIncomingEdge=" + (semanticIncomingEdge == null ? "null" : semanticIncomingEdge.getToken() + "-" + semanticIncomingEdge.getPos() + "-" + semanticIncomingEdge.getSemanticSpecificRole() )
336 |  				+ ", semanticOutgoingEdgesIncludeNegation=" + semanticOutgoingEdgesIncludeNegation
337 |  				+ ", semanticallyTaggedTokensWithContext=" + semanticallyTaggedTokensWithContext
338 |  				+ ", opinion=" + opinion
339 |  				+ ", attribute=" + attribute
340 | 				+ "]";
341 | 	}
342 | 	
343 | 	
344 | 
345 | 	
346 | 
347 | }
348 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/AbstractPhraseTreePart.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model.phrasetree;
 2 | 
 3 | /**
 4 |  * Common base of the nodes in a phrase tree; every node carries a part-of-speech tag.
 5 |  */
 6 | public abstract class AbstractPhraseTreePart {
 7 | 	protected PartOfSpeech pos;
 8 | 
 9 | 	public PartOfSpeech getPos() {
10 | 		return this.pos;
11 | 	}
12 | 	public void setPos(PartOfSpeech pos) {
13 | 		this.pos = pos;
14 | 	}
15 | 
16 | 	/** Cheap type test that avoids class instance comparisons: fixed to false here, TokenLeaf overrides it to true. */
17 | 	public boolean isToken() {
18 | 		return false;
19 | 	}
20 | 
21 | 	/** Returns the tag's name; throws NullPointerException when no tag has been set. */
22 | 	public String value() {
23 | 		return this.pos.toString();
24 | 	}
25 | 
26 | 	/** Indented-rendering hook that lets PhraseTree.toString() avoid some casts; returns null by default. */
27 | 	protected String toString(int indent) {
28 | 		return null;
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/PartOfSentimentStructure.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model.phrasetree;
 2 | 
 3 | /**
 4 |  * Role labels that a token can be tagged with inside a sentiment structure.
 5 |  */
 6 | public enum PartOfSentimentStructure {
 7 | 	PRODUCT, FEATURE, OPINION,
 8 | 	OPINIONHOLDER, TIMEOFOPINION
 9 | }
10 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/PartOfSpeech.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.model.phrasetree;
  2 | 
  3 | 
  4 | 
  5 | 
  6 | /**
  7 |  * Penn Treebank clause, phrase and word-level tags, plus named stand-ins for parser labels that are not legal Java identifiers.
  8 |  */
  9 | public enum PartOfSpeech implements Cloneable {
 10 | 	//from http://bulba.sdsu.edu/jeanette/thesis/PennTags.html
 11 | 	//clauses
 12 | 	S,
 13 | 	SBAR,
 14 | 	SBARQ,
 15 | 	SINV,
 16 | 	SQ,
 17 | 	
 18 | 	//phrases
 19 | 	ADJP, // - Adjective Phrase.
 20 | 	ADVP, // - Adverb Phrase.
 21 | 	CONJP, // - Conjunction Phrase.
 22 | 	FRAG, // - Fragment.
 23 | 	INTJ, // - Interjection. Corresponds approximately to the part-of-speech tag UH.
 24 | 	LST, // - List marker. Includes surrounding punctuation.
 25 | 	NAC, // - Not a Constituent; used to show the scope of certain prenominal modifiers within an NP.
 26 | 	NP, // - Noun Phrase.
 27 | 	NPTMP, // - weird temporal noun phrase
 28 | 	NX, // - Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently.
 29 | 	PP, // - Prepositional Phrase.
 30 | 	PRN, // - Parenthetical.
 31 | 	PRT, // - Particle. Category for words that should be tagged RP.
 32 | 	QP, // - Quantifier Phrase (i.e. complex measure/amount phrase); used within NP.
 33 | 	RRC, // - Reduced Relative Clause.
 34 | 	UCP, // - Unlike Coordinated Phrase.
 35 | 	VP, // - Verb Phrase.
 36 | 	WHADJP, // - Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot.
 37 | 	WHAVP, // - Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why.
 38 | 	WHNP, // - Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards.
 39 | 	WHPP, // - Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP.
 40 | 	X, // - Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the...the-constructions.
 41 | 	XS, // - Unknown sentence? Seems to apply to "more than" or "less than" type constructs, i.e., ...and then finally after less than 60 days ,...
 42 | 
 43 | 	WHADVP, //extra
 44 | 	PUNCTCOLON,
 45 | 	PUNCTCOMMA,
 46 | 	PUNCTENDOFSENTENCE,
 47 | 	PUNCTCURRENCY,
 48 | 	PUNCTQUOTATIONMARK,
 49 | 	PUNCTHASH,
 50 | 	LRB,	// (
 51 | 	RRB,	// )
 52 | 	
 53 | 	//from http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
 54 | 	CC ,
 55 | 	CD ,
 56 | 	DT ,
 57 | 	EX ,
 58 | 	FW ,
 59 | 	IN ,
 60 | 	JJ ,
 61 | 	JJR ,
 62 | 	JJS ,
 63 | 	LS ,
 64 | 	MD ,
 65 | 	NN ,
 66 | 	NNS ,
 67 | 	NNP ,
 68 | 	NNPS ,
 69 | 	PDT ,
 70 | 	POS ,
 71 | 	PRP ,
 72 | 	PRP$ ,
 73 | 	RB ,
 74 | 	RBR ,
 75 | 	RBS ,
 76 | 	RP ,
 77 | 	SYM ,
 78 | 	TO ,
 79 | 	UH ,
 80 | 	VB ,
 81 | 	VBD ,
 82 | 	VBG ,
 83 | 	VBN ,
 84 | 	VBP ,
 85 | 	VBZ ,
 86 | 	WDT ,
 87 | 	WP ,
 88 | 	WP$ ,
 89 | 	WRB;
 90 | 	
 91 | 	//alternatively, this could probably be parsed out of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams.sisterSplitters()
 92 | 	/**
 93 | 	 * Maps a parser label onto its constant. Punctuation, the bracket tags -LRB-/-RRB- and NP-TMP have
 94 | 	 * dedicated constants; any other label must equal a constant name exactly.
 95 | 	 *
 96 | 	 * @param string label to map
 97 | 	 * @return the matching constant
 98 | 	 * @throws IllegalArgumentException if the label is null or matches no constant
 99 | 	 */
100 | 	public static PartOfSpeech fromString (String string) throws java.lang.IllegalArgumentException {
101 | 		//a null label used to escape as a NullPointerException, past callers that catch the declared exception
102 | 		if (string == null)
103 | 			throw new IllegalArgumentException("No part of speech for a null label");
104 | 		if (":".equals(string)) return PUNCTCOLON;
105 | 		if (",".equals(string)) return PUNCTCOMMA;
106 | 		if (".".equals(string)) return PUNCTENDOFSENTENCE;
107 | 		if ("$".equals(string)) return PUNCTCURRENCY;
108 | 		if ("#".equals(string)) return PUNCTHASH;
109 | 		if ("-LRB-".equals(string)) return LRB;
110 | 		if ("-RRB-".equals(string)) return RRB;
111 | 		if ("''".equals(string) || "\"".equals(string) || "``".equals(string)) return PUNCTQUOTATIONMARK;
112 | 		//TODO: NP-TMP is an interesting case (it could arguably map to NP), as in "the p/n button switches your dvd players video output signal between pal and ntsc ."
113 | 		if ("NP-TMP".equals(string)) return NPTMP;
114 | 
115 | 		//may throw IllegalArgumentException
116 | 		return PartOfSpeech.valueOf(string);
117 | 	}
118 | }
119 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/PhraseTree.java:
--------------------------------------------------------------------------------
  1 | package ca.carter.thesis.model.phrasetree;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | import java.util.Stack;
  6 | import java.util.StringTokenizer;
  7 | 
  8 | import ca.carter.thesis.languagemodels.DefaultTokenizer;
  9 | import edu.stanford.nlp.ling.CoreLabel;
 10 | import edu.stanford.nlp.ling.Sentence;
 11 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
 12 | import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
 13 | import edu.stanford.nlp.pipeline.Annotation;
 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 15 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
 16 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations.AnnotatedTree;
 17 | import edu.stanford.nlp.trees.Tree;
 18 | import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
 19 | import edu.stanford.nlp.util.CoreMap;
 20 | 
 21 | public class PhraseTree extends AbstractPhraseTreePart {
 22 | 	private List<AbstractPhraseTreePart> leaves;
 23 | 	private List<List<PartOfSpeech>> flatLeaves;
 24 | 	
 25 |     private static final LexicalizedParser lexParser  = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
 26 | 	
 27 |     public PhraseTree()
 28 |     {
 29 |     	super();
 30 |     }
 31 |     
 32 |     /**
 33 |      * Parses the first sentence of the text with the project's default pipeline and builds the tree from
 34 |      * the node below the parse root. The sentiment tree (if any) and both trees are printed to stdout.
 35 |      * @param string text to parse; must yield at least one sentence, of which only the first is used
 36 |      */
 37 |     public PhraseTree(String string) {
 38 | 		StanfordCoreNLP pipeline = ca.carter.thesis.model.Sentence.getDefaultPipeline();
 39 | 		Annotation document = new Annotation(string);
 40 | 		pipeline.annotate(document);
 41 | 		CoreMap sentence = document.get(SentencesAnnotation.class).get(0);
 42 | 		Tree lexTree = sentence.get(TreeAnnotation.class);
 43 | 		Tree phraseRoot = lexTree.firstChild();
 44 | 		//fromString (not valueOf) so the root label is mapped like every other label in this class
 45 | 		this.pos = PartOfSpeech.fromString(phraseRoot.value());
 46 | 		this.leaves = organizeTree(phraseRoot, true);
 47 | 		this.flatLeaves = flatTree(phraseRoot, null, null, true);
 48 | 
 49 | 		//the sentiment tree is only printed, so its absence must not abort construction
 50 | 		Tree sentiTree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class);
 51 | 		if (sentiTree != null)
 52 | 			sentiTree.pennPrint();
 53 | 		System.out.println("Lexi tree is :  " + lexTree);
 54 | 		System.out.println("Senti tree is : " + sentiTree);
 55 |     }
 56 |    
 57 |     /**
 58 |      * Builds the subtree rooted at lexTree; an unmappable label is reported on stderr and leaves pos null.
 59 |      * NOTE(review): leaves are built from lexTree but flatLeaves from lexTree.firstChild() - confirm that
 60 |      * the flattened view is really meant to cover only the first child.
 61 |      */
 62 |     public PhraseTree(Tree lexTree, String pos) {
 63 | 		this.leaves = organizeTree(lexTree, true);
 64 | 		this.flatLeaves = flatTree(lexTree.firstChild(), null, null, true);
 65 | 
 66 | 		try {
 67 | 			this.pos = PartOfSpeech.fromString(pos);
 68 | 		} catch (IllegalArgumentException unmappable) {
 69 | 			System.err.println("No part of speech for " + pos);
 70 | 		}
 71 |     }
 72 | 
 73 |     
 74 | 	/*protected Tree getParseTreeForPhrase(String phrase)
 75 | 	{
 76 | 		StringTokenizer st = DefaultTokenizer.getDefaultTokenizer(phrase);
 77 | 		int numTokens = st.countTokens();
 78 | 		String[] tokens = new String[numTokens];
 79 | 		for (int i = 0; i < numTokens; i++)
 80 | 		{
 81 | 			tokens[i] = st.nextToken();
 82 | 		}
 83 | 		
 84 |         List<CoreLabel> rawWords = Sentence.toCoreLabelList(tokens);
 85 |         Tree parseTree = lexParser.apply(rawWords);
 86 | 
 87 |         return parseTree;
 88 | 	}*/
 89 | 		
 90 | 	/**
 91 | 	 * Converts the direct children of a parse node into phrase-tree parts: a pre-terminal child becomes
 92 | 	 * a TokenLeaf (word plus tag), any other child becomes a nested PhraseTree.
 93 | 	 *
 94 | 	 * @param tree     parse node whose children are converted
 95 | 	 * @param topLevel not read by this method
 96 | 	 * @return the converted children, in their original order
 97 | 	 * @throws IllegalArgumentException if a pre-terminal's tag has no PartOfSpeech constant; the
 98 | 	 *         failure is reported on stderr before it is rethrown
 99 | 	 */
100 | 	private List<AbstractPhraseTreePart> organizeTree(Tree tree, boolean topLevel) {
101 | 		List<AbstractPhraseTreePart> parts = new ArrayList<AbstractPhraseTreePart>();
102 | 
103 | 		for (Tree child : tree.getChildrenAsList()) {
104 | 			if (!child.isPreTerminal()) {
105 | 				parts.add(new PhraseTree(child, child.value()));
106 | 				continue;
107 | 			}
108 | 
109 | 			PartOfSpeech tag;
110 | 			try {
111 | 				tag = PartOfSpeech.fromString(child.value());
112 | 			} catch (IllegalArgumentException e) {
113 | 				System.err.println("Could not get part of speech. Value of child is " + child.value() + " and its first child is " + child.firstChild().value());
114 | 				e.printStackTrace();
115 | 				throw e; //only thrown for debugging purposes; spits out the full sentence if we do
116 | 			}
117 | 			parts.add(new TokenLeaf(child.firstChild().value(), tag));
118 | 		}
119 | 
120 | 		return parts;
121 | 	}
122 | 	
123 | 	/**
124 | 	 * Walks the tree depth-first and records, for each pre-terminal or leaf child that is reached, a
125 | 	 * snapshot of the phrase labels pushed on the way down to it (outermost first).
126 | 	 *
127 | 	 * @param tree       node whose children are walked; its own label is not pushed
128 | 	 * @param list       accumulator for the snapshots; replaced by a fresh list when topLevel is true
129 | 	 * @param depthStack labels on the current path; replaced by a fresh stack when topLevel is true
130 | 	 * @param topLevel   true for the outermost call, false for the recursive calls
131 | 	 * @return the accumulated snapshots when topLevel is true, null otherwise
132 | 	 * @throws IllegalArgumentException if a phrase label with at least one child has no PartOfSpeech
133 | 	 *         constant; the label is reported on stderr first
134 | 	 */
135 | 	@SuppressWarnings("unchecked")
136 | 	protected List<List<PartOfSpeech>> flatTree(Tree tree, List<List<PartOfSpeech>> list, Stack<PartOfSpeech> depthStack, boolean topLevel) {
137 | 		if (topLevel) {
138 | 			list = new ArrayList<List<PartOfSpeech>>();
139 | 			depthStack = new Stack<PartOfSpeech>();
140 | 		}
141 | 
142 | 		for (Tree child : tree.getChildrenAsList()) {
143 | 			if (child.isPreTerminal() || child.isLeaf()) {
144 | 				//snapshot, because depthStack keeps changing during the walk
145 | 				list.add((Stack<PartOfSpeech>) depthStack.clone());
146 | 				continue;
147 | 			}
148 | 
149 | 			String childLabel = child.value();
150 | 			PartOfSpeech label = null;
151 | 			try {
152 | 				label = PartOfSpeech.fromString(childLabel);
153 | 			} catch (IllegalArgumentException e) {
154 | 				Tree grandchild = child.firstChild();
155 | 				if (grandchild != null) {
156 | 					System.err.println("Could not get part of speech. Value of child is " + childLabel + " and its first child is " + grandchild.value());
157 | 					throw e; //only thrown for debugging purposes; spits out the full sentence if we do
158 | 				}
159 | 				//no child at all: not all that important, a null label is pushed for this level
160 | 			}
161 | 
162 | 			//descend with this label on the path, then restore the path for the next sibling
163 | 			depthStack.push(label);
164 | 			flatTree(child, list, depthStack, false);
165 | 			depthStack.pop();
166 | 		}
167 | 
168 | 		return topLevel ? list : null;
169 | 	}
170 |     	
171 | 	public List<AbstractPhraseTreePart> getLeaves() {
172 | 		return leaves;
173 | 	}
174 | 
175 | 	public void setLeaves(List<AbstractPhraseTreePart> leaves) {
176 | 		this.leaves = leaves;
177 | 	}
178 | 
179 | 	@Override
180 | 	public String toString()
181 | 	{
182 | 		return toString(0);
183 | 	}
184 | 
185 | 	public List<List<PartOfSpeech>> getFlatLeaves() {
186 | 		return flatLeaves;
187 | 	}
188 | 
189 | 	public void setFlatLeaves(List<List<PartOfSpeech>> flatLeaves) {
190 | 		this.flatLeaves = flatLeaves;
191 | 	}
192 | 
193 | 	/**
194 | 	 * Renders this subtree, indenting each nesting level by two more spaces than its parent.
195 | 	 *
196 | 	 * @param indent number of spaces in front of this node's opening and closing lines
197 | 	 * @return the rendering; a tree whose leaves are unset renders as an empty pair of parentheses
198 | 	 */
199 | 	@Override
200 | 	protected String toString(int indent) {
201 | 		String pad = new String(new char[indent]).replace('\0', ' ');
202 | 		StringBuilder sb = new StringBuilder(pad).append(pos).append("(\n");
203 | 
204 | 		if (leaves != null) { //still null after the no-arg constructor until setLeaves() is called
205 | 			for (AbstractPhraseTreePart nextLeaf : leaves)
206 | 				sb.append(nextLeaf.isToken() ? pad + "  " + nextLeaf.toString() : nextLeaf.toString(indent + 2)).append("\n");
207 | 		}
208 | 		return sb.append(pad).append(")").toString();
209 | 	}
210 | 	
211 | 	
212 | 	public static void main(String[] args)
213 | 	{
214 | 		System.out.println("Starting");
215 | 
216 | 		/*
217 | 		PhraseTree noPt = new PhraseTree();
218 |         String[] sent = { "This", "is", "an", "easy", "sentence", "-", "in", "theory", ",", "so", "it", "goes", "eh", "?" };
219 |         List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
220 |         Tree parse = PhraseTree.lexParser.apply(rawWords);
221 |         parse.pennPrint();
222 |         System.out.println();
223 |         */
224 | 
225 | 		
226 | 		PhraseTree[] phraseTrees =
227 | 			{
228 | 				//new PhraseTree("We intend to raise this violation of the Security Council resolution, if it goes forward, in the U.N.,"),
229 | 				//new PhraseTree("UN Security Council"),
230 | 				//new PhraseTree("From the loan that we took up, EUR 5 bln from EU and EUR 1 bln from the World Bank are intended for the Finance Ministry. Not any EUR intended for the Finance Ministry and not any EUR from the money intended for BNR can go to salaries or bonuses. This money is for Romania,"),
231 | 				//new PhraseTree("The line was really rather long."),
232 | 				//new PhraseTree("the voice quality is very good , and it gets great reception ( that is , in places where you get t-mobile coverage , which is not that good ; see below ) ."),
233 | 				//new PhraseTree("This is an easy phrase to parse - in theory , or so it goes , eh ?"),
234 | 
235 | 				//testing negation
236 | 				//new PhraseTree("remote control are only so-so ; it doesn't show the complete filenames of mp3s with really long names ."),
237 | 
238 | 				//testing negation
239 | 				new PhraseTree("the voice quality is poor, but not its reception"),
240 | 			};
241 | 
242 | 		for (PhraseTree pt : phraseTrees)
243 | 		{
244 | 			System.out.println("toString: " + pt.toString());
245 | 			System.out.println("getFlatLeaves: " + pt.getFlatLeaves());
246 | 		}
247 | 		
248 | 		System.out.println("Done");
249 | 
250 | 	}
251 | 	
252 | 	
253 | 	/*
254 | 	//System.out.println("-----------");
255 | 	System.out.print(subTree.pennString());
256 | 	System.out.println(
257 | 	subTree.isPhrasal() + " / " +
258 | 	subTree.isPrePreTerminal() + " / " +
259 | 	subTree.isPreTerminal() + " / " +
260 | 	//subTree.label() + " / " + "\n" +
261 | 	//subTree.labels() + " / " + "\n" +
262 | 	//subTree.value() + " / " +  "\n" + //actual word or part of speech, depending on node
263 | 	//subTree.getChildrenAsList() +
264 | 	""
265 | 	);
266 | 	*/
267 | 	
268 | 	/*
269 |     public void demoAPI(LexicalizedParser lp) {
270 |         // This option shows parsing a list of correctly tokenized words
271 |         String[] sent = { "This", "is", "an", "easy", "sentence", "." };
272 |         List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
273 |         Tree parse = lp.apply(rawWords);
274 |         parse.pennPrint();
275 |         System.out.println();
276 | 
277 | 
278 |         // This option shows loading and using an explicit tokenizer
279 |         String sent2 = "This is another sentence.";
280 |         TokenizerFactory<CoreLabel> tokenizerFactory = 
281 |           PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
282 |         List<CoreLabel> rawWords2 = 
283 |           tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
284 |         parse = lp.apply(rawWords2);
285 | 
286 |         TreebankLanguagePack tlp = new PennTreebankLanguagePack();
287 |         GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
288 |         GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
289 |         List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
290 |         System.out.println(tdl);
291 |         System.out.println();
292 | 
293 |         TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
294 |         tp.printTree(parse);
295 |       }
296 |       */
297 | }
298 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/StringWithTree.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model.phrasetree;
 2 | 
 3 | /** Pairs a text with the phrase tree parsed from it. */
 4 | public class StringWithTree {
 5 | 	private String string;
 6 | 	private PhraseTree phraseTree;
 7 | 
 8 | 	/** Stores the text and, unless it is null, parses it right away; a null text leaves the tree null. */
 9 | 	public StringWithTree(String string) {
10 | 		this.string = string;
11 | 		this.phraseTree = (string == null) ? null : new PhraseTree(string);
12 | 	}
13 | 	public String getString() {
14 | 		return this.string;
15 | 	}
16 | 	/** Replaces the text only; the phrase tree is not re-parsed. */
17 | 	public void setString(String string) {
18 | 		this.string = string;
19 | 	}
20 | 	public PhraseTree getPhraseTree() {
21 | 		return this.phraseTree;
22 | 	}
23 | 	public void setPhraseTree(PhraseTree phraseTree) {
24 | 		this.phraseTree = phraseTree;
25 | 	}
26 | 	/** Returns the text itself (possibly null), not a rendering of the tree. */
27 | 	@Override
28 | 	public String toString() {
29 | 		return this.string;
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/src/ca/carter/thesis/model/phrasetree/TokenLeaf.java:
--------------------------------------------------------------------------------
 1 | package ca.carter.thesis.model.phrasetree;
 2 | 
 3 | /** Leaf of a phrase tree: a single token together with its part-of-speech tag. */
 4 | public class TokenLeaf extends AbstractPhraseTreePart {
 5 | 	public static final String capitalizedWordIndicator = "c";
 6 | 	private String token;
 7 | 
 8 | 	/**
 9 | 	 * @param string surface form of the token
10 | 	 * @param pos    tag, stored in the inherited pos field
11 | 	 */
12 | 	public TokenLeaf(String string, PartOfSpeech pos) throws java.lang.IllegalArgumentException {
13 | 		this.pos = pos;
14 | 		this.token = string;
15 | 	}
16 | 
17 | 	public String getToken() {
18 | 		return this.token;
19 | 	}
20 | 	public void setToken(String token) {
21 | 		this.token = token;
22 | 	}
23 | 
24 | 	/** Returns the token text, whereas the inherited version returns the tag's name. */
25 | 	@Override
26 | 	public String value() {
27 | 		return this.token;
28 | 	}
29 | 
30 | 	/** Always true: marks this part as a token leaf. */
31 | 	@Override
32 | 	public boolean isToken() {
33 | 		return true;
34 | 	}
35 | 
36 | 	/** Renders as tag(token), e.g. NN(phone). */
37 | 	@Override
38 | 	public String toString() {
39 | 		return pos + "(" + token + ")";
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------