├── README.md ├── bin ├── commons-logging.jar ├── ejml-0.23.jar ├── joda-time.jar ├── jollyday.jar ├── jwnl.jar ├── libsvm.jar ├── processreviews-sept2014.jar └── stanford-corenlp-3.3.1.jar ├── commandline-howto.txt ├── exportjar.jardesc ├── jwnl14_file_properties.xml ├── lib ├── commons-logging.jar ├── ejml-0.23.jar ├── jackson-annotations-2.2.3.jar ├── jackson-core-2.3.2.jar ├── jackson-databind-2.2.3.jar ├── joda-time.jar ├── jollyday.jar ├── jwnl.jar ├── libsvm.jar └── stanford-corenlp-3.3.1.jar └── src └── ca └── carter └── thesis ├── ProcessReviews.java ├── RetrainingThread.java ├── ReviewFileReaderFlat.java ├── ReviewFileReaderXML.java ├── SeedModelCreatorThread.java ├── SentenceProcessorThread.java ├── TokenPredictorThread.java ├── WikipediaParaphraser.java ├── WordNetResolver.java ├── evaluation └── ResultsSummary.java ├── languagemodels ├── DefaultTokenizer.java └── StopWords.java ├── ml ├── BinaryPrediction.java ├── ClassWeighting.java ├── FeatureDistance.java ├── FeatureRepository.java ├── FeatureType.java ├── ModelType.java ├── Prediction.java ├── PredictionTokenWithContextPair.java ├── SVMTokenModel.java ├── SVMTokenModelFeature.java ├── SVMTokenModelSentiment.java └── Views.java └── model ├── AspectMatchPolicy.java ├── ProductFeatureOpinion.java ├── ProductOpinionFeatureDetail.java ├── ReconciledFeatureOpinion.java ├── Review.java ├── SemanticallyTaggedTokenWithContext.java ├── Sentence.java ├── Sentiment.java ├── SimpleSentence.java ├── Task.java ├── TokenWithContext.java └── phrasetree ├── AbstractPhraseTreePart.java ├── PartOfSentimentStructure.java ├── PartOfSpeech.java ├── PhraseTree.java ├── StringWithTree.java └── TokenLeaf.java /README.md: -------------------------------------------------------------------------------- 1 | # Inferring aspect-specific opinion structure in product reviews using co-training 2 | This is an algorithm for aspect-based sentiment analysis using co-training, a semi-supervised machine learning algorithm that partitions 
the machine learning features into two sufficient and uncorrelated "views" and then self-learns. 3 | 4 | ### Required data sets 5 | 6 | The application uses data provided by third parties. To use this project, you'll need to download and unzip: 7 | 8 | http://www.cs.uic.edu/~liub/FBS/CustomerReviewData.zip (Hu and Liu, KDD-2004) 9 | 10 | http://metashare.ilsp.gr:8080/repository/browse/semeval-2014-absa-test-data-gold-annotations/b98d11cec18211e38229842b2b6a04d77591d40acd7542b7af823a54fb03a155/ (Ganu et al., 2009) 11 | 12 | Many thanks to the authors and annotators of these two data sets. 13 | 14 | ### Executable version 15 | 16 | The executable version of this project is in the bin directory. 17 | 18 | Command line arguments can be inspected by running the command: 19 | `java -jar processreviews-sept2014.jar help` 20 | 21 | The Stanford CoreNLP models are also required (stanford-corenlp-3.3.1-models.jar) and should be put in the bin directory (if running from the command line) and the lib directory (if using the source code itself). This can be downloaded from http://search.maven.org/#browse%7C304725258 22 | 23 | ### Source code 24 | 25 | ProcessReviews.java is the main class. 26 | 27 | To use the source code, you'll have to edit a couple of things to start: 28 | - in jwnl14_file_properties.xml, you'll need to update the *dictionary_path* parameter 29 | - in *ProcessReviews.java*, you'll have to update the *defaultRootDir* variable to point to the required data sets 30 | 31 | You can use the *exportjar.jardesc* file in Eclipse to package up a new executable jar file. 
32 | 33 | ### Citing 34 | 35 | If using this code, please cite the paper: 36 | 37 | @inproceedings{carter:aspectspecificcotraining, 38 | author = {Carter, Dave and Inkpen, Diana}, 39 | title = {Inferring Aspect-Specific Opinion Structure in Product Reviews using Co-training}, 40 | booktitle = {Proceedings of CICLing-2015}, 41 | series = {Lecture Notes in Computer Science 9042}, 42 | year = {2015}, 43 | isbn = {978-3-319-18117-2}, 44 | location = {Cairo, Egypt}, 45 | pages = {225--240}, 46 | numpages = {16}, 47 | url = {http://dx.doi.org/10.1007/978-3-319-18117-2_17}, 48 | doi = {10.1007/978-3-319-18117-2_17}, 49 | publisher = {Springer-Verlag}, 50 | address = {Berlin, Heidelberg}, 51 | } 52 | 53 | -------------------------------------------------------------------------------- /bin/commons-logging.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/commons-logging.jar -------------------------------------------------------------------------------- /bin/ejml-0.23.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/ejml-0.23.jar -------------------------------------------------------------------------------- /bin/joda-time.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/joda-time.jar -------------------------------------------------------------------------------- /bin/jollyday.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/jollyday.jar -------------------------------------------------------------------------------- /bin/jwnl.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/jwnl.jar -------------------------------------------------------------------------------- /bin/libsvm.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/libsvm.jar -------------------------------------------------------------------------------- /bin/processreviews-sept2014.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/processreviews-sept2014.jar -------------------------------------------------------------------------------- /bin/stanford-corenlp-3.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/bin/stanford-corenlp-3.3.1.jar -------------------------------------------------------------------------------- /commandline-howto.txt: -------------------------------------------------------------------------------- 1 | java -jar processreviews.jar > results.txt 2>&1 2 | 3 | SENTENCES, no cotraining 4 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 6 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t6cnt8-2500.txt 2>&1 & 5 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 7 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t7cnt9-2500.txt 2>&1 & 6 | 7 | SENTENCES, cotraining 8 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ 
work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 12 13 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t12c13t8-2500.txt 2>&1 & 9 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 13 12 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t13c12t8-2500.txt 2>&1 & 10 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 14 15 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t14c15t9-2500.txt 2>&1 & 11 | java -Xmx8000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 15 14 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-sentences-t15c14t9-2500.txt 2>&1 & 12 | 13 | 14 | TASK2, no cotraining 15 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 6 null 8 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t6cnt8-2500.txt 2>&1 & 16 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 7 null 9 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t7cnt9-2500.txt 2>&1 & 17 | 18 | TASK2, cotraining 19 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500.txt 2>&1 20 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 
2014/semeval-subtask2-t13c12t8-2500.txt 2>&1 21 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500.txt 2>&1 22 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500.txt 2>&1 23 | 24 | TASK2, cotraining, trying different thresholds 25 | 26 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055.txt 2>&1 & 27 | 28 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.65 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c065.txt 2>&1 & 29 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.65 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c065.txt 2>&1 & 30 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.65 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c065.txt 2>&1 & 31 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.65 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c065.txt 2>&1 & 32 | 33 | java 
-Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.75 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c075.txt 2>&1 & 34 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.75 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c075.txt 2>&1 & 35 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.75 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c075.txt 2>&1 & 36 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.75 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c075.txt 2>&1 & 37 | 38 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.85 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c085.txt 2>&1 & 39 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.85 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c085.txt 2>&1 & 40 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.85 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c085.txt 2>&1 & 41 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ 
work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.85 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c085.txt 2>&1 & 42 | 43 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.95 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c095.txt 2>&1 & 44 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 12 8 2500 EXACT 0.95 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13c12t8-2500-c095.txt 2>&1 & 45 | on mbp - java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 15 9 2500 EXACT 0.95 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14c15t9-2500-c095.txt 2>&1 & 46 | on mbp - java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 14 9 2500 EXACT 0.95 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15c14t9-2500-c095.txt 2>&1 & 47 | 48 | TASK2, trying different max number of iterations 49 | 50 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 null 8 2500 EXACT 0.55 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-0coiteration.txt 2>&1 & 51 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 1 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-1coiteration.txt 2>&1 & 52 | java -Xmx6000m -jar 
/Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 2 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-2coiteration.txt 2>&1 & 53 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 3 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-3coiteration.txt 2>&1 & 54 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 4 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-4coiteration.txt 2>&1 & 55 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 13 8 2500 EXACT 0.55 5 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12c13t8-2500-c055-5coiteration.txt 2>&1 & 56 | 57 | 58 | TASK2, cotraining baselines 59 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 12 null 8 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t12cnt8-2500.txt 2>&1 & 60 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 13 null 8 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t13cnt8-2500.txt 2>&1 & 61 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 14 null 9 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t14cnt9-2500.txt 2>&1 & 62 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ 
work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART2 15 null 9 > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask2-t15cnt9-2500.txt 2>&1 & 63 | 64 | 65 | TASK1, no cotraining 66 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 6 null 8 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t6cnt8-2500.txt 2>&1 & 67 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 7 null 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t7cnt9-2500.txt 2>&1 & 68 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 6 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t6cnt8-2500.txt 2>&1 & 69 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 7 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t7cnt9-2500.txt 2>&1 & 70 | 71 | TASK1, cotraining 72 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 13 8 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t12c13t8-2500.txt 2>&1 & 73 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 12 8 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t13c12t8-2500.txt 2>&1 & 74 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar 
SEMEVALTASK4PART1 14 15 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t14c15t9-2500.txt 2>&1 & 75 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 14 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t15c14t9-2500.txt 2>&1 & 76 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 13 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t12c13t8-2500.txt 2>&1 & 77 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 12 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t13c12t8-2500.txt 2>&1 & 78 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 15 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t14c15t9-2500.txt 2>&1 & 79 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 14 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t15c14t9-2500.txt 2>&1 & 80 | 81 | TASK1, cotraining baselines 82 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 null 8 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t12cnt8-2500.txt 2>&1 & 83 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 null 8 2500 EXACT > 
/Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t13cnt8-2500.txt 2>&1 & 84 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 null 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t14cnt9-2500.txt 2>&1 & 85 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 null 9 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-exact-t15cnt9-2500.txt 2>&1 & 86 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 12 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t12cnt8-2500.txt 2>&1 & 87 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 13 null 8 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t13cnt8-2500.txt 2>&1 & 88 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 14 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t14cnt9-2500.txt 2>&1 & 89 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 15 null 9 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/semeval-subtask1-partial-t15cnt9-2500.txt 2>&1 & 90 | 91 | 92 | TASK1, BINGLIU, CROSS VALIDATION 93 | 94 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 1 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ 
work/Results\ Sept\ 2014/bingliu-subtask1-exact-t1cntn-2500.txt 2>&1 & 95 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 2 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t2cntn-2500.txt 2>&1 & 96 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 3 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t3cntn-2500.txt 2>&1 & 97 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 4 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t4cntn-2500.txt 2>&1 & 98 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 5 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-exact-t5cntn-2500.txt 2>&1 & 99 | 100 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t1cntn-2500.txt 2>&1 & 101 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t2cntn-2500.txt 2>&1 & 102 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t3cntn-2500.txt 2>&1 & 103 | java 
-Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 4 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t4cntn-2500.txt 2>&1 & 104 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar SEMEVALTASK4PART1 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-subtask1-partial-t5cntn-2500.txt 2>&1 & 105 | 106 | (underway) 107 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 1 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t1cntn-2500-nogen.txt 2>&1 & 108 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 2 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t2cntn-2500-nogen.txt 2>&1 & 109 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 3 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t3cntn-2500-nogen.txt 2>&1 & 110 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 4 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-exact-t4cntn-2500-nogen.txt 2>&1 & 111 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 5 null null 2500 EXACT > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 
2014/bingliu-subtask1-exact-t5cntn-2500-nogen.txt 2>&1 & 112 | 113 | (underway) 114 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t1cntn-2500-nogen.txt 2>&1 & 115 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t2cntn-2500-nogen.txt 2>&1 & 116 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t3cntn-2500-nogen.txt 2>&1 & 117 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 4 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t4cntn-2500-nogen.txt 2>&1 & 118 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar SEMEVALTASK4PART1 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-subtask1-partial-t5cntn-2500-nogen.txt 2>&1 & 119 | 120 | 121 | SENTENCES, BINGLIU, CROSS VALIDATION 122 | 123 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1cntn-2500.txt 2>&1 & 124 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ 
work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2cntn-2500.txt 2>&1 & 125 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3cntn-2500.txt 2>&1 & 126 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4cntn-2500.txt 2>&1 & 127 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5cntn-2500.txt 2>&1 & 128 | 129 | (underway Nov 8) 130 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1cntn-2500-nogen.txt 2>&1 & 131 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 2 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t2cntn-2500-nogen.txt 2>&1 & 132 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 3 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t3cntn-2500-nogen.txt 2>&1 & 133 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 4 null 
null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t4cntn-2500-nogen.txt 2>&1 & 134 | java -Xmx6000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 5 null null 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t5cntn-2500-nogen.txt 2>&1 & 135 | 136 | SENTENCES, BINGLIU, DOMAIN ADAPTATION 137 | 138 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2,3,4,5 null 1 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2345cnt1-2500.txt 2>&1 & 139 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,3,4,5 null 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1345cnt2-2500.txt 2>&1 & 140 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,4,5 null 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1245cnt3-2500.txt 2>&1 & 141 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,3,5 null 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1235cnt4-2500.txt 2>&1 & 142 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1,2,3,4 null 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1234cnt5-2500.txt 2>&1 & 143 | 144 | (started) 145 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 2,3,4,5 null 1 2500 PARTIAL > 
/Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t2345cnt1-2500-nogen.txt 2>&1 & 146 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,3,4,5 null 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1345cnt2-2500-nogen.txt 2>&1 & 147 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,4,5 null 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1245cnt3-2500-nogen.txt 2>&1 & 148 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,3,5 null 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1235cnt4-2500-nogen.txt 2>&1 & 149 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-nov2014-nogenericremoval.jar BINGLIU 1,2,3,4 null 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Nov\ 2014/bingliu-sentences-t1234cnt5-2500-nogen.txt 2>&1 & 150 | 151 | 152 | SENTENCES, BINGLIU, DOMAIN ADAPTATION COTRAINING 153 | 154 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 3,4,5 1 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c345t1-2500.txt 2>&1 & 155 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 2,4,5 1 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c245t1-2500.txt 2>&1 & 156 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 2,3,5 1 2500 PARTIAL > 
/Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c235t1-2500.txt 2>&1 & 157 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 2,3,4 1 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c234t1-2500.txt 2>&1 & 158 | 159 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 3,4,5 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c345t2-2500.txt 2>&1 & 160 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,4,5 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c145t2-2500.txt 2>&1 & 161 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,3,5 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c135t2-2500.txt 2>&1 & 162 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,3,4 2 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c134t2-2500.txt 2>&1 & 163 | 164 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,4,5 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c245t3-2500.txt 2>&1 & 165 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,4,5 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c145t3-2500.txt 2>&1 & 166 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ 
work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,2,5 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c125t3-2500.txt 2>&1 & 167 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,2,4 3 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c124t3-2500.txt 2>&1 & 168 | 169 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,3,5 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c235t4-2500.txt 2>&1 & 170 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,3,5 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t2c135t4-2500.txt 2>&1 & 171 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,2,5 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c125t4-2500.txt 2>&1 & 172 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 5 1,2,3 4 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t5c123t4-2500.txt 2>&1 & 173 | 174 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 1 2,3,4 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t1c234t5-2500.txt 2>&1 & 175 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 2 1,3,4 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 
2014/bingliu-sentences-t2c134t5-2500.txt 2>&1 & 176 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 3 1,2,4 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t3c124t5-2500.txt 2>&1 & 177 | java -Xmx7000m -jar /Users/davecarter/Dropbox/Thesis\ work/workspace/process-reviews/dist/processreviews-sept2014.jar BINGLIU 4 1,2,3 5 2500 PARTIAL > /Users/davecarter/Dropbox/Thesis\ work/Results\ Sept\ 2014/bingliu-sentences-t4c123t5-2500.txt 2>&1 & 178 | 179 | 180 | 181 | 182 | EC2 stuff: 183 | 184 | https://console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances: 185 | ssh -i ~/.ssh/amazon-ec2-free.pem ec2-user@54.213.228.153 186 | scp -i /Users/davecart/.ssh/amazon-ec2-free.pem -r ~/Dropbox/EC2\ image/* ec2-user@54.213.248.172: -------------------------------------------------------------------------------- /exportjar.jardesc: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /jwnl14_file_properties.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /lib/commons-logging.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/commons-logging.jar -------------------------------------------------------------------------------- /lib/ejml-0.23.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/ejml-0.23.jar -------------------------------------------------------------------------------- /lib/jackson-annotations-2.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-annotations-2.2.3.jar -------------------------------------------------------------------------------- /lib/jackson-core-2.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-core-2.3.2.jar -------------------------------------------------------------------------------- /lib/jackson-databind-2.2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jackson-databind-2.2.3.jar -------------------------------------------------------------------------------- /lib/joda-time.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/joda-time.jar -------------------------------------------------------------------------------- /lib/jollyday.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jollyday.jar -------------------------------------------------------------------------------- /lib/jwnl.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/jwnl.jar -------------------------------------------------------------------------------- /lib/libsvm.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/libsvm.jar -------------------------------------------------------------------------------- /lib/stanford-corenlp-3.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davecart/cotraining/961c78d3f60e9fbe28bfd9ff356891b8c7f0bc03/lib/stanford-corenlp-3.3.1.jar -------------------------------------------------------------------------------- /src/ca/carter/thesis/RetrainingThread.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import ca.carter.thesis.ml.SVMTokenModel; 4 | 5 | public class RetrainingThread extends Thread { 6 | 7 | SVMTokenModel model; 8 | 9 | 10 | public RetrainingThread(SVMTokenModel model) { 11 | super(); 12 | this.model = model; 13 | } 14 | 15 | public void run() { 16 | model.retrain(null); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ReviewFileReaderFlat.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.LinkedList; 8 | import java.util.Queue; 9 | 10 | import ca.carter.thesis.model.SimpleSentence; 11 | 12 | public class ReviewFileReaderFlat { 13 | 14 | static Queue readReviewFile(File file, Integer limit) throws IOException 15 | { 16 | Queue output = new LinkedList(); 17 | 18 | BufferedReader br = new BufferedReader(new FileReader(file)); 
19 | String line; 20 | int lineNum = 0; 21 | while ((line = br.readLine()) != null) { 22 | if (line.isEmpty() || line.charAt(0) == '*') 23 | { 24 | //do nothing 25 | } 26 | else 27 | { 28 | output.add(new SimpleSentence(line, true)); 29 | 30 | if (limit != null && lineNum++ >= limit) 31 | break; 32 | } 33 | } 34 | br.close(); 35 | 36 | return output; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ReviewFileReaderXML.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import javax.xml.parsers.DocumentBuilderFactory; 4 | import javax.xml.parsers.DocumentBuilder; 5 | 6 | import org.w3c.dom.Document; 7 | import org.w3c.dom.NodeList; 8 | import org.w3c.dom.Node; 9 | import org.w3c.dom.Element; 10 | 11 | import ca.carter.thesis.model.ProductFeatureOpinion; 12 | import ca.carter.thesis.model.Sentence; 13 | import ca.carter.thesis.model.SimpleSentence; 14 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 15 | 16 | import java.io.File; 17 | import java.io.IOException; 18 | import java.util.ArrayList; 19 | import java.util.LinkedList; 20 | import java.util.Queue; 21 | 22 | //mostly borrowed from http://www.mkyong.com/java/how-to-read-xml-file-in-java-dom-parser/ 23 | 24 | public class ReviewFileReaderXML { 25 | 26 | static Queue readReviewFile(File file, Integer limit) throws IOException 27 | { 28 | StanfordCoreNLP pipeline = Sentence.getDefaultPipeline(); 29 | 30 | Queue output = new LinkedList(); 31 | 32 | try { 33 | DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); 34 | DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); 35 | Document doc = dBuilder.parse(file); 36 | 37 | //optional, but recommended 38 | //read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work 39 | doc.getDocumentElement().normalize(); 40 | 41 | 
//System.out.println("Root element :" + doc.getDocumentElement().getNodeName()); 42 | 43 | NodeList nList = doc.getElementsByTagName("sentence"); 44 | 45 | //System.out.println("----------------------------"); 46 | 47 | final int listLength = nList.getLength(); 48 | int sentenceNum = 0; 49 | 50 | for (int temp = 0; temp < Math.min(listLength, (limit == null ? listLength : limit)); temp++) { 51 | 52 | Node nNode = nList.item(temp); 53 | 54 | //System.out.println("\nCurrent Element :" + nNode.getNodeName()); 55 | 56 | if (nNode.getNodeType() == Node.ELEMENT_NODE) { 57 | 58 | Element eElement = (Element) nNode; 59 | 60 | String sentenceText = eElement.getElementsByTagName("text").item(0).getTextContent(); 61 | 62 | SimpleSentence sentenceToReturn = new SimpleSentence(sentenceText, false); 63 | 64 | //System.out.println("ID : " + eElement.getAttribute("id")); 65 | 66 | // if (sentenceNum == 1521 || sentenceNum == 1522) 67 | // System.out.println("Sentence " + temp + ": " + sentenceText); //eElement.getElementsByTagName("text").item(0).getTextContent()); 68 | sentenceNum++; 69 | 70 | if (eElement.getElementsByTagName("aspectTerms").getLength() == 0 ) 71 | { 72 | //System.out.println("No aspects"); 73 | } 74 | else 75 | { 76 | sentenceToReturn.setOpinions(new ArrayList()); 77 | 78 | 79 | NodeList aspectsNodeList = ((Element) eElement.getElementsByTagName("aspectTerms").item(0) ).getElementsByTagName("aspectTerm"); 80 | final int numberOfAspects = aspectsNodeList.getLength(); 81 | 82 | //System.out.println(numberOfAspects + " aspects"); 83 | 84 | for (int i = 0; i < numberOfAspects; i++) 85 | { 86 | Element aspectElement = (Element) aspectsNodeList.item(i); 87 | 88 | //System.out.println(" Aspect: " + aspectElement.getAttribute("term")); 89 | //System.out.println(" Polarity: " + aspectElement.getAttribute("polarity")); 90 | //System.out.println(" From: " + aspectElement.getAttribute("from")); 91 | //System.out.println(" To: " + aspectElement.getAttribute("to")); 92 | 93 | 
String aspect = aspectElement.getAttribute("term"); 94 | String polarity = aspectElement.getAttribute("polarity"); 95 | int from = Integer.valueOf(aspectElement.getAttribute("from")); 96 | int to = Integer.valueOf(aspectElement.getAttribute("to")); 97 | 98 | if ("conflict".equals(polarity)) 99 | { 100 | sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, "positive", from, to , pipeline)); 101 | sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, "negative", from, to , pipeline)); 102 | 103 | } 104 | else 105 | { 106 | sentenceToReturn.getOpinions().add(new ProductFeatureOpinion(aspect, polarity, from, to , pipeline)); 107 | } 108 | } 109 | 110 | } 111 | 112 | output.add(sentenceToReturn); 113 | } 114 | } 115 | } catch (Exception e) { 116 | e.printStackTrace(); 117 | } 118 | 119 | return output; 120 | } 121 | 122 | public static void main(String[] args) { 123 | File xmlFile = new File("/Users/davecarter/Dropbox/Thesis data/Semeval-2014-task4/Restaurants_Train_v2.xml"); 124 | 125 | 126 | try { 127 | Queue sentences = readReviewFile(xmlFile, null); 128 | 129 | System.out.println("Parsed " + sentences.size() + " sentences."); 130 | } catch (IOException e) { 131 | // TODO Auto-generated catch block 132 | e.printStackTrace(); 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/SeedModelCreatorThread.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.io.IOException; 6 | import java.io.Writer; 7 | import java.util.List; 8 | 9 | import ca.carter.thesis.ml.ClassWeighting; 10 | import ca.carter.thesis.ml.ModelType; 11 | import ca.carter.thesis.ml.SVMTokenModel; 12 | import ca.carter.thesis.ml.SVMTokenModelFeature; 13 | import ca.carter.thesis.ml.SVMTokenModelSentiment; 14 | import ca.carter.thesis.model.Task; 15 | 
import ca.carter.thesis.model.TokenWithContext; 16 | 17 | public class SeedModelCreatorThread extends Thread { 18 | 19 | private List models; 20 | private String modelFileOutput; 21 | private String fileName; 22 | private ModelType modelType; 23 | private Task task; 24 | private List seedTokens; 25 | private ClassWeighting classWeighting; 26 | private Double c; 27 | private Double gamma; 28 | private Double epsilon; 29 | 30 | public SeedModelCreatorThread(List models, String modelFileOutput, String fileName, 31 | ModelType modelType, Task task, List seedTokens, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) { 32 | super(); 33 | this.models = models; 34 | this.modelFileOutput = modelFileOutput; 35 | this.fileName = fileName; 36 | this.modelType = modelType; 37 | this.task = task; 38 | this.seedTokens = seedTokens; 39 | this.classWeighting = classWeighting; 40 | this.c = c; 41 | this.gamma = gamma; 42 | this.epsilon = epsilon; 43 | } 44 | 45 | 46 | 47 | public void run() { 48 | Writer[] modelWriter = null; 49 | if (modelFileOutput != null) 50 | { 51 | try { 52 | modelWriter = new Writer[2]; 53 | modelWriter[0] = new BufferedWriter(new FileWriter(modelFileOutput + "view0lexical" + fileName)); 54 | modelWriter[1] = new BufferedWriter(new FileWriter(modelFileOutput + "view1syntactic" + fileName)); 55 | } catch (IOException e) { 56 | // TODO Auto-generated catch block 57 | e.printStackTrace(); 58 | } 59 | } 60 | 61 | SVMTokenModel model = null; 62 | switch (modelType) 63 | { 64 | case FEATURE: 65 | model = new SVMTokenModelFeature(task, seedTokens, modelWriter, classWeighting, c, gamma, epsilon); 66 | break; 67 | case SENTIMENT: 68 | model = new SVMTokenModelSentiment(task, seedTokens, modelWriter, classWeighting, c, gamma, epsilon); 69 | break; 70 | } 71 | 72 | System.out.println(modelType + ": C is " + model.getC(0) + ", gamma is " + model.getGamma(0) + ", and epsilon is " + model.getEpsilon() + "; using " + classWeighting + " weighting policy." 
); 73 | models.add(model); 74 | 75 | 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/SentenceProcessorThread.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import java.util.List; 4 | import java.util.Queue; 5 | 6 | import ca.carter.thesis.model.ProductFeatureOpinion; 7 | import ca.carter.thesis.model.Sentence; 8 | import ca.carter.thesis.model.SimpleSentence; 9 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 10 | 11 | public class SentenceProcessorThread extends Thread 12 | { 13 | private Queue rawSentences; 14 | private List sentences; 15 | private String genericName; 16 | private String brandName; 17 | private int titleLineIgnored = 0; 18 | private int sentencesProcessed = 0; 19 | 20 | protected static Integer lock = 1; 21 | 22 | public SentenceProcessorThread(Queue rawSentences, List sentences, String genericName, String brandName) 23 | { 24 | this.sentences = sentences; 25 | this.rawSentences = rawSentences; 26 | this.genericName = genericName; 27 | this.brandName = brandName; 28 | } 29 | 30 | public int getTitleLineIgnored() { 31 | return titleLineIgnored; 32 | } 33 | 34 | public int getSentencesProcessed() { 35 | return sentencesProcessed; 36 | } 37 | 38 | public void run() { 39 | 40 | try { 41 | StanfordCoreNLP pipeline = Sentence.getDefaultPipeline(); 42 | StanfordCoreNLP featurePipeline = ProductFeatureOpinion.getDefaultPipeline(); 43 | 44 | SimpleSentence nextLine = null; 45 | 46 | if (rawSentences == null || rawSentences.isEmpty()) 47 | { 48 | System.err.println("Sentence list was null/empty."); 49 | return; 50 | } 51 | 52 | synchronized(lock) { 53 | nextLine = rawSentences.poll(); 54 | } 55 | 56 | while (nextLine != null) { 57 | if (nextLine.isNeedsOpinionParsing() == true && nextLine.getSentence().startsWith("[t]")) 58 | { 59 | titleLineIgnored++; 60 | //TODO: this would be useful at some point; but 
for now, skipping titles 61 | } 62 | else 63 | { 64 | try 65 | { 66 | Sentence sentence = null; 67 | if (nextLine.isNeedsOpinionParsing() == true) 68 | sentence = new Sentence(nextLine, pipeline, featurePipeline, genericName, brandName, false); 69 | else 70 | sentence = new Sentence(nextLine, pipeline, featurePipeline, genericName, brandName, false); 71 | 72 | sentencesProcessed++; 73 | synchronized(sentences) 74 | { 75 | sentences.add(sentence); 76 | } 77 | } 78 | catch (Exception e) 79 | { 80 | System.err.println("Had trouble parsing " + nextLine); 81 | e.printStackTrace(); 82 | } 83 | } 84 | synchronized(lock) { 85 | nextLine = rawSentences.poll(); 86 | } 87 | 88 | } 89 | } catch (Exception e) { 90 | // TODO Auto-generated catch block 91 | e.printStackTrace(); 92 | } 93 | 94 | System.out.println("Processed " + sentencesProcessed); 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/TokenPredictorThread.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import ca.carter.thesis.ml.Prediction; 4 | import ca.carter.thesis.ml.SVMTokenModel; 5 | import ca.carter.thesis.model.TokenWithContext; 6 | 7 | public class TokenPredictorThread extends Thread { 8 | 9 | private SVMTokenModel model; 10 | private TokenWithContext nextCotrainingToken; 11 | private Prediction prediction; 12 | 13 | 14 | 15 | public TokenPredictorThread(SVMTokenModel model 16 | ) { 17 | super(); 18 | this.model = model; 19 | } 20 | 21 | 22 | 23 | public TokenWithContext getNextCotrainingToken() { 24 | return nextCotrainingToken; 25 | } 26 | 27 | 28 | 29 | public void setNextCotrainingToken(TokenWithContext nextCotrainingToken) { 30 | this.nextCotrainingToken = nextCotrainingToken; 31 | } 32 | 33 | 34 | 35 | public Prediction getPrediction() { 36 | return prediction; 37 | } 38 | 39 | 40 | 41 | public void setPrediction(Prediction prediction) { 42 | 
this.prediction = prediction; 43 | } 44 | 45 | 46 | 47 | public void run() { 48 | prediction = model.predict(nextCotrainingToken); 49 | if (prediction == null) 50 | System.err.println("Null prediction for " + nextCotrainingToken); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/WikipediaParaphraser.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.net.HttpURLConnection; 6 | import java.net.URL; 7 | import java.net.URLEncoder; 8 | import java.util.ArrayList; 9 | import java.util.Collections; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | 14 | import com.fasterxml.jackson.databind.JsonNode; 15 | import com.fasterxml.jackson.databind.ObjectMapper; 16 | 17 | public class WikipediaParaphraser { 18 | 19 | //caching requests to be polite to the wikipedia folks 20 | private static final Map> cache = Collections.synchronizedMap( new HashMap>() ); 21 | private static final boolean debug = false; 22 | 23 | public static List getParaphrases(String string, boolean printDebug) 24 | { 25 | try 26 | { 27 | List cachedCopy = cache.get(string); 28 | if (cachedCopy != null) 29 | { 30 | if (debug) 31 | System.out.println("Cache hit pos."); 32 | return cachedCopy; 33 | } 34 | else if (cache.containsKey(string)) 35 | { 36 | if (debug) 37 | System.out.println("Cache hit neg."); 38 | return null; 39 | } 40 | else 41 | { 42 | if (debug) 43 | System.out.println("Cache miss."); 44 | } 45 | 46 | //String jsonFromWikipedia = 47 | // "{\"query\":{\"normalized\":[{\"from\":\"picture quality\",\"to\":\"Picture quality\"}],\"pages\":{\"38253269\":{\"pageid\":38253269,\"ns\":0,\"title\":\"Picture quality\",\"revisions\":[{\"contentformat\":\"text/x-wiki\",\"contentmodel\":\"wikitext\",\"*\":\"#redirect [[image 
quality]]\"}]}}}}"; 48 | //"{\"query\":{\"normalized\":[{\"from\":\"scroll button\",\"to\":\"Scroll button\"}],\"pages\":{\"-1\":{\"ns\":0,\"title\":\"Scroll button\",\"missing\":\"\"}}}}" 49 | 50 | String jsonFromWikipedia = getTextFromURL("http://en.wikipedia.org/w/api.php?format=json&action=query&titles=" + URLEncoder.encode(string, "UTF-8") + "&prop=revisions&rvprop=content"); 51 | 52 | //http://en.wikipedia.org/w/api.php?format=json&action=query&titles=picture%20quality&prop=revisions&rvprop=content 53 | 54 | ObjectMapper mapper = new ObjectMapper(); 55 | JsonNode rootNode = mapper.readTree(jsonFromWikipedia); 56 | 57 | try 58 | { 59 | JsonNode nameNode = rootNode.get("query").get("pages").elements().next().get("revisions").get(0).get("*"); 60 | String nodeTitle = nameNode.asText().toLowerCase(); 61 | 62 | if (nodeTitle.startsWith("#redirect")) 63 | { 64 | List results = new ArrayList(); 65 | if (debug) 66 | System.out.println(nodeTitle); 67 | String trimmedNodeTitle = nodeTitle.substring(nodeTitle.indexOf("[[") + 2, nodeTitle.indexOf("]]")).replace('_', ' ').trim(); 68 | if (trimmedNodeTitle.endsWith("(disambiguation)")) 69 | trimmedNodeTitle = trimmedNodeTitle.substring(0, trimmedNodeTitle.indexOf("(disambiguation)")).trim(); 70 | 71 | String[] splitBySection = trimmedNodeTitle.split("#"); 72 | for (String nextSection : splitBySection) 73 | { 74 | if (!nextSection.equalsIgnoreCase(string)) 75 | results.add(nextSection); 76 | } 77 | if (!results.isEmpty()) 78 | { 79 | if (printDebug) 80 | System.out.println("Possible paraphrase: " + string + " ==> " + serializeList(results)); 81 | 82 | cache.put(string, results); 83 | return results; 84 | } 85 | 86 | } 87 | 88 | } 89 | catch (Exception e) 90 | { 91 | //do nothing; just a failure 92 | } 93 | 94 | 95 | } catch (Exception e) { 96 | e.printStackTrace(); 97 | } 98 | 99 | cache.put(string, null); 100 | return null; 101 | } 102 | 103 | //almost verbatim from 
http://stackoverflow.com/questions/1485708/how-do-i-do-a-http-get-in-java 104 | private static String getTextFromURL(String urlToRead) { 105 | URL url; 106 | HttpURLConnection conn; 107 | BufferedReader rd; 108 | String line; 109 | String result = ""; 110 | try { 111 | url = new URL(urlToRead); 112 | conn = (HttpURLConnection) url.openConnection(); 113 | conn.setRequestProperty("User-Agent", "CarterThesis/1.0 (Macintosh; U; Intel Mac OS X 10.9; en-CA; rv:1.9.2.2) Gecko/20100316 Firefox/3.6.2"); 114 | conn.setRequestMethod("GET"); 115 | rd = new BufferedReader(new InputStreamReader(conn.getInputStream())); 116 | while ((line = rd.readLine()) != null) { 117 | result += line; 118 | } 119 | rd.close(); 120 | } catch (Exception e) { 121 | e.printStackTrace(); 122 | } 123 | return result; 124 | } 125 | 126 | public static String serializeList(List list) 127 | { 128 | if (list == null) 129 | return null; 130 | 131 | StringBuilder sb = new StringBuilder(); 132 | 133 | boolean first = true; 134 | for (String nextParaphrase : list) 135 | { 136 | if (!first) 137 | sb.append(", "); 138 | first = false; 139 | sb.append(nextParaphrase); 140 | } 141 | 142 | return sb.toString(); 143 | } 144 | 145 | public static void main(String[] args) 146 | { 147 | String[] testPhrases = { 148 | "picture quality", 149 | "set up", 150 | "rechargable battery", // ==> Rechargeable_battery 151 | "auto focus", // ==> Autofocus 152 | "picture quality", // ==> image quality 153 | "movie", // ==> film 154 | "spot metering", // ==> metering mode#spot metering 155 | "dvd player", //should have none, but wikipedia will tend to correct the capitalization 156 | "video format", 157 | "lens cap", // ==> lens cover 158 | "lense", // ==> lens 159 | "photo", // ==> photograph 160 | "white balance", // ==> color balance 161 | "uploading" 162 | }; 163 | 164 | for (String nextTestPhrase : testPhrases) 165 | { 166 | System.out.print(nextTestPhrase); 167 | System.out.print(" -> "); 168 | List paraphrases = 
getParaphrases(nextTestPhrase, false); 169 | if (paraphrases != null) 170 | { 171 | System.out.print(serializeList(paraphrases)); 172 | } 173 | System.out.print("\n"); 174 | 175 | } 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/WordNetResolver.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileNotFoundException; 6 | 7 | import net.didion.jwnl.JWNL; 8 | import net.didion.jwnl.JWNLException; 9 | import net.didion.jwnl.data.IndexWord; 10 | import net.didion.jwnl.data.POS; 11 | import net.didion.jwnl.data.Pointer; 12 | import net.didion.jwnl.data.PointerType; 13 | import net.didion.jwnl.data.Synset; 14 | import net.didion.jwnl.data.Word; 15 | import net.didion.jwnl.dictionary.Dictionary; 16 | 17 | public class WordNetResolver { 18 | 19 | private static final String jwnlRoot ="/Users/" + System.getProperty("user.name") + "/Dropbox/Thesis work/workspace/process-reviews/jwnl14_file_properties.xml"; 20 | 21 | private static boolean initialized = false; 22 | private static Dictionary dict; 23 | 24 | static 25 | { 26 | 27 | try { 28 | JWNL.initialize(new FileInputStream(new File(jwnlRoot))); 29 | 30 | dict = Dictionary.getInstance(); 31 | 32 | initialized = true; 33 | 34 | } catch (FileNotFoundException e) { 35 | // TODO Auto-generated catch block 36 | e.printStackTrace(); 37 | } catch (JWNLException e) { 38 | // TODO Auto-generated catch block 39 | e.printStackTrace(); 40 | } 41 | catch (Throwable t) 42 | { 43 | t.printStackTrace(); 44 | } 45 | 46 | } 47 | 48 | public static synchronized String getAttributeForAdjective(String adj) 49 | { 50 | try 51 | { 52 | if (!initialized) 53 | { 54 | while (!initialized) 55 | { 56 | Thread.sleep(1000); 57 | } 58 | } 59 | IndexWord indexWord = null; 60 | try 61 | { 62 | indexWord = 
dict.lookupIndexWord(POS.ADJECTIVE, adj); 63 | } 64 | catch (java.util.NoSuchElementException e) 65 | { 66 | System.out.println("Could not find adjective " + adj); 67 | e.printStackTrace(); 68 | } 69 | 70 | 71 | if (indexWord == null) 72 | return null; 73 | 74 | Synset firstSense = indexWord.getSense(1); //numbering starts at 1, not 0 75 | 76 | if (firstSense == null) 77 | return null; 78 | 79 | Pointer[] attributePointers = firstSense.getPointers(PointerType.ATTRIBUTE); //should be length 0 or 1 in most cases 80 | 81 | if (attributePointers == null || attributePointers.length == 0) 82 | { 83 | if (indexWord.getSenseCount() == 1) 84 | { 85 | //check for stuff like "low-cost", "low-priced" in synset when it is a fairly well-defined word 86 | for (Word word: firstSense.getWords()) 87 | { 88 | if (word.getLemma().startsWith("low-") || word.getLemma().startsWith("high-")) 89 | { 90 | String tentativeValue = word.getLemma().substring(word.getLemma().indexOf("-") + 1); 91 | 92 | if (tentativeValue.endsWith("d")) 93 | { 94 | //if it's a past participle we want to return "price", not "priced" 95 | IndexWord coreVerb = dict.lookupIndexWord(POS.VERB, tentativeValue); 96 | if (coreVerb != null) 97 | { 98 | Synset firstVerbSense = coreVerb.getSense(1); //numbering starts at 1, not 0 99 | if (firstVerbSense != null) 100 | { 101 | String tentativeReplacementVerb = firstVerbSense.getWords()[0].getLemma(); 102 | if (tentativeReplacementVerb.substring(0, 2).equalsIgnoreCase(tentativeValue.substring(0, 2))) 103 | return tentativeReplacementVerb; 104 | } 105 | } 106 | 107 | } 108 | 109 | return tentativeValue; 110 | } 111 | } 112 | 113 | } 114 | 115 | //fall through 116 | return null; 117 | } 118 | 119 | Synset attributeSynset = attributePointers[0].getTargetSynset(); 120 | 121 | if (attributeSynset == null || attributeSynset.getWordsSize() == 0) 122 | return null; 123 | 124 | return attributeSynset.getWords()[0].getLemma(); 125 | } 126 | catch (Exception e) 127 | { 128 | 
System.out.println("Could not look up " + adj); 129 | e.printStackTrace(); 130 | return null; 131 | } 132 | 133 | } 134 | 135 | 136 | public static boolean isFeatureNearlySynonymous(String word1, String word2) 137 | { 138 | try 139 | { 140 | IndexWord indexWord = dict.lookupIndexWord(POS.NOUN, word1); 141 | 142 | if (indexWord == null) 143 | return false; //TODO: check the reverse? 144 | 145 | for (Synset nextSense : indexWord.getSenses()) 146 | { 147 | for (Word nextWord : nextSense.getWords()) 148 | { 149 | if (nextWord.getLemma().equalsIgnoreCase(word2)) 150 | return true; 151 | } 152 | } 153 | 154 | return false; 155 | } 156 | catch (Exception e) 157 | { 158 | System.out.println("Could not look up " + word1 + " and " + word2); 159 | e.printStackTrace(); 160 | return false; 161 | } 162 | 163 | } 164 | 165 | 166 | public static void main(String[] args) 167 | { 168 | try { 169 | String[] testWords = { 170 | 171 | "small", //size 172 | "large", 173 | "loud", 174 | "bright", 175 | "wide", 176 | "full", 177 | "empty", 178 | "light", //weight 179 | "easy", //ease 180 | "big", //size 181 | "compact", //size 182 | "useful", //none 183 | "affordable", //price **doesn't have attribute, but other words in synset are "low-cost" and "low-priced" 184 | "pricey", 185 | "heavy", //weight 186 | "beautiful", //beauty 187 | "cold", 188 | "razor-sharp", 189 | "wicked", 190 | "fast", 191 | 192 | }; 193 | 194 | for (String testWord : testWords) 195 | { 196 | System.out.println(testWord + " --> " + WordNetResolver.getAttributeForAdjective(testWord)); 197 | } 198 | 199 | System.out.println(WordNetResolver.isFeatureNearlySynonymous("cost", "price")); 200 | System.out.println(WordNetResolver.isFeatureNearlySynonymous("price", "cost")); 201 | } catch (Exception e) { 202 | // TODO Auto-generated catch block 203 | e.printStackTrace(); 204 | } 205 | } 206 | 207 | } 208 | 209 | 210 | /* 211 | * Examples that might be correctable with a better heuristic: 212 | * Found temperature for adjective 
hot while considering heat 213 | * Found volume for adjective loud while considering sound 214 | * Found comfort for adjective comfortable while considering earbud 215 | Found beauty for adjective ugly while considering style 216 | Found difficulty for adjective difficult while considering software 217 | Found comfort for adjective comfortable while considering earbud 218 | Found attractiveness for adjective attractive while considering design 219 | Found comfort for adjective comfortable while considering earpiece 220 | Found clarity for adjective clear while considering sound quality 221 | 222 | 223 | */ 224 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/evaluation/ResultsSummary.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.evaluation; 2 | 3 | import ca.carter.thesis.ml.ModelType; 4 | 5 | public class ResultsSummary { 6 | int numTested; 7 | int falsePositives; 8 | double truePositives; //is double for cases where we have a partial result (i.e., multiple aspect-sentiment pairs in a sentence) 9 | double falseNegatives; 10 | int trueNegatives; 11 | ModelType modelType; 12 | 13 | public ResultsSummary(int numTested, double truePositives, int trueNegatives, int falsePositives, 14 | double falseNegatives, ModelType modelType ) { 15 | super(); 16 | this.numTested = numTested; 17 | this.falsePositives = falsePositives; 18 | this.truePositives = truePositives; 19 | this.falseNegatives = falseNegatives; 20 | this.trueNegatives = trueNegatives; 21 | this.modelType = modelType; 22 | } 23 | 24 | 25 | public ModelType getModelType() { 26 | return modelType; 27 | } 28 | 29 | 30 | public void setModelType(ModelType modelType) { 31 | this.modelType = modelType; 32 | } 33 | 34 | 35 | public int getNumTested() { 36 | return numTested; 37 | } 38 | 39 | 40 | public void setNumTested(int numTested) { 41 | this.numTested = numTested; 42 | } 43 | 44 | 45 | 
public int getFalsePositives() { 46 | return falsePositives; 47 | } 48 | 49 | 50 | public void setFalsePositives(int falsePositives) { 51 | this.falsePositives = falsePositives; 52 | } 53 | 54 | 55 | public double getTruePositives() { 56 | return truePositives; 57 | } 58 | 59 | 60 | public void setTruePositives(double truePositives) { 61 | this.truePositives = truePositives; 62 | } 63 | 64 | 65 | public double getFalseNegatives() { 66 | return falseNegatives; 67 | } 68 | 69 | 70 | public void setFalseNegatives(double falseNegatives) { 71 | this.falseNegatives = falseNegatives; 72 | } 73 | 74 | 75 | public int getTrueNegatives() { 76 | return trueNegatives; 77 | } 78 | 79 | 80 | public void setTrueNegatives(int trueNegatives) { 81 | this.trueNegatives = trueNegatives; 82 | } 83 | 84 | public void printOutResults() 85 | { 86 | System.out.println("True positives: " + truePositives + " (" + (100 * truePositives / numTested) + "%)"); 87 | System.out.println("True negatives: " + trueNegatives + " (" + (100 * trueNegatives / numTested) + "%)"); 88 | System.out.println("False positives: " + falsePositives + " (" + (100 * falsePositives / numTested) + "%)"); 89 | System.out.println("False negatives: " + falseNegatives + " (" + (100 * falseNegatives / numTested) + "%)"); 90 | 91 | 92 | System.out.println("Precision: " + getPrecision()); 93 | System.out.println("Recall/sensitivity: " + getRecall()); 94 | System.out.println("Accuracy: " + getAccuracy()); 95 | System.out.println("Specificity: " + getSpecificity()); 96 | System.out.println("F1 = " + getF1()); 97 | System.out.println("Total tested: " + numTested); 98 | 99 | System.out.println(toThreePlaces(getPrecision()) + " & " + toThreePlaces(getRecall()) + " & " + toThreePlaces(getF1()) + " & " + toThreePlaces(getAccuracy())); 100 | 101 | } 102 | 103 | public static double toThreePlaces(double num) 104 | { 105 | return Math.round(num * 1000) / 1000.0; 106 | } 107 | 108 | public double getPrecision() { 109 | if (truePositives 
== 0) 110 | return 0.0; 111 | 112 | return 1.0 * truePositives / (truePositives + falsePositives); 113 | } 114 | public double getRecall() { 115 | if (truePositives == 0) 116 | return 0.0; 117 | 118 | return 1.0 * truePositives / (truePositives + falseNegatives); 119 | 120 | } 121 | public double getAccuracy() { 122 | return 1.0 * (truePositives + trueNegatives) / (numTested); 123 | 124 | } 125 | public double getSpecificity() { 126 | if (trueNegatives == 0) 127 | return 0.0; 128 | 129 | return 1.0 * trueNegatives / (trueNegatives + falsePositives); 130 | 131 | } 132 | public double getF1() { 133 | if (truePositives == 0) 134 | return 0.0; 135 | 136 | return (2.0 * getPrecision() * getRecall() / (getPrecision() + getRecall()) ); 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/languagemodels/DefaultTokenizer.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.languagemodels; 2 | 3 | import java.util.StringTokenizer; 4 | 5 | public class DefaultTokenizer { 6 | public static StringTokenizer getDefaultTokenizer(String text) 7 | { 8 | return new StringTokenizer(text," \t\n\r\f:,'\""); 9 | } 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/languagemodels/StopWords.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.languagemodels; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class StopWords { 8 | 9 | //not used; stop words may be part of product names or feature types (though presumably would never be sentiment-bearing words) 10 | 11 | //from MySQL 5.x : https://dev.mysql.com/doc/refman/5.7/en/fulltext-stopwords.html 12 | 13 | private static final List stopWordList = new ArrayList(); 14 | 15 | private static final String[] 
stopWords = 16 | { 17 | "a's", 18 | "able", 19 | "about", 20 | "above", 21 | "according", 22 | "accordingly", 23 | "across", 24 | "actually", 25 | "after", 26 | "afterwards", 27 | "again", 28 | "against", 29 | "ain't", 30 | "all", 31 | "allow", 32 | "allows", 33 | "almost", 34 | "alone", 35 | "along", 36 | "already", 37 | "also", 38 | "although", 39 | "always", 40 | "am", 41 | "among", 42 | "amongst", 43 | "an", 44 | "and", 45 | "another", 46 | "any", 47 | "anybody", 48 | "anyhow", 49 | "anyone", 50 | "anything", 51 | "anyway", 52 | "anyways", 53 | "anywhere", 54 | "apart", 55 | "appear", 56 | "appreciate", 57 | "appropriate", 58 | "are", 59 | "aren't", 60 | "around", 61 | "as", 62 | "aside", 63 | "ask", 64 | "asking", 65 | "associated", 66 | "at", 67 | "available", 68 | "away", 69 | "awfully", 70 | "be", 71 | "became", 72 | "because", 73 | "become", 74 | "becomes", 75 | "becoming", 76 | "been", 77 | "before", 78 | "beforehand", 79 | "behind", 80 | "being", 81 | "believe", 82 | "below", 83 | "beside", 84 | "besides", 85 | "best", 86 | "better", 87 | "between", 88 | "beyond", 89 | "both", 90 | "brief", 91 | "but", 92 | "by", 93 | "c'mon", 94 | "c's", 95 | "came", 96 | "can", 97 | "can't", 98 | "cannot", 99 | "cant", 100 | "cause", 101 | "causes", 102 | "certain", 103 | "certainly", 104 | "changes", 105 | "clearly", 106 | "co", 107 | "com", 108 | "come", 109 | "comes", 110 | "concerning", 111 | "consequently", 112 | "consider", 113 | "considering", 114 | "contain", 115 | "containing", 116 | "contains", 117 | "corresponding", 118 | "could", 119 | "couldn't", 120 | "course", 121 | "currently", 122 | "definitely", 123 | "described", 124 | "despite", 125 | "did", 126 | "didn't", 127 | "different", 128 | "do", 129 | "does", 130 | "doesn't", 131 | "doing", 132 | "don't", 133 | "done", 134 | "down", 135 | "downwards", 136 | "during", 137 | "each", 138 | "edu", 139 | "eg", 140 | "eight", 141 | "either", 142 | "else", 143 | "elsewhere", 144 | "enough", 145 | "entirely", 146 | 
"especially", 147 | "et", 148 | "etc", 149 | "even", 150 | "ever", 151 | "every", 152 | "everybody", 153 | "everyone", 154 | "everything", 155 | "everywhere", 156 | "ex", 157 | "exactly", 158 | "example", 159 | "except", 160 | "far", 161 | "few", 162 | "fifth", 163 | "first", 164 | "five", 165 | "followed", 166 | "following", 167 | "follows", 168 | "for", 169 | "former", 170 | "formerly", 171 | "forth", 172 | "four", 173 | "from", 174 | "further", 175 | "furthermore", 176 | "get", 177 | "gets", 178 | "getting", 179 | "given", 180 | "gives", 181 | "go", 182 | "goes", 183 | "going", 184 | "gone", 185 | "got", 186 | "gotten", 187 | "greetings", 188 | "had", 189 | "hadn't", 190 | "happens", 191 | "hardly", 192 | "has", 193 | "hasn't", 194 | "have", 195 | "haven't", 196 | "having", 197 | "he", 198 | "he's", 199 | "hello", 200 | "help", 201 | "hence", 202 | "her", 203 | "here", 204 | "here's", 205 | "hereafter", 206 | "hereby", 207 | "herein", 208 | "hereupon", 209 | "hers", 210 | "herself", 211 | "hi", 212 | "him", 213 | "himself", 214 | "his", 215 | "hither", 216 | "hopefully", 217 | "how", 218 | "howbeit", 219 | "however", 220 | "i'd", 221 | "i'll", 222 | "i'm", 223 | "i've", 224 | "ie", 225 | "if", 226 | "ignored", 227 | "immediate", 228 | "in", 229 | "inasmuch", 230 | "inc", 231 | "indeed", 232 | "indicate", 233 | "indicated", 234 | "indicates", 235 | "inner", 236 | "insofar", 237 | "instead", 238 | "into", 239 | "inward", 240 | "is", 241 | "isn't", 242 | "it", 243 | "it'd", 244 | "it'll", 245 | "it's", 246 | "its", 247 | "itself", 248 | "just", 249 | "keep", 250 | "keeps", 251 | "kept", 252 | "know", 253 | "knows", 254 | "known", 255 | "last", 256 | "lately", 257 | "later", 258 | "latter", 259 | "latterly", 260 | "least", 261 | "less", 262 | "lest", 263 | "let", 264 | "let's", 265 | "like", 266 | "liked", 267 | "likely", 268 | "little", 269 | "look", 270 | "looking", 271 | "looks", 272 | "ltd", 273 | "mainly", 274 | "many", 275 | "may", 276 | "maybe", 277 | "me", 
278 | "mean", 279 | "meanwhile", 280 | "merely", 281 | "might", 282 | "more", 283 | "moreover", 284 | "most", 285 | "mostly", 286 | "much", 287 | "must", 288 | "my", 289 | "myself", 290 | "name", 291 | "namely", 292 | "nd", 293 | "near", 294 | "nearly", 295 | "necessary", 296 | "need", 297 | "needs", 298 | "neither", 299 | "never", 300 | "nevertheless", 301 | "new", 302 | "next", 303 | "nine", 304 | "no", 305 | "nobody", 306 | "non", 307 | "none", 308 | "noone", 309 | "nor", 310 | "normally", 311 | "not", 312 | "nothing", 313 | "novel", 314 | "now", 315 | "nowhere", 316 | "obviously", 317 | "of", 318 | "off", 319 | "often", 320 | "oh", 321 | "ok", 322 | "okay", 323 | "old", 324 | "on", 325 | "once", 326 | "one", 327 | "ones", 328 | "only", 329 | "onto", 330 | "or", 331 | "other", 332 | "others", 333 | "otherwise", 334 | "ought", 335 | "our", 336 | "ours", 337 | "ourselves", 338 | "out", 339 | "outside", 340 | "over", 341 | "overall", 342 | "own", 343 | "particular", 344 | "particularly", 345 | "per", 346 | "perhaps", 347 | "placed", 348 | "please", 349 | "plus", 350 | "possible", 351 | "presumably", 352 | "probably", 353 | "provides", 354 | "que", 355 | "quite", 356 | "qv", 357 | "rather", 358 | "rd", 359 | "re", 360 | "really", 361 | "reasonably", 362 | "regarding", 363 | "regardless", 364 | "regards", 365 | "relatively", 366 | "respectively", 367 | "right", 368 | "said", 369 | "same", 370 | "saw", 371 | "say", 372 | "saying", 373 | "says", 374 | "second", 375 | "secondly", 376 | "see", 377 | "seeing", 378 | "seem", 379 | "seemed", 380 | "seeming", 381 | "seems", 382 | "seen", 383 | "self", 384 | "selves", 385 | "sensible", 386 | "sent", 387 | "serious", 388 | "seriously", 389 | "seven", 390 | "several", 391 | "shall", 392 | "she", 393 | "should", 394 | "shouldn't", 395 | "since", 396 | "six", 397 | "so", 398 | "some", 399 | "somebody", 400 | "somehow", 401 | "someone", 402 | "something", 403 | "sometime", 404 | "sometimes", 405 | "somewhat", 406 | "somewhere", 
407 | "soon", 408 | "sorry", 409 | "specified", 410 | "specify", 411 | "specifying", 412 | "still", 413 | "sub", 414 | "such", 415 | "sup", 416 | "sure", 417 | "t's", 418 | "take", 419 | "taken", 420 | "tell", 421 | "tends", 422 | "th", 423 | "than", 424 | "thank", 425 | "thanks", 426 | "thanx", 427 | "that", 428 | "that's", 429 | "thats", 430 | "the", 431 | "their", 432 | "theirs", 433 | "them", 434 | "themselves", 435 | "then", 436 | "thence", 437 | "there", 438 | "there's", 439 | "thereafter", 440 | "thereby", 441 | "therefore", 442 | "therein", 443 | "theres", 444 | "thereupon", 445 | "these", 446 | "they", 447 | "they'd", 448 | "they'll", 449 | "they're", 450 | "they've", 451 | "think", 452 | "third", 453 | "this", 454 | "thorough", 455 | "thoroughly", 456 | "those", 457 | "though", 458 | "three", 459 | "through", 460 | "throughout", 461 | "thru", 462 | "thus", 463 | "to", 464 | "together", 465 | "too", 466 | "took", 467 | "toward", 468 | "towards", 469 | "tried", 470 | "tries", 471 | "truly", 472 | "try", 473 | "trying", 474 | "twice", 475 | "two", 476 | "un", 477 | "under", 478 | "unfortunately", 479 | "unless", 480 | "unlikely", 481 | "until", 482 | "unto", 483 | "up", 484 | "upon", 485 | "us", 486 | "use", 487 | "used", 488 | "useful", 489 | "uses", 490 | "using", 491 | "usually", 492 | "value", 493 | "various", 494 | "very", 495 | "via", 496 | "viz", 497 | "vs", 498 | "want", 499 | "wants", 500 | "was", 501 | "wasn't", 502 | "way", 503 | "we", 504 | "we'd", 505 | "we'll", 506 | "we're", 507 | "we've", 508 | "welcome", 509 | "well", 510 | "went", 511 | "were", 512 | "weren't", 513 | "what", 514 | "what's", 515 | "whatever", 516 | "when", 517 | "whence", 518 | "whenever", 519 | "where", 520 | "where's", 521 | "whereafter", 522 | "whereas", 523 | "whereby", 524 | "wherein", 525 | "whereupon", 526 | "wherever", 527 | "whether", 528 | "which", 529 | "while", 530 | "whither", 531 | "who", 532 | "who's", 533 | "whoever", 534 | "whole", 535 | "whom", 536 | 
"whose", 537 | "why", 538 | "will", 539 | "willing", 540 | "wish", 541 | "with", 542 | "within", 543 | "without", 544 | "won't", 545 | "wonder", 546 | "would", 547 | "wouldn't", 548 | "yes", 549 | "yet", 550 | "you", 551 | "you'd", 552 | "you'll", 553 | "you're", 554 | "you've", 555 | "your", 556 | "yours", 557 | "yourself", 558 | "yourselves", 559 | "zero", 560 | }; 561 | 562 | static { 563 | for (String nextStopWord : stopWords) 564 | { 565 | stopWordList.add(nextStopWord); 566 | } 567 | Collections.sort(stopWordList); 568 | } 569 | 570 | public static boolean isStopWord(String token) 571 | { 572 | if (token == null) 573 | return true; 574 | else 575 | { 576 | int pos = Collections.binarySearch(stopWordList, token.toLowerCase()); 577 | return pos >= 0; 578 | } 579 | } 580 | } 581 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/BinaryPrediction.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | import java.util.Map; 4 | 5 | public class BinaryPrediction { 6 | private double probability; 7 | private boolean classNumber; 8 | private Map classProbabilities; 9 | 10 | 11 | public BinaryPrediction(boolean classNumber, double probability, 12 | Map classProbabilities) { 13 | super(); 14 | this.classNumber = classNumber; 15 | this.probability = probability; 16 | this.classProbabilities = classProbabilities; 17 | } 18 | public BinaryPrediction(boolean classNumber, double probability) { 19 | super(); 20 | this.probability = probability; 21 | this.classNumber = classNumber; 22 | } 23 | public double getProbability() { 24 | return probability; 25 | } 26 | public void setProbability(double probability) { 27 | this.probability = probability; 28 | } 29 | public boolean getClassNumber() { 30 | return classNumber; 31 | } 32 | public void setClassNumber(boolean classNumber) { 33 | this.classNumber = classNumber; 34 | } 35 | public Map 
getClassProbabilities() { 36 | return classProbabilities; 37 | } 38 | public void setClassProbabilities(Map classProbabilities) { 39 | this.classProbabilities = classProbabilities; 40 | } 41 | @Override 42 | public String toString() { 43 | return classNumber + ", probability " + probability + " " + classProbabilities; 44 | } 45 | 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/ClassWeighting.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | public enum ClassWeighting { 4 | EQUAL, 5 | OVERSIZENEG, 6 | UNDERSIZEPOS, 7 | OVERSIZEPOS, 8 | UNDERSIZENEG 9 | } 10 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/FeatureDistance.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | public enum FeatureDistance { 4 | SELF ( 0), 5 | BEFORE (-1), //also, above 6 | AFTER ( 1), 7 | 8 | //for neighbours only 9 | PLUSONE (1), 10 | PLUSTWO (2), 11 | PLUSTHREE (3), 12 | 13 | //for neighbours and parental lineage 14 | MINUSONE (-1), 15 | MINUSTWO (-2), 16 | MINUSTHREE (-3), 17 | 18 | //for parental lineage only 19 | MINUSFOUR (-4), 20 | MINUSFIVE (-5), 21 | MINUSSIX (-6), 22 | MINUSSEVEN (-7), 23 | MINUSEIGHT (-8), 24 | MINUSNINE (-9), 25 | MINUSTEN (-10), 26 | MINUSMORE (-11); 27 | 28 | private final int numericInterpretation; 29 | 30 | FeatureDistance(int numericInterpretation) 31 | { 32 | this.numericInterpretation = numericInterpretation; 33 | } 34 | public int getNumericInterpretation() 35 | { 36 | return this.numericInterpretation; 37 | } 38 | public boolean canBeGeneralized() 39 | { 40 | switch(this) 41 | { 42 | case MINUSONE: 43 | case MINUSTWO: 44 | case MINUSTHREE: 45 | case MINUSFOUR: 46 | case MINUSFIVE: 47 | case MINUSSIX: 48 | case MINUSMORE: 49 | case PLUSONE: 50 | case PLUSTWO: 51 | 
/**
 * Registry that assigns each distinct feature name (token) a stable integer
 * index: the position at which the name was first added.
 *
 * Lookups use a linear scan of the list, so cost grows with the number of
 * registered features.
 *
 * Legacy bag-of-words feature extraction (getFeaturesForTriple, addFeature,
 * stripTrailingPunctuation, getCertainPrediction) used to be kept here as
 * commented-out code referring to types that no longer exist; recover it from
 * version control if it is ever needed.
 */
public class FeatureRepository {

	//element type must be String: getNameOfFeature returns list elements as String
	protected List<String> masterTokenList = new ArrayList<String>();

	public FeatureRepository()
	{
	}

	/**
	 * Returns the index of a feature name, optionally registering it first.
	 * Useful for both features and classes.
	 *
	 * @param token      feature name to look up
	 * @param addMissing when true, an unknown name is appended and its new index
	 *                   returned; when true the result is never negative
	 * @return the name's index, or -1 if it is unknown and addMissing is false
	 */
	protected int getNumberInList(String token, boolean addMissing)
	{
		int featureNumber = masterTokenList.indexOf(token);
		if (addMissing && featureNumber < 0)
		{
			masterTokenList.add(token);
			featureNumber = masterTokenList.size() - 1;
		}

		return featureNumber;
	}

	/** @return how many distinct feature names have been registered */
	public int getNumberOfFeatures()
	{
		return masterTokenList.size();
	}

	/**
	 * @param key index previously returned by getNumberInList
	 * @return the feature name stored at that index
	 * @throws IndexOutOfBoundsException if key is not a registered index
	 */
	public String getNameOfFeature(int key)
	{
		return masterTokenList.get(key);
	}

}
/**
 * Result of classifying one item: the winning class number, the probability
 * attached to it, and (optionally) the per-class probability table.
 */
public class Prediction {
	private int classNumber;
	private double probability;
	// NOTE(review): type arguments are not visible in this view; presumably class number -> probability. Confirm before parameterizing.
	private Map classProbabilities;

	/** Creates a prediction without a per-class probability table (it stays null). */
	public Prediction(int classNumber, double probability) {
		this(classNumber, probability, null);
	}

	/** Creates a prediction carrying the full per-class probability table. */
	public Prediction(int classNumber, double probability,
			Map classProbabilities) {
		super();
		this.probability = probability;
		this.classProbabilities = classProbabilities;
		this.classNumber = classNumber;
	}

	public int getClassNumber() {
		return this.classNumber;
	}

	public void setClassNumber(int classNumber) {
		this.classNumber = classNumber;
	}

	public double getProbability() {
		return this.probability;
	}

	public void setProbability(double probability) {
		this.probability = probability;
	}

	public Map getClassProbabilities() {
		return this.classProbabilities;
	}

	public void setClassProbabilities(Map classProbabilities) {
		this.classProbabilities = classProbabilities;
	}

	/** Renders as "classNumber, probability p table", with p limited to three decimal places. */
	@Override
	public String toString() {
		DecimalFormat threePlaces = new DecimalFormat("#.###");
		StringBuilder text = new StringBuilder();
		text.append(classNumber);
		text.append(", probability ");
		text.append(threePlaces.format(probability));
		text.append(" ");
		text.append(classProbabilities);
		return text.toString();
	}

}
setTokenWithContext(TokenWithContext tokenWithContext) { 21 | this.tokenWithContext = tokenWithContext; 22 | } 23 | public Prediction getPrediction() { 24 | return prediction; 25 | } 26 | public void setPrediction(Prediction prediction) { 27 | this.prediction = prediction; 28 | } 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/SVMTokenModel.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | import java.io.IOException; 4 | import java.io.Writer; 5 | import java.lang.reflect.InvocationTargetException; 6 | import java.lang.reflect.Method; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Map.Entry; 12 | import java.util.TreeMap; 13 | import java.util.Vector; 14 | 15 | import ca.carter.thesis.languagemodels.StopWords; 16 | import ca.carter.thesis.model.*; 17 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure; 18 | import ca.carter.thesis.model.phrasetree.PartOfSpeech; 19 | import libsvm.svm; 20 | import libsvm.svm_model; 21 | import libsvm.svm_node; 22 | import libsvm.svm_parameter; 23 | import libsvm.svm_print_interface; 24 | import libsvm.svm_problem; 25 | 26 | /****** 27 | * 28 | * @author davecarter 29 | * 30 | * Co-training enabled model of a vector space split into two views 31 | * This is used as the basis for both the sentiment/opinion word classifier and the product aspect classifier 32 | * 33 | * The two views are lexical (including part of speech, lemma, etc.) 
and syntactic, respectively 34 | * 35 | */ 36 | 37 | public abstract class SVMTokenModel { 38 | 39 | protected static final boolean useViews = true; 40 | public static final boolean useOnlyOneView = false; 41 | public static final Views useOnlyOneViewWhichView = Views.BAGOFWORDS; 42 | 43 | private final int numberOfViews = (useViews && !useOnlyOneView ? 2 : 1); 44 | private svm_problem[] problemViews; 45 | private svm_parameter[] parametersViews; 46 | private svm_model[] svmModelViews = new svm_model[numberOfViews];; 47 | private Task task; 48 | 49 | private final ClassWeighting classWeighting; 50 | public static final int kernel = svm_parameter.RBF; 51 | 52 | protected final Double specifiedC; 53 | protected final Double specifiedGamma; 54 | protected final Double specifiedEpsilon; 55 | 56 | List tokens; 57 | 58 | FeatureRepository featureRepository = new FeatureRepository(); 59 | 60 | private static final boolean omitStopWords = false; 61 | 62 | private static final boolean debugFeatureNames = false; 63 |
	/**
	 * Stores the configuration, registers every training token with the feature repository
	 * and then calls {@link #retrain(Writer[])}.
	 *
	 * NOTE(review): retrain is public and non-final, and it calls abstract methods of this
	 * class, so subclass code runs before the subclass constructor body has executed.
	 *
	 * @param task stored in the task field; not otherwise used in this constructor
	 * @param tokens training tokens; the list is kept by reference (addTokenForNextTraining appends to it).
	 *               When null, an error is printed and the constructor returns without training.
	 *               NOTE(review): the element type appears to be TokenWithContext (see the loop below);
	 *               the generic parameter is not visible in this copy of the source.
	 * @param writer passed straight to retrain; when non-null, retrain writes the training data to
	 *               these writers and returns without building a classifier
	 * @param classWeighting class weighting scheme; null is treated as ClassWeighting.EQUAL
	 * @param c stored in specifiedC (protected, available to subclasses); may be null
	 * @param gamma stored in specifiedGamma (protected, available to subclasses); may be null
	 * @param epsilon stored in specifiedEpsilon; when non-null, retrain uses it instead of getEpsilon()
	 */
	public SVMTokenModel(Task task, List tokens, Writer[] writer, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) {

		this.task = task;
		this.tokens = tokens;
		//a missing weighting scheme falls back to equal class weights
		this.classWeighting = (classWeighting == null ? ClassWeighting.EQUAL : classWeighting);

		this.specifiedC = c;
		this.specifiedGamma = gamma;
		this.specifiedEpsilon = epsilon;

		//all final fields are assigned above, so bailing out here leaves a constructed but untrained model
		if (tokens == null)
		{
			System.err.println("No tokens present. Can not create training model.");
			return;
		}

		//writer != null ----- this means that, as we're building the model, we should also record it to a text file for, say, parameter estimation using an external tool

		//build master features list
		for (TokenWithContext nextTrainingToken : tokens)
		{
			buildFeaturesForToken(nextTrainingToken);
		}



		retrain(writer);
	}
92 | 93 | public void retrain(Writer[] writer) 94 | { 95 | boolean outputToFile = (writer != null); 96 | 97 | svm.svm_set_print_string_function(svm_print_null); 98 | 99 | List> xClassesViews = new ArrayList>(); 100 | List> yFeaturesViews = new ArrayList>(); 101 | 102 | for (int viewNum = 0 ; viewNum < numberOfViews; viewNum++) 103 | { 104 | xClassesViews.add(new Vector()); 105 | yFeaturesViews.add(new Vector()); 106 | } 107 | 108 | int max_index[] = {0 , 0}; //todo: this is views-based; backfill single-view case 109 | 110 | int numInPositiveClass = 0; 111 | int numInNegativeClass = 0; 112 | 113 | for (TokenWithContext nextTrainingToken : tokens) 114 | { 115 | if (omitStopWords && StopWords.isStopWord(nextTrainingToken.getToken())) 116 | continue; 117 | 118 | try 119 | { 120 | //if we added this token in the cotraining phase, it will have a predicted class that we should use instead of the ground truth class, which would be unknown in a semi-supervised case 121 | double classNumber; 122 | 123 | if (nextTrainingToken.getPredictedClass() != null) 124 | { 125 | classNumber = (nextTrainingToken.getPredictedModel() == this.getModelType() ?
nextTrainingToken.getPredictedClass() : 0.0); 126 | } 127 | else 128 | classNumber = getClassForToken(nextTrainingToken); 129 | 130 | //debugging code only 131 | //if (classNumber != 0) 132 | // System.out.println("Positive token for " + this.getModelType() + " " + getFormattedTokenContext(nextTrainingToken) + "\n" + nextTrainingToken.toString() ); 133 | 134 | if (classNumber != 0) 135 | numInPositiveClass++; 136 | else 137 | numInNegativeClass++; 138 | 139 | if (outputToFile) 140 | { 141 | for (Writer nextWriter : writer) 142 | { 143 | nextWriter.write((int) classNumber + " "); 144 | } 145 | } 146 | 147 | for (int viewNum = 0 ; viewNum < numberOfViews; viewNum++) 148 | { 149 | Map featureMap = getFeaturesForToken(nextTrainingToken, viewNum); 150 | //System.out.println(nextTrainingToken.getToken() + " " + featureMap); 151 | 152 | if (featureMap == null) 153 | continue; 154 | 155 | //build into SVM model 156 | int m = featureMap.size(); 157 | svm_node[] x = new svm_node[m]; 158 | int j = 0; 159 | 160 | for (Entry nextFeature : featureMap.entrySet()) 161 | { 162 | x[j] = new svm_node(); 163 | x[j].index = nextFeature.getKey(); 164 | x[j].value = nextFeature.getValue(); 165 | 166 | j++; 167 | 168 | if (outputToFile) 169 | writer[viewNum].write(nextFeature.getKey() + ":" + nextFeature.getValue() + " "); 170 | } 171 | 172 | if (outputToFile) 173 | writer[viewNum].write("\n"); 174 | 175 | if(m>0) max_index[viewNum] = Math.max(max_index[viewNum], x[m-1].index); 176 | 177 | xClassesViews.get(viewNum).addElement(classNumber); 178 | yFeaturesViews.get(viewNum).addElement(x); 179 | } 180 | } 181 | catch (IOException e) 182 | { 183 | 184 | } 185 | } 186 | 187 | try { 188 | if (outputToFile) 189 | { 190 | for (Writer nextWriter : writer) 191 | { 192 | nextWriter.close(); 193 | } 194 | System.out.println("Done writing " + this.getName() + " model files. 
Will not build classifier model."); 195 | return; 196 | } 197 | } catch (IOException e) { 198 | // TODO Auto-generated catch block 199 | e.printStackTrace(); 200 | } 201 | 202 | //System.out.println( xClasses ); 203 | //System.out.println( yFeatures ); 204 | 205 | problemViews = new svm_problem[numberOfViews]; 206 | 207 | for (int viewNum = 0; viewNum < numberOfViews; viewNum++) 208 | { 209 | //System.out.println("View " + viewNum); 210 | 211 | problemViews[viewNum] = new svm_problem(); 212 | 213 | problemViews[viewNum].l = xClassesViews.get(viewNum).size(); 214 | problemViews[viewNum].x = new svm_node[problemViews[viewNum].l][]; 215 | for(int i=0;i 0) 246 | parameters.gamma = 1.0/max_index; 247 | */ 248 | 249 | parameters.probability = 1; 250 | parameters.cache_size = 250; 251 | 252 | if (specifiedEpsilon != null) 253 | parameters.eps = specifiedEpsilon; 254 | else 255 | parameters.eps = getEpsilon(); 256 | 257 | parameters.kernel_type = kernel; 258 | 259 | //determined classififier-by-classifier by grid search on initial 20% training data, averaged over the five data sets 260 | //parameters.gamma = 1 / (numInPositiveClass + numInNegativeClass); //getGamma(); ...since number of data points changes, so should gamma; and experiment to do parameter estimation found that ideal gamma is usually around 1 / number of data points 261 | parameters.C = getC(viewNum); 262 | parameters.gamma = getGamma(viewNum); 263 | 264 | //weight the C of the classes to accommodate the imbalance; re. 
Asa Ben-Hur and Jason Weston "A User's Guide to Support Vector Machines", section 7 265 | if (classWeighting != ClassWeighting.EQUAL) 266 | { 267 | parameters.weight_label = new int[2]; 268 | parameters.weight = new double[2]; //NOTE(review): nr_weight is not assigned in the visible code; libsvm only applies the first nr_weight entries, so confirm it is set to 2 269 | parameters.weight_label[0] = 0; //negative class 270 | parameters.weight_label[1] = 1; //positive class 271 | //the positive:negative ratio below is computed in floating point; int / int would truncate (normally to 0), giving weights of 0 or infinity 272 | switch (classWeighting) 273 | { 274 | case OVERSIZENEG: 275 | parameters.weight[0] = getC(viewNum) * numInPositiveClass / numInNegativeClass; 276 | parameters.weight[1] = getC(viewNum); 277 | break; 278 | case UNDERSIZENEG: 279 | parameters.weight[0] = getC(viewNum) / ((double) numInPositiveClass / numInNegativeClass); 280 | parameters.weight[1] = getC(viewNum); 281 | break; 282 | case OVERSIZEPOS: 283 | parameters.weight[0] = getC(viewNum); 284 | parameters.weight[1] = getC(viewNum) * ((double) numInPositiveClass / numInNegativeClass); 285 | break; 286 | case UNDERSIZEPOS: 287 | parameters.weight[0] = getC(viewNum); 288 | parameters.weight[1] = getC(viewNum) / ((double) numInPositiveClass / numInNegativeClass); 289 | break; 290 | default: 291 | } 292 | } 293 | 294 | parametersViews[viewNum] = parameters; 295 | 296 | String error_msg = svm.svm_check_parameter(problemViews[viewNum],parametersViews[viewNum]); 297 | 298 | /* 299 | * default gamma given no tuning information 300 | parameters.C = 1; 301 | if(parameters.gamma == 0 && max_index > 0) 302 | parameters.gamma = 1.0/max_index; 303 | */ 304 | 305 | 306 | /* 307 | System.out.println("svm_type " + parameters.svm_type); 308 | System.out.println("kernel_type " + parameters.kernel_type); //0 is linear (special case of RBF), 2 is RBF 309 | System.out.println("degree " + parameters.degree); 310 | System.out.println("gamma " + parameters.gamma); 311 | System.out.println("coef0 " + parameters.coef0); 312 | System.out.println("nu " + parameters.nu); 313 | System.out.println("C " + parameters.C); 314 | System.out.println("eps " + parameters.eps); 315 | System.out.println("p " + parameters.p); 316 |
System.out.println("shrinking " + parameters.shrinking); 317 | System.out.println("probability " + parameters.probability); 318 | System.out.println("nr_weight " + parameters.nr_weight); 319 | System.out.println("weight_label " + parameters.weight_label); 320 | System.out.println("weight " + parameters.weight); 321 | */ 322 | 323 | if(error_msg != null) 324 | { 325 | System.err.print("ERROR: "+error_msg+"\n"); 326 | //System.exit(1); 327 | } 328 | 329 | svmModelViews[viewNum] = svm.svm_train(problemViews[viewNum],parametersViews[viewNum]); 330 | } 331 | 332 | } 333 | 334 |
	/**
	 * @param viewNum index of the view being trained
	 * @return the value retrain assigns to the SVM parameter C for that view (it is also the
	 *         base of the per-class weights when class weighting is enabled)
	 */
	abstract public double getC(int viewNum);

	/**
	 * @param viewNum index of the view being trained
	 * @return the value retrain assigns to the SVM parameter gamma for that view
	 */
	abstract public double getGamma(int viewNum);

	/** @return the SVM eps value retrain uses when no epsilon was passed to the constructor */
	abstract public double getEpsilon();

	/**
	 * @return the type of this model; retrain compares it with a token's predicted model to decide
	 *         whether that token's predicted class applies to this model (otherwise class 0 is used)
	 */
	abstract public ModelType getModelType();

	/**
	 * Registers the token with the feature repository and appends it to the training token list.
	 * The model itself is not rebuilt here; the token is picked up the next time retrain runs.
	 *
	 * @param token token to include in subsequent training
	 */
	public void addTokenForNextTraining(TokenWithContext token)
	{
		buildFeaturesForToken(token);
		tokens.add(token);
	}

	/**
	 * @param token a training token
	 * @return the class label retrain uses for tokens that carry no predicted class;
	 *         retrain counts any non-zero value as the positive class
	 */
	abstract public Double getClassForToken(TokenWithContext token);
	//abstract protected Double getExpectedClassNumber(TokenWithContext token);

	/** @return the model name, used in retrain's "Done writing ... model files" message */
	abstract public String getName();

	//looks the token's surface string up in the feature repository; the returned number is discarded
	//NOTE(review): the 'true' flag presumably means "add if absent" - confirm against FeatureRepository
	private void buildFeaturesForToken(TokenWithContext token)
	{
		featureRepository.getNumberInList(token.getToken(), true);
	}
356 | 357 | //get the features for the token bean in a sparse map 358 | private Map getFeaturesForToken(TokenWithContext token, int viewNumber) 359 | { 360 | //by reflection, go get all get____ for Strings and is_____ for booleans 361 | //then process lists, etc.
manually 362 | 363 | //using TreeMap because it means it will be sorted as it is built; needed for flat file output 364 | Map localMap = new TreeMap(); 365 | 366 | // simplest, token only version, is : 367 | // int tokenNumber = featureRepository.getNumberInList(token.getToken(), true); 368 | // localMap.put(tokenNumber, 1.0); 369 | 370 | for (Method nextMethod : TokenWithContext.class.getDeclaredMethods()) 371 | { 372 | try { 373 | String value = null; 374 | int featureNumber; 375 | 376 | //TODO xxx need to arrange this by view number, splitting lexical and syntactic 377 | 378 | if (nextMethod.getName().startsWith("get") && validForView(nextMethod.getName(), viewNumber) ) 379 | { 380 | if (nextMethod.getName().startsWith("getPredicted") || nextMethod.getName().startsWith("getFormatted")) 381 | { 382 | //do nothing; this are not real features 383 | } 384 | else if (nextMethod.getName().equals("getOpinion")) 385 | { 386 | //TODO: do nothing, since it is something we are trying to predict 387 | } 388 | else if (nextMethod.getName().equals("getPartOfSentimentStructure")) 389 | { 390 | //TODO: should do nothing, since these are something we're trying to predict 391 | } 392 | else if (nextMethod.getName().equals("getPositionInSentence")) 393 | { 394 | 395 | // TODO: consider putting this in/taking this out; could be valuable or could be a total red herring 396 | /*double positionValue = (1.0 * (double) token.getPositionInSentence() / 100); 397 | featureNumber = featureRepository.getNumberInList("sentencepos", true); 398 | localMap.put(featureNumber, positionValue);*/ 399 | 400 | } 401 | else if (nextMethod.getReturnType() == String.class) 402 | { 403 | value = (String) nextMethod.invoke(token, null); 404 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true); 405 | if (debugFeatureNames) 406 | System.out.println(nextMethod.getName() + " -> " + value); 407 | localMap.put(featureNumber, 1.0); 408 | } 409 | else if 
(nextMethod.getReturnType() == TokenWithContext.class) 410 | { 411 | TokenWithContext localValue = (TokenWithContext) nextMethod.invoke(token, null); 412 | addFeaturesForNearbyTokenWithContext(localMap, localValue, nextMethod.getName(), null, viewNumber); 413 | } 414 | else if (nextMethod.getReturnType() == PartOfSpeech.class) 415 | { 416 | PartOfSpeech localValue = (PartOfSpeech) nextMethod.invoke(token, null); 417 | if (localValue != null) 418 | value = localValue.toString(); 419 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true); 420 | if (debugFeatureNames) 421 | System.out.println(nextMethod.getName() + " -> " + value); 422 | localMap.put(featureNumber, 1.0); 423 | } 424 | else if (nextMethod.getReturnType() == PartOfSentimentStructure.class) 425 | { 426 | PartOfSentimentStructure localValue = (PartOfSentimentStructure) nextMethod.invoke(token, null); 427 | if (localValue != null) 428 | value = localValue.toString(); 429 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true); 430 | if (debugFeatureNames) 431 | System.out.println(nextMethod.getName() + " -> " + value); 432 | localMap.put(featureNumber, 1.0); 433 | } 434 | else if (nextMethod.getReturnType() == List.class) 435 | { 436 | final List list = (List) nextMethod.invoke(token, null); 437 | 438 | final boolean isListBefore = (nextMethod.getName().contains("Previous") || nextMethod.getName().contains("Parentage")); 439 | final boolean isListAfter = (nextMethod.getName().contains("Next")); 440 | 441 | //if (!isListBefore && !isListAfter) 442 | // System.err.println("Not sure if the list for " + nextMethod.getName() + " is before or after the given token, so it'll be hard to figure out how to assign distance relationships."); 443 | 444 | if (list != null && ! 
list.isEmpty()) 445 | { 446 | int relativePosition = 0; 447 | if (isListBefore) 448 | relativePosition = 0 - list.size(); 449 | else if (isListAfter) 450 | relativePosition = 1; 451 | else 452 | relativePosition = 0; 453 | 454 | //necessary for something like the preceeding token list for the second token in a sentence 455 | Object firstNonNullInList = null; 456 | for (Object nextInList : list) 457 | { 458 | if (nextInList != null) 459 | { 460 | firstNonNullInList = nextInList; 461 | break; 462 | } 463 | } 464 | 465 | //if the list is useable, check the class type of the first item in the list 466 | if (firstNonNullInList == null) 467 | { 468 | //entire list is null, so there's nothing more interesting to report 469 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=allnull", true); 470 | localMap.put(featureNumber, 1.0); 471 | if (debugFeatureNames) 472 | System.out.println(nextMethod.getName() + " -> " + value); 473 | } 474 | else if (firstNonNullInList instanceof TokenWithContext) 475 | { 476 | for (Object nextItem : list) 477 | { 478 | final TokenWithContext nextTokenInList = (TokenWithContext) nextItem; 479 | FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition); 480 | if (featureDistance != FeatureDistance.MINUSMORE) 481 | addFeaturesForNearbyTokenWithContext(localMap, nextTokenInList, nextMethod.getName(), featureDistance, viewNumber); 482 | if (isListBefore || isListAfter) 483 | relativePosition++; 484 | } 485 | } 486 | else if (firstNonNullInList instanceof PartOfSpeech) //&& viewNumber == 0 //despite being POS, is only called from view 1; only in terms of syntactic structure 487 | { 488 | for (Object nextItem : list) 489 | { 490 | final PartOfSpeech nextPOS = (PartOfSpeech) nextItem; 491 | FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition); 492 | if (featureDistance != FeatureDistance.MINUSMORE) 493 | { 494 | featureNumber = 
featureRepository.getNumberInList(nextMethod.getName() + featureDistance + "=" + nextItem.toString(), true); 495 | localMap.put(featureNumber, 1.0); 496 | if (debugFeatureNames) 497 | System.out.println(nextMethod.getName() + " -> " + value); 498 | } 499 | if (isListBefore || isListAfter) 500 | relativePosition++; 501 | } 502 | } 503 | else if (firstNonNullInList instanceof SemanticallyTaggedTokenWithContext) //&& viewNumber == 1 //only called from view 1 504 | { 505 | for (Object nextItem : list) 506 | { 507 | //TODO: might be worthwhile to pull in more TokenWithContext features here? 508 | final SemanticallyTaggedTokenWithContext nextTokenInList = (SemanticallyTaggedTokenWithContext) nextItem; 509 | FeatureDistance featureDistance = FeatureDistance.byDistance(relativePosition); 510 | if (featureDistance != FeatureDistance.MINUSMORE) 511 | { 512 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".Role=" + nextTokenInList.getSemanticRole(), true); 513 | localMap.put(featureNumber, 1.0); 514 | if (debugFeatureNames) 515 | System.out.println(nextMethod.getName() + featureDistance + ".Role=" + nextTokenInList.getSemanticRole() + " -> " + value); 516 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".RoleAndToken=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getLemma(), true); 517 | localMap.put(featureNumber, 1.0); 518 | if (debugFeatureNames) 519 | System.out.println(nextMethod.getName() + featureDistance + ".RoleAndToken=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getLemma() + " -> " + value); 520 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + featureDistance + ".RoleAndPOS=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getPos(), true); 521 | localMap.put(featureNumber, 1.0); 522 | if (debugFeatureNames) 523 | System.out.println(nextMethod.getName() + 
featureDistance + ".RoleAndPOS=" + nextTokenInList.getSemanticRole() + nextTokenInList.getTokenWithContext().getPos() + " -> " + value); 524 | 525 | } 526 | 527 | if (isListBefore || isListAfter) 528 | relativePosition++; 529 | } 530 | } 531 | else 532 | { 533 | System.err.println("Unhandled list type for " + nextMethod.getName() + " / " + firstNonNullInList.getClass().getName()); 534 | } 535 | } 536 | 537 | 538 | } 539 | else 540 | { 541 | //should never get called; if so, we need to implement a new type in here 542 | System.err.println("Unhandled getter : " + nextMethod.getName() + " " + nextMethod.getReturnType().getName()); 543 | } 544 | } 545 | else if (nextMethod.getReturnType() == boolean.class && nextMethod.getName().startsWith("is") && validForView(nextMethod.getName(), viewNumber)) 546 | { 547 | value = ((Boolean) nextMethod.invoke(token, null)).toString(); 548 | featureNumber = featureRepository.getNumberInList(nextMethod.getName() + "=" + value, true); 549 | localMap.put(featureNumber, 1.0); 550 | if (debugFeatureNames) 551 | System.out.println(nextMethod.getName() + " -> " + value); 552 | } 553 | 554 | } catch (IllegalArgumentException e) { 555 | e.printStackTrace(); 556 | } catch (IllegalAccessException e) { 557 | e.printStackTrace(); 558 | } catch (InvocationTargetException e) { 559 | e.printStackTrace(); 560 | } 561 | } 562 | 563 | 564 | 565 | return localMap; 566 | } 567 | 568 | private boolean validForView(String getterName, int viewNum) 569 | { 570 | if (!useViews) 571 | return true; 572 | 573 | final String[] validLexicalMethods = {"getToken", 574 | "getLemma", 575 | "getPos", 576 | "getPreviousTokens", 577 | "getNextTokens", 578 | "isAdjective", 579 | "getAttribute", //WordNet attribute property; i.e., fast -> speed 580 | //"isSemanticOutgoingEdgesIncludeNegation" 581 | }; 582 | final String[] validSyntacticMethods = {"getPositionInSentence", 583 | "getPartOfSentimentStructure", 584 | "getLocalParentage", 585 | "getParentage", 586 | 
"isCoreferenceHead", 587 | "getFlatResolvedCoreference", 588 | "getImmediateParent", 589 | "getPreviousToken", 590 | "getNextToken", 591 | "getSemanticallyTaggedTokensWithContext", 592 | "getSemanticSpecificRole", 593 | "getSemanticGeneralRole", 594 | "getSemanticIncomingEdge", 595 | "isSemanticOutgoingEdgesIncludeNegation", 596 | "isNamedEntity" 597 | }; 598 | final String[] validBagOfWordsMethods = {"getToken"}; 599 | 600 | //TODO: lots of String comparisons here against a fixed list; there should be a way to speed this up 601 | 602 | if (useOnlyOneView) 603 | { 604 | if (useOnlyOneViewWhichView == Views.LEXICAL) //lexical 605 | { 606 | for (String nextMethod : validLexicalMethods) 607 | { 608 | if (nextMethod.equals(getterName)) 609 | return true; 610 | } 611 | } 612 | else if (useOnlyOneViewWhichView == Views.SYNTACTIC) //syntactic 613 | { 614 | for (String nextMethod : validSyntacticMethods) 615 | { 616 | if (nextMethod.equals(getterName)) 617 | return true; 618 | } 619 | } 620 | else if (useOnlyOneViewWhichView == Views.BAGOFWORDS) 621 | { 622 | for (String nextMethod : validBagOfWordsMethods) 623 | { 624 | if (nextMethod.equals(getterName)) 625 | return true; 626 | } 627 | } 628 | } 629 | else 630 | { 631 | if (Views.getViewForNumber(viewNum) == Views.LEXICAL) //lexical 632 | { 633 | for (String nextMethod : validLexicalMethods) 634 | { 635 | if (nextMethod.equals(getterName)) 636 | return true; 637 | } 638 | } 639 | else if (Views.getViewForNumber(viewNum) == Views.SYNTACTIC) //syntactic 640 | { 641 | for (String nextMethod : validSyntacticMethods) 642 | { 643 | if (nextMethod.equals(getterName)) 644 | return true; 645 | } 646 | 647 | } 648 | } 649 | 650 | return false; 651 | 652 | } 653 | 654 | //adapted from LibSVM sample code 655 | @SuppressWarnings("unused") 656 | public Prediction predict(TokenWithContext token) 657 | { 658 | if (omitStopWords && StopWords.isStopWord(token.getToken())) 659 | return new Prediction(0, -1, null); //estimate probability at 
less than zero, so as to not include stop words in subsequent cotraining models 660 | 661 | Prediction bestPredictionSoFar = null; 662 | 663 | for (int viewNum = 0; viewNum < numberOfViews; viewNum++) 664 | { 665 | 666 | Map features = getFeaturesForToken(token, viewNum); 667 | 668 | int nr_class=svm.svm_get_nr_class(svmModelViews[viewNum]); 669 | double[] prob_estimates = null; 670 | int[] labels = null; 671 | 672 | labels=new int[nr_class]; 673 | svm.svm_get_labels(svmModelViews[viewNum],labels); 674 | prob_estimates = new double[nr_class]; 675 | 676 | svm_node[] x = new svm_node[features.size()]; 677 | int i = 0; 678 | for (Entry nextFeature : features.entrySet()) 679 | { 680 | x[i] = new svm_node(); 681 | x[i].index = nextFeature.getKey(); 682 | x[i].value = nextFeature.getValue(); 683 | 684 | i++; 685 | } 686 | 687 | double v = svm.svm_predict_probability(svmModelViews[viewNum],x,prob_estimates); 688 | Map classProbabilities = new HashMap(); 689 | for(int j=0;j bestPredictionSoFar.getProbability()) ) 696 | bestPredictionSoFar = new Prediction((int) v, classProbabilities.get((int) v), classProbabilities); 697 | } 698 | 699 | return bestPredictionSoFar; 700 | } 701 |
	/**
	 * Adds indicator features (value 1.0) describing a token that sits near the token being classified.
	 *
	 * Feature names are built as prefix + accessor name + "=" + accessor value, where the prefix is
	 * the getter name, followed by the distance when one is given. When the distance can be
	 * generalized, the same features are first added for its general case, so both the specific
	 * and the general feature end up in the map.
	 *
	 * With views enabled, view 0 emits the lexical group (token, lemma, POS) and view 1 the
	 * syntactic group (coreference, named entity, semantic roles, negation, immediate parent);
	 * with views disabled both groups are emitted. A null token contributes nothing.
	 *
	 * @param featureMap sparse feature map to add to (feature number to value)
	 * @param tokenWithContext the nearby token; may be null
	 * @param tokenWithContextGetterName name of the accessor that produced the nearby token
	 * @param featureDistance distance of the nearby token, or null when distance is not applicable
	 * @param view index of the view being built
	 */
	public void addFeaturesForNearbyTokenWithContext(Map featureMap, TokenWithContext tokenWithContext, String tokenWithContextGetterName, FeatureDistance featureDistance, int view)
	{
		String featureNamePrefix = tokenWithContextGetterName;

		if (featureDistance != null)
		{
			featureNamePrefix = tokenWithContextGetterName + featureDistance.toString();

			//handle cases where having a word, say, before, is interesting, but the fact that it's one or two or three words before is too specific
			if (featureDistance.canBeGeneralized())
				addFeaturesForNearbyTokenWithContext(featureMap, tokenWithContext, tokenWithContextGetterName, featureDistance.getGeneralCase(), view);
		}

		if (tokenWithContext == null)
			return;

		//in a TokenWithContext case, we want several features: token, lemma, POS, part of sentiment, etc.

		final boolean includeLexical = (!useViews || view == 0);
		final boolean includeSyntactic = (!useViews || view == 1);

		if (includeLexical)
		{
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getToken", tokenWithContext.getToken());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getLemma", tokenWithContext.getLemma());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getPos", tokenWithContext.getPos());
		}

		if (includeSyntactic)
		{
			//featureNumber = featureRepository.getNumberInList(featureNamePrefix + ".getPartOfSentimentStructure" + "=" + tokenWithContext.getPartOfSentimentStructure(), true);
			//featureMap.put(featureNumber, 1.0);
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".isCoreferenceHead", tokenWithContext.isCoreferenceHead());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getFlatResolvedCoreference", tokenWithContext.getFlatResolvedCoreference());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".isNamedEntity", tokenWithContext.isNamedEntity());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getSemanticGeneralRole", tokenWithContext.getSemanticGeneralRole());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getSemanticSpecificRole", tokenWithContext.getSemanticSpecificRole());
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".isSemanticOutgoingEdgesIncludeNegation", tokenWithContext.isSemanticOutgoingEdgesIncludeNegation());
			//POS feature; could be argued that this does not belong in this view; on the other hand, it's taken from the parse tree, making it syntactic, so I'll leave it in for now.
			putNearbyTokenFeature(featureMap, featureNamePrefix, ".getImmediateParent", tokenWithContext.getImmediateParent());
		}
		//TODO: could branch out one more here...get semantic roles two away
	}

	//looks up the feature named featureNamePrefix + accessorSuffix + "=" + value in the feature repository and sets it to 1.0 in featureMap
	private void putNearbyTokenFeature(Map featureMap, String featureNamePrefix, String accessorSuffix, Object value)
	{
		if (debugFeatureNames)
			System.out.println(featureNamePrefix + accessorSuffix);
		final int featureNumber = featureRepository.getNumberInList(featureNamePrefix + accessorSuffix + "=" + value, true);
		featureMap.put(featureNumber, 1.0);
	}
780 | private static svm_print_interface svm_print_null = new svm_print_interface() 781 | { 782 | public void print(String s) {} 783 | }; 784 | 785 | 786 | public static void main(String[] args) 787 | { 788 | System.out.println("Testing SVM model creation."); 789 | 790 | //TODO this works very poorly with skewed classes; i.e., removing a bunch of named entity cases makes things work poorly *unless* we revert to predicting binary without probabilities; so weird 791 | 792 | //Also, LibSVM estimates probabilities using internal 5-fold cross validation, so we need enough data in the test set to allow that to work 793 | 794 | /*TokenWithContext[] trainingTokens = 795 | { 796 | //http://stackoverflow.com/questions/5988574/why-does-svm-predict-and-svm-predict-probability-give-different-results-in-java 797 | 798 | new TokenWithContext("this", null, null, null, null, null, false), 799 | new TokenWithContext("is", null, null, null, null, null, false), 800 | new TokenWithContext("a", null, null, null, null, null, false), 801 | new TokenWithContext("Sentential", null, null, null, null, null, true), 802 | new TokenWithContext("sentence", null, null, null, null, null, false), 803 | new TokenWithContext("with", null, null, null, null, null, false), 804 | new TokenWithContext("Apple", null, null, null, null, null, true), 805 | new TokenWithContext("iPad", null, null, null, null, null, true), 806 | new TokenWithContext("features", null, null, null, null, null, false), 807 | new TokenWithContext("plus", null, null, null, null, null, false), 808 | new TokenWithContext("some", null, null, null,
null, null, false), 809 | new TokenWithContext("extra", null, null, null, null, null, false), 810 | new TokenWithContext("Lucky", null, null, null, null, null, true), 811 | new TokenWithContext("Brand", null, null, null, null, null, true), 812 | new TokenWithContext("words", null, null, null, null, null, false), 813 | new TokenWithContext("thrown", null, null, null, null, null, false), 814 | new TokenWithContext("in", null, null, null, null, null, false), 815 | new TokenWithContext("for", null, null, null, null, null, false), 816 | new TokenWithContext("good", null, null, null, null, null, false), 817 | new TokenWithContext("measure", null, null, null, null, null, false), 818 | 819 | new TokenWithContext("this", null, null, null, null, null, false), 820 | new TokenWithContext("is", null, null, null, null, null, false), 821 | new TokenWithContext("a", null, null, null, null, null, false), 822 | new TokenWithContext("Sentential", null, null, null, null, null, true), 823 | new TokenWithContext("sentence", null, null, null, null, null, false), 824 | new TokenWithContext("with", null, null, null, null, null, false), 825 | new TokenWithContext("Apple", null, null, null, null, null, true), 826 | new TokenWithContext("iPad", null, null, null, null, null, true), 827 | new TokenWithContext("features", null, null, null, null, null, false), 828 | new TokenWithContext("plus", null, null, null, null, null, false), 829 | new TokenWithContext("some", null, null, null, null, null, false), 830 | new TokenWithContext("extra", null, null, null, null, null, false), 831 | new TokenWithContext("Lucky", null, null, null, null, null, true), 832 | new TokenWithContext("Brand", null, null, null, null, null, true), 833 | new TokenWithContext("words", null, null, null, null, null, false), 834 | new TokenWithContext("thrown", null, null, null, null, null, false), 835 | new TokenWithContext("in", null, null, null, null, null, false), 836 | new TokenWithContext("for", null, null, null, null, 
null, false), 837 | new TokenWithContext("good", null, null, null, null, null, false), 838 | 839 | new TokenWithContext("good", "good", PartOfSpeech.ADJP, null, null, null, true), 840 | }; 841 | 842 | SVMTokenModel model = new SVMTokenModelSentiment(Arrays.asList(trainingTokens), null, ClassWeighting.EQUAL, null, null, null); 843 | 844 | */ 845 | 846 | String[] sentences = { 847 | "feature[+2], ##the car 's features are wonderful .", 848 | "feature[+2], ##the car has a wonderful set of features .", 849 | "feature[+2], ##the camera has a wonderful set of features .", 850 | "lens[+2], ##the camera has a great lens .", 851 | "grip[+1], ##the camera has a fine grip .", 852 | "grip[-1], ##i didn't like the grip on the camera .", 853 | 854 | }; 855 | 856 | List trainingTokens = new ArrayList(); 857 | for (String nextSentence : sentences) { 858 | Sentence sentence = new Sentence(new SimpleSentence(nextSentence, true), Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false); 859 | trainingTokens.addAll(sentence.getTokens()); 860 | } 861 | 862 | List models = new ArrayList(); 863 | models.add(new SVMTokenModelSentiment(Task.BINGLIU, trainingTokens, null, ClassWeighting.EQUAL, null, null, null)); 864 | models.add(new SVMTokenModelFeature(Task.BINGLIU, trainingTokens, null, ClassWeighting.EQUAL, null, null, null)); 865 | 866 | Sentence testSentence = new Sentence(new SimpleSentence("shmork[+2], ##the camera has a decent shmork .", true), Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false); 867 | //Sentence testSentence = new Sentence("shmork[+2], ##i did n't grok the camera 's features .", Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline(), null, null, false); 868 | 869 | for (SVMTokenModel model : models) 870 | { 871 | System.out.println("Model is " + model.getName()); 872 | for (TokenWithContext nextToken : trainingTokens) 873 | { 874 | Prediction prediction = 
model.predict(nextToken); 875 | System.out.println((prediction.getClassNumber() == model.getClassForToken(nextToken) ? "Correct " : "Incorrect ") + nextToken.getToken() + " " + prediction); 876 | //System.out.println(nextToken.getToken() + " " + prediction); 877 | } 878 | /* for (TokenWithContext nextToken : testSentence.getTokens()) 879 | { 880 | Prediction prediction = model.predict(nextToken); 881 | System.out.println((prediction.getClassNumber() == model.getClassForToken(nextToken) ? "Correct " : "Incorrect ") + nextToken.getToken() + " " + prediction); 882 | //System.out.println(nextToken.getToken() + " " + prediction); 883 | } 884 | */ System.out.println("-----"); 885 | } 886 | 887 | 888 | for (TokenWithContext nextToken : testSentence.getTokens()) 889 | { 890 | 891 | 892 | Prediction topPrediction = null; 893 | ModelType topPredictionModel = null; 894 | Double nominalClassForTopPrediction = null; 895 | for (SVMTokenModel model : models) 896 | { 897 | Map featuresLexical = model.getFeaturesForToken(nextToken, 0); 898 | Map featuresSyntactic = model.getFeaturesForToken(nextToken, 1); 899 | System.out.println("lexical features: " + featuresLexical.size() + " / syntactic featuers: " + featuresSyntactic.size()); 900 | 901 | Prediction prediction = model.predict(nextToken); 902 | if (prediction.getClassNumber() != 0 && (topPrediction == null || prediction.getProbability() > topPrediction.getProbability())) 903 | { 904 | topPrediction = prediction; 905 | topPredictionModel = model.getModelType(); 906 | nominalClassForTopPrediction = model.getClassForToken(nextToken); 907 | } 908 | 909 | } 910 | 911 | if (topPrediction != null) 912 | System.out.print("[ "); 913 | System.out.print(nextToken.getToken()); 914 | if (topPrediction != null) 915 | System.out.print(" (" + topPredictionModel.toString() + " " + (topPrediction.getClassNumber() == nominalClassForTopPrediction ? 
"Correct" : "Incorrect") + ") ]"); 916 | System.out.print(" "); 917 | } 918 | 919 | } 920 | 921 | 922 | 923 | 924 | 925 | } 926 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/SVMTokenModelFeature.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | import java.io.Writer; 4 | import java.util.List; 5 | 6 | import ca.carter.thesis.model.Task; 7 | import ca.carter.thesis.model.TokenWithContext; 8 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure; 9 | 10 | 11 | public class SVMTokenModelFeature extends SVMTokenModel { 12 | 13 | //up to March 16th, was using 8 and 0.0078125 with reasonably good results 14 | //tried log average of 2.639 and 0.00592 and had terrible results 15 | //tried numeric average of 4.1v and 0.00664 16 | 17 | private static final double svmC = 8; //64; //128; //2; //33.612; //102.8; //55.72; // 2.63; 18 | private static final double svmGamma = 0.0078125; //0.0078125; //6.103515625e-05; //0.0013810679; //0.00012207; //0.0025771639; //0.0048828; //0.0016601563; //0.0001220703125, 0.0001220703125, 0.0078125, 0.0001220703125, 0.0001220703125 19 | private static final double svmEpsilon = 1.0E-3; 20 | 21 | //for single view 22 | // private static final double[] svmCForViews = {13.9288090127, 0.1649384888}; 23 | // private static final double[] svmGammaForViews = {0.0078125, 0.2871745887}; 24 | 25 | //hand-tuned for views 26 | private static final double[] svmCForViews = {1.265625, 39.0625}; // {1.265625, 39.0625} <--bestyet-- {1.265625, 39.0625} <--slightlyworse?-- {2.53125, 19.53125} <--bestyet-- {2.53125, 19.53125} <--disimprovement-- {3.375, 15.625} <--merelyrollingbackto-- {3.375, 15.625} <-?- {4.5, 12.5} <--noticeableimprovement-- {6, 10} <--noticeableimprovement--{10,6} <--disimproves-- {8,8} 27 | //private static final double[] svmGammaForViews = {0.0076293945 * 4.5, 0.0078125 * 3}; // 
{0.0076293945, 0.0078125 * 2} <--bestyet-- {..., ... / 2} <--slightlyworse?-- {0.015258789, 0.0078125} <--bestyet-- {0.0091552732, 0.0078125} <--disimprovement-- {0.0091552732, 0.0078125} <--merelyrollingbackto-- {0.0091552732, 0.0078125} <-?- {0.012207031, 0.0078125} <--noticeableimprovement-- {0.009765625, 0.0078125} <--noticeableimprovement-- {0.0078125, 0.009765625} <--disimproves-- {0.0078125,0.0078125} 28 | private static final double[] svmGammaForViews = {0.0076293945 * 4.5, 0.0078125 * 3}; // {0.0076293945, 0.0078125 * 2} <--bestyet-- {..., ... / 2} <--slightlyworse?-- {0.015258789, 0.0078125} <--bestyet-- {0.0091552732, 0.0078125} <--disimprovement-- {0.0091552732, 0.0078125} <--merelyrollingbackto-- {0.0091552732, 0.0078125} <-?- {0.012207031, 0.0078125} <--noticeableimprovement-- {0.009765625, 0.0078125} <--noticeableimprovement-- {0.0078125, 0.009765625} <--disimproves-- {0.0078125,0.0078125} 29 | 30 | //auto-tuned with 80% 31 | // private static final double[] svmCForViews = {13.929, 2}; 32 | // private static final double[] svmGammaForViews = {0.0059207678, 0.0717936472}; 33 | 34 | //auto-tuned with 20% 35 | // private static final double[] svmCForViews = {13.9288090127, 0.1649384888}; 36 | // private static final double[] svmGammaForViews = {0.0078125, 0.2871745887}; 37 | 38 | 39 | public SVMTokenModelFeature(Task task, List tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) { 40 | super(task, tokens, fileToOutput, classWeighting, c, gamma, epsilon); 41 | } 42 | 43 | @Override 44 | public Double getClassForToken(TokenWithContext token) 45 | { 46 | if (token.getPartOfSentimentStructure() == PartOfSentimentStructure.FEATURE) 47 | return 1.0; 48 | else 49 | return 0.0; 50 | } 51 | 52 | @Override 53 | public String getName() { 54 | return "product feature"; 55 | } 56 | 57 | @Override 58 | public double getC(int viewNum) { 59 | if (this.specifiedC != null) 60 | return this.specifiedC; 61 | else 62 | { 63 
| if (useViews) 64 | return svmCForViews[viewNum]; 65 | else 66 | return svmC; 67 | } 68 | } 69 | 70 | @Override 71 | public double getGamma(int viewNum) { 72 | if (this.specifiedGamma != null) 73 | return this.specifiedGamma; 74 | else 75 | { 76 | if (useViews) 77 | return svmGammaForViews[viewNum]; 78 | else 79 | return svmGamma; 80 | } 81 | } 82 | 83 | @Override 84 | public double getEpsilon() { 85 | if (this.specifiedEpsilon != null) 86 | return specifiedEpsilon; 87 | else 88 | return svmEpsilon; 89 | } 90 | 91 | @Override 92 | public ModelType getModelType() 93 | { 94 | return ModelType.FEATURE; 95 | } 96 | 97 | 98 | } 99 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/SVMTokenModelSentiment.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.Writer; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | import ca.carter.thesis.ProcessReviews; 12 | import ca.carter.thesis.model.Sentiment; 13 | import ca.carter.thesis.model.Task; 14 | import ca.carter.thesis.model.TokenWithContext; 15 | import ca.carter.thesis.model.phrasetree.PartOfSpeech; 16 | 17 | public class SVMTokenModelSentiment extends SVMTokenModel { 18 | 19 | //as of March 16th, working OK with 24.251, 0.0011218, 1.0E-4 20 | 21 | private static final double svmC = 24.251; //41.600; //742.4; //73.51; 22 | private static final double svmGamma = 0.0011218; //0.0013672; //0.000048828125; //3.0517578125e-05 , 0.0001220703125, 3.0517578125e-05, 3.0517578125e-05, 3.0517578125e-05 23 | private static final double svmEpsilon = 1.0E-4; 24 | 25 | //tuned for single view 26 | // private static final double[] svmCForViews = {512, 512}; 27 | // private static final double[] svmGammaForViews = {0.0001220703, 0.0001220703}; 28 | 29 | 
//hand tuned for two views 30 | private static final double[] svmCForViews = {147.998047, 2.87743798}; // {147.998047, 2.87743798} <--bestyet-- {147.998047, 2.87743798} <--slightlyworse-- {73.9990235, 5.75487596} <--better-- {73.9990235, 5.75487596} <--notmuchchange-- {59.1992188, 7.67316795} <--merelyrollingbackgammato-- {59.1992188, 7.67316795} <-?- {47.359375, 10.2308906} <--improvement-- {37.8875, 13.6411875} <--neglibibleimprovementinPneglibibledecreaseinR-- {30.31, 18.1825} <--improves-- {24.251,24,251} 31 | //private static final double[] svmGammaForViews = {0.0011218 * 3, 0.0068469238 * 15}; // {0.0011218 * 2 , 0.0068469238} <--bestyet-- {0.0011218 / 2 , 0.0068469238} <--slightlyworse-- {0.0011218, 0.0034234619} <--better-- {0.0011218, 0.0027387695} <--notmuchchange-- {0.0011218, 0.0027387695} <--merelyrollingbackgammato-- {0.00084135, 0.0027387695} <-?- {0.0011218, 0.0021910156} <--improvement-- {0.0011218, 0.0010516875} (latter should have been 0.00175) <--neglibibleimprovementinPneglibibledecreaseinR-- {0.0011218, 0.00140225} <--improves-- {0.0011218, 0.0011218} 32 | private static final double[] svmGammaForViews = {0.0011218 * 3, 0.0068469238 * 15}; // {0.0011218 * 2 , 0.0068469238} <--bestyet-- {0.0011218 / 2 , 0.0068469238} <--slightlyworse-- {0.0011218, 0.0034234619} <--better-- {0.0011218, 0.0027387695} <--notmuchchange-- {0.0011218, 0.0027387695} <--merelyrollingbackgammato-- {0.00084135, 0.0027387695} <-?- {0.0011218, 0.0021910156} <--improvement-- {0.0011218, 0.0010516875} (latter should have been 0.00175) <--neglibibleimprovementinPneglibibledecreaseinR-- {0.0011218, 0.00140225} <--improves-- {0.0011218, 0.0011218} 33 | 34 | //auto-tuned with 80% 35 | // private static final double[] svmCForViews = {6208.3750564266, 10.5560632862}; 36 | // private static final double[] svmGammaForViews = {0.0001610727, 0.0078125}; 37 | 38 | //auto-tuned with 20% 39 | // private static final double[] svmCForViews = {97.0058602567, 10.5560632862}; 40 | // private 
static final double[] svmGammaForViews = {0.0011217757, 0.0136023526}; 41 | 42 | 43 | private static final List posWords = new ArrayList(); 44 | private static final List negWords = new ArrayList(); 45 | 46 | private static final String positiveWordsFile = "/bingliulexicon/positive-words.txt"; 47 | private static final String negativeWordsFile = "/bingliulexicon/negative-words.txt"; 48 | 49 | private static boolean startedUp = false; 50 | 51 | 52 | public SVMTokenModelSentiment(Task task, List tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) { 53 | super(task, tokens, fileToOutput, classWeighting, c, gamma, epsilon); 54 | } 55 | 56 | private void loadBootstrapping() 57 | { 58 | synchronized(this) 59 | { 60 | if (startedUp) 61 | return; 62 | startedUp = true; 63 | } 64 | 65 | try 66 | { 67 | loadSentimentWordList(new File(ProcessReviews.defaultRootdir + positiveWordsFile), posWords); 68 | loadSentimentWordList(new File(ProcessReviews.defaultRootdir + negativeWordsFile), negWords); 69 | 70 | if (posWords.isEmpty() || negWords.isEmpty()) 71 | { 72 | System.err.println("############ Could not bootstrap sentiment-bearing words ##############"); 73 | System.exit(-1); 74 | } 75 | 76 | //System.out.print("Bootstrapping sentiment-bearing words with list of " + posWords.size() + " positive words and " + negWords.size() + " negative words."); 77 | } 78 | catch (IOException e) 79 | { 80 | System.err.println("Could not open file containing sentiment words for bootstrapping."); 81 | e.printStackTrace(); 82 | System.exit(-1); 83 | } 84 | } 85 | 86 | private void loadSentimentWordList(File file, List list) throws IOException 87 | { 88 | BufferedReader br = new BufferedReader(new FileReader(file)); 89 | String line; 90 | while ((line = br.readLine()) != null) { 91 | if (line.isEmpty() || line.charAt(0) == ';') 92 | { 93 | //do nothing 94 | } 95 | else 96 | { 97 | list.add(line); 98 | } 99 | } 100 | br.close(); 101 | } 102 | 103 | 
//TODO: switch away from multi-class 104 | 105 | 106 | //here, unusually, we bootstrap the negative and positive words 107 | //TODO: need to test negation features 108 | 109 | @Override 110 | public Double getClassForToken(TokenWithContext token) 111 | { 112 | if (!startedUp) 113 | loadBootstrapping(); 114 | 115 | String cleanedToken = token.getToken().toLowerCase(); 116 | String cleanedLemma = token.getLemma().toLowerCase(); 117 | 118 | if (negWords.contains(cleanedToken) || negWords.contains(cleanedLemma)) 119 | return -1.0; 120 | else if (posWords.contains(cleanedToken) || posWords.contains(cleanedLemma)) 121 | return 1.0; 122 | else 123 | return 0.0; 124 | } 125 | 126 | public static Double lookupClassForToken(TokenWithContext token) 127 | { 128 | if (!startedUp) 129 | System.err.println("Cannot look up class for sentiment token. Have not loaded lexicons."); 130 | 131 | String cleanedToken = token.getToken().toLowerCase(); 132 | String cleanedLemma = token.getLemma().toLowerCase(); 133 | 134 | if (negWords.contains(cleanedToken) || negWords.contains(cleanedLemma)) 135 | return -1.0; 136 | else if (posWords.contains(cleanedToken) || posWords.contains(cleanedLemma)) 137 | return 1.0; 138 | else 139 | return 0.0; 140 | 141 | } 142 | 143 | public static Sentiment decodeClassNumber(Double classNumber) 144 | { 145 | if (classNumber == null) 146 | return null; 147 | 148 | switch((int) Math.round(classNumber) ) 149 | { 150 | case -1: 151 | return Sentiment.NEG; 152 | case 0: 153 | return Sentiment.OBJ; 154 | case 1: 155 | return Sentiment.POS; 156 | default: 157 | return null; 158 | } 159 | } 160 | 161 | @Override 162 | public String getName() { 163 | return "sentiment word"; 164 | } 165 | 166 | /* 167 | public static void main(String[] args) 168 | { 169 | SVMTokenModelSentiment model = new SVMTokenModelSentiment(null); 170 | 171 | System.out.println("Negative words: " + model.negWords.size()); 172 | System.out.println("Positive words: " + model.posWords.size()); 173 | 
174 | Sentence testSentence = new Sentence("It has a great big screen but a terrible little tiny shutter release.", Sentence.getDefaultPipeline(), ProductFeatureOpinion.getDefaultPipeline()); 175 | 176 | for (TokenWithContext nextToken : testSentence.getTokens()) 177 | { 178 | System.out.println(nextToken.getToken() + " " + model.getClassForToken(nextToken) ); 179 | } 180 | 181 | } 182 | */ 183 | 184 | @Override 185 | public double getC(int viewNum) { 186 | if (this.specifiedC != null) 187 | return this.specifiedC; 188 | else 189 | { 190 | if (useViews) 191 | return svmCForViews[viewNum]; 192 | else 193 | return svmC; 194 | } 195 | } 196 | 197 | @Override 198 | public double getGamma(int viewNum) { 199 | if (this.specifiedGamma != null) 200 | return this.specifiedGamma; 201 | else 202 | { 203 | if (useViews) 204 | return svmGammaForViews[viewNum]; 205 | else 206 | return svmGamma; 207 | } 208 | } 209 | 210 | @Override 211 | public double getEpsilon() { 212 | if (this.specifiedEpsilon != null) 213 | return specifiedEpsilon; 214 | else 215 | return svmEpsilon; 216 | } 217 | @Override 218 | public ModelType getModelType() 219 | { 220 | return ModelType.SENTIMENT; 221 | } 222 | 223 | 224 | public static void main(String[] args) 225 | { 226 | // public SVMTokenModelSentiment(Task task, List tokens, Writer[] fileToOutput, ClassWeighting classWeighting, Double c, Double gamma, Double epsilon) { 227 | 228 | SVMTokenModelSentiment model = new SVMTokenModelSentiment(Task.BINGLIU, null, null, null, null, null, null); 229 | 230 | System.out.println(model.getClassForToken(new TokenWithContext(1, "horrible", null, PartOfSpeech.JJ, null, null, null, false))); 231 | System.out.println(model.getClassForToken(new TokenWithContext(1, "great", null, PartOfSpeech.JJ, null, null, null, false))); 232 | System.out.println(model.getClassForToken(new TokenWithContext(1, "uneventful", null, PartOfSpeech.JJ, null, null, null, false))); 233 | System.out.println(model.getClassForToken(new 
TokenWithContext(1, "outstanding", null, PartOfSpeech.JJ, null, null, null, false))); 234 | System.out.println(model.getClassForToken(new TokenWithContext(1, "OUTSTANDING", null, PartOfSpeech.JJ, null, null, null, false))); 235 | 236 | 237 | } 238 | 239 | } 240 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/ml/Views.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.ml; 2 | 3 | public enum Views { 4 | LEXICAL, 5 | SYNTACTIC, 6 | BAGOFWORDS; 7 | 8 | public static Integer getNumberForView(Views view) 9 | { 10 | switch (view) 11 | { 12 | case LEXICAL: 13 | return 0; 14 | case SYNTACTIC: 15 | return 1; 16 | case BAGOFWORDS: 17 | return 2; 18 | default: 19 | return null; 20 | } 21 | } 22 | 23 | public static Views getViewForNumber(int number) 24 | { 25 | switch (number) 26 | { 27 | case 0: 28 | return LEXICAL; 29 | case 1: 30 | return SYNTACTIC; 31 | case 2: 32 | return BAGOFWORDS; 33 | default: 34 | return null; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/AspectMatchPolicy.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | public enum AspectMatchPolicy { 4 | PARTIAL, 5 | EXACT 6 | } 7 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/ProductFeatureOpinion.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | import java.util.Properties; 4 | 5 | import edu.stanford.nlp.ling.CoreLabel; 6 | import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; 7 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 8 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 9 | import edu.stanford.nlp.pipeline.Annotation; 10 | import 
edu.stanford.nlp.pipeline.StanfordCoreNLP; 11 | import edu.stanford.nlp.util.CoreMap; 12 | 13 | public class ProductFeatureOpinion { 14 | private String rawFeature; 15 | private String feature; 16 | private String lemmatizedFeature; 17 | private int sentimentValue; 18 | private ProductOpinionFeatureDetail detail; //this is whether the feature is unlisted, etc.; values other than unlisted are probably not useful for my work 19 | //private List paraphrases; 20 | 21 | private Integer from = null; 22 | private Integer to = null; 23 | 24 | 25 | //for parsing features and listed polarities from Bing Liu data 26 | public ProductFeatureOpinion(String feature, StanfordCoreNLP pipeline) { 27 | super(); 28 | 29 | this.rawFeature = feature; 30 | 31 | //t-mobile service[+2][u] 32 | 33 | String parts[] = feature.split("[\\[\\]{}]+"); 34 | int partsLength = parts.length; 35 | 36 | this.feature = parts[0].trim(); 37 | 38 | //this.paraphrases = WikipediaParaphraser.getParaphrases(this.feature, true); 39 | 40 | Annotation document = new Annotation(this.feature); 41 | pipeline.annotate(document); 42 | 43 | // these are all the sentences in this document 44 | // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types 45 | CoreMap sentenceFragment = document.get(SentencesAnnotation.class).get(0); 46 | 47 | 48 | 49 | StringBuilder sb = new StringBuilder(); 50 | boolean first = true; 51 | for (CoreLabel token: sentenceFragment.get(TokensAnnotation.class)) { 52 | String lemma = token.get(LemmaAnnotation.class); 53 | if (first) 54 | first = false; 55 | else 56 | sb.append(" "); 57 | sb.append(lemma); 58 | } 59 | this.lemmatizedFeature = sb.toString(); 60 | 61 | if (parts.length == 1) 62 | { 63 | //this is triggered in a very small number of cases where a given feature is both good and bad in a sentence 64 | //i.e., "look##this thing , while looking pretty cool , is not as sexy as the ipod .", 65 | sentimentValue = 0; 66 | } 67 | else 68 | { 69 | 
switch (parts[1].charAt(0)) 70 | { 71 | case '+': 72 | if (parts[1].length() == 1) 73 | sentimentValue = 1; 74 | else 75 | sentimentValue = Integer.valueOf(parts[1].substring(1)); 76 | break; 77 | case '-': 78 | sentimentValue = -1 * Integer.valueOf(parts[1].substring(1)); 79 | break; 80 | default: 81 | sentimentValue = Integer.valueOf(parts[1].substring(0)); 82 | } 83 | } 84 | 85 | if (partsLength > 2) 86 | detail = ProductOpinionFeatureDetail.byValue(parts[2]); 87 | if (partsLength > 3) 88 | { 89 | if ((parts[2].equalsIgnoreCase("p") && parts[3].equalsIgnoreCase("u")) || (parts[2].equalsIgnoreCase("u") && parts[3].equalsIgnoreCase("p"))) 90 | //resolve redundancy in this case 91 | detail = ProductOpinionFeatureDetail.PRONOUN; 92 | else 93 | System.out.println("More than one product opinion feature detail, which is an unusual and/or conflicting occurrence: " + feature); 94 | } 95 | 96 | } 97 | 98 | //for feeding in aspects from XML where the polarity is already defined 99 | public ProductFeatureOpinion(String feature, String polarity, int from, int to, StanfordCoreNLP pipeline) { 100 | super(); 101 | 102 | this.feature = feature; 103 | this.from = from; 104 | this.to = to; 105 | 106 | Annotation document = new Annotation(this.feature); 107 | pipeline.annotate(document); 108 | 109 | // these are all the sentences in this document 110 | // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types 111 | CoreMap sentenceFragment = document.get(SentencesAnnotation.class).get(0); 112 | 113 | StringBuilder sb = new StringBuilder(); 114 | boolean first = true; 115 | for (CoreLabel token: sentenceFragment.get(TokensAnnotation.class)) { 116 | String lemma = token.get(LemmaAnnotation.class); 117 | if (first) 118 | first = false; 119 | else 120 | sb.append(" "); 121 | sb.append(lemma); 122 | } 123 | this.lemmatizedFeature = sb.toString(); 124 | 125 | if ("positive".equals(polarity)) 126 | sentimentValue = 2; 127 | else if 
("negative".equals(polarity)) 128 | sentimentValue = -2; 129 | else if ("neutral".equals(polarity)) 130 | sentimentValue = 0; 131 | else 132 | { 133 | System.out.println("Could not assign polarity " + polarity); 134 | sentimentValue = 0; 135 | } 136 | } 137 | public String getFeature() { 138 | return feature; 139 | } 140 | public void setFeature(String feature) { 141 | this.feature = feature; 142 | } 143 | public String getLemmatizedFeature() { 144 | return lemmatizedFeature; 145 | } 146 | public void setLemmatizedFeature(String lemmatizedFeature) { 147 | this.lemmatizedFeature = lemmatizedFeature; 148 | } 149 | public int getSentimentValue() { 150 | return sentimentValue; 151 | } 152 | public void setSentimentValue(int sentimentValue) { 153 | this.sentimentValue = sentimentValue; 154 | } 155 | public Sentiment getSentiment() { 156 | if (sentimentValue > 0) 157 | return Sentiment.POS; 158 | else if (sentimentValue < 0) 159 | return Sentiment.NEG; 160 | else 161 | return Sentiment.OBJ; 162 | } 163 | public ProductOpinionFeatureDetail getDetail() { 164 | return detail; 165 | } 166 | public void setDetail(ProductOpinionFeatureDetail detail) { 167 | this.detail = detail; 168 | } 169 | public String getRawFeature() { 170 | return rawFeature; 171 | } 172 | public static StanfordCoreNLP getDefaultPipeline() 173 | { 174 | //System.out.println("Getting default pipeline."); 175 | //TODO: stem it 176 | 177 | Properties props = new Properties(); 178 | props.put("annotators", "tokenize, ssplit, pos, lemma"); 179 | return new StanfordCoreNLP(props); 180 | } 181 | 182 | 183 | 184 | @Override 185 | public String toString() { 186 | return "ProductFeatureOpinion [feature=" + feature 187 | + ", lemmatizedFeature=" + lemmatizedFeature + ", sentimentValue=" 188 | + sentimentValue + (detail != null ? 
", detail=" + detail : ", no extra details") + "]"; 189 | } 190 | public static void main(String[] args) 191 | { 192 | //String testOpinion = "t-mobile service[+2][u]"; 193 | String testOpinion = "feature[+2}, "; 194 | //String testOpinion = "look"; 195 | 196 | ProductFeatureOpinion test = new ProductFeatureOpinion(testOpinion, ProductFeatureOpinion.getDefaultPipeline()); 197 | 198 | System.out.println(test.toString()); 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/ProductOpinionFeatureDetail.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | public enum ProductOpinionFeatureDetail { 4 | 5 | UNLISTED, // ("u"), 6 | PRONOUN, //("p"), 7 | SUGGESTION, //("s"), 8 | COMPARISONCONTRAST, //("cc"), 9 | COMPARISONSAMEBRAND ; //("cs"); 10 | 11 | /* 12 | [u] : feature not appeared in the sentence. 13 | [p] : feature not appeared in the sentence. Pronoun resolution is needed. 14 | [s] : suggestion or recommendation. 15 | [cc]: comparison with a competing product from a different brand. 16 | [cs]: comparison with a competing product from the same brand. 
17 | */ 18 | 19 | //private final String abbrev; 20 | //private ProductOpinionFeatureDetail(String abbrev) { 21 | // this.abbrev = abbrev; 22 | //} 23 | 24 | public static ProductOpinionFeatureDetail byValue(String abbrev) 25 | { 26 | if (abbrev == null | abbrev.isEmpty()) 27 | return null; 28 | 29 | switch (abbrev.charAt(0)) 30 | { 31 | case 'u': 32 | return UNLISTED; 33 | case 'p': 34 | return PRONOUN; 35 | case 's': 36 | return SUGGESTION; 37 | case 'c': 38 | switch (abbrev.charAt(1)) 39 | { 40 | case 'c': 41 | return COMPARISONCONTRAST; 42 | case 's': 43 | return COMPARISONSAMEBRAND; 44 | default: 45 | return null; 46 | } 47 | default: 48 | return null; 49 | } 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/ReconciledFeatureOpinion.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import ca.carter.thesis.ml.SVMTokenModelSentiment; 7 | 8 | public class ReconciledFeatureOpinion { 9 | private List tokensWithFeature = new ArrayList(); 10 | private List tokensWithSentiment = new ArrayList(); 11 | private ProductFeatureOpinion opinion; 12 | private List sentiment = new ArrayList(); 13 | 14 | 15 | public ReconciledFeatureOpinion(ProductFeatureOpinion opinion) { 16 | super(); 17 | this.opinion = opinion; 18 | } 19 | public ProductFeatureOpinion getOpinion() { 20 | return opinion; 21 | } 22 | public void setOpinion(ProductFeatureOpinion opinion) { 23 | this.opinion = opinion; 24 | } 25 | public List getTokensWithFeature() { 26 | return tokensWithFeature; 27 | } 28 | public void setTokensWithFeature(List tokensWithFeature) { 29 | this.tokensWithFeature = tokensWithFeature; 30 | } 31 | public List getTokensWithSentiment() { 32 | return tokensWithSentiment; 33 | } 34 | public void setTokensWithSentiment(List tokensWithSentiment) { 35 | 
this.tokensWithSentiment = tokensWithSentiment; 36 | } 37 | public List getSentiment() { 38 | return sentiment; 39 | } 40 | public void setSentiment(ArrayList sentiment) { 41 | this.sentiment = sentiment; 42 | } 43 | 44 | public void addSentimentToken(TokenWithContext sentiToken) 45 | { 46 | tokensWithSentiment.add(sentiToken); 47 | 48 | //the classifier is very good at deciding whether a word is sentiment-bearing; it is not as good at deciding whether rarely-seen words are positive or negative; so, if it exists in our lexicon, trust that; otherwise, use the prediction; if word is not in lexicon, only use the prediction 49 | Sentiment lexicalizedSentiment = SVMTokenModelSentiment.decodeClassNumber(SVMTokenModelSentiment.lookupClassForToken(sentiToken)); 50 | 51 | Sentiment predictedSentiment = null; 52 | if (lexicalizedSentiment != Sentiment.OBJ) 53 | { 54 | //if (predictedSentiment != lexicalizedSentiment) 55 | // System.out.println("Correcting sentiment for '" + sentiToken.getToken() + "' from " + SVMTokenModelSentiment.decodeClassNumber(sentiToken.getPredictedClass()) + " (predicted) to " + lexicalizedSentiment + " (according to lexicon)."); 56 | 57 | predictedSentiment = lexicalizedSentiment; 58 | } 59 | else 60 | predictedSentiment = SVMTokenModelSentiment.decodeClassNumber(sentiToken.getPredictedClass()); 61 | 62 | 63 | //System.out.println("Predicted " + sentiToken.getPredictedClass()); 64 | 65 | if (sentiToken.isSemanticOutgoingEdgesIncludeNegation() && predictedSentiment == Sentiment.POS) 66 | sentiment.add(Sentiment.NEG); 67 | else if (sentiToken.isSemanticOutgoingEdgesIncludeNegation() && predictedSentiment == Sentiment.NEG) 68 | sentiment.add(Sentiment.POS); 69 | else 70 | sentiment.add(predictedSentiment); 71 | 72 | } 73 | 74 | public boolean isComplete() 75 | { 76 | if (tokensWithFeature == null || tokensWithFeature.isEmpty()) 77 | return false; 78 | if (tokensWithSentiment == null || tokensWithSentiment.isEmpty()) 79 | return false; 80 | return true; 
81 | } 82 | 83 | //use voting among sentiments to see how we did 84 | public boolean isCorrect() 85 | { 86 | //biggest gap among tokens must be no greater than one token (for simplicity of business logic) 87 | if (tokensWithFeature == null || tokensWithFeature.isEmpty()) 88 | return false; 89 | if (tokensWithSentiment == null || tokensWithSentiment.isEmpty()) 90 | return false; 91 | 92 | //check to see if we got the sentiment polarity correct 93 | int numPos = 0; 94 | int numNeg = 0; 95 | boolean sentimentIsCorrect = false; 96 | for (Sentiment nextSentiment : sentiment) 97 | { 98 | switch (nextSentiment) 99 | { 100 | case POS: 101 | numPos++; 102 | break; 103 | case NEG: 104 | numNeg++; 105 | break; 106 | } 107 | } 108 | if (numPos > numNeg && opinion.getSentiment() == Sentiment.POS) 109 | sentimentIsCorrect = true; 110 | else if (numNeg > numPos && opinion.getSentiment() == Sentiment.NEG) 111 | sentimentIsCorrect = true; 112 | else 113 | return false; 114 | 115 | //TODO: maybe assign score based on getting more tokens correct and correctly predicting the feature 116 | if (sentimentIsCorrect) 117 | return true; 118 | 119 | return false; 120 | } 121 | @Override 122 | public String toString() { 123 | StringBuilder sb = new StringBuilder(); 124 | 125 | sb.append("ReconciledFeatureOpinion [tokensWithFeature="); 126 | if (tokensWithFeature == null) 127 | sb.append("null"); 128 | else 129 | { 130 | boolean first = true; 131 | sb.append("["); 132 | for (TokenWithContext token : tokensWithFeature) 133 | { 134 | if (!first) 135 | sb.append(", "); 136 | first = false; 137 | sb.append(token.getToken()); 138 | } 139 | sb.append("]"); 140 | } 141 | sb.append(", tokensWithSentiment="); 142 | if (tokensWithSentiment == null) 143 | sb.append("null"); 144 | else 145 | { 146 | boolean first = true; 147 | sb.append("["); 148 | for (TokenWithContext token : tokensWithSentiment) 149 | { 150 | if (!first) 151 | sb.append(", "); 152 | first = false; 153 | sb.append(token.getToken()); 154 
/**
 * A single product review: its title plus the sentences it consists of.
 * The sentence list stays {@code null} until {@link #setSentences(List)} is called.
 */
public class Review {
    String title;
    List sentences;

    /**
     * @param title title of the review
     */
    public Review(String title) {
        this.title = title;
    }

    /** @return the title of the review */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the sentences of the review, or {@code null} if none were set */
    public List getSentences() {
        return this.sentences;
    }

    /** @param sentences the sentences making up the review */
    public void setSentences(List sentences) {
        this.sentences = sentences;
    }

}
/**
 * Polarity label attached to an opinion or to a predicted token.
 * Declaration order is part of the contract (ordinals may be relied upon), so
 * new constants must only ever be appended.
 */
public enum Sentiment {
    /** Positive polarity. */
    POS,
    /** Neither positive nor negative (presumably "objective" -- confirm with the annotation scheme). */
    OBJ,
    /** Negative polarity. */
    NEG
}
this.sentence = sentence; 13 | this.opinions = null; 14 | this.needsOpinionParsing = needsOpinionParsing; 15 | } 16 | public String getSentence() { 17 | return sentence; 18 | } 19 | public void setSentence(String sentence) { 20 | this.sentence = sentence; 21 | } 22 | public List getOpinions() { 23 | return opinions; 24 | } 25 | public void setOpinions(List opinions) { 26 | this.opinions = opinions; 27 | } 28 | public boolean isNeedsOpinionParsing() { 29 | return needsOpinionParsing; 30 | } 31 | 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/Task.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | public enum Task { 4 | BINGLIU, 5 | SEMEVALTASK4PART1, 6 | SEMEVALTASK4PART2 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/TokenWithContext.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model; 2 | 3 | import java.util.List; 4 | 5 | import ca.carter.thesis.ml.ModelType; 6 | import ca.carter.thesis.model.phrasetree.PartOfSentimentStructure; 7 | import ca.carter.thesis.model.phrasetree.PartOfSpeech; 8 | 9 | public class TokenWithContext implements Cloneable { 10 | private String token; 11 | private String lemma; 12 | private PartOfSpeech pos; 13 | private List previousTokens; 14 | private List nextTokens; 15 | private List parentage; //the clause hierarchy above this token 16 | private boolean isNamedEntity; 17 | private boolean isCoreferenceHead; 18 | private String flatResolvedCoreference; 19 | private String attribute; //if adjective, what attribute the adjective describes, according to WordNet 20 | private int positionInSentence; 21 | 22 | //for inside/outside tagging; i.e., named entity, feature, sentiment, null 23 | private PartOfSentimentStructure 
partOfSentimentStructure; 24 | 25 | //dependency graph features 26 | private String semanticSpecificRole; 27 | private String semanticGeneralRole; 28 | private TokenWithContext semanticIncomingEdge; 29 | private boolean semanticOutgoingEdgesIncludeNegation; 30 | private List semanticallyTaggedTokensWithContext; 31 | 32 | private ProductFeatureOpinion opinion; 33 | 34 | private ModelType predictedModel = null; 35 | private Double predictedClass = null; 36 | 37 | public TokenWithContext(int positionInSentence, String token, String lemma, PartOfSpeech pos, 38 | List previousTokens, 39 | List nextTokens, List parentage, boolean isNamedEntity) { 40 | super(); 41 | this.positionInSentence = positionInSentence; 42 | this.token = token; 43 | this.lemma = lemma; 44 | this.pos = pos; 45 | this.previousTokens = previousTokens; 46 | this.nextTokens = nextTokens; 47 | this.parentage = parentage; 48 | this.isNamedEntity = isNamedEntity; 49 | 50 | } 51 | 52 | 53 | public int getPositionInSentence() { 54 | return positionInSentence; 55 | } 56 | public void setPositionInSentence(int positionInSentence) { 57 | this.positionInSentence = positionInSentence; 58 | } 59 | public String getToken() { 60 | return token; 61 | } 62 | public void setToken(String token) { 63 | this.token = token; 64 | } 65 | public String getLemma() { 66 | return lemma; 67 | } 68 | public void setLemma(String lemma) { 69 | this.lemma = lemma; 70 | } 71 | public PartOfSpeech getPos() { 72 | return pos; 73 | } 74 | public void setPos(PartOfSpeech pos) { 75 | this.pos = pos; 76 | } 77 | public PartOfSentimentStructure getPartOfSentimentStructure() { 78 | return partOfSentimentStructure; 79 | } 80 | public void setPartOfSentimentStructure( 81 | PartOfSentimentStructure partOfSentimentStructure) { 82 | this.partOfSentimentStructure = partOfSentimentStructure; 83 | } 84 | public List getPreviousTokens() { 85 | return previousTokens; 86 | } 87 | public void setPreviousTokens(List previousTokens) { 88 | 
this.previousTokens = previousTokens; 89 | } 90 | public List getNextTokens() { 91 | return nextTokens; 92 | } 93 | public void setNextTokens(List nextTokens) { 94 | this.nextTokens = nextTokens; 95 | } 96 | public boolean isAdjective() 97 | { 98 | if (pos == null) 99 | return false; 100 | 101 | if (pos == PartOfSpeech.JJ || pos == PartOfSpeech.JJR || pos == PartOfSpeech.JJS) 102 | return true; 103 | 104 | return false; 105 | } 106 | public List getLocalParentage() { 107 | //returns only the portion of the parentage up to the next S or SBAR (whichever is higher); so 108 | //[S, VP, NP, SBAR, S, VP, PP, NP, SBAR, S, VP, SBAR, S, VP, PP, NP] becomes 109 | // [SBAR, S, VP, PP, NP] instead 110 | 111 | if (parentage == null) 112 | return null; 113 | 114 | for (int index = parentage.size() - 1; index >= 0; index--) 115 | { 116 | if (parentage.get(index) == PartOfSpeech.S) 117 | { 118 | //lookbehind 119 | if (index > 0 && parentage.get(index - 1) == PartOfSpeech.SBAR) 120 | index--; 121 | 122 | return parentage.subList(index, parentage.size()); 123 | } 124 | } 125 | 126 | return parentage; 127 | } 128 | public List getParentage() { 129 | //returns all the clauses of which this is a part; could look like [S, NP] in a simple case, or 130 | //[S, VP, NP, SBAR, S, VP, PP, NP, SBAR, S, VP, SBAR, S, VP, PP, NP] in an ugly case 131 | return parentage; 132 | } 133 | public void setParentage(List parentage) { 134 | this.parentage = parentage; 135 | } 136 | public boolean isCoreferenceHead() { 137 | return isCoreferenceHead; 138 | } 139 | public void setCoreferenceHead(boolean isCoreferenceHead) { 140 | this.isCoreferenceHead = isCoreferenceHead; 141 | } 142 | public String getFlatResolvedCoreference() { 143 | return flatResolvedCoreference; 144 | } 145 | public void setFlatResolvedCoreference(String flatResolvedCoreference) { 146 | this.flatResolvedCoreference = flatResolvedCoreference; 147 | } 148 | public PartOfSpeech getImmediateParent() { 149 | if (parentage == null || 
parentage.isEmpty()) 150 | return null; 151 | else 152 | return parentage.get(0); 153 | } 154 | public TokenWithContext getPreviousToken() { 155 | if (previousTokens == null || previousTokens.isEmpty()) 156 | return null; 157 | else 158 | return previousTokens.get(previousTokens.size() - 1); 159 | } 160 | public TokenWithContext getNextToken() { 161 | if (nextTokens == null || nextTokens.isEmpty()) 162 | return null; 163 | else 164 | return nextTokens.get(0); 165 | } 166 | 167 | 168 | //not part of classifier feature set 169 | public ModelType getPredictedModel() { 170 | return predictedModel; 171 | } 172 | public void setPredictedModel(ModelType predictedModel) { 173 | this.predictedModel = predictedModel; 174 | } 175 | public Double getPredictedClass() { 176 | return predictedClass; 177 | } 178 | public void setPredictedClass(Double predictedClass) { 179 | this.predictedClass = predictedClass; 180 | } 181 | 182 | 183 | public List getSemanticallyTaggedTokensWithContext() { 184 | return semanticallyTaggedTokensWithContext; 185 | } 186 | public void setSemanticallyTaggedTokensWithContext( 187 | List semanticallyTaggedTokensWithContext) { 188 | this.semanticallyTaggedTokensWithContext = semanticallyTaggedTokensWithContext; 189 | } 190 | public String getSemanticSpecificRole() { 191 | return semanticSpecificRole; 192 | } 193 | public void setSemanticSpecificRole(String semanticSpecificRole) { 194 | this.semanticSpecificRole = semanticSpecificRole; 195 | } 196 | public String getSemanticGeneralRole() { 197 | return semanticGeneralRole; 198 | } 199 | public void setSemanticGeneralRole(String semanticGeneralRole) { 200 | this.semanticGeneralRole = semanticGeneralRole; 201 | } 202 | public TokenWithContext getSemanticIncomingEdge() { 203 | return semanticIncomingEdge; 204 | } 205 | public void setSemanticIncomingEdge(TokenWithContext semanticIncomingEdge) { 206 | this.semanticIncomingEdge = semanticIncomingEdge; 207 | } 208 | public boolean 
isSemanticOutgoingEdgesIncludeNegation() { 209 | return semanticOutgoingEdgesIncludeNegation; 210 | } 211 | public void setSemanticOutgoingEdgesIncludeNegation( 212 | boolean semanticOutgoingEdgesIncludeNegation) { 213 | this.semanticOutgoingEdgesIncludeNegation = semanticOutgoingEdgesIncludeNegation; 214 | } 215 | public boolean isNamedEntity() { 216 | return this.isNamedEntity; 217 | } 218 | public void setNamedEntity(boolean isNamedEntity) { 219 | this.isNamedEntity = isNamedEntity; 220 | } 221 | public String getAttribute() { 222 | return attribute; 223 | } 224 | public void setAttribute(String attribute) { 225 | this.attribute = attribute; 226 | } 227 | 228 | 229 | //not part of classifier features 230 | public ProductFeatureOpinion getOpinion() { 231 | return opinion; 232 | } 233 | public void setOpinion(ProductFeatureOpinion opinion) { 234 | this.opinion = opinion; 235 | } 236 | 237 | 238 | private String flattenTokenList(List list) 239 | { 240 | if (list == null) 241 | return "[]"; 242 | 243 | StringBuilder sb = new StringBuilder(); 244 | 245 | boolean first = true; 246 | sb.append("["); 247 | for (TokenWithContext nextToken : list) 248 | { 249 | if (first) 250 | first = false; 251 | else 252 | sb.append(","); 253 | 254 | if (nextToken == null) 255 | sb.append("null"); 256 | else 257 | sb.append(nextToken.getToken()); 258 | } 259 | sb.append("]"); 260 | 261 | return sb.toString(); 262 | } 263 | 264 | 265 | public String getFormattedTokenContext() 266 | { 267 | StringBuilder sb = new StringBuilder(); 268 | 269 | if (this.getPreviousTokens() != null) 270 | { 271 | for (TokenWithContext toPrint : this.getPreviousTokens()) 272 | { 273 | if (toPrint != null) 274 | sb.append(toPrint.getToken()).append(" "); 275 | } 276 | } 277 | 278 | 279 | sb.append("_").append(this.getToken()).append("_ "); 280 | 281 | if (this.getPreviousTokens() != null) 282 | { 283 | for (TokenWithContext toPrint : this.getNextTokens()) 284 | { 285 | if (toPrint != null) 286 | 
sb.append(toPrint.getToken()).append(" "); 287 | } 288 | } 289 | 290 | 291 | 292 | return sb.toString(); 293 | 294 | } 295 | 296 | 297 | 298 | 299 | 300 | /* (non-Javadoc) 301 | * @see java.lang.Object#clone() 302 | */ 303 | @Override 304 | protected Object clone() throws CloneNotSupportedException { 305 | // TODO Auto-generated method stub 306 | TokenWithContext clone = (TokenWithContext) super.clone(); 307 | 308 | clone.semanticallyTaggedTokensWithContext = null; 309 | 310 | clone.previousTokens = null; 311 | 312 | clone.nextTokens = null; 313 | 314 | clone.parentage = null; 315 | 316 | clone.semanticIncomingEdge = null; 317 | 318 | return clone; 319 | } 320 | 321 | 322 | @Override 323 | public String toString() { 324 | 325 | return "TokenWithContext [token=" + token + ", lemma=" + lemma 326 | + ", pos=" + pos + ", previousTokens=" + flattenTokenList(previousTokens) 327 | + ", nextTokens=" + flattenTokenList(nextTokens) + ", parentage=" + parentage 328 | + ", localParentage=" + getLocalParentage() 329 | + ", isNamedEntity=" + isNamedEntity 330 | + ", isCoreferenceHead=" + isCoreferenceHead 331 | + ", flatResolvedCoreference=" + flatResolvedCoreference 332 | + ", partOfSentimentStructure=" + partOfSentimentStructure 333 | + ", semanticSpecificRole=" + semanticSpecificRole 334 | + ", semanticGeneralRole=" + semanticGeneralRole 335 | + ", semanticIncomingEdge=" + (semanticIncomingEdge == null ? 
"null" : semanticIncomingEdge.getToken() + "-" + semanticIncomingEdge.getPos() + "-" + semanticIncomingEdge.getSemanticSpecificRole() ) 336 | + ", semanticOutgoingEdgesIncludeNegation=" + semanticOutgoingEdgesIncludeNegation 337 | + ", semanticallyTaggedTokensWithContext=" + semanticallyTaggedTokensWithContext 338 | + ", opinion=" + opinion 339 | + ", attribute=" + attribute 340 | + "]"; 341 | } 342 | 343 | 344 | 345 | 346 | 347 | } 348 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/phrasetree/AbstractPhraseTreePart.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model.phrasetree; 2 | 3 | public abstract class AbstractPhraseTreePart { 4 | protected PartOfSpeech pos; 5 | 6 | public PartOfSpeech getPos() { 7 | return pos; 8 | } 9 | public void setPos(PartOfSpeech pos) { 10 | this.pos = pos; 11 | } 12 | 13 | //for speed, we avoid doing class instance comparisons, and instead implement a simple fixed boolean return. 
/**
 * Clause, phrase and word-level tags as produced by the parser, plus a few
 * synthetic constants for tags whose text is not a legal Java identifier
 * (punctuation, brackets, NP-TMP). Use {@link #fromString(String)} rather than
 * {@code valueOf} when converting raw parser labels.
 *
 * The declaration order is preserved exactly; do not reorder constants.
 */
public enum PartOfSpeech implements Cloneable {

    //from http://bulba.sdsu.edu/jeanette/thesis/PennTags.html

    //clauses
    S,
    SBAR,
    SBARQ,
    SINV,
    SQ,

    //phrases
    ADJP,    // Adjective Phrase.
    ADVP,    // Adverb Phrase.
    CONJP,   // Conjunction Phrase.
    FRAG,    // Fragment.
    INTJ,    // Interjection. Corresponds approximately to the part-of-speech tag UH.
    LST,     // List marker. Includes surrounding punctuation.
    NAC,     // Not a Constituent; used to show the scope of certain prenominal modifiers within an NP.
    NP,      // Noun Phrase.
    NPTMP,   // weird temporal noun phrase
    NX,      // Used within certain complex NPs to mark the head of the NP. Corresponds very roughly to N-bar level but used quite differently.
    PP,      // Prepositional Phrase.
    PRN,     // Parenthetical.
    PRT,     // Particle. Category for words that should be tagged RP.
    QP,      // Quantifier Phrase (i.e. complex measure/amount phrase); used within NP.
    RRC,     // Reduced Relative Clause.
    UCP,     // Unlike Coordinated Phrase.
    VP,      // Verb Phrase.
    WHADJP,  // Wh-adjective Phrase. Adjectival phrase containing a wh-adverb, as in how hot.
    WHAVP,   // Wh-adverb Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing a wh-adverb such as how or why.
    WHNP,    // Wh-noun Phrase. Introduces a clause with an NP gap. May be null (containing the 0 complementizer) or lexical, containing some wh-word, e.g. who, which book, whose daughter, none of which, or how many leopards.
    WHPP,    // Wh-prepositional Phrase. Prepositional phrase containing a wh-noun phrase (such as of which or by whose authority) that either introduces a PP gap or is contained by a WHNP.
    X,       // Unknown, uncertain, or unbracketable. X is often used for bracketing typos and in bracketing the...the-constructions.
    XS,      // Unknown sentence? Seems to apply to "more than" or "less than" type constructs, i.e., ...and then finally after less than 60 days ,...

    //extra
    WHADVP,
    PUNCTCOLON,
    PUNCTCOMMA,
    PUNCTENDOFSENTENCE,
    PUNCTCURRENCY,
    PUNCTQUOTATIONMARK,
    PUNCTHASH,
    LRB, // (
    RRB, // )

    //from http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    CC, CD, DT, EX, FW, IN,
    JJ, JJR, JJS,
    LS, MD,
    NN, NNS, NNP, NNPS,
    PDT, POS, PRP, PRP$,
    RB, RBR, RBS, RP,
    SYM, TO, UH,
    VB, VBD, VBG, VBN, VBP, VBZ,
    WDT, WP, WP$, WRB;

    //alternatively, this could probably be parsed out of edu.stanford.nlp.parser.lexparser.EnglishTreebankParserParams.sisterSplitters()

    /**
     * Maps a raw parser label to its constant. Labels that cannot be written as
     * Java identifiers are translated explicitly; everything else is looked up
     * by name.
     *
     * @param string the raw label, e.g. {@code "NN"}, {@code ","} or {@code "-LRB-"}
     * @return the matching constant
     * @throws java.lang.IllegalArgumentException if the label is neither one of the
     *         special cases nor the name of a constant
     */
    public static PartOfSpeech fromString (String string) throws java.lang.IllegalArgumentException
    {
        //punctuation
        if (string.equals(":"))
            return PUNCTCOLON;
        if (string.equals(","))
            return PUNCTCOMMA;
        if (string.equals("."))
            return PUNCTENDOFSENTENCE;
        if (string.equals("$"))
            return PUNCTCURRENCY;

        //brackets
        if (string.equals("-LRB-"))
            return LRB;
        if (string.equals("-RRB-"))
            return RRB;

        //TODO: mapping this to NP instead is an interesting case, as in "the p/n button switches your dvd players video output signal between pal and ntsc ."
        if (string.equals("NP-TMP"))
            return NPTMP;

        //all three quote spellings collapse to one constant
        boolean isQuote = string.equals("''") || string.equals("\"") || string.equals("``");
        if (isQuote)
            return PUNCTQUOTATIONMARK;

        if (string.equals("#"))
            return PUNCTHASH;

        //may throw IllegalArgumentException
        return PartOfSpeech.valueOf(string);
    }
}
sentences = document.get(SentencesAnnotation.class); 38 | 39 | CoreMap sentence = sentences.get(0); 40 | Tree lexTree = sentence.get(TreeAnnotation.class); 41 | 42 | this.pos = PartOfSpeech.valueOf(lexTree.firstChild().value()); 43 | this.leaves = organizeTree(lexTree.firstChild(), true); 44 | this.flatLeaves = flatTree(lexTree.firstChild(), null, null, true); 45 | 46 | Tree sentiTree = sentence.get(SentimentCoreAnnotations.AnnotatedTree.class); 47 | sentiTree.pennPrint(); 48 | 49 | //RNNCoreAnnotations. 50 | 51 | System.out.println("Lexi tree is : " + lexTree); 52 | System.out.println("Senti tree is : " + sentiTree); 53 | 54 | 55 | } 56 | 57 | public PhraseTree(Tree lexTree, String pos) 58 | { 59 | this.leaves = organizeTree(lexTree, true); 60 | 61 | this.flatLeaves = flatTree(lexTree.firstChild(), null, null, true); 62 | 63 | try 64 | { 65 | this.pos = PartOfSpeech.fromString(pos); 66 | } 67 | catch (java.lang.IllegalArgumentException e) 68 | { 69 | System.err.println("No part of speech for " + pos); 70 | } 71 | } 72 | 73 | 74 | /*protected Tree getParseTreeForPhrase(String phrase) 75 | { 76 | StringTokenizer st = DefaultTokenizer.getDefaultTokenizer(phrase); 77 | int numTokens = st.countTokens(); 78 | String[] tokens = new String[numTokens]; 79 | for (int i = 0; i < numTokens; i++) 80 | { 81 | tokens[i] = st.nextToken(); 82 | } 83 | 84 | List rawWords = Sentence.toCoreLabelList(tokens); 85 | Tree parseTree = lexParser.apply(rawWords); 86 | 87 | return parseTree; 88 | }*/ 89 | 90 | private List organizeTree(Tree tree, boolean topLevel) 91 | { 92 | List leaves = new ArrayList(); 93 | 94 | for (Tree child : tree.getChildrenAsList()) 95 | { 96 | if (child.isPreTerminal()) 97 | { 98 | PartOfSpeech pos = null; 99 | String childValue = child.value(); 100 | try 101 | { 102 | pos = PartOfSpeech.fromString(childValue); 103 | } 104 | catch (java.lang.IllegalArgumentException e) 105 | { 106 | System.err.println("Could not get part of speech. 
Value of child is " + child.value() + " and its first child is " + child.firstChild().value()); 107 | //System.err.println(tree.getChildrenAsList()); 108 | e.printStackTrace(); 109 | throw(e); //only throw for debugging purposes; spits out full sentence if we do 110 | } 111 | leaves.add(new TokenLeaf(child.firstChild().value(), pos)); 112 | 113 | } 114 | else 115 | { 116 | leaves.add(new PhraseTree(child, child.value())); 117 | } 118 | } 119 | 120 | return leaves; 121 | } 122 | 123 | protected List> flatTree(Tree tree, List> list, Stack depthStack, boolean topLevel) 124 | { 125 | if (topLevel) 126 | { 127 | list = new ArrayList>(); 128 | depthStack = new Stack(); 129 | } 130 | 131 | for (Tree child : tree.getChildrenAsList()) 132 | { 133 | if (child.isPreTerminal() || child.isLeaf()) 134 | { 135 | list.add((Stack) depthStack.clone()); 136 | } 137 | else 138 | { 139 | PartOfSpeech pos = null; 140 | String childValue = child.value(); 141 | 142 | try 143 | { 144 | pos = PartOfSpeech.fromString(childValue); 145 | } 146 | catch (java.lang.IllegalArgumentException e) 147 | { 148 | //not all that important 149 | if (child.firstChild() != null) 150 | { 151 | System.err.println("Could not get part of speech. Value of child is " + childValue + " and its first child is " + (child.firstChild() == null ? 
"null" : child.firstChild().value())); 152 | throw(e); //only throw for debugging purposes; spits out full sentence if we do 153 | } 154 | } 155 | 156 | 157 | depthStack.push(pos); 158 | flatTree(child, list, depthStack, false); 159 | depthStack.pop(); 160 | 161 | //leaves.add(new PhraseTree(child, child.value())); 162 | } 163 | } 164 | 165 | if (topLevel) 166 | return list; 167 | else 168 | return null; 169 | } 170 | 171 | public List getLeaves() { 172 | return leaves; 173 | } 174 | 175 | public void setLeaves(List leaves) { 176 | this.leaves = leaves; 177 | } 178 | 179 | @Override 180 | public String toString() 181 | { 182 | return toString(0); 183 | } 184 | 185 | public List> getFlatLeaves() { 186 | return flatLeaves; 187 | } 188 | 189 | public void setFlatLeaves(List> flatLeaves) { 190 | this.flatLeaves = flatLeaves; 191 | } 192 | 193 | @Override 194 | protected String toString(int indent) 195 | { 196 | StringBuilder sb = new StringBuilder(); 197 | sb.append(new String(new char[indent]).replace('\0', ' ')); 198 | sb.append(pos).append("(\n"); 199 | for (AbstractPhraseTreePart nextLeaf : leaves) 200 | { 201 | if (nextLeaf.isToken()) 202 | sb.append(new String(new char[indent + 2]).replace('\0', ' ')).append(nextLeaf.toString()).append("\n"); 203 | else 204 | sb.append(nextLeaf.toString(indent + 2)).append("\n"); 205 | } 206 | sb.append(new String(new char[indent]).replace('\0', ' ')); 207 | sb.append(")"); 208 | return sb.toString(); 209 | } 210 | 211 | 212 | public static void main(String[] args) 213 | { 214 | System.out.println("Starting"); 215 | 216 | /* 217 | PhraseTree noPt = new PhraseTree(); 218 | String[] sent = { "This", "is", "an", "easy", "sentence", "-", "in", "theory", ",", "so", "it", "goes", "eh", "?" 
}; 219 | List rawWords = Sentence.toCoreLabelList(sent); 220 | Tree parse = PhraseTree.lexParser.apply(rawWords); 221 | parse.pennPrint(); 222 | System.out.println(); 223 | */ 224 | 225 | 226 | PhraseTree[] phraseTrees = 227 | { 228 | //new PhraseTree("We intend to raise this violation of the Security Council resolution, if it goes forward, in the U.N.,"), 229 | //new PhraseTree("UN Security Council"), 230 | //new PhraseTree("From the loan that we took up, EUR 5 bln from EU and EUR 1 bln from the World Bank are intended for the Finance Ministry. Not any EUR intended for the Finance Ministry and not any EUR from the money intended for BNR can go to salaries or bonuses. This money is for Romania,"), 231 | //new PhraseTree("The line was really rather long."), 232 | //new PhraseTree("the voice quality is very good , and it gets great reception ( that is , in places where you get t-mobile coverage , which is not that good ; see below ) ."), 233 | //new PhraseTree("This is an easy phrase to parse - in theory , or so it goes , eh ?"), 234 | 235 | //testing negation 236 | //new PhraseTree("remote control are only so-so ; it doesn't show the complete filenames of mp3s with really long names ."), 237 | 238 | //testing negation 239 | new PhraseTree("the voice quality is poor, but not its reception"), 240 | }; 241 | 242 | for (PhraseTree pt : phraseTrees) 243 | { 244 | System.out.println("toString: " + pt.toString()); 245 | System.out.println("getFlatLeaves: " + pt.getFlatLeaves()); 246 | } 247 | 248 | System.out.println("Done"); 249 | 250 | } 251 | 252 | 253 | /* 254 | //System.out.println("-----------"); 255 | System.out.print(subTree.pennString()); 256 | System.out.println( 257 | subTree.isPhrasal() + " / " + 258 | subTree.isPrePreTerminal() + " / " + 259 | subTree.isPreTerminal() + " / " + 260 | //subTree.label() + " / " + "\n" + 261 | //subTree.labels() + " / " + "\n" + 262 | //subTree.value() + " / " + "\n" + //actual word or part of speech, depending on node 263 | 
//subTree.getChildrenAsList() + 264 | "" 265 | ); 266 | */ 267 | 268 | /* 269 | public void demoAPI(LexicalizedParser lp) { 270 | // This option shows parsing a list of correctly tokenized words 271 | String[] sent = { "This", "is", "an", "easy", "sentence", "." }; 272 | List rawWords = Sentence.toCoreLabelList(sent); 273 | Tree parse = lp.apply(rawWords); 274 | parse.pennPrint(); 275 | System.out.println(); 276 | 277 | 278 | // This option shows loading and using an explicit tokenizer 279 | String sent2 = "This is another sentence."; 280 | TokenizerFactory tokenizerFactory = 281 | PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); 282 | List rawWords2 = 283 | tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize(); 284 | parse = lp.apply(rawWords2); 285 | 286 | TreebankLanguagePack tlp = new PennTreebankLanguagePack(); 287 | GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); 288 | GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); 289 | List tdl = gs.typedDependenciesCCprocessed(); 290 | System.out.println(tdl); 291 | System.out.println(); 292 | 293 | TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); 294 | tp.printTree(parse); 295 | } 296 | */ 297 | } 298 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/phrasetree/StringWithTree.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model.phrasetree; 2 | 3 | public class StringWithTree { 4 | private String string; 5 | private PhraseTree phraseTree; 6 | 7 | public StringWithTree(String string) { 8 | super(); 9 | this.string = string; 10 | if (string != null) 11 | phraseTree = new PhraseTree(string); 12 | } 13 | public String getString() { 14 | return string; 15 | } 16 | public void setString(String string) { 17 | this.string = string; 18 | } 19 | public PhraseTree getPhraseTree() { 20 | return phraseTree; 21 | } 22 | public void 
setPhraseTree(PhraseTree phraseTree) { 23 | this.phraseTree = phraseTree; 24 | } 25 | @Override 26 | public String toString() { 27 | return string; 28 | } 29 | 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/ca/carter/thesis/model/phrasetree/TokenLeaf.java: -------------------------------------------------------------------------------- 1 | package ca.carter.thesis.model.phrasetree; 2 | 3 | public class TokenLeaf extends AbstractPhraseTreePart { 4 | private String token; 5 | 6 | public static final String capitalizedWordIndicator = "c"; 7 | 8 | public TokenLeaf(String string, PartOfSpeech pos) throws java.lang.IllegalArgumentException 9 | { 10 | super(); 11 | this.token = string; 12 | this.pos = pos; 13 | } 14 | 15 | public String getToken() { 16 | return token; 17 | } 18 | 19 | public void setToken(String token) { 20 | this.token = token; 21 | } 22 | 23 | @Override 24 | public String value() 25 | { 26 | return token; 27 | } 28 | 29 | @Override 30 | public boolean isToken() 31 | { 32 | return true; 33 | } 34 | 35 | @Override 36 | public String toString() { 37 | return pos + "(" + token + ")"; 38 | } 39 | 40 | 41 | } 42 | --------------------------------------------------------------------------------