├── .gitignore
├── LICENSE
├── README.md
├── classifier
│   ├── ClassifierFactory.java
│   ├── IClassifier.java
│   ├── Parameter.java
│   ├── StructuredPerceptron.java
│   └── svm
│       ├── LibSVMInterface.java
│       ├── libsvm
│       │   ├── svm.java
│       │   ├── svm.m4
│       │   ├── svm.m4~
│       │   ├── svm_model.java
│       │   ├── svm_node.java
│       │   ├── svm_parameter.java
│       │   ├── svm_print_interface.java
│       │   └── svm_problem.java
│       ├── svm_predict.java
│       ├── svm_scale.java
│       ├── svm_toy.java
│       └── svm_train.java
├── cluster
│   ├── Cluster.java
│   ├── EM.java
│   ├── EvaluateModelCandidates.java
│   ├── HAC.java
│   ├── IClustering.java
│   ├── TFIDF.java
│   ├── agglomeration
│   │   ├── AgglomerationMethod.java
│   │   └── AverageLinkage.java
│   └── experiment
│       ├── DissimilarityMeasure.java
│       └── EecbDissimilarityMeasure.java
├── costfunction
│   ├── ICostFunction.java
│   └── LinearCostFunction.java
├── data
│   ├── EecbCharSeq.java
│   ├── EecbClusterDocument.java
│   ├── EecbElement.java
│   ├── EecbEntity.java
│   ├── EecbEntityMention.java
│   ├── EecbEvent.java
│   ├── EecbEventMention.java
│   ├── EecbMention.java
│   ├── EecbSrlAnnotation.java
│   ├── EecbToken.java
│   └── EecbTopic.java
├── dataset
│   ├── CorefSystem.java
│   ├── CrossTopic.java
│   ├── DatasetFactory.java
│   ├── IDataSet.java
│   └── TopicGeneration.java
├── example
│   ├── ReadLearnedWeight.java
│   ├── VectorNormalization.java
│   └── Weight.java
├── experiment
│   ├── CrossCoreferenceResolution.java
│   ├── ExperimentConfigurationFactory.java
│   ├── ExperimentConstructor.java
│   └── ProcessDataSVM.java
├── featureExtractor
│   ├── SRLAlignment.java
│   ├── SRLDocument.java
│   ├── SRLDocumentReader.java
│   ├── SRLExtraction.java
│   ├── SemanticOutputInterface.java
│   ├── SimilarityVector.java
│   ├── SrlResultIncorporation.java
│   ├── WordSimilarity.java
│   └── Wordnet.java
├── features
│   ├── Feature.java
│   ├── FeatureFactory.java
│   ├── FeatureVectorGenerator.java
│   ├── NominalFeature.java
│   ├── NumericFeature.java
│   └── individualfeature
│       ├── Animacy.java
│       ├── Gender.java
│       ├── Head.java
│       ├── Lemma.java
│       ├── MentionWord.java
│       ├── NEType.java
│       ├── NSrlA0.java
│       ├── NSrlA1.java
│       ├── NSrlA2.java
│       ├── NSrlAMLoc.java
│       ├── NSrlAgreeCount.java
│       ├── NSrlPA0.java
│       ├── NSrlPA1.java
│       ├── NSrlPA2.java
│       ├── NSrlPAMLoc.java
│       ├── Number.java
│       ├── SrlA0.java
│       ├── SrlA1.java
│       ├── SrlA2.java
│       ├── SrlAMLoc.java
│       ├── SrlAgreeCount.java
│       ├── SrlLeft.java
│       ├── SrlPA0.java
│       ├── SrlPA1.java
│       ├── SrlPA2.java
│       ├── SrlPAMLoc.java
│       ├── SrlRight.java
│       └── Synonym.java
├── general
│   ├── AverageAnytimeDataCollection.java
│   ├── Counter.java
│   ├── CounterMap.java
│   ├── DoubleOperation.java
│   ├── FinalScore.java
│   ├── FixedSizePriorityQueue.java
│   ├── MapFactory.java
│   ├── MatrixOperation.java
│   ├── PorterStemmer.java
│   ├── PriorityQueue.java
│   ├── SetOperation.java
│   └── StringOperation.java
├── io
│   ├── EECBMentionExtractor.java
│   ├── EecbReader.java
│   ├── EgenericDataSetReader.java
│   ├── EmentionExtractor.java
│   ├── LargeFileWriting.java
│   ├── LargetFileReading.java
│   ├── LibSVM.java
│   └── ResultOutput.java
├── lossfunction
│   ├── ILossFunction.java
│   ├── LinkLossFunction.java
│   └── MetricLossFunction.java
├── method
│   ├── CoreferenceResolutionDecoding.java
│   ├── Dagger.java
│   ├── Decoding.java
│   └── IMethod.java
├── pruning
│   └── Pruning.java
├── score
│   ├── AssignmentAlgorithm.java
│   ├── AssignmentProblem.java
│   ├── CoNLLScorerHelper.java
│   ├── HungarianAlgorithm.java
│   ├── ScorerCEAF.java
│   └── ScorerHelper.java
├── search
│   ├── BeamSearch.java
│   ├── ConstraintGeneration.java
│   ├── ISearch.java
│   ├── SearchFactory.java
│   └── State.java
├── server
│   ├── ClusterConnection.java
│   ├── ExperimentArguments.java
│   ├── ExperimentGeneration.java
│   ├── ExperimentProperties.java
│   ├── JobAssignment.java
│   ├── JobState.java
│   ├── JobSubmit.java
│   ├── Node.java
│   ├── Pipeline.java
│   ├── PipelineConfiguration.java
│   ├── ResultAggregation.java
│   └── pipeline.properties
├── training
│   ├── AROWOnline.java
│   ├── AROWOnlineToBatch.java
│   ├── Batch.java
│   ├── Development.java
│   ├── ITraining.java
│   ├── Online.java
│   ├── OnlinePA.txt
│   ├── OnlineToBatch.java
│   ├── PAOnline.java
│   └── PAOnlineToBatch.java
├── tuning
│   └── TuningFactory.java
└── util
    ├── Command.java
    ├── CosineSimilarity.java
    ├── DocumentAlignment.java
    ├── EecbConstants.java
    └── EecbConstructor.java
/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The code is open source and licensed under the GNU General Public License (v2 or later). This is the full GPL, which allows many free uses, but not its use in distributed proprietary software. 2 | 3 | This program is free software; you can redistribute it and/or 4 | modify it under the terms of the GNU General Public License 5 | as published by the Free Software Foundation; either version 2 6 | of the License, or (at your option) any later version. 7 | 8 | This program is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU General Public License for more details. 12 | 13 | If you have any problems with the code, please let me know. Thanks very much. 14 | 15 | Jun Xie (xiejuncs@gmail.com) 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cross-document-coreference-resolution 2 | ===================================== 3 | README 4 | 5 | Jun Xie (xie@eecs.oregonstate.edu) 6 | 7 | This is a cross-document coreference resolution system written in Java. 8 | 9 | The Stanford coreference resolution system is a within-document coreference resolution system. Hence, its essential data class is called Document. The Document class contains many fields representing the document information, such as the gold ordered mentions by sentence, the predicted ordered mentions by sentence, the predicted coreference clusters, the gold coreference clusters, all predicted mentions, and all gold mentions. Hence, in order to use the Stanford coreference resolution system, the crucial task is to process each EECB file into a Document object. The example I am imitating is the Stanford ACE 2005 machine reading sub-system. In the ACE 2005 corpus, each document has two associated files: key.apf.xml and raw.sgm. They constructed another class called AceDocument to play a role similar to the Document class. The AceDocument class represents the gold annotations, for example, the AceEntityMentions and AceEventMentions by sentence, as well as all AceEntityMentions and all AceEventMentions. Combined with the predicted mentions produced by the rule-based mention detection component, they converted the AceDocument object into a Document object. Based on the resulting Document object, the system performs coreference resolution. 10 | 11 | The overall architecture for the EECB corpus is similar to their ACE 2005 machine reading sub-system. Due to the differences between the EECB corpus and the ACE corpus, the implementation is a bit different.
The annotation is stored in a text file called mentions.txt. Each line is represented as follows: 12 | 13 | N or V? (0) Topic(1) Doc(2) Sentence Number(3) CorefID(4) StartIdx(5) EndIdx(6) StartCharIdx(7) EndCharIdx(8) 14 | 15 | So I need to extract the event and entity mentions according to mentions.txt and the original source text, and represent the tokens, mentions, entities, and events in my own data structures for each topic, where a topic consists of several documents (the reason for this is that our task is cross-document coreference resolution). Based on those data structures, I extract the gold annotations, while the predicted annotations come from the rule-based mention detection component provided by the Stanford system. Then I need to adapt my EECBDocument class to the Document class. Currently, I am implementing and debugging the transformation part. Once the transformation part is done, I can proceed to the search part. 16 | 17 | In addition, mentions.txt does not provide semantic role annotations. I also need to import the annotations produced by the SwiRL software (http://www.surdeanu.name/mihai/swirl/) into my code. 18 | -------------------------------------------------------------------------------- /classifier/ClassifierFactory.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.io.File; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | import java.util.Properties; 8 | 9 | import edu.oregonstate.dataset.TopicGeneration; 10 | import edu.oregonstate.experiment.ExperimentConstructor; 11 | import edu.oregonstate.features.FeatureFactory; 12 | import edu.oregonstate.io.ResultOutput; 13 | import edu.oregonstate.util.EecbConstants; 14 | import edu.oregonstate.util.EecbConstructor; 15 | import edu.stanford.nlp.util.StringUtils; 16 | 17 | /** 18 | * Run the classification method given the data path 19 | * 20 | * @author Jun Xie (xie@eecs.oregonstate.edu) 21 | * 22 | */ 23 | public class ClassifierFactory extends ExperimentConstructor { 24 | 25 | /** training topics */ 26 | private final String[] trainingTopics; 27 | 28 | /** classifier */ 29 | private final IClassifier classifier; 30 | 31 | /** phase, for example the second round */ 32 | private final String phaseIndex; 33 | 34 | public ClassifierFactory(Properties props) { 35 | super(props); 36 | 37 | // get training topics 38 | TopicGeneration topicGenerator = new TopicGeneration(props); 39 | trainingTopics = topicGenerator.trainingTopics(); 40 | 41 | // build a classifier 42 | classifier = EecbConstructor.createClassifier(props.getProperty(EecbConstants.CLASSIFIER_METHOD, "StructuredPerceptron")); 43 | 44 | phaseIndex = props.getProperty(EecbConstants.PHASE_PROP, "0"); 45 | } 46 | 47 | /** 48 | * perform the experiment 49 | */ 50 | public void performExperiment() { 51 | List<String> paths = getPaths(); 52 | 53 | ResultOutput.writeTextFile(experimentFolder + "/searchstep", "" + paths.size()); 54 | ResultOutput.writeTextFile(experimentLogFile, "the total number of training files : " + paths.size()); 55 | 56 | Parameter returnPara = classifier.train(paths, Integer.parseInt(phaseIndex)); 57 | ResultOutput.writeTextFile(experimentLogFile, "\n\nThe " + phaseIndex + "'s learned model \n"); 58 | ResultOutput.printParameter(returnPara, experimentLogFile); 59 | 60 | // output 61 | double[] averageWeight = returnPara.generateWeightForTesting(); 62 | String outputFile = experimentFolder + "/model/model" + phaseIndex;
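// write the learned model to disk: printStructredModel pairs each averaged weight with its feature-template name, so later phases can reload the model by feature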
63 | String outputString = ResultOutput.printStructredModel(averageWeight, FeatureFactory.getFeatureTemplate()); 64 | ResultOutput.writeTextFile(outputFile, outputString); 65 | } 66 | 67 | /** 68 | * get the paths of the training data 69 | * 70 | * 71 | * @return 72 | */ 73 | private List<String> getPaths() { 74 | List<String> filePaths = new ArrayList<String>(); 75 | filePaths.addAll(getPaths(trainingTopics)); 76 | 77 | return filePaths; 78 | } 79 | 80 | /** 81 | * aggregate the training data 82 | * 83 | * @param topics 84 | * 85 | * @return 86 | */ 87 | private List<String> getPaths(String[] topics) { 88 | List<String> allfiles = new ArrayList<String>(); 89 | for (String topic : topics) { 90 | List<String> files = getDivisionPaths(topic); 91 | String topicPath = experimentFolder + "/" + topic + "/data/"; 92 | List<String> filePaths = new ArrayList<String>(); 93 | for (String file : files) { 94 | filePaths.add(topicPath + file); 95 | } 96 | 97 | allfiles.addAll(filePaths); 98 | } 99 | 100 | return allfiles; 101 | } 102 | 103 | // get a sequence of data files, such as 1, 2, 3, 4, 5 104 | private List<String> getDivisionPaths(String topic) { 105 | String topicPath = experimentFolder + "/" + topic + "/data/"; 106 | List<String> files = new ArrayList<String>(Arrays.asList(new File(topicPath).list())); 107 | 108 | return files; 109 | } 110 | 111 | 112 | public static void main(String[] args) { 113 | if (args.length > 1) { 114 | System.out.println("too many arguments: you can only specify one configuration-file path"); 115 | System.exit(1); 116 | } 117 | 118 | if (args.length == 0) { 119 | // run the experiment on the local machine for debugging 120 | args = new String[1]; 121 | args[0] = "../corpus/config.properties"; 122 | } 123 | 124 | String[] propArgs = new String[]{"-props", args[0]}; 125 | 126 | Properties props = StringUtils.argsToProperties(propArgs); 127 | ExperimentConstructor classifier = new ClassifierFactory(props); 128 | classifier.performExperiment(); 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /classifier/IClassifier.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * interface of classifier 7 | * 8 | * @author Jun Xie (xiejuncs@gmail.com) 9 | * 10 | */ 11 | public interface IClassifier { 12 | 13 | /* train the model according to a file path and parameters */ 14 | public Parameter train(String path, Parameter para); 15 | 16 | /* train the model according to file paths and parameters */ 17 | public Parameter train(List<String> paths, Parameter para); 18 | 19 | /* use a zero vector to train the model */ 20 | public Parameter train(List<String> paths, int modelIndex); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /classifier/Parameter.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import edu.oregonstate.experiment.ExperimentConstructor; 4 | import edu.oregonstate.general.DoubleOperation; 5 | import edu.oregonstate.util.EecbConstants; 6 | 7 | /** 8 | * there are five fields in this class: the current weights, the total weights, the number of violations, the number of instances, and the variance 9 | * 10 | * @author Jun Xie (xiejuncs@gmail.com) 11 | * 12 | */ 13 | public class Parameter { 14 | 15 | /* current weight */ 16 | private double[] mWeight; 17 | 18 | /* the total weight */ 19 | private double[] mTotalWeight; 20 | 21 | /* no of violations */ 22 | private int mNoOfViolation; 23
| 24 | /* number of instances */ 25 | private int mNumberOfInstance; 26 | 27 | /* variance, used in the AROW algorithm */ 28 | private double[][] mVariance; 29 | 30 | public Parameter(double[] weights) { 31 | this(weights, DoubleOperation.generateIdentityMatrix(weights.length), new double[weights.length]); 32 | } 33 | 34 | public Parameter(double[] weights, double[][] variance, double[] totalWeights) { 35 | this(weights, variance, totalWeights, 0, 0); 36 | } 37 | 38 | public Parameter(double[] weights, double[][] variance, double[] totalWeights, int noOfViolations, int numberOfInstances) { 39 | mWeight = weights; 40 | mTotalWeight = totalWeights; 41 | mNoOfViolation = noOfViolations; 42 | mNumberOfInstance = numberOfInstances; 43 | mVariance = variance; 44 | } 45 | 46 | public double[] getWeight() { 47 | return mWeight; 48 | } 49 | 50 | public double[] getTotalWeight() { 51 | return mTotalWeight; 52 | } 53 | 54 | public int getNoOfViolation() { 55 | return mNoOfViolation; 56 | } 57 | 58 | public int getNumberOfInstance() { 59 | return mNumberOfInstance; 60 | } 61 | 62 | public double[][] getVariance() { 63 | return mVariance; 64 | } 65 | 66 | /** 67 | * make a deep copy of the current object 68 | * 69 | * @return 70 | */ 71 | public Parameter makeCopy() { 72 | int length = mWeight.length; 73 | double[] copyWeight = new double[length]; 74 | double[] copyTotalWeight = new double[length]; 75 | double[][] copyVariance = new double[length][length]; 76 | System.arraycopy(mWeight, 0, copyWeight, 0, length); 77 | System.arraycopy(mTotalWeight, 0, copyTotalWeight, 0, length); 78 | 79 | // copy the variance row by row 80 | for (int index = 0; index < length; index++) { 81 | System.arraycopy(mVariance[index], 0, copyVariance[index], 0, length); 82 | } 83 | 84 | Parameter copyPara = new Parameter(copyWeight, copyVariance, copyTotalWeight, mNoOfViolation, mNumberOfInstance); 85 | return copyPara; 86 | } 87 | 88 | /** 89 | * generate the weight used for testing: either the averaged weight or the latest weight 90 | * 91 | * 92 | * @return 93 | */ 94 | // return the average weight or the latest weight 95 | public double[] generateWeightForTesting() { 96 | boolean averageWeight = Boolean.parseBoolean(ExperimentConstructor.experimentProps.getProperty(EecbConstants.SEARCH_WEIGHT, "true")); 97 | Parameter finalPara = this.makeCopy(); 98 | double[] learnedWeight; 99 | if (averageWeight) { 100 | learnedWeight = DoubleOperation.divide(finalPara.getTotalWeight(), finalPara.getNoOfViolation()); 101 | } else { 102 | learnedWeight = finalPara.getWeight(); 103 | } 104 | return learnedWeight; 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /classifier/StructuredPerceptron.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | import java.util.Properties; 7 | 8 | import edu.oregonstate.experiment.ExperimentConstructor; 9 | import edu.oregonstate.features.FeatureFactory; 10 | import edu.oregonstate.general.DoubleOperation; 11 | import edu.oregonstate.io.ResultOutput; 12 | import edu.oregonstate.training.ITraining; 13 | import edu.oregonstate.util.EecbConstants; 14 | import edu.oregonstate.util.EecbConstructor; 15 | 16 | /** 17 | * Learn the weights 18 | * 19 | * @author Jun Xie (xie@eecs.oregonstate.edu) 20 | * 21 | */ 22 | public class StructuredPerceptron implements IClassifier { 23 | 24 | /* experiment property file */ 25
| private final Properties mProps; 26 | 27 | /* the total number of iterations */ 28 | private final int mIterations; 29 | 30 | /* experiment folder */ 31 | private final String experimentFolder; 32 | 33 | /* logFile */ 34 | private final String logFile; 35 | 36 | /* model index */ 37 | private int modelIndex; 38 | 39 | /* the weights recorded at each iteration, used for tracking progress */ 40 | private List<double[]> weights; 41 | 42 | /** the length of the features */ 43 | private final int length; 44 | 45 | /** training model */ 46 | private final ITraining trainingModel; 47 | 48 | /** whether the learning rate is constant or not */ 49 | private final boolean learningRateConstant; 50 | 51 | /** whether to print the result of each iteration during training */ 52 | private final boolean enablePrintIterationResult; 53 | 54 | /** how many iterations to wait between printed results */ 55 | private final int printIterationGap; 56 | 57 | /** 58 | * constructor 59 | */ 60 | public StructuredPerceptron() { 61 | mProps = ExperimentConstructor.experimentProps; 62 | experimentFolder = ExperimentConstructor.experimentFolder; 63 | mIterations = Integer.parseInt(mProps.getProperty(EecbConstants.CLASSIFIER_EPOCH_PROP, "50")); 64 | 65 | logFile = ExperimentConstructor.experimentLogFile; 66 | modelIndex = 0; 67 | String trainingStyle = mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_METHOD, "OnlineToBatch"); 68 | trainingModel = EecbConstructor.createTrainingModel(trainingStyle); 69 | List<String> featureTemplate = FeatureFactory.getFeatureTemplate(); 70 | length = featureTemplate.size(); 71 | weights = new ArrayList<double[]>(); 72 | 73 | learningRateConstant = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PERCEPTRON_LEARINGRATE_CONSTANT, "false")); 74 | enablePrintIterationResult = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_ITERATION_RESULT, "false")); 75 | printIterationGap = Integer.parseInt(mProps.getProperty(EecbConstants.CLASSIFIER_ITEARTION_GAP, "2")); 76 | } 77 | 78 | /** 79 | * use a zero vector to train the model 80 | */ 81 | public Parameter train(List<String> paths, int index) { 82 | ResultOutput.writeTextFile(logFile, "\nBegin to learn model : " + modelIndex); 83 | ResultOutput.writeTextFile(logFile, "\nStructured Perceptron with Iteration : " + mIterations); 84 | 85 | // model index 86 | modelIndex = index; 87 | double[] weight = new double[length]; 88 | Parameter para = new Parameter(weight); 89 | 90 | Parameter trainedPara = train(paths, para); 91 | 92 | return trainedPara; 93 | } 94 | 95 | /** 96 | * train the model on a set of files 97 | */ 98 | public Parameter train(List<String> paths, Parameter para) { 99 | double startingRate = Double.parseDouble(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PERCEPTRON_STARTRATE, "0.1")); 100 | double endRate = 0.0; 101 | if (learningRateConstant) { 102 | endRate = startingRate; 103 | } 104 | double[] learningRates = DoubleOperation.createDescendingArray(startingRate, endRate, mIterations); 105 | 106 | // do gradient update 107 | for (int i = 0; i < mIterations; i++) { 108 | double learningRate = learningRates[i]; 109 | weights.add(para.getWeight()); 110 | ResultOutput.writeTextFile(logFile, "the " + modelIndex + "'s model, iteration " + i); 111 | // ResultOutput.printParameter(para, logFile); 112 | 113 | // shuffle the paths 114 | Collections.shuffle(paths); 115 | int beforeViolation = para.getNoOfViolation(); 116 | 117 | // do weight update 118 | para = trainingModel.train(paths, para, learningRate); 119 | 120 | // print the number of violated constraints 121
| int afterViolation = para.getNoOfViolation(); 122 | ResultOutput.writeTextFile(experimentFolder + "/violation/violation-" + modelIndex +".csv", (afterViolation - beforeViolation) + "\t" + para.getNumberOfInstance()); 123 | 124 | } 125 | 126 | // calculate the weight difference between the previous iteration and the current iteration 127 | DoubleOperation.calcualateWeightDifference(weights, experimentFolder + "/weightdifference/weight-difference-"+ modelIndex + ".csv"); 128 | DoubleOperation.printWeightNorm(weights, experimentFolder + "/weightnorm/weight-norm-"+ modelIndex + ".csv"); 129 | 130 | return para; 131 | } 132 | 133 | /** 134 | * train the model from a single file path (currently a no-op that returns the parameter unchanged) 135 | */ 136 | public Parameter train(String path, Parameter para) { 137 | return para; 138 | } 139 | 140 | } -------------------------------------------------------------------------------- /classifier/svm/LibSVMInterface.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm; 2 | 3 | import edu.oregonstate.classifier.svm.libsvm.svm; 4 | import edu.oregonstate.classifier.svm.libsvm.svm_model; 5 | 6 | /** 7 | * An interface to LibSVM 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class LibSVMInterface { 13 | 14 | 15 | public static void main(String[] args) { 16 | String filePath = "/scratch/Software/libsvm-3.17/tools/prune.model"; 17 | 18 | svm_model model = null; 19 | try { 20 | model = svm.svm_load_model(filePath); 21 | } catch (Exception e) { 22 | throw new RuntimeException(e); 23 | } 24 | 25 | System.out.println("done"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_model.java: -------------------------------------------------------------------------------- 1 | // 2 | // svm_model 3 | // 4 | package edu.oregonstate.classifier.svm.libsvm; 5 | 6 | public class svm_model implements java.io.Serializable 7 | { 8 | public svm_parameter param; // parameter 9 | public int nr_class; // number of classes, = 2 in regression/one class svm 10 | public int l; // total #SV 11 | public svm_node[][] SV; // SVs (SV[l]) 12 | public double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l]) 13 | public double[] rho; // constants in decision functions (rho[k*(k-1)/2]) 14 | public double[] probA; // pairwise probability information 15 | public double[] probB; 16 | public int[] sv_indices; // sv_indices[0,...,nSV-1] are values in [1,...,num_training_data] to indicate SVs in the training set 17 | 18 | // for classification only 19 | 20 | public int[] label; // label of each class (label[k]) 21 | public int[] nSV; // number of SVs for each class (nSV[k]) 22 | // nSV[0] + nSV[1] + ... 
+ nSV[k-1] = l 23 | }; 24 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_node.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | public class svm_node implements java.io.Serializable 3 | { 4 | public int index; 5 | public double value; 6 | } 7 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_parameter.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | 3 | public class svm_parameter implements Cloneable,java.io.Serializable 4 | { 5 | /* svm_type */ 6 | public static final int C_SVC = 0; 7 | public static final int NU_SVC = 1; 8 | public static final int ONE_CLASS = 2; 9 | public static final int EPSILON_SVR = 3; 10 | public static final int NU_SVR = 4; 11 | 12 | /* kernel_type */ 13 | public static final int LINEAR = 0; 14 | public static final int POLY = 1; 15 | public static final int RBF = 2; 16 | public static final int SIGMOID = 3; 17 | public static final int PRECOMPUTED = 4; 18 | 19 | public int svm_type; 20 | public int kernel_type; 21 | public int degree; // for poly 22 | public double gamma; // for poly/rbf/sigmoid 23 | public double coef0; // for poly/sigmoid 24 | 25 | // these are for training only 26 | public double cache_size; // in MB 27 | public double eps; // stopping criteria 28 | public double C; // for C_SVC, EPSILON_SVR and NU_SVR 29 | public int nr_weight; // for C_SVC 30 | public int[] weight_label; // for C_SVC 31 | public double[] weight; // for C_SVC 32 | public double nu; // for NU_SVC, ONE_CLASS, and NU_SVR 33 | public double p; // for EPSILON_SVR 34 | public int shrinking; // use the shrinking heuristics 35 | public int probability; // do probability estimates 36 | 37 | public Object clone() 38 | { 39 | try 40 | { 41 | return super.clone(); 42 | } catch (CloneNotSupportedException e) 43 | { 44 | return null; 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_print_interface.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | public interface svm_print_interface 3 | { 4 | public void print(String s); 5 | } 6 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_problem.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | 3 | public class svm_problem implements java.io.Serializable 4 | { 5 | public int l; 6 | public double[] y; 7 | public svm_node[][] x; 8 | } 9 | -------------------------------------------------------------------------------- /cluster/Cluster.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | import Jama.Matrix; 6 | 7 | import edu.oregonstate.data.EecbClusterDocument; 8 | 9 | /** 10 | * cluster representation 11 | * 12 | * Jun Xie (xiejuncs@gmail.com) 13 | */ 14 | public class Cluster { 15 | 16 | public int mID; 17 | public List<EecbClusterDocument> documents; 18 | public List<Cluster> children; 19 | 20 | public Cluster(int id) { 21 | mID = id; 22 | documents = new ArrayList<EecbClusterDocument>(); 23 | children = new ArrayList<Cluster>(); 24 | } 25 | 
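// the accessors and mutators below manage the cluster's member documents and its merge children (the dendrogram structure)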
26 | public void addDocument(EecbClusterDocument document) { 27 | documents.add(document); 28 | } 29 | 30 | public void addDocuments(List<EecbClusterDocument> docus) { 31 | documents.addAll(docus); 32 | } 33 | 34 | public void addChildrens(List<Cluster> child) { 35 | children.addAll(child); 36 | } 37 | 38 | public List<EecbClusterDocument> getDocuments() { 39 | return documents; 40 | } 41 | 42 | public void addChildren(Cluster cluster) { 43 | children.add(cluster); 44 | } 45 | 46 | public List<Cluster> getChildren() { 47 | return children; 48 | } 49 | 50 | public int getID() { 51 | return mID; 52 | } 53 | 54 | /** to = to + from, and then delete from */ 55 | public static void mergeClusters(Cluster to, Cluster from) { 56 | int toID = to.getID(); 57 | to.addChildren(to); 58 | for (EecbClusterDocument m : from.getDocuments()) { 59 | to.addDocument(m); 60 | } 61 | to.addChildren(from); 62 | System.out.println("merge clusters :" + toID + " <----- " + from.getID()); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /cluster/HAC.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | import java.util.Map; 6 | import java.util.HashMap; 7 | import java.util.Collection; 8 | import java.util.Collections; 9 | 10 | import edu.oregonstate.data.EecbClusterDocument; 11 | import edu.oregonstate.cluster.Cluster; 12 | import edu.oregonstate.cluster.agglomeration.AgglomerationMethod; 13 | import edu.oregonstate.cluster.experiment.DissimilarityMeasure; 14 | 15 | /** 16 | * implementation of the Hierarchical Agglomerative Clustering (HAC) method 17 | * STEPS 18 | * 1. start with each object in a separate cluster; 19 | * 2. repeatedly join the closest pair of clusters; 20 | * 3. 
until there is only one cluster 21 | * 22 | * Jun Xie (xiejuncs@gmail.com) 23 | */ 24 | public class HAC { 25 | 26 | public List<Cluster> mClusters; 27 | public List<EecbClusterDocument> mDocuments; 28 | // interfaces to incorporate different dissimilarity measures and different agglomeration methods 29 | private DissimilarityMeasure dissimilarityMeasure; 30 | private List<Cluster> mergeResult; // all dendrogram clusters 31 | private AgglomerationMethod method; 32 | public List<String> mergeSequence; 33 | 34 | public HAC(List<EecbClusterDocument> documents, DissimilarityMeasure dissimilarityMeasure, 35 | AgglomerationMethod agglomerationMethod) { 36 | mDocuments = documents; 37 | mClusters = new ArrayList<Cluster>(); 38 | this.dissimilarityMeasure = dissimilarityMeasure; 39 | mergeResult = new ArrayList<Cluster>(); 40 | method = agglomerationMethod; 41 | mergeSequence = new ArrayList<String>(); 42 | initialize(); 43 | } 44 | 45 | public List<String> getSequence() { 46 | return mergeSequence; 47 | } 48 | 49 | public List<Cluster> getMergeResult() { 50 | return mergeResult; 51 | } 52 | 53 | /** 54 | * perform the clustering 55 | */ 56 | public void cluster() { 57 | Map<String, Double> dissimilarityMatrix = computeDissimilarityMatrix(); 58 | String minIndex = minimum(dissimilarityMatrix); 59 | // merge until there is only one cluster 60 | boolean flag = true; 61 | while(flag) { 62 | String[] indices = minIndex.split("-"); 63 | int to = Integer.parseInt(indices[0]); 64 | int from = Integer.parseInt(indices[1]); 65 | mergeSequence.add(mClusters.get(to).getID() + "-" + mClusters.get(from).getID()); 66 | Cluster.mergeClusters(mClusters.get(to), mClusters.get(from)); 67 | 68 | 69 | Cluster intermediateResult = new Cluster(to); 70 | intermediateResult.addChildrens(mClusters.get(to).getChildren()); 71 | intermediateResult.addDocuments(mClusters.get(to).getDocuments()); 72 | // also need to deep copy the cluster object, not just an ArrayList of cluster objects 73 | mClusters.remove(from); 74 | mergeResult.add(intermediateResult); // shallow copy, needs a deep copy 75 | dissimilarityMatrix = new HashMap<String, Double>(); 76 | dissimilarityMatrix = computeDissimilarityWithDifferentMethod(); 77 | if (dissimilarityMatrix.size() == 0) break; 78 | minIndex = minimum(dissimilarityMatrix); 79 | } 80 | } 81 | 82 | private Map<String, Double> computeDissimilarityWithDifferentMethod() { 83 | Map<String, Double> dissimilarityMatrix = new HashMap<String, Double>(); 84 | /** calculate the dissimilarity score for each pair (i,j) s.t. i != j*/ 85 | for (int i = 0; i < mClusters.size(); i++) { 86 | for (int j = 0; j < i; j++) { 87 | double dissimilarity = method.computeDissimilarity(mClusters.get(i), mClusters.get(j)); 88 | dissimilarityMatrix.put(Integer.toString(i) + "-" + Integer.toString(j), dissimilarity); 89 | } 90 | } 91 | 92 | return dissimilarityMatrix; 93 | } 94 | 95 | 96 | /* compare the HashMap entries to get the index with the minimum value */ 97 | public String minimum(Map<String, Double> scores) { 98 | Collection<Double> c = scores.values(); 99 | Double minvalue = Collections.min(c); 100 | String minIndex = ""; 101 | for (String key : scores.keySet()) { 102 | Double value = scores.get(key); 103 | if (value.equals(minvalue)) { 104 | minIndex = key; 105 | break; 106 | } 107 | } 108 | 109 | return minIndex; 110 | } 111 | 112 | 113 | private Map<String, Double> computeDissimilarityMatrix() { 114 | Map<String, Double> dissimilarityMatrix = new HashMap<String, Double>(); 115 | /** calculate the dissimilarity score for each pair (i,j) s.t. 
i != j*/ 116 | for (int i = 0; i < mDocuments.size(); i++) { 117 | for (int j = 0; j < i; j++) { 118 | double dissimilarity = dissimilarityMeasure.computeDissimilarity(mDocuments, i, j); 119 | dissimilarityMatrix.put(Integer.toString(i) + "-" + Integer.toString(j), dissimilarity); 120 | } 121 | } 122 | 123 | return dissimilarityMatrix; 124 | } 125 | 126 | /** 127 | * initialize: put each document in a separate cluster 128 | */ 129 | private void initialize() { 130 | for (int i = 0; i < mDocuments.size(); i++) { 131 | Cluster cluster = new Cluster(i); 132 | cluster.addDocument(mDocuments.get(i)); 133 | mClusters.add(cluster); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /cluster/IClustering.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | public interface IClustering { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /cluster/TFIDF.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.ArrayList; 7 | import java.util.Set; 8 | import java.util.HashSet; 9 | import java.util.Map; 10 | import java.util.Iterator; 11 | 12 | import edu.oregonstate.general.CounterMap; 13 | 14 | /** 15 | * Implementation of the tf-idf-weighted term vector representation. 16 | * 17 | * the clustering algorithm uses the vector-space model to represent each document. 18 | * In this model, each document $d$ is considered to be a vector in the term-space. 19 | * In particular, the algorithm employs the tf-idf term weighting model, in which 20 | * each document is represented as $(tf_{1}\log(n/df_{1}), tf_{2}\log(n/df_{2}), \ldots)$, where $tf_{i}$ 21 | * is the frequency of the $i$th term in the document and $df_{i}$ is the number of 22 | * documents that contain the $i$th term. 
To account for documents of different lengths, 23 | * the length of each document vector is normalized so that it is of unit length, $|d| = 1$ 24 | * 25 | * Jun Xie (xiejuncs@gmail.com) 26 | */ 27 | public class TFIDF { 28 | 29 | private List<List<String>> mDocuments; 30 | public Set<String> dictionary; 31 | private Map<String, Integer> wordTotalCount; 32 | private Map<String, List<Integer>> invertedIndex; 33 | private Map<String, Map<Integer, Integer>> wordCount; 34 | private CounterMap<String, Integer> tfidf; // word and document index 35 | private int documentCount; 36 | 37 | /** 38 | * initialize all fields 39 | * 40 | * @param documents : each document contains a list of strings 41 | * 42 | */ 43 | public TFIDF(List<List<String>> documents) { 44 | mDocuments = documents; 45 | documentCount = mDocuments.size(); 46 | dictionary = new HashSet<String>(); 47 | invertedIndex = new HashMap<String, List<Integer>>(); 48 | wordCount = new HashMap<String, Map<Integer, Integer>>(); 49 | tfidf = new CounterMap<String, Integer>(); 50 | wordTotalCount = new HashMap<String, Integer>(); 51 | } 52 | 53 | /** 54 | * build the tf-idf representation 55 | * 56 | * @return the normalized tf-idf vectors 57 | */ 58 | public CounterMap<String, Integer> buildTFIDF() { 59 | buildDictionary(); 60 | index(); 61 | calculateTFIDF(); 62 | 63 | return tfidf; 64 | } 65 | 66 | /** 67 | * calculate the tf-idf weights 68 | */ 69 | private void calculateTFIDF() { 70 | for (String token : wordCount.keySet()) { 71 | Map<Integer, Integer> docFreq = wordCount.get(token); 72 | int tokenFreq = invertedIndex.get(token).size(); 73 | for (Integer doc : docFreq.keySet()) { 74 | Integer count = docFreq.get(doc); 75 | double w = 0.0; 76 | if (count > 0) { 77 | w = 1 + Math.log10(count); 78 | } 79 | Double value = w * Math.log10((double) documentCount / tokenFreq); 80 | tfidf.setCount(token, doc, value); 81 | } 82 | 83 | } 84 | } 85 | 86 | /** 87 | * build the inverted index 88 | */ 89 | private void index() { 90 | Iterator<String> it = dictionary.iterator(); 91 | while (it.hasNext()) { 92 | String token = it.next(); 93 | List<Integer> posting = new ArrayList<Integer>(); 94 | Map<Integer, Integer> count = new HashMap<Integer, Integer>(); 95 | for (int i = 0; i < mDocuments.size(); i++) { 96 | List<String> document = mDocuments.get(i); 97 | if (document.contains(token)) { 98 | posting.add(i); 99 | } 100 | int occurrence = Collections.frequency(document, token); 101 | count.put(i, occurrence); 102 | wordCount.put(token, count); 103 | } 104 | invertedIndex.put(token, posting); 105 | } 106 | } 107 | 108 | /** 109 | * terms that appear only once in the whole collection are removed here (tokens with a total count below 2) 110 | */ 111 | private void buildDictionary() { 112 | for (int i = 0; i < mDocuments.size(); i++) { 113 | List<String> document = mDocuments.get(i); 114 | for (String token : document) { 115 | boolean contains = wordTotalCount.containsKey(token); 116 | int count = 0; 117 | if (contains) { 118 | count = wordTotalCount.get(token); 119 | } 120 | wordTotalCount.put(token, (count + 1)); 121 | } 122 | } 123 | 124 | // delete the tokens with a count of 1 125 | Iterator<String> it = wordTotalCount.keySet().iterator(); 126 | while (it.hasNext()) { 127 | String token = it.next(); 128 | if (wordTotalCount.get(token) < 2) { 129 | it.remove(); 130 | } 131 | } 132 | 133 | dictionary = wordTotalCount.keySet(); 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /cluster/agglomeration/AgglomerationMethod.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.agglomeration; 2 | 3 | import edu.oregonstate.cluster.Cluster; 4 | 5 | /** 6 | * Strategy interface for computing the dissimilarity between two clusters during agglomeration 7 | * 8 | * @author Jun Xie (xiejuncs@gmail.com) 9 | * 10 | */ 11 | 12 | public interface AgglomerationMethod { 13 | 14 | /** 15 | * Compute the 
dissimilarity between two clusters 16 | * 17 | * @return the dissimilarity between clusters c1 and c2. 18 | */ 19 | public double computeDissimilarity(Cluster c1, Cluster c2); 20 | } 21 | -------------------------------------------------------------------------------- /cluster/agglomeration/AverageLinkage.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.agglomeration; 2 | 3 | import edu.oregonstate.cluster.Cluster; 4 | import edu.oregonstate.cluster.experiment.DissimilarityMeasure; 5 | import edu.oregonstate.cluster.experiment.EecbDissimilarityMeasure; 6 | import Jama.Matrix; 7 | import edu.oregonstate.data.EecbClusterDocument; 8 | /** 9 | * average linkage, or group average 10 | * Formula 11 | * 12 | * dist(c_{i}, c_{j}) = \frac{1}{n_{i} n_{j}} \sum_{d_{r} \in c_{i}} \sum_{d_{s} \in c_{j}} dist(d_{r}, d_{s}) 13 | * 14 | * @author Jun Xie (xiejuncs@gmail.com) 15 | * 16 | */ 17 | public class AverageLinkage implements AgglomerationMethod { 18 | 19 | /** calculate the dissimilarity between two clusters */ 20 | public double computeDissimilarity(Cluster c1, Cluster c2) { 21 | DissimilarityMeasure measure = new EecbDissimilarityMeasure(); 22 | double dissimilarity = 0.0; 23 | int n1 = c1.getDocuments().size(); 24 | int n2 = c2.getDocuments().size(); 25 | 26 | for (EecbClusterDocument d1 : c1.getDocuments()) { 27 | for (EecbClusterDocument d2 : c2.getDocuments()) { 28 | Matrix m1 = d1.vector; 29 | Matrix m2 = d2.vector; 30 | dissimilarity += 1 - measure.cosineSimilarity(m1, m2); 31 | } 32 | } 33 | 34 | return dissimilarity / (n1 * n2); 35 | } 36 | 37 | public String toString() { 38 | return "Average"; 39 | } 40 | 41 | } -------------------------------------------------------------------------------- /cluster/experiment/DissimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.experiment; 2 | 3 | import java.util.List; 4 | import Jama.Matrix; 5 | import edu.oregonstate.data.EecbClusterDocument; 6 | 7 | /** 8 | * Computes the dissimilarity between two observations in an experiment. 
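 * An implementation typically returns 1 minus the cosine similarity of the two document vectors, as EecbDissimilarityMeasure does below.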
9 | * @author Matthias.Hauswirth@usi.ch 10 | */ 11 | public interface DissimilarityMeasure { 12 | 13 | public double computeDissimilarity(List<EecbClusterDocument> vectors, int observation1, int observation2); 14 | 15 | public double cosineSimilarity(Matrix obs1, Matrix obs2); 16 | } -------------------------------------------------------------------------------- /cluster/experiment/EecbDissimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.experiment; 2 | 3 | import java.util.List; 4 | 5 | import Jama.Matrix; 6 | import edu.oregonstate.data.EecbClusterDocument; 7 | 8 | /** 9 | * Jun Xie (xiejuncs@gmail.com) 10 | */ 11 | public class EecbDissimilarityMeasure implements DissimilarityMeasure { 12 | 13 | /** 14 | * calculate the dissimilarity as 1 minus the cosine similarity 15 | */ 16 | public double computeDissimilarity(List<EecbClusterDocument> vectors, int observation1, int observation2) { 17 | double similarity = 0.0; 18 | Matrix obs1 = vectors.get(observation1).vector; 19 | Matrix obs2 = vectors.get(observation2).vector; 20 | similarity = cosineSimilarity(obs1, obs2); 21 | return 1 - similarity; 22 | } 23 | 24 | /** 25 | * use cosine similarity to compute dissimilarity 26 | * 27 | * @param obs1 28 | * @param obs2 29 | * @return 30 | */ 31 | public double cosineSimilarity(Matrix obs1, Matrix obs2) { 32 | double sum = 0.0; 33 | for (int i = 0; i < obs1.getRowDimension(); i++) { 34 | sum += obs1.get(i, 0) * obs2.get(i, 0); 35 | } 36 | double norm1 = add(obs1); 37 | double norm2 = add(obs2); 38 | 39 | return sum / Math.sqrt(norm1 * norm2); 40 | } 41 | 42 | public double add(Matrix obs) { 43 | double sum = 0.0; 44 | for (int i = 0; i < obs.getRowDimension(); i++) { 45 | sum += obs.get(i, 0) * obs.get(i, 0); 46 | } 47 | return sum; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /costfunction/ICostFunction.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.costfunction; 2 | 3 | import edu.stanford.nlp.stats.Counter; 4 | 5 | /** 6 | * the interface of cost functions; 7 | * 8 | * the most commonly used cost function is a linear combination of features 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public interface ICostFunction { 14 | 15 | // calculate the cost according to the features and the model 16 | public double calculateCostFunction(Counter<String> features, double[] model); 17 | } 18 | 19 | -------------------------------------------------------------------------------- /costfunction/LinearCostFunction.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.costfunction; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.features.FeatureFactory; 6 | import edu.stanford.nlp.stats.Counter; 7 | 8 | public class LinearCostFunction implements ICostFunction { 9 | 10 | public LinearCostFunction() { 11 | } 12 | 13 | /** 14 | * calculate the cost from the feature vector and the model vector 15 | */ 16 | public double calculateCostFunction(Counter<String> features, double[] model) { 17 | double sum = 0.0; 18 | List<String> featureTemplate = FeatureFactory.getFeatureTemplate(); 19 | for (int i = 0; i < featureTemplate.size(); i++) { 20 | String feature = featureTemplate.get(i); 21 | double value = features.getCount(feature); 22 | sum += value * model[i]; 23 | } 24 | return sum; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- 
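A quick sketch of how the linear cost function above is meant to be used: the features of a candidate state are collected in a Counter and scored against a learned weight vector. This is a minimal, hypothetical example, not code from the repository — the feature names and weights are made up (the names merely mirror two of the feature classes under features/individualfeature), and it assumes FeatureFactory.getFeatureTemplate() has been initialized from the experiment configuration to return exactly these two names in this order.

import edu.oregonstate.costfunction.ICostFunction;
import edu.oregonstate.costfunction.LinearCostFunction;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class CostFunctionSketch {
    public static void main(String[] args) {
        // hypothetical feature counts for one candidate state
        Counter<String> features = new ClassicCounter<String>();
        features.setCount("Head", 1.0);
        features.setCount("Lemma", 0.5);

        // hypothetical learned weights, aligned with the feature-template order
        double[] model = new double[] { 0.8, -0.2 };

        // linear score = 1.0 * 0.8 + 0.5 * (-0.2) = 0.7
        ICostFunction costFunction = new LinearCostFunction();
        double score = costFunction.calculateCostFunction(features, model);
        System.out.println("linear score = " + score);
    }
}

In the full system, scores like this one presumably rank the candidate states explored by the search component (see the search package).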
/data/EecbCharSeq.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.MatchException; 4 | import edu.stanford.nlp.trees.Span; 5 | import java.util.Vector; 6 | 7 | /** 8 | * The textual form of the mention as it occurred in the document 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class EecbCharSeq { 14 | 15 | /** The exact text matched by this sequence */ 16 | private String mText; 17 | /** in order to keep track of the start and end of the tokens */ 18 | private Span mTokenOffset; 19 | 20 | private int mSentenceID; 21 | 22 | /** 23 | * The reason for this is that we extract the annotation according to the character indices of the tokens 24 | */ 25 | private Span mByteOffset; 26 | 27 | public EecbCharSeq(String text, int start, int end, int sentenceID) { 28 | mText = text; 29 | mByteOffset = new Span(start, end); 30 | mTokenOffset = null; 31 | mSentenceID = sentenceID; 32 | } 33 | 34 | public int sentenceID() { 35 | return mSentenceID; 36 | } 37 | 38 | public int getByteStart() { 39 | return mByteOffset.start(); 40 | } 41 | 42 | public int getByteEnd() { 43 | return mByteOffset.end(); 44 | } 45 | 46 | public Span getByteOffset() { 47 | return mByteOffset; 48 | } 49 | 50 | public String getText() { 51 | return mText; 52 | } 53 | 54 | public Span getTokenOffset() { 55 | return mTokenOffset; 56 | } 57 | 58 | public int getTokenStart() { 59 | if (mTokenOffset == null) 60 | return -1; 61 | return mTokenOffset.start(); 62 | } 63 | 64 | public int getTokenEnd() { 65 | if (mTokenOffset == null) 66 | return -1; 67 | return mTokenOffset.end(); 68 | } 69 | 70 | /** 71 | * Matches this char seq against the full token stream. As a result of this 72 | * method, mTokenOffset is initialized 73 | */ 74 | public void match(Vector<EecbToken> tokens) throws MatchException { 75 | int start = -1; 76 | int end = -1; 77 | 78 | for (int i = 0; i < tokens.size(); i++) { 79 | if (tokens.get(i).getSentence() != mSentenceID) continue; 80 | 81 | if (tokens.get(i).getByteOffset().start() == mByteOffset.start()) { 82 | start = i; 83 | } else if (mByteOffset.start() > tokens.get(i).getByteOffset().start() 84 | && mByteOffset.start() < tokens.get(i).getByteOffset().end()) { 85 | start = i; 86 | } 87 | 88 | if (tokens.get(i).getByteOffset().end() == mByteOffset.end()) { 89 | end = i; 90 | break; 91 | } else if (mByteOffset.end() >= tokens.get(i).getByteOffset().start() 92 | && mByteOffset.end() < tokens.get(i).getByteOffset().end()) { 93 | end = i; 94 | break; 95 | } 96 | } 97 | 98 | if (start >= 0 && end >= 0) { 99 | mTokenOffset = new Span(start, end); 100 | // mPhrase = makePhrase(tokens, mTokenOffset); 101 | } else { 102 | throw new MatchException("Match failed!"); 103 | } 104 | } 105 | 106 | /** 107 | * for debugging convenience 108 | * 109 | * @param label 110 | * @param offset 111 | * @return 112 | */ 113 | public String toXml(String label, int offset) { 114 | StringBuffer buffer = new StringBuffer(); 115 | EecbElement.appendOffset(buffer, offset); 116 | buffer.append("<" + label + ">\n"); 117 | EecbElement.appendOffset(buffer, offset + 2); 118 | buffer.append("" + mText + ""); 120 | buffer.append("\n"); 121 | EecbElement.appendOffset(buffer, offset); 122 | buffer.append(""); 123 | return buffer.toString(); 124 | } 125 | 126 | @Override 127 | public String toString() { 128 | return "EecbCharSeq [mText = " + mText + ", mByteOffset=" + mByteOffset +", mTokenOffset=" +
mTokenOffset + "]"; 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /data/EecbClusterDocument.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import Jama.Matrix; 4 | 5 | /** 6 | * Jun Xie(xiejuncs@gmail.com) 7 | */ 8 | public class EecbClusterDocument { 9 | 10 | public int mID; 11 | public String mPrefix; 12 | public Matrix vector; 13 | 14 | public EecbClusterDocument(int id, Matrix vec) { 15 | mID = id; 16 | vector = vec; 17 | } 18 | 19 | // set the prefix, the format as 1(cluster)-1(document), 20 | public void setPrefix(String prefix) { 21 | mPrefix = prefix; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /data/EecbElement.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Base class for all EECB annotation elements 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbElement { 10 | 11 | /** unique identifier for this element*/ 12 | protected String mID; 13 | 14 | public EecbElement(String mID) { 15 | this.mID = mID; 16 | } 17 | 18 | public String getId() {return mID; } 19 | 20 | // indentation for debug. 21 | // Entity/Event without indentation 22 | // EntityMention/EventMention with two indentations 23 | public static void appendOffset(StringBuffer buffer, int offset) { 24 | for(int i = 0; i < offset; i ++){ 25 | buffer.append(" "); 26 | } 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /data/EecbEntity.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention; 7 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken; 8 | 9 | /** 10 | * EECB entity. In the EECB corpus, an corefid is used to represent the entity. 11 | * For example: People magazine 12 | * "People Magazine" is a entity mention. Its entity identifier is 27. 13 | *
14 | * 15 | * @author Jun Xie (xie@eecs.oregonstate.edu) 16 | * 17 | */ 18 | public class EecbEntity extends EecbElement { 19 | private List<EecbEntityMention> mMentions; 20 | 21 | public EecbEntity(String id) { 22 | super(id); 23 | mMentions = new ArrayList<EecbEntityMention>(); 24 | } 25 | 26 | public List<EecbEntityMention> getMentions() {return mMentions;} 27 | 28 | public void addMention(EecbEntityMention m) { 29 | mMentions.add(m); 30 | m.setParent(this); 31 | } 32 | 33 | public String toXML(int offset) { 34 | StringBuffer buffer = new StringBuffer(); 35 | appendOffset(buffer, offset); 36 | buffer.append("\n"); 37 | for(EecbEntityMention m: mMentions){ 38 | buffer.append(m.toXml(offset + 2)); 39 | buffer.append("\n"); 40 | } 41 | appendOffset(buffer, offset); 42 | buffer.append(""); 43 | return buffer.toString(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /data/EecbEntityMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Eecb entity mention, for example, a noun phrase 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class EecbEntityMention extends EecbMention { 13 | 14 | @Override 15 | public String toString() { 16 | return "EecbEntityMention [mHead=" + mHead + ", mExtent" + this.mExtent +", mSentence = " + sentenceID() +"]"; 17 | } 18 | 19 | /** The set of event mentions that contain this entity mention */ 20 | private List<EecbEventMention> mEventMentions; 21 | 22 | /** The parent entity */ 23 | private EecbEntity mParent; 24 | 25 | private EecbCharSeq mHead; 26 | 27 | /** Position of the head word of this mention */ 28 | private int mHeadTokenPosition; 29 | 30 | public EecbEntityMention(String id, EecbCharSeq extent, EecbCharSeq head, int sentence) { 31 | super(id, extent, sentence); 32 | mExtent = extent; 33 | mHead = head; 34 | mParent = null; 35 | mHeadTokenPosition = -1; 36 | mEventMentions = new ArrayList<EecbEventMention>(); 37 | } 38 | 39 | public void setParent(EecbEntity e) { mParent = e; } 40 | public EecbEntity getParent() { return mParent; } 41 | public EecbCharSeq getHead() { return mHead; } 42 | public EecbCharSeq getExtent() { return mExtent; } 43 | public int getHeadTokenPosition() { return mHeadTokenPosition; } 44 | 45 | public void addEventMention(EecbEventMention rm) { 46 | mEventMentions.add(rm); 47 | } 48 | public List<EecbEventMention> getEventMentions() { 49 | return mEventMentions; 50 | } 51 | 52 | public String toXml(int offset) { 53 | StringBuffer buffer = new StringBuffer(); 54 | appendOffset(buffer, offset); 55 | buffer.append("\n"); 56 | buffer.append(mExtent.toXml("extent", offset + 2)); 57 | buffer.append("\n"); 58 | buffer.append(mHead.toXml("head", offset + 2)); 59 | buffer.append("\n"); 60 | appendOffset(buffer, offset); 61 | buffer.append(""); 62 | return buffer.toString(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /data/EecbEvent.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Store only Eecb Event 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class EecbEvent extends EecbElement { 13 | 14 | /** The list of mentions for this event */ 15 | private List<EecbEventMention> mMentions; 16 | 17 | public EecbEvent(String id) { 18 | super(id); 19 | mMentions = new ArrayList<EecbEventMention>(); 20 | } 21 | 
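// mentions are stored in insertion order; getMention(index) below relies on that order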
22 | public void addMention(EecbEventMention m) { 23 | mMentions.add(m); 24 | } 25 | 26 | public EecbEventMention getMention(int index) { 27 | return mMentions.get(index); 28 | } 29 | 30 | /** Get the number of event mentions */ 31 | public int getMentionCount() { 32 | return mMentions.size(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /data/EecbEventMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.Collection; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | /** 9 | * Store only EECB event mentions 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class EecbEventMention extends EecbMention { 15 | 16 | /** Maps argument roles to argument mentions */ 17 | private Map<String, EecbEntityMention> mRolesToArguments; 18 | 19 | /** the parent event */ 20 | private EecbEvent mParent; 21 | 22 | /** anchor text for this event, i.e. the phrase annotated in mentions.txt */ 23 | private EecbCharSeq mAnchor; 24 | 25 | /** the scope is the whole sentence, while the extent is the sentence segment the mention is in */ 26 | public EecbEventMention(String id, EecbCharSeq extent, EecbCharSeq anchor, int sentence) { 27 | super(id, extent, sentence); 28 | this.mAnchor = anchor; 29 | mRolesToArguments = new HashMap<String, EecbEntityMention>(); 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return "EecbEventMention [mAnchor = " + mAnchor + ", mParent=" + mParent + 35 | ", mRolesToArguments = " + mRolesToArguments + ", mExtent = " + mExtent + 36 | ", mId = " + mID + ", mSentence = " + mSentenceID + "]"; 37 | } 38 | 39 | public Collection<EecbEntityMention> getArgs() { 40 | return mRolesToArguments.values(); 41 | } 42 | 43 | public Set<String> getRoles() { 44 | return mRolesToArguments.keySet(); 45 | } 46 | 47 | public EecbEntityMention getArg(String role) { 48 | return mRolesToArguments.get(role); 49 | } 50 | 51 | public void addArg(EecbEntityMention em, String role){ 52 | mRolesToArguments.put(role, em); 53 | } 54 | 55 | public void setAnchor(EecbCharSeq anchor) { 56 | mAnchor = anchor; 57 | } 58 | 59 | public EecbCharSeq getAnchor() { 60 | return mAnchor; 61 | } 62 | 63 | public void setParent(EecbEvent e) { 64 | mParent = e; 65 | } 66 | 67 | public EecbEvent getParent() { 68 | return mParent; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /data/EecbMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Superclass of all Eecb mentions (entities, events, etc.) 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbMention extends EecbElement { 10 | 11 | protected EecbCharSeq mExtent; 12 | protected int mSentenceID; 13 | 14 | protected EecbMention(String id, EecbCharSeq mExtent, int sentenceID) { 15 | super(id); 16 | this.mExtent = mExtent; 17 | this.mSentenceID = sentenceID; 18 | } 19 | 20 | public EecbCharSeq getExtent() {return mExtent;} 21 | 22 | public int sentenceID() { 23 | return this.mSentenceID; 24 | } 25 | 26 | public String toXml(int offset) { return ""; } 27 | } 28 | -------------------------------------------------------------------------------- /data/EecbSrlAnnotation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Implementation for aligning the result of SRL and 
the gold annotations 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbSrlAnnotation { 10 | 11 | /** The id. */ 12 | String id; 13 | 14 | /** the text */ 15 | String mText; 16 | 17 | /** The start offset */ 18 | int start; 19 | 20 | /** The end offset */ 21 | int end; 22 | 23 | /** head dependency */ 24 | int parentPosition; 25 | 26 | /*mention headWord*/ 27 | String headString; 28 | 29 | // head index 30 | int headStartIndex; 31 | int headEndIndex; 32 | 33 | /**predicate*/ 34 | String predicate; 35 | 36 | public EecbSrlAnnotation(String id, String text, String predicate, int parentPosition, int start, int end) { 37 | this.id = id; 38 | this.mText = text; 39 | this.predicate = predicate; 40 | this.parentPosition = parentPosition; 41 | this.start = start; 42 | this.end = end; 43 | headStartIndex = 0; 44 | headEndIndex = 0; 45 | } 46 | 47 | public EecbSrlAnnotation() { 48 | 49 | } 50 | 51 | public void setHead(String headString) { 52 | this.headString = headString; 53 | } 54 | 55 | public void setHeadStartIndex(int startIndex) { 56 | headStartIndex = startIndex; 57 | } 58 | 59 | public int getHeadStartIndex() { 60 | return headStartIndex; 61 | } 62 | 63 | public void setHeadEndIndex(int endIndex) { 64 | headEndIndex = endIndex; 65 | } 66 | 67 | public int getHeadEndIndex() { 68 | return headEndIndex; 69 | } 70 | 71 | public String getHead() { 72 | return this.headString; 73 | } 74 | 75 | /** The ID of the annotation. */ 76 | public String getId() 77 | { 78 | return id; 79 | } // getId() 80 | 81 | /** Set the ID of the annotation. */ 82 | public void setId(String i) 83 | { 84 | id = i; 85 | } // setId() 86 | 87 | public String getText() { 88 | return this.mText; 89 | } 90 | 91 | public String getPredicate() { 92 | return this.predicate; 93 | } 94 | 95 | /** The start offset. */ 96 | public int getStartOffset() 97 | { 98 | return start; 99 | } // getStartOffset() 100 | 101 | /** The end offset. */ 102 | public int getEndOffset() 103 | { 104 | return end; 105 | } // getEndOffset() 106 | 107 | public int getLength() 108 | { 109 | return end - start; 110 | } 111 | 112 | /** Set the start offset. */ 113 | public void setStartOffset(int s) 114 | { 115 | start = s; 116 | } // setStartOffset() 117 | 118 | /** Set the end offset. 
*/
119 | public void setEndOffset(int e)
120 | {
121 |     end = e;
122 | } // setEndOffset()
123 | 
124 | /** Output representation of the annotation */
125 | @Override
126 | public String toString() {
127 |     return mText + "(" + start + "/" + end + "; " + headStartIndex + "/" + headEndIndex + ")";
128 | }
129 | 
130 | public void setText(String text) {
131 |     mText = text;
132 | }
133 | 
134 | 
135 | }
136 | 
-------------------------------------------------------------------------------- /data/EecbToken.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.data;
2 | 
3 | import edu.stanford.nlp.ie.machinereading.common.StringDictionary;
4 | import edu.stanford.nlp.trees.Span;
5 | 
6 | /**
7 |  * Every token in the EECB corpus
8 |  *
9 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
10 |  *
11 |  */
12 | public class EecbToken {
13 | 
14 |     /** the actual token bytes.
15 |      * Normally we work with mWord (see below), but mLiteral is needed when
16 |      * we need to check if a sequence of tokens exists in a gazetteer
17 |      */
18 |     private String mLiteral;
19 | 
20 |     /** The index of the literal in the WORDS hash */
21 |     private int mWord;
22 | 
23 |     private int mPos;
24 | 
25 |     private int mChunk;
26 | 
27 |     private Span mByteOffset;
28 | 
29 |     private int mSentence;
30 | 
31 |     /** Dictionary for all words in the corpus */
32 |     public static StringDictionary WORDS;
33 | 
34 |     /** Dictionary for all lemmas in the corpus */
35 |     public static StringDictionary LEMMAS;
36 | 
37 |     /** Dictionary for all other strings in the corpus */
38 |     public static StringDictionary OTHERS;
39 | 
40 |     static {
41 |         WORDS = new StringDictionary("words");
42 |         LEMMAS = new StringDictionary("lemmas");
43 |         OTHERS = new StringDictionary("others");
44 |         WORDS.setMode(true);
45 |         LEMMAS.setMode(true);
46 |         OTHERS.setMode(true);
47 |     }
48 | 
49 |     public int getSentence() {
50 |         return mSentence;
51 |     }
52 | 
53 |     public String getLiteral() {
54 |         return mLiteral;
55 |     }
56 | 
57 |     public int getPos() {
58 |         return mPos;
59 |     }
60 | 
61 |     public int getChunk() {
62 |         return mChunk;
63 |     }
64 | 
65 |     public Span getByteOffset() {
66 |         return mByteOffset;
67 |     }
68 | 
69 |     public int getByteStart() {
70 |         return mByteOffset.start();
71 |     }
72 | 
73 |     public int getByteEnd() {
74 |         return mByteOffset.end();
75 |     }
76 | 
77 |     public static String removeSpaces(String s) {
78 |         if (s == null)
79 |             return s;
80 |         return s.replaceAll(" ", "_");
81 |     }
82 | 
83 |     /**
84 |      * Constructs an EecbToken from a tokenized line
85 |      */
86 |     public EecbToken(String word, String pos, String chunk, int start, int end, int sentence) {
87 |         mLiteral = word;
88 |         if (word == null) {
89 |             mWord = -1;
90 |         } else {
91 |             mWord = WORDS.get(removeSpaces(word), false);
92 |         }
93 |         if (pos == null)
94 |             mPos = -1;
95 |         else
96 |             mPos = OTHERS.get(pos, false);
97 |         if (chunk == null)
98 |             mChunk = -1;
99 |         else
100 |             mChunk = OTHERS.get(chunk, false);
101 | 
102 |         mByteOffset = new Span(start, end);
103 |         mSentence = sentence;
104 |     }
105 | 
106 |     @Override
107 |     public String toString() {
108 |         return mLiteral + ", " + mByteOffset + ", " + mSentence;
109 |     }
110 | 
111 | }
-------------------------------------------------------------------------------- /dataset/IDataSet.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.dataset;
2 | 
3 | import edu.stanford.nlp.dcoref.Document;
4 | 
5 | /**
6 |  * get training data
7 |  *
8 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
9 |  *
10 |  */
11 | public interface IDataSet {
12 | 
13 |     public Document getData(String topics, boolean goldOnly);
14 | }
15 | 
-------------------------------------------------------------------------------- /dataset/TopicGeneration.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.dataset;
2 | 
3 | import java.util.Properties;
4 | 
5 | import edu.oregonstate.general.StringOperation;
6 | import edu.oregonstate.util.EecbConstants;
7 | 
8 | /**
9 |  * generate training topics, testing topics, and development topics
10 |  *
11 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
12 |  *
13 |  */
14 | public class TopicGeneration {
15 | 
16 |     // training topics
17 |     private String[] trainingTopics;
18 | 
19 |     // testing topics
20 |     private String[] testingTopics;
21 | 
22 |     // development topics
23 |     private String[] developmentTopics;
24 | 
25 |     /** every experiment processes just one topic */
26 |     private String topic;
27 | 
28 |     // experiment properties
29 |     private final Properties experimentProps;
30 | 
31 |     public TopicGeneration(Properties experimentProperties) {
32 |         experimentProps = experimentProperties;
33 |         trainingTopics = null;
34 |         testingTopics = null;
35 |         developmentTopics = null;
36 |         generateTopics();
37 |     }
38 | 
39 |     /**
40 |      * generate topics
41 |      *
42 |      */
43 |     public void generateTopics() {
44 |         String[] sets = new String[]{EecbConstants.DATAGENERATION_TRAININGSET_PROP, EecbConstants.DATAGENERATION_DEVELOPMENTSET_PROP, EecbConstants.DATAGENERATION_TESTINGSET_PROP};
45 | 
46 |         for (String set : sets) {
47 |             String topicString = experimentProps.getProperty(set, "");
48 |             if (!topicString.isEmpty()) {
49 | 
50 |                 if (set.equals(EecbConstants.DATAGENERATION_TRAININGSET_PROP)) {
51 |                     trainingTopics = StringOperation.splitString(topicString, ",");
52 |                 }
53 | 
54 |                 if (set.equals(EecbConstants.DATAGENERATION_DEVELOPMENTSET_PROP)) {
55 |                     developmentTopics = StringOperation.splitString(topicString, ",");
56 |                 }
57 | 
58 |                 if (set.equals(EecbConstants.DATAGENERATION_TESTINGSET_PROP)) {
59 |                     testingTopics = StringOperation.splitString(topicString, ",");
60 |                 }
61 |             }
62 |         }
63 | 
64 |     }
65 | 
66 |     /**
67 |      * just one topic processed by the current job
68 |      *
69 |      * @return a topic
70 |      */
71 |     public String topic() {
72 |         if (trainingTopics != null) {
73 |             topic = trainingTopics[0] + "-trainingtopic";
74 |         }
75 | 
76 |         if (testingTopics != null) {
77 |             topic = testingTopics[0] + "-testingtopic";
78 |         }
79 | 
80 |         if (developmentTopics != null) {
81 |             topic = developmentTopics[0] + "-developmenttopic";
82 |         }
83 | 
84 |         return topic;
85 |     }
86 | 
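    // Worked example (hypothetical property values): if the training-set property
    // is "1,2,3" and the testing-set property is "12", trainingTopics() returns
    // {"1","2","3"}, testingTopics() returns {"12"}, and topic() returns
    // "12-testingtopic", because the later if-blocks in topic() above overwrite
    // the earlier ones (development beats testing, which beats training).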
87 |     // return the training topics
88 |     public String[] trainingTopics() {
89 |         return trainingTopics;
90 |     }
91 | 
92 |     // return the testing topics
93 |     public String[] testingTopics() {
94 |         return testingTopics;
95 |     }
96 | 
97 |     // return the development topics
98 |     public String[] developmentTopics() {
99 |         return developmentTopics;
100 |     }
101 | 
102 | }
103 | 
-------------------------------------------------------------------------------- /example/ReadLearnedWeight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | import edu.stanford.nlp.io.IOUtils;
6 | 
7 | public class ReadLearnedWeight {
8 | 
9 |     public static void main(String[] args) {
10 |         String filePath = "/nfs/guille/xfern/users/xie/Experiment/corpus/EECB1.0/tokenoutput/file";
11 |         List<String> lines = IOUtils.linesFromFile(filePath);
12 |         Map<String, Integer> maps = new TreeMap<String, Integer>();
13 |         for (String line : lines) {
14 |             String[] elements = line.split("\t");
15 |             String word = elements[1];
16 |             if (!maps.containsKey(word)) {
17 |                 maps.put(word, 0);
18 |             }
19 |             int counter = maps.get(word) + 1;
20 |             maps.put(word, counter);
21 |         }
22 | 
23 |         for (String word : maps.keySet()) {
24 |             System.out.println(word + " " + maps.get(word));
25 |         }
26 | 
27 |     }
28 | 
29 | }
30 | 
-------------------------------------------------------------------------------- /example/VectorNormalization.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | public class VectorNormalization {
6 | 
7 |     public static void main(String[] args) {
8 | 
9 |         // build a small example vector and normalize it to unit length
10 |         List<Double> numbers = new ArrayList<Double>(Arrays.asList(3.0, 4.0));
11 |         double norm = 0.0;
12 |         for (double number : numbers) {
13 |             norm += number * number;
14 |         }
15 |         norm = Math.sqrt(norm);
16 |         for (int index = 0; index < numbers.size(); index++) {
17 |             numbers.set(index, numbers.get(index) / norm);
18 |         }
19 |         System.out.println(numbers);   // [0.6, 0.8]
20 |     }
21 | }
22 | 
-------------------------------------------------------------------------------- /example/Weight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | public class Weight {
6 | 
7 |     public static void main(String[] args) {
8 | 
9 |         List<Integer> previousIDs = new ArrayList<Integer>();
10 |         previousIDs.add(1);
11 |         previousIDs.add(2);
12 |         previousIDs.add(3);
13 | 
14 |         List<Integer> currentIDs = new ArrayList<Integer>();
15 |         currentIDs.add(1);
16 |         currentIDs.add(3);
17 |         currentIDs.add(4);
18 | 
19 |         currentIDs.removeAll(previousIDs);
20 |         System.out.println(currentIDs);   // prints [4]
21 |     }
22 | }
23 | 
-------------------------------------------------------------------------------- /experiment/CrossCoreferenceResolution.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.io.File;
4 | import java.lang.reflect.Constructor;
5 | import java.util.*;
6 | 
7 | import edu.oregonstate.io.ResultOutput;
8 | import edu.oregonstate.server.Pipeline;
9 | import edu.stanford.nlp.util.StringUtils;
10 | 
11 | /**
12 |  * cross coreference resolution
13 |  *
14 |  * @author Jun Xie (xiejuncs@gmail.com)
15 |  *
16 |  */
17 | public class CrossCoreferenceResolution extends ExperimentConstructor {
18 | 
19 |     private Map<String, String> methodToClasses = new HashMap<String, String>();
20 | 
21 |     private final String configFolder;
22 | 
23 |     /**
24 |      * set experiment properties
25 |      *
26 |      * @param props
27 |      */
28 |     public CrossCoreferenceResolution(Properties props, String configfolder) {
29 |         super(props);
30 | 
31 |         configFolder = configfolder;
32 | 
33 |         /**
34 |          * map each procedure to its corresponding main class
35 |          */
36 |         methodToClasses.put("datageneration", "edu.oregonstate.dataset.DatasetFactory");
37 |         methodToClasses.put("searchtrueloss", "edu.oregonstate.search.SearchFactory");
38 |         methodToClasses.put("learn", "edu.oregonstate.classifier.ClassifierFactory");
39 |         methodToClasses.put("searchlearnedweightwithoutfeature", "edu.oregonstate.search.SearchFactory");
40 |         methodToClasses.put("resultaggregation", "edu.oregonstate.server.ResultAggregation");
41 |         methodToClasses.put("searchlearnedweightwithfeature", "edu.oregonstate.search.SearchFactory");
42 |         methodToClasses.put("lasso", "edu.oregonstate.search.SearchFactory");
43 | 
44 |     }
45 | 
46 |     /**
47 |      * perform the cross coreference resolution experiment
48 |      */
49 |     public void performExperiment() {
50 |         String procedure = experimentProps.getProperty("procedures");
51 |         Pipeline pipeline = new Pipeline();
52 |         pipeline.generateProcedures(procedure);
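        // The "procedures" property is a pipeline description whose items follow
        // a step-phaseIndex convention, e.g. (hypothetical value)
        // "datageneration-1,searchtrueloss-2,learn-3"; below, each item is split
        // on "-" and matched against config files named "<phaseIndex>-<step>...".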
53 |         List<String> procedures = pipeline.getProcedure();
54 | 
55 |         //TODO
56 |         File experimentDirectory = new File(configFolder);
57 |         String[] experiments = experimentDirectory.list();
58 | 
59 |         for (String stepInformation : procedures) {
60 |             System.out.println(stepInformation);
61 |             String[] elements = stepInformation.split("-");
62 |             String step = elements[0];
63 |             String phaseIndex = elements[1];
64 |             String prefix = phaseIndex + "-" + step;
65 |             String mainClass = methodToClasses.get(step);
66 |             for (String experiment : experiments) {
67 |                 if (experiment.startsWith(prefix)) {
68 |                     try {
69 | 
70 |                         Class<?> experimentClass = Class.forName(mainClass);
71 |                         Class<?>[] proto = new Class<?>[1];
72 |                         proto[0] = Properties.class;
73 |                         Object[] params = new Object[1];
74 | 
75 |                         // get the properties of the experiment
76 |                         String[] propArgs = new String[]{"-props", configFolder + "/" + experiment};
77 |                         Properties prop = StringUtils.argsToProperties(propArgs);
78 | 
79 |                         params[0] = prop;
80 |                         Constructor<?> ct = experimentClass.getConstructor(proto);
81 |                         ExperimentConstructor experimenter = (ExperimentConstructor) ct.newInstance(params);
82 |                         experimenter.performExperiment();
83 |                     } catch (Exception e) {
84 |                         throw new RuntimeException(e);
85 |                     }
86 |                 }
87 |             }
88 |         }
89 | 
90 |     }
91 | 
92 |     /**
93 |      * The main entry point of the experiment
94 |      *
95 |      * @param args
96 |      */
97 |     public static void main(String[] args) {
98 |         if (args.length > 1) {
99 |             System.out.println("too many arguments: specify at most one configuration file path");
100 |             System.exit(1);
101 |         }
102 | 
103 |         String configFolder = "../corpus/alignexperiment";
104 |         if (args.length == 0) {
105 |             // run the experiment on the local machine for debugging
106 |             args = new String[1];
107 |             args[0] = configFolder + "/config.properties";
108 |         }
109 | 
110 |         String[] propArgs = new String[]{"-props", args[0]};
111 | 
112 |         Properties props = StringUtils.argsToProperties(propArgs);
113 |         ExperimentConstructor experiment = new CrossCoreferenceResolution(props, configFolder);
114 |         ResultOutput.printTime(experimentLogFile, "The start of the experiment: ");
115 |         experiment.performExperiment();
116 |         ResultOutput.printTime(experimentLogFile, "The end of the experiment");
117 |     }
118 | }
119 | 
-------------------------------------------------------------------------------- /experiment/ExperimentConfigurationFactory.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Properties;
8 | 
9 | import edu.oregonstate.featureExtractor.WordSimilarity;
10 | import edu.oregonstate.io.ResultOutput;
11 | import edu.oregonstate.training.Development;
12 | import edu.oregonstate.util.EecbConstants;
13 | import edu.stanford.nlp.io.IOUtils;
14 | import edu.stanford.nlp.stats.ClassicCounter;
15 | import edu.stanford.nlp.util.Triple;
16 | 
17 | /**
18 |  * find out what the configuration set for the experiment is
19 |  *
20 |  * @author jun (xiejuncs@gmail.com)
21 |  *
22 |  */
23 | public class ExperimentConfigurationFactory {
24 | 
25 |     // the properties for the experiment
26 |     private final Properties props;
27 | 
28 |     // corpus folder
29 |     private final String corpusPath;
30 | 
31 |     public ExperimentConfigurationFactory(Properties properties) {
32 |         props = properties;
33 |         corpusPath = ExperimentConstructor.experimentCorpusPath;
34 |     }
35 | 
36 |     // define the experiment name as the result folder name
37 |     public String defineExperimentName() {
38 |         StringBuilder sb = new StringBuilder();
39 | 
40 |         // read the EXPERIMENT_PROP value, look up each of its elements,
41 |         // and concatenate the element values together,
42 |         // e.g. Pairwise-StructuredPerceptron
43 |         String experimentProp = props.getProperty(EecbConstants.EXPERIMENT_PROP);
44 |         String[] experimentElements = experimentProp.split(",");
45 |         int length = experimentElements.length;
46 |         for (int index = 0; index < length; index++) {
47 | 
48 |             String key = experimentElements[index].trim();
49 |             String value = props.getProperty(key.trim());
50 | 
51 |             if (index == (length - 1)) {
52 |                 sb.append(value);
53 |             } else {
54 |                 sb.append(value + "-");
55 |             }
56 |         }
57 | 
58 |         return sb.toString().trim();
59 |     }
60 | 
61 |     // configure WordNet at the beginning of the experiment
62 |     public void configureWordNet() {
63 |         String wordnetPath = props.getProperty(EecbConstants.WORDNET_PROP);
64 |         System.setProperty("wordnet.database.dir", wordnetPath);
65 |     }
66 | 
67 |     // load data from the word similarity dictionary
68 |     public Map<String, ClassicCounter<String>> loadSimilarityDictionary(String similarityPath) {
69 |         WordSimilarity similarity = new WordSimilarity(similarityPath);
70 |         similarity.load();
71 |         return similarity.getDatas();
72 |     }
73 | 
74 |     /**
75 |      * get mention boundaries from the gold mention file
76 |      * @return
77 |      */
78 |     public Map<String, Map<String, List<Triple<String, Integer, Integer>>>> loadGoldMentionBoundary() {
79 |         String mentionPath = corpusPath + "/mentions.txt";
80 |         List<String> records = IOUtils.linesFromFile(mentionPath);
81 |         Map<String, Map<String, List<Triple<String, Integer, Integer>>>> goldMentionBoundary = new HashMap<String, Map<String, List<Triple<String, Integer, Integer>>>>();
82 |         // the format of the gold mention file:
83 |         // # N or V? (0) Topic(1) Doc(2) Sentence Number(3) CorefID(4) StartIdx(5) EndIdx(6) StartCharIdx(7) EndCharIdx(8)
84 |         // # CharIdx doesn't include spaces
85 |         // # sentence numbers start from 0
86 |         for (String record : records) {
87 |             String[] elements = record.split("\t");
88 | 
89 |             // index by topic
90 |             String topic = elements[1];
91 |             boolean containTopic = goldMentionBoundary.containsKey(topic);
92 |             if (!containTopic) {
93 |                 goldMentionBoundary.put(topic, new HashMap<String, List<Triple<String, Integer, Integer>>>());
94 |             }
95 | 
96 |             // index by the combination of document and sentence
97 |             String document = elements[2];
98 |             String sentenceNumber = elements[3];
99 |             String DocSen = document + "-" + sentenceNumber;
100 |             boolean containDocSen = goldMentionBoundary.get(topic).containsKey(DocSen);
101 |             if (!containDocSen) {
102 |                 goldMentionBoundary.get(topic).put(DocSen, new ArrayList<Triple<String, Integer, Integer>>());
103 |             }
104 | 
105 |             // add the record as a triple : corefID, startCharIdx, endCharIdx
106 |             String corefID = elements[0] + "-" + elements[4];
107 |             int startCharIdx = Integer.parseInt(elements[7]);
108 |             int endCharIdx = Integer.parseInt(elements[8]);
109 |             Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>(corefID, startCharIdx, endCharIdx);
110 |             goldMentionBoundary.get(topic).get(DocSen).add(triple);
111 |         }
112 | 
113 |         return goldMentionBoundary;
114 |     }
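    // Worked example (hypothetical record): a mentions.txt line
    //   "N\t1\t3\t0\t7\t2\t4\t10\t18"
    // is stored under topic "1", key "3-0" (document 3, sentence 0), as the
    // triple ("N-7", 10, 18): mention type and coref ID fused into one string,
    // followed by the start and end character indices.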
115 | 
116 |     // tune the stopping rate
117 |     public static double tuneStoppingRate(double[] weight, int j) {
118 |         double stoppingrate = 0.0;
119 | 
120 |         String stopping = ExperimentConstructor.experimentProps.getProperty(EecbConstants.SEARCH_STOPPINGCRITERION, "none");
121 |         if (stopping.equals("tuning")) {
122 |             Development development = new Development(j, weight, 1.0, 3.0, 10);
123 |             stoppingrate = development.tuning();
124 |             ResultOutput.writeTextFile(ExperimentConstructor.experimentLogFile, "\nthe stopping rate is : " + stoppingrate + " for " + j + "\n");
125 |         }
126 | 
127 |         return stoppingrate;
128 |     }
129 | }
130 | 
-------------------------------------------------------------------------------- /experiment/ExperimentConstructor.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.util.Map;
4 | 
5 | import java.util.Properties;
6 | 
7 | import edu.oregonstate.util.Command;
8 | import edu.oregonstate.util.EecbConstants;
9 | import edu.stanford.nlp.stats.ClassicCounter;
10 | 
11 | /**
12 |  * the abstract class of the experiment
13 |  *
14 |  * @author Jun Xie (xiejuncs@gmail.com)
15 |  *
16 |  */
17 | public abstract class ExperimentConstructor {
18 | 
19 |     // used for recording the information of the whole experiment
20 |     public static String experimentLogFile;
21 | 
22 |     // experiment result folder
23 |     public static String experimentFolder;
24 | 
25 |     // property file
26 |     public static Properties experimentProps;
27 | 
28 |     // corpus path
29 |     public static String experimentCorpusPath;
30 | 
31 |     // debug mode
32 |     public static boolean debugMode;
33 | 
34 |     // Dekang Lin's noun similarity thesaurus
35 |     public static Map<String, ClassicCounter<String>> nounSimilarityThesaurus;
36 | 
37 |     // Dekang Lin's verb similarity thesaurus; to get its top 10, use the lemma word form
38 |     public static Map<String, ClassicCounter<String>> verbSimilarityThesaurus;
39 | 
40 |     // Dekang Lin's adjective similarity thesaurus
41 |     public static Map<String, ClassicCounter<String>> adjectiveSimilarityThesaurus;
42 | 
43 |     // post-process the corpus for predicted mentions
44 |     public static boolean postProcess;
45 | 
46 |     /** whether the experiment uses gold mentions or predicted mentions */
47 |     public static boolean goldMentions;
48 | 
49 |     /**
50 |      * configure the experiment
51 |      *
52 |      * @param props
53 |      */
54 |     public ExperimentConstructor(Properties props) {
55 |         experimentProps = props;
56 | 
57 |         // debug mode
58 |         debugMode = Boolean.parseBoolean(props.getProperty(EecbConstants.DEBUG_PROP, "false"));
59 | 
60 |         // the corpus folder, which stores the EECB corpus, and the TEMPORARY folder, which is used to print the log file
61 |         experimentCorpusPath = props.getProperty(EecbConstants.CORPUS_PROP);
62 | 
63 |         StringBuilder sb = new StringBuilder();
64 |         //String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
65 |         sb.append(experimentCorpusPath + "/TEMPORYRESUT/");
66 | 
67 |         ExperimentConfigurationFactory factory = new ExperimentConfigurationFactory(props);
68 |         String name = factory.defineExperimentName();
69 |         sb.append(name);
70 | 
71 |         // create the result folder
72 |         experimentFolder = sb.toString().trim();
73 |         Command.mkdir(experimentFolder);
74 | 
75 |         // create a folder to store the CoNLL results
76 |         Command.mkdir(experimentFolder + "/conll");
77 | 
78 |         // create a folder to store the serialized results
79 |         Command.mkdir(experimentFolder + "/document");
80 | 
81 |         // create a folder to store the model result
82 |         Command.mkdir(experimentFolder + "/model");
83 | 
84 |         // create a folder to store the violation result
85 |         Command.mkdir(experimentFolder + "/violation");
86 | 
87 |         // create a folder to store weight differences
88 |         Command.mkdir(experimentFolder + "/weightdifference");
89 | 
90 |         // create a folder to store weight norms
91 |         Command.mkdir(experimentFolder + "/weightnorm");
92 | 
93 |         // create a folder to store the constraints; the name of each file is just the topic name
94 |         Command.mkdir(experimentFolder + "/constraints");
95 | 
96 |         // specify the log file path
97 |         experimentLogFile = sb.toString().trim() + "/experimentlog";
98 | 
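        // Resulting layout under <corpus path>/TEMPORYRESUT/<experiment name>/ :
        //   conll/  document/  model/  violation/  weightdifference/
        //   weightnorm/  constraints/  experimentlog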
99 |         // configure WordNet
100 |         factory.configureWordNet();
101 | 
102 |         // Dekang Lin's similarity thesauri, respectively for nouns, verbs, and adjectives
103 |         nounSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simN.lsp");
104 |         verbSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simV.lsp");
105 |         adjectiveSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simA.lsp");
106 | 
107 |         // whether we need to post-process predicted mentions:
108 |         // because the gold mentions also include singleton clusters,
109 |         // post-process in both the gold-mention and the predicted-mention setting
110 |         goldMentions = Boolean.parseBoolean(experimentProps.getProperty(EecbConstants.DATAGENERATION_GOLDMENTION_PROP));
111 |         postProcess = true;
112 |     }
113 | 
114 |     // perform the experiments
115 |     public abstract void performExperiment();
116 | 
117 | }
118 | 
-------------------------------------------------------------------------------- /featureExtractor/SRLDocument.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.*;
4 | 
5 | /**
6 |  * SRL document: a collection of annotated tokens;
7 |  * callers need to specify the document ID
8 |  *
9 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
10 |  *
11 |  */
12 | public class SRLDocument {
13 | 
14 |     /** document ID */
15 |     private final String mDocumentID;
16 | 
17 |     /** sentences: each sentence is a list of token rows, each row an array of column strings */
18 |     private List<List<String[]>> sentences;
19 | 
20 |     public SRLDocument(String documentID) {
21 |         mDocumentID = documentID;
22 |         sentences = new ArrayList<List<String[]>>();
23 |     }
24 | 
25 |     public String getDocumentID() {
26 |         return mDocumentID;
27 |     }
28 | 
29 |     public void addSentence(List<String[]> sentence) {
30 |         sentences.add(sentence);
31 |     }
32 | 
33 |     public List<List<String[]>> getSentences() {
34 |         return sentences;
35 |     }
36 | 
37 | }
38 | 
-------------------------------------------------------------------------------- /featureExtractor/SRLDocumentReader.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.*;
4 | 
5 | import edu.stanford.nlp.io.IOUtils;
6 | 
7 | 
8 | /**
9 |  * read an SRL result document
10 |  *
11 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
12 |  *
13 |  */
14 | public class SRLDocumentReader {
15 | 
16 |     /** document path */
17 |     private final String mDocumentPath;
18 | 
19 |     public SRLDocumentReader(String documentPath) {
20 |         mDocumentPath = documentPath;
21 |     }
22 | 
23 |     /**
24 |      * read the raw input and format it as an SRLDocument,
25 |      * separating the sentences
26 |      *
27 |      * @return
28 |      */
29 |     public SRLDocument readDocument() {
30 |         // read the srl result from the output of the semantic role labeling software
31 |         List<String> srlResults = IOUtils.linesFromFile(mDocumentPath);
32 | 
33 |         // define an SRLDocument
34 |         String[] elements = mDocumentPath.split("/");
35 |         String topic = elements[elements.length - 1].split("\\.")[0];
36 |         SRLDocument document = new SRLDocument(topic);
37 | 
38 |         // format the srl result as the SRLDocument
39 |         List<String[]> sentence = new ArrayList<String[]>();
40 |         for (int index = 0; index <= srlResults.size(); index++) {
41 |             if (index == srlResults.size()) {
42 |                 document.addSentence(sentence);
43 |                 break;
44 |             }
45 | 
46 |             String line = srlResults.get(index);
47 |             if ((line.equals(""))) {
48 |                 document.addSentence(sentence);
49 |                 sentence = new ArrayList<String[]>();
50 |                 continue;
51 |             }
52 | 
53 |             String[] token = line.split("\t");
54 |             sentence.add(token);
55 |         }
56 | 
57 |         return document;
58 |     }
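    // Expected input shape (a sketch; the exact columns depend on the SRL tool):
    // one token per line with tab-separated columns, and a blank line closing
    // each sentence, e.g.
    //   1	Police	police	NN	...
    //   2	arrested	arrest	VBD	...
    //   <blank line>
    // readDocument() above turns every such block into one List<String[]>.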
59 | 
60 |     /**
61 |      * the main entry of the program
62 |      *
63 |      * @param args
64 |      */
65 |     public static void main(String[] args) {
66 |         args = new String[]{"data/srl/16.output"};
67 | 
68 |         String documentPath = args[0];
69 | 
70 |         SRLDocumentReader reader = new SRLDocumentReader(documentPath);
71 |         SRLDocument document = reader.readDocument();
72 |         System.out.println(document.getDocumentID() + ": " + document.getSentences().size() + " sentences");
73 |     }
74 | }
75 | 
-------------------------------------------------------------------------------- /featureExtractor/SimilarityVector.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.HashMap;
4 | import edu.oregonstate.util.CosineSimilarity;
5 | import edu.stanford.nlp.stats.Counter;
6 | 
7 | /**
8 |  * calculate the similarity score of two similarity vectors by
9 |  * calling the function in CosineSimilarity.
10 |  * NOTE:
11 |  * in order to use CosineSimilarity, convert from the Counter data structure to a HashMap first
12 |  *
13 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
14 |  *
15 |  */
16 | public class SimilarityVector {
17 | 
18 |     private Counter<String> mcounter;
19 | 
20 |     public SimilarityVector(Counter<String> counter) {
21 |         mcounter = counter;
22 |     }
23 | 
24 |     public Counter<String> getCounter() {
25 |         return mcounter;
26 |     }
27 | 
28 |     /**
29 |      * calculate the cosine similarity of two similarity vectors
30 |      *
31 |      * @param c1
32 |      * @param c2
33 |      * @return cosine similarity
34 |      */
35 |     public static double getCosineSimilarity(SimilarityVector c1, SimilarityVector c2) {
36 |         if (c1.mcounter.size() == 0 || c2.mcounter.size() == 0) return 0;
37 |         Counter<String> counter1 = c1.mcounter;
38 |         Counter<String> counter2 = c2.mcounter;
39 |         HashMap<String, Double> hcounter1 = convertCounter(counter1);
40 |         HashMap<String, Double> hcounter2 = convertCounter(counter2);
41 |         double score = CosineSimilarity.calculateCosineSimilarity(hcounter1, hcounter2);
42 |         return score;
43 |     }
44 | 
45 |     /**
46 |      * convert from the Counter data structure to the HashMap data structure,
47 |      * then call the CosineSimilarity defined in the util package
48 |      *
49 |      * @param counter
50 |      * @return
51 |      */
52 |     public static HashMap<String, Double> convertCounter(Counter<String> counter) {
53 |         HashMap<String, Double> hcounter = new HashMap<String, Double>();
54 |         for (String key : counter.keySet()) {
55 |             hcounter.put(key, counter.getCount(key));
56 |         }
57 |         return hcounter;
58 |     }
59 | 
60 | }
61 | 
-------------------------------------------------------------------------------- /featureExtractor/WordSimilarity.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.DataInputStream;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | 
11 | import edu.stanford.nlp.stats.ClassicCounter;
12 | 
13 | /**
14 |  * extract the top-ten most-similar words in Dekang Lin's similarity thesaurus for all the nouns/adjectives/verbs in a
15 |  * cluster
16 |  *
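 * Entry format (a sketch, inferred from load() below): each thesaurus entry in
 * the .lsp files begins with a header line "(word ...", followed by one
 * tab-separated "similarWord<TAB>score" line per neighbor and a closing ")"
 * line; load() keeps the head word plus its first ten neighbors in a
 * ClassicCounter.
 *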

17 |  * Proximity-based Thesaurus: (http://webdocs.cs.ualberta.ca/~lindek/downloads.htm)
18 |  *
19 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
20 |  *
21 |  */
22 | public class WordSimilarity {
23 | 
24 |     // file path
25 |     private final String filePath;
26 | 
27 |     // data used for the mention word feature
28 |     private Map<String, ClassicCounter<String>> datas;
29 | 
30 |     public WordSimilarity(String path) {
31 |         this.filePath = path;
32 |         datas = new HashMap<String, ClassicCounter<String>>();
33 |     }
34 | 
35 |     // return the data
36 |     public Map<String, ClassicCounter<String>> getDatas() {
37 |         return datas;
38 |     }
39 | 
40 |     /** load the word similarity dictionary */
41 |     public void load() {
42 |         try {
43 |             FileInputStream fstream = new FileInputStream(filePath);
44 |             DataInputStream in = new DataInputStream(fstream);
45 |             BufferedReader br = new BufferedReader(new InputStreamReader(in));
46 | 
47 |             String strLine;
48 |             boolean pass = true;
49 |             String currentIndex = "";
50 |             ClassicCounter<String> mentionWords = new ClassicCounter<String>();
51 |             int i = 0;
52 |             while ((strLine = br.readLine()) != null) {
53 |                 if (strLine.startsWith("(")) {
54 |                     pass = false;
55 |                     String[] words = strLine.split(" ");
56 |                     currentIndex = words[0].substring(1);
57 |                     mentionWords = new ClassicCounter<String>();
58 |                     mentionWords.incrementCount(currentIndex);
59 |                     i = 1;
60 |                 }
61 |                 if (pass) continue;
62 |                 if (!strLine.startsWith("(") && !strLine.startsWith(")") && i < 12) {
63 |                     String[] words = strLine.split("\t");
64 |                     mentionWords.incrementCount(words[0]);
65 |                     i += 1;
66 |                 }
67 | 
68 |                 if (i == 11) {
69 |                     datas.put(currentIndex, mentionWords);
70 |                     pass = true;
71 |                 }
72 |             }
73 | 
74 |             br.close();
75 |             in.close();
76 |             fstream.close();
77 |         } catch (IOException ex) {
78 |             ex.printStackTrace();
79 |             System.exit(1);
80 |         }
81 |     }
82 | 
83 | }
-------------------------------------------------------------------------------- /featureExtractor/Wordnet.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.Arrays;
4 | import java.util.Set;
5 | import java.util.HashSet;
6 | 
7 | import edu.smu.tspell.wordnet.NounSynset;
8 | import edu.smu.tspell.wordnet.Synset;
9 | import edu.smu.tspell.wordnet.SynsetType;
10 | import edu.smu.tspell.wordnet.VerbSynset;
11 | import edu.smu.tspell.wordnet.WordNetDatabase;
12 | import edu.smu.tspell.wordnet.WordSense;
13 | 
14 | 
15 | /**
16 |  * find the links between synonyms; we need to calculate the percentage of newly-introduced mention links after a merge
17 |  * that are WordNet synonyms
18 |  */
19 | public class Wordnet {
20 | 
21 |     // get a WordNet database instance
22 |     private final WordNetDatabase wordnet;
23 | 
24 |     public Wordnet() {
25 |         wordnet = WordNetDatabase.getFileInstance();
26 |     }
27 | 
28 |     /**
29 |      * get the synonyms of a lemma, given its synset type
30 |      *
31 |      * @param lemma
32 |      * @param type
33 |      * @return
34 |      */
35 |     public Set<String> getSynonym(String lemma, SynsetType type) {
36 |         Set<String> synonyms = new HashSet<String>();
37 |         Synset[] synsets = wordnet.getSynsets(lemma, type);
38 |         for (Synset synset : synsets) {
39 |             String[] wordforms = synset.getWordForms();
40 |             synonyms.addAll(Arrays.asList(wordforms));
41 |         }
42 | 
43 |         return synonyms;
44 |     }
45 | 
46 |     /**
47 |      * get derivationally related forms
48 |      *
49 |      * @param lemma
50 |      * @param type
51 |      * @return
52 |      */
53 |     public Set<String> getDerivationallyRelatedForms(String lemma, SynsetType type) {
54 |         Set<String> derivationallyForm = new HashSet<String>();
55 |         Synset[] synsets = wordnet.getSynsets(lemma, type);
56 |         for (Synset synset : synsets) {
57 |             WordSense[] senses = synset.getDerivationallyRelatedForms(lemma);
58 |             for (WordSense sense : senses) {
59 |                 derivationallyForm.add(sense.getWordForm());
60 |             }
61 |         }
62 | 
63 |         return derivationallyForm;
64 |     }
65 | 
66 |     /**
67 |      * get noun hypernyms
68 |      *
69 |      * @param lemma
70 |      * @return
71 |      */
72 |     public Set<String> getNounHypernym(String lemma) {
73 |         Set<String> hypernyms = new HashSet<String>();
74 |         Synset[] synsets = wordnet.getSynsets(lemma, SynsetType.NOUN);
75 |         for (Synset synset : synsets) {
76 |             NounSynset nounSynset = (NounSynset) synset;
77 |             NounSynset[] hypernymSynset = nounSynset.getHypernyms();
78 |             for (NounSynset set : hypernymSynset) {
79 |                 hypernyms.addAll(Arrays.asList(set.getWordForms()));
80 |             }
81 |         }
82 | 
83 |         return hypernyms;
84 |     }
85 | 
86 |     /**
87 |      * get verb hypernyms
88 |      *
89 |      * @param lemma
90 |      * @return
91 |      */
92 |     public Set<String> getVerbHypernym(String lemma) {
93 |         Set<String> hypernyms = new HashSet<String>();
94 |         Synset[] synsets = wordnet.getSynsets(lemma, SynsetType.VERB);
95 |         for (Synset synset : synsets) {
96 |             VerbSynset verbSynset = (VerbSynset) synset;
97 |             VerbSynset[] hypernymSynset = verbSynset.getHypernyms();
98 |             for (VerbSynset set : hypernymSynset) {
99 |                 hypernyms.addAll(Arrays.asList(set.getWordForms()));
100 |             }
101 |         }
102 | 
103 |         return hypernyms;
104 |     }
105 | 
106 |     /**
107 |      * set the WordNet path first
108 |      *
109 |      * @param wordnetPath
110 |      */
111 |     public static void setWordNet(String wordnetPath) {
112 |         System.setProperty("wordnet.database.dir", wordnetPath);
113 |     }
114 | 
115 |     /**
116 |      * WordNet examples
117 |      *
118 |      * @param args
119 |      */
120 |     public static void main(String[] args) {
121 |         String wordnetPath = "/home/jun/JavaFile/corpus/WordNet-3.0/dict";
122 |         Wordnet.setWordNet(wordnetPath);
123 | 
124 |         Wordnet wordnet = new Wordnet();
125 |         Set<String> synonyms = wordnet.getSynonym("region", SynsetType.NOUN);
126 |         Set<String> nounHypernyms = wordnet.getNounHypernym("tent");
127 |         Set<String> verbHypernyms = wordnet.getVerbHypernym("shout");
128 |         Set<String> derivationallyForm = wordnet.getDerivationallyRelatedForms("develop", SynsetType.VERB);
129 | 
130 |         System.out.println("done");
131 |     }
132 | 
133 | }
134 | 
-------------------------------------------------------------------------------- /features/Feature.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features;
2 | 
3 | import java.util.HashMap;
4 | import java.util.Set;
5 | 
6 | import edu.stanford.nlp.dcoref.CorefCluster;
7 | import edu.stanford.nlp.dcoref.Document;
8 | import edu.stanford.nlp.stats.ClassicCounter;
9 | import edu.stanford.nlp.stats.Counter;
10 | import edu.oregonstate.featureExtractor.SimilarityVector;
11 | import edu.oregonstate.general.SetOperation;
12 | 
13 | /**
14 |  * the abstract feature definition; every individual feature extends this class
15 |  *
16 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
17 |  *
18 |  */
19 | public abstract class Feature {
20 | 
21 |     // feature name
22 |     protected String featureName;
23 | 
24 |     public Feature() {
25 |         featureName = getClass().getSimpleName();
26 |     }
27 | 
28 |     // the extending class overrides this method to
29 |     // signal its type : Nominal
30 |     public boolean isNominal() {
31 |         return false;
32 |     }
33 | 
34 |     // the extending class overrides this method to
35 |     // signal its type : Numeric
36 |     public boolean isNumeric() {
37 |         return false;
38 |     }
39 | 
40 |     // return the feature name
41 |     public String getFeatureName() {
42 |         return featureName;
43 |     }
44 | 
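    // Worked example (hypothetical centroids): if former.predictedCentroid maps
    // "Gender" -> {male: 2.0, female: 1.0} and latter.predictedCentroid maps
    // "Gender" -> {male: 1.0}, then calculateCosineSimilarity(former, latter,
    // "Gender", ...) below returns (2*1) / (sqrt(2^2 + 1^2) * sqrt(1^2)) ~= 0.894.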
45 |     // generate the feature value according to the document, the two clusters, and the mention type
46 |     public abstract double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType);
47 | 
48 |     /**
49 |      * calculate a specific feature similarity given two clusters
50 |      *
51 |      * @param former
52 |      * @param latter
53 |      * @param name
54 |      * @return
55 |      */
56 |     protected double calculateCosineSimilarity(CorefCluster former, CorefCluster latter, String name, String mentionType) {
57 |         double cosineSimilarity = 0.0;
58 | 
59 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
60 |             return cosineSimilarity;
61 |         }
62 | 
63 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
64 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
65 | 
66 |         Counter<String> formerVector = formerCentroid.get(name);
67 |         Counter<String> latterVector = latterCentroid.get(name);
68 | 
69 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
70 |             return cosineSimilarity;
71 |         }
72 | 
73 |         cosineSimilarity = SimilarityVector.getCosineSimilarity(new SimilarityVector(formerVector), new SimilarityVector(latterVector));
74 | 
75 |         return cosineSimilarity;
76 |     }
77 | 
78 |     /**
79 |      * how many shared arguments two clusters have in a given role
80 |      *
81 |      * @param former
82 |      * @param latter
83 |      * @param name
84 |      * @return
85 |      */
86 |     protected double calculateAgreement(CorefCluster former, CorefCluster latter, String name, String mentionType) {
87 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
88 |             return 0.0;
89 |         }
90 | 
91 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
92 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
93 | 
94 |         Counter<String> formerVector = formerCentroid.get(name);
95 |         Counter<String> latterVector = latterCentroid.get(name);
96 | 
97 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
98 |             return 0.0;
99 |         }
100 | 
101 |         Set<String> commonElementSet = SetOperation.intersection(formerVector, latterVector);
102 | 
103 |         return commonElementSet.size();
104 |     }
105 | 
106 |     /**
107 |      *
108 |      * how many non-shared arguments two clusters have in a given role
109 |      *
110 |      * @param former
111 |      * @param latter
112 |      * @param name
113 |      * @return
114 |      */
115 |     protected double calculateNonAgreement(CorefCluster former, CorefCluster latter, String name, String mentionType) {
116 |         String featureName = name.substring(1);   // e.g. "NSrlA0" -> "SrlA0"
117 | 
118 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
119 |             return 0.0;
120 |         }
121 | 
122 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
123 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
124 | 
125 |         Counter<String> formerVector = formerCentroid.get(featureName);
126 |         Counter<String> latterVector = latterCentroid.get(featureName);
127 | 
128 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
129 |             return 0.0;
130 |         }
131 | 
132 |         Set<String> union = SetOperation.union(formerVector, latterVector);
133 |         Set<String> intersection = SetOperation.intersection(formerVector, latterVector);
134 | 
135 |         return (union.size() - intersection.size());
136 |     }
137 | 
138 | }
139 | 
-------------------------------------------------------------------------------- /features/NominalFeature.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features;
2 | 
3 | /**
4 |  * 
Nominal Feature 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public abstract class NominalFeature extends Feature { 10 | 11 | @Override 12 | public boolean isNominal() { 13 | return true; 14 | } 15 | 16 | // the Nominal Features 17 | // For example, there is a weather nominal feature 18 | // the values of this feature can be hot, cold, or anything like that 19 | public abstract String[] getValues(); 20 | } 21 | -------------------------------------------------------------------------------- /features/NumericFeature.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features; 2 | 3 | /** 4 | * Numeric Feature 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public abstract class NumericFeature extends Feature { 10 | 11 | @Override 12 | public boolean isNumeric() { 13 | return true; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /features/individualfeature/Animacy.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | public class Animacy extends NumericFeature { 8 | 9 | public Animacy() { 10 | featureName = this.getClass().getSimpleName(); 11 | } 12 | 13 | @Override 14 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 15 | double animacySimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 16 | 17 | return animacySimilarity; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /features/individualfeature/Gender.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Cosine Similarity of gender 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Gender extends NumericFeature { 14 | 15 | public Gender() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double genderSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return genderSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/Head.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Entity Head feature, Cosine Similarity of head-word vectors of two clusters 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Head extends NumericFeature { 14 | 15 | public Head() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, 
CorefCluster latter, String mentionType) { 21 | double headSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return headSimilarity; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /features/individualfeature/Lemma.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Event Lemmas : Cosine Similarity of the lemma vectors of two clusters 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Lemma extends NumericFeature { 14 | 15 | public Lemma() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double lemmaSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return lemmaSimilarity; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /features/individualfeature/MentionWord.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * 2nd Order Similarity of Mention Words : cosine similarity of vectors containing words 9 | * that are distributionally similar to words in the cluster mentions 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class MentionWord extends NumericFeature { 15 | 16 | public MentionWord() { 17 | featureName = this.getClass().getSimpleName(); 18 | } 19 | 20 | @Override 21 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 22 | double mentionWordSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 23 | 24 | return mentionWordSimilarity; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NEType.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * cosine similarity of NE label vectors 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NEType extends NumericFeature { 14 | 15 | public NEType() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double NETypeSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return NETypeSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | 
import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA0 extends NumericFeature { 14 | 15 | public NSrlA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA0 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA0 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA1 extends NumericFeature { 14 | 15 | public NSrlA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA1 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA2 extends NumericFeature { 14 | 15 | public NSrlA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA2 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA2 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role AMLOC 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlAMLoc extends NumericFeature { 14 | 15 | public NSrlAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlAMLoc = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlAMLoc > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlAgreeCount.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Number of non-Coreferent Arguments or Predicates : 9 | * The total number of uncommon arguments and predicates between mentions in the two clusters 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class NSrlAgreeCount extends NumericFeature { 15 | 16 | public NSrlAgreeCount() { 17 | featureName = this.getClass().getSimpleName(); 18 | } 19 | 20 | @Override 21 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 22 | double totalNonAgreement = 0.0; 23 | String[] verbElements = {"NSrlA0", "NSrlA1", "NSrlA2", "NSrlAMLoc"}; 24 | String[] nounElements = {"NSrlPA0", "NSrlPA1", "NSrlPA2", "NSrlPAMLoc"}; 25 | 26 | if (mentionType.equals("")) { 27 | for (String feature : verbElements) { 28 | double number = calculateNonAgreement(former, latter, feature, mentionType); 29 | totalNonAgreement += (number > 0.0) ? 1.0 : 0.0; 30 | } 31 | } else { 32 | for (String feature : nounElements) { 33 | double number = calculateNonAgreement(former, latter, feature, mentionType); 34 | totalNonAgreement += (number > 0.0) ? 
1.0 : 0.0; 35 | } 36 | } 37 | 38 | return totalNonAgreement; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA0 extends NumericFeature { 14 | 15 | public NSrlPA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA0 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA0 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA1 extends NumericFeature { 14 | 15 | public NSrlPA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA1 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA2 extends NumericFeature { 14 | 15 | public NSrlPA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA2 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA2 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role AM-LOC 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPAMLoc extends NumericFeature { 14 | 15 | public NSrlPAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPAMLoc = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPAMLoc > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/Number.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Cosine Similarity of number 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Number extends NumericFeature { 14 | 15 | public Number() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double numberSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return numberSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA0 extends NumericFeature { 14 | 15 | public SrlA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA0 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA0 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA1 extends NumericFeature { 14 | 15 | public SrlA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA1 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA2 extends NumericFeature { 14 | 15 | public SrlA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA2 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA2 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in AMLoc 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlAMLoc extends NumericFeature { 14 | 15 | public SrlAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlAMLoc = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlAMLoc > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlAgreeCount.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import edu.oregonstate.features.NumericFeature; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | 10 | /** 11 | * Number of Coreferent Arguments or Predicates : 12 | * The total number of shared arguments and predicates between mentions in the two clusters 13 | * 14 | * @author Jun Xie (xie@eecs.oregonstate.edu) 15 | * 16 | */ 17 | public class SrlAgreeCount extends NumericFeature { 18 | 19 | public SrlAgreeCount() { 20 | featureName = this.getClass().getSimpleName(); 21 | } 22 | 23 | @Override 24 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 25 | double totalAgreement = 0.0; 26 | String[] verbElements = {"SrlA0", "SrlA1", "SrlA2", "SrlAMLoc", "SrlLeft", "SrlRight"}; 27 | String[] nounElements = {"SrlPA0", "SrlPA1", "SrlPA2", "SrlPAMLoc"}; 28 | 29 | List verbRoles = Arrays.asList(verbElements); 30 | List nounRoles = Arrays.asList(nounElements); 31 | 32 | if (mentionType.equals("")) { 33 | for (String feature : verbRoles) { 34 | double number = calculateAgreement(former, latter, feature, mentionType); 35 | totalAgreement += (number > 0.0) ? 1.0 : 0.0; 36 | } 37 | } else { 38 | for (String feature : nounRoles) { 39 | double number = calculateAgreement(former, latter, feature, mentionType); 40 | totalAgreement += (number > 0.0) ? 1.0 : 0.0; 41 | } 42 | } 43 | 44 | return totalAgreement; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /features/individualfeature/SrlLeft.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * closest Left mention Feature 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlLeft extends NumericFeature { 14 | 15 | public SrlLeft() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlLeft = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlLeft > 0.0) ? 
1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA0.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A0
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA0 extends NumericFeature {
14 | 
15 | public SrlPA0() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA0 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA0 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA1.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A1
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA1 extends NumericFeature {
14 | 
15 | public SrlPA1() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA1 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA1 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA2.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A2
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA2 extends NumericFeature {
14 | 
15 | public SrlPA2() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA2 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA2 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPAMLoc.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in AM-LOC
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPAMLoc extends NumericFeature {
14 | 
15 | public SrlPAMLoc() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPAMLoc = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPAMLoc > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlRight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * closest Right mention Feature
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlRight extends NumericFeature {
14 | 
15 | public SrlRight() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlRight = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlRight > 0.0) ? 
1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/Synonym.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | import edu.stanford.nlp.dcoref.Mention;
7 | import edu.stanford.nlp.util.IntPair;
8 | 
9 | /**
10 | * The percentage of newly-introduced mention links after the merge that are WordNet synonyms
11 | * 
12 | * @author Jun Xie (xie@eecs.oregonstate.edu)
13 | * 
14 | */
15 | public class Synonym extends NumericFeature {
16 | 
17 | // whether to do pronoun resolution
18 | private final boolean DOPRONOUN;
19 | 
20 | public Synonym() {
21 | featureName = this.getClass().getSimpleName();
22 | DOPRONOUN = false;
23 | }
24 | 
25 | @Override
26 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
27 | double synonymNom = 0.0;
28 | double synonymDenom = 0.0;
29 | 
30 | for(Mention m1 : former.getCorefMentions()) {
31 | for(Mention m2 : latter.getCorefMentions()) {
32 | if(!DOPRONOUN && (m1.isPronominal() || m2.isPronominal())) continue;
33 | IntPair menPair = new IntPair(Math.min(m1.mentionID, m2.mentionID), Math.max(m1.mentionID, m2.mentionID));
34 | 
35 | synonymDenom++;
36 | if(document.mentionSynonymInWN.contains(menPair)) {
37 | synonymNom++;
38 | }
39 | }
40 | }
41 | 
42 | // if both clusters contain only pronouns, synonymDenom stays 0 and the ratio would be NaN
43 | double synonym = 0.0;
44 | if (synonymDenom > 0) {
45 | synonym = synonymNom/synonymDenom;
46 | }
47 | 
48 | return synonym;
49 | }
50 | 
51 | }
52 | 
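The individual features above all share one entry point, generateFeatureValue(Document, CorefCluster, CorefCluster, String), so a feature vector for a candidate cluster merge is simply the value of each feature on the same cluster pair. A minimal sketch of that pattern (illustrative only, not a file in this repository; it assumes generateFeatureValue is visible through the NumericFeature supertype, as the @Override annotations above suggest):

import edu.oregonstate.features.NumericFeature;
import edu.oregonstate.features.individualfeature.SrlA0;
import edu.oregonstate.features.individualfeature.SrlAgreeCount;
import edu.oregonstate.features.individualfeature.Synonym;
import edu.stanford.nlp.dcoref.CorefCluster;
import edu.stanford.nlp.dcoref.Document;

public class FeatureVectorSketch {
    // Evaluate a fixed list of features on one (former, latter) cluster pair.
    public static double[] featurize(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
        NumericFeature[] features = { new SrlA0(), new SrlAgreeCount(), new Synonym() };
        double[] vector = new double[features.length];
        for (int i = 0; i < features.length; i++) {
            vector[i] = features[i].generateFeatureValue(document, former, latter, mentionType);
        }
        return vector;
    }
}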
-------------------------------------------------------------------------------- /general/MapFactory.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.util.*;
4 | import java.io.Serializable;
5 | 
6 | /**
7 | * The MapFactory is a mechanism for specifying what kind of map is to be used
8 | * by some object. For example, if you want a Counter which is backed by an
9 | * IdentityHashMap instead of the default HashMap, you can pass in an
10 | * IdentityHashMapFactory.
11 | * 
12 | * @author Dan Klein
13 | */
14 | 
15 | public abstract class MapFactory<K, V> implements Serializable {
16 | private static final long serialVersionUID = 1L;
17 | public static class HashMapFactory<K, V> extends MapFactory<K, V> {
18 | public Map<K, V> buildMap() {
19 | return new HashMap<K, V>();
20 | }
21 | }
22 | 
23 | public static class IdentityHashMapFactory<K, V> extends MapFactory<K, V> {
24 | public Map<K, V> buildMap() {
25 | return new IdentityHashMap<K, V>();
26 | }
27 | }
28 | 
29 | public static class TreeMapFactory<K, V> extends MapFactory<K, V> {
30 | public Map<K, V> buildMap() {
31 | return new TreeMap<K, V>();
32 | }
33 | }
34 | 
35 | public static class WeakHashMapFactory<K, V> extends MapFactory<K, V> {
36 | public Map<K, V> buildMap() {
37 | return new WeakHashMap<K, V>();
38 | }
39 | }
40 | 
41 | public abstract Map<K, V> buildMap();
42 | }
43 | 
44 | 
-------------------------------------------------------------------------------- /general/MatrixOperation.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | 
8 | import Jama.Matrix;
9 | 
10 | /**
11 | * Jun Xie(xiejuncs@gmail.com)
12 | */
13 | public class MatrixOperation {
14 | 
15 | /**
16 | * Read a matrix from a comma-separated file
17 | * 
18 | * @param fileName
19 | * @return
20 | */
21 | public static Matrix readMatrix(String fileName) {
22 | try {
23 | BufferedReader reader = new BufferedReader(new FileReader(fileName));
24 | List<double[]> data_array = new ArrayList<double[]>();
25 | 
26 | String line;
27 | while ((line = reader.readLine()) != null) {
28 | if (line.equals("")) {
29 | continue;
30 | }
31 | String fields[] = line.split(",");
32 | double data[] = new double[fields.length];
33 | for (int i = 0; i < fields.length; i++) {
34 | data[i] = Double.parseDouble(fields[i]);
35 | }
36 | data_array.add(data);
37 | }
38 | 
39 | reader.close();
40 | if (data_array.size() > 0) {
41 | int cols = data_array.get(0).length;
42 | int rows = data_array.size();
43 | Matrix matrix = new Matrix(rows, cols);
44 | for (int r = 0; r < rows; ++r) {
45 | for (int c = 0; c < cols; ++c) {
46 | matrix.set(r, c, data_array.get(r)[c]);
47 | }
48 | }
49 | return matrix;
50 | }
51 | } catch (Exception e) {
52 | e.printStackTrace();
53 | System.exit(1);
54 | }
55 | 
56 | return new Matrix(0, 0);
57 | }
58 | 
59 | /**
60 | * The last column corresponds to the target.
61 | * Hence, remove the target values from the last column of a data set.
62 | * 
63 | * Meanwhile, we add a 1 in column 0 of each row as a bias term,
64 | * e.g. a row [x1, x2] becomes [1, x1, x2].
65 | * 
66 | * @param data_set
67 | * @return
68 | */
68 | public static Matrix getDataPoints(Matrix data_set) {
69 | Matrix features = data_set.getMatrix(0, data_set.getRowDimension() - 1, 0, data_set.getColumnDimension() - 2);
70 | int rows = features.getRowDimension();
71 | int cols = features.getColumnDimension() + 1;
72 | Matrix modifiedFeatures = new Matrix(rows, cols);
73 | for (int r = 0; r < rows; ++r) {
74 | for (int c = 0; c < cols; ++c) {
75 | if (c == 0) {
76 | modifiedFeatures.set(r, c, 1.0);
77 | } else {
78 | modifiedFeatures.set(r, c, features.get(r, c-1));
79 | }
80 | }
81 | }
82 | return modifiedFeatures;
83 | }
84 | 
85 | /**
86 | * Returns the target values from the last column of a data set.
87 | * 
88 | * @param data_set
89 | * @return
90 | */
91 | public static Matrix getTargets(Matrix data_set) {
92 | return data_set.getMatrix(0, data_set.getRowDimension() - 1, data_set.getColumnDimension() - 1, data_set.getColumnDimension() - 1);
93 | }
94 | 
95 | /**
96 | * divide the averageModel by mEpoch * the number of topics
97 | * 
98 | * @param averageModel
99 | * @param mEpoch
100 | * @return
101 | */
102 | public static Matrix divide(Matrix averageModel, int mEpoch) {
103 | for (int i = 0; i < averageModel.getRowDimension(); i++) {
104 | double updateValue = averageModel.get(i, 0) / mEpoch;
105 | averageModel.set(i, 0, updateValue);
106 | }
107 | 
108 | return averageModel;
109 | }
110 | 
111 | /**
112 | * add model to the averageModel
113 | * NOTE
114 | * model and averageModel are both column vectors
115 | * 
116 | * @param model
117 | * @param averageModel
118 | * @return
119 | */
120 | public static Matrix addWeight(Matrix model, Matrix averageModel) {
121 | for (int i = 0; i < averageModel.getRowDimension(); i++) {
122 | double updateValue = averageModel.get(i, 0) + model.get(i, 0);
123 | averageModel.set(i, 0, updateValue);
124 | }
125 | 
126 | return averageModel;
127 | }
128 | 
129 | /**
130 | * get average matrix
131 | * 
132 | * @param averageWeight
133 | * @param wholeSearchStep
134 | * @return
135 | */
136 | public static Matrix getAverageMatrix (Matrix averageWeight, int wholeSearchStep) {
137 | Matrix matrix = new Matrix(averageWeight.getRowDimension(), 1);
138 | for (int i = 0; i < averageWeight.getRowDimension(); i++) {
139 | matrix.set(i, 0, averageWeight.get(i, 0) / wholeSearchStep);
140 | }
141 | return matrix;
142 | }
143 | 
144 | /**
145 | * matrix normalization
146 | * 
147 | * @param weight
148 | */
149 | public static Matrix normalization(Matrix weight) {
150 | double sum = weight.norm2();
151 | 
152 | if (sum == 0.0) return weight;
153 | 
154 | for (int i = 0; i < weight.getRowDimension(); i++){
155 | double value = weight.get(i, 0);
156 | weight.set(i, 0, value / sum);
157 | }
158 | 
159 | return weight;
160 | }
161 | 
162 | }
163 | 
-------------------------------------------------------------------------------- /general/SetOperation.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.util.HashSet;
4 | import java.util.Set;
5 | 
6 | import edu.stanford.nlp.stats.Counter;
7 | 
8 | /**
9 | * Set Operation
10 | * 
11 | * @author Jun Xie (xie@eecs.oregonstate.edu)
12 | * 
13 | */
14 | public class SetOperation {
15 | 
16 | /**
17 | * intersection of the keyset of two counter objects
18 | * 
19 | * @param formerVector
20 | * @param latterVector
21 | * @return
22 | */
23 | public static Set 
intersection(Counter formerVector, Counter latterVector) { 24 | Set commonElementSet = new HashSet(); 25 | 26 | // get the lower case of the set 27 | // Set formerSet = StringOperation.lowercase(formerVector.keySet()); 28 | // Set latterSet = StringOperation.lowercase(latterVector.keySet()); 29 | 30 | Set formerSet = formerVector.keySet(); 31 | Set latterSet = latterVector.keySet(); 32 | 33 | commonElementSet.addAll(formerSet); 34 | commonElementSet.retainAll(latterSet); 35 | 36 | return commonElementSet; 37 | } 38 | 39 | /** 40 | * union of the keyset of two counter objects 41 | * 42 | * @param formerVector 43 | * @param latterVector 44 | * @return 45 | */ 46 | public static Set union(Counter formerVector, Counter latterVector) { 47 | Set union = new HashSet(); 48 | union.addAll(formerVector.keySet()); 49 | union.addAll(latterVector.keySet()); 50 | return union; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /general/StringOperation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.general; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * String Operation 7 | * 8 | * @author Jun Xie (xie@eecs.oregonstate.edu) 9 | * 10 | */ 11 | public class StringOperation { 12 | 13 | private StringOperation() { 14 | } 15 | 16 | /** 17 | * split the string according to the splitter and trim the spaces 18 | * 19 | * @param string 20 | * @param splitter 21 | * @return 22 | */ 23 | public static String[] splitString(String string, String splitter) { 24 | String[] elements = string.split(splitter); 25 | // trim the space before and after 26 | int length = elements.length; 27 | String[] trimdElements = new String[length]; 28 | for (int index = 0; index < length; index++) { 29 | String value = elements[index]; 30 | trimdElements[index] = value.trim(); 31 | } 32 | 33 | return trimdElements; 34 | } 35 | 36 | /** 37 | * convert the set of string to a set of lower case string 38 | * 39 | * @param set 40 | * @return 41 | */ 42 | public static Set lowercase(Set set) { 43 | Set result = new HashSet(); 44 | 45 | for (String element : set) { 46 | result.add(element.toLowerCase()); 47 | } 48 | 49 | return result; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /io/EgenericDataSetReader.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.io; 2 | 3 | import java.util.List; 4 | import java.util.logging.Level; 5 | import java.util.logging.Logger; 6 | 7 | import edu.stanford.nlp.ling.Label; 8 | import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder; 9 | import edu.stanford.nlp.ling.CoreLabel; 10 | import edu.stanford.nlp.pipeline.Annotation; 11 | import edu.stanford.nlp.pipeline.Annotator; 12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 13 | import edu.stanford.nlp.trees.HeadFinder; 14 | import edu.stanford.nlp.trees.Tree; 15 | 16 | /** 17 | * 18 | * @author xie 19 | * 20 | */ 21 | public class EgenericDataSetReader { 22 | protected Logger logger; 23 | 24 | /** Finds the syntactic head of a syntactic constituent*/ 25 | protected final HeadFinder headFinder = new NoPunctuationHeadFinder(); 26 | 27 | /** Stanford CoreNLP processor to use for pre-processing*/ 28 | protected StanfordCoreNLP processor; 29 | 30 | /** 31 | * Additional processor that implements only syntactic parsing (needed for head detection) 32 | * We need this processor to detect heads of predicted entities 
that can not be matched to an existing constituent 33 | * This is created on demand, not necessary 34 | */ 35 | protected Annotator parseProcessor; 36 | 37 | /** If true, we perform syntactic analysis of the dataset sentences and annotations*/ 38 | protected final boolean preProcessSentences; 39 | 40 | /** 41 | * If true, sets the head span to match the syntactic head of the extent. 42 | * Otherwise, the head span is not modified. 43 | * This is enabled for the NFL domain, where head spans are not given. 44 | */ 45 | protected final boolean calculateHeadSpan; 46 | 47 | /** If true, it regenerates the index spans for all tree nodes (useful for KBP) */ 48 | protected final boolean forceGenerationofIndexSpans; 49 | 50 | /** Only around for legacy results */ 51 | protected boolean useNewHeadFinder = true; 52 | 53 | public EgenericDataSetReader() { 54 | this(null, false, false, false); 55 | } 56 | 57 | public EgenericDataSetReader(StanfordCoreNLP processor, boolean preProcessSentences, boolean calculateHeadSpan, boolean forceGenerationIndexSpans) { 58 | this.logger = Logger.getLogger(EgenericDataSetReader.class.getName()); 59 | this.logger.setLevel(Level.SEVERE); 60 | 61 | if (processor != null) setProcessor(processor); 62 | parseProcessor = null; 63 | this.preProcessSentences = preProcessSentences; 64 | this.calculateHeadSpan = calculateHeadSpan; 65 | this.forceGenerationofIndexSpans = forceGenerationIndexSpans; 66 | } 67 | 68 | public void setProcessor(StanfordCoreNLP processor) { 69 | this.processor = processor; 70 | } 71 | 72 | public void setUseNewHeadFinder(boolean useNewHeadFinder) { 73 | this.useNewHeadFinder = useNewHeadFinder; 74 | } 75 | 76 | public Annotator getParse() { 77 | if (parseProcessor == null) { 78 | parseProcessor = StanfordCoreNLP.getExistingAnnotator("parse"); 79 | assert(parseProcessor != null); 80 | } 81 | return parseProcessor; 82 | } 83 | 84 | public void setLoggerLevel(Level level) { 85 | logger.setLevel(level); 86 | } 87 | 88 | public Level getLoggerLevel() { 89 | return logger.getLevel(); 90 | } 91 | 92 | /** 93 | * Converts the tree labels to CoreLabels. 94 | * We need this because we store additional info in the CoreLabel, like token span. 95 | * @param tree 96 | */ 97 | public static void convertToCoreLabels(Tree tree) { 98 | Label l = tree.label(); 99 | if (! 
(l instanceof CoreLabel)) { 100 | CoreLabel cl = new CoreLabel(); 101 | cl.setValue(l.value()); 102 | tree.setLabel(cl); 103 | } 104 | 105 | for (Tree kid : tree.children()) 106 | convertToCoreLabels(kid); 107 | } 108 | 109 | /** 110 | * For EECB topic 111 | * 112 | * @param files 113 | * @param topic 114 | * @return 115 | * @throws Exception 116 | */ 117 | public Annotation read(List files, String topic) throws Exception { 118 | return null; 119 | } 120 | 121 | /** 122 | * For EECB document 123 | * 124 | * @param documentIdentifier 125 | * @return 126 | * @throws Exception 127 | */ 128 | public Annotation read(String documentIdentifier) throws Exception { 129 | return null; 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /io/LargeFileWriting.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.io; 2 | 3 | import java.io.DataOutputStream; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Properties; 11 | 12 | import edu.oregonstate.experiment.ExperimentConstructor; 13 | import edu.oregonstate.general.DoubleOperation; 14 | import edu.oregonstate.util.EecbConstants; 15 | 16 | /** 17 | * write large data set to a output file 18 | * 19 | * @author Jun Xie (xie@eecs.oregonstate.edu) 20 | * 21 | */ 22 | public class LargeFileWriting { 23 | 24 | /* File Path */ 25 | private final String mPath; 26 | 27 | /* experiment settings */ 28 | private final Properties mProps; 29 | 30 | public LargeFileWriting(String path) { 31 | mPath = path; 32 | mProps = ExperimentConstructor.experimentProps; 33 | } 34 | 35 | /** 36 | * write arrays to file 37 | * 38 | * @param records 39 | */ 40 | public void writeArrays(List records) { 41 | boolean binary = Boolean.parseBoolean(mProps.getProperty(EecbConstants.IO_BINARY_PROP, "false")); 42 | 43 | try { 44 | // write file into binary form or not 45 | if (binary) { 46 | writeRawinByte(records); 47 | } else { 48 | writeRawinText(records); 49 | } 50 | } catch (Exception e) { 51 | throw new RuntimeException(e); 52 | } 53 | } 54 | 55 | /** 56 | * in its raw form 57 | * 58 | * @param records 59 | * @throws IOException 60 | */ 61 | private void writeRawinText(List records) throws IOException { 62 | File file = new File(mPath); 63 | try { 64 | FileWriter writer = new FileWriter(file, true); 65 | for (String record: records) { 66 | writer.write(record); 67 | writer.write("\n"); 68 | } 69 | writer.flush(); 70 | writer.close(); 71 | } finally { 72 | 73 | } 74 | } 75 | 76 | /** 77 | * into byte form 78 | * 79 | * @param records 80 | */ 81 | private void writeRawinByte(List records) { 82 | try { 83 | System.out.print("Writing byte...\n"); 84 | DataOutputStream dos = new DataOutputStream( new FileOutputStream(mPath)); 85 | for (String record: records) { 86 | double[] features = DoubleOperation.transformString(record, ","); 87 | 88 | for (int i = 0; i < features.length; i++) { 89 | dos.writeDouble(features[i]); 90 | 91 | if ( i == features.length - 1) { 92 | dos.writeChar('\n'); 93 | } else { 94 | dos.writeChar('\t'); 95 | } 96 | } 97 | } 98 | 99 | dos.close(); 100 | } catch (Exception e) { 101 | throw new RuntimeException(e); 102 | } 103 | 104 | } 105 | 106 | /** 107 | * Example to run this class 108 | * 109 | * @param args 110 | */ 111 | public static void main(String[] args) { 112 | int RECORD_COUNT = 
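/* Illustrative aside, not part of the original file: records written by
 * writeRawinByte above can be read back with the mirror-image DataInputStream
 * calls -- readDouble() for each value, readChar() for the '\t' separators and
 * the trailing '\n'. A minimal reader sketch, assuming the writer's format:
 *
 *   DataInputStream dis = new DataInputStream(new FileInputStream("example.txt"));
 *   while (dis.available() > 0) {
 *       double value = dis.readDouble();
 *       char separator = dis.readChar(); // '\t' between values, '\n' after the last one
 *   }
 *   dis.close();
 */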
4000000;
113 | String RECORD = "Help I am trapped in a fortune cookie factory";
114 | List records = new ArrayList();
115 | for (int i = 0; i < RECORD_COUNT; i++) {
116 | records.add(RECORD);
117 | }
118 | String path = "example.txt";
119 | 
120 | LargeFileWriting writer = new LargeFileWriting(path);
121 | writer.writeArrays(records);
122 | }
123 | 
124 | }
125 | 
-------------------------------------------------------------------------------- /io/LibSVM.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.io;
2 | 
3 | /**
4 | * an interface to LibSVM to load the model
5 | * 
6 | * @author Jun Xie (xie@eecs.oregonstate.edu)
7 | * 
8 | */
9 | public class LibSVM {
10 | 
11 | 
12 | }
13 | 
-------------------------------------------------------------------------------- /lossfunction/ILossFunction.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.lossfunction;
2 | 
3 | import edu.oregonstate.search.State;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * the interface of Loss Functions
9 | * 
10 | * There are a lot of loss functions, for example, hinge loss, 0-1 loss, hamming loss.
11 | * Through this interface, given different objects, the loss function can be calculated.
12 | * 
13 | * @author Jun Xie (xie@eecs.oregonstate.edu)
14 | * 
15 | */
16 | public interface ILossFunction {
17 | 
18 | /* calculate loss function */
19 | public double[] calculateLossFunction(Document document, State state);
20 | 
21 | /* scoring the document */
22 | public double[] getMetricScore(Document document);
23 | }
24 | 
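Of the losses named above, hinge loss is the margin-based one: for a single decision with gold label y in {-1, +1} and model score f(x), it is max(0, 1 - y * f(x)). A tiny self-contained sketch (illustrative only, not a repository file):

public class HingeLossSketch {
    // hinge loss max(0, 1 - y * f(x)) for one scored binary decision
    public static double hingeLoss(double y, double score) {
        return Math.max(0.0, 1.0 - y * score);
    }
    public static void main(String[] args) {
        System.out.println(hingeLoss(+1.0, 0.3)); // 0.7 : correct side, but inside the margin
        System.out.println(hingeLoss(-1.0, 0.3)); // 1.3 : wrong side of the decision boundary
    }
}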
-------------------------------------------------------------------------------- /lossfunction/MetricLossFunction.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.lossfunction;
2 | 
3 | import edu.stanford.nlp.dcoref.CorefCluster;
4 | import edu.stanford.nlp.dcoref.CorefScorer;
5 | import edu.stanford.nlp.dcoref.CorefScorer.ScoreType;
6 | import edu.stanford.nlp.dcoref.Document;
7 | import edu.oregonstate.experiment.ExperimentConstructor;
8 | import edu.oregonstate.general.DoubleOperation;
9 | import edu.oregonstate.search.State;
10 | import edu.oregonstate.util.Command;
11 | import edu.oregonstate.util.EecbConstants;
12 | import edu.oregonstate.util.EecbConstructor;
13 | 
14 | /**
15 | * Loss Function used to calculate the loss score
16 | * 
17 | * @author Jun Xie (xie@eecs.oregonstate.edu)
18 | * 
19 | */
20 | public class MetricLossFunction implements ILossFunction {
21 | 
22 | /* score type, e.g. Pairwise */
23 | private ScoreType mtype;
24 | 
25 | /* numerator and denominator of precision and recall */
26 | private double precisionNumSum;
27 | private double precisionDenSum;
28 | private double recallNumSum;
29 | private double recallDenSum;
30 | 
31 | public MetricLossFunction() {
32 | mtype = CorefScorer.ScoreType.valueOf(ExperimentConstructor.experimentProps.getProperty(EecbConstants.LOSSFUNCTION_SCORE_PROP, "Pairwise"));
33 | }
34 | 
35 | /* calculate the loss function for different states of the same document */
36 | public double[] calculateLossFunction(Document document, State state) {
37 | Command.generateStateDocument(document, state);
38 | double[] scores = calculateF1(document, mtype);
39 | return scores;
40 | }
41 | 
42 | /* calculate F1, Precision and Recall according to the Score Type */
43 | private double[] calculateF1(Document document, ScoreType type) {
44 | double F1 = 0.0;
45 | CorefScorer score = EecbConstructor.createCorefScorer(type);
46 | 
47 | score.calculateScore(document);
48 | F1 = score.getF1();
49 | double precision = score.getPrecision();
50 | double recall = score.getRecall();
51 | 
52 | precisionNumSum = score.precisionNumSum;
53 | precisionDenSum = score.precisionDenSum;
54 | recallNumSum = score.recallNumSum;
55 | recallDenSum = score.recallDenSum;
56 | 
57 | double[] result = {DoubleOperation.transformNaN(F1), DoubleOperation.transformNaN(precision), DoubleOperation.transformNaN(recall)};
58 | return result;
59 | }
60 | 
61 | /* the detail information of a score */
62 | public String getDetailScoreInformation() {
63 | return precisionNumSum + " " + precisionDenSum + " " + recallNumSum + " " + recallDenSum;
64 | }
65 | 
66 | /* score the document for the first time */
67 | public double[] getMetricScore(Document document) {
68 | return calculateF1(document, mtype);
69 | }
70 | }
71 | 
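calculateF1 above reads the precision and recall numerator/denominator sums off the scorer; the derived values follow the usual definitions P = pNum/pDen, R = rNum/rDen, F1 = 2PR/(P+R). A worked sketch with made-up sums (illustrative arithmetic only, not repository code):

// Hypothetical sums, as a CorefScorer would accumulate them:
double precisionNumSum = 8.0, precisionDenSum = 10.0;        // P = 8/10 = 0.8
double recallNumSum = 6.0, recallDenSum = 10.0;              // R = 6/10 = 0.6
double precision = precisionNumSum / precisionDenSum;
double recall = recallNumSum / recallDenSum;
double f1 = 2 * precision * recall / (precision + recall);   // = 0.96 / 1.4 ~ 0.686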
-------------------------------------------------------------------------------- /method/Decoding.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.method;
2 | 
3 | /**
4 | * use the learned weight to do decoding
5 | * 
6 | * Take Coreference Resolution as an example: the decoding part is
7 | * to find a coreference resolution chain using a search algorithm
8 | * 
9 | * @author Jun Xie (xie@eecs.oregonstate.edu)
10 | * 
11 | */
12 | public abstract class Decoding {
13 | 
14 | // decoding phase, used to define the output file name and debug information
15 | // for example: training-1
16 | protected String decodingPhase;
17 | 
18 | public Decoding(String phase) {
19 | decodingPhase = phase;
20 | }
21 | 
22 | /**
23 | * decode according to the application
24 | * 
25 | * @param weight
26 | */
27 | public abstract void decode(double[] weight);
28 | }
29 | 
-------------------------------------------------------------------------------- /method/IMethod.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.method;
2 | 
3 | import java.util.List;
4 | 
5 | import edu.oregonstate.classifier.Parameter;
6 | 
7 | /**
8 | * experiment framework
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public interface IMethod {
14 | 
15 | /* each concrete method executes its own procedure */
16 | public List<Parameter> executeMethod();
17 | }
18 | 
-------------------------------------------------------------------------------- /pruning/Pruning.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.pruning;
2 | 
3 | /**
4 | * Generate the constraints which can prune the search space in the beam search
5 | * 
6 | * @author Jun Xie (xie@eecs.oregonstate.edu)
7 | * 
8 | */
9 | public class Pruning {
10 | 
11 | /**
12 | * Right now, we can just use one topic for the test, for example, the sixth topic.
13 | * The reason for choosing this topic is that it has fewer mentions.
14 | * @param args
15 | */
16 | public static void main(String[] args) {
17 | 
18 | }
19 | }
-------------------------------------------------------------------------------- /score/AssignmentAlgorithm.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | public interface AssignmentAlgorithm {
4 | 
5 | int[][] computeAssignments(double[][] costMatrix);
6 | }
7 | 
-------------------------------------------------------------------------------- /score/AssignmentProblem.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | public class AssignmentProblem {
4 | 
5 | private final double[][] costMatrix;
6 | 
7 | public AssignmentProblem(double[][] aCostMatrix) {
8 | costMatrix = aCostMatrix;
9 | }
10 | 
11 | private double[][] copyOfMatrix() {
12 | double[][] retval = new double[costMatrix.length][];
13 | for (int i = 0; i < costMatrix.length; i++) {
14 | retval[i] = new double[costMatrix[i].length];
15 | System.arraycopy(costMatrix[i], 0, retval[i], 0, costMatrix[i].length);
16 | }
17 | return retval;
18 | }
19 | 
20 | public int[][] solve(AssignmentAlgorithm algorithm) {
21 | double[][] costMatrix = copyOfMatrix();
22 | return algorithm.computeAssignments(costMatrix);
23 | }
24 | }
25 | 
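AssignmentProblem above copies the cost matrix and defers to whichever AssignmentAlgorithm is supplied; ScorerCEAF below pairs it with the repository's HungarianAlgorithm. A minimal usage sketch (illustrative only; that each returned pair is laid out as {row, column} is an assumption here, not something the interface guarantees):

// Illustrative sketch: minimum-cost assignment over a 2x2 cost matrix.
double[][] costs = { { 1.0, 2.0 },
                     { 4.0, 3.0 } };   // optimal: row 0 -> column 0, row 1 -> column 1
int[][] assignments = new AssignmentProblem(costs).solve(new HungarianAlgorithm());
for (int[] pair : assignments) {
    System.out.println("row " + pair[0] + " -> column " + pair[1]);   // assumed {row, column} layout
}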
-------------------------------------------------------------------------------- /score/ScorerCEAF.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | import java.util.Arrays;
4 | import java.util.Iterator;
5 | import java.util.Map;
6 | import java.util.Set;
7 | import java.util.List;
8 | import java.util.ArrayList;
9 | 
10 | import edu.stanford.nlp.dcoref.CorefCluster;
11 | import edu.stanford.nlp.dcoref.CorefScorer;
12 | import edu.stanford.nlp.dcoref.Document;
13 | import edu.stanford.nlp.dcoref.Mention;
14 | 
15 | /**
16 | * CEAF score implementation (See paper: Evaluation metrics for End-to-End Coreference Resolution Systems)
17 | * 
18 | * CEAF applies a similarity metric (which should be either mention based or entity based) for each pair of
19 | * entities (i.e. a set of mentions) to measure the goodness of each possible alignment. The best mapping is
20 | * used for calculating CEAF precision, recall and F-measure.
21 | * 
22 | * There are two types of similarity metric, called phi3 and phi4. We implemented the phi4 case.
23 | * 
24 | * @author Jun Xie (xie@eecs.oregonstate.edu)
25 | * 
26 | */
27 | public class ScorerCEAF extends CorefScorer {
28 | 
29 | // update all fields of CorefScorer to public.
30 | public ScorerCEAF() {
31 | super();
32 | scoreType = ScoreType.CEAF;
33 | }
34 | 
35 | /**
36 | * calculate precision according to the equation 5 in the paper
37 | */
38 | protected void calculatePrecision(Document doc){
39 | Map<Integer, CorefCluster> response = doc.corefClusters;
40 | Map<Integer, CorefCluster> reference = doc.goldCorefClusters;
41 | precisionNumSum = scoreHelper(reference, response);
42 | precisionDenSum = scoreHelper(response, response);
43 | }
44 | 
45 | // calculate the phi4 similarity between a response cluster and a reference cluster
46 | public double similarity(CorefCluster responseCluster, CorefCluster referenceCluster) {
47 | Set<Mention> responseMentions = responseCluster.corefMentions;
48 | Set<Mention> referenceMentions = referenceCluster.corefMentions;
49 | List<Integer> responseMentionIDs = new ArrayList<Integer>();
50 | List<Integer> referenceMentionIDs = new ArrayList<Integer>();
51 | 
52 | for (Mention mention : responseMentions) {
53 | responseMentionIDs.add(mention.mentionID);
54 | }
55 | for (Mention mention : referenceMentions ) {
56 | referenceMentionIDs.add(mention.mentionID);
57 | }
58 | int responseSize = responseMentionIDs.size();
59 | int referenceSize = referenceMentionIDs.size();
60 | responseMentionIDs.retainAll(referenceMentionIDs);
61 | int overlap = responseMentionIDs.size();
62 | return (2.0 * overlap) / (responseSize + referenceSize); // phi4 = 2|overlap| / (|response| + |reference|); double arithmetic avoids integer truncation
63 | }
64 | 
65 | // calculate the cost function
66 | public double scoreHelper(Map<Integer, CorefCluster> reference, Map<Integer, CorefCluster> response) {
67 | double cost = 0.0;
68 | if (reference.size() == 0 || response.size() == 0) return 0.0;
69 | int size = reference.size() >= response.size() ? reference.size() : response.size();
70 | double[][] scores = new double[size][size];
71 | double max = 1.0;
72 | for (double[] score : scores) {
73 | Arrays.fill(score, max);
74 | }
75 | Set<Integer> responseSet = response.keySet();
76 | Iterator<Integer> responseIt = responseSet.iterator();
77 | int i = 0;
78 | int j = 0;
79 | while (responseIt.hasNext()) {
80 | CorefCluster responseCluster = response.get(responseIt.next());
81 | j = 0;
82 | Set<Integer> referenceSet = reference.keySet();
83 | Iterator<Integer> referenceIt = referenceSet.iterator();
84 | while (referenceIt.hasNext()) {
85 | CorefCluster referenceCluster = reference.get(referenceIt.next());
86 | scores[j][i] = max - similarity(responseCluster, referenceCluster); // turn similarity into a cost for the assignment solver
87 | j++;
88 | }
89 | i++;
90 | }
91 | 
92 | AssignmentProblem ap = new AssignmentProblem(scores);
93 | int[][] solution = ap.solve(new HungarianAlgorithm());
94 | for (i = 0; i < solution.length; i++) {
95 | if (solution[i][0] >= 0) {
96 | cost += max - scores[solution[i][0]][i]; // recover the similarity from the stored cost
97 | }
98 | }
99 | 
100 | return cost;
101 | }
102 | 
103 | /**
104 | * calculate recall according to the equation 6 in the paper
105 | */
106 | protected void calculateRecall(Document doc){
107 | Map<Integer, CorefCluster> response = doc.corefClusters;
108 | Map<Integer, CorefCluster> reference = doc.goldCorefClusters;
109 | recallNumSum = scoreHelper(reference, response);
110 | recallDenSum = scoreHelper(reference, reference);
111 | }
112 | 
113 | }
114 | 
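For intuition, the phi4 similarity in ScorerCEAF reduces to 2|R ∩ S| / (|R| + |S|) over the mention-ID sets of a response cluster R and a reference cluster S. A worked sketch on toy IDs (illustrative, not repository code; assumes java.util imports):

// response = {1, 2, 3}, reference = {2, 3, 4}; overlap = {2, 3}
// phi4 = 2 * 2 / (3 + 3) = 2/3
List<Integer> response = new ArrayList<Integer>(Arrays.asList(1, 2, 3));
List<Integer> reference = Arrays.asList(2, 3, 4);
int responseSize = response.size(), referenceSize = reference.size();
response.retainAll(reference);
double phi4 = (2.0 * response.size()) / (responseSize + referenceSize); // = 0.666...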
-------------------------------------------------------------------------------- /score/ScorerHelper.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | import java.util.logging.Logger;
4 | 
5 | import edu.oregonstate.io.ResultOutput;
6 | import edu.stanford.nlp.dcoref.CorefScorer;
7 | import edu.stanford.nlp.dcoref.Document;
8 | import edu.stanford.nlp.dcoref.ScorerBCubed;
9 | import edu.stanford.nlp.dcoref.ScorerMUC;
10 | import edu.stanford.nlp.dcoref.ScorerPairwise;
11 | import edu.stanford.nlp.dcoref.SieveCoreferenceSystem;
12 | import edu.stanford.nlp.dcoref.ScorerBCubed.BCubedType;
13 | 
14 | /**
15 | * Everything related to the score functions. There are four score
16 | * metrics implemented: Pairwise, MUC, BCubed and CEAF.
17 | * 
18 | * @author Jun Xie (xie@eecs.oregonstate.edu)
19 | * 
20 | */
21 | 
22 | public class ScorerHelper {
23 | 
24 | // we evaluate the score on a specific document
25 | private Document mDocument;
26 | private Logger mLogger;
27 | private String mPath;
28 | private boolean mPostProcess;
29 | 
30 | public ScorerHelper(Document document, Logger logger, String path, boolean postProcess) {
31 | mDocument = document;
32 | mLogger = logger;
33 | mPath = path;
34 | mPostProcess = postProcess;
35 | }
36 | 
37 | /** print the score of the document, with or without post-processing */
38 | public void printScore() {
39 | if (!mPostProcess) {
40 | ResultOutput.writeTextFile(mPath, "do not postprocess the data");
41 | 
42 | CorefScorer score = new ScorerBCubed(BCubedType.Bconll);
43 | score.calculateScore(mDocument);
44 | score.printF1(mLogger, true);
45 | 
46 | CorefScorer ceafscore = new ScorerCEAF();
47 | ceafscore.calculateScore(mDocument);
48 | ceafscore.printF1(mLogger, true);
49 | 
50 | CorefScorer mucscore = new ScorerMUC();
51 | mucscore.calculateScore(mDocument);
52 | mucscore.printF1(mLogger, true);
53 | 
54 | CorefScorer pairscore = new ScorerPairwise();
55 | pairscore.calculateScore(mDocument);
56 | pairscore.printF1(mLogger, true);
57 | 
58 | // Average of MUC, B^{3} and CEAF-\phi_{4}.
59 | double conllF1 = (score.getF1() + ceafscore.getF1() + mucscore.getF1()) / 3;
60 | ResultOutput.writeTextFile(mPath, "conllF1: " + conllF1);
61 | } else {
62 | ResultOutput.writeTextFile(mPath, "do postprocess the data");
63 | SieveCoreferenceSystem.postProcessing(mDocument);
64 | 
65 | CorefScorer score = new ScorerBCubed(BCubedType.Bconll);
66 | score.calculateScore(mDocument);
67 | score.printF1(mLogger, true);
68 | 
69 | CorefScorer postmucscore = new ScorerMUC();
70 | postmucscore.calculateScore(mDocument);
71 | postmucscore.printF1(mLogger, true);
72 | 
73 | CorefScorer postpairscore = new ScorerPairwise();
74 | postpairscore.calculateScore(mDocument);
75 | postpairscore.printF1(mLogger, true);
76 | 
77 | CorefScorer ceafscore = new ScorerCEAF();
78 | ceafscore.calculateScore(mDocument);
79 | ceafscore.printF1(mLogger, true);
80 | 
81 | 
82 | // Average of MUC, B^{3} and CEAF-\phi_{4}. 
83 | double conllF1 = (score.getF1() + ceafscore.getF1() + postmucscore.getF1()) / 3; 84 | ResultOutput.writeTextFile(mPath, "conllF1: " + conllF1); 85 | } 86 | } 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /search/ISearch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.search; 2 | 3 | import edu.oregonstate.classifier.Parameter; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * search interface, all search method need to implement this interface 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public interface ISearch { 14 | 15 | /* learn weight according to the parameter, and then print training file into phase */ 16 | public Parameter trainingBySearch(Document document, Parameter para, String phase); 17 | 18 | /* apply the learned weight to the testing document, and return the best loss state, later, we can output a terminate state for final performance */ 19 | public State testingBySearch(Document document, double[] weight, String phase, boolean outputFeature, double stoppingrate); 20 | } -------------------------------------------------------------------------------- /server/ClusterConnection.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.util.List; 7 | 8 | import org.apache.log4j.Logger; 9 | 10 | import com.jcraft.jsch.ChannelExec; 11 | import com.jcraft.jsch.JSch; 12 | import com.jcraft.jsch.Session; 13 | 14 | /** 15 | * 16 | * @author Yonglei Zheng 17 | * 18 | */ 19 | public class ClusterConnection { 20 | 21 | // print log information 22 | private static final Logger log = Logger.getLogger(ClusterConnection.class); 23 | 24 | // host name 25 | private String host; 26 | 27 | private String user; 28 | 29 | private String password; 30 | 31 | private String stdout; 32 | 33 | private String stderr; 34 | 35 | private Session session; 36 | 37 | private int exitStatus; 38 | 39 | // Millisecond: 0.001 40 | private static final long COMMAND_TIME_INTERVAL = 1000; 41 | 42 | public ClusterConnection() { 43 | this("submit-em64t-01.hpc.engr.oregonstate.edu", "xie", "88jx$85"); 44 | } 45 | 46 | public ClusterConnection(String host, String user, String password) { 47 | this.host = host; 48 | this.user = user; 49 | this.password = password; 50 | } 51 | 52 | public void connect() throws Exception { 53 | disconnect(); 54 | 55 | JSch jsch = new JSch(); 56 | session = jsch.getSession(user, host, 22); 57 | String homeDir = System.getProperty("user.home"); 58 | String knownHostPath = homeDir + File.separator + ".ssh" 59 | + File.separator + "known_hosts"; 60 | jsch.setKnownHosts(knownHostPath); 61 | // If two machines have SSH passwordless logins setup, the following 62 | // line is not needed: 63 | session.setPassword(password); 64 | session.connect(); 65 | } 66 | 67 | public void disconnect() { 68 | if (session != null) { 69 | session.disconnect(); 70 | } 71 | } 72 | 73 | @Override 74 | protected void finalize() throws Throwable { 75 | disconnect(); 76 | } 77 | 78 | public void execCommand(String cmd) throws Exception { 79 | ChannelExec channel = (ChannelExec) session.openChannel("exec"); 80 | channel.setCommand(cmd); 81 | channel.setInputStream(null); 82 | channel.setErrStream(null); 83 | InputStream in = 
channel.getInputStream();
84 | InputStream err = channel.getErrStream();
85 | stdout = "";
86 | stderr = "";
87 | channel.connect();
88 | while (true) {
89 | stdout += getRespond(in);
90 | stderr += getRespond(err);
91 | if (channel.isClosed()) {
92 | exitStatus = channel.getExitStatus();
93 | break;
94 | }
95 | }
96 | channel.disconnect();
97 | System.out.println("==========================================");
98 | System.out.println("Command '" + cmd + "' executed");
99 | System.out.println("stdout:\n" + (stdout.isEmpty() ? "[EMPTY]" : stdout));
100 | System.out.println("stderr:\n" + (stderr.isEmpty() ? "[EMPTY]" : stderr));
101 | System.out.println("exit-status: " + exitStatus);
102 | Thread.sleep(COMMAND_TIME_INTERVAL);
103 | }
104 | 
105 | public String getRespond(InputStream is) throws IOException {
106 | StringBuffer buffer = new StringBuffer();
107 | byte[] tmp = new byte[1024];
108 | while (is.available() > 0) {
109 | int i = is.read(tmp, 0, 1024);
110 | if (i < 0)
111 | break;
112 | buffer.append(new String(tmp, 0, i));
113 | }
114 | return buffer.toString().trim();
115 | }
116 | 
117 | public String getStdout() {
118 | return stdout;
119 | }
120 | 
121 | public String getStderr() {
122 | return stderr;
123 | }
124 | 
125 | public List queryJobIds() throws Exception {
126 | execCommand("qstat -u xie");
127 | return JobState.parseJobIds(stdout);
128 | }
129 | 
130 | public int submitJob(String scriptPath) throws Exception {
131 | execCommand("qsub " + scriptPath);
132 | String stdout = getStdout().trim();
133 | if (!stdout.startsWith("Your job")
134 | || !stdout.endsWith("has been submitted")) {
135 | 
136 | throw new Exception("Job cannot be submitted! script:" + scriptPath
137 | + "\nstdout:" + stdout + "\nstderr:" + stderr);
138 | }
139 | 
140 | stdout = stdout.replaceAll("Your job", "").trim();
141 | int jobId = Integer.valueOf(stdout.split("\\s+")[0]);
142 | log.info("jobId is " + jobId);
143 | return jobId;
144 | }
145 | 
146 | public void deleteJob(int jobId) throws Exception {
147 | execCommand("qdel " + jobId);
148 | }
149 | 
150 | }
151 | 
-------------------------------------------------------------------------------- /server/ExperimentArguments.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.server;
2 | 
3 | import java.lang.reflect.Field;
4 | 
5 | public class ExperimentArguments {
6 | 
7 | public String[] PROCEDURES_PROP = {"datageneration-0, lasso-1, searchlearnedweightwithoutfeature-1, resultaggregation-1"}; // dagger-3, searchlearnedweightwithoutfeature-0,
8 | // , tunemodel-6, " + "searchlearnedweightwithoutfeature-6, resultaggregation-6"
9 | //public String[] PROCEDURES_PROP = , searchtrueloss-1, learn-1, searchlearnedweightwithoutfeature-1, resultaggregation-1
10 | //datageneration-0, resultaggregation-0, lasso-1, searchlearnedweightwithoutfeature-1, resultaggregation-1
11 | public String[] EXPERIMENT_PROP = {"datageneration.goldmention, feature.atomic.names"}; // MUST be included in every experiment config file
12 | 
13 | // corpus path
14 | public String[] CORPUS_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus"}; // MUST
15 | 
16 | // CONLL scorer path MUST
17 | public String[] CONLL_SCORER_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus/scorer/v4/scorer.pl"}; // MUST
18 | 
19 | // whether the experiment is in the debug mode or the cluster mode;
20 | // used to print out detailed information, while in a real cluster
21 | // run we would like it to run faster by reducing the output
22 | 
public String[] DEBUG_PROP = {"false"}; // MUST 23 | 24 | // WORDNET path 25 | public String[] WORDNET_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus/WordNet-3.0/dict"}; // MUST 26 | 27 | // 28 | // data generation 29 | // 30 | // within (false) or cross (true) reading data 31 | public String[] DATAGENERATION_DATASET_PROP = {"true"}; 32 | 33 | // gold mention (true) or predicted mention (false) 34 | public String[] DATAGENERATION_GOLDMENTION_PROP = { "true", "false"}; // MUST 35 | 36 | // GOLD cluster post process 37 | public String[] DATAGENERATION_POSTPROCESS_GOLD_PROP = {"false"}; 38 | 39 | // annotators used in the experiment 40 | public String[] DATAGENERATION_ANNOTATORS_PROP = {"tokenize, ssplit, pos, lemma, ner, parse, dcoref"}; // MUST 41 | 42 | // training set 43 | public String[] DATAGENERATION_TRAININGSET_PROP = {"5, 6"}; // MUST 43, 44 | //"5, 6, 8, 11, 16, 25, 30, 31, 37, 40, 43, 44" 45 | // testing set 46 | public String[] DATAGENERATION_TESTINGSET_PROP = {"10, 14"}; 47 | //"1, 2, 4, 7, 9, 10, 13, 14, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 32, 33, 34, 35, 36, 39, 41, 42, 45" 48 | 49 | //public String[] DATAGENERATION_DEVELOPMENTSET_PROP = {"3, 12, 38"}; 50 | 51 | // 52 | // search 53 | // 54 | // public String[] SEARCH_TYPE = {"searchtrueloss"}; 55 | 56 | 57 | // // best state score 58 | // public String[] BEST_STATE_PROP = {"true"}; // MUST 59 | // 60 | // // whether use all sieves or all sieves except Pronoun sieve 61 | // public String[] SIEVE_PROP = {"partial"}; 62 | // 63 | // // do training to learn a weight 64 | // public String[] DOTRAINING_PROP = {"true"}; 65 | // 66 | // // use existed weight to do testing, whether do validation or do final testing 67 | // public String[] EXISTEDWEIGHT_PROP = {"false"}; 68 | // 69 | // // classifier 70 | // public String[] CLASSIFIER_PROP = {"StructuredPerceptron"}; 71 | // public String[] CLASSIFIER_EPOCH_PROP = {"10"}; 72 | // 73 | // // cost function used, for example, linear 74 | // public String[] COSTFUNCTION_PROP = {"LinearCostFunction"}; 75 | // 76 | // // loss function used score type 77 | // public String[] LOSSFUNCTION_PROP = {"MetricLossFunction"}; 78 | // public String[] LOSSFUNCTION_SCORE_PROP = {"Pairwise"}; 79 | // 80 | // // search, its beam width, maximum step 81 | // public String[] SEARCH_PROP = {"BeamSearch"}; 82 | // public String[] SEARCH_BEAMWIDTH_PROP = {"1"}; 83 | // public String[] SEARCH_MAXIMUMSTEP_PROP = {"600"}; 84 | // 85 | // // stopping criterion (if tune, then its stopping rate) 86 | // public String[] STOPPING_CRITERION = {"none"}; 87 | // 88 | // // whether print the testing performance on training set 89 | // public String[] TRAINING_VALIDATION_PROP = {"true"}; 90 | // 91 | // // average weight or latest weight 92 | // public String[] WEIGHT_PROP = {"true"}; 93 | // 94 | // // Method configuration 95 | // public String[] METHOD_PROP = {"Dagger"}; 96 | // public String[] METHOD_FUNCTION_NUMBER_PROP = {"1", "3", "5"}; 97 | // 98 | // // use which training method to train the algorithm, Online, OnlineToBatch, Batch 99 | // public String[] TRAINING_STYLE_PROP = {"AROWOnline"}; 100 | // public String[] TRAINING_NORMALIZE_WEIGHT = {"false"}; 101 | // 102 | // // stanford preprocessing 103 | // public String[] STANFORD_PREPROCESSING = {"true"}; 104 | // 105 | // // state feature 106 | // public String[] STATE_FEATURE = {"false"}; 107 | // 108 | // // Atomic features 109 | public String[] FEATURE_ATOMIC_NAMES = {"F"}; // "N" 110 | 111 | public static void main(String[] args) { 112 | 
ExperimentArguments generator = new ExperimentArguments(); 113 | Class generatorClass = generator.getClass(); 114 | 115 | Field[] fields = generatorClass.getFields(); 116 | 117 | for (Field field : fields) { 118 | try { 119 | System.out.println(field.getName() + "--->" + field.get(generator)); 120 | } catch (Exception e) { 121 | throw new RuntimeException(e); 122 | } 123 | } 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /server/JobState.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.text.DateFormat; 4 | import java.text.SimpleDateFormat; 5 | import java.util.ArrayList; 6 | import java.util.Date; 7 | import java.util.List; 8 | 9 | public class JobState { 10 | 11 | private static final DateFormat DATE_FORMAT = new SimpleDateFormat( 12 | "MM/dd/yyyy HH:mm:ss"); 13 | 14 | private int jobId; 15 | 16 | private double prior; 17 | 18 | private String displayName; 19 | 20 | private String fullName; 21 | 22 | private String userName; 23 | 24 | private String state; 25 | 26 | private Date startTime; 27 | 28 | private String queue; 29 | 30 | private Integer slotsJaTaskId; 31 | 32 | public JobState(int jobId, double prior, String displayName, 33 | String userName, String state, Date startTime, String queue, 34 | Integer slotsJaTaskId) { 35 | this.jobId = jobId; 36 | this.prior = prior; 37 | this.displayName = displayName; 38 | this.userName = userName; 39 | this.state = state; 40 | this.startTime = startTime; 41 | this.queue = queue; 42 | this.slotsJaTaskId = slotsJaTaskId; 43 | } 44 | 45 | public static List parseJobIds(String info) throws Exception { 46 | List jobIds = new ArrayList(); 47 | if (info.isEmpty()) { 48 | return jobIds; 49 | } 50 | String[] lines = info.trim().split("\\n"); 51 | for (int i = 0; i < lines.length; ++i) { 52 | String line = lines[i].trim(); 53 | if (i == 0) { 54 | if (!"job-ID prior name user state submit/start at queue jclass slots ja-task-ID" 55 | .equals(line)) { 56 | throw new Exception("Unexpected header: " + line); 57 | } 58 | } else if (i >= 2) { 59 | // int jobId = Integer.valueOf(line.substring(0, 7).trim()); 60 | int jobId = Integer.valueOf(line.split("\\s+")[0]); 61 | jobIds.add(jobId); 62 | } 63 | } 64 | return jobIds; 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | StringBuffer buffer = new StringBuffer(); 70 | buffer.append("===== Job State ====="); 71 | buffer.append("jobId: " + jobId + "\n"); 72 | buffer.append("prior: " + prior + "\n"); 73 | buffer.append("displayName: " + displayName + "\n"); 74 | buffer.append("userName: " + userName + "\n"); 75 | buffer.append("state: " + state + "\n"); 76 | buffer.append("startTime: " + DATE_FORMAT.format(startTime) + "\n"); 77 | buffer.append("queue: " + queue + "\n"); 78 | buffer.append("slotsJaTaskId: " + slotsJaTaskId + "\n"); 79 | return buffer.toString(); 80 | } 81 | 82 | public int getJobId() { 83 | return jobId; 84 | } 85 | 86 | public double getPrior() { 87 | return prior; 88 | } 89 | 90 | public String getDisplayName() { 91 | return displayName; 92 | } 93 | 94 | public String getFullName() { 95 | return fullName; 96 | } 97 | 98 | public String getUserName() { 99 | return userName; 100 | } 101 | 102 | public String getState() { 103 | return state; 104 | } 105 | 106 | public Date getStartTime() { 107 | return startTime; 108 | } 109 | 110 | public String getQueue() { 111 | return queue; 112 | } 113 | 114 | public Integer 
getSlotsJaTaskId() { 115 | return slotsJaTaskId; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /server/JobSubmit.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.File; 4 | import java.text.DateFormat; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Date; 7 | 8 | /** 9 | * submit the jobs to the cluster automatically 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class JobSubmit { 15 | 16 | public static void main(String[] args) throws Exception { 17 | String originalPath = "/nfs/guille/xfern/users/xie/Experiment/experiment/"; 18 | 19 | DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); 20 | // get the current date and time with Date() 21 | Date date = new Date(); 22 | String folderName = dateFormat.format(date); 23 | System.out.println(folderName); 24 | 25 | String folderPath = originalPath + folderName; 26 | Runtime.getRuntime().exec("chmod -R u+x " + folderPath); 27 | 28 | File corpusDir = new File(folderPath); 29 | String[] directories = corpusDir.list(); 30 | 31 | // submit the jobs 32 | for (String directory : directories) { 33 | if (directory.startsWith("Job")) continue; 34 | String simplePath = folderPath + "/" + directory + "/simple.sh"; 35 | System.out.println(simplePath); 36 | Runtime.getRuntime().exec("qsub " + simplePath); 37 | } 38 | System.out.println("done!"); 39 | } 40 | } -------------------------------------------------------------------------------- /server/Node.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.lang.reflect.Field; 4 | import java.util.*; 5 | 6 | 7 | public class Node { 8 | 9 | public List<String> configuration; 10 | 11 | public Node() { 12 | configuration = new ArrayList<String>(); 13 | } 14 | 15 | public Node(List<String> configuration) { 16 | this.configuration = configuration; 17 | } 18 | 19 | public String toString() { 20 | StringBuilder sb = new StringBuilder(); 21 | 22 | for (String element : configuration) { 23 | sb.append(element + "\n"); 24 | } 25 | 26 | return sb.toString().trim(); 27 | } 28 | 29 | public Node cat(String element) { 30 | List<String> newconfiguration = new ArrayList<String>(); 31 | for (String existing : configuration) { 32 | newconfiguration.add(existing); 33 | } 34 | newconfiguration.add(element); 35 | 36 | return new Node(newconfiguration); 37 | } 38 | 39 | public static void main(String[] args) { 40 | // get its corresponding property 41 | ExperimentProperties properties = new ExperimentProperties(); 42 | Class<?> propertyClass = properties.getClass(); 43 | Field[] propertyFields = propertyClass.getFields(); 44 | Map<String, String> propertyMap = new HashMap<String, String>(); 45 | for (Field field : propertyFields) { 46 | try { 47 | propertyMap.put(field.getName(), field.get(properties).toString()); 48 | } catch (Exception e) { 49 | throw new RuntimeException(e); 50 | } 51 | } 52 | 53 | // get its specific arguments 54 | ExperimentArguments arguments = new ExperimentArguments(); 55 | Class<?> argumentClass = arguments.getClass(); 56 | Field[] argumentFields = argumentClass.getFields(); 57 | List<Map<String, List<String>>> argumentMap = new ArrayList<Map<String, List<String>>>(); 58 | for (Field field : argumentFields) { 59 | try { 60 | List<String> argument = Arrays.asList((String[]) field.get(arguments)); 61 | Map<String, List<String>> specificArgument = new HashMap<String, List<String>>(); 62 | specificArgument.put(field.getName(), argument); 63 | argumentMap.add(specificArgument); 64 | } catch (Exception e) { 65 |
throw new RuntimeException(e); 66 | } 67 | } 68 | 69 | int length = argumentMap.size(); 70 | List<Node> combinations = new ArrayList<Node>(); 71 | Queue<Node> queue = new LinkedList<Node>(); 72 | Node initialNode = new Node(); 73 | queue.offer(initialNode); 74 | 75 | int index = 0; // index of the argument list currently being expanded (breadth-first cross product over all argument lists) 76 | while (queue.size() > 0) { 77 | Node node = queue.poll(); 78 | 79 | if (index == length) { 80 | break; 81 | } 82 | 83 | Map<String, List<String>> array = argumentMap.get(index); 84 | for (String key : array.keySet()) { 85 | String configKey = propertyMap.get(key); 86 | 87 | List<String> elements = array.get(key); 88 | 89 | for (String element : elements) { 90 | Node child = node.cat(configKey + " = " + element); 91 | queue.offer(child); 92 | if (child.configuration.size() == length) { 93 | combinations.add(child); 94 | } 95 | } 96 | } 97 | 98 | if (allSameLength(queue)) { // every partial configuration has been extended with the current argument, so move on to the next one 99 | index += 1; 100 | } 101 | 102 | } 103 | 104 | System.out.println("done"); 105 | } 106 | 107 | private static boolean allSameLength(Queue<Node> queue) { 108 | Set<Integer> lengths = new HashSet<Integer>(); 109 | Iterator<Node> iterator = queue.iterator(); 110 | while(iterator.hasNext()) { 111 | Node node = iterator.next(); 112 | lengths.add(node.configuration.size()); 113 | } 114 | 115 | return lengths.size() == 1; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /server/ResultAggregation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.Properties; 6 | 7 | import edu.oregonstate.dataset.TopicGeneration; 8 | import edu.oregonstate.experiment.ExperimentConstructor; 9 | import edu.oregonstate.io.ResultOutput; 10 | import edu.oregonstate.util.Command; 11 | import edu.oregonstate.util.EecbConstants; 12 | import edu.stanford.nlp.util.StringUtils; 13 | 14 | /** 15 | * aggregate the results created by different jobs. For example, 16 | * during the final testing different jobs run on different topics, 17 | * so aggregating the results produced by those jobs yields the final 18 | * result, which is output to the experiment logFile 19 | * 20 | * @author Jun Xie (xie@eecs.oregonstate.edu) 21 | * 22 | */ 23 | public class ResultAggregation extends ExperimentConstructor { 24 | 25 | /** phase, for example the second round */ 26 | private final String phaseIndex; 27 | 28 | /** conll result */ 29 | private final String conllResultPath; 30 | 31 | /** corefCluster */ 32 | private final String[] COREFCLUSTER = {"goldCorefCluster", "predictedCorefCluster"}; 33 | 34 | public ResultAggregation(Properties props) { 35 | super(props); 36 | 37 | phaseIndex = props.getProperty(EecbConstants.PHASE_PROP, "0"); 38 | 39 | conllResultPath = experimentFolder + "/conll/" + phaseIndex; 40 | } 41 | 42 | /** 43 | * perform the result aggregation 44 | */ 45 | public void performExperiment() { 46 | TopicGeneration topicGenerator = new TopicGeneration(experimentProps); 47 | 48 | String[] trainingTopics = topicGenerator.trainingTopics(); 49 | calculatePerformance(trainingTopics, "trainingtopic"); 50 | 51 | String[] testingTopics = topicGenerator.testingTopics(); 52 | calculatePerformance(testingTopics, "testingtopic"); 53 | 54 | String[] developmentTopics = topicGenerator.developmentTopics(); 55 | calculatePerformance(developmentTopics, "developmenttopic"); 56 | } 57 | 58 | /** 59 | * calculate the performance on the entire set; 60 | * because each file is processed independently, the per-topic results are also generated independently 61 | *
62 | * @param topics 63 | * @param set 64 | */ 65 | private void calculatePerformance(String[] topics, String set) { 66 | if (topics == null) { 67 | return; 68 | } 69 | 70 | // whether all per-topic output files exist on disk 71 | boolean fileExist = true; 72 | 73 | String appendPhaseIndex = ""; 74 | if (!phaseIndex.equals("0")) { 75 | appendPhaseIndex = phaseIndex + "-"; 76 | } 77 | for (String resultType : COREFCLUSTER) { 78 | String outputPath = conllResultPath + "/" + resultType + "-" + phaseIndex + "-" + set; 79 | for (String topic : topics) { 80 | String topicPath = conllResultPath + "/" + resultType + "-" + appendPhaseIndex + set + "-" + topic; 81 | 82 | if (!Command.fileExists(topicPath)) { 83 | fileExist = false; 84 | break; 85 | } 86 | 87 | try { 88 | BufferedReader br = new BufferedReader(new FileReader(topicPath)); 89 | String currentLine = ""; 90 | while ((currentLine = br.readLine()) != null) { 91 | ResultOutput.writeTextFile(outputPath, currentLine); 92 | } 93 | 94 | br.close(); 95 | } catch (Exception e) { 96 | throw new RuntimeException(e); 97 | } 98 | } 99 | } 100 | 101 | // CoNLL scoring 102 | if (fileExist) { 103 | String goldCorefCluster = conllResultPath + "/goldCorefCluster-" + phaseIndex + "-" + set; 104 | String predictedCorefCluster = conllResultPath + "/predictedCorefCluster-" + phaseIndex + "-" + set; 105 | double[] finalScores = ResultOutput.printCorpusResult(experimentLogFile, goldCorefCluster, predictedCorefCluster, "the phase-" + phaseIndex + " model's performance on " + set); 106 | ResultOutput.writeTextFile(experimentFolder + "/" + set + ".csv", finalScores[0] + "\t" + finalScores[1] + "\t" + finalScores[2] + "\t" + finalScores[3] + "\t" + finalScores[4]); 107 | } 108 | } 109 | 110 | public static void main(String[] args) { 111 | if (args.length > 1) { 112 | System.out.println("too many parameters; you can specify only one path parameter"); 113 | System.exit(1); 114 | } 115 | 116 | if (args.length == 0) { 117 | // run the experiment on the local machine for debugging 118 | args = new String[1]; 119 | args[0] = "/nfs/guille/xfern/users/xie/Experiment/experiment/2013-04-23/0-experiment/0-resultaggregation-config.properties"; 120 | } 121 | 122 | String[] propArgs = new String[]{"-props", args[0]}; 123 | 124 | Properties props = StringUtils.argsToProperties(propArgs); 125 | ExperimentConstructor resultAggregator = new ResultAggregation(props); 126 | resultAggregator.performExperiment(); 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /server/pipeline.properties: -------------------------------------------------------------------------------- 1 | # procedures pipeline for the whole experiment 2 | procedures = datageneration 3 | # , searchtrueloss, learn, dagger-3, searchlearnedweightwithoutfeature 4 | 5 | # name of the experiment 6 | experiment = goldmention, method.function.number, feature.atomic.names 7 | 8 | # corpus directory 9 | corpus = /scratch/JavaFile/corpus 10 | 11 | # scorer path 12 | conll.scorer = /nfs/guille/xfern/users/xie/Experiment/corpus/scorer/v4/scorer.pl 13 | 14 | # debug case 15 | debug = true 16 | 17 | # wordnet 18 | wordnet = /nfs/guille/xfern/users/xie/Experiment/corpus/WordNet-3.0/dict 19 | 20 | 21 | # the properties of datageneration 22 | # set generation format: WithinCross (false), CrossTopic (true) 23 | datageneration.dataset = true 24 | datageneration.goldmention = false 25 | datageneration.postprocess.gold = true 26 | datageneration.trainingset = 6, 16 27 |
datageneration.testingset = 20, 38 28 | # annotators for Stanford CoreNLP 29 | datageneration.annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref 30 | 31 | # the properties of search 32 | -------------------------------------------------------------------------------- /training/AROWOnline.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * AROW Implementation based on Online training style 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class AROWOnline extends ITraining { 17 | 18 | /** 19 | * implement the online training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | double[][] finalVariance = finalParameter.getVariance(); 31 | 32 | for (String path : paths) { 33 | List<List<List<String>>> dataset = reader.readData(path); 34 | List<List<String>> goodDataset = dataset.get(0); 35 | List<List<String>> badDataset = dataset.get(1); 36 | 37 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 38 | 39 | for (int i = 0; i < randomLists.size(); i++){ 40 | int index = randomLists.get(i); 41 | 42 | List<String> goodRecords = goodDataset.get(index); 43 | List<String> badRecords = badDataset.get(index); 44 | // get the data 45 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 46 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 47 | 48 | if (!incorporateZeroVector) { 49 | if (reader.isAllZero(goodStates)) continue; 50 | } 51 | 52 | // form constraint 53 | for (State<CorefCluster> goodState : goodStates) { 54 | for (State<CorefCluster> badState : badStates) { 55 | numberOfInstance += 1; 56 | 57 | // if loss scores are equal, do not consider this kind of constraint 58 | double gLossScore = goodState.getF1Score(); 59 | double bLossScore = badState.getF1Score(); 60 | if (gLossScore == bLossScore) { 61 | continue; 62 | } 63 | 64 | // get the features of good state and bad state 65 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 66 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 67 | 68 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 69 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 70 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 71 | violation += 1; 72 | } 73 | 74 | double[] feature = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 75 | double margin = 0.0; 76 | if (enableNormalizeWeight) { 77 | double[] normalizedWeight = DoubleOperation.normalize(finalWeight); 78 | margin = DoubleOperation.time(normalizedWeight, feature); 79 | } else { 80 | margin = DoubleOperation.time(finalWeight, feature); 81 | } 82 | 83 | if (margin < 1) { 84 | double beta = 1 / ( DoubleOperation.transformation(feature, finalVariance) + mHyperParameter ); // AROW confidence: beta = 1 / (x' Sigma x + r), with r the hyper-parameter 85 | double alpha = Math.max(0, beta * (1 - DoubleOperation.time(feature, finalWeight))); // hinge coefficient of the weight update 86 | double constant = alpha; 87 | double[] delta =
DoubleOperation.time(DoubleOperation.matrixTime(finalVariance, feature), constant); 88 | boolean zeroVector = DoubleOperation.isZeroVector(delta); 89 | 90 | // update the weight and variance 91 | if (!zeroVector) { 92 | finalWeight = DoubleOperation.add(finalWeight, delta); 93 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 94 | 95 | double[] sumX = DoubleOperation.matrixTime(finalVariance, feature); 96 | double[][] sumXX = DoubleOperation.vectorProduct(sumX, feature); 97 | double[][] betaSumXX = DoubleOperation.time(sumXX, beta); 98 | double[][] betaSumXXSum = DoubleOperation.time(betaSumXX, finalVariance); 99 | finalVariance = DoubleOperation.matrixMinus(finalVariance, betaSumXXSum); // AROW variance update: Sigma = Sigma - beta * Sigma x x' Sigma 100 | } 101 | } 102 | 103 | } 104 | } 105 | } 106 | } 107 | 108 | return new Parameter(finalWeight, finalVariance, finalTotalWeight, violation, numberOfInstance); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /training/AROWOnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * AROW Implementation based on OnlineToBatch training style 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class AROWOnlineToBatch extends ITraining { 17 | 18 | /** 19 | * implement the online-to-batch training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | double[][] finalVariance = finalParameter.getVariance(); 31 | 32 | for (String path : paths) { 33 | List<List<List<String>>> dataset = reader.readData(path); 34 | List<List<String>> goodDataset = dataset.get(0); 35 | List<List<String>> badDataset = dataset.get(1); 36 | 37 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 38 | 39 | for (int i = 0; i < randomLists.size(); i++){ 40 | int index = randomLists.get(i); 41 | 42 | List<String> goodRecords = goodDataset.get(index); 43 | List<String> badRecords = badDataset.get(index); 44 | // get the data 45 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 46 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 47 | 48 | if (!incorporateZeroVector) { 49 | if (reader.isAllZero(goodStates)) continue; 50 | } 51 | 52 | // fix the weight and variance for the current batch 53 | double[] fixedWeight = new double[length]; 54 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 55 | double[][] fixedVariance = new double[length][length]; 56 | for (int row = 0; row < length; row++) { 57 | System.arraycopy(finalVariance[row], 0, fixedVariance[row], 0, length); 58 | } 59 | 60 | // form constraint 61 | for (State<CorefCluster> goodState : goodStates) { 62 | for (State<CorefCluster> badState : badStates) { 63 | numberOfInstance += 1; 64 | 65 | // if loss scores are equal, do not consider this kind of constraint 66 | double gLossScore = goodState.getF1Score(); 67 | double bLossScore = badState.getF1Score(); 68 | if (gLossScore == bLossScore) { 69 | continue; 70 | } 71 | 72 | // get the features of good state and bad state 73 |
double[] gNumericalFeatures = goodState.getNumericalFeatures(); 74 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 75 | 76 | // calculate the number of violated constraints 77 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 78 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 79 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 80 | violation += 1; 81 | } 82 | 83 | double[] feature = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 84 | double margin = DoubleOperation.time(fixedWeight, feature); 85 | if (margin < 1) { 86 | double beta = 1 / ( DoubleOperation.transformation(feature, fixedVariance) + mHyperParameter ); 87 | double alpha = Math.max(0, beta * (1 - DoubleOperation.time(feature, fixedWeight))); 88 | double constant = alpha; 89 | double[] delta = DoubleOperation.time(DoubleOperation.matrixTime(fixedVariance, feature), constant); 90 | boolean zeroVector = DoubleOperation.isZeroVector(delta); 91 | 92 | // update the weight and variance 93 | if (!zeroVector) { 94 | // update the weight 95 | finalWeight = DoubleOperation.add(finalWeight, delta); 96 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 97 | 98 | double[] sumX = DoubleOperation.matrixTime(fixedVariance, feature); 99 | double[][] sumXX = DoubleOperation.vectorProduct(sumX, feature); 100 | double[][] betaSumXX = DoubleOperation.time(sumXX, beta); 101 | double[][] betaSumXXSum = DoubleOperation.time(betaSumXX, fixedVariance); 102 | 103 | // update the variance 104 | finalVariance = DoubleOperation.matrixMinus(finalVariance, betaSumXXSum); 105 | } 106 | } 107 | 108 | } 109 | } 110 | } 111 | } 112 | 113 | return new Parameter(finalWeight, finalVariance, finalTotalWeight, violation, numberOfInstance); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /training/Batch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class Batch extends ITraining { 11 | 12 | /** 13 | * implement the batch training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | double[] previousTotalWeight = para.getTotalWeight(); 18 | int violations = para.getNoOfViolation(); 19 | int numberOfInstance = 0; 20 | 21 | double[] delta = new double[length]; 22 | double[] totalDelta = new double[length]; 23 | for (String path : paths) { 24 | List<List<List<String>>> dataset = reader.readData(path); 25 | List<List<String>> goodDataset = dataset.get(0); 26 | List<List<String>> badDataset = dataset.get(1); 27 | 28 | for (int index = 0; index < goodDataset.size(); index++){ 29 | List<String> goodRecords = goodDataset.get(index); 30 | List<String> badRecords = badDataset.get(index); 31 | 32 | // get the data 33 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 34 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 35 | 36 | if (!incorporateZeroVector) { 37 | if (reader.isAllZero(goodStates)) continue; 38 | } 39 | 40 | // form constraint 41 | for (State<CorefCluster> goodState : goodStates) { 42 | for (State<CorefCluster> badState : badStates) { 43 | numberOfInstance += 1; 44 | 45 | // if loss scores are equal, do not consider this kind of constraint 46 |
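// (when the two states have identical F1 scores the pair provides no
//  ranking signal, so no constraint, and hence no update, is generated)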
double gLossScore = goodState.getF1Score(); 47 | double bLossScore = badState.getF1Score(); 48 | if (gLossScore == bLossScore) { 49 | continue; 50 | } 51 | 52 | // get the features of good state and bad state 53 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 54 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 55 | 56 | // calculate the action score of good state and bad state 57 | double goodCostScoreForUpdating = DoubleOperation.time(previousWeight, gNumericalFeatures); 58 | double badCostScoreForUpdating = DoubleOperation.time(previousWeight, bNumericalFeatures); 59 | 60 | // violated current constraint 61 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 62 | violations += 1; 63 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 64 | delta = DoubleOperation.add(delta, direction); 65 | totalDelta = DoubleOperation.add(totalDelta, delta); 66 | } 67 | } 68 | } 69 | } 70 | } 71 | 72 | double[] weightedDelta = DoubleOperation.time(delta, learningRate); // apply the update accumulated over all violated constraints once, scaled by the learning rate 73 | double[] weightedTotalDelta = DoubleOperation.time(totalDelta, learningRate); 74 | 75 | double[] currentWeight = DoubleOperation.add(previousWeight, weightedDelta); 76 | double[] currentTotalWeight = DoubleOperation.add(previousTotalWeight, weightedTotalDelta); 77 | 78 | return new Parameter(currentWeight, para.getVariance(), currentTotalWeight, violations, numberOfInstance); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /training/ITraining.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | import java.util.Properties; 7 | 8 | import edu.oregonstate.classifier.Parameter; 9 | import edu.oregonstate.experiment.ExperimentConstructor; 10 | import edu.oregonstate.features.FeatureFactory; 11 | import edu.oregonstate.general.DoubleOperation; 12 | import edu.oregonstate.io.LargetFileReading; 13 | import edu.oregonstate.util.EecbConstants; 14 | 15 | /** 16 | * whether to incorporate the negative instance, according to the paper: Tuning as Ranking 17 | * 18 | * @author Jun Xie (xie@eecs.oregonstate.edu) 19 | * 20 | */ 21 | public abstract class ITraining { 22 | 23 | /** the length of the weight */ 24 | protected final int length; 25 | 26 | /** large file reader */ 27 | protected final LargetFileReading reader; 28 | 29 | /** whether to incorporate the zero good state */ 30 | protected final boolean incorporateZeroVector; 31 | 32 | /** hyper-parameter for AROW */ 33 | protected final double mHyperParameter; 34 | 35 | /** whether to enable the PA learning rate loss score */ 36 | private final boolean enablePALearningRateLossScore; 37 | 38 | /** enable discrepancy */ 39 | private final boolean enablePADiscrepancy; 40 | 41 | /** enable margin */ 42 | private final boolean enablePAMargin; 43 | 44 | /** enable normalizing the weight */ 45 | protected final boolean enableNormalizeWeight; 46 | 47 | public ITraining() { 48 | Properties mProps = ExperimentConstructor.experimentProps; 49 | length = FeatureFactory.getFeatureTemplate().size(); 50 | reader = new LargetFileReading(); 51 | incorporateZeroVector = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_INCORPORATE_ZERO_CASE, "true")); 52 | enablePALearningRateLossScore = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_RATE_LOSSSCORE, "true")); 53 |
enablePADiscrepancy = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_DISCREPANCY, "true")); 54 | enablePAMargin = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_MARGIN, "true")); 55 | enableNormalizeWeight = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_NORMALIZE_WEIGHT, "true")); 56 | mHyperParameter = Double.parseDouble(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_HYPERPARAMETER, "1.0")); 57 | } 58 | 59 | /* different weight update styles, including Batch, Online and OnlineToBatch */ 60 | public abstract Parameter train(List<String> paths, Parameter para, double learningRate); 61 | 62 | /** 63 | * create a randomly shuffled list of the indices 0 .. size-1 64 | * 65 | * @param size 66 | * @return 67 | */ 68 | protected List<Integer> createRandomIndex(int size) { 69 | List<Integer> arrays = new ArrayList<Integer>(); 70 | for (int i = 0; i < size; i++) { 71 | arrays.add(i); 72 | } 73 | 74 | Collections.shuffle(arrays); 75 | return arrays; 76 | } 77 | 78 | /** 79 | * calculate the PA loss 80 | * 81 | * @param gLossScore 82 | * @param bLossScore 83 | * @param gNumericalFeatures 84 | * @param bNumericalFeatures 85 | * @param weight 86 | * @return 87 | */ 88 | protected double calculatePALoss(double gLossScore, double bLossScore, double[] gNumericalFeatures, 89 | double[] bNumericalFeatures, double[] weight) { 90 | double loss = 0.0; // PA loss = (optional margin term) + (optional discrepancy term: w*x_bad - w*x_good) 91 | 92 | // calculate margin 93 | if (enablePAMargin) { 94 | double margin = 1.0; 95 | if (enablePALearningRateLossScore) { 96 | margin = gLossScore - bLossScore; 97 | } 98 | loss += margin; 99 | } 100 | 101 | // calculate the discrepancy 102 | if (enablePADiscrepancy) { 103 | double[] weightForCalculatingCost = null; 104 | if (enableNormalizeWeight) { 105 | weightForCalculatingCost = DoubleOperation.normalize(weight); 106 | } else { 107 | weightForCalculatingCost = weight; 108 | } 109 | 110 | double bCostScore = DoubleOperation.time(bNumericalFeatures, weightForCalculatingCost); 111 | double gCostScore = DoubleOperation.time(gNumericalFeatures, weightForCalculatingCost); 112 | loss += bCostScore - gCostScore; 113 | } 114 | 115 | return loss; 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /training/Online.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class Online extends ITraining { 11 | 12 | /** 13 | * implement the online training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | int violation = para.getNoOfViolation(); 18 | int numberOfInstance = 0; 19 | 20 | // used to update the weight 21 | Parameter finalParameter = para.makeCopy(); 22 | double[] finalWeight = finalParameter.getWeight(); 23 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 24 | 25 | for (String path : paths) { 26 | List<List<List<String>>> dataset = reader.readData(path); 27 | List<List<String>> goodDataset = dataset.get(0); 28 | List<List<String>> badDataset = dataset.get(1); 29 | 30 | for (int index = 0; index < goodDataset.size(); index++){ 31 | List<String> goodRecords = goodDataset.get(index); 32 | List<String> badRecords = badDataset.get(index); 33 | // get the data 34 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 35 | List<State<CorefCluster>> badStates =
reader.processString(badRecords); 36 | 37 | if (!incorporateZeroVector) { 38 | if (reader.isAllZero(goodStates)) continue; 39 | } 40 | 41 | // form constraint 42 | for (State<CorefCluster> goodState : goodStates) { 43 | for (State<CorefCluster> badState : badStates) { 44 | numberOfInstance += 1; 45 | 46 | // if loss scores are equal, do not consider this kind of constraint 47 | double gLossScore = goodState.getF1Score(); 48 | double bLossScore = badState.getF1Score(); 49 | if (gLossScore == bLossScore) { 50 | continue; 51 | } 52 | 53 | // get the features of good state and bad state 54 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 55 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 56 | 57 | // calculate the action score of good state and bad state 58 | double goodCostScoreForUpdating = DoubleOperation.time(finalWeight, gNumericalFeatures); 59 | double badCostScoreForUpdating = DoubleOperation.time(finalWeight, bNumericalFeatures); 60 | 61 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 62 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 63 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 64 | violation += 1; 65 | } 66 | 67 | // violated current constraint 68 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 69 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 70 | double[] term = DoubleOperation.time(direction, learningRate); 71 | finalWeight = DoubleOperation.add(finalWeight, term); 72 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 73 | } 74 | } 75 | } 76 | } 77 | } 78 | 79 | return new Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /training/OnlinePA.txt: -------------------------------------------------------------------------------- 1 | from https://cwiki.apache.org/confluence/display/MAHOUT/Online+Passive+Aggressive 2 | 3 | Data must be shuffled and normalized either between 0..1 or by mean and standard deviation. 4 | 5 | Technical details: 6 | 7 | The training approach taken is to minimize the ranking loss of the correct label vs the incorrect ones. We define this loss as hinge(1 - correct label score + wrong label score) where wrong label score is the score of the highest scoring label that is not the correct label. The hinge function is hinge = x if x > 0, 0 otherwise. 8 | 9 | Parameters: 10 | 11 | There is only one - learningRate. You set it to a larger number to converge faster, or a smaller number to be more cautious. The normal way to use it is via cross validation. Good values are (0.1, 1.0, 10.0).
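A minimal sketch of the hinge computation described above (illustrative example only, not a file in this repository; the method name is hypothetical):

    // hinge(1 - correctScore + wrongScore): positive exactly when the correct
    // label fails to beat the best wrong label by a margin of 1
    static double rankingHingeLoss(double correctScore, double wrongScore) {
        double x = 1 - correctScore + wrongScore;
        return x > 0 ? x : 0;   // hinge = x if x > 0, 0 otherwise
    }
    // e.g. rankingHingeLoss(0.9, 0.4) == 0.5 and rankingHingeLoss(2.0, 0.5) == 0.0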
12 | -------------------------------------------------------------------------------- /training/OnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class OnlineToBatch extends ITraining { 11 | 12 | /** 13 | * implement the online-to-batch training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | int violation = para.getNoOfViolation(); 18 | int numberOfInstance = 0; 19 | 20 | // used to update the weight 21 | Parameter finalParameter = para.makeCopy(); 22 | double[] finalWeight = finalParameter.getWeight(); 23 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 24 | 25 | for (String path : paths) { 26 | List<List<List<String>>> dataset = reader.readData(path); 27 | List<List<String>> goodDataset = dataset.get(0); 28 | List<List<String>> badDataset = dataset.get(1); 29 | 30 | // shuffle the data again 31 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 32 | 33 | for (int i = 0; i < randomLists.size(); i++){ 34 | int index = randomLists.get(i); 35 | List<String> goodRecords = goodDataset.get(index); 36 | List<String> badRecords = badDataset.get(index); 37 | // get the data 38 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 39 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 40 | 41 | if (!incorporateZeroVector) { 42 | if (reader.isAllZero(goodStates)) continue; 43 | } 44 | 45 | // fix the weight for the current batch 46 | double[] fixedWeight = new double[length]; 47 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 48 | 49 | // form constraint 50 | for (State<CorefCluster> goodState : goodStates) { 51 | for (State<CorefCluster> badState : badStates) { 52 | numberOfInstance += 1; 53 | 54 | // if loss scores are equal, do not consider this kind of constraint 55 | double gLossScore = goodState.getF1Score(); 56 | double bLossScore = badState.getF1Score(); 57 | if (gLossScore == bLossScore) { 58 | continue; 59 | } 60 | 61 | // get the features of good state and bad state 62 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 63 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 64 | 65 | // calculate the action score of good state and bad state 66 | double goodCostScoreForUpdating = DoubleOperation.time(fixedWeight, gNumericalFeatures); 67 | double badCostScoreForUpdating = DoubleOperation.time(fixedWeight, bNumericalFeatures); 68 | 69 | // calculate the number of violations 70 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 71 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 72 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 73 | violation += 1; 74 | } 75 | 76 | // violated current constraint 77 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 78 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 79 | if (DoubleOperation.isAllZero(direction)) continue; 80 | 81 | //ResultOutput.writeTextFile(ExperimentConstructor.logFile, "learning rate : " + learningRate); 82 | double[] term = DoubleOperation.time(direction, learningRate); 83 | finalWeight = DoubleOperation.add(finalWeight, term); 84 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 85 | } 86 | } 87 | }
88 | } 89 | } 90 | 91 | return new Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /training/PAOnline.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * use the PA algorithm to update the learned weight, in the Online mode 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class PAOnline extends ITraining { 17 | 18 | /** 19 | * implement the online training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | 31 | for (String path : paths) { 32 | List<List<List<String>>> dataset = reader.readData(path); 33 | List<List<String>> goodDataset = dataset.get(0); 34 | List<List<String>> badDataset = dataset.get(1); 35 | 36 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 37 | 38 | for (int i = 0; i < randomLists.size(); i++){ 39 | int index = randomLists.get(i); 40 | 41 | List<String> goodRecords = goodDataset.get(index); 42 | List<String> badRecords = badDataset.get(index); 43 | // get the data 44 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 45 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 46 | 47 | if (!incorporateZeroVector) { 48 | if (reader.isAllZero(goodStates)) continue; 49 | } 50 | 51 | // form constraint 52 | for (State<CorefCluster> goodState : goodStates) { 53 | for (State<CorefCluster> badState : badStates) { 54 | numberOfInstance += 1; 55 | 56 | // if loss scores are equal, do not consider this kind of constraint 57 | double gLossScore = goodState.getF1Score(); 58 | double bLossScore = badState.getF1Score(); 59 | if (gLossScore == bLossScore) { 60 | continue; 61 | } 62 | 63 | // get the features of good state and bad state 64 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 65 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 66 | 67 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 68 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 69 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 70 | violation += 1; 71 | } 72 | 73 | // calculate the loss 74 | double loss = calculatePALoss(gLossScore, bLossScore, gNumericalFeatures, bNumericalFeatures, finalWeight); 75 | if (loss > 0) { 76 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 77 | 78 | if (DoubleOperation.isAllZero(direction)) continue; 79 | 80 | double directionNorm = DoubleOperation.calculateTwoNorm(direction); 81 | double tau = loss / directionNorm; // PA step size 82 | // ResultOutput.writeTextFile(ExperimentConstructor.logFile, "tau : " + tau); 83 | double[] term = DoubleOperation.time(direction, tau); 84 | finalWeight = DoubleOperation.add(finalWeight, term); 85 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 86 | } 87 | } 88 | } 89 | } 90 | } 91 | 92 | return new
Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /training/PAOnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * PA using the OnlineToBatch mode 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class PAOnlineToBatch extends ITraining { 17 | 18 | /** 19 | * implement the online-to-batch training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | 31 | for (String path : paths) { 32 | List<List<List<String>>> dataset = reader.readData(path); 33 | List<List<String>> goodDataset = dataset.get(0); 34 | List<List<String>> badDataset = dataset.get(1); 35 | 36 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 37 | 38 | for (int i = 0; i < randomLists.size(); i++){ 39 | int index = randomLists.get(i); 40 | List<String> goodRecords = goodDataset.get(index); 41 | List<String> badRecords = badDataset.get(index); 42 | // get the data 43 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 44 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 45 | 46 | if (!incorporateZeroVector) { 47 | if (reader.isAllZero(goodStates)) continue; 48 | } 49 | 50 | double[] fixedWeight = new double[length]; 51 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 52 | 53 | // form constraint 54 | for (State<CorefCluster> goodState : goodStates) { 55 | for (State<CorefCluster> badState : badStates) { 56 | numberOfInstance += 1; 57 | 58 | // if loss scores are equal, do not consider this kind of constraint 59 | double gLossScore = goodState.getF1Score(); 60 | double bLossScore = badState.getF1Score(); 61 | if (gLossScore == bLossScore) { 62 | continue; 63 | } 64 | 65 | // get the features of good state and bad state 66 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 67 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 68 | 69 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 70 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 71 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 72 | violation += 1; 73 | } 74 | 75 | // calculate the loss 76 | double loss = calculatePALoss(gLossScore, bLossScore, gNumericalFeatures, bNumericalFeatures, fixedWeight); 77 | if (loss > 0) { 78 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 79 | 80 | if (DoubleOperation.isAllZero(direction)) continue; 81 | 82 | double directionNorm = DoubleOperation.calculateTwoNorm(direction); 83 | double tau = loss / directionNorm; 84 | double[] term = DoubleOperation.time(direction, tau); 85 | finalWeight = DoubleOperation.add(finalWeight, term); 86 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | return new Parameter(finalWeight,
para.getVariance(), finalTotalWeight, violation, numberOfInstance); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /tuning/TuningFactory.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.tuning; 2 | 3 | /** 4 | * tune the parameters, or choose which of the models generated by the Dagger framework is best, by decoding 5 | * on the development set 6 | * 7 | * @author Jun Xie (xie@eecs.oregonstate.edu) 8 | * 9 | */ 10 | public class TuningFactory { 11 | 12 | 13 | } -------------------------------------------------------------------------------- /util/Command.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import edu.oregonstate.search.State; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | import edu.stanford.nlp.dcoref.Mention; 10 | import edu.stanford.nlp.util.SystemUtils; 11 | 12 | /** 13 | * utility commands used for creating files, directories and similar tasks 14 | * 15 | * @author Jun Xie (xie@eecs.oregonstate.edu) 16 | * 17 | */ 18 | public class Command { 19 | 20 | // create a directory given a path string 21 | public static void mkdir(String path) { 22 | if (!fileExists(path)) { 23 | String command = "mkdir " + path; 24 | execCommand(command.split(" ")); 25 | } 26 | } 27 | 28 | // execute the Unix command 29 | public static void execCommand(String... command) { 30 | try { 31 | ProcessBuilder ps = new ProcessBuilder(command); 32 | SystemUtils.run(ps); 33 | } catch (Exception e) { 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | // execute the chmod command for a whole folderPath 39 | public static void chmod(String folderPath) { 40 | String command = "chmod -R u+x " + folderPath; 41 | execCommand(command.split(" ")); 42 | } 43 | 44 | /** 45 | * delete the whole directory 46 | * 47 | * @param directoryName 48 | */ 49 | public static void rmdir(String directoryName) { 50 | File directory = new File(directoryName); 51 | 52 | if (directory.exists()) { // new File(...) is never null; check existence instead 53 | String[] command = new String[] {"rm", "-rf", directoryName}; 54 | execCommand(command); 55 | } 56 | 57 | return; 58 | } 59 | 60 | /** 61 | * whether the file exists on disk 62 | * 63 | * @param filePath 64 | * @return 65 | */ 66 | public static boolean fileExists(String filePath) { 67 | File file = new File(filePath); 68 | return file.exists(); 69 | } 70 | 71 | /** 72 | * count the total number of mentions 73 | * 74 | * @param mentionList 75 | * @return 76 | */ 77 | public static int countMentions(List<List<Mention>> mentionList) { 78 | int totalNumber = 0; 79 | for (List<Mention> mentions : mentionList) { 80 | totalNumber += mentions.size(); 81 | } 82 | 83 | return totalNumber; 84 | } 85 | 86 | /** 87 | * update the allPredictedMentions, which is used by the Stanford scoring function. 88 | * The reason is that the corefClusters information has been updated;
the mention ids should stay consistent 89 | * with allPredictedMentions and corefClusters 90 | * 91 | * @param stateDocument 92 | * @param state 93 | */ 94 | public static void generateStateDocument(Document stateDocument, State<CorefCluster> state) { 95 | stateDocument.corefClusters = state.getState(); 96 | 97 | for (Integer id : stateDocument.corefClusters.keySet()) { 98 | CorefCluster cluster = stateDocument.corefClusters.get(id); 99 | for (Mention m : cluster.corefMentions) { 100 | int mentionID = m.mentionID; 101 | Mention correspondingMention = stateDocument.allPredictedMentions.get(mentionID); 102 | int clusterid = id; 103 | correspondingMention.corefClusterID = clusterid; 104 | } 105 | } 106 | } 107 | 108 | 109 | } 110 | -------------------------------------------------------------------------------- /util/CosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.util.*; 4 | 5 | 6 | /** 7 | * This class calculates the cosine similarity between two clusters, or entities 8 | * in our case. Each cluster consists of one mention at the initialization phase, 9 | * because we initialize each mention as a cluster. 10 | * <p> 11 | * In the search phase, we need to merge two clusters, each of which contains at 12 | * least one mention, so we need to extract features from the cluster 13 | * pairs. We represent the features of each cluster as a feature vector and then compute 14 | * the cosine similarity between the feature pairs. 15 | * <p>
16 | * The formula used to calculate the cosine similarity is borrowed from: 17 | * http://en.wikipedia.org/wiki/Cosine_similarity 18 | * 19 | * @author xie 20 | * 21 | */ 22 | public class CosineSimilarity { 23 | 24 | /** 25 | * calculate the cosine similarity between the feature vectors of two clusters 26 | * 27 | * The feature vector is represented as a HashMap from feature name to value. 28 | * 29 | * @param firstFeatures The feature vector of the first cluster 30 | * @param secondFeatures The feature vector of the second cluster 31 | * @return the similarity measure 32 | */ 33 | public static Double calculateCosineSimilarity(HashMap<String, Double> firstFeatures, HashMap<String, Double> secondFeatures) { 34 | Double similarity = 0.0; 35 | Double sum = 0.0; // the numerator of the cosine similarity 36 | Double fnorm = 0.0; // the first part of the denominator of the cosine similarity 37 | Double snorm = 0.0; // the second part of the denominator of the cosine similarity 38 | Set<String> fkeys = firstFeatures.keySet(); 39 | Iterator<String> fit = fkeys.iterator(); 40 | while (fit.hasNext()) { 41 | String featurename = fit.next(); 42 | boolean containKey = secondFeatures.containsKey(featurename); 43 | if (containKey) { 44 | sum = sum + firstFeatures.get(featurename) * secondFeatures.get(featurename); 45 | } 46 | } 47 | fnorm = calculateNorm(firstFeatures); 48 | snorm = calculateNorm(secondFeatures); 49 | similarity = sum / (fnorm * snorm); 50 | return similarity; 51 | } 52 | 53 | /** 54 | * calculate the norm of one feature vector 55 | * 56 | * @param feature the feature vector of one cluster 57 | * @return 58 | */ 59 | public static Double calculateNorm(HashMap<String, Double> feature) { 60 | Double norm = 0.0; 61 | Set<String> keys = feature.keySet(); 62 | Iterator<String> it = keys.iterator(); 63 | while (it.hasNext()) { 64 | String featurename = it.next(); 65 | norm = norm + Math.pow(feature.get(featurename), 2); 66 | } 67 | return Math.sqrt(norm); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /util/DocumentAlignment.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import edu.oregonstate.experiment.ExperimentConstructor; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | import edu.stanford.nlp.dcoref.Mention; 10 | import edu.stanford.nlp.dcoref.SieveCoreferenceSystem; 11 | 12 | public class DocumentAlignment { 13 | 14 | /** 15 | * update the coref cluster ID of each mention in the ordered predicted mentions 16 | * 17 | * @param document 18 | */ 19 | public static void alignDocument(Document document) { 20 | updateOrderedPredictedMentions(document); 21 | //updateOrderedGoldMentions(document); 22 | } 23 | 24 | private static void updateOrderedPredictedMentions(Document document) { 25 | List<List<Mention>> predictedOrderedMentionsBySentence = document.getOrderedMentions(); 26 | Map<Integer, CorefCluster> corefClusters = document.corefClusters; 27 | for (Integer clusterID : corefClusters.keySet()) { 28 | CorefCluster cluster = corefClusters.get(clusterID); 29 | for (Mention m : cluster.getCorefMentions()) { 30 | int sentenceID = m.sentNum; 31 | List<Mention> mentions = predictedOrderedMentionsBySentence.get(sentenceID); 32 | int mStartIndex = m.startIndex; 33 | int mEndIndex = m.endIndex; 34 | for (Mention mention : mentions) { 35 | int mentionStartIndex = mention.startIndex; 36 | int mentionEndIndex = mention.endIndex; 37 | if (mentionStartIndex == mStartIndex && mentionEndIndex == mEndIndex) { 38 | mention.mentionID =
m.mentionID; 39 | break; 40 | } 41 | } 42 | 43 | 44 | int mentionID = m.mentionID; 45 | Mention correspondingMention = document.allPredictedMentions.get(mentionID); 46 | correspondingMention.corefClusterID = clusterID; 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * update the coref cluster ID of each mention in the goldOrderedMentionsBySentence 53 | * 54 | * @param document 55 | */ 56 | private static void updateOrderedGoldMentions(Document document) { 57 | List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence; 58 | Map<Integer, CorefCluster> goldClusters = document.goldCorefClusters; 59 | for (Integer clusterID : goldClusters.keySet()) { 60 | CorefCluster cluster = goldClusters.get(clusterID); 61 | for (Mention m : cluster.getCorefMentions()) { 62 | int sentenceID = m.sentNum; 63 | List<Mention> mentions = goldOrderedMentionsBySentence.get(sentenceID); 64 | int mStartIndex = m.startIndex; 65 | int mEndIndex = m.endIndex; 66 | for (Mention mention : mentions) { 67 | int mentionStartIndex = mention.startIndex; 68 | int mentionEndIndex = mention.endIndex; 69 | if (mentionStartIndex == mStartIndex && mentionEndIndex == mEndIndex) { 70 | mention.mentionID = m.mentionID; 71 | break; 72 | } 73 | } 74 | 75 | int mentionID = m.mentionID; 76 | Mention correspondingMention = document.allGoldMentions.get(mentionID); 77 | correspondingMention.goldCorefClusterID = clusterID; 78 | } 79 | } 80 | } 81 | 82 | /** 83 | * post-process the document, and optionally its gold clusters 84 | * 85 | * @param document 86 | */ 87 | public static void postProcessDocument(Document document) { 88 | boolean postProcessGold = Boolean.parseBoolean(ExperimentConstructor.experimentProps.getProperty(EecbConstants.DATAGENERATION_POSTPROCESS_GOLD_PROP, "false")); 89 | SieveCoreferenceSystem.postProcessing(document); 90 | if (postProcessGold) { 91 | SieveCoreferenceSystem.postProcessingGoldClusters(document); 92 | } 93 | } 94 | 95 | /** 96 | * merge four fields from the from document into the to document; they are used only for scoring inside the system, not output for CoNLL scoring 97 | * 98 | * @param from 99 | * @param to 100 | */ 101 | public static void mergeDocument(Document from, Document to) { 102 | // add allGoldMentions 103 | for (Integer key : from.allGoldMentions.keySet()) { 104 | to.allGoldMentions.put(key, from.allGoldMentions.get(key)); 105 | } 106 | 107 | // add goldCorefClusters 108 | for (Integer key : from.goldCorefClusters.keySet()) { 109 | to.goldCorefClusters.put(key, from.goldCorefClusters.get(key)); 110 | } 111 | 112 | // add allPredictedMentions 113 | for (Integer key : from.allPredictedMentions.keySet()) { 114 | to.allPredictedMentions.put(key, from.allPredictedMentions.get(key)); 115 | } 116 | 117 | // add corefClusters 118 | for (Integer key : from.corefClusters.keySet()) { 119 | to.corefClusters.put(key, from.corefClusters.get(key)); 120 | } 121 | } 122 | 123 | } 124 | --------------------------------------------------------------------------------
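For illustration, a hypothetical snippet (not part of the repository; the feature names are example values) showing how CosineSimilarity above is used:

    HashMap<String, Double> first = new HashMap<String, Double>();
    first.put("Head", 1.0);
    first.put("Lemma", 2.0);
    HashMap<String, Double> second = new HashMap<String, Double>();
    second.put("Head", 1.0);
    second.put("Number", 1.0);
    // numerator = 1.0 (only "Head" is shared); the norms are sqrt(5) and sqrt(2),
    // so the similarity is 1.0 / (sqrt(5) * sqrt(2)) ~= 0.316
    Double similarity = CosineSimilarity.calculateCosineSimilarity(first, second);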