├── .gitignore
├── LICENSE
├── README.md
├── classifier
│   ├── ClassifierFactory.java
│   ├── IClassifier.java
│   ├── Parameter.java
│   ├── StructuredPerceptron.java
│   └── svm
│       ├── LibSVMInterface.java
│       ├── libsvm
│       │   ├── svm.java
│       │   ├── svm.m4
│       │   ├── svm.m4~
│       │   ├── svm_model.java
│       │   ├── svm_node.java
│       │   ├── svm_parameter.java
│       │   ├── svm_print_interface.java
│       │   └── svm_problem.java
│       ├── svm_predict.java
│       ├── svm_scale.java
│       ├── svm_toy.java
│       └── svm_train.java
├── cluster
│   ├── Cluster.java
│   ├── EM.java
│   ├── EvaluateModelCandidates.java
│   ├── HAC.java
│   ├── IClustering.java
│   ├── TFIDF.java
│   ├── agglomeration
│   │   ├── AgglomerationMethod.java
│   │   └── AverageLinkage.java
│   └── experiment
│       ├── DissimilarityMeasure.java
│       └── EecbDissimilarityMeasure.java
├── costfunction
│   ├── ICostFunction.java
│   └── LinearCostFunction.java
├── data
│   ├── EecbCharSeq.java
│   ├── EecbClusterDocument.java
│   ├── EecbElement.java
│   ├── EecbEntity.java
│   ├── EecbEntityMention.java
│   ├── EecbEvent.java
│   ├── EecbEventMention.java
│   ├── EecbMention.java
│   ├── EecbSrlAnnotation.java
│   ├── EecbToken.java
│   └── EecbTopic.java
├── dataset
│   ├── CorefSystem.java
│   ├── CrossTopic.java
│   ├── DatasetFactory.java
│   ├── IDataSet.java
│   └── TopicGeneration.java
├── example
│   ├── ReadLearnedWeight.java
│   ├── VectorNormalization.java
│   └── Weight.java
├── experiment
│   ├── CrossCoreferenceResolution.java
│   ├── ExperimentConfigurationFactory.java
│   ├── ExperimentConstructor.java
│   └── ProcessDataSVM.java
├── featureExtractor
│   ├── SRLAlignment.java
│   ├── SRLDocument.java
│   ├── SRLDocumentReader.java
│   ├── SRLExtraction.java
│   ├── SemanticOutputInterface.java
│   ├── SimilarityVector.java
│   ├── SrlResultIncorporation.java
│   ├── WordSimilarity.java
│   └── Wordnet.java
├── features
│   ├── Feature.java
│   ├── FeatureFactory.java
│   ├── FeatureVectorGenerator.java
│   ├── NominalFeature.java
│   ├── NumericFeature.java
│   └── individualfeature
│       ├── Animacy.java
│       ├── Gender.java
│       ├── Head.java
│       ├── Lemma.java
│       ├── MentionWord.java
│       ├── NEType.java
│       ├── NSrlA0.java
│       ├── NSrlA1.java
│       ├── NSrlA2.java
│       ├── NSrlAMLoc.java
│       ├── NSrlAgreeCount.java
│       ├── NSrlPA0.java
│       ├── NSrlPA1.java
│       ├── NSrlPA2.java
│       ├── NSrlPAMLoc.java
│       ├── Number.java
│       ├── SrlA0.java
│       ├── SrlA1.java
│       ├── SrlA2.java
│       ├── SrlAMLoc.java
│       ├── SrlAgreeCount.java
│       ├── SrlLeft.java
│       ├── SrlPA0.java
│       ├── SrlPA1.java
│       ├── SrlPA2.java
│       ├── SrlPAMLoc.java
│       ├── SrlRight.java
│       └── Synonym.java
├── general
│   ├── AverageAnytimeDataCollection.java
│   ├── Counter.java
│   ├── CounterMap.java
│   ├── DoubleOperation.java
│   ├── FinalScore.java
│   ├── FixedSizePriorityQueue.java
│   ├── MapFactory.java
│   ├── MatrixOperation.java
│   ├── PorterStemmer.java
│   ├── PriorityQueue.java
│   ├── SetOperation.java
│   └── StringOperation.java
├── io
│   ├── EECBMentionExtractor.java
│   ├── EecbReader.java
│   ├── EgenericDataSetReader.java
│   ├── EmentionExtractor.java
│   ├── LargeFileWriting.java
│   ├── LargetFileReading.java
│   ├── LibSVM.java
│   └── ResultOutput.java
├── lossfunction
│   ├── ILossFunction.java
│   ├── LinkLossFunction.java
│   └── MetricLossFunction.java
├── method
│   ├── CoreferenceResolutionDecoding.java
│   ├── Dagger.java
│   ├── Decoding.java
│   └── IMethod.java
├── pruning
│   └── Pruning.java
├── score
│   ├── AssignmentAlgorithm.java
│   ├── AssignmentProblem.java
│   ├── CoNLLScorerHelper.java
│   ├── HungarianAlgorithm.java
│   ├── ScorerCEAF.java
│   └── ScorerHelper.java
├── search
│   ├── BeamSearch.java
│   ├── ConstraintGeneration.java
│   ├── ISearch.java
│   ├── SearchFactory.java
│   └── State.java
├── server
│   ├── ClusterConnection.java
│   ├── ExperimentArguments.java
│   ├── ExperimentGeneration.java
│   ├── ExperimentProperties.java
│   ├── JobAssignment.java
│   ├── JobState.java
│   ├── JobSubmit.java
│   ├── Node.java
│   ├── Pipeline.java
│   ├── PipelineConfiguration.java
│   ├── ResultAggregation.java
│   └── pipeline.properties
├── training
│   ├── AROWOnline.java
│   ├── AROWOnlineToBatch.java
│   ├── Batch.java
│   ├── Development.java
│   ├── ITraining.java
│   ├── Online.java
│   ├── OnlinePA.txt
│   ├── OnlineToBatch.java
│   ├── PAOnline.java
│   └── PAOnlineToBatch.java
├── tuning
│   └── TuningFactory.java
└── util
    ├── Command.java
    ├── CosineSimilarity.java
    ├── DocumentAlignment.java
    ├── EecbConstants.java
    └── EecbConstructor.java
/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The code is open source and licensed under the GNU General Public License (v2 or later). This is the full GPL, which allows many free uses, but not its use in distributed proprietary software. 2 | 3 | This program is free software; you can redistribute it and/or 4 | modify it under the terms of the GNU General Public License 5 | as published by the Free Software Foundation; either version 2 6 | of the License, or (at your option) any later version. 7 | 8 | This program is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU General Public License for more details. 12 | 13 | If you have any problems with the code, please let me know. Thanks very much. 14 | 15 | Jun Xie (xiejuncs@gmail.com) 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cross-document-coreference-resolution 2 | ===================================== 3 | README 4 | 5 | Jun Xie (xie@eecs.oregonstate.edu) 6 | 7 | This is a cross-document coreference resolution system written in Java. 8 | 9 | The Stanford coreference resolution system is a within-document coreference resolution system. Hence, its essential data class is called Document. The Document class contains many fields representing the document information, such as the gold ordered mentions by sentence, the predicted ordered mentions by sentence, the predicted coreference clusters, the gold coreference clusters, all predicted mentions, and all gold mentions. Hence, in order to use the Stanford coreference resolution system, the crucial task is to process each EECB file into a Document object. The example I am imitating is the Stanford ACE 2005 machine reading sub-system. In the ACE 2005 corpus, each document has two associated files: key.apf.xml and raw.sgm. They constructed another class called AceDocument to play a role similar to the Document class. The AceDocument class represents the gold annotations, for example, the AceEntityMentions and AceEventMentions by sentence, as well as all AceEntityMentions and all AceEventMentions. Combined with the predicted mentions produced by the rule-based mention detection component, they converted the AceDocument object into a Document object. Based on the resulting Document object, the system performs coreference resolution. 10 | 11 | The overall architecture for the EECB corpus is similar to their ACE 2005 machine reading sub-system. Due to the differences between the EECB corpus and the ACE corpus, the implementation is a bit different.
The annotation is stored in a text file called mentions.txt. Each line is represented as follows: 12 | 13 | N or V? (0) Topic(1) Doc(2) Sentence Number(3) CorefID(4) StartIdx(5) EndIdx(6) StartCharIdx(7) EndCharIdx(8) 14 | 15 | So I need to extract the event and entity mentions according to mentions.txt and the original source text, and represent the tokens, mentions, entities, and events in my own data structures for each topic, where a topic consists of several documents (the reason for this is that our task is cross-document coreference resolution). Based on those data structures, I extract the gold annotations, while the predicted annotations come from the rule-based mention detection component provided by the Stanford system. Then I need to adapt my EECBDocument class to the Document class. Currently, I am implementing and debugging the transformation part. Once the transformation part is done, I can proceed to the search part. 16 | 17 | In addition, mentions.txt does not provide semantic role annotations. I also need to import the annotations produced by the SwiRL software (http://www.surdeanu.name/mihai/swirl/) into my code. 18 | -------------------------------------------------------------------------------- /classifier/ClassifierFactory.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.io.File; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | import java.util.Properties; 8 | 9 | import edu.oregonstate.dataset.TopicGeneration; 10 | import edu.oregonstate.experiment.ExperimentConstructor; 11 | import edu.oregonstate.features.FeatureFactory; 12 | import edu.oregonstate.io.ResultOutput; 13 | import edu.oregonstate.util.EecbConstants; 14 | import edu.oregonstate.util.EecbConstructor; 15 | import edu.stanford.nlp.util.StringUtils; 16 | 17 | /** 18 | * Run the classification method given the data path 19 | * 20 | * @author Jun Xie (xie@eecs.oregonstate.edu) 21 | * 22 | */ 23 | public class ClassifierFactory extends ExperimentConstructor { 24 | 25 | /** training topics */ 26 | private final String[] trainingTopics; 27 | 28 | /** classifier */ 29 | private final IClassifier classifier; 30 | 31 | /** phase, for example the second round */ 32 | private final String phaseIndex; 33 | 34 | public ClassifierFactory(Properties props) { 35 | super(props); 36 | 37 | // get training topics 38 | TopicGeneration topicGenerator = new TopicGeneration(props); 39 | trainingTopics = topicGenerator.trainingTopics(); 40 | 41 | // build a classifier 42 | classifier = EecbConstructor.createClassifier(props.getProperty(EecbConstants.CLASSIFIER_METHOD, "StructuredPerceptron")); 43 | 44 | phaseIndex = props.getProperty(EecbConstants.PHASE_PROP, "0"); 45 | } 46 | 47 | /** 48 | * perform the experiment 49 | */ 50 | public void performExperiment() { 51 | List<String> paths = getPaths(); 52 | 53 | ResultOutput.writeTextFile(experimentFolder + "/searchstep", "" + paths.size()); 54 | ResultOutput.writeTextFile(experimentLogFile, "the total number of training files : " + paths.size()); 55 | 56 | Parameter returnPara = classifier.train(paths, Integer.parseInt(phaseIndex)); 57 | ResultOutput.writeTextFile(experimentLogFile, "\n\nThe " + phaseIndex + "'s learned model \n"); 58 | ResultOutput.printParameter(returnPara, experimentLogFile); 59 | 60 | // output 61 | double[] averageWeight = returnPara.generateWeightForTesting(); 62 | String outputFile = experimentFolder + "/model/model" + phaseIndex;
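// write the learned model to disk: printStructredModel pairs each averaged weight with its feature-template name, so later phases can reload the model by feature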
63 | String outputString = ResultOutput.printStructredModel(averageWeight, FeatureFactory.getFeatureTemplate()); 64 | ResultOutput.writeTextFile(outputFile, outputString); 65 | } 66 | 67 | /** 68 | * get the paths of the training data 69 | * 70 | * 71 | * @return 72 | */ 73 | private List<String> getPaths() { 74 | List<String> filePaths = new ArrayList<String>(); 75 | filePaths.addAll(getPaths(trainingTopics)); 76 | 77 | return filePaths; 78 | } 79 | 80 | /** 81 | * aggregate the training data 82 | * 83 | * @param topics 84 | * 85 | * @return 86 | */ 87 | private List<String> getPaths(String[] topics) { 88 | List<String> allfiles = new ArrayList<String>(); 89 | for (String topic : topics) { 90 | List<String> files = getDivisionPaths(topic); 91 | String topicPath = experimentFolder + "/" + topic + "/data/"; 92 | List<String> filePaths = new ArrayList<String>(); 93 | for (String file : files) { 94 | filePaths.add(topicPath + file); 95 | } 96 | 97 | allfiles.addAll(filePaths); 98 | } 99 | 100 | return allfiles; 101 | } 102 | 103 | // get a sequence of data files, such as 1, 2, 3, 4, 5 104 | private List<String> getDivisionPaths(String topic) { 105 | String topicPath = experimentFolder + "/" + topic + "/data/"; 106 | List<String> files = new ArrayList<String>(Arrays.asList(new File(topicPath).list())); 107 | 108 | return files; 109 | } 110 | 111 | 112 | public static void main(String[] args) { 113 | if (args.length > 1) { 114 | System.out.println("too many arguments: you can only specify one configuration-file path"); 115 | System.exit(1); 116 | } 117 | 118 | if (args.length == 0) { 119 | // run the experiment on the local machine for debugging 120 | args = new String[1]; 121 | args[0] = "../corpus/config.properties"; 122 | } 123 | 124 | String[] propArgs = new String[]{"-props", args[0]}; 125 | 126 | Properties props = StringUtils.argsToProperties(propArgs); 127 | ExperimentConstructor classifier = new ClassifierFactory(props); 128 | classifier.performExperiment(); 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /classifier/IClassifier.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * interface of classifier 7 | * 8 | * @author Jun Xie (xiejuncs@gmail.com) 9 | * 10 | */ 11 | public interface IClassifier { 12 | 13 | /* train the model according to a file path and parameters */ 14 | public Parameter train(String path, Parameter para); 15 | 16 | /* train the model according to file paths and parameters */ 17 | public Parameter train(List<String> paths, Parameter para); 18 | 19 | /* use a zero vector to train the model */ 20 | public Parameter train(List<String> paths, int modelIndex); 21 | 22 | } 23 | -------------------------------------------------------------------------------- /classifier/Parameter.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import edu.oregonstate.experiment.ExperimentConstructor; 4 | import edu.oregonstate.general.DoubleOperation; 5 | import edu.oregonstate.util.EecbConstants; 6 | 7 | /** 8 | * there are five fields in this class: the current weights, the total weights, the number of violations, the number of instances, and the variance 9 | * 10 | * @author Jun Xie (xiejuncs@gmail.com) 11 | * 12 | */ 13 | public class Parameter { 14 | 15 | /* current weight */ 16 | private double[] mWeight; 17 | 18 | /* the total weight */ 19 | private double[] mTotalWeight; 20 | 21 | /* no of violations */ 22 | private int mNoOfViolation; 23
| 24 | /* number of instances */ 25 | private int mNumberOfInstance; 26 | 27 | /* variance, used in the AROW algorithm */ 28 | private double[][] mVariance; 29 | 30 | public Parameter(double[] weights) { 31 | this(weights, DoubleOperation.generateIdentityMatrix(weights.length), new double[weights.length]); 32 | } 33 | 34 | public Parameter(double[] weights, double[][] variance, double[] totalWeights) { 35 | this(weights, variance, totalWeights, 0, 0); 36 | } 37 | 38 | public Parameter(double[] weights, double[][] variance, double[] totalWeights, int noOfViolations, int numberOfInstances) { 39 | mWeight = weights; 40 | mTotalWeight = totalWeights; 41 | mNoOfViolation = noOfViolations; 42 | mNumberOfInstance = numberOfInstances; 43 | mVariance = variance; 44 | } 45 | 46 | public double[] getWeight() { 47 | return mWeight; 48 | } 49 | 50 | public double[] getTotalWeight() { 51 | return mTotalWeight; 52 | } 53 | 54 | public int getNoOfViolation() { 55 | return mNoOfViolation; 56 | } 57 | 58 | public int getNumberOfInstance() { 59 | return mNumberOfInstance; 60 | } 61 | 62 | public double[][] getVariance() { 63 | return mVariance; 64 | } 65 | 66 | /** 67 | * make a deep copy of the current object 68 | * 69 | * @return 70 | */ 71 | public Parameter makeCopy() { 72 | int length = mWeight.length; 73 | double[] copyWeight = new double[length]; 74 | double[] copyTotalWeight = new double[length]; 75 | double[][] copyVariance = new double[length][length]; 76 | System.arraycopy(mWeight, 0, copyWeight, 0, length); 77 | System.arraycopy(mTotalWeight, 0, copyTotalWeight, 0, length); 78 | 79 | // copy the variance row by row 80 | for (int index = 0; index < length; index++) { 81 | System.arraycopy(mVariance[index], 0, copyVariance[index], 0, length); 82 | } 83 | 84 | Parameter copyPara = new Parameter(copyWeight, copyVariance, copyTotalWeight, mNoOfViolation, mNumberOfInstance); 85 | return copyPara; 86 | } 87 | 88 | /** 89 | * generate the weight used for testing: either the averaged weight or the latest weight 90 | * 91 | * 92 | * @return 93 | */ 94 | // return the average weight or the latest weight 95 | public double[] generateWeightForTesting() { 96 | boolean averageWeight = Boolean.parseBoolean(ExperimentConstructor.experimentProps.getProperty(EecbConstants.SEARCH_WEIGHT, "true")); 97 | Parameter finalPara = this.makeCopy(); 98 | double[] learnedWeight; 99 | if (averageWeight) { 100 | learnedWeight = DoubleOperation.divide(finalPara.getTotalWeight(), finalPara.getNoOfViolation()); 101 | } else { 102 | learnedWeight = finalPara.getWeight(); 103 | } 104 | return learnedWeight; 105 | } 106 | 107 | } 108 | -------------------------------------------------------------------------------- /classifier/StructuredPerceptron.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | import java.util.Properties; 7 | 8 | import edu.oregonstate.experiment.ExperimentConstructor; 9 | import edu.oregonstate.features.FeatureFactory; 10 | import edu.oregonstate.general.DoubleOperation; 11 | import edu.oregonstate.io.ResultOutput; 12 | import edu.oregonstate.training.ITraining; 13 | import edu.oregonstate.util.EecbConstants; 14 | import edu.oregonstate.util.EecbConstructor; 15 | 16 | /** 17 | * Learn the weights 18 | * 19 | * @author Jun Xie (xie@eecs.oregonstate.edu) 20 | * 21 | */ 22 | public class StructuredPerceptron implements IClassifier { 23 | 24 | /* experiment property file */ 25
| private final Properties mProps; 26 | 27 | /* the total number of iterations */ 28 | private final int mIterations; 29 | 30 | /* experiment folder */ 31 | private final String experimentFolder; 32 | 33 | /* logFile */ 34 | private final String logFile; 35 | 36 | /* model index */ 37 | private int modelIndex; 38 | 39 | /* the weights recorded at each iteration, used for tracking progress */ 40 | private List<double[]> weights; 41 | 42 | /** the length of the features */ 43 | private final int length; 44 | 45 | /** training model */ 46 | private final ITraining trainingModel; 47 | 48 | /** whether the learning rate is constant or not */ 49 | private final boolean learningRateConstant; 50 | 51 | /** whether to print the result of each iteration during training */ 52 | private final boolean enablePrintIterationResult; 53 | 54 | /** how many iterations to wait between printed results */ 55 | private final int printIterationGap; 56 | 57 | /** 58 | * constructor 59 | */ 60 | public StructuredPerceptron() { 61 | mProps = ExperimentConstructor.experimentProps; 62 | experimentFolder = ExperimentConstructor.experimentFolder; 63 | mIterations = Integer.parseInt(mProps.getProperty(EecbConstants.CLASSIFIER_EPOCH_PROP, "50")); 64 | 65 | logFile = ExperimentConstructor.experimentLogFile; 66 | modelIndex = 0; 67 | String trainingStyle = mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_METHOD, "OnlineToBatch"); 68 | trainingModel = EecbConstructor.createTrainingModel(trainingStyle); 69 | List<String> featureTemplate = FeatureFactory.getFeatureTemplate(); 70 | length = featureTemplate.size(); 71 | weights = new ArrayList<double[]>(); 72 | 73 | learningRateConstant = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PERCEPTRON_LEARINGRATE_CONSTANT, "false")); 74 | enablePrintIterationResult = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_ITERATION_RESULT, "false")); 75 | printIterationGap = Integer.parseInt(mProps.getProperty(EecbConstants.CLASSIFIER_ITEARTION_GAP, "2")); 76 | } 77 | 78 | /** 79 | * use a zero vector to train the model 80 | */ 81 | public Parameter train(List<String> paths, int index) { 82 | ResultOutput.writeTextFile(logFile, "\nBegin to learn model : " + modelIndex); 83 | ResultOutput.writeTextFile(logFile, "\nStructured Perceptron with Iteration : " + mIterations); 84 | 85 | // model index 86 | modelIndex = index; 87 | double[] weight = new double[length]; 88 | Parameter para = new Parameter(weight); 89 | 90 | Parameter trainedPara = train(paths, para); 91 | 92 | return trainedPara; 93 | } 94 | 95 | /** 96 | * train the model on a set of files 97 | */ 98 | public Parameter train(List<String> paths, Parameter para) { 99 | double startingRate = Double.parseDouble(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PERCEPTRON_STARTRATE, "0.1")); 100 | double endRate = 0.0; 101 | if (learningRateConstant) { 102 | endRate = startingRate; 103 | } 104 | double[] learningRates = DoubleOperation.createDescendingArray(startingRate, endRate, mIterations); 105 | 106 | // do gradient update 107 | for (int i = 0; i < mIterations; i++) { 108 | double learningRate = learningRates[i]; 109 | weights.add(para.getWeight()); 110 | ResultOutput.writeTextFile(logFile, "the " + modelIndex + "'s model, iteration " + i); 111 | // ResultOutput.printParameter(para, logFile); 112 | 113 | // shuffle the paths 114 | Collections.shuffle(paths); 115 | int beforeViolation = para.getNoOfViolation(); 116 | 117 | // do weight update 118 | para = trainingModel.train(paths, para, learningRate); 119 | 120 | // print the number of violated constraints 121
| int afterViolation = para.getNoOfViolation(); 122 | ResultOutput.writeTextFile(experimentFolder + "/violation/violation-" + modelIndex +".csv", (afterViolation - beforeViolation) + "\t" + para.getNumberOfInstance()); 123 | 124 | } 125 | 126 | // calculate the weight difference between the previous iteration and the current iteration 127 | DoubleOperation.calcualateWeightDifference(weights, experimentFolder + "/weightdifference/weight-difference-"+ modelIndex + ".csv"); 128 | DoubleOperation.printWeightNorm(weights, experimentFolder + "/weightnorm/weight-norm-"+ modelIndex + ".csv"); 129 | 130 | return para; 131 | } 132 | 133 | /** 134 | * train the model from a single file path (currently a no-op that returns the parameter unchanged) 135 | */ 136 | public Parameter train(String path, Parameter para) { 137 | return para; 138 | } 139 | 140 | } -------------------------------------------------------------------------------- /classifier/svm/LibSVMInterface.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm; 2 | 3 | import edu.oregonstate.classifier.svm.libsvm.svm; 4 | import edu.oregonstate.classifier.svm.libsvm.svm_model; 5 | 6 | /** 7 | * An interface to LibSVM 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class LibSVMInterface { 13 | 14 | 15 | public static void main(String[] args) { 16 | String filePath = "/scratch/Software/libsvm-3.17/tools/prune.model"; 17 | 18 | svm_model model = null; 19 | try { 20 | model = svm.svm_load_model(filePath); 21 | } catch (Exception e) { 22 | throw new RuntimeException(e); 23 | } 24 | 25 | System.out.println("done"); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_model.java: -------------------------------------------------------------------------------- 1 | // 2 | // svm_model 3 | // 4 | package edu.oregonstate.classifier.svm.libsvm; 5 | 6 | public class svm_model implements java.io.Serializable 7 | { 8 | public svm_parameter param; // parameter 9 | public int nr_class; // number of classes, = 2 in regression/one class svm 10 | public int l; // total #SV 11 | public svm_node[][] SV; // SVs (SV[l]) 12 | public double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l]) 13 | public double[] rho; // constants in decision functions (rho[k*(k-1)/2]) 14 | public double[] probA; // pairwise probability information 15 | public double[] probB; 16 | public int[] sv_indices; // sv_indices[0,...,nSV-1] are values in [1,...,num_training_data] to indicate SVs in the training set 17 | 18 | // for classification only 19 | 20 | public int[] label; // label of each class (label[k]) 21 | public int[] nSV; // number of SVs for each class (nSV[k]) 22 | // nSV[0] + nSV[1] + ... 
+ nSV[k-1] = l 23 | }; 24 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_node.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | public class svm_node implements java.io.Serializable 3 | { 4 | public int index; 5 | public double value; 6 | } 7 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_parameter.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | 3 | public class svm_parameter implements Cloneable,java.io.Serializable 4 | { 5 | /* svm_type */ 6 | public static final int C_SVC = 0; 7 | public static final int NU_SVC = 1; 8 | public static final int ONE_CLASS = 2; 9 | public static final int EPSILON_SVR = 3; 10 | public static final int NU_SVR = 4; 11 | 12 | /* kernel_type */ 13 | public static final int LINEAR = 0; 14 | public static final int POLY = 1; 15 | public static final int RBF = 2; 16 | public static final int SIGMOID = 3; 17 | public static final int PRECOMPUTED = 4; 18 | 19 | public int svm_type; 20 | public int kernel_type; 21 | public int degree; // for poly 22 | public double gamma; // for poly/rbf/sigmoid 23 | public double coef0; // for poly/sigmoid 24 | 25 | // these are for training only 26 | public double cache_size; // in MB 27 | public double eps; // stopping criteria 28 | public double C; // for C_SVC, EPSILON_SVR and NU_SVR 29 | public int nr_weight; // for C_SVC 30 | public int[] weight_label; // for C_SVC 31 | public double[] weight; // for C_SVC 32 | public double nu; // for NU_SVC, ONE_CLASS, and NU_SVR 33 | public double p; // for EPSILON_SVR 34 | public int shrinking; // use the shrinking heuristics 35 | public int probability; // do probability estimates 36 | 37 | public Object clone() 38 | { 39 | try 40 | { 41 | return super.clone(); 42 | } catch (CloneNotSupportedException e) 43 | { 44 | return null; 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_print_interface.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | public interface svm_print_interface 3 | { 4 | public void print(String s); 5 | } 6 | -------------------------------------------------------------------------------- /classifier/svm/libsvm/svm_problem.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.classifier.svm.libsvm; 2 | 3 | public class svm_problem implements java.io.Serializable 4 | { 5 | public int l; 6 | public double[] y; 7 | public svm_node[][] x; 8 | } 9 | -------------------------------------------------------------------------------- /cluster/Cluster.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | import Jama.Matrix; 6 | 7 | import edu.oregonstate.data.EecbClusterDocument; 8 | 9 | /** 10 | * cluster representation 11 | * 12 | * Jun Xie (xiejuncs@gmail.com) 13 | */ 14 | public class Cluster { 15 | 16 | public int mID; 17 | public List<EecbClusterDocument> documents; 18 | public List<Cluster> children; 19 | 20 | public Cluster(int id) { 21 | mID = id; 22 | documents = new ArrayList<EecbClusterDocument>(); 23 | children = new ArrayList<Cluster>(); 24 | } 25 | 
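// the accessors and mutators below manage the cluster's member documents and its merge children (the dendrogram structure)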
26 | public void addDocument(EecbClusterDocument document) { 27 | documents.add(document); 28 | } 29 | 30 | public void addDocuments(List<EecbClusterDocument> docus) { 31 | documents.addAll(docus); 32 | } 33 | 34 | public void addChildrens(List<Cluster> child) { 35 | children.addAll(child); 36 | } 37 | 38 | public List<EecbClusterDocument> getDocuments() { 39 | return documents; 40 | } 41 | 42 | public void addChildren(Cluster cluster) { 43 | children.add(cluster); 44 | } 45 | 46 | public List<Cluster> getChildren() { 47 | return children; 48 | } 49 | 50 | public int getID() { 51 | return mID; 52 | } 53 | 54 | /** to = to + from, and then delete from */ 55 | public static void mergeClusters(Cluster to, Cluster from) { 56 | int toID = to.getID(); 57 | to.addChildren(to); 58 | for (EecbClusterDocument m : from.getDocuments()) { 59 | to.addDocument(m); 60 | } 61 | to.addChildren(from); 62 | System.out.println("merge clusters :" + toID + " <----- " + from.getID()); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /cluster/HAC.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.List; 4 | import java.util.ArrayList; 5 | import java.util.Map; 6 | import java.util.HashMap; 7 | import java.util.Collection; 8 | import java.util.Collections; 9 | 10 | import edu.oregonstate.data.EecbClusterDocument; 11 | import edu.oregonstate.cluster.Cluster; 12 | import edu.oregonstate.cluster.agglomeration.AgglomerationMethod; 13 | import edu.oregonstate.cluster.experiment.DissimilarityMeasure; 14 | 15 | /** 16 | * implementation of the Hierarchical Agglomerative Clustering (HAC) method 17 | * STEPS 18 | * 1. start with each object in a separate cluster; 19 | * 2. repeatedly join the closest pair of clusters; 20 | * 3. 
until there is only one cluster 21 | * 22 | * Jun Xie (xiejuncs@gmail.com) 23 | */ 24 | public class HAC { 25 | 26 | public List<Cluster> mClusters; 27 | public List<EecbClusterDocument> mDocuments; 28 | // interfaces to incorporate different dissimilarity measures and different agglomeration methods 29 | private DissimilarityMeasure dissimilarityMeasure; 30 | private List<Cluster> mergeResult; // all dendrogram clusters 31 | private AgglomerationMethod method; 32 | public List<String> mergeSequence; 33 | 34 | public HAC(List<EecbClusterDocument> documents, DissimilarityMeasure dissimilarityMeasure, 35 | AgglomerationMethod agglomerationMethod) { 36 | mDocuments = documents; 37 | mClusters = new ArrayList<Cluster>(); 38 | this.dissimilarityMeasure = dissimilarityMeasure; 39 | mergeResult = new ArrayList<Cluster>(); 40 | method = agglomerationMethod; 41 | mergeSequence = new ArrayList<String>(); 42 | initialize(); 43 | } 44 | 45 | public List<String> getSequence() { 46 | return mergeSequence; 47 | } 48 | 49 | public List<Cluster> getMergeResult() { 50 | return mergeResult; 51 | } 52 | 53 | /** 54 | * perform the clustering 55 | */ 56 | public void cluster() { 57 | Map<String, Double> dissimilarityMatrix = computeDissimilarityMatrix(); 58 | String minIndex = minimum(dissimilarityMatrix); 59 | // merge until there is only one cluster 60 | boolean flag = true; 61 | while(flag) { 62 | String[] indices = minIndex.split("-"); 63 | int to = Integer.parseInt(indices[0]); 64 | int from = Integer.parseInt(indices[1]); 65 | mergeSequence.add(mClusters.get(to).getID() + "-" + mClusters.get(from).getID()); 66 | Cluster.mergeClusters(mClusters.get(to), mClusters.get(from)); 67 | 68 | 69 | Cluster intermediateResult = new Cluster(to); 70 | intermediateResult.addChildrens(mClusters.get(to).getChildren()); 71 | intermediateResult.addDocuments(mClusters.get(to).getDocuments()); 72 | // also need to deep copy the cluster object, not just an ArrayList of cluster objects 73 | mClusters.remove(from); 74 | mergeResult.add(intermediateResult); // shallow copy, needs a deep copy 75 | dissimilarityMatrix = new HashMap<String, Double>(); 76 | dissimilarityMatrix = computeDissimilarityWithDifferentMethod(); 77 | if (dissimilarityMatrix.size() == 0) break; 78 | minIndex = minimum(dissimilarityMatrix); 79 | } 80 | } 81 | 82 | private Map<String, Double> computeDissimilarityWithDifferentMethod() { 83 | Map<String, Double> dissimilarityMatrix = new HashMap<String, Double>(); 84 | /** calculate the dissimilarity score for each pair (i,j) s.t. i != j*/ 85 | for (int i = 0; i < mClusters.size(); i++) { 86 | for (int j = 0; j < i; j++) { 87 | double dissimilarity = method.computeDissimilarity(mClusters.get(i), mClusters.get(j)); 88 | dissimilarityMatrix.put(Integer.toString(i) + "-" + Integer.toString(j), dissimilarity); 89 | } 90 | } 91 | 92 | return dissimilarityMatrix; 93 | } 94 | 95 | 96 | /* compare the HashMap entries to get the index with the minimum value */ 97 | public String minimum(Map<String, Double> scores) { 98 | Collection<Double> c = scores.values(); 99 | Double minvalue = Collections.min(c); 100 | String minIndex = ""; 101 | for (String key : scores.keySet()) { 102 | Double value = scores.get(key); 103 | if (value.equals(minvalue)) { 104 | minIndex = key; 105 | break; 106 | } 107 | } 108 | 109 | return minIndex; 110 | } 111 | 112 | 113 | private Map<String, Double> computeDissimilarityMatrix() { 114 | Map<String, Double> dissimilarityMatrix = new HashMap<String, Double>(); 115 | /** calculate the dissimilarity score for each pair (i,j) s.t. 
i != j*/ 116 | for (int i = 0; i < mDocuments.size(); i++) { 117 | for (int j = 0; j < i; j++) { 118 | double dissimilarity = dissimilarityMeasure.computeDissimilarity(mDocuments, i, j); 119 | dissimilarityMatrix.put(Integer.toString(i) + "-" + Integer.toString(j), dissimilarity); 120 | } 121 | } 122 | 123 | return dissimilarityMatrix; 124 | } 125 | 126 | /** 127 | * initialize: put each document in a separate cluster 128 | */ 129 | private void initialize() { 130 | for (int i = 0; i < mDocuments.size(); i++) { 131 | Cluster cluster = new Cluster(i); 132 | cluster.addDocument(mDocuments.get(i)); 133 | mClusters.add(cluster); 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /cluster/IClustering.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | public interface IClustering { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /cluster/TFIDF.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.ArrayList; 7 | import java.util.Set; 8 | import java.util.HashSet; 9 | import java.util.Map; 10 | import java.util.Iterator; 11 | 12 | import edu.oregonstate.general.CounterMap; 13 | 14 | /** 15 | * Implementation of the tf-idf-weighted term vector representation. 16 | * 17 | * the clustering algorithm uses the vector-space model to represent each document. 18 | * In this model, each document $d$ is considered to be a vector in the term-space. 19 | * In particular, the algorithm employs the tf-idf term weighting model, in which 20 | * each document is represented as $(tf_{1}\log(n/df_{1}), tf_{2}\log(n/df_{2}), \ldots)$, where $tf_{i}$ 21 | * is the frequency of the $i$th term in the document and $df_{i}$ is the number of 22 | * documents that contain the $i$th term. 
To account for documents of different lengths, 23 | * the length of each document vector is normalized so that it is of unit length, $|d| = 1$ 24 | * 25 | * Jun Xie (xiejuncs@gmail.com) 26 | */ 27 | public class TFIDF { 28 | 29 | private List<List<String>> mDocuments; 30 | public Set<String> dictionary; 31 | private Map<String, Integer> wordTotalCount; 32 | private Map<String, List<Integer>> invertedIndex; 33 | private Map<String, Map<Integer, Integer>> wordCount; 34 | private CounterMap<String, Integer> tfidf; // word and document index 35 | private int documentCount; 36 | 37 | /** 38 | * initialize all fields 39 | * 40 | * @param documents : each document contains a list of strings 41 | * 42 | */ 43 | public TFIDF(List<List<String>> documents) { 44 | mDocuments = documents; 45 | documentCount = mDocuments.size(); 46 | dictionary = new HashSet<String>(); 47 | invertedIndex = new HashMap<String, List<Integer>>(); 48 | wordCount = new HashMap<String, Map<Integer, Integer>>(); 49 | tfidf = new CounterMap<String, Integer>(); 50 | wordTotalCount = new HashMap<String, Integer>(); 51 | } 52 | 53 | /** 54 | * build the tf-idf representation 55 | * 56 | * @return the normalized tf-idf vectors 57 | */ 58 | public CounterMap<String, Integer> buildTFIDF() { 59 | buildDictionary(); 60 | index(); 61 | calculateTFIDF(); 62 | 63 | return tfidf; 64 | } 65 | 66 | /** 67 | * calculate the tf-idf weights 68 | */ 69 | private void calculateTFIDF() { 70 | for (String token : wordCount.keySet()) { 71 | Map<Integer, Integer> docFreq = wordCount.get(token); 72 | int tokenFreq = invertedIndex.get(token).size(); 73 | for (Integer doc : docFreq.keySet()) { 74 | Integer count = docFreq.get(doc); 75 | double w = 0.0; 76 | if (count > 0) { 77 | w = 1 + Math.log10(count); 78 | } 79 | Double value = w * Math.log10((double) documentCount / tokenFreq); 80 | tfidf.setCount(token, doc, value); 81 | } 82 | 83 | } 84 | } 85 | 86 | /** 87 | * build the inverted index 88 | */ 89 | private void index() { 90 | Iterator<String> it = dictionary.iterator(); 91 | while (it.hasNext()) { 92 | String token = it.next(); 93 | List<Integer> posting = new ArrayList<Integer>(); 94 | Map<Integer, Integer> count = new HashMap<Integer, Integer>(); 95 | for (int i = 0; i < mDocuments.size(); i++) { 96 | List<String> document = mDocuments.get(i); 97 | if (document.contains(token)) { 98 | posting.add(i); 99 | } 100 | int occurrence = Collections.frequency(document, token); 101 | count.put(i, occurrence); 102 | wordCount.put(token, count); 103 | } 104 | invertedIndex.put(token, posting); 105 | } 106 | } 107 | 108 | /** 109 | * terms that appear only once in the whole collection are removed here (tokens with a total count below 2) 110 | */ 111 | private void buildDictionary() { 112 | for (int i = 0; i < mDocuments.size(); i++) { 113 | List<String> document = mDocuments.get(i); 114 | for (String token : document) { 115 | boolean contains = wordTotalCount.containsKey(token); 116 | int count = 0; 117 | if (contains) { 118 | count = wordTotalCount.get(token); 119 | } 120 | wordTotalCount.put(token, (count + 1)); 121 | } 122 | } 123 | 124 | // delete the tokens with a count of 1 125 | Iterator<String> it = wordTotalCount.keySet().iterator(); 126 | while (it.hasNext()) { 127 | String token = it.next(); 128 | if (wordTotalCount.get(token) < 2) { 129 | it.remove(); 130 | } 131 | } 132 | 133 | dictionary = wordTotalCount.keySet(); 134 | } 135 | 136 | } 137 | -------------------------------------------------------------------------------- /cluster/agglomeration/AgglomerationMethod.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.agglomeration; 2 | 3 | import edu.oregonstate.cluster.Cluster; 4 | 5 | /** 6 | * Strategy interface for computing the dissimilarity between two clusters during agglomeration 7 | * 8 | * @author Jun Xie (xiejuncs@gmail.com) 9 | * 10 | */ 11 | 12 | public interface AgglomerationMethod { 13 | 14 | /** 15 | * Compute the 
dissimilarity between two clusters 16 | * 17 | * @return the dissimilarity between clusters c1 and c2. 18 | */ 19 | public double computeDissimilarity(Cluster c1, Cluster c2); 20 | } 21 | -------------------------------------------------------------------------------- /cluster/agglomeration/AverageLinkage.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.agglomeration; 2 | 3 | import edu.oregonstate.cluster.Cluster; 4 | import edu.oregonstate.cluster.experiment.DissimilarityMeasure; 5 | import edu.oregonstate.cluster.experiment.EecbDissimilarityMeasure; 6 | import Jama.Matrix; 7 | import edu.oregonstate.data.EecbClusterDocument; 8 | /** 9 | * average linkage, or group average 10 | * Formula 11 | * 12 | * dist(c_{i}, c_{j}) = \frac{1}{n_{i} n_{j}} \sum_{d_{r} \in c_{i}} \sum_{d_{s} \in c_{j}} dist(d_{r}, d_{s}) 13 | * 14 | * @author Jun Xie (xiejuncs@gmail.com) 15 | * 16 | */ 17 | public class AverageLinkage implements AgglomerationMethod { 18 | 19 | /** calculate the dissimilarity between two clusters */ 20 | public double computeDissimilarity(Cluster c1, Cluster c2) { 21 | DissimilarityMeasure measure = new EecbDissimilarityMeasure(); 22 | double dissimilarity = 0.0; 23 | int n1 = c1.getDocuments().size(); 24 | int n2 = c2.getDocuments().size(); 25 | 26 | for (EecbClusterDocument d1 : c1.getDocuments()) { 27 | for (EecbClusterDocument d2 : c2.getDocuments()) { 28 | Matrix m1 = d1.vector; 29 | Matrix m2 = d2.vector; 30 | dissimilarity += 1 - measure.cosineSimilarity(m1, m2); 31 | } 32 | } 33 | 34 | return dissimilarity / (n1 * n2); 35 | } 36 | 37 | public String toString() { 38 | return "Average"; 39 | } 40 | 41 | } -------------------------------------------------------------------------------- /cluster/experiment/DissimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.experiment; 2 | 3 | import java.util.List; 4 | import Jama.Matrix; 5 | import edu.oregonstate.data.EecbClusterDocument; 6 | 7 | /** 8 | * Computes the dissimilarity between two observations in an experiment. 
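 * An implementation typically returns 1 minus the cosine similarity of the two document vectors, as EecbDissimilarityMeasure does below.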
9 | * @author Matthias.Hauswirth@usi.ch 10 | */ 11 | public interface DissimilarityMeasure { 12 | 13 | public double computeDissimilarity(List<EecbClusterDocument> vectors, int observation1, int observation2); 14 | 15 | public double cosineSimilarity(Matrix obs1, Matrix obs2); 16 | } -------------------------------------------------------------------------------- /cluster/experiment/EecbDissimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.cluster.experiment; 2 | 3 | import java.util.List; 4 | 5 | import Jama.Matrix; 6 | import edu.oregonstate.data.EecbClusterDocument; 7 | 8 | /** 9 | * Jun Xie (xiejuncs@gmail.com) 10 | */ 11 | public class EecbDissimilarityMeasure implements DissimilarityMeasure { 12 | 13 | /** 14 | * calculate the dissimilarity as 1 minus the cosine similarity 15 | */ 16 | public double computeDissimilarity(List<EecbClusterDocument> vectors, int observation1, int observation2) { 17 | double similarity = 0.0; 18 | Matrix obs1 = vectors.get(observation1).vector; 19 | Matrix obs2 = vectors.get(observation2).vector; 20 | similarity = cosineSimilarity(obs1, obs2); 21 | return 1 - similarity; 22 | } 23 | 24 | /** 25 | * use cosine similarity to compute dissimilarity 26 | * 27 | * @param obs1 28 | * @param obs2 29 | * @return 30 | */ 31 | public double cosineSimilarity(Matrix obs1, Matrix obs2) { 32 | double sum = 0.0; 33 | for (int i = 0; i < obs1.getRowDimension(); i++) { 34 | sum += obs1.get(i, 0) * obs2.get(i, 0); 35 | } 36 | double norm1 = add(obs1); 37 | double norm2 = add(obs2); 38 | 39 | return sum / Math.sqrt(norm1 * norm2); 40 | } 41 | 42 | public double add(Matrix obs) { 43 | double sum = 0.0; 44 | for (int i = 0; i < obs.getRowDimension(); i++) { 45 | sum += obs.get(i, 0) * obs.get(i, 0); 46 | } 47 | return sum; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /costfunction/ICostFunction.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.costfunction; 2 | 3 | import edu.stanford.nlp.stats.Counter; 4 | 5 | /** 6 | * the interface of cost functions; 7 | * 8 | * the most commonly used cost function is a linear combination of features 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public interface ICostFunction { 14 | 15 | // calculate the cost according to the features and the model 16 | public double calculateCostFunction(Counter<String> features, double[] model); 17 | } 18 | 19 | -------------------------------------------------------------------------------- /costfunction/LinearCostFunction.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.costfunction; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.features.FeatureFactory; 6 | import edu.stanford.nlp.stats.Counter; 7 | 8 | public class LinearCostFunction implements ICostFunction { 9 | 10 | public LinearCostFunction() { 11 | } 12 | 13 | /** 14 | * calculate the cost from the feature vector and the model vector 15 | */ 16 | public double calculateCostFunction(Counter<String> features, double[] model) { 17 | double sum = 0.0; 18 | List<String> featureTemplate = FeatureFactory.getFeatureTemplate(); 19 | for (int i = 0; i < featureTemplate.size(); i++) { 20 | String feature = featureTemplate.get(i); 21 | double value = features.getCount(feature); 22 | sum += value * model[i]; 23 | } 24 | return sum; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- 
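A quick sketch of how the linear cost function above is meant to be used: the features of a candidate state are collected in a Counter and scored against a learned weight vector. This is a minimal, hypothetical example, not code from the repository — the feature names and weights are made up (the names merely mirror two of the feature classes under features/individualfeature), and it assumes FeatureFactory.getFeatureTemplate() has been initialized from the experiment configuration to return exactly these two names in this order.

import edu.oregonstate.costfunction.ICostFunction;
import edu.oregonstate.costfunction.LinearCostFunction;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;

public class CostFunctionSketch {
    public static void main(String[] args) {
        // hypothetical feature counts for one candidate state
        Counter<String> features = new ClassicCounter<String>();
        features.setCount("Head", 1.0);
        features.setCount("Lemma", 0.5);

        // hypothetical learned weights, aligned with the feature-template order
        double[] model = new double[] { 0.8, -0.2 };

        // linear score = 1.0 * 0.8 + 0.5 * (-0.2) = 0.7
        ICostFunction costFunction = new LinearCostFunction();
        double score = costFunction.calculateCostFunction(features, model);
        System.out.println("linear score = " + score);
    }
}

In the full system, scores like this one presumably rank the candidate states explored by the search component (see the search package).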
/data/EecbCharSeq.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.MatchException; 4 | import edu.stanford.nlp.trees.Span; 5 | import java.util.Vector; 6 | 7 | /** 8 | * The textual form of the mention as it occurred in the document 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class EecbCharSeq { 14 | 15 | /** The exact text matched by this sequence */ 16 | private String mText; 17 | /** in order to keep track of the start and end of the tokens */ 18 | private Span mTokenOffset; 19 | 20 | private int mSentenceID; 21 | 22 | /** 23 | * The reason for this is that we extract the annotation according to the character indices of the tokens 24 | */ 25 | private Span mByteOffset; 26 | 27 | public EecbCharSeq(String text, int start, int end, int sentenceID) { 28 | mText = text; 29 | mByteOffset = new Span(start, end); 30 | mTokenOffset = null; 31 | mSentenceID = sentenceID; 32 | } 33 | 34 | public int sentenceID() { 35 | return mSentenceID; 36 | } 37 | 38 | public int getByteStart() { 39 | return mByteOffset.start(); 40 | } 41 | 42 | public int getByteEnd() { 43 | return mByteOffset.end(); 44 | } 45 | 46 | public Span getByteOffset() { 47 | return mByteOffset; 48 | } 49 | 50 | public String getText() { 51 | return mText; 52 | } 53 | 54 | public Span getTokenOffset() { 55 | return mTokenOffset; 56 | } 57 | 58 | public int getTokenStart() { 59 | if (mTokenOffset == null) 60 | return -1; 61 | return mTokenOffset.start(); 62 | } 63 | 64 | public int getTokenEnd() { 65 | if (mTokenOffset == null) 66 | return -1; 67 | return mTokenOffset.end(); 68 | } 69 | 70 | /** 71 | * Matches this char seq against the full token stream. As a result of this 72 | * method, mTokenOffset is initialized 73 | */ 74 | public void match(Vector<EecbToken> tokens) throws MatchException { 75 | int start = -1; 76 | int end = -1; 77 | 78 | for (int i = 0; i < tokens.size(); i++) { 79 | if (tokens.get(i).getSentence() != mSentenceID) continue; 80 | 81 | if (tokens.get(i).getByteOffset().start() == mByteOffset.start()) { 82 | start = i; 83 | } else if (mByteOffset.start() > tokens.get(i).getByteOffset().start() 84 | && mByteOffset.start() < tokens.get(i).getByteOffset().end()) { 85 | start = i; 86 | } 87 | 88 | if (tokens.get(i).getByteOffset().end() == mByteOffset.end()) { 89 | end = i; 90 | break; 91 | } else if (mByteOffset.end() >= tokens.get(i).getByteOffset().start() 92 | && mByteOffset.end() < tokens.get(i).getByteOffset().end()) { 93 | end = i; 94 | break; 95 | } 96 | } 97 | 98 | if (start >= 0 && end >= 0) { 99 | mTokenOffset = new Span(start, end); 100 | // mPhrase = makePhrase(tokens, mTokenOffset); 101 | } else { 102 | throw new MatchException("Match failed!"); 103 | } 104 | } 105 | 106 | /** 107 | * for debugging convenience 108 | * 109 | * @param label 110 | * @param offset 111 | * @return 112 | */ 113 | public String toXml(String label, int offset) { 114 | StringBuffer buffer = new StringBuffer(); 115 | EecbElement.appendOffset(buffer, offset); 116 | buffer.append("<" + label + ">\n"); 117 | EecbElement.appendOffset(buffer, offset + 2); 118 | buffer.append("" + mText + ""); 120 | buffer.append("\n"); 121 | EecbElement.appendOffset(buffer, offset); 122 | buffer.append(""); 123 | return buffer.toString(); 124 | } 125 | 126 | @Override 127 | public String toString() { 128 | return "EecbCharSeq [mText = " + mText + ", mByteOffset=" + mByteOffset +", mTokenOffset=" +
mTokenOffset + "]"; 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /data/EecbClusterDocument.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import Jama.Matrix; 4 | 5 | /** 6 | * Jun Xie(xiejuncs@gmail.com) 7 | */ 8 | public class EecbClusterDocument { 9 | 10 | public int mID; 11 | public String mPrefix; 12 | public Matrix vector; 13 | 14 | public EecbClusterDocument(int id, Matrix vec) { 15 | mID = id; 16 | vector = vec; 17 | } 18 | 19 | // set the prefix, the format as 1(cluster)-1(document), 20 | public void setPrefix(String prefix) { 21 | mPrefix = prefix; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /data/EecbElement.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Base class for all EECB annotation elements 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbElement { 10 | 11 | /** unique identifier for this element*/ 12 | protected String mID; 13 | 14 | public EecbElement(String mID) { 15 | this.mID = mID; 16 | } 17 | 18 | public String getId() {return mID; } 19 | 20 | // indentation for debug. 21 | // Entity/Event without indentation 22 | // EntityMention/EventMention with two indentations 23 | public static void appendOffset(StringBuffer buffer, int offset) { 24 | for(int i = 0; i < offset; i ++){ 25 | buffer.append(" "); 26 | } 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /data/EecbEntity.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceEntityMention; 7 | import edu.stanford.nlp.ie.machinereading.domains.ace.reader.AceToken; 8 | 9 | /** 10 | * EECB entity. In the EECB corpus, an corefid is used to represent the entity. 11 | * For example: People magazine 12 | * "People Magazine" is a entity mention. Its entity identifier is 27. 13 | *
14 | * 15 | * @author Jun Xie (xie@eecs.oregonstate.edu) 16 | * 17 | */ 18 | public class EecbEntity extends EecbElement { 19 | private List<EecbEntityMention> mMentions; 20 | 21 | public EecbEntity(String id) { 22 | super(id); 23 | mMentions = new ArrayList<EecbEntityMention>(); 24 | } 25 | 26 | public List<EecbEntityMention> getMentions() {return mMentions;} 27 | 28 | public void addMention(EecbEntityMention m) { 29 | mMentions.add(m); 30 | m.setParent(this); 31 | } 32 | 33 | public String toXML(int offset) { 34 | StringBuffer buffer = new StringBuffer(); 35 | appendOffset(buffer, offset); 36 | buffer.append("\n"); 37 | for(EecbEntityMention m: mMentions){ 38 | buffer.append(m.toXml(offset + 2)); 39 | buffer.append("\n"); 40 | } 41 | appendOffset(buffer, offset); 42 | buffer.append(""); 43 | return buffer.toString(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /data/EecbEntityMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Eecb entity mention, for example, a noun phrase 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class EecbEntityMention extends EecbMention { 13 | 14 | @Override 15 | public String toString() { 16 | return "EecbEntityMention [mHead=" + mHead + ", mExtent" + this.mExtent +", mSentence = " + sentenceID() +"]"; 17 | } 18 | 19 | /** The set of event mentions that contain this entity mention */ 20 | private List<EecbEventMention> mEventMentions; 21 | 22 | /** The parent entity */ 23 | private EecbEntity mParent; 24 | 25 | private EecbCharSeq mHead; 26 | 27 | /** Position of the head word of this mention */ 28 | private int mHeadTokenPosition; 29 | 30 | public EecbEntityMention(String id, EecbCharSeq extent, EecbCharSeq head, int sentence) { 31 | super(id, extent, sentence); 32 | mExtent = extent; 33 | mHead = head; 34 | mParent = null; 35 | mHeadTokenPosition = -1; 36 | mEventMentions = new ArrayList<EecbEventMention>(); 37 | } 38 | 39 | public void setParent(EecbEntity e) { mParent = e; } 40 | public EecbEntity getParent() { return mParent; } 41 | public EecbCharSeq getHead() { return mHead; } 42 | public EecbCharSeq getExtent() { return mExtent; } 43 | public int getHeadTokenPosition() { return mHeadTokenPosition; } 44 | 45 | public void addEventMention(EecbEventMention rm) { 46 | mEventMentions.add(rm); 47 | } 48 | public List<EecbEventMention> getEventMentions() { 49 | return mEventMentions; 50 | } 51 | 52 | public String toXml(int offset) { 53 | StringBuffer buffer = new StringBuffer(); 54 | appendOffset(buffer, offset); 55 | buffer.append("\n"); 56 | buffer.append(mExtent.toXml("extent", offset + 2)); 57 | buffer.append("\n"); 58 | buffer.append(mHead.toXml("head", offset + 2)); 59 | buffer.append("\n"); 60 | appendOffset(buffer, offset); 61 | buffer.append(""); 62 | return buffer.toString(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /data/EecbEvent.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Store only Eecb Event 8 | * 9 | * @author Jun Xie (xie@eecs.oregonstate.edu) 10 | * 11 | */ 12 | public class EecbEvent extends EecbElement { 13 | 14 | /** The list of mentions for this event */ 15 | private List<EecbEventMention> mMentions; 16 | 17 | public EecbEvent(String id) { 18 | super(id); 19 | mMentions = new ArrayList<EecbEventMention>(); 20 | } 21 | 
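// mentions are stored in insertion order; getMention(index) below relies on that order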
22 | public void addMention(EecbEventMention m) { 23 | mMentions.add(m); 24 | } 25 | 26 | public EecbEventMention getMention(int index) { 27 | return mMentions.get(index); 28 | } 29 | 30 | /** Get the number of event mentions */ 31 | public int getMentionCount() { 32 | return mMentions.size(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /data/EecbEventMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | import java.util.Collection; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | import java.util.Set; 7 | 8 | /** 9 | * Store only EECB event mentions 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class EecbEventMention extends EecbMention { 15 | 16 | /** Maps argument roles to argument mentions */ 17 | private Map<String, EecbEntityMention> mRolesToArguments; 18 | 19 | /** the parent event */ 20 | private EecbEvent mParent; 21 | 22 | /** anchor text for this event, i.e. the phrase annotated in mentions.txt */ 23 | private EecbCharSeq mAnchor; 24 | 25 | /** the scope is the whole sentence, while the extent is the sentence segment the mention is in */ 26 | public EecbEventMention(String id, EecbCharSeq extent, EecbCharSeq anchor, int sentence) { 27 | super(id, extent, sentence); 28 | this.mAnchor = anchor; 29 | mRolesToArguments = new HashMap<String, EecbEntityMention>(); 30 | } 31 | 32 | @Override 33 | public String toString() { 34 | return "EecbEventMention [mAnchor = " + mAnchor + ", mParent=" + mParent + 35 | ", mRolesToArguments = " + mRolesToArguments + ", mExtent = " + mExtent + 36 | ", mId = " + mID + ", mSentence = " + mSentenceID + "]"; 37 | } 38 | 39 | public Collection<EecbEntityMention> getArgs() { 40 | return mRolesToArguments.values(); 41 | } 42 | 43 | public Set<String> getRoles() { 44 | return mRolesToArguments.keySet(); 45 | } 46 | 47 | public EecbEntityMention getArg(String role) { 48 | return mRolesToArguments.get(role); 49 | } 50 | 51 | public void addArg(EecbEntityMention em, String role){ 52 | mRolesToArguments.put(role, em); 53 | } 54 | 55 | public void setAnchor(EecbCharSeq anchor) { 56 | mAnchor = anchor; 57 | } 58 | 59 | public EecbCharSeq getAnchor() { 60 | return mAnchor; 61 | } 62 | 63 | public void setParent(EecbEvent e) { 64 | mParent = e; 65 | } 66 | 67 | public EecbEvent getParent() { 68 | return mParent; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /data/EecbMention.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Superclass of all Eecb mentions (entities, events, etc.) 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbMention extends EecbElement { 10 | 11 | protected EecbCharSeq mExtent; 12 | protected int mSentenceID; 13 | 14 | protected EecbMention(String id, EecbCharSeq mExtent, int sentenceID) { 15 | super(id); 16 | this.mExtent = mExtent; 17 | this.mSentenceID = sentenceID; 18 | } 19 | 20 | public EecbCharSeq getExtent() {return mExtent;} 21 | 22 | public int sentenceID() { 23 | return this.mSentenceID; 24 | } 25 | 26 | public String toXml(int offset) { return ""; } 27 | } 28 | -------------------------------------------------------------------------------- /data/EecbSrlAnnotation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.data; 2 | 3 | /** 4 | * Implementation for aligning the result of SRL and 
the gold annotations 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public class EecbSrlAnnotation { 10 | 11 | /** The id. */ 12 | String id; 13 | 14 | /** the text */ 15 | String mText; 16 | 17 | /** The start offset */ 18 | int start; 19 | 20 | /** The end offset */ 21 | int end; 22 | 23 | /** head dependency */ 24 | int parentPosition; 25 | 26 | /*mention headWord*/ 27 | String headString; 28 | 29 | // head index 30 | int headStartIndex; 31 | int headEndIndex; 32 | 33 | /**predicate*/ 34 | String predicate; 35 | 36 | public EecbSrlAnnotation(String id, String text, String predicate, int parentPosition, int start, int end) { 37 | this.id = id; 38 | this.mText = text; 39 | this.predicate = predicate; 40 | this.parentPosition = parentPosition; 41 | this.start = start; 42 | this.end = end; 43 | headStartIndex = 0; 44 | headEndIndex = 0; 45 | } 46 | 47 | public EecbSrlAnnotation() { 48 | 49 | } 50 | 51 | public void setHead(String headString) { 52 | this.headString = headString; 53 | } 54 | 55 | public void setHeadStartIndex(int startIndex) { 56 | headStartIndex = startIndex; 57 | } 58 | 59 | public int getHeadStartIndex() { 60 | return headStartIndex; 61 | } 62 | 63 | public void setHeadEndIndex(int endIndex) { 64 | headEndIndex = endIndex; 65 | } 66 | 67 | public int getHeadEndIndex() { 68 | return headEndIndex; 69 | } 70 | 71 | public String getHead() { 72 | return this.headString; 73 | } 74 | 75 | /** The ID of the annotation. */ 76 | public String getId() 77 | { 78 | return id; 79 | } // getId() 80 | 81 | /** Set the ID of the annotation. */ 82 | public void setId(String i) 83 | { 84 | id = i; 85 | } // setId() 86 | 87 | public String getText() { 88 | return this.mText; 89 | } 90 | 91 | public String getPredicate() { 92 | return this.predicate; 93 | } 94 | 95 | /** The start offset. */ 96 | public int getStartOffset() 97 | { 98 | return start; 99 | } // getStartOffset() 100 | 101 | /** The end offset. */ 102 | public int getEndOffset() 103 | { 104 | return end; 105 | } // getEndOffset() 106 | 107 | public int getLength() 108 | { 109 | return end - start; 110 | } 111 | 112 | /** Set the start offset. */ 113 | public void setStartOffset(int s) 114 | { 115 | start = s; 116 | } // setStartOffset() 117 | 118 | /** Set the end offset. 
*/
119 | public void setEndOffset(int e)
120 | {
121 |     end = e;
122 | } // setEndOffset()
123 | 
124 | /** Output representation of the annotation */
125 | @Override
126 | public String toString() {
127 |     return mText + "(" + start + "/" + end + "; " + headStartIndex + "/" + headEndIndex + ")";
128 | }
129 | 
130 | public void setText(String text) {
131 |     mText = text;
132 | }
133 | 
134 | 
135 | }
136 | 
-------------------------------------------------------------------------------- /data/EecbToken.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.data;
2 | 
3 | import edu.stanford.nlp.ie.machinereading.common.StringDictionary;
4 | import edu.stanford.nlp.trees.Span;
5 | 
6 | /**
7 |  * Every token in the EECB corpus
8 |  *
9 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
10 |  *
11 |  */
12 | public class EecbToken {
13 | 
14 |     /** the actual token bytes.
15 |      * Normally we work with mWord (see below), but mLiteral is needed when
16 |      * we need to check if a sequence of tokens exists in a gazetteer
17 |      */
18 |     private String mLiteral;
19 | 
20 |     /** The index of the literal in the WORDS hash */
21 |     private int mWord;
22 | 
23 |     private int mPos;
24 | 
25 |     private int mChunk;
26 | 
27 |     private Span mByteOffset;
28 | 
29 |     private int mSentence;
30 | 
31 |     /** Dictionary for all words in the corpus */
32 |     public static StringDictionary WORDS;
33 | 
34 |     /** Dictionary for all lemmas in the corpus */
35 |     public static StringDictionary LEMMAS;
36 | 
37 |     /** Dictionary for all other strings in the corpus */
38 |     public static StringDictionary OTHERS;
39 | 
40 |     static {
41 |         WORDS = new StringDictionary("words");
42 |         LEMMAS = new StringDictionary("lemmas");
43 |         OTHERS = new StringDictionary("others");
44 |         WORDS.setMode(true);
45 |         LEMMAS.setMode(true);
46 |         OTHERS.setMode(true);
47 |     }
48 | 
49 |     public int getSentence() {
50 |         return mSentence;
51 |     }
52 | 
53 |     public String getLiteral() {
54 |         return mLiteral;
55 |     }
56 | 
57 |     public int getPos() {
58 |         return mPos;
59 |     }
60 | 
61 |     public int getChunk() {
62 |         return mChunk;
63 |     }
64 | 
65 |     public Span getByteOffset() {
66 |         return mByteOffset;
67 |     }
68 | 
69 |     public int getByteStart() {
70 |         return mByteOffset.start();
71 |     }
72 | 
73 |     public int getByteEnd() {
74 |         return mByteOffset.end();
75 |     }
76 | 
77 |     public static String removeSpaces(String s) {
78 |         if (s == null)
79 |             return s;
80 |         return s.replaceAll(" ", "_");
81 |     }
82 | 
83 |     /**
84 |      * Constructs an EecbToken from a tokenized line
85 |      */
86 |     public EecbToken(String word, String pos, String chunk, int start, int end, int sentence) {
87 |         mLiteral = word;
88 |         if (word == null) {
89 |             mWord = -1;
90 |         } else {
91 |             mWord = WORDS.get(removeSpaces(word), false);
92 |         }
93 |         if (pos == null)
94 |             mPos = -1;
95 |         else
96 |             mPos = OTHERS.get(pos, false);
97 |         if (chunk == null)
98 |             mChunk = -1;
99 |         else
100 |             mChunk = OTHERS.get(chunk, false);
101 | 
102 |         mByteOffset = new Span(start, end);
103 |         mSentence = sentence;
104 |     }
105 | 
106 |     @Override
107 |     public String toString() {
108 |         return mLiteral + ", " + mByteOffset + ", " + mSentence;
109 |     }
110 | 
111 | }
-------------------------------------------------------------------------------- /dataset/IDataSet.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.dataset;
2 | 
3 | import edu.stanford.nlp.dcoref.Document;
4 | 
5 | /**
6 |  * get training data
7 |  *
8 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
9 |  *
10 |  */
11 | public interface IDataSet {
12 | 
13 |     public Document getData(String topics, boolean goldOnly);
14 | }
15 | 
-------------------------------------------------------------------------------- /dataset/TopicGeneration.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.dataset;
2 | 
3 | import java.util.Properties;
4 | 
5 | import edu.oregonstate.general.StringOperation;
6 | import edu.oregonstate.util.EecbConstants;
7 | 
8 | /**
9 |  * generate training topics, testing topics, and development topics
10 |  *
11 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
12 |  *
13 |  */
14 | public class TopicGeneration {
15 | 
16 |     // training topics
17 |     private String[] trainingTopics;
18 | 
19 |     // testing topics
20 |     private String[] testingTopics;
21 | 
22 |     // development topics
23 |     private String[] developmentTopics;
24 | 
25 |     /** every experiment processes just one topic */
26 |     private String topic;
27 | 
28 |     // experiment properties
29 |     private final Properties experimentProps;
30 | 
31 |     public TopicGeneration(Properties experimentProperties) {
32 |         experimentProps = experimentProperties;
33 |         trainingTopics = null;
34 |         testingTopics = null;
35 |         developmentTopics = null;
36 |         generateTopics();
37 |     }
38 | 
39 |     /**
40 |      * generate topics
41 |      *
42 |      */
43 |     public void generateTopics() {
44 |         String[] sets = new String[]{EecbConstants.DATAGENERATION_TRAININGSET_PROP, EecbConstants.DATAGENERATION_DEVELOPMENTSET_PROP, EecbConstants.DATAGENERATION_TESTINGSET_PROP};
45 | 
46 |         for (String set : sets) {
47 |             String topicString = experimentProps.getProperty(set, "");
48 |             if (!topicString.isEmpty()) {
49 | 
50 |                 if (set.equals(EecbConstants.DATAGENERATION_TRAININGSET_PROP)) {
51 |                     trainingTopics = StringOperation.splitString(topicString, ",");
52 |                 }
53 | 
54 |                 if (set.equals(EecbConstants.DATAGENERATION_DEVELOPMENTSET_PROP)) {
55 |                     developmentTopics = StringOperation.splitString(topicString, ",");
56 |                 }
57 | 
58 |                 if (set.equals(EecbConstants.DATAGENERATION_TESTINGSET_PROP)) {
59 |                     testingTopics = StringOperation.splitString(topicString, ",");
60 |                 }
61 |             }
62 |         }
63 | 
64 |     }
65 | 
66 |     /**
67 |      * just one topic processed by the current job
68 |      *
69 |      * @return a topic
70 |      */
71 |     public String topic() {
72 |         if (trainingTopics != null) {
73 |             topic = trainingTopics[0] + "-trainingtopic";
74 |         }
75 | 
76 |         if (testingTopics != null) {
77 |             topic = testingTopics[0] + "-testingtopic";
78 |         }
79 | 
80 |         if (developmentTopics != null) {
81 |             topic = developmentTopics[0] + "-developmenttopic";
82 |         }
83 | 
84 |         return topic;
85 |     }
86 | 
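    // Worked example (hypothetical property values): if the training-set property
    // is "1,2,3" and the testing-set property is "12", trainingTopics() returns
    // {"1","2","3"}, testingTopics() returns {"12"}, and topic() returns
    // "12-testingtopic", because the later if-blocks in topic() above overwrite
    // the earlier ones (development beats testing, which beats training).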
87 |     // return the training topics
88 |     public String[] trainingTopics() {
89 |         return trainingTopics;
90 |     }
91 | 
92 |     // return the testing topics
93 |     public String[] testingTopics() {
94 |         return testingTopics;
95 |     }
96 | 
97 |     // return the development topics
98 |     public String[] developmentTopics() {
99 |         return developmentTopics;
100 |     }
101 | 
102 | }
103 | 
-------------------------------------------------------------------------------- /example/ReadLearnedWeight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | import edu.stanford.nlp.io.IOUtils;
6 | 
7 | public class ReadLearnedWeight {
8 | 
9 |     public static void main(String[] args) {
10 |         String filePath = "/nfs/guille/xfern/users/xie/Experiment/corpus/EECB1.0/tokenoutput/file";
11 |         List<String> lines = IOUtils.linesFromFile(filePath);
12 |         Map<String, Integer> maps = new TreeMap<String, Integer>();
13 |         for (String line : lines) {
14 |             String[] elements = line.split("\t");
15 |             String word = elements[1];
16 |             if (!maps.containsKey(word)) {
17 |                 maps.put(word, 0);
18 |             }
19 |             int counter = maps.get(word) + 1;
20 |             maps.put(word, counter);
21 |         }
22 | 
23 |         for (String word : maps.keySet()) {
24 |             System.out.println(word + " " + maps.get(word));
25 |         }
26 | 
27 |     }
28 | 
29 | }
30 | 
-------------------------------------------------------------------------------- /example/VectorNormalization.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | public class VectorNormalization {
6 | 
7 |     public static void main(String[] args) {
8 | 
9 |         // build a small example vector and normalize it to unit length
10 |         List<Double> numbers = new ArrayList<Double>(Arrays.asList(3.0, 4.0));
11 |         double norm = 0.0;
12 |         for (double number : numbers) {
13 |             norm += number * number;
14 |         }
15 |         norm = Math.sqrt(norm);
16 |         for (int index = 0; index < numbers.size(); index++) {
17 |             numbers.set(index, numbers.get(index) / norm);
18 |         }
19 |         System.out.println(numbers);   // [0.6, 0.8]
20 |     }
21 | }
22 | 
-------------------------------------------------------------------------------- /example/Weight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.example;
2 | 
3 | import java.util.*;
4 | 
5 | public class Weight {
6 | 
7 |     public static void main(String[] args) {
8 | 
9 |         List<Integer> previousIDs = new ArrayList<Integer>();
10 |         previousIDs.add(1);
11 |         previousIDs.add(2);
12 |         previousIDs.add(3);
13 | 
14 |         List<Integer> currentIDs = new ArrayList<Integer>();
15 |         currentIDs.add(1);
16 |         currentIDs.add(3);
17 |         currentIDs.add(4);
18 | 
19 |         currentIDs.removeAll(previousIDs);
20 |         System.out.println(currentIDs);   // prints [4]
21 |     }
22 | }
23 | 
-------------------------------------------------------------------------------- /experiment/CrossCoreferenceResolution.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.io.File;
4 | import java.lang.reflect.Constructor;
5 | import java.util.*;
6 | 
7 | import edu.oregonstate.io.ResultOutput;
8 | import edu.oregonstate.server.Pipeline;
9 | import edu.stanford.nlp.util.StringUtils;
10 | 
11 | /**
12 |  * cross coreference resolution
13 |  *
14 |  * @author Jun Xie (xiejuncs@gmail.com)
15 |  *
16 |  */
17 | public class CrossCoreferenceResolution extends ExperimentConstructor {
18 | 
19 |     private Map<String, String> methodToClasses = new HashMap<String, String>();
20 | 
21 |     private final String configFolder;
22 | 
23 |     /**
24 |      * set experiment properties
25 |      *
26 |      * @param props
27 |      */
28 |     public CrossCoreferenceResolution(Properties props, String configfolder) {
29 |         super(props);
30 | 
31 |         configFolder = configfolder;
32 | 
33 |         /**
34 |          * map each procedure to its corresponding main class
35 |          */
36 |         methodToClasses.put("datageneration", "edu.oregonstate.dataset.DatasetFactory");
37 |         methodToClasses.put("searchtrueloss", "edu.oregonstate.search.SearchFactory");
38 |         methodToClasses.put("learn", "edu.oregonstate.classifier.ClassifierFactory");
39 |         methodToClasses.put("searchlearnedweightwithoutfeature", "edu.oregonstate.search.SearchFactory");
40 |         methodToClasses.put("resultaggregation", "edu.oregonstate.server.ResultAggregation");
41 |         methodToClasses.put("searchlearnedweightwithfeature", "edu.oregonstate.search.SearchFactory");
42 |         methodToClasses.put("lasso", "edu.oregonstate.search.SearchFactory");
43 | 
44 |     }
45 | 
46 |     /**
47 |      * perform the cross coreference resolution experiment
48 |      */
49 |     public void performExperiment() {
50 |         String procedure = experimentProps.getProperty("procedures");
51 |         Pipeline pipeline = new Pipeline();
52 |         pipeline.generateProcedures(procedure);
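        // The "procedures" property is a pipeline description whose items follow
        // a step-phaseIndex convention, e.g. (hypothetical value)
        // "datageneration-1,searchtrueloss-2,learn-3"; below, each item is split
        // on "-" and matched against config files named "<phaseIndex>-<step>...".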
53 |         List<String> procedures = pipeline.getProcedure();
54 | 
55 |         //TODO
56 |         File experimentDirectory = new File(configFolder);
57 |         String[] experiments = experimentDirectory.list();
58 | 
59 |         for (String stepInformation : procedures) {
60 |             System.out.println(stepInformation);
61 |             String[] elements = stepInformation.split("-");
62 |             String step = elements[0];
63 |             String phaseIndex = elements[1];
64 |             String prefix = phaseIndex + "-" + step;
65 |             String mainClass = methodToClasses.get(step);
66 |             for (String experiment : experiments) {
67 |                 if (experiment.startsWith(prefix)) {
68 |                     try {
69 | 
70 |                         Class<?> experimentClass = Class.forName(mainClass);
71 |                         Class<?>[] proto = new Class<?>[1];
72 |                         proto[0] = Properties.class;
73 |                         Object[] params = new Object[1];
74 | 
75 |                         // get the properties of the experiment
76 |                         String[] propArgs = new String[]{"-props", configFolder + "/" + experiment};
77 |                         Properties prop = StringUtils.argsToProperties(propArgs);
78 | 
79 |                         params[0] = prop;
80 |                         Constructor<?> ct = experimentClass.getConstructor(proto);
81 |                         ExperimentConstructor experimenter = (ExperimentConstructor) ct.newInstance(params);
82 |                         experimenter.performExperiment();
83 |                     } catch (Exception e) {
84 |                         throw new RuntimeException(e);
85 |                     }
86 |                 }
87 |             }
88 |         }
89 | 
90 |     }
91 | 
92 |     /**
93 |      * The main entry point of the experiment
94 |      *
95 |      * @param args
96 |      */
97 |     public static void main(String[] args) {
98 |         if (args.length > 1) {
99 |             System.out.println("too many arguments: specify at most one configuration file path");
100 |             System.exit(1);
101 |         }
102 | 
103 |         String configFolder = "../corpus/alignexperiment";
104 |         if (args.length == 0) {
105 |             // run the experiment on the local machine for debugging
106 |             args = new String[1];
107 |             args[0] = configFolder + "/config.properties";
108 |         }
109 | 
110 |         String[] propArgs = new String[]{"-props", args[0]};
111 | 
112 |         Properties props = StringUtils.argsToProperties(propArgs);
113 |         ExperimentConstructor experiment = new CrossCoreferenceResolution(props, configFolder);
114 |         ResultOutput.printTime(experimentLogFile, "The start of the experiment: ");
115 |         experiment.performExperiment();
116 |         ResultOutput.printTime(experimentLogFile, "The end of the experiment");
117 |     }
118 | }
119 | 
-------------------------------------------------------------------------------- /experiment/ExperimentConfigurationFactory.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Properties;
8 | 
9 | import edu.oregonstate.featureExtractor.WordSimilarity;
10 | import edu.oregonstate.io.ResultOutput;
11 | import edu.oregonstate.training.Development;
12 | import edu.oregonstate.util.EecbConstants;
13 | import edu.stanford.nlp.io.IOUtils;
14 | import edu.stanford.nlp.stats.ClassicCounter;
15 | import edu.stanford.nlp.util.Triple;
16 | 
17 | /**
18 |  * find out what the configuration set for the experiment is
19 |  *
20 |  * @author jun (xiejuncs@gmail.com)
21 |  *
22 |  */
23 | public class ExperimentConfigurationFactory {
24 | 
25 |     // the properties for the experiment
26 |     private final Properties props;
27 | 
28 |     // corpus folder
29 |     private final String corpusPath;
30 | 
31 |     public ExperimentConfigurationFactory(Properties properties) {
32 |         props = properties;
33 |         corpusPath = ExperimentConstructor.experimentCorpusPath;
34 |     }
35 | 
36 |     // define the experiment name as the result folder name
37 |     public String defineExperimentName() {
38 |         StringBuilder sb = new StringBuilder();
39 | 
40 |         // read the EXPERIMENT_PROP value, look up each of its elements,
41 |         // and concatenate the element values together,
42 |         // e.g. Pairwise-StructuredPerceptron
43 |         String experimentProp = props.getProperty(EecbConstants.EXPERIMENT_PROP);
44 |         String[] experimentElements = experimentProp.split(",");
45 |         int length = experimentElements.length;
46 |         for (int index = 0; index < length; index++) {
47 | 
48 |             String key = experimentElements[index].trim();
49 |             String value = props.getProperty(key.trim());
50 | 
51 |             if (index == (length - 1)) {
52 |                 sb.append(value);
53 |             } else {
54 |                 sb.append(value + "-");
55 |             }
56 |         }
57 | 
58 |         return sb.toString().trim();
59 |     }
60 | 
61 |     // configure WordNet at the beginning of the experiment
62 |     public void configureWordNet() {
63 |         String wordnetPath = props.getProperty(EecbConstants.WORDNET_PROP);
64 |         System.setProperty("wordnet.database.dir", wordnetPath);
65 |     }
66 | 
67 |     // load data from the word similarity dictionary
68 |     public Map<String, ClassicCounter<String>> loadSimilarityDictionary(String similarityPath) {
69 |         WordSimilarity similarity = new WordSimilarity(similarityPath);
70 |         similarity.load();
71 |         return similarity.getDatas();
72 |     }
73 | 
74 |     /**
75 |      * get mention boundaries from the gold mention file
76 |      * @return
77 |      */
78 |     public Map<String, Map<String, List<Triple<String, Integer, Integer>>>> loadGoldMentionBoundary() {
79 |         String mentionPath = corpusPath + "/mentions.txt";
80 |         List<String> records = IOUtils.linesFromFile(mentionPath);
81 |         Map<String, Map<String, List<Triple<String, Integer, Integer>>>> goldMentionBoundary = new HashMap<String, Map<String, List<Triple<String, Integer, Integer>>>>();
82 |         // the format of the gold mention file:
83 |         // # N or V? (0) Topic(1) Doc(2) Sentence Number(3) CorefID(4) StartIdx(5) EndIdx(6) StartCharIdx(7) EndCharIdx(8)
84 |         // # CharIdx doesn't include spaces
85 |         // # sentence numbers start from 0
86 |         for (String record : records) {
87 |             String[] elements = record.split("\t");
88 | 
89 |             // index by topic
90 |             String topic = elements[1];
91 |             boolean containTopic = goldMentionBoundary.containsKey(topic);
92 |             if (!containTopic) {
93 |                 goldMentionBoundary.put(topic, new HashMap<String, List<Triple<String, Integer, Integer>>>());
94 |             }
95 | 
96 |             // index by the combination of document and sentence
97 |             String document = elements[2];
98 |             String sentenceNumber = elements[3];
99 |             String DocSen = document + "-" + sentenceNumber;
100 |             boolean containDocSen = goldMentionBoundary.get(topic).containsKey(DocSen);
101 |             if (!containDocSen) {
102 |                 goldMentionBoundary.get(topic).put(DocSen, new ArrayList<Triple<String, Integer, Integer>>());
103 |             }
104 | 
105 |             // add the record as a triple : corefID, startCharIdx, endCharIdx
106 |             String corefID = elements[0] + "-" + elements[4];
107 |             int startCharIdx = Integer.parseInt(elements[7]);
108 |             int endCharIdx = Integer.parseInt(elements[8]);
109 |             Triple<String, Integer, Integer> triple = new Triple<String, Integer, Integer>(corefID, startCharIdx, endCharIdx);
110 |             goldMentionBoundary.get(topic).get(DocSen).add(triple);
111 |         }
112 | 
113 |         return goldMentionBoundary;
114 |     }
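    // Worked example (hypothetical record): a mentions.txt line
    //   "N\t1\t3\t0\t7\t2\t4\t10\t18"
    // is stored under topic "1", key "3-0" (document 3, sentence 0), as the
    // triple ("N-7", 10, 18): mention type and coref ID fused into one string,
    // followed by the start and end character indices.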
115 | 
116 |     // tune the stopping rate
117 |     public static double tuneStoppingRate(double[] weight, int j) {
118 |         double stoppingrate = 0.0;
119 | 
120 |         String stopping = ExperimentConstructor.experimentProps.getProperty(EecbConstants.SEARCH_STOPPINGCRITERION, "none");
121 |         if (stopping.equals("tuning")) {
122 |             Development development = new Development(j, weight, 1.0, 3.0, 10);
123 |             stoppingrate = development.tuning();
124 |             ResultOutput.writeTextFile(ExperimentConstructor.experimentLogFile, "\nthe stopping rate is : " + stoppingrate + " for " + j + "\n");
125 |         }
126 | 
127 |         return stoppingrate;
128 |     }
129 | }
130 | 
-------------------------------------------------------------------------------- /experiment/ExperimentConstructor.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.experiment;
2 | 
3 | import java.util.Map;
4 | 
5 | import java.util.Properties;
6 | 
7 | import edu.oregonstate.util.Command;
8 | import edu.oregonstate.util.EecbConstants;
9 | import edu.stanford.nlp.stats.ClassicCounter;
10 | 
11 | /**
12 |  * the abstract class of the experiment
13 |  *
14 |  * @author Jun Xie (xiejuncs@gmail.com)
15 |  *
16 |  */
17 | public abstract class ExperimentConstructor {
18 | 
19 |     // used for recording the information of the whole experiment
20 |     public static String experimentLogFile;
21 | 
22 |     // experiment result folder
23 |     public static String experimentFolder;
24 | 
25 |     // property file
26 |     public static Properties experimentProps;
27 | 
28 |     // corpus path
29 |     public static String experimentCorpusPath;
30 | 
31 |     // debug mode
32 |     public static boolean debugMode;
33 | 
34 |     // Dekang Lin's noun similarity thesaurus
35 |     public static Map<String, ClassicCounter<String>> nounSimilarityThesaurus;
36 | 
37 |     // Dekang Lin's verb similarity thesaurus; to get its top 10, use the lemma word form
38 |     public static Map<String, ClassicCounter<String>> verbSimilarityThesaurus;
39 | 
40 |     // Dekang Lin's adjective similarity thesaurus
41 |     public static Map<String, ClassicCounter<String>> adjectiveSimilarityThesaurus;
42 | 
43 |     // post-process the corpus for predicted mentions
44 |     public static boolean postProcess;
45 | 
46 |     /** whether the experiment uses gold mentions or predicted mentions */
47 |     public static boolean goldMentions;
48 | 
49 |     /**
50 |      * configure the experiment
51 |      *
52 |      * @param props
53 |      */
54 |     public ExperimentConstructor(Properties props) {
55 |         experimentProps = props;
56 | 
57 |         // debug mode
58 |         debugMode = Boolean.parseBoolean(props.getProperty(EecbConstants.DEBUG_PROP, "false"));
59 | 
60 |         // the corpus folder, which stores the EECB corpus, and the TEMPORARY folder, which is used to print the log file
61 |         experimentCorpusPath = props.getProperty(EecbConstants.CORPUS_PROP);
62 | 
63 |         StringBuilder sb = new StringBuilder();
64 |         //String timeStamp = Calendar.getInstance().getTime().toString().replaceAll("\\s", "-").replaceAll(":", "-");
65 |         sb.append(experimentCorpusPath + "/TEMPORYRESUT/");
66 | 
67 |         ExperimentConfigurationFactory factory = new ExperimentConfigurationFactory(props);
68 |         String name = factory.defineExperimentName();
69 |         sb.append(name);
70 | 
71 |         // create the result folder
72 |         experimentFolder = sb.toString().trim();
73 |         Command.mkdir(experimentFolder);
74 | 
75 |         // create a folder to store the CoNLL results
76 |         Command.mkdir(experimentFolder + "/conll");
77 | 
78 |         // create a folder to store the serialized results
79 |         Command.mkdir(experimentFolder + "/document");
80 | 
81 |         // create a folder to store the model result
82 |         Command.mkdir(experimentFolder + "/model");
83 | 
84 |         // create a folder to store the violation result
85 |         Command.mkdir(experimentFolder + "/violation");
86 | 
87 |         // create a folder to store weight differences
88 |         Command.mkdir(experimentFolder + "/weightdifference");
89 | 
90 |         // create a folder to store weight norms
91 |         Command.mkdir(experimentFolder + "/weightnorm");
92 | 
93 |         // create a folder to store the constraints; the name of each file is just the topic name
94 |         Command.mkdir(experimentFolder + "/constraints");
95 | 
96 |         // specify the log file path
97 |         experimentLogFile = sb.toString().trim() + "/experimentlog";
98 | 
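        // Resulting layout under <corpus path>/TEMPORYRESUT/<experiment name>/ :
        //   conll/  document/  model/  violation/  weightdifference/
        //   weightnorm/  constraints/  experimentlog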
99 |         // configure WordNet
100 |         factory.configureWordNet();
101 | 
102 |         // Dekang Lin's similarity thesauri, respectively for nouns, verbs, and adjectives
103 |         nounSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simN.lsp");
104 |         verbSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simV.lsp");
105 |         adjectiveSimilarityThesaurus = factory.loadSimilarityDictionary(experimentCorpusPath + "/simA.lsp");
106 | 
107 |         // whether we need to post-process predicted mentions:
108 |         // because the gold mentions also include singleton clusters,
109 |         // post-process in both the gold-mention and the predicted-mention setting
110 |         goldMentions = Boolean.parseBoolean(experimentProps.getProperty(EecbConstants.DATAGENERATION_GOLDMENTION_PROP));
111 |         postProcess = true;
112 |     }
113 | 
114 |     // perform the experiments
115 |     public abstract void performExperiment();
116 | 
117 | }
118 | 
-------------------------------------------------------------------------------- /featureExtractor/SRLDocument.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.*;
4 | 
5 | /**
6 |  * SRL document: a collection of annotated tokens;
7 |  * callers need to specify the document ID
8 |  *
9 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
10 |  *
11 |  */
12 | public class SRLDocument {
13 | 
14 |     /** document ID */
15 |     private final String mDocumentID;
16 | 
17 |     /** sentences: each sentence is a list of token rows, each row an array of column strings */
18 |     private List<List<String[]>> sentences;
19 | 
20 |     public SRLDocument(String documentID) {
21 |         mDocumentID = documentID;
22 |         sentences = new ArrayList<List<String[]>>();
23 |     }
24 | 
25 |     public String getDocumentID() {
26 |         return mDocumentID;
27 |     }
28 | 
29 |     public void addSentence(List<String[]> sentence) {
30 |         sentences.add(sentence);
31 |     }
32 | 
33 |     public List<List<String[]>> getSentences() {
34 |         return sentences;
35 |     }
36 | 
37 | }
38 | 
-------------------------------------------------------------------------------- /featureExtractor/SRLDocumentReader.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.*;
4 | 
5 | import edu.stanford.nlp.io.IOUtils;
6 | 
7 | 
8 | /**
9 |  * read an SRL result document
10 |  *
11 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
12 |  *
13 |  */
14 | public class SRLDocumentReader {
15 | 
16 |     /** document path */
17 |     private final String mDocumentPath;
18 | 
19 |     public SRLDocumentReader(String documentPath) {
20 |         mDocumentPath = documentPath;
21 |     }
22 | 
23 |     /**
24 |      * read the raw input and format it as an SRLDocument,
25 |      * separating the sentences
26 |      *
27 |      * @return
28 |      */
29 |     public SRLDocument readDocument() {
30 |         // read the srl result from the output of the semantic role labeling software
31 |         List<String> srlResults = IOUtils.linesFromFile(mDocumentPath);
32 | 
33 |         // define an SRLDocument
34 |         String[] elements = mDocumentPath.split("/");
35 |         String topic = elements[elements.length - 1].split("\\.")[0];
36 |         SRLDocument document = new SRLDocument(topic);
37 | 
38 |         // format the srl result as the SRLDocument
39 |         List<String[]> sentence = new ArrayList<String[]>();
40 |         for (int index = 0; index <= srlResults.size(); index++) {
41 |             if (index == srlResults.size()) {
42 |                 document.addSentence(sentence);
43 |                 break;
44 |             }
45 | 
46 |             String line = srlResults.get(index);
47 |             if ((line.equals(""))) {
48 |                 document.addSentence(sentence);
49 |                 sentence = new ArrayList<String[]>();
50 |                 continue;
51 |             }
52 | 
53 |             String[] token = line.split("\t");
54 |             sentence.add(token);
55 |         }
56 | 
57 |         return document;
58 |     }
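    // Expected input shape (a sketch; the exact columns depend on the SRL tool):
    // one token per line with tab-separated columns, and a blank line closing
    // each sentence, e.g.
    //   1	Police	police	NN	...
    //   2	arrested	arrest	VBD	...
    //   <blank line>
    // readDocument() above turns every such block into one List<String[]>.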
59 | 
60 |     /**
61 |      * the main entry of the program
62 |      *
63 |      * @param args
64 |      */
65 |     public static void main(String[] args) {
66 |         args = new String[]{"data/srl/16.output"};
67 | 
68 |         String documentPath = args[0];
69 | 
70 |         SRLDocumentReader reader = new SRLDocumentReader(documentPath);
71 |         SRLDocument document = reader.readDocument();
72 |         System.out.println(document.getDocumentID() + ": " + document.getSentences().size() + " sentences");
73 |     }
74 | }
75 | 
-------------------------------------------------------------------------------- /featureExtractor/SimilarityVector.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.HashMap;
4 | import edu.oregonstate.util.CosineSimilarity;
5 | import edu.stanford.nlp.stats.Counter;
6 | 
7 | /**
8 |  * calculate the similarity score of two similarity vectors by
9 |  * calling the function in CosineSimilarity.
10 |  * NOTE:
11 |  * in order to use CosineSimilarity, convert from the Counter data structure to a HashMap first
12 |  *
13 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
14 |  *
15 |  */
16 | public class SimilarityVector {
17 | 
18 |     private Counter<String> mcounter;
19 | 
20 |     public SimilarityVector(Counter<String> counter) {
21 |         mcounter = counter;
22 |     }
23 | 
24 |     public Counter<String> getCounter() {
25 |         return mcounter;
26 |     }
27 | 
28 |     /**
29 |      * calculate the cosine similarity of two similarity vectors
30 |      *
31 |      * @param c1
32 |      * @param c2
33 |      * @return cosine similarity
34 |      */
35 |     public static double getCosineSimilarity(SimilarityVector c1, SimilarityVector c2) {
36 |         if (c1.mcounter.size() == 0 || c2.mcounter.size() == 0) return 0;
37 |         Counter<String> counter1 = c1.mcounter;
38 |         Counter<String> counter2 = c2.mcounter;
39 |         HashMap<String, Double> hcounter1 = convertCounter(counter1);
40 |         HashMap<String, Double> hcounter2 = convertCounter(counter2);
41 |         double score = CosineSimilarity.calculateCosineSimilarity(hcounter1, hcounter2);
42 |         return score;
43 |     }
44 | 
45 |     /**
46 |      * convert from the Counter data structure to the HashMap data structure,
47 |      * then call the CosineSimilarity defined in the util package
48 |      *
49 |      * @param counter
50 |      * @return
51 |      */
52 |     public static HashMap<String, Double> convertCounter(Counter<String> counter) {
53 |         HashMap<String, Double> hcounter = new HashMap<String, Double>();
54 |         for (String key : counter.keySet()) {
55 |             hcounter.put(key, counter.getCount(key));
56 |         }
57 |         return hcounter;
58 |     }
59 | 
60 | }
61 | 
-------------------------------------------------------------------------------- /featureExtractor/WordSimilarity.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.DataInputStream;
5 | import java.io.FileInputStream;
6 | import java.io.IOException;
7 | import java.io.InputStreamReader;
8 | import java.util.HashMap;
9 | import java.util.Map;
10 | 
11 | import edu.stanford.nlp.stats.ClassicCounter;
12 | 
13 | /**
14 |  * extract the top-ten most-similar words in Dekang Lin's similarity thesaurus for all the nouns/adjectives/verbs in a
15 |  * cluster
16 |  *
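 * Entry format (a sketch, inferred from load() below): each thesaurus entry in
 * the .lsp files begins with a header line "(word ...", followed by one
 * tab-separated "similarWord<TAB>score" line per neighbor and a closing ")"
 * line; load() keeps the head word plus its first ten neighbors in a
 * ClassicCounter.
 *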

17 |  * Proximity-based Thesaurus: (http://webdocs.cs.ualberta.ca/~lindek/downloads.htm)
18 |  *
19 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
20 |  *
21 |  */
22 | public class WordSimilarity {
23 | 
24 |     // file path
25 |     private final String filePath;
26 | 
27 |     // data used for the mention word feature
28 |     private Map<String, ClassicCounter<String>> datas;
29 | 
30 |     public WordSimilarity(String path) {
31 |         this.filePath = path;
32 |         datas = new HashMap<String, ClassicCounter<String>>();
33 |     }
34 | 
35 |     // return the data
36 |     public Map<String, ClassicCounter<String>> getDatas() {
37 |         return datas;
38 |     }
39 | 
40 |     /** load the word similarity dictionary */
41 |     public void load() {
42 |         try {
43 |             FileInputStream fstream = new FileInputStream(filePath);
44 |             DataInputStream in = new DataInputStream(fstream);
45 |             BufferedReader br = new BufferedReader(new InputStreamReader(in));
46 | 
47 |             String strLine;
48 |             boolean pass = true;
49 |             String currentIndex = "";
50 |             ClassicCounter<String> mentionWords = new ClassicCounter<String>();
51 |             int i = 0;
52 |             while ((strLine = br.readLine()) != null) {
53 |                 if (strLine.startsWith("(")) {
54 |                     pass = false;
55 |                     String[] words = strLine.split(" ");
56 |                     currentIndex = words[0].substring(1);
57 |                     mentionWords = new ClassicCounter<String>();
58 |                     mentionWords.incrementCount(currentIndex);
59 |                     i = 1;
60 |                 }
61 |                 if (pass) continue;
62 |                 if (!strLine.startsWith("(") && !strLine.startsWith(")") && i < 12) {
63 |                     String[] words = strLine.split("\t");
64 |                     mentionWords.incrementCount(words[0]);
65 |                     i += 1;
66 |                 }
67 | 
68 |                 if (i == 11) {
69 |                     datas.put(currentIndex, mentionWords);
70 |                     pass = true;
71 |                 }
72 |             }
73 | 
74 |             br.close();
75 |             in.close();
76 |             fstream.close();
77 |         } catch (IOException ex) {
78 |             ex.printStackTrace();
79 |             System.exit(1);
80 |         }
81 |     }
82 | 
83 | }
-------------------------------------------------------------------------------- /featureExtractor/Wordnet.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.featureExtractor;
2 | 
3 | import java.util.Arrays;
4 | import java.util.Set;
5 | import java.util.HashSet;
6 | 
7 | import edu.smu.tspell.wordnet.NounSynset;
8 | import edu.smu.tspell.wordnet.Synset;
9 | import edu.smu.tspell.wordnet.SynsetType;
10 | import edu.smu.tspell.wordnet.VerbSynset;
11 | import edu.smu.tspell.wordnet.WordNetDatabase;
12 | import edu.smu.tspell.wordnet.WordSense;
13 | 
14 | 
15 | /**
16 |  * find the links between synonyms; we need to calculate the percentage of newly-introduced mention links after a merge
17 |  * that are WordNet synonyms
18 |  */
19 | public class Wordnet {
20 | 
21 |     // get a WordNet database instance
22 |     private final WordNetDatabase wordnet;
23 | 
24 |     public Wordnet() {
25 |         wordnet = WordNetDatabase.getFileInstance();
26 |     }
27 | 
28 |     /**
29 |      * get the synonyms of a lemma, given its synset type
30 |      *
31 |      * @param lemma
32 |      * @param type
33 |      * @return
34 |      */
35 |     public Set<String> getSynonym(String lemma, SynsetType type) {
36 |         Set<String> synonyms = new HashSet<String>();
37 |         Synset[] synsets = wordnet.getSynsets(lemma, type);
38 |         for (Synset synset : synsets) {
39 |             String[] wordforms = synset.getWordForms();
40 |             synonyms.addAll(Arrays.asList(wordforms));
41 |         }
42 | 
43 |         return synonyms;
44 |     }
45 | 
46 |     /**
47 |      * get derivationally related forms
48 |      *
49 |      * @param lemma
50 |      * @param type
51 |      * @return
52 |      */
53 |     public Set<String> getDerivationallyRelatedForms(String lemma, SynsetType type) {
54 |         Set<String> derivationallyForm = new HashSet<String>();
55 |         Synset[] synsets = wordnet.getSynsets(lemma, type);
56 |         for (Synset synset : synsets) {
57 |             WordSense[] senses = synset.getDerivationallyRelatedForms(lemma);
58 |             for (WordSense sense : senses) {
59 |                 derivationallyForm.add(sense.getWordForm());
60 |             }
61 |         }
62 | 
63 |         return derivationallyForm;
64 |     }
65 | 
66 |     /**
67 |      * get noun hypernyms
68 |      *
69 |      * @param lemma
70 |      * @return
71 |      */
72 |     public Set<String> getNounHypernym(String lemma) {
73 |         Set<String> hypernyms = new HashSet<String>();
74 |         Synset[] synsets = wordnet.getSynsets(lemma, SynsetType.NOUN);
75 |         for (Synset synset : synsets) {
76 |             NounSynset nounSynset = (NounSynset) synset;
77 |             NounSynset[] hypernymSynset = nounSynset.getHypernyms();
78 |             for (NounSynset set : hypernymSynset) {
79 |                 hypernyms.addAll(Arrays.asList(set.getWordForms()));
80 |             }
81 |         }
82 | 
83 |         return hypernyms;
84 |     }
85 | 
86 |     /**
87 |      * get verb hypernyms
88 |      *
89 |      * @param lemma
90 |      * @return
91 |      */
92 |     public Set<String> getVerbHypernym(String lemma) {
93 |         Set<String> hypernyms = new HashSet<String>();
94 |         Synset[] synsets = wordnet.getSynsets(lemma, SynsetType.VERB);
95 |         for (Synset synset : synsets) {
96 |             VerbSynset verbSynset = (VerbSynset) synset;
97 |             VerbSynset[] hypernymSynset = verbSynset.getHypernyms();
98 |             for (VerbSynset set : hypernymSynset) {
99 |                 hypernyms.addAll(Arrays.asList(set.getWordForms()));
100 |             }
101 |         }
102 | 
103 |         return hypernyms;
104 |     }
105 | 
106 |     /**
107 |      * set the WordNet path first
108 |      *
109 |      * @param wordnetPath
110 |      */
111 |     public static void setWordNet(String wordnetPath) {
112 |         System.setProperty("wordnet.database.dir", wordnetPath);
113 |     }
114 | 
115 |     /**
116 |      * WordNet examples
117 |      *
118 |      * @param args
119 |      */
120 |     public static void main(String[] args) {
121 |         String wordnetPath = "/home/jun/JavaFile/corpus/WordNet-3.0/dict";
122 |         Wordnet.setWordNet(wordnetPath);
123 | 
124 |         Wordnet wordnet = new Wordnet();
125 |         Set<String> synonyms = wordnet.getSynonym("region", SynsetType.NOUN);
126 |         Set<String> nounHypernyms = wordnet.getNounHypernym("tent");
127 |         Set<String> verbHypernyms = wordnet.getVerbHypernym("shout");
128 |         Set<String> derivationallyForm = wordnet.getDerivationallyRelatedForms("develop", SynsetType.VERB);
129 | 
130 |         System.out.println("done");
131 |     }
132 | 
133 | }
134 | 
-------------------------------------------------------------------------------- /features/Feature.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features;
2 | 
3 | import java.util.HashMap;
4 | import java.util.Set;
5 | 
6 | import edu.stanford.nlp.dcoref.CorefCluster;
7 | import edu.stanford.nlp.dcoref.Document;
8 | import edu.stanford.nlp.stats.ClassicCounter;
9 | import edu.stanford.nlp.stats.Counter;
10 | import edu.oregonstate.featureExtractor.SimilarityVector;
11 | import edu.oregonstate.general.SetOperation;
12 | 
13 | /**
14 |  * the abstract feature definition; every individual feature extends this class
15 |  *
16 |  * @author Jun Xie (xie@eecs.oregonstate.edu)
17 |  *
18 |  */
19 | public abstract class Feature {
20 | 
21 |     // feature name
22 |     protected String featureName;
23 | 
24 |     public Feature() {
25 |         featureName = getClass().getSimpleName();
26 |     }
27 | 
28 |     // the extending class overrides this method to
29 |     // signal its type : Nominal
30 |     public boolean isNominal() {
31 |         return false;
32 |     }
33 | 
34 |     // the extending class overrides this method to
35 |     // signal its type : Numeric
36 |     public boolean isNumeric() {
37 |         return false;
38 |     }
39 | 
40 |     // return the feature name
41 |     public String getFeatureName() {
42 |         return featureName;
43 |     }
44 | 
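    // Worked example (hypothetical centroids): if former.predictedCentroid maps
    // "Gender" -> {male: 2.0, female: 1.0} and latter.predictedCentroid maps
    // "Gender" -> {male: 1.0}, then calculateCosineSimilarity(former, latter,
    // "Gender", ...) below returns (2*1) / (sqrt(2^2 + 1^2) * sqrt(1^2)) ~= 0.894.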
45 |     // generate the feature value according to the document, the two clusters, and the mention type
46 |     public abstract double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType);
47 | 
48 |     /**
49 |      * calculate a specific feature similarity given two clusters
50 |      *
51 |      * @param former
52 |      * @param latter
53 |      * @param name
54 |      * @return
55 |      */
56 |     protected double calculateCosineSimilarity(CorefCluster former, CorefCluster latter, String name, String mentionType) {
57 |         double cosineSimilarity = 0.0;
58 | 
59 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
60 |             return cosineSimilarity;
61 |         }
62 | 
63 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
64 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
65 | 
66 |         Counter<String> formerVector = formerCentroid.get(name);
67 |         Counter<String> latterVector = latterCentroid.get(name);
68 | 
69 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
70 |             return cosineSimilarity;
71 |         }
72 | 
73 |         cosineSimilarity = SimilarityVector.getCosineSimilarity(new SimilarityVector(formerVector), new SimilarityVector(latterVector));
74 | 
75 |         return cosineSimilarity;
76 |     }
77 | 
78 |     /**
79 |      * how many shared arguments two clusters have in a given role
80 |      *
81 |      * @param former
82 |      * @param latter
83 |      * @param name
84 |      * @return
85 |      */
86 |     protected double calculateAgreement(CorefCluster former, CorefCluster latter, String name, String mentionType) {
87 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
88 |             return 0.0;
89 |         }
90 | 
91 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
92 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
93 | 
94 |         Counter<String> formerVector = formerCentroid.get(name);
95 |         Counter<String> latterVector = latterCentroid.get(name);
96 | 
97 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
98 |             return 0.0;
99 |         }
100 | 
101 |         Set<String> commonElementSet = SetOperation.intersection(formerVector, latterVector);
102 | 
103 |         return commonElementSet.size();
104 |     }
105 | 
106 |     /**
107 |      *
108 |      * how many non-shared arguments two clusters have in a given role
109 |      *
110 |      * @param former
111 |      * @param latter
112 |      * @param name
113 |      * @return
114 |      */
115 |     protected double calculateNonAgreement(CorefCluster former, CorefCluster latter, String name, String mentionType) {
116 |         String featureName = name.substring(1);   // e.g. "NSrlA0" -> "SrlA0"
117 | 
118 |         if(mentionType.equals("-PRONOMINAL") && (name.startsWith("MentionWord") || name.startsWith("Head"))) {
119 |             return 0.0;
120 |         }
121 | 
122 |         HashMap<String, ClassicCounter<String>> formerCentroid = former.predictedCentroid;
123 |         HashMap<String, ClassicCounter<String>> latterCentroid = latter.predictedCentroid;
124 | 
125 |         Counter<String> formerVector = formerCentroid.get(featureName);
126 |         Counter<String> latterVector = latterCentroid.get(featureName);
127 | 
128 |         if(name.equals("Lemma") && latterVector.getCount("say") > 0 && formerVector.getCount("say") > 0) {
129 |             return 0.0;
130 |         }
131 | 
132 |         Set<String> union = SetOperation.union(formerVector, latterVector);
133 |         Set<String> intersection = SetOperation.intersection(formerVector, latterVector);
134 | 
135 |         return (union.size() - intersection.size());
136 |     }
137 | 
138 | }
139 | 
-------------------------------------------------------------------------------- /features/NominalFeature.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features;
2 | 
3 | /**
4 |  * 
Nominal Feature 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public abstract class NominalFeature extends Feature { 10 | 11 | @Override 12 | public boolean isNominal() { 13 | return true; 14 | } 15 | 16 | // the Nominal Features 17 | // For example, there is a weather nominal feature 18 | // the values of this feature can be hot, cold, or anything like that 19 | public abstract String[] getValues(); 20 | } 21 | -------------------------------------------------------------------------------- /features/NumericFeature.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features; 2 | 3 | /** 4 | * Numeric Feature 5 | * 6 | * @author Jun Xie (xie@eecs.oregonstate.edu) 7 | * 8 | */ 9 | public abstract class NumericFeature extends Feature { 10 | 11 | @Override 12 | public boolean isNumeric() { 13 | return true; 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /features/individualfeature/Animacy.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | public class Animacy extends NumericFeature { 8 | 9 | public Animacy() { 10 | featureName = this.getClass().getSimpleName(); 11 | } 12 | 13 | @Override 14 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 15 | double animacySimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 16 | 17 | return animacySimilarity; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /features/individualfeature/Gender.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Cosine Similarity of gender 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Gender extends NumericFeature { 14 | 15 | public Gender() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double genderSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return genderSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/Head.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Entity Head feature, Cosine Similarity of head-word vectors of two clusters 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Head extends NumericFeature { 14 | 15 | public Head() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, 
CorefCluster latter, String mentionType) { 21 | double headSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return headSimilarity; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /features/individualfeature/Lemma.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Event Lemmas : Cosine Similarity of the lemma vectors of two clusters 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Lemma extends NumericFeature { 14 | 15 | public Lemma() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double lemmaSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return lemmaSimilarity; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /features/individualfeature/MentionWord.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * 2nd Order Similarity of Mention Words : cosine similarity of vectors containing words 9 | * that are distributionally similar to words in the cluster mentions 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class MentionWord extends NumericFeature { 15 | 16 | public MentionWord() { 17 | featureName = this.getClass().getSimpleName(); 18 | } 19 | 20 | @Override 21 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 22 | double mentionWordSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 23 | 24 | return mentionWordSimilarity; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NEType.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * cosine similarity of NE label vectors 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NEType extends NumericFeature { 14 | 15 | public NEType() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double NETypeSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return NETypeSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | 
import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA0 extends NumericFeature { 14 | 15 | public NSrlA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA0 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA0 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA1 extends NumericFeature { 14 | 15 | public NSrlA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA1 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlA2 extends NumericFeature { 14 | 15 | public NSrlA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlA2 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlA2 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Arguments in a Specific Role AMLOC 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlAMLoc extends NumericFeature { 14 | 15 | public NSrlAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlAMLoc = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlAMLoc > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlAgreeCount.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Number of non-Coreferent Arguments or Predicates : 9 | * The total number of uncommon arguments and predicates between mentions in the two clusters 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class NSrlAgreeCount extends NumericFeature { 15 | 16 | public NSrlAgreeCount() { 17 | featureName = this.getClass().getSimpleName(); 18 | } 19 | 20 | @Override 21 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 22 | double totalNonAgreement = 0.0; 23 | String[] verbElements = {"NSrlA0", "NSrlA1", "NSrlA2", "NSrlAMLoc"}; 24 | String[] nounElements = {"NSrlPA0", "NSrlPA1", "NSrlPA2", "NSrlPAMLoc"}; 25 | 26 | if (mentionType.equals("")) { 27 | for (String feature : verbElements) { 28 | double number = calculateNonAgreement(former, latter, feature, mentionType); 29 | totalNonAgreement += (number > 0.0) ? 1.0 : 0.0; 30 | } 31 | } else { 32 | for (String feature : nounElements) { 33 | double number = calculateNonAgreement(former, latter, feature, mentionType); 34 | totalNonAgreement += (number > 0.0) ? 
1.0 : 0.0; 35 | } 36 | } 37 | 38 | return totalNonAgreement; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA0 extends NumericFeature { 14 | 15 | public NSrlPA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA0 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA0 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA1 extends NumericFeature { 14 | 15 | public NSrlPA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA1 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPA2 extends NumericFeature { 14 | 15 | public NSrlPA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPA2 = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPA2 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/NSrlPAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Non-coreferent Predicate in a Specific Role AM-LOC 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class NSrlPAMLoc extends NumericFeature { 14 | 15 | public NSrlPAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double nSrlPAMLoc = calculateNonAgreement(former, latter, featureName, mentionType); 22 | double indicator = (nSrlPAMLoc > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/Number.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * Cosine Similarity of number 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class Number extends NumericFeature { 14 | 15 | public Number() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double numberSimilarity = calculateCosineSimilarity(former, latter, featureName, mentionType); 22 | 23 | return numberSimilarity; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA0.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A0 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA0 extends NumericFeature { 14 | 15 | public SrlA0() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA0 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA0 > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA1.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A1 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA1 extends NumericFeature { 14 | 15 | public SrlA1() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA1 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA1 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlA2.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in A2 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlA2 extends NumericFeature { 14 | 15 | public SrlA2() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlA2 = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlA2 > 0.0) ? 1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlAMLoc.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * coreferent arguments in AMLoc 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlAMLoc extends NumericFeature { 14 | 15 | public SrlAMLoc() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlAMLoc = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlAMLoc > 0.0) ? 
1.0 : 0.0; 23 | 24 | return indicator; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /features/individualfeature/SrlAgreeCount.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import edu.oregonstate.features.NumericFeature; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | 10 | /** 11 | * Number of Coreferent Arguments or Predicates : 12 | * The total number of shared arguments and predicates between mentions in the two clusters 13 | * 14 | * @author Jun Xie (xie@eecs.oregonstate.edu) 15 | * 16 | */ 17 | public class SrlAgreeCount extends NumericFeature { 18 | 19 | public SrlAgreeCount() { 20 | featureName = this.getClass().getSimpleName(); 21 | } 22 | 23 | @Override 24 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 25 | double totalAgreement = 0.0; 26 | String[] verbElements = {"SrlA0", "SrlA1", "SrlA2", "SrlAMLoc", "SrlLeft", "SrlRight"}; 27 | String[] nounElements = {"SrlPA0", "SrlPA1", "SrlPA2", "SrlPAMLoc"}; 28 | 29 | List verbRoles = Arrays.asList(verbElements); 30 | List nounRoles = Arrays.asList(nounElements); 31 | 32 | if (mentionType.equals("")) { 33 | for (String feature : verbRoles) { 34 | double number = calculateAgreement(former, latter, feature, mentionType); 35 | totalAgreement += (number > 0.0) ? 1.0 : 0.0; 36 | } 37 | } else { 38 | for (String feature : nounRoles) { 39 | double number = calculateAgreement(former, latter, feature, mentionType); 40 | totalAgreement += (number > 0.0) ? 1.0 : 0.0; 41 | } 42 | } 43 | 44 | return totalAgreement; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /features/individualfeature/SrlLeft.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.features.individualfeature; 2 | 3 | import edu.oregonstate.features.NumericFeature; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * closest Left mention Feature 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public class SrlLeft extends NumericFeature { 14 | 15 | public SrlLeft() { 16 | featureName = this.getClass().getSimpleName(); 17 | } 18 | 19 | @Override 20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) { 21 | double srlLeft = calculateAgreement(former, latter, featureName, mentionType); 22 | double indicator = (srlLeft > 0.0) ? 
1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA0.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A0
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA0 extends NumericFeature {
14 | 
15 | public SrlPA0() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA0 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA0 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA1.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A1
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA1 extends NumericFeature {
14 | 
15 | public SrlPA1() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA1 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA1 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPA2.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in A2
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPA2 extends NumericFeature {
14 | 
15 | public SrlPA2() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPA2 = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPA2 > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlPAMLoc.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * Coreferent Predicate in AM-LOC
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlPAMLoc extends NumericFeature {
14 | 
15 | public SrlPAMLoc() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlPAMLoc = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlPAMLoc > 0.0) ? 1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/SrlRight.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * closest Right mention Feature
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public class SrlRight extends NumericFeature {
14 | 
15 | public SrlRight() {
16 | featureName = this.getClass().getSimpleName();
17 | }
18 | 
19 | @Override
20 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
21 | double srlRight = calculateAgreement(former, latter, featureName, mentionType);
22 | double indicator = (srlRight > 0.0) ? 
1.0 : 0.0;
23 | 
24 | return indicator;
25 | }
26 | 
27 | }
28 | 
-------------------------------------------------------------------------------- /features/individualfeature/Synonym.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.features.individualfeature;
2 | 
3 | import edu.oregonstate.features.NumericFeature;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | import edu.stanford.nlp.dcoref.Mention;
7 | import edu.stanford.nlp.util.IntPair;
8 | 
9 | /**
10 | * The percentage of newly-introduced mention links after the merge that are WordNet synonyms
11 | * 
12 | * @author Jun Xie (xie@eecs.oregonstate.edu)
13 | * 
14 | */
15 | public class Synonym extends NumericFeature {
16 | 
17 | // whether to do pronoun resolution
18 | private final boolean DOPRONOUN;
19 | 
20 | public Synonym() {
21 | featureName = this.getClass().getSimpleName();
22 | DOPRONOUN = false;
23 | }
24 | 
25 | @Override
26 | public double generateFeatureValue(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
27 | double synonymNom = 0.0;
28 | double synonymDenom = 0.0;
29 | 
30 | for(Mention m1 : former.getCorefMentions()) {
31 | for(Mention m2 : latter.getCorefMentions()) {
32 | if(!DOPRONOUN && (m1.isPronominal() || m2.isPronominal())) continue;
33 | IntPair menPair = new IntPair(Math.min(m1.mentionID, m2.mentionID), Math.max(m1.mentionID, m2.mentionID));
34 | 
35 | synonymDenom++;
36 | if(document.mentionSynonymInWN.contains(menPair)) {
37 | synonymNom++;
38 | }
39 | }
40 | }
41 | 
42 | // if both clusters contain only pronouns, synonymDenom stays 0 and the ratio would be NaN
43 | double synonym = 0.0;
44 | if (synonymDenom > 0) {
45 | synonym = synonymNom/synonymDenom;
46 | }
47 | 
48 | return synonym;
49 | }
50 | 
51 | }
52 | 
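The individual features above all share one entry point, generateFeatureValue(Document, CorefCluster, CorefCluster, String), so a feature vector for a candidate cluster merge is simply the value of each feature on the same cluster pair. A minimal sketch of that pattern (illustrative only, not a file in this repository; it assumes generateFeatureValue is visible through the NumericFeature supertype, as the @Override annotations above suggest):

import edu.oregonstate.features.NumericFeature;
import edu.oregonstate.features.individualfeature.SrlA0;
import edu.oregonstate.features.individualfeature.SrlAgreeCount;
import edu.oregonstate.features.individualfeature.Synonym;
import edu.stanford.nlp.dcoref.CorefCluster;
import edu.stanford.nlp.dcoref.Document;

public class FeatureVectorSketch {
    // Evaluate a fixed list of features on one (former, latter) cluster pair.
    public static double[] featurize(Document document, CorefCluster former, CorefCluster latter, String mentionType) {
        NumericFeature[] features = { new SrlA0(), new SrlAgreeCount(), new Synonym() };
        double[] vector = new double[features.length];
        for (int i = 0; i < features.length; i++) {
            vector[i] = features[i].generateFeatureValue(document, former, latter, mentionType);
        }
        return vector;
    }
}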
-------------------------------------------------------------------------------- /general/MapFactory.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.util.*;
4 | import java.io.Serializable;
5 | 
6 | /**
7 | * The MapFactory is a mechanism for specifying what kind of map is to be used
8 | * by some object. For example, if you want a Counter which is backed by an
9 | * IdentityHashMap instead of the default HashMap, you can pass in an
10 | * IdentityHashMapFactory.
11 | * 
12 | * @author Dan Klein
13 | */
14 | 
15 | public abstract class MapFactory<K, V> implements Serializable {
16 | private static final long serialVersionUID = 1L;
17 | public static class HashMapFactory<K, V> extends MapFactory<K, V> {
18 | public Map<K, V> buildMap() {
19 | return new HashMap<K, V>();
20 | }
21 | }
22 | 
23 | public static class IdentityHashMapFactory<K, V> extends MapFactory<K, V> {
24 | public Map<K, V> buildMap() {
25 | return new IdentityHashMap<K, V>();
26 | }
27 | }
28 | 
29 | public static class TreeMapFactory<K, V> extends MapFactory<K, V> {
30 | public Map<K, V> buildMap() {
31 | return new TreeMap<K, V>();
32 | }
33 | }
34 | 
35 | public static class WeakHashMapFactory<K, V> extends MapFactory<K, V> {
36 | public Map<K, V> buildMap() {
37 | return new WeakHashMap<K, V>();
38 | }
39 | }
40 | 
41 | public abstract Map<K, V> buildMap();
42 | }
43 | 
44 | 
-------------------------------------------------------------------------------- /general/MatrixOperation.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | 
8 | import Jama.Matrix;
9 | 
10 | /**
11 | * Jun Xie(xiejuncs@gmail.com)
12 | */
13 | public class MatrixOperation {
14 | 
15 | /**
16 | * Read a matrix from a comma-separated file
17 | * 
18 | * @param fileName
19 | * @return
20 | */
21 | public static Matrix readMatrix(String fileName) {
22 | try {
23 | BufferedReader reader = new BufferedReader(new FileReader(fileName));
24 | List<double[]> data_array = new ArrayList<double[]>();
25 | 
26 | String line;
27 | while ((line = reader.readLine()) != null) {
28 | if (line.equals("")) {
29 | continue;
30 | }
31 | String fields[] = line.split(",");
32 | double data[] = new double[fields.length];
33 | for (int i = 0; i < fields.length; i++) {
34 | data[i] = Double.parseDouble(fields[i]);
35 | }
36 | data_array.add(data);
37 | }
38 | 
39 | reader.close();
40 | if (data_array.size() > 0) {
41 | int cols = data_array.get(0).length;
42 | int rows = data_array.size();
43 | Matrix matrix = new Matrix(rows, cols);
44 | for (int r = 0; r < rows; ++r) {
45 | for (int c = 0; c < cols; ++c) {
46 | matrix.set(r, c, data_array.get(r)[c]);
47 | }
48 | }
49 | return matrix;
50 | }
51 | } catch (Exception e) {
52 | e.printStackTrace();
53 | System.exit(1);
54 | }
55 | 
56 | return new Matrix(0, 0);
57 | }
58 | 
59 | /**
60 | * The last column corresponds to the target.
61 | * Hence, remove the target values from the last column of a data set.
62 | * 
63 | * Meanwhile, we add a 1 in column 0 of each row as a bias term,
64 | * e.g. a row [x1, x2] becomes [1, x1, x2].
65 | * 
66 | * @param data_set
67 | * @return
68 | */
68 | public static Matrix getDataPoints(Matrix data_set) {
69 | Matrix features = data_set.getMatrix(0, data_set.getRowDimension() - 1, 0, data_set.getColumnDimension() - 2);
70 | int rows = features.getRowDimension();
71 | int cols = features.getColumnDimension() + 1;
72 | Matrix modifiedFeatures = new Matrix(rows, cols);
73 | for (int r = 0; r < rows; ++r) {
74 | for (int c = 0; c < cols; ++c) {
75 | if (c == 0) {
76 | modifiedFeatures.set(r, c, 1.0);
77 | } else {
78 | modifiedFeatures.set(r, c, features.get(r, c-1));
79 | }
80 | }
81 | }
82 | return modifiedFeatures;
83 | }
84 | 
85 | /**
86 | * Returns the target values from the last column of a data set.
87 | * 
88 | * @param data_set
89 | * @return
90 | */
91 | public static Matrix getTargets(Matrix data_set) {
92 | return data_set.getMatrix(0, data_set.getRowDimension() - 1, data_set.getColumnDimension() - 1, data_set.getColumnDimension() - 1);
93 | }
94 | 
95 | /**
96 | * divide the averageModel by mEpoch * the number of topics
97 | * 
98 | * @param averageModel
99 | * @param mEpoch
100 | * @return
101 | */
102 | public static Matrix divide(Matrix averageModel, int mEpoch) {
103 | for (int i = 0; i < averageModel.getRowDimension(); i++) {
104 | double updateValue = averageModel.get(i, 0) / mEpoch;
105 | averageModel.set(i, 0, updateValue);
106 | }
107 | 
108 | return averageModel;
109 | }
110 | 
111 | /**
112 | * add model to the averageModel
113 | * NOTE
114 | * model and averageModel are both column vectors
115 | * 
116 | * @param model
117 | * @param averageModel
118 | * @return
119 | */
120 | public static Matrix addWeight(Matrix model, Matrix averageModel) {
121 | for (int i = 0; i < averageModel.getRowDimension(); i++) {
122 | double updateValue = averageModel.get(i, 0) + model.get(i, 0);
123 | averageModel.set(i, 0, updateValue);
124 | }
125 | 
126 | return averageModel;
127 | }
128 | 
129 | /**
130 | * get average matrix
131 | * 
132 | * @param averageWeight
133 | * @param wholeSearchStep
134 | * @return
135 | */
136 | public static Matrix getAverageMatrix (Matrix averageWeight, int wholeSearchStep) {
137 | Matrix matrix = new Matrix(averageWeight.getRowDimension(), 1);
138 | for (int i = 0; i < averageWeight.getRowDimension(); i++) {
139 | matrix.set(i, 0, averageWeight.get(i, 0) / wholeSearchStep);
140 | }
141 | return matrix;
142 | }
143 | 
144 | /**
145 | * matrix normalization
146 | * 
147 | * @param weight
148 | */
149 | public static Matrix normalization(Matrix weight) {
150 | double sum = weight.norm2();
151 | 
152 | if (sum == 0.0) return weight;
153 | 
154 | for (int i = 0; i < weight.getRowDimension(); i++){
155 | double value = weight.get(i, 0);
156 | weight.set(i, 0, value / sum);
157 | }
158 | 
159 | return weight;
160 | }
161 | 
162 | }
163 | 
-------------------------------------------------------------------------------- /general/SetOperation.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.general;
2 | 
3 | import java.util.HashSet;
4 | import java.util.Set;
5 | 
6 | import edu.stanford.nlp.stats.Counter;
7 | 
8 | /**
9 | * Set Operation
10 | * 
11 | * @author Jun Xie (xie@eecs.oregonstate.edu)
12 | * 
13 | */
14 | public class SetOperation {
15 | 
16 | /**
17 | * intersection of the keyset of two counter objects
18 | * 
19 | * @param formerVector
20 | * @param latterVector
21 | * @return
22 | */
23 | public static Set 
intersection(Counter formerVector, Counter latterVector) { 24 | Set commonElementSet = new HashSet(); 25 | 26 | // get the lower case of the set 27 | // Set formerSet = StringOperation.lowercase(formerVector.keySet()); 28 | // Set latterSet = StringOperation.lowercase(latterVector.keySet()); 29 | 30 | Set formerSet = formerVector.keySet(); 31 | Set latterSet = latterVector.keySet(); 32 | 33 | commonElementSet.addAll(formerSet); 34 | commonElementSet.retainAll(latterSet); 35 | 36 | return commonElementSet; 37 | } 38 | 39 | /** 40 | * union of the keyset of two counter objects 41 | * 42 | * @param formerVector 43 | * @param latterVector 44 | * @return 45 | */ 46 | public static Set union(Counter formerVector, Counter latterVector) { 47 | Set union = new HashSet(); 48 | union.addAll(formerVector.keySet()); 49 | union.addAll(latterVector.keySet()); 50 | return union; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /general/StringOperation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.general; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | * String Operation 7 | * 8 | * @author Jun Xie (xie@eecs.oregonstate.edu) 9 | * 10 | */ 11 | public class StringOperation { 12 | 13 | private StringOperation() { 14 | } 15 | 16 | /** 17 | * split the string according to the splitter and trim the spaces 18 | * 19 | * @param string 20 | * @param splitter 21 | * @return 22 | */ 23 | public static String[] splitString(String string, String splitter) { 24 | String[] elements = string.split(splitter); 25 | // trim the space before and after 26 | int length = elements.length; 27 | String[] trimdElements = new String[length]; 28 | for (int index = 0; index < length; index++) { 29 | String value = elements[index]; 30 | trimdElements[index] = value.trim(); 31 | } 32 | 33 | return trimdElements; 34 | } 35 | 36 | /** 37 | * convert the set of string to a set of lower case string 38 | * 39 | * @param set 40 | * @return 41 | */ 42 | public static Set lowercase(Set set) { 43 | Set result = new HashSet(); 44 | 45 | for (String element : set) { 46 | result.add(element.toLowerCase()); 47 | } 48 | 49 | return result; 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /io/EgenericDataSetReader.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.io; 2 | 3 | import java.util.List; 4 | import java.util.logging.Level; 5 | import java.util.logging.Logger; 6 | 7 | import edu.stanford.nlp.ling.Label; 8 | import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder; 9 | import edu.stanford.nlp.ling.CoreLabel; 10 | import edu.stanford.nlp.pipeline.Annotation; 11 | import edu.stanford.nlp.pipeline.Annotator; 12 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 13 | import edu.stanford.nlp.trees.HeadFinder; 14 | import edu.stanford.nlp.trees.Tree; 15 | 16 | /** 17 | * 18 | * @author xie 19 | * 20 | */ 21 | public class EgenericDataSetReader { 22 | protected Logger logger; 23 | 24 | /** Finds the syntactic head of a syntactic constituent*/ 25 | protected final HeadFinder headFinder = new NoPunctuationHeadFinder(); 26 | 27 | /** Stanford CoreNLP processor to use for pre-processing*/ 28 | protected StanfordCoreNLP processor; 29 | 30 | /** 31 | * Additional processor that implements only syntactic parsing (needed for head detection) 32 | * We need this processor to detect heads of predicted entities 
that can not be matched to an existing constituent 33 | * This is created on demand, not necessary 34 | */ 35 | protected Annotator parseProcessor; 36 | 37 | /** If true, we perform syntactic analysis of the dataset sentences and annotations*/ 38 | protected final boolean preProcessSentences; 39 | 40 | /** 41 | * If true, sets the head span to match the syntactic head of the extent. 42 | * Otherwise, the head span is not modified. 43 | * This is enabled for the NFL domain, where head spans are not given. 44 | */ 45 | protected final boolean calculateHeadSpan; 46 | 47 | /** If true, it regenerates the index spans for all tree nodes (useful for KBP) */ 48 | protected final boolean forceGenerationofIndexSpans; 49 | 50 | /** Only around for legacy results */ 51 | protected boolean useNewHeadFinder = true; 52 | 53 | public EgenericDataSetReader() { 54 | this(null, false, false, false); 55 | } 56 | 57 | public EgenericDataSetReader(StanfordCoreNLP processor, boolean preProcessSentences, boolean calculateHeadSpan, boolean forceGenerationIndexSpans) { 58 | this.logger = Logger.getLogger(EgenericDataSetReader.class.getName()); 59 | this.logger.setLevel(Level.SEVERE); 60 | 61 | if (processor != null) setProcessor(processor); 62 | parseProcessor = null; 63 | this.preProcessSentences = preProcessSentences; 64 | this.calculateHeadSpan = calculateHeadSpan; 65 | this.forceGenerationofIndexSpans = forceGenerationIndexSpans; 66 | } 67 | 68 | public void setProcessor(StanfordCoreNLP processor) { 69 | this.processor = processor; 70 | } 71 | 72 | public void setUseNewHeadFinder(boolean useNewHeadFinder) { 73 | this.useNewHeadFinder = useNewHeadFinder; 74 | } 75 | 76 | public Annotator getParse() { 77 | if (parseProcessor == null) { 78 | parseProcessor = StanfordCoreNLP.getExistingAnnotator("parse"); 79 | assert(parseProcessor != null); 80 | } 81 | return parseProcessor; 82 | } 83 | 84 | public void setLoggerLevel(Level level) { 85 | logger.setLevel(level); 86 | } 87 | 88 | public Level getLoggerLevel() { 89 | return logger.getLevel(); 90 | } 91 | 92 | /** 93 | * Converts the tree labels to CoreLabels. 94 | * We need this because we store additional info in the CoreLabel, like token span. 95 | * @param tree 96 | */ 97 | public static void convertToCoreLabels(Tree tree) { 98 | Label l = tree.label(); 99 | if (! 
(l instanceof CoreLabel)) { 100 | CoreLabel cl = new CoreLabel(); 101 | cl.setValue(l.value()); 102 | tree.setLabel(cl); 103 | } 104 | 105 | for (Tree kid : tree.children()) 106 | convertToCoreLabels(kid); 107 | } 108 | 109 | /** 110 | * For EECB topic 111 | * 112 | * @param files 113 | * @param topic 114 | * @return 115 | * @throws Exception 116 | */ 117 | public Annotation read(List files, String topic) throws Exception { 118 | return null; 119 | } 120 | 121 | /** 122 | * For EECB document 123 | * 124 | * @param documentIdentifier 125 | * @return 126 | * @throws Exception 127 | */ 128 | public Annotation read(String documentIdentifier) throws Exception { 129 | return null; 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /io/LargeFileWriting.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.io; 2 | 3 | import java.io.DataOutputStream; 4 | import java.io.File; 5 | import java.io.FileOutputStream; 6 | import java.io.FileWriter; 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | import java.util.Properties; 11 | 12 | import edu.oregonstate.experiment.ExperimentConstructor; 13 | import edu.oregonstate.general.DoubleOperation; 14 | import edu.oregonstate.util.EecbConstants; 15 | 16 | /** 17 | * write large data set to a output file 18 | * 19 | * @author Jun Xie (xie@eecs.oregonstate.edu) 20 | * 21 | */ 22 | public class LargeFileWriting { 23 | 24 | /* File Path */ 25 | private final String mPath; 26 | 27 | /* experiment settings */ 28 | private final Properties mProps; 29 | 30 | public LargeFileWriting(String path) { 31 | mPath = path; 32 | mProps = ExperimentConstructor.experimentProps; 33 | } 34 | 35 | /** 36 | * write arrays to file 37 | * 38 | * @param records 39 | */ 40 | public void writeArrays(List records) { 41 | boolean binary = Boolean.parseBoolean(mProps.getProperty(EecbConstants.IO_BINARY_PROP, "false")); 42 | 43 | try { 44 | // write file into binary form or not 45 | if (binary) { 46 | writeRawinByte(records); 47 | } else { 48 | writeRawinText(records); 49 | } 50 | } catch (Exception e) { 51 | throw new RuntimeException(e); 52 | } 53 | } 54 | 55 | /** 56 | * in its raw form 57 | * 58 | * @param records 59 | * @throws IOException 60 | */ 61 | private void writeRawinText(List records) throws IOException { 62 | File file = new File(mPath); 63 | try { 64 | FileWriter writer = new FileWriter(file, true); 65 | for (String record: records) { 66 | writer.write(record); 67 | writer.write("\n"); 68 | } 69 | writer.flush(); 70 | writer.close(); 71 | } finally { 72 | 73 | } 74 | } 75 | 76 | /** 77 | * into byte form 78 | * 79 | * @param records 80 | */ 81 | private void writeRawinByte(List records) { 82 | try { 83 | System.out.print("Writing byte...\n"); 84 | DataOutputStream dos = new DataOutputStream( new FileOutputStream(mPath)); 85 | for (String record: records) { 86 | double[] features = DoubleOperation.transformString(record, ","); 87 | 88 | for (int i = 0; i < features.length; i++) { 89 | dos.writeDouble(features[i]); 90 | 91 | if ( i == features.length - 1) { 92 | dos.writeChar('\n'); 93 | } else { 94 | dos.writeChar('\t'); 95 | } 96 | } 97 | } 98 | 99 | dos.close(); 100 | } catch (Exception e) { 101 | throw new RuntimeException(e); 102 | } 103 | 104 | } 105 | 106 | /** 107 | * Example to run this class 108 | * 109 | * @param args 110 | */ 111 | public static void main(String[] args) { 112 | int RECORD_COUNT = 
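/* Illustrative aside, not part of the original file: records written by
 * writeRawinByte above can be read back with the mirror-image DataInputStream
 * calls -- readDouble() for each value, readChar() for the '\t' separators and
 * the trailing '\n'. A minimal reader sketch, assuming the writer's format:
 *
 *   DataInputStream dis = new DataInputStream(new FileInputStream("example.txt"));
 *   while (dis.available() > 0) {
 *       double value = dis.readDouble();
 *       char separator = dis.readChar(); // '\t' between values, '\n' after the last one
 *   }
 *   dis.close();
 */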
4000000;
113 | String RECORD = "Help I am trapped in a fortune cookie factory";
114 | List records = new ArrayList();
115 | for (int i = 0; i < RECORD_COUNT; i++) {
116 | records.add(RECORD);
117 | }
118 | String path = "example.txt";
119 | 
120 | LargeFileWriting writer = new LargeFileWriting(path);
121 | writer.writeArrays(records);
122 | }
123 | 
124 | }
125 | 
-------------------------------------------------------------------------------- /io/LibSVM.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.io;
2 | 
3 | /**
4 | * an interface to LibSVM to load the model
5 | * 
6 | * @author Jun Xie (xie@eecs.oregonstate.edu)
7 | * 
8 | */
9 | public class LibSVM {
10 | 
11 | 
12 | }
13 | 
-------------------------------------------------------------------------------- /lossfunction/ILossFunction.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.lossfunction;
2 | 
3 | import edu.oregonstate.search.State;
4 | import edu.stanford.nlp.dcoref.CorefCluster;
5 | import edu.stanford.nlp.dcoref.Document;
6 | 
7 | /**
8 | * the interface of Loss Functions
9 | * 
10 | * There are a lot of loss functions, for example, hinge loss, 0-1 loss, hamming loss.
11 | * Through this interface, given different objects, the loss function can be calculated.
12 | * 
13 | * @author Jun Xie (xie@eecs.oregonstate.edu)
14 | * 
15 | */
16 | public interface ILossFunction {
17 | 
18 | /* calculate loss function */
19 | public double[] calculateLossFunction(Document document, State state);
20 | 
21 | /* scoring the document */
22 | public double[] getMetricScore(Document document);
23 | }
24 | 
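Of the losses named above, hinge loss is the margin-based one: for a single decision with gold label y in {-1, +1} and model score f(x), it is max(0, 1 - y * f(x)). A tiny self-contained sketch (illustrative only, not a repository file):

public class HingeLossSketch {
    // hinge loss max(0, 1 - y * f(x)) for one scored binary decision
    public static double hingeLoss(double y, double score) {
        return Math.max(0.0, 1.0 - y * score);
    }
    public static void main(String[] args) {
        System.out.println(hingeLoss(+1.0, 0.3)); // 0.7 : correct side, but inside the margin
        System.out.println(hingeLoss(-1.0, 0.3)); // 1.3 : wrong side of the decision boundary
    }
}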
-------------------------------------------------------------------------------- /lossfunction/MetricLossFunction.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.lossfunction;
2 | 
3 | import edu.stanford.nlp.dcoref.CorefCluster;
4 | import edu.stanford.nlp.dcoref.CorefScorer;
5 | import edu.stanford.nlp.dcoref.CorefScorer.ScoreType;
6 | import edu.stanford.nlp.dcoref.Document;
7 | import edu.oregonstate.experiment.ExperimentConstructor;
8 | import edu.oregonstate.general.DoubleOperation;
9 | import edu.oregonstate.search.State;
10 | import edu.oregonstate.util.Command;
11 | import edu.oregonstate.util.EecbConstants;
12 | import edu.oregonstate.util.EecbConstructor;
13 | 
14 | /**
15 | * Loss Function used to calculate the loss score
16 | * 
17 | * @author Jun Xie (xie@eecs.oregonstate.edu)
18 | * 
19 | */
20 | public class MetricLossFunction implements ILossFunction {
21 | 
22 | /* score type, e.g. Pairwise */
23 | private ScoreType mtype;
24 | 
25 | /* numerator and denominator of precision and recall */
26 | private double precisionNumSum;
27 | private double precisionDenSum;
28 | private double recallNumSum;
29 | private double recallDenSum;
30 | 
31 | public MetricLossFunction() {
32 | mtype = CorefScorer.ScoreType.valueOf(ExperimentConstructor.experimentProps.getProperty(EecbConstants.LOSSFUNCTION_SCORE_PROP, "Pairwise"));
33 | }
34 | 
35 | /* calculate the loss function for different states of the same document */
36 | public double[] calculateLossFunction(Document document, State state) {
37 | Command.generateStateDocument(document, state);
38 | double[] scores = calculateF1(document, mtype);
39 | return scores;
40 | }
41 | 
42 | /* calculate F1, Precision and Recall according to the Score Type */
43 | private double[] calculateF1(Document document, ScoreType type) {
44 | double F1 = 0.0;
45 | CorefScorer score = EecbConstructor.createCorefScorer(type);
46 | 
47 | score.calculateScore(document);
48 | F1 = score.getF1();
49 | double precision = score.getPrecision();
50 | double recall = score.getRecall();
51 | 
52 | precisionNumSum = score.precisionNumSum;
53 | precisionDenSum = score.precisionDenSum;
54 | recallNumSum = score.recallNumSum;
55 | recallDenSum = score.recallDenSum;
56 | 
57 | double[] result = {DoubleOperation.transformNaN(F1), DoubleOperation.transformNaN(precision), DoubleOperation.transformNaN(recall)};
58 | return result;
59 | }
60 | 
61 | /* the detail information of a score */
62 | public String getDetailScoreInformation() {
63 | return precisionNumSum + " " + precisionDenSum + " " + recallNumSum + " " + recallDenSum;
64 | }
65 | 
66 | /* score the document for the first time */
67 | public double[] getMetricScore(Document document) {
68 | return calculateF1(document, mtype);
69 | }
70 | }
71 | 
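calculateF1 above reads the precision and recall numerator/denominator sums off the scorer; the derived values follow the usual definitions P = pNum/pDen, R = rNum/rDen, F1 = 2PR/(P+R). A worked sketch with made-up sums (illustrative arithmetic only, not repository code):

// Hypothetical sums, as a CorefScorer would accumulate them:
double precisionNumSum = 8.0, precisionDenSum = 10.0;        // P = 8/10 = 0.8
double recallNumSum = 6.0, recallDenSum = 10.0;              // R = 6/10 = 0.6
double precision = precisionNumSum / precisionDenSum;
double recall = recallNumSum / recallDenSum;
double f1 = 2 * precision * recall / (precision + recall);   // = 0.96 / 1.4 ~ 0.686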
-------------------------------------------------------------------------------- /method/Decoding.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.method;
2 | 
3 | /**
4 | * use the learned weight to do decoding
5 | * 
6 | * Take Coreference Resolution as an example: the decoding part is
7 | * to find a coreference resolution chain using a search algorithm
8 | * 
9 | * @author Jun Xie (xie@eecs.oregonstate.edu)
10 | * 
11 | */
12 | public abstract class Decoding {
13 | 
14 | // decoding phase, used to define the output file name and debug information
15 | // for example: training-1
16 | protected String decodingPhase;
17 | 
18 | public Decoding(String phase) {
19 | decodingPhase = phase;
20 | }
21 | 
22 | /**
23 | * decode according to the application
24 | * 
25 | * @param weight
26 | */
27 | public abstract void decode(double[] weight);
28 | }
29 | 
-------------------------------------------------------------------------------- /method/IMethod.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.method;
2 | 
3 | import java.util.List;
4 | 
5 | import edu.oregonstate.classifier.Parameter;
6 | 
7 | /**
8 | * experiment framework
9 | * 
10 | * @author Jun Xie (xie@eecs.oregonstate.edu)
11 | * 
12 | */
13 | public interface IMethod {
14 | 
15 | /* each concrete method executes its own procedure */
16 | public List<Parameter> executeMethod();
17 | }
18 | 
-------------------------------------------------------------------------------- /pruning/Pruning.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.pruning;
2 | 
3 | /**
4 | * Generate the constraints which can prune the search space in the beam search
5 | * 
6 | * @author Jun Xie (xie@eecs.oregonstate.edu)
7 | * 
8 | */
9 | public class Pruning {
10 | 
11 | /**
12 | * Right now, we can just use one topic for the test, for example, the sixth topic.
13 | * The reason for choosing this topic is that it has fewer mentions.
14 | * @param args
15 | */
16 | public static void main(String[] args) {
17 | 
18 | }
19 | }
-------------------------------------------------------------------------------- /score/AssignmentAlgorithm.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | public interface AssignmentAlgorithm {
4 | 
5 | int[][] computeAssignments(double[][] costMatrix);
6 | }
7 | 
-------------------------------------------------------------------------------- /score/AssignmentProblem.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | public class AssignmentProblem {
4 | 
5 | private final double[][] costMatrix;
6 | 
7 | public AssignmentProblem(double[][] aCostMatrix) {
8 | costMatrix = aCostMatrix;
9 | }
10 | 
11 | private double[][] copyOfMatrix() {
12 | double[][] retval = new double[costMatrix.length][];
13 | for (int i = 0; i < costMatrix.length; i++) {
14 | retval[i] = new double[costMatrix[i].length];
15 | System.arraycopy(costMatrix[i], 0, retval[i], 0, costMatrix[i].length);
16 | }
17 | return retval;
18 | }
19 | 
20 | public int[][] solve(AssignmentAlgorithm algorithm) {
21 | double[][] costMatrix = copyOfMatrix();
22 | return algorithm.computeAssignments(costMatrix);
23 | }
24 | }
25 | 
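AssignmentProblem above copies the cost matrix and defers to whichever AssignmentAlgorithm is supplied; ScorerCEAF below pairs it with the repository's HungarianAlgorithm. A minimal usage sketch (illustrative only; that each returned pair is laid out as {row, column} is an assumption here, not something the interface guarantees):

// Illustrative sketch: minimum-cost assignment over a 2x2 cost matrix.
double[][] costs = { { 1.0, 2.0 },
                     { 4.0, 3.0 } };   // optimal: row 0 -> column 0, row 1 -> column 1
int[][] assignments = new AssignmentProblem(costs).solve(new HungarianAlgorithm());
for (int[] pair : assignments) {
    System.out.println("row " + pair[0] + " -> column " + pair[1]);   // assumed {row, column} layout
}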
-------------------------------------------------------------------------------- /score/ScorerCEAF.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | import java.util.Arrays;
4 | import java.util.Iterator;
5 | import java.util.Map;
6 | import java.util.Set;
7 | import java.util.List;
8 | import java.util.ArrayList;
9 | 
10 | import edu.stanford.nlp.dcoref.CorefCluster;
11 | import edu.stanford.nlp.dcoref.CorefScorer;
12 | import edu.stanford.nlp.dcoref.Document;
13 | import edu.stanford.nlp.dcoref.Mention;
14 | 
15 | /**
16 | * CEAF score implementation (See paper: Evaluation metrics for End-to-End Coreference Resolution Systems)
17 | * 
18 | * CEAF applies a similarity metric (which should be either mention based or entity based) for each pair of
19 | * entities (i.e. a set of mentions) to measure the goodness of each possible alignment. The best mapping is
20 | * used for calculating CEAF precision, recall and F-measure.
21 | * 
22 | * There are two types of similarity metric, called phi3 and phi4. We implemented the phi4 case.
23 | * 
24 | * @author Jun Xie (xie@eecs.oregonstate.edu)
25 | * 
26 | */
27 | public class ScorerCEAF extends CorefScorer {
28 | 
29 | // update all fields of CorefScorer to public.
30 | public ScorerCEAF() {
31 | super();
32 | scoreType = ScoreType.CEAF;
33 | }
34 | 
35 | /**
36 | * calculate precision according to the equation 5 in the paper
37 | */
38 | protected void calculatePrecision(Document doc){
39 | Map<Integer, CorefCluster> response = doc.corefClusters;
40 | Map<Integer, CorefCluster> reference = doc.goldCorefClusters;
41 | precisionNumSum = scoreHelper(reference, response);
42 | precisionDenSum = scoreHelper(response, response);
43 | }
44 | 
45 | // calculate the phi4 similarity between a response cluster and a reference cluster
46 | public double similarity(CorefCluster responseCluster, CorefCluster referenceCluster) {
47 | Set<Mention> responseMentions = responseCluster.corefMentions;
48 | Set<Mention> referenceMentions = referenceCluster.corefMentions;
49 | List<Integer> responseMentionIDs = new ArrayList<Integer>();
50 | List<Integer> referenceMentionIDs = new ArrayList<Integer>();
51 | 
52 | for (Mention mention : responseMentions) {
53 | responseMentionIDs.add(mention.mentionID);
54 | }
55 | for (Mention mention : referenceMentions ) {
56 | referenceMentionIDs.add(mention.mentionID);
57 | }
58 | int responseSize = responseMentionIDs.size();
59 | int referenceSize = referenceMentionIDs.size();
60 | responseMentionIDs.retainAll(referenceMentionIDs);
61 | int overlap = responseMentionIDs.size();
62 | return (2.0 * overlap) / (responseSize + referenceSize); // phi4 = 2|overlap| / (|response| + |reference|); double arithmetic avoids integer truncation
63 | }
64 | 
65 | // calculate the cost function
66 | public double scoreHelper(Map<Integer, CorefCluster> reference, Map<Integer, CorefCluster> response) {
67 | double cost = 0.0;
68 | if (reference.size() == 0 || response.size() == 0) return 0.0;
69 | int size = reference.size() >= response.size() ? reference.size() : response.size();
70 | double[][] scores = new double[size][size];
71 | double max = 1.0;
72 | for (double[] score : scores) {
73 | Arrays.fill(score, max);
74 | }
75 | Set<Integer> responseSet = response.keySet();
76 | Iterator<Integer> responseIt = responseSet.iterator();
77 | int i = 0;
78 | int j = 0;
79 | while (responseIt.hasNext()) {
80 | CorefCluster responseCluster = response.get(responseIt.next());
81 | j = 0;
82 | Set<Integer> referenceSet = reference.keySet();
83 | Iterator<Integer> referenceIt = referenceSet.iterator();
84 | while (referenceIt.hasNext()) {
85 | CorefCluster referenceCluster = reference.get(referenceIt.next());
86 | scores[j][i] = max - similarity(responseCluster, referenceCluster); // turn similarity into a cost for the assignment solver
87 | j++;
88 | }
89 | i++;
90 | }
91 | 
92 | AssignmentProblem ap = new AssignmentProblem(scores);
93 | int[][] solution = ap.solve(new HungarianAlgorithm());
94 | for (i = 0; i < solution.length; i++) {
95 | if (solution[i][0] >= 0) {
96 | cost += max - scores[solution[i][0]][i]; // recover the similarity from the stored cost
97 | }
98 | }
99 | 
100 | return cost;
101 | }
102 | 
103 | /**
104 | * calculate recall according to the equation 6 in the paper
105 | */
106 | protected void calculateRecall(Document doc){
107 | Map<Integer, CorefCluster> response = doc.corefClusters;
108 | Map<Integer, CorefCluster> reference = doc.goldCorefClusters;
109 | recallNumSum = scoreHelper(reference, response);
110 | recallDenSum = scoreHelper(reference, reference);
111 | }
112 | 
113 | }
114 | 
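For intuition, the phi4 similarity in ScorerCEAF reduces to 2|R ∩ S| / (|R| + |S|) over the mention-ID sets of a response cluster R and a reference cluster S. A worked sketch on toy IDs (illustrative, not repository code; assumes java.util imports):

// response = {1, 2, 3}, reference = {2, 3, 4}; overlap = {2, 3}
// phi4 = 2 * 2 / (3 + 3) = 2/3
List<Integer> response = new ArrayList<Integer>(Arrays.asList(1, 2, 3));
List<Integer> reference = Arrays.asList(2, 3, 4);
int responseSize = response.size(), referenceSize = reference.size();
response.retainAll(reference);
double phi4 = (2.0 * response.size()) / (responseSize + referenceSize); // = 0.666...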
-------------------------------------------------------------------------------- /score/ScorerHelper.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.score;
2 | 
3 | import java.util.logging.Logger;
4 | 
5 | import edu.oregonstate.io.ResultOutput;
6 | import edu.stanford.nlp.dcoref.CorefScorer;
7 | import edu.stanford.nlp.dcoref.Document;
8 | import edu.stanford.nlp.dcoref.ScorerBCubed;
9 | import edu.stanford.nlp.dcoref.ScorerMUC;
10 | import edu.stanford.nlp.dcoref.ScorerPairwise;
11 | import edu.stanford.nlp.dcoref.SieveCoreferenceSystem;
12 | import edu.stanford.nlp.dcoref.ScorerBCubed.BCubedType;
13 | 
14 | /**
15 | * Everything related to the score functions. There are four score
16 | * metrics implemented: Pairwise, MUC, BCubed and CEAF.
17 | * 
18 | * @author Jun Xie (xie@eecs.oregonstate.edu)
19 | * 
20 | */
21 | 
22 | public class ScorerHelper {
23 | 
24 | // we evaluate the score on a specific document
25 | private Document mDocument;
26 | private Logger mLogger;
27 | private String mPath;
28 | private boolean mPostProcess;
29 | 
30 | public ScorerHelper(Document document, Logger logger, String path, boolean postProcess) {
31 | mDocument = document;
32 | mLogger = logger;
33 | mPath = path;
34 | mPostProcess = postProcess;
35 | }
36 | 
37 | /** print the score of the document, with or without post-processing */
38 | public void printScore() {
39 | if (!mPostProcess) {
40 | ResultOutput.writeTextFile(mPath, "do not postprocess the data");
41 | 
42 | CorefScorer score = new ScorerBCubed(BCubedType.Bconll);
43 | score.calculateScore(mDocument);
44 | score.printF1(mLogger, true);
45 | 
46 | CorefScorer ceafscore = new ScorerCEAF();
47 | ceafscore.calculateScore(mDocument);
48 | ceafscore.printF1(mLogger, true);
49 | 
50 | CorefScorer mucscore = new ScorerMUC();
51 | mucscore.calculateScore(mDocument);
52 | mucscore.printF1(mLogger, true);
53 | 
54 | CorefScorer pairscore = new ScorerPairwise();
55 | pairscore.calculateScore(mDocument);
56 | pairscore.printF1(mLogger, true);
57 | 
58 | // Average of MUC, B^{3} and CEAF-\phi_{4}.
59 | double conllF1 = (score.getF1() + ceafscore.getF1() + mucscore.getF1()) / 3;
60 | ResultOutput.writeTextFile(mPath, "conllF1: " + conllF1);
61 | } else {
62 | ResultOutput.writeTextFile(mPath, "do postprocess the data");
63 | SieveCoreferenceSystem.postProcessing(mDocument);
64 | 
65 | CorefScorer score = new ScorerBCubed(BCubedType.Bconll);
66 | score.calculateScore(mDocument);
67 | score.printF1(mLogger, true);
68 | 
69 | CorefScorer postmucscore = new ScorerMUC();
70 | postmucscore.calculateScore(mDocument);
71 | postmucscore.printF1(mLogger, true);
72 | 
73 | CorefScorer postpairscore = new ScorerPairwise();
74 | postpairscore.calculateScore(mDocument);
75 | postpairscore.printF1(mLogger, true);
76 | 
77 | CorefScorer ceafscore = new ScorerCEAF();
78 | ceafscore.calculateScore(mDocument);
79 | ceafscore.printF1(mLogger, true);
80 | 
81 | 
82 | // Average of MUC, B^{3} and CEAF-\phi_{4}. 
83 | double conllF1 = (score.getF1() + ceafscore.getF1() + postmucscore.getF1()) / 3; 84 | ResultOutput.writeTextFile(mPath, "conllF1: " + conllF1); 85 | } 86 | } 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /search/ISearch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.search; 2 | 3 | import edu.oregonstate.classifier.Parameter; 4 | import edu.stanford.nlp.dcoref.CorefCluster; 5 | import edu.stanford.nlp.dcoref.Document; 6 | 7 | /** 8 | * search interface, all search method need to implement this interface 9 | * 10 | * @author Jun Xie (xie@eecs.oregonstate.edu) 11 | * 12 | */ 13 | public interface ISearch { 14 | 15 | /* learn weight according to the parameter, and then print training file into phase */ 16 | public Parameter trainingBySearch(Document document, Parameter para, String phase); 17 | 18 | /* apply the learned weight to the testing document, and return the best loss state, later, we can output a terminate state for final performance */ 19 | public State testingBySearch(Document document, double[] weight, String phase, boolean outputFeature, double stoppingrate); 20 | } -------------------------------------------------------------------------------- /server/ClusterConnection.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.util.List; 7 | 8 | import org.apache.log4j.Logger; 9 | 10 | import com.jcraft.jsch.ChannelExec; 11 | import com.jcraft.jsch.JSch; 12 | import com.jcraft.jsch.Session; 13 | 14 | /** 15 | * 16 | * @author Yonglei Zheng 17 | * 18 | */ 19 | public class ClusterConnection { 20 | 21 | // print log information 22 | private static final Logger log = Logger.getLogger(ClusterConnection.class); 23 | 24 | // host name 25 | private String host; 26 | 27 | private String user; 28 | 29 | private String password; 30 | 31 | private String stdout; 32 | 33 | private String stderr; 34 | 35 | private Session session; 36 | 37 | private int exitStatus; 38 | 39 | // Millisecond: 0.001 40 | private static final long COMMAND_TIME_INTERVAL = 1000; 41 | 42 | public ClusterConnection() { 43 | this("submit-em64t-01.hpc.engr.oregonstate.edu", "xie", "88jx$85"); 44 | } 45 | 46 | public ClusterConnection(String host, String user, String password) { 47 | this.host = host; 48 | this.user = user; 49 | this.password = password; 50 | } 51 | 52 | public void connect() throws Exception { 53 | disconnect(); 54 | 55 | JSch jsch = new JSch(); 56 | session = jsch.getSession(user, host, 22); 57 | String homeDir = System.getProperty("user.home"); 58 | String knownHostPath = homeDir + File.separator + ".ssh" 59 | + File.separator + "known_hosts"; 60 | jsch.setKnownHosts(knownHostPath); 61 | // If two machines have SSH passwordless logins setup, the following 62 | // line is not needed: 63 | session.setPassword(password); 64 | session.connect(); 65 | } 66 | 67 | public void disconnect() { 68 | if (session != null) { 69 | session.disconnect(); 70 | } 71 | } 72 | 73 | @Override 74 | protected void finalize() throws Throwable { 75 | disconnect(); 76 | } 77 | 78 | public void execCommand(String cmd) throws Exception { 79 | ChannelExec channel = (ChannelExec) session.openChannel("exec"); 80 | channel.setCommand(cmd); 81 | channel.setInputStream(null); 82 | channel.setErrStream(null); 83 | InputStream in = 
channel.getInputStream();
84 | InputStream err = channel.getErrStream();
85 | stdout = "";
86 | stderr = "";
87 | channel.connect();
88 | while (true) {
89 | stdout += getRespond(in);
90 | stderr += getRespond(err);
91 | if (channel.isClosed()) {
92 | exitStatus = channel.getExitStatus();
93 | break;
94 | }
95 | }
96 | channel.disconnect();
97 | System.out.println("==========================================");
98 | System.out.println("Command '" + cmd + "' executed");
99 | System.out.println("stdout:\n" + (stdout.isEmpty() ? "[EMPTY]" : stdout));
100 | System.out.println("stderr:\n" + (stderr.isEmpty() ? "[EMPTY]" : stderr));
101 | System.out.println("exit-status: " + exitStatus);
102 | Thread.sleep(COMMAND_TIME_INTERVAL);
103 | }
104 | 
105 | public String getRespond(InputStream is) throws IOException {
106 | StringBuffer buffer = new StringBuffer();
107 | byte[] tmp = new byte[1024];
108 | while (is.available() > 0) {
109 | int i = is.read(tmp, 0, 1024);
110 | if (i < 0)
111 | break;
112 | buffer.append(new String(tmp, 0, i));
113 | }
114 | return buffer.toString().trim();
115 | }
116 | 
117 | public String getStdout() {
118 | return stdout;
119 | }
120 | 
121 | public String getStderr() {
122 | return stderr;
123 | }
124 | 
125 | public List queryJobIds() throws Exception {
126 | execCommand("qstat -u xie");
127 | return JobState.parseJobIds(stdout);
128 | }
129 | 
130 | public int submitJob(String scriptPath) throws Exception {
131 | execCommand("qsub " + scriptPath);
132 | String stdout = getStdout().trim();
133 | if (!stdout.startsWith("Your job")
134 | || !stdout.endsWith("has been submitted")) {
135 | 
136 | throw new Exception("Job cannot be submitted! script:" + scriptPath
137 | + "\nstdout:" + stdout + "\nstderr:" + stderr);
138 | }
139 | 
140 | stdout = stdout.replaceAll("Your job", "").trim();
141 | int jobId = Integer.valueOf(stdout.split("\\s+")[0]);
142 | log.info("jobId is " + jobId);
143 | return jobId;
144 | }
145 | 
146 | public void deleteJob(int jobId) throws Exception {
147 | execCommand("qdel " + jobId);
148 | }
149 | 
150 | }
151 | 
-------------------------------------------------------------------------------- /server/ExperimentArguments.java: --------------------------------------------------------------------------------
1 | package edu.oregonstate.server;
2 | 
3 | import java.lang.reflect.Field;
4 | 
5 | public class ExperimentArguments {
6 | 
7 | public String[] PROCEDURES_PROP = {"datageneration-0, lasso-1, searchlearnedweightwithoutfeature-1, resultaggregation-1"}; // dagger-3, searchlearnedweightwithoutfeature-0,
8 | // , tunemodel-6, " + "searchlearnedweightwithoutfeature-6, resultaggregation-6"
9 | //public String[] PROCEDURES_PROP = , searchtrueloss-1, learn-1, searchlearnedweightwithoutfeature-1, resultaggregation-1
10 | //datageneration-0, resultaggregation-0, lasso-1, searchlearnedweightwithoutfeature-1, resultaggregation-1
11 | public String[] EXPERIMENT_PROP = {"datageneration.goldmention, feature.atomic.names"}; // MUST be included in every experiment config file
12 | 
13 | // corpus path
14 | public String[] CORPUS_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus"}; // MUST
15 | 
16 | // CONLL scorer path MUST
17 | public String[] CONLL_SCORER_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus/scorer/v4/scorer.pl"}; // MUST
18 | 
19 | // whether the experiment is in the debug mode or the cluster mode;
20 | // used to print out detailed information, while in a real cluster
21 | // run we would like it to run faster by reducing the output
22 | 
public String[] DEBUG_PROP = {"false"}; // MUST 23 | 24 | // WORDNET path 25 | public String[] WORDNET_PROP = {"/nfs/guille/xfern/users/xie/Experiment/corpus/WordNet-3.0/dict"}; // MUST 26 | 27 | // 28 | // data generation 29 | // 30 | // within (false) or cross (true) reading data 31 | public String[] DATAGENERATION_DATASET_PROP = {"true"}; 32 | 33 | // gold mention (true) or predicted mention (false) 34 | public String[] DATAGENERATION_GOLDMENTION_PROP = { "true", "false"}; // MUST 35 | 36 | // GOLD cluster post process 37 | public String[] DATAGENERATION_POSTPROCESS_GOLD_PROP = {"false"}; 38 | 39 | // annotators used in the experiment 40 | public String[] DATAGENERATION_ANNOTATORS_PROP = {"tokenize, ssplit, pos, lemma, ner, parse, dcoref"}; // MUST 41 | 42 | // training set 43 | public String[] DATAGENERATION_TRAININGSET_PROP = {"5, 6"}; // MUST 43, 44 | //"5, 6, 8, 11, 16, 25, 30, 31, 37, 40, 43, 44" 45 | // testing set 46 | public String[] DATAGENERATION_TESTINGSET_PROP = {"10, 14"}; 47 | //"1, 2, 4, 7, 9, 10, 13, 14, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 32, 33, 34, 35, 36, 39, 41, 42, 45" 48 | 49 | //public String[] DATAGENERATION_DEVELOPMENTSET_PROP = {"3, 12, 38"}; 50 | 51 | // 52 | // search 53 | // 54 | // public String[] SEARCH_TYPE = {"searchtrueloss"}; 55 | 56 | 57 | // // best state score 58 | // public String[] BEST_STATE_PROP = {"true"}; // MUST 59 | // 60 | // // whether use all sieves or all sieves except Pronoun sieve 61 | // public String[] SIEVE_PROP = {"partial"}; 62 | // 63 | // // do training to learn a weight 64 | // public String[] DOTRAINING_PROP = {"true"}; 65 | // 66 | // // use existed weight to do testing, whether do validation or do final testing 67 | // public String[] EXISTEDWEIGHT_PROP = {"false"}; 68 | // 69 | // // classifier 70 | // public String[] CLASSIFIER_PROP = {"StructuredPerceptron"}; 71 | // public String[] CLASSIFIER_EPOCH_PROP = {"10"}; 72 | // 73 | // // cost function used, for example, linear 74 | // public String[] COSTFUNCTION_PROP = {"LinearCostFunction"}; 75 | // 76 | // // loss function used score type 77 | // public String[] LOSSFUNCTION_PROP = {"MetricLossFunction"}; 78 | // public String[] LOSSFUNCTION_SCORE_PROP = {"Pairwise"}; 79 | // 80 | // // search, its beam width, maximum step 81 | // public String[] SEARCH_PROP = {"BeamSearch"}; 82 | // public String[] SEARCH_BEAMWIDTH_PROP = {"1"}; 83 | // public String[] SEARCH_MAXIMUMSTEP_PROP = {"600"}; 84 | // 85 | // // stopping criterion (if tune, then its stopping rate) 86 | // public String[] STOPPING_CRITERION = {"none"}; 87 | // 88 | // // whether print the testing performance on training set 89 | // public String[] TRAINING_VALIDATION_PROP = {"true"}; 90 | // 91 | // // average weight or latest weight 92 | // public String[] WEIGHT_PROP = {"true"}; 93 | // 94 | // // Method configuration 95 | // public String[] METHOD_PROP = {"Dagger"}; 96 | // public String[] METHOD_FUNCTION_NUMBER_PROP = {"1", "3", "5"}; 97 | // 98 | // // use which training method to train the algorithm, Online, OnlineToBatch, Batch 99 | // public String[] TRAINING_STYLE_PROP = {"AROWOnline"}; 100 | // public String[] TRAINING_NORMALIZE_WEIGHT = {"false"}; 101 | // 102 | // // stanford preprocessing 103 | // public String[] STANFORD_PREPROCESSING = {"true"}; 104 | // 105 | // // state feature 106 | // public String[] STATE_FEATURE = {"false"}; 107 | // 108 | // // Atomic features 109 | public String[] FEATURE_ATOMIC_NAMES = {"F"}; // "N" 110 | 111 | public static void main(String[] args) { 112 | 
ExperimentArguments generator = new ExperimentArguments(); 113 | Class generatorClass = generator.getClass(); 114 | 115 | Field[] fields = generatorClass.getFields(); 116 | 117 | for (Field field : fields) { 118 | try { 119 | System.out.println(field.getName() + "--->" + field.get(generator)); 120 | } catch (Exception e) { 121 | throw new RuntimeException(e); 122 | } 123 | } 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /server/JobState.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.text.DateFormat; 4 | import java.text.SimpleDateFormat; 5 | import java.util.ArrayList; 6 | import java.util.Date; 7 | import java.util.List; 8 | 9 | public class JobState { 10 | 11 | private static final DateFormat DATE_FORMAT = new SimpleDateFormat( 12 | "MM/dd/yyyy HH:mm:ss"); 13 | 14 | private int jobId; 15 | 16 | private double prior; 17 | 18 | private String displayName; 19 | 20 | private String fullName; 21 | 22 | private String userName; 23 | 24 | private String state; 25 | 26 | private Date startTime; 27 | 28 | private String queue; 29 | 30 | private Integer slotsJaTaskId; 31 | 32 | public JobState(int jobId, double prior, String displayName, 33 | String userName, String state, Date startTime, String queue, 34 | Integer slotsJaTaskId) { 35 | this.jobId = jobId; 36 | this.prior = prior; 37 | this.displayName = displayName; 38 | this.userName = userName; 39 | this.state = state; 40 | this.startTime = startTime; 41 | this.queue = queue; 42 | this.slotsJaTaskId = slotsJaTaskId; 43 | } 44 | 45 | public static List parseJobIds(String info) throws Exception { 46 | List jobIds = new ArrayList(); 47 | if (info.isEmpty()) { 48 | return jobIds; 49 | } 50 | String[] lines = info.trim().split("\\n"); 51 | for (int i = 0; i < lines.length; ++i) { 52 | String line = lines[i].trim(); 53 | if (i == 0) { 54 | if (!"job-ID prior name user state submit/start at queue jclass slots ja-task-ID" 55 | .equals(line)) { 56 | throw new Exception("Unexpected header: " + line); 57 | } 58 | } else if (i >= 2) { 59 | // int jobId = Integer.valueOf(line.substring(0, 7).trim()); 60 | int jobId = Integer.valueOf(line.split("\\s+")[0]); 61 | jobIds.add(jobId); 62 | } 63 | } 64 | return jobIds; 65 | } 66 | 67 | @Override 68 | public String toString() { 69 | StringBuffer buffer = new StringBuffer(); 70 | buffer.append("===== Job State ====="); 71 | buffer.append("jobId: " + jobId + "\n"); 72 | buffer.append("prior: " + prior + "\n"); 73 | buffer.append("displayName: " + displayName + "\n"); 74 | buffer.append("userName: " + userName + "\n"); 75 | buffer.append("state: " + state + "\n"); 76 | buffer.append("startTime: " + DATE_FORMAT.format(startTime) + "\n"); 77 | buffer.append("queue: " + queue + "\n"); 78 | buffer.append("slotsJaTaskId: " + slotsJaTaskId + "\n"); 79 | return buffer.toString(); 80 | } 81 | 82 | public int getJobId() { 83 | return jobId; 84 | } 85 | 86 | public double getPrior() { 87 | return prior; 88 | } 89 | 90 | public String getDisplayName() { 91 | return displayName; 92 | } 93 | 94 | public String getFullName() { 95 | return fullName; 96 | } 97 | 98 | public String getUserName() { 99 | return userName; 100 | } 101 | 102 | public String getState() { 103 | return state; 104 | } 105 | 106 | public Date getStartTime() { 107 | return startTime; 108 | } 109 | 110 | public String getQueue() { 111 | return queue; 112 | } 113 | 114 | public Integer 
getSlotsJaTaskId() { 115 | return slotsJaTaskId; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /server/JobSubmit.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.File; 4 | import java.text.DateFormat; 5 | import java.text.SimpleDateFormat; 6 | import java.util.Date; 7 | 8 | /** 9 | * submit the jobs to the cluster automatically 10 | * 11 | * @author Jun Xie (xie@eecs.oregonstate.edu) 12 | * 13 | */ 14 | public class JobSubmit { 15 | 16 | public static void main(String[] args) throws Exception { 17 | String originalPath = "/nfs/guille/xfern/users/xie/Experiment/experiment/"; 18 | 19 | DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); 20 | // get the current date and time with Date() 21 | Date date = new Date(); 22 | String folderName = dateFormat.format(date); 23 | System.out.println(folderName); 24 | 25 | String folderPath = originalPath + folderName; 26 | Runtime.getRuntime().exec("chmod -R u+x " + folderPath); 27 | 28 | File corpusDir = new File(folderPath); 29 | String[] directories = corpusDir.list(); 30 | 31 | // submit the jobs 32 | for (String directory : directories) { 33 | if (directory.startsWith("Job")) continue; 34 | String simplePath = folderPath + "/" + directory + "/simple.sh"; 35 | System.out.println(simplePath); 36 | Runtime.getRuntime().exec("qsub " + simplePath); 37 | } 38 | System.out.println("done!"); 39 | } 40 | } -------------------------------------------------------------------------------- /server/Node.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.lang.reflect.Field; 4 | import java.util.*; 5 | 6 | 7 | public class Node { 8 | 9 | public List<String> configuration; 10 | 11 | public Node() { 12 | configuration = new ArrayList<String>(); 13 | } 14 | 15 | public Node(List<String> configuration) { 16 | this.configuration = configuration; 17 | } 18 | 19 | public String toString() { 20 | StringBuilder sb = new StringBuilder(); 21 | 22 | for (String element : configuration) { 23 | sb.append(element + "\n"); 24 | } 25 | 26 | return sb.toString().trim(); 27 | } 28 | 29 | public Node cat(String element) { 30 | List<String> newconfiguration = new ArrayList<String>(); 31 | for (String existing : configuration) { 32 | newconfiguration.add(existing); 33 | } 34 | newconfiguration.add(element); 35 | 36 | return new Node(newconfiguration); 37 | } 38 | 39 | public static void main(String[] args) { 40 | // get its corresponding property 41 | ExperimentProperties properties = new ExperimentProperties(); 42 | Class<?> propertyClass = properties.getClass(); 43 | Field[] propertyFields = propertyClass.getFields(); 44 | Map<String, String> propertyMap = new HashMap<String, String>(); 45 | for (Field field : propertyFields) { 46 | try { 47 | propertyMap.put(field.getName(), field.get(properties).toString()); 48 | } catch (Exception e) { 49 | throw new RuntimeException(e); 50 | } 51 | } 52 | 53 | // get its specific arguments 54 | ExperimentArguments arguments = new ExperimentArguments(); 55 | Class<?> argumentClass = arguments.getClass(); 56 | Field[] argumentFields = argumentClass.getFields(); 57 | List<Map<String, List<String>>> argumentMap = new ArrayList<Map<String, List<String>>>(); 58 | for (Field field : argumentFields) { 59 | try { 60 | List<String> argument = Arrays.asList((String[]) field.get(arguments)); 61 | Map<String, List<String>> specificArgument = new HashMap<String, List<String>>(); 62 | specificArgument.put(field.getName(), argument); 63 | argumentMap.add(specificArgument); 64 | } catch (Exception e) { 65 |
throw new RuntimeException(e); 66 | } 67 | } 68 | 69 | int length = argumentMap.size(); 70 | List<Node> combinations = new ArrayList<Node>(); 71 | Queue<Node> queue = new LinkedList<Node>(); 72 | Node initialNode = new Node(); 73 | queue.offer(initialNode); 74 | 75 | int index = 0; // index of the argument list currently being expanded (breadth-first cross product over all argument lists) 76 | while (queue.size() > 0) { 77 | Node node = queue.poll(); 78 | 79 | if (index == length) { 80 | break; 81 | } 82 | 83 | Map<String, List<String>> array = argumentMap.get(index); 84 | for (String key : array.keySet()) { 85 | String configKey = propertyMap.get(key); 86 | 87 | List<String> elements = array.get(key); 88 | 89 | for (String element : elements) { 90 | Node child = node.cat(configKey + " = " + element); 91 | queue.offer(child); 92 | if (child.configuration.size() == length) { 93 | combinations.add(child); 94 | } 95 | } 96 | } 97 | 98 | if (allSameLength(queue)) { // every partial configuration has been extended with the current argument, so move on to the next one 99 | index += 1; 100 | } 101 | 102 | } 103 | 104 | System.out.println("done"); 105 | } 106 | 107 | private static boolean allSameLength(Queue<Node> queue) { 108 | Set<Integer> lengths = new HashSet<Integer>(); 109 | Iterator<Node> iterator = queue.iterator(); 110 | while(iterator.hasNext()) { 111 | Node node = iterator.next(); 112 | lengths.add(node.configuration.size()); 113 | } 114 | 115 | return lengths.size() == 1; 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /server/ResultAggregation.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.server; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.util.Properties; 6 | 7 | import edu.oregonstate.dataset.TopicGeneration; 8 | import edu.oregonstate.experiment.ExperimentConstructor; 9 | import edu.oregonstate.io.ResultOutput; 10 | import edu.oregonstate.util.Command; 11 | import edu.oregonstate.util.EecbConstants; 12 | import edu.stanford.nlp.util.StringUtils; 13 | 14 | /** 15 | * aggregate the results created by different jobs. For example, 16 | * during the final testing different jobs run on different topics, 17 | * so aggregating the results produced by those jobs yields the final 18 | * result, which is output to the experiment logFile 19 | * 20 | * @author Jun Xie (xie@eecs.oregonstate.edu) 21 | * 22 | */ 23 | public class ResultAggregation extends ExperimentConstructor { 24 | 25 | /** phase, for example the second round */ 26 | private final String phaseIndex; 27 | 28 | /** conll result */ 29 | private final String conllResultPath; 30 | 31 | /** corefCluster */ 32 | private final String[] COREFCLUSTER = {"goldCorefCluster", "predictedCorefCluster"}; 33 | 34 | public ResultAggregation(Properties props) { 35 | super(props); 36 | 37 | phaseIndex = props.getProperty(EecbConstants.PHASE_PROP, "0"); 38 | 39 | conllResultPath = experimentFolder + "/conll/" + phaseIndex; 40 | } 41 | 42 | /** 43 | * perform the result aggregation 44 | */ 45 | public void performExperiment() { 46 | TopicGeneration topicGenerator = new TopicGeneration(experimentProps); 47 | 48 | String[] trainingTopics = topicGenerator.trainingTopics(); 49 | calculatePerformance(trainingTopics, "trainingtopic"); 50 | 51 | String[] testingTopics = topicGenerator.testingTopics(); 52 | calculatePerformance(testingTopics, "testingtopic"); 53 | 54 | String[] developmentTopics = topicGenerator.developmentTopics(); 55 | calculatePerformance(developmentTopics, "developmenttopic"); 56 | } 57 | 58 | /** 59 | * calculate the performance on the entire set; 60 | * because each file is processed independently, the per-topic results are also generated independently 61 | *
62 | * @param topics 63 | * @param set 64 | */ 65 | private void calculatePerformance(String[] topics, String set) { 66 | if (topics == null) { 67 | return; 68 | } 69 | 70 | // whether all per-topic output files exist on disk 71 | boolean fileExist = true; 72 | 73 | String appendPhaseIndex = ""; 74 | if (!phaseIndex.equals("0")) { 75 | appendPhaseIndex = phaseIndex + "-"; 76 | } 77 | for (String resultType : COREFCLUSTER) { 78 | String outputPath = conllResultPath + "/" + resultType + "-" + phaseIndex + "-" + set; 79 | for (String topic : topics) { 80 | String topicPath = conllResultPath + "/" + resultType + "-" + appendPhaseIndex + set + "-" + topic; 81 | 82 | if (!Command.fileExists(topicPath)) { 83 | fileExist = false; 84 | break; 85 | } 86 | 87 | try { 88 | BufferedReader br = new BufferedReader(new FileReader(topicPath)); 89 | String currentLine = ""; 90 | while ((currentLine = br.readLine()) != null) { 91 | ResultOutput.writeTextFile(outputPath, currentLine); 92 | } 93 | 94 | br.close(); 95 | } catch (Exception e) { 96 | throw new RuntimeException(e); 97 | } 98 | } 99 | } 100 | 101 | // CoNLL scoring 102 | if (fileExist) { 103 | String goldCorefCluster = conllResultPath + "/goldCorefCluster-" + phaseIndex + "-" + set; 104 | String predictedCorefCluster = conllResultPath + "/predictedCorefCluster-" + phaseIndex + "-" + set; 105 | double[] finalScores = ResultOutput.printCorpusResult(experimentLogFile, goldCorefCluster, predictedCorefCluster, "the phase-" + phaseIndex + " model's performance on " + set); 106 | ResultOutput.writeTextFile(experimentFolder + "/" + set + ".csv", finalScores[0] + "\t" + finalScores[1] + "\t" + finalScores[2] + "\t" + finalScores[3] + "\t" + finalScores[4]); 107 | } 108 | } 109 | 110 | public static void main(String[] args) { 111 | if (args.length > 1) { 112 | System.out.println("too many parameters; you can specify only one path parameter"); 113 | System.exit(1); 114 | } 115 | 116 | if (args.length == 0) { 117 | // run the experiment on the local machine for debugging 118 | args = new String[1]; 119 | args[0] = "/nfs/guille/xfern/users/xie/Experiment/experiment/2013-04-23/0-experiment/0-resultaggregation-config.properties"; 120 | } 121 | 122 | String[] propArgs = new String[]{"-props", args[0]}; 123 | 124 | Properties props = StringUtils.argsToProperties(propArgs); 125 | ExperimentConstructor resultAggregator = new ResultAggregation(props); 126 | resultAggregator.performExperiment(); 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /server/pipeline.properties: -------------------------------------------------------------------------------- 1 | # procedures pipeline for the whole experiment 2 | procedures = datageneration 3 | # , searchtrueloss, learn, dagger-3, searchlearnedweightwithoutfeature 4 | 5 | # name of the experiment 6 | experiment = goldmention, method.function.number, feature.atomic.names 7 | 8 | # corpus directory 9 | corpus = /scratch/JavaFile/corpus 10 | 11 | # scorer path 12 | conll.scorer = /nfs/guille/xfern/users/xie/Experiment/corpus/scorer/v4/scorer.pl 13 | 14 | # debug case 15 | debug = true 16 | 17 | # wordnet 18 | wordnet = /nfs/guille/xfern/users/xie/Experiment/corpus/WordNet-3.0/dict 19 | 20 | 21 | # the properties of datageneration 22 | # set generation format: WithinCross (false), CrossTopic (true) 23 | datageneration.dataset = true 24 | datageneration.goldmention = false 25 | datageneration.postprocess.gold = true 26 | datageneration.trainingset = 6, 16 27 |
datageneration.testingset = 20, 38 28 | # annotators for Stanford CoreNLP 29 | datageneration.annotators = tokenize, ssplit, pos, lemma, ner, parse, dcoref 30 | 31 | # the properties of search 32 | -------------------------------------------------------------------------------- /training/AROWOnline.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * AROW Implementation based on Online training style 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class AROWOnline extends ITraining { 17 | 18 | /** 19 | * implement the online training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | double[][] finalVariance = finalParameter.getVariance(); 31 | 32 | for (String path : paths) { 33 | List<List<List<String>>> dataset = reader.readData(path); 34 | List<List<String>> goodDataset = dataset.get(0); 35 | List<List<String>> badDataset = dataset.get(1); 36 | 37 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 38 | 39 | for (int i = 0; i < randomLists.size(); i++){ 40 | int index = randomLists.get(i); 41 | 42 | List<String> goodRecords = goodDataset.get(index); 43 | List<String> badRecords = badDataset.get(index); 44 | // get the data 45 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 46 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 47 | 48 | if (!incorporateZeroVector) { 49 | if (reader.isAllZero(goodStates)) continue; 50 | } 51 | 52 | // form constraint 53 | for (State<CorefCluster> goodState : goodStates) { 54 | for (State<CorefCluster> badState : badStates) { 55 | numberOfInstance += 1; 56 | 57 | // if loss scores are equal, do not consider this kind of constraint 58 | double gLossScore = goodState.getF1Score(); 59 | double bLossScore = badState.getF1Score(); 60 | if (gLossScore == bLossScore) { 61 | continue; 62 | } 63 | 64 | // get the features of good state and bad state 65 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 66 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 67 | 68 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 69 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 70 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 71 | violation += 1; 72 | } 73 | 74 | double[] feature = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 75 | double margin = 0.0; 76 | if (enableNormalizeWeight) { 77 | double[] normalizedWeight = DoubleOperation.normalize(finalWeight); 78 | margin = DoubleOperation.time(normalizedWeight, feature); 79 | } else { 80 | margin = DoubleOperation.time(finalWeight, feature); 81 | } 82 | 83 | if (margin < 1) { 84 | double beta = 1 / ( DoubleOperation.transformation(feature, finalVariance) + mHyperParameter ); // AROW confidence: beta = 1 / (x' Sigma x + r), with r the hyper-parameter 85 | double alpha = Math.max(0, beta * (1 - DoubleOperation.time(feature, finalWeight))); // hinge coefficient of the weight update 86 | double constant = alpha; 87 | double[] delta =
DoubleOperation.time(DoubleOperation.matrixTime(finalVariance, feature), constant); 88 | boolean zeroVector = DoubleOperation.isZeroVector(delta); 89 | 90 | // update the weight and variance 91 | if (!zeroVector) { 92 | finalWeight = DoubleOperation.add(finalWeight, delta); 93 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 94 | 95 | double[] sumX = DoubleOperation.matrixTime(finalVariance, feature); 96 | double[][] sumXX = DoubleOperation.vectorProduct(sumX, feature); 97 | double[][] betaSumXX = DoubleOperation.time(sumXX, beta); 98 | double[][] betaSumXXSum = DoubleOperation.time(betaSumXX, finalVariance); 99 | finalVariance = DoubleOperation.matrixMinus(finalVariance, betaSumXXSum); // AROW variance update: Sigma = Sigma - beta * Sigma x x' Sigma 100 | } 101 | } 102 | 103 | } 104 | } 105 | } 106 | } 107 | 108 | return new Parameter(finalWeight, finalVariance, finalTotalWeight, violation, numberOfInstance); 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /training/AROWOnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * AROW Implementation based on OnlineToBatch training style 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class AROWOnlineToBatch extends ITraining { 17 | 18 | /** 19 | * implement the online-to-batch training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | double[][] finalVariance = finalParameter.getVariance(); 31 | 32 | for (String path : paths) { 33 | List<List<List<String>>> dataset = reader.readData(path); 34 | List<List<String>> goodDataset = dataset.get(0); 35 | List<List<String>> badDataset = dataset.get(1); 36 | 37 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 38 | 39 | for (int i = 0; i < randomLists.size(); i++){ 40 | int index = randomLists.get(i); 41 | 42 | List<String> goodRecords = goodDataset.get(index); 43 | List<String> badRecords = badDataset.get(index); 44 | // get the data 45 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 46 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 47 | 48 | if (!incorporateZeroVector) { 49 | if (reader.isAllZero(goodStates)) continue; 50 | } 51 | 52 | // fix the weight and variance for the current batch 53 | double[] fixedWeight = new double[length]; 54 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 55 | double[][] fixedVariance = new double[length][length]; 56 | for (int row = 0; row < length; row++) { 57 | System.arraycopy(finalVariance[row], 0, fixedVariance[row], 0, length); 58 | } 59 | 60 | // form constraint 61 | for (State<CorefCluster> goodState : goodStates) { 62 | for (State<CorefCluster> badState : badStates) { 63 | numberOfInstance += 1; 64 | 65 | // if loss scores are equal, do not consider this kind of constraint 66 | double gLossScore = goodState.getF1Score(); 67 | double bLossScore = badState.getF1Score(); 68 | if (gLossScore == bLossScore) { 69 | continue; 70 | } 71 | 72 | // get the features of good state and bad state 73 |
double[] gNumericalFeatures = goodState.getNumericalFeatures(); 74 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 75 | 76 | // calculate the number of violated constraints 77 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 78 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 79 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 80 | violation += 1; 81 | } 82 | 83 | double[] feature = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 84 | double margin = DoubleOperation.time(fixedWeight, feature); 85 | if (margin < 1) { 86 | double beta = 1 / ( DoubleOperation.transformation(feature, fixedVariance) + mHyperParameter ); 87 | double alpha = Math.max(0, beta * (1 - DoubleOperation.time(feature, fixedWeight))); 88 | double constant = alpha; 89 | double[] delta = DoubleOperation.time(DoubleOperation.matrixTime(fixedVariance, feature), constant); 90 | boolean zeroVector = DoubleOperation.isZeroVector(delta); 91 | 92 | // update the weight and variance 93 | if (!zeroVector) { 94 | // update the weight 95 | finalWeight = DoubleOperation.add(finalWeight, delta); 96 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 97 | 98 | double[] sumX = DoubleOperation.matrixTime(fixedVariance, feature); 99 | double[][] sumXX = DoubleOperation.vectorProduct(sumX, feature); 100 | double[][] betaSumXX = DoubleOperation.time(sumXX, beta); 101 | double[][] betaSumXXSum = DoubleOperation.time(betaSumXX, fixedVariance); 102 | 103 | // update the variance 104 | finalVariance = DoubleOperation.matrixMinus(finalVariance, betaSumXXSum); 105 | } 106 | } 107 | 108 | } 109 | } 110 | } 111 | } 112 | 113 | return new Parameter(finalWeight, finalVariance, finalTotalWeight, violation, numberOfInstance); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /training/Batch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class Batch extends ITraining { 11 | 12 | /** 13 | * implement the batch training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | double[] previousTotalWeight = para.getTotalWeight(); 18 | int violations = para.getNoOfViolation(); 19 | int numberOfInstance = 0; 20 | 21 | double[] delta = new double[length]; 22 | double[] totalDelta = new double[length]; 23 | for (String path : paths) { 24 | List<List<List<String>>> dataset = reader.readData(path); 25 | List<List<String>> goodDataset = dataset.get(0); 26 | List<List<String>> badDataset = dataset.get(1); 27 | 28 | for (int index = 0; index < goodDataset.size(); index++){ 29 | List<String> goodRecords = goodDataset.get(index); 30 | List<String> badRecords = badDataset.get(index); 31 | 32 | // get the data 33 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 34 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 35 | 36 | if (!incorporateZeroVector) { 37 | if (reader.isAllZero(goodStates)) continue; 38 | } 39 | 40 | // form constraint 41 | for (State<CorefCluster> goodState : goodStates) { 42 | for (State<CorefCluster> badState : badStates) { 43 | numberOfInstance += 1; 44 | 45 | // if loss scores are equal, do not consider this kind of constraint 46 |
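// (when the two states have identical F1 scores the pair provides no
//  ranking signal, so no constraint, and hence no update, is generated)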
double gLossScore = goodState.getF1Score(); 47 | double bLossScore = badState.getF1Score(); 48 | if (gLossScore == bLossScore) { 49 | continue; 50 | } 51 | 52 | // get the features of good state and bad state 53 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 54 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 55 | 56 | // calculate the action score of good state and bad state 57 | double goodCostScoreForUpdating = DoubleOperation.time(previousWeight, gNumericalFeatures); 58 | double badCostScoreForUpdating = DoubleOperation.time(previousWeight, bNumericalFeatures); 59 | 60 | // violated current constraint 61 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 62 | violations += 1; 63 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 64 | delta = DoubleOperation.add(delta, direction); 65 | totalDelta = DoubleOperation.add(totalDelta, delta); 66 | } 67 | } 68 | } 69 | } 70 | } 71 | 72 | double[] weightedDelta = DoubleOperation.time(delta, learningRate); // apply the update accumulated over all violated constraints once, scaled by the learning rate 73 | double[] weightedTotalDelta = DoubleOperation.time(totalDelta, learningRate); 74 | 75 | double[] currentWeight = DoubleOperation.add(previousWeight, weightedDelta); 76 | double[] currentTotalWeight = DoubleOperation.add(previousTotalWeight, weightedTotalDelta); 77 | 78 | return new Parameter(currentWeight, para.getVariance(), currentTotalWeight, violations, numberOfInstance); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /training/ITraining.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | import java.util.Properties; 7 | 8 | import edu.oregonstate.classifier.Parameter; 9 | import edu.oregonstate.experiment.ExperimentConstructor; 10 | import edu.oregonstate.features.FeatureFactory; 11 | import edu.oregonstate.general.DoubleOperation; 12 | import edu.oregonstate.io.LargetFileReading; 13 | import edu.oregonstate.util.EecbConstants; 14 | 15 | /** 16 | * whether to incorporate the negative instance, according to the paper: Tuning as Ranking 17 | * 18 | * @author Jun Xie (xie@eecs.oregonstate.edu) 19 | * 20 | */ 21 | public abstract class ITraining { 22 | 23 | /** the length of the weight */ 24 | protected final int length; 25 | 26 | /** large file reader */ 27 | protected final LargetFileReading reader; 28 | 29 | /** whether to incorporate the zero good state */ 30 | protected final boolean incorporateZeroVector; 31 | 32 | /** hyper-parameter for AROW */ 33 | protected final double mHyperParameter; 34 | 35 | /** whether to enable the PA learning rate loss score */ 36 | private final boolean enablePALearningRateLossScore; 37 | 38 | /** enable discrepancy */ 39 | private final boolean enablePADiscrepancy; 40 | 41 | /** enable margin */ 42 | private final boolean enablePAMargin; 43 | 44 | /** enable normalizing the weight */ 45 | protected final boolean enableNormalizeWeight; 46 | 47 | public ITraining() { 48 | Properties mProps = ExperimentConstructor.experimentProps; 49 | length = FeatureFactory.getFeatureTemplate().size(); 50 | reader = new LargetFileReading(); 51 | incorporateZeroVector = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_INCORPORATE_ZERO_CASE, "true")); 52 | enablePALearningRateLossScore = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_RATE_LOSSSCORE, "true")); 53 |
enablePADiscrepancy = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_DISCREPANCY, "true")); 54 | enablePAMargin = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_PA_MARGIN, "true")); 55 | enableNormalizeWeight = Boolean.parseBoolean(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_NORMALIZE_WEIGHT, "true")); 56 | mHyperParameter = Double.parseDouble(mProps.getProperty(EecbConstants.CLASSIFIER_TRAINING_HYPERPARAMETER, "1.0")); 57 | } 58 | 59 | /* different weight update styles, including Batch, Online and OnlineToBatch */ 60 | public abstract Parameter train(List<String> paths, Parameter para, double learningRate); 61 | 62 | /** 63 | * create a randomly shuffled list of the indices 0 .. size-1 64 | * 65 | * @param size 66 | * @return 67 | */ 68 | protected List<Integer> createRandomIndex(int size) { 69 | List<Integer> arrays = new ArrayList<Integer>(); 70 | for (int i = 0; i < size; i++) { 71 | arrays.add(i); 72 | } 73 | 74 | Collections.shuffle(arrays); 75 | return arrays; 76 | } 77 | 78 | /** 79 | * calculate the PA loss 80 | * 81 | * @param gLossScore 82 | * @param bLossScore 83 | * @param gNumericalFeatures 84 | * @param bNumericalFeatures 85 | * @param weight 86 | * @return 87 | */ 88 | protected double calculatePALoss(double gLossScore, double bLossScore, double[] gNumericalFeatures, 89 | double[] bNumericalFeatures, double[] weight) { 90 | double loss = 0.0; // PA loss = (optional margin term) + (optional discrepancy term: w*x_bad - w*x_good) 91 | 92 | // calculate margin 93 | if (enablePAMargin) { 94 | double margin = 1.0; 95 | if (enablePALearningRateLossScore) { 96 | margin = gLossScore - bLossScore; 97 | } 98 | loss += margin; 99 | } 100 | 101 | // calculate the discrepancy 102 | if (enablePADiscrepancy) { 103 | double[] weightForCalculatingCost = null; 104 | if (enableNormalizeWeight) { 105 | weightForCalculatingCost = DoubleOperation.normalize(weight); 106 | } else { 107 | weightForCalculatingCost = weight; 108 | } 109 | 110 | double bCostScore = DoubleOperation.time(bNumericalFeatures, weightForCalculatingCost); 111 | double gCostScore = DoubleOperation.time(gNumericalFeatures, weightForCalculatingCost); 112 | loss += bCostScore - gCostScore; 113 | } 114 | 115 | return loss; 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /training/Online.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class Online extends ITraining { 11 | 12 | /** 13 | * implement the online training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | int violation = para.getNoOfViolation(); 18 | int numberOfInstance = 0; 19 | 20 | // used to update the weight 21 | Parameter finalParameter = para.makeCopy(); 22 | double[] finalWeight = finalParameter.getWeight(); 23 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 24 | 25 | for (String path : paths) { 26 | List<List<List<String>>> dataset = reader.readData(path); 27 | List<List<String>> goodDataset = dataset.get(0); 28 | List<List<String>> badDataset = dataset.get(1); 29 | 30 | for (int index = 0; index < goodDataset.size(); index++){ 31 | List<String> goodRecords = goodDataset.get(index); 32 | List<String> badRecords = badDataset.get(index); 33 | // get the data 34 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 35 | List<State<CorefCluster>> badStates =
reader.processString(badRecords); 36 | 37 | if (!incorporateZeroVector) { 38 | if (reader.isAllZero(goodStates)) continue; 39 | } 40 | 41 | // form constraint 42 | for (State<CorefCluster> goodState : goodStates) { 43 | for (State<CorefCluster> badState : badStates) { 44 | numberOfInstance += 1; 45 | 46 | // if loss scores are equal, do not consider this kind of constraint 47 | double gLossScore = goodState.getF1Score(); 48 | double bLossScore = badState.getF1Score(); 49 | if (gLossScore == bLossScore) { 50 | continue; 51 | } 52 | 53 | // get the features of good state and bad state 54 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 55 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 56 | 57 | // calculate the action score of good state and bad state 58 | double goodCostScoreForUpdating = DoubleOperation.time(finalWeight, gNumericalFeatures); 59 | double badCostScoreForUpdating = DoubleOperation.time(finalWeight, bNumericalFeatures); 60 | 61 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 62 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 63 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 64 | violation += 1; 65 | } 66 | 67 | // violated current constraint 68 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 69 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 70 | double[] term = DoubleOperation.time(direction, learningRate); 71 | finalWeight = DoubleOperation.add(finalWeight, term); 72 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 73 | } 74 | } 75 | } 76 | } 77 | } 78 | 79 | return new Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /training/OnlinePA.txt: -------------------------------------------------------------------------------- 1 | from https://cwiki.apache.org/confluence/display/MAHOUT/Online+Passive+Aggressive 2 | 3 | Data must be shuffled and normalized either between 0..1 or by mean and standard deviation. 4 | 5 | Technical details: 6 | 7 | The training approach taken is to minimize the ranking loss of the correct label vs the incorrect ones. We define this loss as hinge(1 - correct label score + wrong label score) where wrong label score is the score of the highest scoring label that is not the correct label. The hinge function is hinge = x if x > 0, 0 otherwise. 8 | 9 | Parameters: 10 | 11 | There is only one - learningRate. You set it to a larger number to converge faster, or a smaller number to be more cautious. The normal way to use it is via cross validation. Good values are (0.1, 1.0, 10.0).
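A minimal sketch of the hinge computation described above (illustrative example only, not a file in this repository; the method name is hypothetical):

    // hinge(1 - correctScore + wrongScore): positive exactly when the correct
    // label fails to beat the best wrong label by a margin of 1
    static double rankingHingeLoss(double correctScore, double wrongScore) {
        double x = 1 - correctScore + wrongScore;
        return x > 0 ? x : 0;   // hinge = x if x > 0, 0 otherwise
    }
    // e.g. rankingHingeLoss(0.9, 0.4) == 0.5 and rankingHingeLoss(2.0, 0.5) == 0.0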
12 | -------------------------------------------------------------------------------- /training/OnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | public class OnlineToBatch extends ITraining { 11 | 12 | /** 13 | * implement the online-to-batch training 14 | */ 15 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 16 | double[] previousWeight = para.getWeight(); 17 | int violation = para.getNoOfViolation(); 18 | int numberOfInstance = 0; 19 | 20 | // used to update the weight 21 | Parameter finalParameter = para.makeCopy(); 22 | double[] finalWeight = finalParameter.getWeight(); 23 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 24 | 25 | for (String path : paths) { 26 | List<List<List<String>>> dataset = reader.readData(path); 27 | List<List<String>> goodDataset = dataset.get(0); 28 | List<List<String>> badDataset = dataset.get(1); 29 | 30 | // shuffle the data again 31 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 32 | 33 | for (int i = 0; i < randomLists.size(); i++){ 34 | int index = randomLists.get(i); 35 | List<String> goodRecords = goodDataset.get(index); 36 | List<String> badRecords = badDataset.get(index); 37 | // get the data 38 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 39 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 40 | 41 | if (!incorporateZeroVector) { 42 | if (reader.isAllZero(goodStates)) continue; 43 | } 44 | 45 | // fix the weight for the current batch 46 | double[] fixedWeight = new double[length]; 47 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 48 | 49 | // form constraint 50 | for (State<CorefCluster> goodState : goodStates) { 51 | for (State<CorefCluster> badState : badStates) { 52 | numberOfInstance += 1; 53 | 54 | // if loss scores are equal, do not consider this kind of constraint 55 | double gLossScore = goodState.getF1Score(); 56 | double bLossScore = badState.getF1Score(); 57 | if (gLossScore == bLossScore) { 58 | continue; 59 | } 60 | 61 | // get the features of good state and bad state 62 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 63 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 64 | 65 | // calculate the action score of good state and bad state 66 | double goodCostScoreForUpdating = DoubleOperation.time(fixedWeight, gNumericalFeatures); 67 | double badCostScoreForUpdating = DoubleOperation.time(fixedWeight, bNumericalFeatures); 68 | 69 | // calculate the number of violations 70 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 71 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 72 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 73 | violation += 1; 74 | } 75 | 76 | // violated current constraint 77 | if (goodCostScoreForUpdating <= badCostScoreForUpdating) { 78 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 79 | if (DoubleOperation.isAllZero(direction)) continue; 80 | 81 | //ResultOutput.writeTextFile(ExperimentConstructor.logFile, "learning rate : " + learningRate); 82 | double[] term = DoubleOperation.time(direction, learningRate); 83 | finalWeight = DoubleOperation.add(finalWeight, term); 84 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 85 | } 86 | } 87 | }
88 | } 89 | } 90 | 91 | return new Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 92 | } 93 | 94 | } -------------------------------------------------------------------------------- /training/PAOnline.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * use the PA algorithm to update the learned weight, in the Online mode 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class PAOnline extends ITraining { 17 | 18 | /** 19 | * implement the online training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | 31 | for (String path : paths) { 32 | List<List<List<String>>> dataset = reader.readData(path); 33 | List<List<String>> goodDataset = dataset.get(0); 34 | List<List<String>> badDataset = dataset.get(1); 35 | 36 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 37 | 38 | for (int i = 0; i < randomLists.size(); i++){ 39 | int index = randomLists.get(i); 40 | 41 | List<String> goodRecords = goodDataset.get(index); 42 | List<String> badRecords = badDataset.get(index); 43 | // get the data 44 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 45 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 46 | 47 | if (!incorporateZeroVector) { 48 | if (reader.isAllZero(goodStates)) continue; 49 | } 50 | 51 | // form constraint 52 | for (State<CorefCluster> goodState : goodStates) { 53 | for (State<CorefCluster> badState : badStates) { 54 | numberOfInstance += 1; 55 | 56 | // if loss scores are equal, do not consider this kind of constraint 57 | double gLossScore = goodState.getF1Score(); 58 | double bLossScore = badState.getF1Score(); 59 | if (gLossScore == bLossScore) { 60 | continue; 61 | } 62 | 63 | // get the features of good state and bad state 64 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 65 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 66 | 67 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 68 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 69 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 70 | violation += 1; 71 | } 72 | 73 | // calculate the loss 74 | double loss = calculatePALoss(gLossScore, bLossScore, gNumericalFeatures, bNumericalFeatures, finalWeight); 75 | if (loss > 0) { 76 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 77 | 78 | if (DoubleOperation.isAllZero(direction)) continue; 79 | 80 | double directionNorm = DoubleOperation.calculateTwoNorm(direction); 81 | double tau = loss / directionNorm; // PA step size 82 | // ResultOutput.writeTextFile(ExperimentConstructor.logFile, "tau : " + tau); 83 | double[] term = DoubleOperation.time(direction, tau); 84 | finalWeight = DoubleOperation.add(finalWeight, term); 85 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 86 | } 87 | } 88 | } 89 | } 90 | } 91 | 92 | return new
Parameter(finalWeight, para.getVariance(), finalTotalWeight, violation, numberOfInstance); 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /training/PAOnlineToBatch.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.training; 2 | 3 | import java.util.List; 4 | 5 | import edu.oregonstate.classifier.Parameter; 6 | import edu.oregonstate.general.DoubleOperation; 7 | import edu.oregonstate.search.State; 8 | import edu.stanford.nlp.dcoref.CorefCluster; 9 | 10 | /** 11 | * PA using the OnlineToBatch mode 12 | * 13 | * @author Jun Xie (xie@eecs.oregonstate.edu) 14 | * 15 | */ 16 | public class PAOnlineToBatch extends ITraining { 17 | 18 | /** 19 | * implement the online-to-batch training 20 | */ 21 | public Parameter train(List<String> paths, Parameter para, double learningRate) { 22 | double[] previousWeight = para.getWeight(); 23 | int violation = para.getNoOfViolation(); 24 | int numberOfInstance = 0; 25 | 26 | // used to update the weight 27 | Parameter finalParameter = para.makeCopy(); 28 | double[] finalWeight = finalParameter.getWeight(); 29 | double[] finalTotalWeight = finalParameter.getTotalWeight(); 30 | 31 | for (String path : paths) { 32 | List<List<List<String>>> dataset = reader.readData(path); 33 | List<List<String>> goodDataset = dataset.get(0); 34 | List<List<String>> badDataset = dataset.get(1); 35 | 36 | List<Integer> randomLists = createRandomIndex(goodDataset.size()); 37 | 38 | for (int i = 0; i < randomLists.size(); i++){ 39 | int index = randomLists.get(i); 40 | List<String> goodRecords = goodDataset.get(index); 41 | List<String> badRecords = badDataset.get(index); 42 | // get the data 43 | List<State<CorefCluster>> goodStates = reader.processString(goodRecords); 44 | List<State<CorefCluster>> badStates = reader.processString(badRecords); 45 | 46 | if (!incorporateZeroVector) { 47 | if (reader.isAllZero(goodStates)) continue; 48 | } 49 | 50 | double[] fixedWeight = new double[length]; 51 | System.arraycopy(finalWeight, 0, fixedWeight, 0, length); 52 | 53 | // form constraint 54 | for (State<CorefCluster> goodState : goodStates) { 55 | for (State<CorefCluster> badState : badStates) { 56 | numberOfInstance += 1; 57 | 58 | // if loss scores are equal, do not consider this kind of constraint 59 | double gLossScore = goodState.getF1Score(); 60 | double bLossScore = badState.getF1Score(); 61 | if (gLossScore == bLossScore) { 62 | continue; 63 | } 64 | 65 | // get the features of good state and bad state 66 | double[] gNumericalFeatures = goodState.getNumericalFeatures(); 67 | double[] bNumericalFeatures = badState.getNumericalFeatures(); 68 | 69 | double goodCostScoreForCounting = DoubleOperation.time(previousWeight, gNumericalFeatures); 70 | double badCostScoreForCounting = DoubleOperation.time(previousWeight, bNumericalFeatures); 71 | if (goodCostScoreForCounting <= badCostScoreForCounting) { 72 | violation += 1; 73 | } 74 | 75 | // calculate the loss 76 | double loss = calculatePALoss(gLossScore, bLossScore, gNumericalFeatures, bNumericalFeatures, fixedWeight); 77 | if (loss > 0) { 78 | double[] direction = DoubleOperation.minus(gNumericalFeatures, bNumericalFeatures); 79 | 80 | if (DoubleOperation.isAllZero(direction)) continue; 81 | 82 | double directionNorm = DoubleOperation.calculateTwoNorm(direction); 83 | double tau = loss / directionNorm; 84 | double[] term = DoubleOperation.time(direction, tau); 85 | finalWeight = DoubleOperation.add(finalWeight, term); 86 | finalTotalWeight = DoubleOperation.add(finalTotalWeight, finalWeight); 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | return new Parameter(finalWeight,
para.getVariance(), finalTotalWeight, violation, numberOfInstance); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /tuning/TuningFactory.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.tuning; 2 | 3 | /** 4 | * tune the parameters, or choose which of the models generated by the Dagger framework is best, by decoding 5 | * on the development set 6 | * 7 | * @author Jun Xie (xie@eecs.oregonstate.edu) 8 | * 9 | */ 10 | public class TuningFactory { 11 | 12 | 13 | } -------------------------------------------------------------------------------- /util/Command.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | 6 | import edu.oregonstate.search.State; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | import edu.stanford.nlp.dcoref.Mention; 10 | import edu.stanford.nlp.util.SystemUtils; 11 | 12 | /** 13 | * utility commands used for creating files, directories and similar tasks 14 | * 15 | * @author Jun Xie (xie@eecs.oregonstate.edu) 16 | * 17 | */ 18 | public class Command { 19 | 20 | // create a directory given a path string 21 | public static void mkdir(String path) { 22 | if (!fileExists(path)) { 23 | String command = "mkdir " + path; 24 | execCommand(command.split(" ")); 25 | } 26 | } 27 | 28 | // execute the Unix command 29 | public static void execCommand(String... command) { 30 | try { 31 | ProcessBuilder ps = new ProcessBuilder(command); 32 | SystemUtils.run(ps); 33 | } catch (Exception e) { 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | // execute the chmod command for a whole folderPath 39 | public static void chmod(String folderPath) { 40 | String command = "chmod -R u+x " + folderPath; 41 | execCommand(command.split(" ")); 42 | } 43 | 44 | /** 45 | * delete the whole directory 46 | * 47 | * @param directoryName 48 | */ 49 | public static void rmdir(String directoryName) { 50 | File directory = new File(directoryName); 51 | 52 | if (directory.exists()) { // new File(...) is never null; check existence instead 53 | String[] command = new String[] {"rm", "-rf", directoryName}; 54 | execCommand(command); 55 | } 56 | 57 | return; 58 | } 59 | 60 | /** 61 | * whether the file exists on disk 62 | * 63 | * @param filePath 64 | * @return 65 | */ 66 | public static boolean fileExists(String filePath) { 67 | File file = new File(filePath); 68 | return file.exists(); 69 | } 70 | 71 | /** 72 | * count the total number of mentions 73 | * 74 | * @param mentionList 75 | * @return 76 | */ 77 | public static int countMentions(List<List<Mention>> mentionList) { 78 | int totalNumber = 0; 79 | for (List<Mention> mentions : mentionList) { 80 | totalNumber += mentions.size(); 81 | } 82 | 83 | return totalNumber; 84 | } 85 | 86 | /** 87 | * update the allPredictedMentions, which is used by the Stanford scoring function. 88 | * The reason is that the corefClusters information has been updated;
the mention ids should stay consistent 89 | * with allPredictedMentions and corefClusters 90 | * 91 | * @param stateDocument 92 | * @param state 93 | */ 94 | public static void generateStateDocument(Document stateDocument, State<CorefCluster> state) { 95 | stateDocument.corefClusters = state.getState(); 96 | 97 | for (Integer id : stateDocument.corefClusters.keySet()) { 98 | CorefCluster cluster = stateDocument.corefClusters.get(id); 99 | for (Mention m : cluster.corefMentions) { 100 | int mentionID = m.mentionID; 101 | Mention correspondingMention = stateDocument.allPredictedMentions.get(mentionID); 102 | int clusterid = id; 103 | correspondingMention.corefClusterID = clusterid; 104 | } 105 | } 106 | } 107 | 108 | 109 | } 110 | -------------------------------------------------------------------------------- /util/CosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.util.*; 4 | 5 | 6 | /** 7 | * This class calculates the cosine similarity between two clusters, or entities 8 | * in our case. Each cluster consists of one mention at the initialization phase, 9 | * because we initialize each mention as a cluster. 10 | * <p> 11 | * In the search phase, we need to merge two clusters, each of which contains at 12 | * least one mention, so we need to extract features from the cluster 13 | * pairs. We represent the features of each cluster as a feature vector and then compute 14 | * the cosine similarity between the feature pairs. 15 | * <p>
16 | * The formula used to calculate the cosine similarity is borrowed from: 17 | * http://en.wikipedia.org/wiki/Cosine_similarity 18 | * 19 | * @author xie 20 | * 21 | */ 22 | public class CosineSimilarity { 23 | 24 | /** 25 | * calculate the cosine similarity between the feature vectors of two clusters 26 | * 27 | * The feature vector is represented as a HashMap from feature name to value. 28 | * 29 | * @param firstFeatures The feature vector of the first cluster 30 | * @param secondFeatures The feature vector of the second cluster 31 | * @return the similarity measure 32 | */ 33 | public static Double calculateCosineSimilarity(HashMap<String, Double> firstFeatures, HashMap<String, Double> secondFeatures) { 34 | Double similarity = 0.0; 35 | Double sum = 0.0; // the numerator of the cosine similarity 36 | Double fnorm = 0.0; // the first part of the denominator of the cosine similarity 37 | Double snorm = 0.0; // the second part of the denominator of the cosine similarity 38 | Set<String> fkeys = firstFeatures.keySet(); 39 | Iterator<String> fit = fkeys.iterator(); 40 | while (fit.hasNext()) { 41 | String featurename = fit.next(); 42 | boolean containKey = secondFeatures.containsKey(featurename); 43 | if (containKey) { 44 | sum = sum + firstFeatures.get(featurename) * secondFeatures.get(featurename); 45 | } 46 | } 47 | fnorm = calculateNorm(firstFeatures); 48 | snorm = calculateNorm(secondFeatures); 49 | similarity = sum / (fnorm * snorm); 50 | return similarity; 51 | } 52 | 53 | /** 54 | * calculate the norm of one feature vector 55 | * 56 | * @param feature the feature vector of one cluster 57 | * @return 58 | */ 59 | public static Double calculateNorm(HashMap<String, Double> feature) { 60 | Double norm = 0.0; 61 | Set<String> keys = feature.keySet(); 62 | Iterator<String> it = keys.iterator(); 63 | while (it.hasNext()) { 64 | String featurename = it.next(); 65 | norm = norm + Math.pow(feature.get(featurename), 2); 66 | } 67 | return Math.sqrt(norm); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /util/DocumentAlignment.java: -------------------------------------------------------------------------------- 1 | package edu.oregonstate.util; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import edu.oregonstate.experiment.ExperimentConstructor; 7 | import edu.stanford.nlp.dcoref.CorefCluster; 8 | import edu.stanford.nlp.dcoref.Document; 9 | import edu.stanford.nlp.dcoref.Mention; 10 | import edu.stanford.nlp.dcoref.SieveCoreferenceSystem; 11 | 12 | public class DocumentAlignment { 13 | 14 | /** 15 | * update the coref cluster ID of each mention in the ordered predicted mentions 16 | * 17 | * @param document 18 | */ 19 | public static void alignDocument(Document document) { 20 | updateOrderedPredictedMentions(document); 21 | //updateOrderedGoldMentions(document); 22 | } 23 | 24 | private static void updateOrderedPredictedMentions(Document document) { 25 | List<List<Mention>> predictedOrderedMentionsBySentence = document.getOrderedMentions(); 26 | Map<Integer, CorefCluster> corefClusters = document.corefClusters; 27 | for (Integer clusterID : corefClusters.keySet()) { 28 | CorefCluster cluster = corefClusters.get(clusterID); 29 | for (Mention m : cluster.getCorefMentions()) { 30 | int sentenceID = m.sentNum; 31 | List<Mention> mentions = predictedOrderedMentionsBySentence.get(sentenceID); 32 | int mStartIndex = m.startIndex; 33 | int mEndIndex = m.endIndex; 34 | for (Mention mention : mentions) { 35 | int mentionStartIndex = mention.startIndex; 36 | int mentionEndIndex = mention.endIndex; 37 | if (mentionStartIndex == mStartIndex && mentionEndIndex == mEndIndex) { 38 | mention.mentionID =
m.mentionID; 39 | break; 40 | } 41 | } 42 | 43 | 44 | int mentionID = m.mentionID; 45 | Mention correspondingMention = document.allPredictedMentions.get(mentionID); 46 | correspondingMention.corefClusterID = clusterID; 47 | } 48 | } 49 | } 50 | 51 | /** 52 | * update the coref cluster ID of each mention in the goldOrderedMentionsBySentence 53 | * 54 | * @param document 55 | */ 56 | private static void updateOrderedGoldMentions(Document document) { 57 | List<List<Mention>> goldOrderedMentionsBySentence = document.goldOrderedMentionsBySentence; 58 | Map<Integer, CorefCluster> goldClusters = document.goldCorefClusters; 59 | for (Integer clusterID : goldClusters.keySet()) { 60 | CorefCluster cluster = goldClusters.get(clusterID); 61 | for (Mention m : cluster.getCorefMentions()) { 62 | int sentenceID = m.sentNum; 63 | List<Mention> mentions = goldOrderedMentionsBySentence.get(sentenceID); 64 | int mStartIndex = m.startIndex; 65 | int mEndIndex = m.endIndex; 66 | for (Mention mention : mentions) { 67 | int mentionStartIndex = mention.startIndex; 68 | int mentionEndIndex = mention.endIndex; 69 | if (mentionStartIndex == mStartIndex && mentionEndIndex == mEndIndex) { 70 | mention.mentionID = m.mentionID; 71 | break; 72 | } 73 | } 74 | 75 | int mentionID = m.mentionID; 76 | Mention correspondingMention = document.allGoldMentions.get(mentionID); 77 | correspondingMention.goldCorefClusterID = clusterID; 78 | } 79 | } 80 | } 81 | 82 | /** 83 | * post-process the document, and optionally its gold clusters 84 | * 85 | * @param document 86 | */ 87 | public static void postProcessDocument(Document document) { 88 | boolean postProcessGold = Boolean.parseBoolean(ExperimentConstructor.experimentProps.getProperty(EecbConstants.DATAGENERATION_POSTPROCESS_GOLD_PROP, "false")); 89 | SieveCoreferenceSystem.postProcessing(document); 90 | if (postProcessGold) { 91 | SieveCoreferenceSystem.postProcessingGoldClusters(document); 92 | } 93 | } 94 | 95 | /** 96 | * merge four fields from the from document into the to document; they are used only for scoring inside the system, not output for CoNLL scoring 97 | * 98 | * @param from 99 | * @param to 100 | */ 101 | public static void mergeDocument(Document from, Document to) { 102 | // add allGoldMentions 103 | for (Integer key : from.allGoldMentions.keySet()) { 104 | to.allGoldMentions.put(key, from.allGoldMentions.get(key)); 105 | } 106 | 107 | // add goldCorefClusters 108 | for (Integer key : from.goldCorefClusters.keySet()) { 109 | to.goldCorefClusters.put(key, from.goldCorefClusters.get(key)); 110 | } 111 | 112 | // add allPredictedMentions 113 | for (Integer key : from.allPredictedMentions.keySet()) { 114 | to.allPredictedMentions.put(key, from.allPredictedMentions.get(key)); 115 | } 116 | 117 | // add corefClusters 118 | for (Integer key : from.corefClusters.keySet()) { 119 | to.corefClusters.put(key, from.corefClusters.get(key)); 120 | } 121 | } 122 | 123 | } 124 | --------------------------------------------------------------------------------
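For illustration, a hypothetical snippet (not part of the repository; the feature names are example values) showing how CosineSimilarity above is used:

    HashMap<String, Double> first = new HashMap<String, Double>();
    first.put("Head", 1.0);
    first.put("Lemma", 2.0);
    HashMap<String, Double> second = new HashMap<String, Double>();
    second.put("Head", 1.0);
    second.put("Number", 1.0);
    // numerator = 1.0 (only "Head" is shared); the norms are sqrt(5) and sqrt(2),
    // so the similarity is 1.0 / (sqrt(5) * sqrt(2)) ~= 0.316
    Double similarity = CosineSimilarity.calculateCosineSimilarity(first, second);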