├── .gitignore ├── LICENSE.txt ├── README.md ├── SMALLDevOPCs.txt ├── SMALLTrainOPCs.txt ├── modifiedBCS ├── LICENSE.txt ├── README.md ├── WriteCoNLLPreds.sh ├── base.conf ├── build.sbt ├── lib │ ├── BerkeleyParser-1.7.jar │ └── futile.jar ├── moarcoref-assembly-1.jar ├── project │ └── assembly.sbt └── src │ └── main │ └── java │ └── edu │ ├── berkeley │ └── nlp │ │ └── coref │ │ ├── ConjType.java │ │ ├── ConllDoc.scala │ │ ├── ConllDocReader.scala │ │ ├── ConllDocWriter.scala │ │ ├── CorefConllScorer.scala │ │ ├── CorefDoc.scala │ │ ├── CorefDocAssembler.scala │ │ ├── CorefEvaluator.scala │ │ ├── CorefFeaturizerTrainer.scala │ │ ├── CorefSystem.scala │ │ ├── Decoder.scala │ │ ├── DepConstTree.scala │ │ ├── DocumentGraph.scala │ │ ├── DocumentInferencer.scala │ │ ├── DocumentInferencerBasic.scala │ │ ├── DocumentInferencerBinary.scala │ │ ├── DocumentInferencerLoopy.scala │ │ ├── DocumentInferencerOracle.scala │ │ ├── DocumentInferencerRahman.scala │ │ ├── Driver.java │ │ ├── EntityFeaturizer.scala │ │ ├── Feature.scala │ │ ├── GUtil.scala │ │ ├── Gender.java │ │ ├── LexicalCountsBundle.scala │ │ ├── Mention.scala │ │ ├── MentionPropertyComputer.scala │ │ ├── MentionType.java │ │ ├── Number.java │ │ ├── NumberGenderComputer.scala │ │ ├── OraclePosteriorSampler.scala │ │ ├── OrderedClustering.scala │ │ ├── OrderedClusteringBound.scala │ │ ├── PairwiseIndexingFeaturizer.scala │ │ ├── PairwiseIndexingFeaturizerJoint.scala │ │ ├── PairwiseLossFunctions.scala │ │ ├── PairwiseScorer.scala │ │ ├── PronounDictionary.scala │ │ ├── PruningStrategy.scala │ │ ├── WordNetInterfacer.scala │ │ ├── bp │ │ ├── DocumentFactorGraph.scala │ │ ├── Domain.scala │ │ ├── Factor.scala │ │ └── Node.scala │ │ ├── lang │ │ ├── ArabicTreebankLanguagePack.java │ │ ├── CorefLanguagePack.scala │ │ ├── Language.java │ │ ├── ModArabicHeadFinder.java │ │ └── ModCollinsHeadFinder.java │ │ ├── nchains │ │ └── DiscourseAnalyzer.scala │ │ ├── preprocess │ │ ├── NerDriver.java │ │ ├── NerExample.scala │ │ ├── NerSystem.scala │ │ ├── PreprocessingDriver.java │ │ ├── Reprocessor.scala │ │ ├── SentenceSplitter.scala │ │ └── SentenceSplitterTokenizerDriver.java │ │ └── sem │ │ ├── QueryCountAnalyzer.scala │ │ ├── QueryCountCollector.scala │ │ └── QueryCountsBundle.scala │ └── harvard │ └── nlp │ └── moarcoref │ ├── AnimacyHelper.java │ ├── FeatureExtractor.scala │ ├── MiniDriver.java │ ├── MoarLexicalCountsBundle.scala │ ├── SeparatingFeaturizer.scala │ ├── SeparatingFeaturizerKeepFirst.scala │ ├── SmallerSeparatingFeaturizer.scala │ └── TextPickler.scala ├── nn ├── ana_model.lua ├── ante_model.lua ├── clust_batcher.lua ├── coref_utils.lua ├── model_utils.lua ├── mr_clust_embed.lua ├── sparse_doc_data.lua └── vanilla_mr.lua ├── nncoref_acl15_slides.pdf ├── nncoref_naacl16_slides.pdf ├── reference-coreference-scorers └── v8.01 │ ├── README.txt │ ├── scorer.bat │ ├── scorer.pl │ └── test │ ├── CorefMetricTest.pm │ ├── CorefMetricTestConfig.pm │ ├── DataFiles │ ├── TC-A-1.response │ ├── TC-A-10.response │ ├── TC-A-11.response │ ├── TC-A-12.response │ ├── TC-A-13.response │ ├── TC-A-2.response │ ├── TC-A-3.response │ ├── TC-A-4.response │ ├── TC-A-5.response │ ├── TC-A-6.response │ ├── TC-A-7.response │ ├── TC-A-8.response │ ├── TC-A-9.response │ ├── TC-A.key │ ├── TC-B-1.response │ ├── TC-B.key │ ├── TC-C-1.response │ ├── TC-C.key │ ├── TC-D-1.response │ ├── TC-D.key │ ├── TC-E-1.response │ ├── TC-E.key │ ├── TC-F-1.response │ ├── TC-F.key │ ├── TC-G-1.response │ ├── TC-G.key │ ├── TC-H-1.response │ ├── TC-H.key │ ├── TC-I-1.response │ 
├── TC-I.key │ ├── TC-J-1.response │ ├── TC-J.key │ ├── TC-K-1.response │ ├── TC-K.key │ ├── TC-L-1.response │ ├── TC-L.key │ ├── TC-M-1.response │ ├── TC-M-2.response │ ├── TC-M-3.response │ ├── TC-M-4.response │ ├── TC-M-5.response │ ├── TC-M-6.response │ ├── TC-M.key │ ├── TC-N-1.response │ ├── TC-N-2.response │ ├── TC-N-3.response │ ├── TC-N-4.response │ ├── TC-N-5.response │ ├── TC-N-6.response │ └── TC-N.key │ ├── TestCases.README │ └── test.pl └── text_feats_to_hdf5_replacezero.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # latex things 57 | *.aux 58 | *.out 59 | *.synctex.gz 60 | *.pdf 61 | *.blg 62 | *.bbl 63 | 64 | # temp files 65 | *~ 66 | 67 | # Compiled Lua sources 68 | luac.out 69 | 70 | # luarocks build files 71 | *.src.rock 72 | *.zip 73 | *.tar.gz 74 | 75 | # Object files 76 | *.o 77 | *.os 78 | *.ko 79 | *.obj 80 | *.elf 81 | 82 | # Precompiled Headers 83 | *.gch 84 | *.pch 85 | 86 | # Libraries 87 | *.lib 88 | *.a 89 | *.la 90 | *.lo 91 | *.def 92 | *.exp 93 | 94 | # Shared objects (inc. Windows DLLs) 95 | *.dll 96 | *.so 97 | *.so.* 98 | *.dylib 99 | 100 | # Executables 101 | *.exe 102 | *.out 103 | *.app 104 | *.i*86 105 | *.x86_64 106 | *.hex 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nn_coref 2 | Neural Coref Models, as described in 3 | ["Learning Global Features for Coreference Resolution"](http://nlp.seas.harvard.edu/papers/corefmain.pdf), Sam Wiseman, Alexander M. Rush, and Stuart M. Shieber, NAACL 2016, 4 | 5 | and 6 | 7 | ["Learning Anaphoricity and Antecedent Ranking Features for Coreference Resolution"](http://people.seas.harvard.edu/~srush/acl15.pdf), Sam Wiseman, Alexander M. Rush, Stuart M. Shieber, and Jason Weston. ACL 2015. 8 | 9 | For questions/concerns/bugs please contact swiseman at seas.harvard.edu. 10 | 11 | 12 | ## Overview 13 | To keep things simple, the original ACL code is now in the acl15 branch. This README will cover duplicating the NAACL 2016 results. 14 | 15 | ## Prerequisites 16 | In addition to torch, nn, and the prerequisites listed in modifiedBCS/README.md, you will need the Element-Research rnn library: https://github.com/Element-Research/rnn 17 | 18 | ## Generating Features 19 | See the README in the modifiedBCS/ directory for running the Scala feature/mention extractor. 
Once you've generated text feature files, use text_feats_to_hdf5_replacezero.py to convert them to hdf5 (to be consumed by Torch), as follows: 20 | 21 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-anaphTrainFeats.txt train_small ana -n 4 -r 14215``` 22 | 23 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-anaphDevFeats.txt dev_small ana -n 4 -r 14215``` 24 | 25 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-anaphTestFeats.txt test_small ana -n 4 -r 14215``` 26 | 27 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-pwTrainFeats.txt train_small pw -n 4 -r 28394``` 28 | 29 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-pwDevFeats.txt dev_small pw -n 4 -r 28394``` 30 | 31 | ```python text_feats_to_hdf5_replacezero.py SMALL-FINAL+MOARANAPH+MOARPW-pwTestFeats.txt test_small pw -n 4 -r 28394``` 32 | 33 | The "-r" argument takes the index of a dummy feature used to replace features unseen in the training set; above it is set to be one greater than the number of training features (and should never be less than this). The "-n" argument controls the number of processes spawned by the script. 34 | 35 | You can also download bzipped hdf5 features here: https://drive.google.com/folderview?id=0B1ytQXPDuw7OVzI3MlRLMEFCcHM&usp=sharing 36 | 37 | **Before doing any training or pre-training, please create a directory called nn/models/** 38 | 39 | ## Pre-training 40 | Given the hdf5 files generated in the previous step, you can pre-train anaphoricity and pairwise networks as follows: 41 | 42 | ```th ana_model.lua``` 43 | 44 | ```th ante_model.lua -gpuid 0``` 45 | 46 | See the respective files for additional options and documentation. 47 | 48 | You can download bzipped pre-trained anaphoricity and pairwise networks from https://drive.google.com/folderview?id=0B1ytQXPDuw7OYUcwSEVPRjFEM00&usp=sharing , where they are called small_200.model-na-0.100000.bz2 and small_700.model-pw-0.100000.bz2, respectively. 49 | 50 | ## Training the Full Model 51 | Assuming you've put your pre-trained networks in nn/models/, you can now train the full model as follows: 52 | 53 | ```th mr_clust_embed.lua -gpuid 0 -PT -save -savePfx trpldev``` 54 | 55 | The default settings in mr_clust_embed.lua reflect those used in our final experiments (and so, for instance, both dev and train will be used as training data), but see the file for additional options and documentation. 56 | 57 | You can download bzipped trained full model components from https://drive.google.com/folderview?id=0B1ytQXPDuw7OYUcwSEVPRjFEM00&usp=sharing , where the relevant files are trpldev-mce-700-200.model-na.bz2, trpldev-mce-700-200.model-pw.bz2, and trpldev-mce-700-200.model-lstm.bz2 58 | 59 | ## Predicting with Saved Models 60 | If you've trained (or downloaded) full model components, you can make predictions as follows: 61 | 62 | - If they don't exist, create the directories nn/bps/ and nn/conllouts/ . 63 | - Run ```th mr_clust_embed.lua -gpuid 0 -loadAndPredict -pwDevFeatPrefix test_small -anaDevFeatPrefix test_small -savedPWNetFi models/trpldev-mce-700-200.model-pw -savedNANetFi models/trpldev-mce-700-200.model-na -savedLSTMFi models/trpldevdup-mce-700-200.model-lstm``` 64 | - The above will create a back-pointer file in bps/ . Suppose the file is called bps/xyzdev.bps . Then to generate a CoNLL output file, run ```../modifiedBCS/WriteCoNLLPreds.sh bps bps/xyzdev.bps conllouts ../flat_test_2012/ ../gender.data``` 65 | - N.B. You may need to modify the paths to the jar files on the second line of modifiedBCS/WriteCoNLLPreds.sh to get this to work. 66 | - The resulting output file (in conllouts/) can now be scored using the standard CoNLL scorer; see the sketch below. 67 |
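For reference, here is a minimal sketch (not a script shipped with this repo) of scoring one of these output files with the bundled reference scorer, reference-coreference-scorers/v8.01/scorer.pl, using the same invocation that CorefConllScorer uses internally. The key/response paths are placeholders: point them at your gold CoNLL key file and at the file written to conllouts/.

```scala
// Minimal scoring sketch; the file paths passed in are placeholders.
import scala.sys.process._

object ScoreConllOutput {
  def main(args: Array[String]) {
    val scorerPath = "reference-coreference-scorers/v8.01/scorer.pl";
    val goldKey = args(0);  // your gold CoNLL key file for the same documents
    val response = args(1); // e.g., the .out file WriteCoNLLPreds.sh wrote to conllouts/
    // "all" computes MUC, B-cubed, CEAF, and BLANC; the trailing "none" scores all documents.
    val summary = Process(Seq("perl", scorerPath, "all", goldKey, response, "none")).lines;
    summary.foreach(println);
  }
}
```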
68 | Training as in the previous sub-section and evaluating as above should produce results very close to those in the NAACL paper, and probably a bit better. After re-training the cleaned-up and re-factored version in this repo, I got P/R/F scores of: 69 | 70 | MUC: 77.14/70.12/73.46 71 | 72 | BCUB: 66.43/57.47/61.62 73 | 74 | CEAFe: 62.29/54.01/57.85 75 | 76 | CoNLL: 64.31 77 | 78 | ## Training the ACL (non-cluster) Model 79 | The mention-ranking model from the ACL paper has been re-implemented and considerably simplified in vanilla_mr.lua. It can be run as follows: 80 | 81 | ```th vanilla_mr.lua -gpuid 0 -PT``` 82 | 83 | Unlike the original ACL implementation, this implementation is easy to run on a GPU, and with the new, even smaller feature set it should do at least as well. 84 | 85 | ## Copyright 86 | Copyright (c) 2016 Sam Wiseman. All Rights Reserved. 87 | 88 | ## License 89 | The code in this repository is covered by the GNU GPL license. See LICENSE.txt. 90 | 91 | -------------------------------------------------------------------------------- /modifiedBCS/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | This directory contains the code necessary for extracting features and "oracle predicted clusters" (which are used as supervision) from the English CoNLL data. 4 | 5 | We use code written on top of the Berkeley Coref System (BCS) v1.1 (see http://nlp.cs.berkeley.edu/projects/coref.shtml) to extract features, and so we have included the BCS v1.1 code along with its license and dependencies here. The BCS code is in src/main/java/edu/berkeley/* and any additional code we have added is in src/main/java/edu/harvard/* . 6 | 7 | ## Compilation 8 | 9 | Although we provide a pre-compiled jar ("moarcoref-assembly-1.jar") in the modifiedBCS/ directory, you can use sbt to re-compile the Scala and Java source. After downloading sbt (www.scala-sbt.org), simply type 10 | 11 | ``` 12 | sbt assembly 13 | ``` 14 | 15 | from inside the modifiedBCS/ directory, which will produce a runnable jar in the target/ subdirectory. 16 | 17 | ## Data Prerequisites 18 | 19 | To extract features you will need the CoNLL 2012 English train, development, and test data, as well as the number and gender data that goes along with it. See http://conll.cemantix.org/2012/data.html for instructions on downloading and extracting it. 20 | 21 | BCS expects the CoNLL data to be in a flattened directory structure, so that all train, development, and test files are in flat train, development, and test directories (resp.).
If you've extracted the CoNLL data into a top-level directory called conll-2012/, you can create a flattened train directory flat_train_2012/ using the following python code: 22 | 23 | ```python 24 | import subprocess 25 | import shutil 26 | import os 27 | 28 | def flatten(root_dir,flat_dir,file_suf="auto_conll"): 29 | if not os.path.exists(flat_dir): 30 | os.makedirs(flat_dir) 31 | 32 | matches = subprocess.check_output("find %s -name *%s" % (root_dir,file_suf),shell=True)  # list every file ending in file_suf under root_dir 33 | matches = matches.decode("utf-8").split('\n')[:-1]  # decode bytes so this also works under Python 3 34 | for match in matches: 35 | match_fields = match.split('/') 36 | shutil.copyfile(match, os.path.join(flat_dir,match_fields[-4]+"_"+match_fields[-1])) 37 | 38 | 39 | flatten("conll-2012/v4/data/train/data/english", "flat_train_2012") 40 | ``` 41 | 42 | The same goes for creating flattened development and test directories. 43 | 44 | You will also need the list of animate and inanimate unigrams used by the Stanford Coref system. These can be found in the Stanford CoreNLP models jar under edu.stanford.nlp.models.dcoref . 45 | 46 | ## Running 47 | 48 | To extract the features described in the (NAACL) paper, first create a directory to store log files (say, `execdir`), and then type the following: 49 | 50 | ``` 51 | java -jar -Xmx30g modifiedBCS/target/scala-2.11/moarcoref-assembly-1.jar ++modifiedBCS/base.conf -execDir execdir -numberGenderData gender.data -animacyPath animate.unigrams.txt -inanimacyPath inanimate.unigrams.txt -trainPath flat_train_2012 -devPath flat_dev_2012 -testPath flat_test_2012 -mode SMALLER -conjType NONE -pairwiseFeats FINAL+MOARANAPH+MOARPW 52 | ``` 53 | 54 | The above assumes the gender and animacy files are in the current directory, and that the flattened CoNLL directories are flat_train_2012/, flat_dev_2012/, and flat_test_2012/. 55 | 56 | The pairwiseFeats argument specifies which features to extract; the value `FINAL+MOARANAPH+MOARPW` corresponds to the features described in the paper. 57 | 58 | There are additional options described in edu.harvard.nlp.moarcoref.MiniDriver.java. 59 | 60 | ## Output Generated 61 | 62 | Running as above should give you 10 files, as follows: 63 | 64 | - SMALL-FINAL+MOARANAPH+MOARPW-anaph\[Train|Dev|Test\]Feats.txt 65 | 66 | Anaphoricity features. These files put each document on its own line, with each line having the following format: 67 | 68 | ``` 69 | num_mentions_in_doc|ment_0_feat_0 ment_0_feat_1 ...|ment_n_feat_0 ... 70 | ``` 71 | 72 | where n is the number of mentions in the document. 73 | 74 | - SMALL-FINAL+MOARANAPH+MOARPW-pw\[Train|Dev|Test\]Feats.txt 75 | 76 | Pairwise features. These files put each document on its own line, with each line having the following format: 77 | 78 | ``` 79 | num_mentions_in_doc|ment_0_ant_0_feat_0 ment_0_ant_0_feat_1 ...|ment_1_ant_0_feat_0 ment_1_ant_0_feat_1 ...|...|ment_n_ant_n_feat_0 ... 80 | ``` 81 | 82 | As such, there are n(n+1)/2 cells containing features on each line (one for each mention-antecedent pair, plus the self-link mention-mention pairs), and n(n+1)/2+1 cells in total, because the first cell contains the number of mentions. Since the pairwise features do not make sense for the self-link mention-mention pairs, we simply insert a dummy integer in the corresponding cell. 83 | 84 | - SMALL-FINAL+MOARANAPH+MOARPW-\[anaph|pw\]Mapping.txt 85 | 86 | A file mapping feature index numbers to feature descriptions.
Each feature is on its own line, and the format is: 87 | 88 | ``` 89 | feature_idx : feature_description 90 | ``` 91 | 92 | - SMALL\[Train|Dev\]OPCs.txt 93 | 94 | Oracle Predicted Clustering files. These are the clusterings induced by the true gold clusters on the mentions extracted by the automatic mention extractor, and they constitute the supervision for this task. Again each document is on its own line, where each line contains clusters separated by a `|`, and the mention indices within a cluster are separated by a space, and are in ascending order. For example the following line 95 | 96 | ``` 97 | 0|1 2 4|3 98 | ``` 99 | 100 | indicates that there are 3 clusters over 5 mentions, with the first and third cluster just containing the first and fourth mentions (resp.), and the second cluster containing the 2nd, 3rd, and 5th mentions. 101 | 102 | ## System Requirements 103 | 104 | In addition to sbt you will need java. When running without any real memory restrictions, feature extraction requires around 30GB of RAM; it's likely that you can get away with a bit less than this, however. 105 | 106 | -------------------------------------------------------------------------------- /modifiedBCS/WriteCoNLLPreds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | exec scala -J-Xmx3G -classpath "moarcoref-assembly-1.jar:lib/futile.jar:lib/BerkeleyParser-1.7.jar" "$0" "$@" 3 | !# 4 | 5 | import java.io._ 6 | import scala.collection.mutable.ListBuffer 7 | import scala.collection.mutable.ArrayBuffer 8 | import scala.io.Source 9 | import edu.berkeley.nlp.coref.NumberGenderComputer 10 | import edu.berkeley.nlp.coref._ 11 | import edu.berkeley.nlp.futile.fig.basic.IOUtils 12 | import edu.berkeley.nlp.futile.util.Logger 13 | 14 | object BP2CoNLL { 15 | 16 | // the following two functions are just copied from BCS CorefSystem.scala 17 | def checkFileReachableForRead(file: String, msg: String) { 18 | if (file.isEmpty) { 19 | throw new RuntimeException("Undefined " + msg + "; must be defined for the mode you're running in"); 20 | } 21 | if (!new File(file).exists()) { 22 | throw new RuntimeException(msg + " file/directory doesn't exist for read: " + file); 23 | } 24 | } 25 | def checkFileReachableForWrite(file: String, msg: String) { 26 | if (file.isEmpty) { 27 | throw new RuntimeException("Undefined " + msg + "; must be defined for the mode you're running in"); 28 | } 29 | 30 | if (file.contains("/") && !new File(file).getParentFile().exists()) { 31 | throw new RuntimeException(msg + " file/directory couldn't be opened for write: " + file); 32 | } 33 | } 34 | 35 | // same as original, except we sort files by names so we can dump features and then repredict 36 | def loadRawConllDocs(path: String, size: Int, gold: Boolean): Seq[ConllDoc] = { 37 | val suffix = if (gold) "gold_conll" else Driver.docSuffix; 38 | Logger.logss("Loading " + size + " docs from " + path + " ending with " + suffix); 39 | val files = new File(path).listFiles().filter(file => file.getAbsolutePath.endsWith(suffix)); //.sorted; 40 | val reader = new ConllDocReader(Driver.lang); 41 | val docs = new ArrayBuffer[ConllDoc]; 42 | var docCounter = 0; 43 | var fileIdx = 0; 44 | while (fileIdx < files.size && (size == -1 || docCounter < size)) { 45 | val newDocs = reader.readConllDocs(files(fileIdx).getAbsolutePath); 46 | docs ++= newDocs; 47 | docCounter += newDocs.size 48 | fileIdx += 1; 49 | } 50 | val numDocs = if (size == -1) docs.size else Math.min(size, files.size); 51 | 
Logger.logss(docs.size + " docs loaded from " + fileIdx + " files, retaining " + numDocs); 52 | if (docs.size == 0) { 53 | Logger.logss("WARNING: Zero docs loaded...double check your paths unless you meant for this happen"); 54 | } 55 | val docsToUse = docs.slice(0, numDocs); 56 | 57 | docsToUse; 58 | } 59 | 60 | // same as in original 61 | def loadCorefDocs(path: String, size: Int, numberGenderComputer: NumberGenderComputer, gold: Boolean): Seq[CorefDoc] = { 62 | val docs = loadRawConllDocs(path, size, gold); 63 | val assembler = CorefDocAssembler(Driver.lang, Driver.useGoldMentions); 64 | val mentionPropertyComputer = new MentionPropertyComputer(numberGenderComputer); 65 | val corefDocs = docs.map(doc => assembler.createCorefDoc(doc, mentionPropertyComputer)); 66 | CorefDoc.checkGoldMentionRecall(corefDocs); 67 | corefDocs; 68 | } 69 | 70 | def main(args: Array[String]) { 71 | val indir = args(0); 72 | val bpfi = args(1); 73 | val outdir = args(2); 74 | val devPath = args(3); 75 | val ngPath = args(4); 76 | val numberGenderComputer = NumberGenderComputer.readBergsmaLinData(ngPath); 77 | val devDGs = loadCorefDocs(devPath, -1, numberGenderComputer, false).map(new DocumentGraph(_, false)).sortBy(_.corefDoc.rawDoc.printableDocName); 78 | //val files = new File(indir).listFiles().filter(file => file.getAbsolutePath.contains(".bps")); 79 | val files = new File(indir).listFiles().filter(file => file.getAbsolutePath.contains(bpfi)); 80 | for (fi <- files) { 81 | println("doing " + fi.getAbsolutePath()); 82 | val bps = ListBuffer[Array[Int]](); 83 | for (line <- Source.fromFile(fi.getAbsolutePath()).getLines()) { 84 | val preds = line.split(' '); 85 | bps += preds.map(x => x.toInt); 86 | } 87 | val allPredBackptrs = bps.toArray; 88 | val allPredClusterings = (0 until devDGs.size).map(i => OrderedClustering.createFromBackpointers(allPredBackptrs(i))).toArray; 89 | val writer = IOUtils.openOutHard(outdir+"/" + fi.getName() + ".out"); 90 | for (i <- 0 until devDGs.size) { 91 | val outputClustering = new OrderedClusteringBound(devDGs(i).getMentions, allPredClusterings(i)); 92 | ConllDocWriter.writeDoc(writer, devDGs(i).corefDoc.rawDoc, outputClustering.postprocessForConll()); 93 | } 94 | writer.close(); 95 | } 96 | } 97 | 98 | } 99 | 100 | BP2CoNLL.main(args) 101 | -------------------------------------------------------------------------------- /modifiedBCS/base.conf: -------------------------------------------------------------------------------- 1 | create true 2 | useStandardExecPoolDirStrategy false 3 | overwriteExecDir true 4 | execDir specify_execDir 5 | -------------------------------------------------------------------------------- /modifiedBCS/build.sbt: -------------------------------------------------------------------------------- 1 | name := "moarcoref" 2 | 3 | version := "1" 4 | 5 | scalaVersion := "2.11.7" 6 | 7 | mainClass in assembly := Some("edu.harvard.nlp.moarcoref.MiniDriver") 8 | 9 | -------------------------------------------------------------------------------- /modifiedBCS/lib/BerkeleyParser-1.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiseman/nn_coref/e29b16deecd0d87d4b7c145e07e2908266fe63d6/modifiedBCS/lib/BerkeleyParser-1.7.jar -------------------------------------------------------------------------------- /modifiedBCS/lib/futile.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/swiseman/nn_coref/e29b16deecd0d87d4b7c145e07e2908266fe63d6/modifiedBCS/lib/futile.jar -------------------------------------------------------------------------------- /modifiedBCS/moarcoref-assembly-1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiseman/nn_coref/e29b16deecd0d87d4b7c145e07e2908266fe63d6/modifiedBCS/moarcoref-assembly-1.jar -------------------------------------------------------------------------------- /modifiedBCS/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7") 2 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/ConjType.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref; 2 | 3 | 4 | public enum ConjType { 5 | NONE, TYPE, TYPE_OR_RAW_PRON, CANONICAL, CANONICAL_NOPRONPRON, CANONICAL_ONLY_PAIR_CONJ, CANONICAL_OR_COMMON; 6 | } 7 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/ConllDoc.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.syntax.Tree 3 | 4 | // Chunks are semi-inclusive intervals. 5 | case class Chunk[T](val start: Int, 6 | val end: Int, 7 | val label: T); 8 | 9 | // rawText should only be used to save trouble when outputting the document 10 | // for scoring; never at any other time! 11 | case class ConllDoc(val docID: String, 12 | val docPartNo: Int, 13 | val words: Seq[Seq[String]], 14 | val pos: Seq[Seq[String]], 15 | val trees: Seq[DepConstTree], 16 | val nerChunks: Seq[Seq[Chunk[String]]], 17 | val corefChunks: Seq[Seq[Chunk[Int]]], 18 | val speakers: Seq[Seq[String]], 19 | val rawText: Seq[Seq[String]]) { 20 | 21 | val numSents = words.size; 22 | 23 | // updating...blah 24 | val allSpeakers = scala.collection.mutable.Set[String](); 25 | var gatheredSpeakers = false; 26 | 27 | def getSpeakers():scala.collection.mutable.Set[String] = { 28 | if (gatheredSpeakers){ 29 | return allSpeakers; 30 | } else { 31 | for (speakerSent <- speakers){ 32 | for (speaker <- speakerSent){ 33 | allSpeakers.add(speaker.replace("-","").replace("_","").replace(".","").toLowerCase); 34 | } 35 | } 36 | gatheredSpeakers = true; 37 | return allSpeakers; 38 | } 39 | } 40 | 41 | def printableDocName = docID + " (part " + docPartNo + ")"; 42 | 43 | def isConversation = docID.startsWith("bc") || docID.startsWith("wb"); 44 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/ConllDocWriter.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | 3 | import java.io.PrintWriter 4 | import scala.collection.mutable.HashMap 5 | import scala.collection.mutable.ArrayBuffer 6 | import edu.berkeley.nlp.coref.preprocess.PreprocessingDriver 7 | import edu.berkeley.nlp.futile.syntax.Tree 8 | import edu.berkeley.nlp.coref.preprocess.Reprocessor 9 | import scala.collection.mutable.HashSet 10 | import scala.collection.JavaConverters._ 11 | import edu.berkeley.nlp.futile.util.Logger 12 | 13 | object ConllDocWriter { 14 | 15 | def writeDoc(writer: PrintWriter, conllDoc: ConllDoc, clustering: 
OrderedClusteringBound) { 16 | // writeDocIncompleteConll(writer, conllDoc.docID, conllDoc.docPartNo, conllDoc.words, conllDoc.pos, conllDoc.trees.map(_.constTree), conllDoc.speakers, conllDoc.nerChunks, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); 17 | val corefBits = getCorefBits(conllDoc.words, convertOrderedClusteringBoundToChunks(clustering, conllDoc.words.size)); 18 | val numZeroesToAddToPartNo = 3 - conllDoc.docPartNo.toString.size; 19 | writer.println("#begin document (" + conllDoc.docID + "); part " + ("0" * numZeroesToAddToPartNo) + conllDoc.docPartNo); 20 | for (sentIdx <- 0 until conllDoc.rawText.size) { 21 | val sent = conllDoc.rawText(sentIdx); 22 | for (tokenIdx <- 0 until sent.size) { 23 | val line = conllDoc.rawText(sentIdx)(tokenIdx); 24 | val lineNoCoref = line.substring(0, Math.max(line.lastIndexOf("\t"), line.lastIndexOf(" ")) + 1); 25 | // writer.println(lineNoCoref + corefBits(sentIdx)(tokenIdx)); 26 | writer.println(lineNoCoref.replaceAll("\\s+", "\t") + corefBits(sentIdx)(tokenIdx)); 27 | } 28 | writer.println(); 29 | } 30 | writer.println("#end document"); 31 | } 32 | 33 | // Doesn't write predicate-argument structures, senses, or lemmas (but we don't use these). 34 | def writeIncompleteConllDoc(writer: PrintWriter, 35 | docName: String, 36 | partNo: Int, 37 | words: Seq[Seq[String]], 38 | pos: Seq[Seq[String]], 39 | parses: Seq[Tree[String]], 40 | speakers: Seq[Seq[String]], 41 | nerChunks: Seq[Seq[Chunk[String]]], 42 | corefChunks: Seq[Seq[Chunk[Int]]]) { 43 | val numZeroesToAddToPartNo = 3 - partNo.toString.size; 44 | val corefBits = getCorefBits(words, corefChunks); 45 | val parseBits = parses.map(tree => PreprocessingDriver.computeParseBits(Reprocessor.convertFromFutileTree(tree))); 46 | val nerBits = getNerBits(words, nerChunks); 47 | writer.println("#begin document (" + docName + "); part " + ("0" * numZeroesToAddToPartNo) + partNo); 48 | for (sentIdx <- 0 until words.size) { 49 | val sent = words(sentIdx); 50 | for (i <- 0 until sent.size) { 51 | writer.println(docName + "\t" + partNo + "\t" + i + "\t" + words(sentIdx)(i) + "\t" + pos(sentIdx)(i) + "\t" + parseBits(sentIdx)(i) + 52 | "\t-\t-\t-\t" + speakers(sentIdx)(i) + "\t" + nerBits(sentIdx)(i) + "\t" + corefBits(sentIdx)(i)); 53 | } 54 | writer.println(); 55 | } 56 | writer.println("#end document"); 57 | } 58 | 59 | private def convertOrderedClusteringBoundToChunks(clustering: OrderedClusteringBound, numSentences: Int): Seq[Seq[Chunk[Int]]] = { 60 | val chunksPerSentence = Array.tabulate(numSentences)(i => new ArrayBuffer[Chunk[Int]]()); 61 | for (i <- 0 until clustering.ments.size) { 62 | val ment = clustering.ments(i); 63 | chunksPerSentence(ment.sentIdx) += new Chunk(ment.startIdx, ment.endIdx, clustering.clustering.getClusterIdx(i)); 64 | } 65 | chunksPerSentence; 66 | } 67 | 68 | private def getNerBits(words: Seq[Seq[String]], nerChunks: Seq[Seq[Chunk[String]]]): Seq[Seq[String]] = { 69 | for (sentIdx <- 0 until words.size) yield { 70 | val chunkStarts = new HashMap[Int,String]; 71 | val chunkEnds = new HashSet[Int]; 72 | Logger.logss("NER CHUNKS: " + nerChunks); 73 | for (chunk <- nerChunks(sentIdx)) { 74 | chunkStarts.put(chunk.start, chunk.label); 75 | chunkEnds += chunk.end - 1; 76 | } 77 | for (tokenIdx <- 0 until words(sentIdx).size) yield { 78 | if (chunkStarts.contains(tokenIdx) && chunkEnds.contains(tokenIdx)) { 79 | "(" + chunkStarts.get(tokenIdx).getOrElse("") + ")"; 80 | } else if (chunkStarts.contains(tokenIdx)) { 81 | "(" + 
chunkStarts.get(tokenIdx).getOrElse("") + "*"; 82 | } else if (chunkEnds.contains(tokenIdx)) { 83 | "*)"; 84 | } else { 85 | "*"; 86 | } 87 | } 88 | } 89 | } 90 | 91 | private def getCorefBits(words: Seq[Seq[String]], corefChunks: Seq[Seq[Chunk[Int]]]): Seq[Seq[String]] = { 92 | for (sentIdx <- 0 until words.size) yield { 93 | val mentionStarts = new HashMap[Int,ArrayBuffer[Int]]; 94 | val mentionEnds = new HashMap[Int,ArrayBuffer[Int]]; 95 | val mentionStartEnds = new HashMap[Int,Int]; 96 | val chunksThisSent = corefChunks(sentIdx); 97 | for (chunk <- chunksThisSent) { 98 | val start = chunk.start; 99 | val end = chunk.end - 1; 100 | if (start == end) { 101 | mentionStartEnds.put(start, chunk.label); 102 | } else { 103 | if (!mentionStarts.contains(start)) { 104 | mentionStarts.put(start, new ArrayBuffer[Int]()) 105 | } 106 | mentionStarts(start) += chunk.label; 107 | if (!mentionEnds.contains(end)) { 108 | mentionEnds.put(end, new ArrayBuffer[Int]()) 109 | } 110 | mentionEnds(end) += chunk.label; 111 | } 112 | } 113 | for (tokenIdx <- 0 until words(sentIdx).size) yield { 114 | var corefBit = ""; 115 | if (mentionStarts.contains(tokenIdx)) { 116 | for (start <- mentionStarts(tokenIdx)) { 117 | corefBit += "(" + start + "|"; 118 | } 119 | } 120 | if (mentionStartEnds.contains(tokenIdx)) { 121 | corefBit += "(" + mentionStartEnds(tokenIdx) + ")|"; 122 | } 123 | if (mentionEnds.contains(tokenIdx)) { 124 | for (end <- mentionEnds(tokenIdx)) { 125 | corefBit += end + ")|"; 126 | } 127 | } 128 | if (corefBit.isEmpty) "-" else corefBit.dropRight(1); 129 | } 130 | } 131 | } 132 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/CorefConllScorer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import java.io.File 3 | import java.io.PrintWriter 4 | import java.util.regex.Pattern 5 | 6 | import scala.collection.mutable.ArrayBuffer 7 | import scala.collection.mutable.HashMap 8 | import scala.sys.process.stringSeqToProcess 9 | import scala.sys.process.Process 10 | 11 | import edu.berkeley.nlp.futile.util.Logger 12 | 13 | class CorefConllScorer(val conllEvalScriptPath: String) { 14 | 15 | def renderFinalScore(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound]) = { 16 | val summary = score(conllDocs, rawPredClusterings, goldClusterings, true); 17 | CorefConllScorer.processConllString(summary, false); 18 | } 19 | 20 | def renderSuffStats(conllDoc: ConllDoc, rawPredClustering: OrderedClusteringBound, goldClustering: OrderedClusteringBound) = { 21 | val summary = score(Seq(conllDoc), Seq(rawPredClustering), Seq(goldClustering), false); 22 | CorefConllScorer.processConllString(summary, true); 23 | } 24 | 25 | def score(conllDocs: Seq[ConllDoc], rawPredClusterings: Seq[OrderedClusteringBound], goldClusterings: Seq[OrderedClusteringBound], saveTempFiles: Boolean) = { 26 | val predClusterings = rawPredClusterings.map(_.postprocessForConll()); 27 | // var predFile = File.createTempFile("temp", ".conll"); 28 | val (predFile, goldFile) = if (Driver.conllOutputDir != "" && saveTempFiles) { 29 | val pFile = File.createTempFile("temp", ".conll", new File(Driver.conllOutputDir)); 30 | val gFile = new File(pFile.getAbsolutePath() + "-gold"); 31 | Logger.logss("PRED FILE: " + pFile.getAbsolutePath()); 32 | Logger.logss("GOLD FILE: " + gFile.getAbsolutePath()); 33 | Logger.logss("To score, 
run:"); 34 | Logger.logss("perl scorer.pl all " + gFile.getAbsolutePath() + " " + pFile.getAbsolutePath() + " none"); 35 | (pFile, gFile); 36 | } else { 37 | val pFile = File.createTempFile("temp", ".conll"); 38 | val gFile = new File(pFile.getAbsolutePath() + "-gold"); 39 | pFile.deleteOnExit(); 40 | gFile.deleteOnExit(); 41 | (pFile, gFile); 42 | } 43 | val predWriter = new PrintWriter(predFile); 44 | val goldWriter = new PrintWriter(goldFile); 45 | for (i <- 0 until conllDocs.size) { 46 | ConllDocWriter.writeDoc(predWriter, conllDocs(i), predClusterings(i)); 47 | ConllDocWriter.writeDoc(goldWriter, conllDocs(i), goldClusterings(i)); 48 | } 49 | // Flush and close the buffers 50 | predWriter.close(); 51 | goldWriter.close(); 52 | // Build and run the process for the CoNLL eval script script 53 | import scala.sys.process._ 54 | val output = Process(Seq(conllEvalScriptPath, "all", goldFile.getAbsolutePath(), predFile.getAbsolutePath(), "none")).lines; 55 | output.reduce(_ + "\n" + _); 56 | } 57 | } 58 | 59 | object CorefConllScorer { 60 | 61 | def processConllString(summary: String, renderSuffStats: Boolean) = { 62 | val pr = Pattern.compile("Coreference:.*\\(([0-9.]+) / ([0-9.]+)\\).*\\(([0-9.]+) / ([0-9.]+)\\)"); 63 | val prMatcher = pr.matcher(summary); 64 | var prCount = 0; 65 | var (mucPNum, mucPDenom, mucRNum, mucRDenom) = (0.0, 0.0, 0.0, 0.0); 66 | var (bcubPNum, bcubPDenom, bcubRNum, bcubRDenom) = (0.0, 0.0, 0.0, 0.0); 67 | var (ceafePNum, ceafePDenom, ceafeRNum, ceafeRDenom) = (0.0, 0.0, 0.0, 0.0); 68 | // Four matches: MUC, B-cubed, CEAFM, CEAFE (BLANC doesn't match because of different formatting) 69 | while (prMatcher.find()) { 70 | if (prCount == 0) { 71 | mucRNum = prMatcher.group(1).toDouble; 72 | mucRDenom = prMatcher.group(2).toDouble; 73 | mucPNum = prMatcher.group(3).toDouble; 74 | mucPDenom = prMatcher.group(4).toDouble; 75 | } 76 | if (prCount == 1) { 77 | bcubRNum = prMatcher.group(1).toDouble; 78 | bcubRDenom = prMatcher.group(2).toDouble; 79 | bcubPNum = prMatcher.group(3).toDouble; 80 | bcubPDenom = prMatcher.group(4).toDouble; 81 | } 82 | if (prCount == 3) { 83 | ceafeRNum = prMatcher.group(1).toDouble; 84 | ceafeRDenom = prMatcher.group(2).toDouble; 85 | ceafePNum = prMatcher.group(3).toDouble; 86 | ceafePDenom = prMatcher.group(4).toDouble; 87 | } 88 | prCount += 1; 89 | } 90 | val mucP = mucPNum/mucPDenom * 100.0; 91 | val mucR = mucRNum/mucRDenom * 100.0; 92 | val mucF = 2 * mucP * mucR/(mucP + mucR); 93 | val bcubP = bcubPNum/bcubPDenom * 100.0; 94 | val bcubR = bcubRNum/bcubRDenom * 100.0; 95 | val bcubF = 2 * bcubP * bcubR/(bcubP + bcubR); 96 | val ceafeP = ceafePNum/ceafePDenom * 100.0; 97 | val ceafeR = ceafeRNum/ceafeRDenom * 100.0; 98 | val ceafeF = 2 * ceafeP * ceafeR/(ceafeP + ceafeR); 99 | val avg = (mucF + bcubF + ceafeF)/3.0; 100 | if (renderSuffStats) { 101 | "MUC/BCUB/CEAFE P/R N/D:\t" + mucPNum + "\t" + mucPDenom + "\t" + mucRNum + "\t" + mucRDenom + "\t" + bcubPNum + "\t" + bcubPDenom + "\t" + bcubRNum + "\t" + bcubRDenom + "\t" + ceafePNum + "\t" + ceafePDenom + "\t" + ceafeRNum + "\t" +ceafeRDenom; 102 | } else { 103 | "MUC P-R-F1, BCUB P-R-F1, CEAFE P-R-F1, Average:\t" + fmt(mucP) + "\t" + fmt(mucR) + "\t" + fmt(mucF) + "\t" + fmt(bcubP) + "\t" + fmt(bcubR) + "\t" + fmt(bcubF) + "\t" + fmt(ceafeP) + "\t" + fmt(ceafeR) + "\t" + fmt(ceafeF) + "\t" + fmt(avg) + "\n" + 104 | "MUC = " + fmt(mucF) + ", BCUB = " + fmt(bcubF) + ", CEAFE = " + fmt(ceafeF) + ", AVG = " + fmt(avg); 105 | } 106 | } 107 | 108 | private def fmt(d: Double): String = { 109 
| val str = "" + (d + 0.005); 110 | str.substring(0, Math.min(str.length(), str.indexOf(".") + 3)); 111 | } 112 | 113 | def main(args: Array[String]) { 114 | import scala.sys.process._ 115 | val cmd = Seq("ls", "clean-data/"); 116 | println(cmd.lines.toIndexedSeq); 117 | } 118 | 119 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/CorefDoc.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import java.io.File 3 | 4 | import scala.collection.JavaConverters.asScalaBufferConverter 5 | import scala.collection.JavaConverters.mapAsScalaMapConverter 6 | import scala.collection.mutable.HashSet 7 | import scala.collection.mutable.ArrayBuffer 8 | import scala.collection.mutable.HashMap 9 | 10 | import edu.berkeley.nlp.coref.lang.Language 11 | import edu.berkeley.nlp.futile.syntax.Trees.PennTreeRenderer 12 | import edu.berkeley.nlp.futile.util.Counter 13 | import edu.berkeley.nlp.futile.util.Logger 14 | 15 | case class CorefDoc(val rawDoc: ConllDoc, 16 | val goldMentions: Seq[Mention], 17 | val goldClustering: OrderedClustering, 18 | val predMentions: Seq[Mention]) { 19 | 20 | var oraclePredOrderedClustering: OrderedClustering = null; 21 | 22 | def numPredMents = predMentions.size; 23 | 24 | /** 25 | * Determines and caches an "oracle predicted clustering." For each predicted mention: 26 | * --If that mention does not have a corresponding gold mention (start and end indices match): 27 | * --Put the current mention in its own cluster. 28 | * --If that mention does have a corresponding gold mention: 29 | * --Fetch that mention's antecedents (if any) 30 | * --Choose the first with a corresponding predicted mention (if any) 31 | * --Assign this mention as the current mention's parent. 32 | */ 33 | def getOraclePredClustering = { 34 | if (oraclePredOrderedClustering == null) { 35 | val predToGoldIdxMap = new HashMap[Int,Int](); 36 | val goldToPredIdxMap = new HashMap[Int,Int](); 37 | for (pIdx <- 0 until predMentions.size) { 38 | for (gIdx <- 0 until goldMentions.size) { 39 | val pMent = predMentions(pIdx); 40 | val gMent = goldMentions(gIdx); 41 | if (pMent.sentIdx == gMent.sentIdx && pMent.startIdx == gMent.startIdx && pMent.endIdx == gMent.endIdx) { 42 | predToGoldIdxMap.put(pIdx, gIdx); 43 | goldToPredIdxMap.put(gIdx, pIdx); 44 | } 45 | } 46 | } 47 | val oracleClusterIds = new ArrayBuffer[Int]; 48 | var nextClusterId = 0; 49 | for (predIdx <- 0 until predMentions.size) { 50 | // Fetch the parent 51 | var parent = -1; 52 | if (predToGoldIdxMap.contains(predIdx)) { 53 | val correspondingGoldIdx = predToGoldIdxMap(predIdx); 54 | // Find the antecedents of the corresponding gold mention 55 | val goldAntecedentIdxs = goldClustering.getAllAntecedents(correspondingGoldIdx); 56 | // For each one, do a weird data sanitizing check, then try to find a corresponding 57 | // predicted mention to use as the predicted parent 58 | for (goldAntecedentIdx <- goldAntecedentIdxs.reverse) { 59 | val correspondingGold = goldMentions(correspondingGoldIdx); 60 | val goldAntecedent = goldMentions(goldAntecedentIdx); 61 | // wsj_0990 has some duplicate gold mentions, need to handle these... 
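          // A gold antecedent whose span is identical to the current gold mention is one of these
          // duplicates and is skipped; otherwise its aligned predicted mention becomes the parent,
          // but only if it precedes the current mention (the monotonicity check below).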
62 | val sameMention = goldAntecedent.sentIdx == correspondingGold.sentIdx && goldAntecedent.startIdx == correspondingGold.startIdx && goldAntecedent.endIdx == correspondingGold.endIdx 63 | if (!sameMention && goldToPredIdxMap.contains(goldAntecedentIdx)) { 64 | val predAntecedentIdx = goldToPredIdxMap(goldAntecedentIdx) 65 | if (predAntecedentIdx >= predIdx) { 66 | val ment = predMentions(predIdx); 67 | val predAntecedent = predMentions(predAntecedentIdx); 68 | Logger.logss("Monotonicity violated:\n" + 69 | "Antecedent(" + predAntecedentIdx + "): " + predAntecedent.startIdx + " " + predAntecedent.endIdx + " " + predAntecedent.headIdx + "\n" + 70 | "Current(" + predMentions.indexOf(ment) + "): " + ment.startIdx + " " + ment.endIdx + " " + ment.headIdx + "\n" + 71 | "Gold antecedent(" + goldMentions.indexOf(goldAntecedent) + "): " + goldAntecedent.startIdx + " " + goldAntecedent.endIdx + " " + goldAntecedent.headIdx + "\n" + 72 | "Gold current(" + goldMentions.indexOf(correspondingGold) + "): " + correspondingGold.startIdx + " " + correspondingGold.endIdx + " " + correspondingGold.headIdx); 73 | Logger.logss("Setting parent to -1..."); 74 | parent = -1; 75 | } else { 76 | parent = predAntecedentIdx 77 | } 78 | } 79 | } 80 | } 81 | // Now compute the oracle cluster ID 82 | val clusterId = if (parent == -1) { 83 | nextClusterId += 1; 84 | nextClusterId - 1; 85 | } else { 86 | oracleClusterIds(parent); 87 | } 88 | oracleClusterIds += clusterId; 89 | } 90 | oraclePredOrderedClustering = OrderedClustering.createFromClusterIds(oracleClusterIds); 91 | } 92 | oraclePredOrderedClustering 93 | } 94 | } 95 | 96 | object CorefDoc { 97 | 98 | def checkGoldMentionRecall(docs: Seq[CorefDoc]) { 99 | var numGMs = docs.map(_.goldMentions.size).reduce(_ + _); 100 | val numPMs = docs.map(_.predMentions.size).reduce(_ + _); 101 | val numNomPMs = docs.map(doc => doc.predMentions.filter(_.mentionType == MentionType.NOMINAL).size).reduce(_ + _); 102 | val numPropPMs = docs.map(doc => doc.predMentions.filter(_.mentionType == MentionType.PROPER).size).reduce(_ + _); 103 | val numPronPMs = docs.map(doc => doc.predMentions.filter(_.mentionType == MentionType.PRONOMINAL).size).reduce(_ + _); 104 | var numGMsRecalled = 0; 105 | var numGMsUnrecalledNonConstituents = 0; 106 | for (doc <- docs; gm <- doc.goldMentions) { 107 | if (doc.predMentions.filter(pm => pm.startIdx == gm.startIdx && pm.endIdx == gm.endIdx).size >= 1) { 108 | numGMsRecalled += 1; 109 | } else { 110 | if (!doc.rawDoc.trees(gm.sentIdx).isConstituent(gm.startIdx, gm.endIdx)) { 111 | numGMsUnrecalledNonConstituents += 1; 112 | } 113 | } 114 | } 115 | Logger.logss("Detected " + numPMs + " predicted mentons (" + numNomPMs + " nominal, " + numPropPMs + " proper, " + numPronPMs + " pronominal), " + 116 | numGMsRecalled + " / " + numGMs + " = " + (numGMsRecalled.toDouble/numGMs) + " gold mentions recalled (" + numGMsUnrecalledNonConstituents + " missed ones are not constituents)") 117 | } 118 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/Decoder.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import scala.collection.mutable.ArrayBuffer 3 | import edu.berkeley.nlp.futile.util.Logger 4 | 5 | object Decoder { 6 | 7 | def decodeMax(docGraph: DocumentGraph, probFcn: Int => Array[Double]): Array[Int] = { 8 | val backpointers = new Array[Int](docGraph.size); 9 | for (i <- 0 until docGraph.size) { 10 | 
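      // For each mention i, greedily pick the antecedent j in [0, i] with the highest score under
      // probFcn; picking j == i is the self-link case, i.e. mention i starts a new cluster.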
val allProbs = probFcn(i); 11 | var bestIdx = -1; 12 | var bestProb = Double.NegativeInfinity; 13 | for (j <- 0 to i) { 14 | val currProb = allProbs(j); 15 | if (bestIdx == -1 || currProb > bestProb) { 16 | bestIdx = j; 17 | bestProb = currProb; 18 | } 19 | } 20 | backpointers(i) = bestIdx; 21 | } 22 | backpointers; 23 | } 24 | 25 | def decodeLeftToRightMarginalize(docGraph: DocumentGraph, probFcn: Int => Array[Double]): Array[Int] = { 26 | val clustersSoFar = new ArrayBuffer[ArrayBuffer[Int]](); 27 | val backpointers = new Array[Int](docGraph.size); 28 | for (i <- 0 until docGraph.size) { 29 | val allProbs = probFcn(i); 30 | val clusterProbs = clustersSoFar.map(_.foldLeft(0.0)((total, mentIdx) => total + allProbs(mentIdx))); 31 | // Logger.logss("All probs: " + allProbs.toSeq.zipWithIndex); 32 | // Logger.logss("Clusters so far: " + clustersSoFar); 33 | // Logger.logss("Cluster probs: " + clusterProbs.zipWithIndex); 34 | // Just a sanity-check, should return the same clusters as the max method 35 | // val clusterProbs = clustersSoFar.map(_.foldLeft(0.0)((total, mentIdx) => Math.max(total, allProbs(mentIdx)))); 36 | val startNewProb = allProbs(i); 37 | val bestClusterProbAndIdx = clusterProbs.zipWithIndex.foldLeft((0.0, -1))((bestProbAndIdx, currProbAndIdx) => if (bestProbAndIdx._1 < currProbAndIdx._1) currProbAndIdx else bestProbAndIdx); 38 | if (startNewProb > bestClusterProbAndIdx._1) { 39 | backpointers(i) = i; 40 | clustersSoFar += ArrayBuffer(i); 41 | } else { 42 | backpointers(i) = clustersSoFar(bestClusterProbAndIdx._2).last; 43 | clustersSoFar(bestClusterProbAndIdx._2) += i; 44 | } 45 | } 46 | backpointers; 47 | } 48 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/DocumentInferencer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.util.Logger 3 | import edu.berkeley.nlp.futile.fig.basic.Indexer 4 | 5 | trait DocumentInferencer { 6 | 7 | def getInitialWeightVector(featureIndexer: Indexer[String]): Array[Double]; 8 | 9 | def computeLikelihood(docGraph: DocumentGraph, 10 | pairwiseScorer: PairwiseScorer, 11 | lossFcn: (CorefDoc, Int, Int) => Double): Double; 12 | 13 | def addUnregularizedStochasticGradient(docGraph: DocumentGraph, 14 | pairwiseScorer: PairwiseScorer, 15 | lossFcn: (CorefDoc, Int, Int) => Double, 16 | gradient: Array[Double]); 17 | 18 | def viterbiDecode(docGraph: DocumentGraph, 19 | pairwiseScorer: PairwiseScorer): Array[Int]; 20 | 21 | def finishPrintStats(); 22 | 23 | def viterbiDecodeAll(docGraphs: Seq[DocumentGraph], pairwiseScorer: PairwiseScorer): Array[Array[Int]] = { 24 | val allPredBackptrs = new Array[Array[Int]](docGraphs.size); 25 | for (i <- 0 until docGraphs.size) { 26 | val docGraph = docGraphs(i); 27 | Logger.logs("Decoding " + i); 28 | val predBackptrs = viterbiDecode(docGraph, pairwiseScorer); 29 | allPredBackptrs(i) = predBackptrs; 30 | } 31 | allPredBackptrs; 32 | } 33 | 34 | def viterbiDecodeAllFormClusterings(docGraphs: Seq[DocumentGraph], pairwiseScorer: PairwiseScorer): (Array[Array[Int]], Array[OrderedClustering]) = { 35 | val allPredBackptrs = viterbiDecodeAll(docGraphs, pairwiseScorer); 36 | val allPredClusteringsSeq = (0 until docGraphs.size).map(i => OrderedClustering.createFromBackpointers(allPredBackptrs(i))); 37 | (allPredBackptrs, allPredClusteringsSeq.toArray) 38 | } 39 | } 
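A small usage note on the trait above: WriteCoNLLPreds.sh (see modifiedBCS/ above) expects back-pointer files with one line of space-separated antecedent indices per document. The sketch below, which is hypothetical and not part of BCS, shows how any DocumentInferencer could dump its decode output in that layout; the inferencer, document graphs, and scorer are assumed to have been constructed elsewhere (e.g. via CorefSystem).

```scala
// Hypothetical helper: decode each document and write one line of
// space-separated antecedent indices per document, the format that
// WriteCoNLLPreds.sh parses back in.
def dumpBackpointers(inferencer: DocumentInferencer,
                     docGraphs: Seq[DocumentGraph],
                     scorer: PairwiseScorer,
                     outPath: String) {
  val writer = new java.io.PrintWriter(outPath);
  val allBackptrs = inferencer.viterbiDecodeAll(docGraphs, scorer);
  for (backptrs <- allBackptrs) {
    writer.println(backptrs.mkString(" "));
  }
  writer.close();
}
```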
-------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/DocumentInferencerBasic.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.fig.basic.Indexer 3 | 4 | class DocumentInferencerBasic extends DocumentInferencer { 5 | 6 | def getInitialWeightVector(featureIndexer: Indexer[String]): Array[Double] = Array.fill(featureIndexer.size())(0.0); 7 | 8 | /** 9 | * N.B. always returns a reference to the same matrix, so don't call twice in a row and 10 | * attempt to use the results of both computations 11 | */ 12 | private def computeMarginals(docGraph: DocumentGraph, 13 | gold: Boolean, 14 | lossFcn: (CorefDoc, Int, Int) => Double, 15 | pairwiseScorer: PairwiseScorer): Array[Array[Double]] = { 16 | computeMarginals(docGraph, gold, lossFcn, docGraph.featurizeIndexAndScoreNonPrunedUseCache(pairwiseScorer)._2) 17 | } 18 | 19 | private def computeMarginals(docGraph: DocumentGraph, 20 | gold: Boolean, 21 | lossFcn: (CorefDoc, Int, Int) => Double, 22 | scoresChart: Array[Array[Double]]): Array[Array[Double]] = { 23 | // var marginals = new Array[Array[Double]](docGraph.doc.predMentions.size()); 24 | // for (i <- 0 until marginals.size) { 25 | // marginals(i) = Array.fill(i+1)(Double.NegativeInfinity); 26 | // } 27 | val marginals = docGraph.cachedMarginalMatrix; 28 | for (i <- 0 until docGraph.size) { 29 | var normalizer = 0.0; 30 | // Restrict to gold antecedents if we're doing gold, but don't load the gold antecedents 31 | // if we're not. 32 | val goldAntecedents: Seq[Int] = if (gold) docGraph.getGoldAntecedentsUnderCurrentPruning(i) else null; 33 | for (j <- 0 to i) { 34 | // If this is a legal antecedent 35 | if (!docGraph.isPruned(i, j) && (!gold || goldAntecedents.contains(j))) { 36 | // N.B. Including lossFcn is okay even for gold because it should be zero 37 | val unnormalizedProb = Math.exp(scoresChart(i)(j) + lossFcn(docGraph.corefDoc, i, j)); 38 | marginals(i)(j) = unnormalizedProb; 39 | normalizer += unnormalizedProb; 40 | } else { 41 | marginals(i)(j) = 0.0; 42 | } 43 | } 44 | for (j <- 0 to i) { 45 | marginals(i)(j) /= normalizer; 46 | } 47 | } 48 | marginals; 49 | } 50 | 51 | def computeLikelihood(docGraph: DocumentGraph, 52 | pairwiseScorer: PairwiseScorer, 53 | lossFcn: (CorefDoc, Int, Int) => Double): Double = { 54 | var likelihood = 0.0; 55 | val marginals = computeMarginals(docGraph, false, lossFcn, pairwiseScorer); 56 | for (i <- 0 until docGraph.size) { 57 | val goldAntecedents = docGraph.getGoldAntecedentsUnderCurrentPruning(i); 58 | var currProb = 0.0; 59 | for (j <- goldAntecedents) { 60 | currProb += marginals(i)(j); 61 | } 62 | var currLogProb = Math.log(currProb); 63 | if (currLogProb.isInfinite()) { 64 | currLogProb = -30; 65 | } 66 | likelihood += currLogProb; 67 | } 68 | likelihood; 69 | } 70 | 71 | def addUnregularizedStochasticGradient(docGraph: DocumentGraph, 72 | pairwiseScorer: PairwiseScorer, 73 | lossFcn: (CorefDoc, Int, Int) => Double, 74 | gradient: Array[Double]) = { 75 | val (featsChart, scoresChart) = docGraph.featurizeIndexAndScoreNonPrunedUseCache(pairwiseScorer); 76 | // N.B. 
Can't have pred marginals and gold marginals around at the same time because 77 | // they both live in the same cached matrix 78 | val predMarginals = this.computeMarginals(docGraph, false, lossFcn, scoresChart); 79 | for (i <- 0 until docGraph.size) { 80 | for (j <- 0 to i) { 81 | if (predMarginals(i)(j) > 1e-20) { 82 | addToGradient(featsChart(i)(j), -predMarginals(i)(j), gradient); 83 | } 84 | } 85 | } 86 | val goldMarginals = this.computeMarginals(docGraph, true, lossFcn, scoresChart); 87 | for (i <- 0 until docGraph.size) { 88 | for (j <- 0 to i) { 89 | if (goldMarginals(i)(j) > 1e-20) { 90 | addToGradient(featsChart(i)(j), goldMarginals(i)(j), gradient); 91 | } 92 | } 93 | } 94 | } 95 | 96 | private def addToGradient(feats: Seq[Int], scale: Double, gradient: Array[Double]) { 97 | var i = 0; 98 | while (i < feats.size) { 99 | val feat = feats(i); 100 | gradient(feat) += 1.0 * scale; 101 | i += 1; 102 | } 103 | } 104 | 105 | def viterbiDecode(docGraph: DocumentGraph, scorer: PairwiseScorer): Array[Int] = { 106 | val (featsChart, scoresChart) = docGraph.featurizeIndexAndScoreNonPrunedUseCache(scorer); 107 | if (Driver.decodeType == "sum") { 108 | val backptrs = Decoder.decodeLeftToRightMarginalize(docGraph, (idx: Int) => { 109 | val probs = scoresChart(idx); 110 | GUtil.expAndNormalizeiHard(probs); 111 | probs; 112 | }); 113 | backptrs; 114 | } else { 115 | val backptrs = Decoder.decodeMax(docGraph, (idx: Int) => { 116 | val probs = scoresChart(idx); 117 | GUtil.expAndNormalizeiHard(probs); 118 | probs; 119 | }); 120 | backptrs; 121 | } 122 | } 123 | 124 | def finishPrintStats() = {} 125 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/DocumentInferencerBinary.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import scala.collection.mutable.HashMap 3 | 4 | import edu.berkeley.nlp.futile.fig.basic.Indexer 5 | 6 | // TODO: Tune both of these, also try out some subsampling/reweighting approaches 7 | class DocumentInferencerBinary(val logThreshold: Double, 8 | val clusterType: String, 9 | val negativeClassWeight: Double) extends DocumentInferencer { 10 | 11 | def getInitialWeightVector(featureIndexer: Indexer[String]): Array[Double] = Array.fill(featureIndexer.size())(0.0); 12 | 13 | private def subsample(docGraph: DocumentGraph, i: Int): Seq[Int] = { 14 | (0 until i); 15 | } 16 | 17 | def computeLikelihood(docGraph: DocumentGraph, 18 | pairwiseScorer: PairwiseScorer, 19 | lossFcn: (CorefDoc, Int, Int) => Double): Double = { 20 | val (featsChart, scoresChart) = docGraph.featurizeIndexAndScoreNonPrunedUseCache(pairwiseScorer); 21 | var likelihood = 0.0; 22 | for (i <- 0 until docGraph.size) { 23 | for (j <- subsample(docGraph, i)) { 24 | val pos = docGraph.isGoldNoPruning(i, j); 25 | var increment = if (pos) { 26 | scoresChart(i)(j) - Math.log(1 + Math.exp(scoresChart(i)(j))) 27 | } else { 28 | negativeClassWeight * -Math.log(1 + Math.exp(scoresChart(i)(j))); 29 | } 30 | if (increment.isNegInfinity) { 31 | increment = -30; 32 | } 33 | likelihood += increment; 34 | } 35 | } 36 | likelihood; 37 | } 38 | 39 | def addUnregularizedStochasticGradient(docGraph: DocumentGraph, 40 | pairwiseScorer: PairwiseScorer, 41 | lossFcn: (CorefDoc, Int, Int) => Double, 42 | gradient: Array[Double]) = { 43 | val (featsChart, scoresChart) = docGraph.featurizeIndexAndScoreNonPrunedUseCache(pairwiseScorer); 44 | for (i <- 0 until docGraph.size) { 45 | 
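      // Per-pair logistic gradient: with sigma(s) = exp(s) / (1 + exp(s)), gold pairs contribute
      // (1 - sigma(s)) times their features, while non-gold pairs contribute
      // -negativeClassWeight * sigma(s) times their features (see the addToGradient calls below).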
for (j <- subsample(docGraph, i)) { 46 | val expedScore = Math.exp(scoresChart(i)(j)); 47 | if (docGraph.isGoldNoPruning(i, j)) { 48 | addToGradient(featsChart(i)(j), 1.0 - expedScore/(1.0 + expedScore), gradient); 49 | } else { 50 | addToGradient(featsChart(i)(j), negativeClassWeight * -expedScore/(1.0 + expedScore), gradient); 51 | } 52 | } 53 | } 54 | } 55 | 56 | private def addToGradient(feats: Seq[Int], scale: Double, gradient: Array[Double]) { 57 | var i = 0; 58 | while (i < feats.size) { 59 | val feat = feats(i); 60 | gradient(feat) += 1.0 * scale; 61 | i += 1; 62 | } 63 | } 64 | 65 | def viterbiDecode(docGraph: DocumentGraph, scorer: PairwiseScorer): Array[Int] = { 66 | val (featsChart, scoresChart) = docGraph.featurizeIndexAndScoreNonPrunedUseCache(scorer); 67 | clusterType match { 68 | case "CLOSEST_FIRST" => { 69 | (0 until docGraph.size).map(i => { 70 | var nearest = i; 71 | for (j <- i-1 to 0 by -1) { 72 | if (nearest == i && scoresChart(i)(j) > logThreshold) { 73 | nearest = j; 74 | } 75 | } 76 | nearest; 77 | }).toArray; 78 | } 79 | case "BEST_FIRST" => { 80 | (0 until docGraph.size).map(i => { 81 | var best = i; 82 | var bestScore = Double.NegativeInfinity; 83 | for (j <- i-1 to 0 by -1) { 84 | if (scoresChart(i)(j) > logThreshold && scoresChart(i)(j) > bestScore) { 85 | best = j; 86 | bestScore = scoresChart(i)(j); 87 | } 88 | } 89 | best; 90 | }).toArray; 91 | } 92 | case _ => { // TRANSITIVE_CLOSURE 93 | var mapping = new HashMap[Int,Int](); 94 | var nextClusterIndex = 0; 95 | for (i <- 0 until docGraph.size) { 96 | var edgeAlreadyFound = false; 97 | for (j <- 0 until i) { 98 | if (scoresChart(i)(j) > logThreshold) { 99 | var antecedentCluster = mapping(j); 100 | // Merge the two 101 | if (edgeAlreadyFound && antecedentCluster != mapping(i)) { 102 | var newCluster = mapping(i); 103 | for (mentIdx <- mapping.keySet) { 104 | if (mapping(mentIdx) == antecedentCluster) { 105 | mapping(mentIdx) = newCluster; 106 | } 107 | } 108 | } else { 109 | edgeAlreadyFound = true; 110 | mapping(i) = antecedentCluster; 111 | } 112 | } 113 | } 114 | if (!edgeAlreadyFound) { 115 | mapping(i) = nextClusterIndex; 116 | nextClusterIndex += 1; 117 | } 118 | } 119 | (0 until docGraph.size).map(i => { 120 | var backptr = i; 121 | for (j <- 0 until i) { 122 | if (mapping(j) == mapping(i)) { 123 | backptr = j; 124 | } 125 | } 126 | backptr; 127 | }).toArray; 128 | } 129 | } 130 | } 131 | 132 | def finishPrintStats() = {} 133 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/DocumentInferencerOracle.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.fig.basic.Indexer 3 | 4 | class DocumentInferencerOracle extends DocumentInferencer { 5 | 6 | def getInitialWeightVector(featureIndexer: Indexer[String]): Array[Double] = Array.fill(featureIndexer.size())(0.0); 7 | 8 | def computeLikelihood(docGraph: DocumentGraph, 9 | pairwiseScorer: PairwiseScorer, 10 | lossFcn: (CorefDoc, Int, Int) => Double) = { 11 | 0.0; 12 | } 13 | 14 | def addUnregularizedStochasticGradient(docGraph: DocumentGraph, 15 | pairwiseScorer: PairwiseScorer, 16 | lossFcn: (CorefDoc, Int, Int) => Double, 17 | gradient: Array[Double]) = { 18 | } 19 | 20 | def viterbiDecode(docGraph: DocumentGraph, 21 | pairwiseScorer: PairwiseScorer): Array[Int] = { 22 | val clustering = docGraph.getOraclePredClustering(); 23 | val resultSeq = for (i <- 0 until 
docGraph.size) yield { 24 | val immediateAntecedentOrMinus1 = clustering.getImmediateAntecedent(i); 25 | if (immediateAntecedentOrMinus1 == -1) { 26 | i; 27 | } else { 28 | docGraph.getMentions.indexOf(immediateAntecedentOrMinus1); 29 | } 30 | } 31 | resultSeq.toArray; 32 | } 33 | 34 | def finishPrintStats() = {} 35 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/Feature.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | 3 | case class Feature(context: String, event: String, value: Double, basic: Boolean) { 4 | val name = context + " >> " + event; 5 | val contextAndTemplate = context + ":" + (if (basic) "basic" else "conj"); 6 | }; 7 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/GUtil.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.util.Counter 3 | import edu.berkeley.nlp.futile.util.Iterators 4 | import scala.collection.mutable.ArrayBuffer 5 | import scala.collection.JavaConverters._ 6 | import edu.berkeley.nlp.futile.math.SloppyMath 7 | import scala.util.Sorting 8 | import java.util.Collection 9 | 10 | object GUtil { 11 | 12 | def fmt(mat: Array[Array[Double]]): String = { 13 | var str = ""; 14 | for (i <- 0 until mat.size) { 15 | for (j <- 0 until mat(i).size) { 16 | str += GUtil.fmt(mat(i)(j)) + "\t"; 17 | } 18 | str += "\n"; 19 | } 20 | str; 21 | } 22 | 23 | // def fmt(col: Collection[Double]): String = { 24 | // if (col.size == 0) { 25 | // "[]" 26 | // } else { 27 | // "[" + col.foldLeft("")((curr, nextD) => curr + fmt(nextD) + ", ").dropRight(2) + "]"; 28 | // } 29 | // } 30 | 31 | def fmt(d: Double): String = { 32 | if (d.isNaN) { 33 | "NaN"; 34 | } else if (d.isPosInfinity) { 35 | "+Inf"; 36 | } else if (d.isNegInfinity) { 37 | "-Inf"; 38 | } else { 39 | if (d < 0) "-" + fmtPositiveNumber(-d) else fmtPositiveNumber(d); 40 | } 41 | } 42 | 43 | def fmtProb(d: Double): String = { 44 | fmtPositiveNumber(d); 45 | } 46 | 47 | def fmtPositiveNumber(d: Double): String = { 48 | require(d >= 0); 49 | if (d == 0) { 50 | "0"; 51 | } 52 | if (d < 1e-20) { 53 | "tiny" 54 | } else if (d < 0.001) { 55 | val numPlacesToMove = Math.ceil(-Math.log(d)/Math.log(10)).toInt; 56 | "%1.1f".format(d * Math.pow(10, numPlacesToMove)) + "e-" + numPlacesToMove; 57 | } else if (d < 10000) { 58 | "%1.3f".format(d); 59 | } else { 60 | val numPlacesToMove = Math.floor(Math.log(d)/Math.log(10)).toInt; 61 | "%1.1f".format(d / Math.pow(10, numPlacesToMove)) + "e" + numPlacesToMove; 62 | } 63 | } 64 | 65 | def fmtTwoDigitNumber(d: Double, numDecimalPlaces: Int): String = { 66 | ("%1." 
+ numDecimalPlaces + "f").format(d); 67 | } 68 | 69 | def containsNaN(array: Array[Double]): Boolean = { 70 | var containsNaN = false; 71 | for (value <- array) { 72 | containsNaN = containsNaN || value.isNaN; 73 | } 74 | containsNaN; 75 | } 76 | 77 | def containsNaNOrNegInf(array: Array[Double]): Boolean = { 78 | var bad = false; 79 | for (value <- array) { 80 | bad = bad || value.isNaN || value.isNegInfinity; 81 | } 82 | bad; 83 | } 84 | 85 | def getNBest[A](stuff: Seq[A], scorer: (A) => Double, n: Int): Seq[(A, Double)] = { 86 | val counter = new Counter[A](); 87 | for (thing <- stuff) { 88 | counter.setCount(thing, scorer(thing)); 89 | } 90 | val results = new ArrayBuffer[(A, Double)](); 91 | for (thing <- Iterators.able(counter.asPriorityQueue()).asScala) { 92 | if (results.size < n) { 93 | results += new Tuple2(thing, counter.getCount(thing)); 94 | } 95 | } 96 | results; 97 | } 98 | 99 | def getTopNKeysSubCounter(counter: Counter[String], n: Int) = { 100 | val newCounter = new Counter[String](); 101 | val pq = counter.asPriorityQueue() 102 | var numPrinted = 0; 103 | while (pq.hasNext() && numPrinted < n) { 104 | val obj = pq.next(); 105 | newCounter.setCount(obj, counter.getCount(obj)); 106 | numPrinted += 1; 107 | } 108 | newCounter; 109 | } 110 | 111 | def normalizeiSoft(arr: Array[Double]): Boolean = { 112 | var idx = 0; 113 | var total = 0.0; 114 | while (idx < arr.size) { 115 | total += arr(idx); 116 | idx += 1; 117 | } 118 | if (total <= 0.0) { 119 | false; 120 | } else { 121 | idx = 0; 122 | while (idx < arr.size) { 123 | arr(idx) /= total; 124 | idx += 1; 125 | } 126 | true; 127 | } 128 | } 129 | 130 | def normalizeiHard(arr: Array[Double]) { 131 | var idx = 0; 132 | var total = 0.0; 133 | while (idx < arr.size) { 134 | total += arr(idx); 135 | idx += 1; 136 | } 137 | if (total <= 0.0) { 138 | throw new RuntimeException("Bad total for normalizing: " + total); 139 | } 140 | idx = 0; 141 | while (idx < arr.size) { 142 | arr(idx) /= total; 143 | idx += 1; 144 | } 145 | } 146 | 147 | def expAndNormalizeiHard(arr: Array[Double]) { 148 | var idx = 0; 149 | while (idx < arr.size) { 150 | arr(idx) = Math.exp(arr(idx)); 151 | idx += 1; 152 | } 153 | normalizeiHard(arr); 154 | } 155 | 156 | def renderMat[A](mat: Array[Array[A]]): String = { 157 | mat.map(row => row.map(_.toString).reduce((c1, c2) => c1 + ", " + c2)).reduce((r1, r2) => r1 + "\n" + r2); 158 | } 159 | 160 | def normalizei(vector: Array[Double]) { 161 | val normalizer = vector.reduce(_ + _); 162 | for (i <- 0 until vector.size) { 163 | vector(i) /= normalizer; 164 | } 165 | } 166 | 167 | def logNormalizei(vector: Array[Double]) { 168 | val normalizer = SloppyMath.logAdd(vector); 169 | for (i <- 0 until vector.size) { 170 | vector(i) -= normalizer; 171 | } 172 | } 173 | 174 | def logNormalizeiByRow(mat: Array[Array[Double]]) { 175 | for (i <- 0 until mat.size) { 176 | val normalizer = SloppyMath.logAdd(mat(i)); 177 | for (j <- 0 until mat(i).size) { 178 | mat(i)(j) -= normalizer; 179 | } 180 | } 181 | } 182 | 183 | def computeQuantile(nums: Array[Double], quantile: Double): Double = { 184 | val numsCpy = new Array[Double](nums.size); 185 | Array.copy(nums, 0, numsCpy, 0, nums.size); 186 | Sorting.quickSort(numsCpy); 187 | numsCpy((quantile * nums.size).toInt); 188 | } 189 | 190 | def main(args: Array[String]) { 191 | println(fmtProb(1.0)); 192 | println(fmtProb(0.01)); 193 | println(fmtProb(0.001)); 194 | println(fmtProb(0.0001)); 195 | println(fmtProb(0.00001)); 196 | println(fmtProb(0.000001)); 197 | 
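// Quick illustration of the in-place normalizers defined above (input values chosen arbitrarily):
// exponentiate-and-normalize turns log-scores into a distribution, so (0, log 3) becomes (0.25, 0.75).
val logScores = Array(0.0, Math.log(3.0));
expAndNormalizeiHard(logScores);
println(logScores.toSeq);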
println(fmtProb(0.0000001)); 198 | 199 | println(fmtProb(0.000000000000000000000001)); 200 | } 201 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/Gender.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref; 2 | 3 | public enum Gender { 4 | MALE, FEMALE, NEUTRAL, UNKNOWN; 5 | } 6 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/MentionPropertyComputer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | 3 | class MentionPropertyComputer(val ngComputer: NumberGenderComputer) { 4 | 5 | 6 | 7 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/MentionType.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref; 2 | 3 | 4 | public enum MentionType { 5 | 6 | PROPER(false), NOMINAL(false), PRONOMINAL(true), DEMONSTRATIVE(true); 7 | 8 | private boolean isClosedClass; 9 | 10 | private MentionType(boolean isClosedClass) { 11 | this.isClosedClass = isClosedClass; 12 | } 13 | 14 | public boolean isClosedClass() { 15 | return isClosedClass; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/Number.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref; 2 | 3 | 4 | public enum Number { 5 | SINGULAR, PLURAL, UNKNOWN; 6 | } 7 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/OraclePosteriorSampler.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.fig.prob.Dirichlet 3 | 4 | object OraclePosteriorSampler { 5 | 6 | def sample(alphas: Array[Double], rng: java.util.Random): Array[Double] = { 7 | new Dirichlet(alphas).sample(rng); 8 | } 9 | 10 | def randomPosterior(domainSize: Int, specialIndex: Int, rng: java.util.Random): Array[Double] = { 11 | val baseAlpha = 1.0; 12 | val specialAlpha = if (domainSize == 2) { 13 | 2.1 14 | } else if (domainSize == 5) { 15 | 3.5 16 | } else { 17 | throw new RuntimeException("Domain size " + domainSize + " doesn't have fitparameters"); 18 | } 19 | val alphas = Array.fill(domainSize)(baseAlpha); 20 | alphas(specialIndex) = specialAlpha; 21 | sample(alphas, rng); 22 | } 23 | 24 | def main(args: Array[String]) { 25 | val rng = new java.util.Random(0); 26 | // val alpha = 0.1; 27 | // val specialAlpha = 0.3; 28 | { 29 | // val alpha = 0.4; 30 | // val specialAlpha = 1.0; 31 | val alpha = 1.0; 32 | val specialAlpha = 2.1; 33 | val totalSamples = 1000; 34 | var numInversions = 0; 35 | var totalInverted = 0.0; 36 | var totalNoninverted = 0.0; 37 | for (i <- 0 until totalSamples) { 38 | val currSample = sample(Array(specialAlpha, alpha), rng).toSeq; 39 | val max = currSample.reduce(Math.max(_, _)); 40 | if (currSample(0) < max - 1e-8) { 41 | numInversions += 1; 42 | totalInverted += max; 43 | } else { 44 | totalNoninverted += max; 45 | } 46 | } 47 | println("Domain size 2"); 48 | println("Num inversions: " + numInversions + "/" + totalSamples); 49 | println("Avg max if not 
inverted: " + totalNoninverted/(totalSamples - numInversions)); 50 | println("Avg max if inverted: " + totalInverted/numInversions); 51 | } 52 | 53 | { 54 | // val alpha = 0.4; 55 | // val specialAlpha = 1.9; 56 | val alpha = 1.0; 57 | val specialAlpha = 3.5; 58 | val totalSamples = 1000; 59 | var numInversions = 0; 60 | var totalInverted = 0.0; 61 | var totalNoninverted = 0.0; 62 | for (i <- 0 until totalSamples) { 63 | val currSample = sample(Array(specialAlpha, alpha, alpha, alpha, alpha), rng).toSeq; 64 | val max = currSample.reduce(Math.max(_, _)); 65 | if (currSample(0) < max - 1e-8) { 66 | numInversions += 1; 67 | totalInverted += max; 68 | } else { 69 | totalNoninverted += max; 70 | } 71 | } 72 | println("Domain size 5"); 73 | println("Num inversions: " + numInversions + "/" + totalSamples); 74 | println("Avg max if not inverted: " + totalNoninverted/(totalSamples - numInversions)); 75 | println("Avg max if inverted: " + totalInverted/numInversions); 76 | } 77 | } 78 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/OrderedClustering.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import scala.collection.mutable.HashMap 3 | import scala.collection.JavaConverters._ 4 | import scala.collection.mutable.ArrayBuffer 5 | 6 | class OrderedClustering(val clusters: Seq[Seq[Int]]) { 7 | // Elements must be consecutive integers from 0 up to n 8 | private val allIndicesSorted = clusters.foldLeft(new ArrayBuffer[Int])(_ ++ _).sorted; 9 | require(allIndicesSorted.sameElements((0 until allIndicesSorted.size).toSeq), allIndicesSorted); 10 | private val mentionToClusterMap = new HashMap[Int,Seq[Int]]; 11 | for (cluster <- clusters) { 12 | for (i <- cluster) { 13 | mentionToClusterMap.put(i, cluster); 14 | } 15 | } 16 | 17 | def getCluster(idx: Int) = mentionToClusterMap(idx); 18 | 19 | def isSingleton(idx: Int) = mentionToClusterMap(idx).size == 1; 20 | 21 | def startsCluster(idx: Int) = mentionToClusterMap(idx)(0) == idx; 22 | 23 | def areInSameCluster(idx1: Int, idx2: Int) = mentionToClusterMap(idx1).contains(idx2); 24 | 25 | def getImmediateAntecedent(idx: Int) = { 26 | val cluster = mentionToClusterMap(idx); 27 | val mentIdxInCluster = cluster.indexOf(idx); 28 | if (mentIdxInCluster == 0) { 29 | -1 30 | } else { 31 | cluster(mentIdxInCluster - 1); 32 | } 33 | } 34 | 35 | def getAllAntecedents(idx: Int) = { 36 | val cluster = mentionToClusterMap(idx); 37 | cluster.slice(0, cluster.indexOf(idx)); 38 | } 39 | 40 | def getAllConsequents(idx: Int) = { 41 | val cluster = mentionToClusterMap(idx); 42 | cluster.slice(cluster.indexOf(idx) + 1, cluster.size); 43 | } 44 | 45 | 46 | // Needed for output printing 47 | def getClusterIdx(idx: Int) = { 48 | var clusterIdx = 0; 49 | for (i <- 0 until clusters.size) { 50 | if (clusters(i).sameElements(mentionToClusterMap(idx))) { 51 | clusterIdx = i; 52 | } 53 | } 54 | clusterIdx; 55 | } 56 | 57 | def getSubclustering(mentIdxsToKeep: Seq[Int]): OrderedClustering = { 58 | val oldIndicesToNewIndicesMap = new HashMap[Int,Int](); 59 | (0 until mentIdxsToKeep.size).map(i => oldIndicesToNewIndicesMap.put(mentIdxsToKeep(i), i)); 60 | val filteredConvertedClusters = clusters.map(cluster => cluster.filter(mentIdxsToKeep.contains(_)).map(mentIdx => oldIndicesToNewIndicesMap(mentIdx))); 61 | val filteredConvertedClustersNoEmpties = filteredConvertedClusters.filter(cluster => !cluster.isEmpty); 62 | new 
OrderedClustering(filteredConvertedClustersNoEmpties); 63 | } 64 | } 65 | 66 | object OrderedClustering { 67 | 68 | def createFromClusterIds(clusterIds: Seq[Int]) = { 69 | val mentIdAndClusterId = (0 until clusterIds.size).map(i => (i, clusterIds(i))); 70 | val clustersUnsorted = mentIdAndClusterId.groupBy(_._2).values; 71 | val finalClusters = clustersUnsorted.toSeq.sortBy(_.head).map(clusterWithClusterId => clusterWithClusterId.map(_._1)); 72 | new OrderedClustering(finalClusters.toSeq); 73 | } 74 | 75 | def createFromBackpointers(backpointers: Seq[Int]) = { 76 | var nextClusterID = 0; 77 | val clusters = new ArrayBuffer[ArrayBuffer[Int]](); 78 | val mentionToCluster = new HashMap[Int,ArrayBuffer[Int]](); 79 | for (i <- 0 until backpointers.size) { 80 | if (backpointers(i) == i) { 81 | val cluster = ArrayBuffer(i); 82 | clusters += cluster; 83 | mentionToCluster.put(i, cluster); 84 | } else { 85 | val cluster = mentionToCluster(backpointers(i)); 86 | cluster += i; 87 | mentionToCluster.put(i, cluster); 88 | } 89 | } 90 | new OrderedClustering(clusters); 91 | } 92 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/OrderedClusteringBound.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import scala.collection.JavaConverters._ 3 | 4 | class OrderedClusteringBound(val ments: Seq[Mention], 5 | val clustering: OrderedClustering) { 6 | 7 | def postprocessForConll(): OrderedClusteringBound = { 8 | val mentIdxsToKeep = (0 until ments.size).filter(i => !clustering.isSingleton(i)); 9 | new OrderedClusteringBound(mentIdxsToKeep.map(i => ments(i)), clustering.getSubclustering(mentIdxsToKeep)); 10 | } 11 | 12 | def getClusterIdx(ment: Mention) = { 13 | clustering.getClusterIdx(ments.indexOf(ment)); 14 | } 15 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/PairwiseIndexingFeaturizer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.fig.basic.Indexer 3 | import edu.berkeley.nlp.futile.util.Counter 4 | import edu.berkeley.nlp.futile.util.Logger 5 | import scala.collection.JavaConverters._ 6 | import edu.berkeley.nlp.coref.sem.QueryCountsBundle 7 | 8 | trait PairwiseIndexingFeaturizer { 9 | 10 | def getIndexer(): Indexer[String]; 11 | 12 | def getIndex(feature: String, addToFeaturizer: Boolean): Int; 13 | 14 | def getQueryCountsBundle: QueryCountsBundle; 15 | 16 | def featurizeIndex(docGraph: DocumentGraph, currMentIdx: Int, antecedentIdx: Int, addToFeaturizer: Boolean): Seq[Int]; 17 | 18 | def printFeatureTemplateCounts() { 19 | val indexer = getIndexer(); 20 | val templateCounts = new Counter[String](); 21 | for (i <- 0 until indexer.size) { 22 | val currFeatureName = indexer.get(i); 23 | val currFeatureTemplateStop = currFeatureName.indexOf("="); 24 | if (currFeatureTemplateStop == -1) { 25 | Logger.logss("No =: " + currFeatureName); 26 | } else { 27 | templateCounts.incrementCount(currFeatureName.substring(0, currFeatureTemplateStop), 1.0); 28 | } 29 | } 30 | templateCounts.keepTopNKeys(200); 31 | if (templateCounts.size > 200) { 32 | Logger.logss("Not going to print more than 200 templates"); 33 | } 34 | templateCounts.keySet().asScala.toSeq.sorted.foreach(template => Logger.logss(template + ": " + templateCounts.getCount(template).toInt)); 35 | } 36 
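// Illustrative note on the contract of featurizeIndex: it returns the indices of every feature that
// fires on the pair (currMentIdx, antecedentIdx), with addToFeaturizer controlling whether unseen
// feature strings are assigned fresh indices. Scoring a pair (see PairwiseScorer later in this
// package) is then a dot product: the sum over the returned indices f of weights(f).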
| } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/PairwiseLossFunctions.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import edu.berkeley.nlp.futile.util.Logger 3 | 4 | object PairwiseLossFunctions { 5 | 6 | val noLoss = (doc: CorefDoc, ment: Int, ant: Int) => 0.0; 7 | 8 | val precisionLoss = (doc: CorefDoc, ment: Int, ant: Int) => { 9 | val oracleCluster = doc.getOraclePredClustering; 10 | // Only penalize if we predict a link and it's incorrect. N.B. because of our 11 | // conventions, ment == ant if we're predicting nonanaphoricity. 12 | if (!oracleCluster.areInSameCluster(ment, ant)) 1.0 else 0.0; 13 | }; 14 | 15 | val recallLoss = (doc: CorefDoc, ment: Int, ant: Int) => { 16 | val oracleCluster = doc.getOraclePredClustering; 17 | // Only penalize when we were supposed to make a prediction and we didn't or it 18 | // was wrong. 19 | if (!oracleCluster.startsCluster(ment) && !oracleCluster.areInSameCluster(ment, ant)) 1.0 else 0.0; 20 | }; 21 | 22 | // 1) Penalty when we link up someone who should start a new cluster (boosting this helps precision) 23 | // 2) Penalty when we start a new cluster with someone who should link up (boosting this helps recall) 24 | // 3) Penalty when we mess up a link 25 | val customLoss = (falseLinkScore: Double, falseNewScore: Double, wrongLinkScore: Double) => { 26 | (doc: CorefDoc, ment: Int, ant: Int) => { 27 | val oracleCluster = doc.getOraclePredClustering; 28 | if (oracleCluster.startsCluster(ment) && ment != ant) { 29 | falseLinkScore; 30 | } else if (!oracleCluster.startsCluster(ment) && ment == ant) { 31 | falseNewScore; 32 | } else if (!oracleCluster.startsCluster(ment) && !oracleCluster.areInSameCluster(ment, ant)) { 33 | wrongLinkScore; 34 | } else { 35 | 0.0; 36 | }; 37 | } 38 | }; 39 | 40 | // interpolationFactor interpolates between customLoss and a version of customLoss where 41 | // everything is weighted by the size of the gold cluster (should hypothetically be more 42 | // MUC-oriented than our current loss function) 43 | val weightedCustomLoss = (falseLinkScore: Double, falseNewScore: Double, wrongLinkScore: Double, interpolationFactor: Double) => { 44 | (doc: CorefDoc, ment: Int, ant: Int) => { 45 | val oracleCluster = doc.getOraclePredClustering; 46 | val oracleClusterSize = oracleCluster.getCluster(ment).size; 47 | val scalingFactor = (1 - interpolationFactor + interpolationFactor * oracleClusterSize); 48 | if (oracleCluster.startsCluster(ment) && ment != ant) { 49 | falseLinkScore * scalingFactor; 50 | } else if (!oracleCluster.startsCluster(ment) && ment == ant) { 51 | falseNewScore * scalingFactor; 52 | } else if (!oracleCluster.startsCluster(ment) && !oracleCluster.areInSameCluster(ment, ant)) { 53 | wrongLinkScore * scalingFactor; 54 | } else { 55 | 0.0; 56 | }; 57 | } 58 | } 59 | 60 | def apply(x: String) = getLossFcn(x); 61 | 62 | def getLossFcn(name: String): (CorefDoc, Int, Int) => Double = { 63 | if (name == "noLoss") { 64 | noLoss; 65 | } else if (name == "precisionLoss") { 66 | precisionLoss; 67 | } else if (name == "recallLoss") { 68 | recallLoss; 69 | } else if (name.startsWith("customLoss")) { 70 | val params = name.split("-"); 71 | require(params.size == 4); 72 | customLoss(params(1).toDouble, params(2).toDouble, params(3).toDouble); 73 | } else if (name.startsWith("weightedCustomLoss")) { 74 | val params = name.split("-"); 75 | require(params.size == 5); 
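// Example loss specifications (the numeric values here are purely illustrative, not defaults):
//   getLossFcn("customLoss-0.1-3.0-1.0")              -> falseLinkScore=0.1, falseNewScore=3.0, wrongLinkScore=1.0
//   getLossFcn("weightedCustomLoss-0.1-3.0-1.0-0.5")  -> the same three penalties, interpolated toward
//                                                        gold-cluster-size weighting with factor 0.5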
76 | weightedCustomLoss(params(1).toDouble, params(2).toDouble, params(3).toDouble, params(4).toDouble); 77 | } else { 78 | throw new RuntimeException("Unsupported"); 79 | } 80 | } 81 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/PairwiseScorer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | 3 | @SerialVersionUID(1L) 4 | class PairwiseScorer(val featurizer: PairwiseIndexingFeaturizer, val weights: Array[Double]) extends Serializable { 5 | 6 | def numWeights = weights.size 7 | 8 | def scoreIndexedFeats(feats: Seq[Int]): Double = { 9 | var featIdx = 0; 10 | var featTotal = 0.0; 11 | while (featIdx < feats.size) { 12 | featTotal += weights(feats(featIdx)); 13 | featIdx += 1; 14 | } 15 | featTotal; 16 | } 17 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/PronounDictionary.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | import scala.collection.mutable.HashMap 3 | 4 | object PronounDictionary { 5 | val firstPersonPronouns = Set("i", "me", "myself", "mine", "my", "we", "us", "ourself", "ourselves", "ours", "our"); 6 | val secondPersonPronouns = Set("you", "yourself", "yours", "your", "yourselves"); 7 | val thirdPersonPronouns = Set("he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's", "they", "them", "themself", "themselves", "theirs", "their", "they", "them", "'em", "themselves"); 8 | val otherPronouns = Set("who", "whom", "whose", "where", "when","which"); 9 | 10 | val demonstratives = Set("this", "that", "these", "those"); 11 | 12 | // Borrowed from Stanford 13 | val singularPronouns = Set("i", "me", "myself", "mine", "my", "yourself", "he", "him", "himself", "his", "she", "her", "herself", "hers", "her", "it", "itself", "its", "one", "oneself", "one's"); 14 | val pluralPronouns = Set("we", "us", "ourself", "ourselves", "ours", "our", "yourself", "yourselves", "they", "them", "themself", "themselves", "theirs", "their"); 15 | val malePronouns = Set("he", "him", "himself", "his"); 16 | val femalePronouns = Set("her", "hers", "herself", "she"); 17 | val neutralPronouns = Set("it", "its", "itself", "where", "here", "there", "which"); 18 | 19 | 20 | val allPronouns = firstPersonPronouns ++ secondPersonPronouns ++ thirdPersonPronouns ++ otherPronouns; 21 | 22 | // Constructed based on Stanford's Dictionaries class 23 | val canonicalizations = new HashMap[String,String](); 24 | canonicalizations.put("i", "i"); 25 | canonicalizations.put("me", "i"); 26 | canonicalizations.put("my", "i"); 27 | canonicalizations.put("myself", "i"); 28 | canonicalizations.put("mine", "i"); 29 | canonicalizations.put("you", "you"); 30 | canonicalizations.put("your", "you"); 31 | canonicalizations.put("yourself", "you"); 32 | canonicalizations.put("yourselves", "you"); 33 | canonicalizations.put("yours", "you"); 34 | canonicalizations.put("he", "he"); 35 | canonicalizations.put("him", "he"); 36 | canonicalizations.put("his", "he"); 37 | canonicalizations.put("himself", "he"); 38 | canonicalizations.put("she", "she"); 39 | canonicalizations.put("her", "she"); 40 | canonicalizations.put("herself", "she"); 41 | canonicalizations.put("hers", "she"); 42 | 43 | canonicalizations.put("we", "we"); 44 | canonicalizations.put("us", 
"we"); 45 | canonicalizations.put("our", "we"); 46 | canonicalizations.put("ourself", "we"); 47 | canonicalizations.put("ourselves", "we"); 48 | canonicalizations.put("ours", "we"); 49 | canonicalizations.put("they", "they"); 50 | canonicalizations.put("them", "they"); 51 | canonicalizations.put("their", "they"); 52 | canonicalizations.put("themself", "they"); 53 | canonicalizations.put("themselves", "they"); 54 | canonicalizations.put("theirs", "they"); 55 | canonicalizations.put("'em", "they"); 56 | canonicalizations.put("it", "it"); 57 | canonicalizations.put("itself", "it"); 58 | canonicalizations.put("its", "it"); 59 | canonicalizations.put("one", "one"); 60 | canonicalizations.put("oneself", "one"); 61 | canonicalizations.put("one's", "one"); 62 | 63 | canonicalizations.put("this", "this"); 64 | canonicalizations.put("that", "that"); 65 | canonicalizations.put("these", "these"); 66 | canonicalizations.put("those", "those"); 67 | canonicalizations.put("which", "which"); 68 | canonicalizations.put("who", "who"); 69 | canonicalizations.put("whom", "who"); 70 | // canonicalizations.put("where", "where"); 71 | // canonicalizations.put("whose", "whose"); 72 | // This entry is here just to make results consistent with earlier ones 73 | // on our very small dev set 74 | canonicalizations.put("thy", "thy"); 75 | canonicalizations.put("y'all", "you"); 76 | canonicalizations.put("you're", "you"); 77 | canonicalizations.put("you'll", "you"); 78 | canonicalizations.put("'s", "'s"); 79 | 80 | def isPronLc(str: String): Boolean = { 81 | allPronouns.contains(str.toLowerCase()); 82 | } 83 | 84 | def isDemonstrative(str: String): Boolean = { 85 | demonstratives.contains(str.toLowerCase()); 86 | } 87 | 88 | def canonicalize(str: String): String = { 89 | if (!canonicalizations.contains(str.toLowerCase())) { 90 | ""; 91 | } else { 92 | canonicalizations(str.toLowerCase()); 93 | } 94 | } 95 | 96 | def main(args: Array[String]) { 97 | println(PronounDictionary.canonicalizations("'em")); 98 | println(PronounDictionary.isPronLc("them")); 99 | println(PronounDictionary.isPronLc("Them")); 100 | println(PronounDictionary.isPronLc("NotThem")); 101 | } 102 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/PruningStrategy.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref 2 | 3 | case class PruningStrategy(val strategy: String) { 4 | 5 | def getDistanceArgs(): (Int, Int) = { 6 | require(strategy.startsWith("distance")); 7 | val splitStrategy = strategy.split(":"); 8 | (splitStrategy(1).toInt, splitStrategy(2).toInt); 9 | } 10 | 11 | def getLogRatio(): Double = { 12 | require(strategy.startsWith("c2flogratio")); 13 | strategy.substring(strategy.indexOf(":") + 1).toDouble; 14 | } 15 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/WordNetInterfacer.scala: -------------------------------------------------------------------------------- 1 | //package edu.berkeley.nlp.coref 2 | //import java.net.URL 3 | //import edu.mit.jwi.item.IIndexWord 4 | //import edu.mit.jwi.item.IWord 5 | //import edu.mit.jwi.item.IWordID 6 | //import edu.mit.jwi.Dictionary 7 | //import edu.mit.jwi.item.POS 8 | //import edu.mit.jwi.morph.WordnetStemmer 9 | //import edu.mit.jwi.item.ISynset 10 | //import scala.collection.JavaConverters._ 11 | //import edu.mit.jwi.item.Pointer 12 | //import 
scala.collection.mutable.ArrayBuffer 13 | //import scala.collection.mutable.HashSet 14 | //import edu.mit.jwi.RAMDictionary 15 | //import edu.mit.jwi.data.ILoadPolicy 16 | //import edu.berkeley.nlp.futile.util.Logger 17 | // 18 | //class WordNetInterfacer(path: String) { 19 | // val url = new URL("file", null, path); 20 | // 21 | //// val dict = new Dictionary(url); 22 | //// dict.open(); 23 | // val originalDict = new Dictionary(url); 24 | // originalDict.open(); 25 | // val dict = new RAMDictionary(originalDict, ILoadPolicy.IMMEDIATE_LOAD); 26 | // dict.open(); 27 | // 28 | // val wns = new WordnetStemmer(dict); 29 | // 30 | // def getLemmas(head: String): Set[String] = { 31 | // getNounStemSet(head); 32 | // } 33 | // 34 | // def getSynonyms(head: String): Set[String] = { 35 | // getNounStemSet(head).flatMap((headStem: String) => { 36 | // val wordSynset = getWordSynset(headStem); 37 | // if (wordSynset != null) wordSynset.getWords().asScala.map(_.getLemma()) else Set[String](); 38 | // }); 39 | // } 40 | // 41 | // def getHypernyms(head: String): Set[String] = { 42 | // val initialSynset = getNounStemSet(head).flatMap((headStem: String) => { 43 | // if (getWordSynset(headStem) != null) Set[ISynset](getWordSynset(headStem)) else Set[ISynset](); 44 | // }).toSet 45 | // getHypernyms(10, initialSynset).flatMap(_.getWords().asScala.map(_.getLemma())).toSet; 46 | // } 47 | // 48 | // def areSynonyms(firstHead: String, secondHead: String) = { 49 | // val stemsFirstHead = getNounStemSet(firstHead); 50 | // val stemsSecondHead = getNounStemSet(secondHead); 51 | // var isSynonym = false; 52 | // for (wordAStem <- stemsFirstHead) { 53 | // val wordASynset: ISynset = getWordSynset(wordAStem); 54 | // if (wordASynset != null) { 55 | // for (wordBStem <- stemsSecondHead) { 56 | // isSynonym = isSynonym || wordASynset.getWords().asScala.map(_.getLemma()).contains(wordBStem); 57 | // } 58 | // } 59 | // } 60 | // isSynonym 61 | // } 62 | // 63 | // def areHypernyms(head: String, possibleHypernym: String) = { 64 | // val stemsHead = getNounStemSet(head); 65 | // val stemsPossibleHypernym = getNounStemSet(possibleHypernym); 66 | // var isHypernym = false; 67 | // for (headStem <- stemsHead) { 68 | // val headSynset: ISynset = getWordSynset(headStem); 69 | // if (headSynset != null) { 70 | // // 10 levels in the tree should be enough for anybody... 
71 | // val hypernyms = getHypernyms(10, Set(headSynset)); 72 | // for(hypernym <- hypernyms){ 73 | // val hypernymWords = hypernym.getWords(); 74 | // for (i <- 0 until hypernymWords.size()) { 75 | // isHypernym = isHypernym || stemsPossibleHypernym.contains(hypernymWords.get(i).getLemma()); 76 | // } 77 | // } 78 | // } 79 | // } 80 | // isHypernym 81 | // } 82 | // 83 | // private def getHypernyms(numLevelsToRecurse: Int, synsets: Set[ISynset]): HashSet[ISynset] = { 84 | // var synsetsThisLevel = new HashSet[ISynset]() ++ synsets; 85 | // var synsetsNextLevel = new HashSet[ISynset](); 86 | // val allSynsets = new HashSet[ISynset](); 87 | // for (i <- 0 until numLevelsToRecurse) { 88 | // if (!synsetsThisLevel.isEmpty) { 89 | // for (synset <- synsetsThisLevel) { 90 | // synsetsNextLevel ++= synset.getRelatedSynsets(Pointer.HYPERNYM).asScala.map(dict.getSynset(_)); 91 | // } 92 | // // Don't visit nodes we've already been to 93 | // synsetsThisLevel = (synsetsNextLevel -- allSynsets); 94 | // allSynsets ++= synsetsNextLevel; 95 | // synsetsNextLevel = new HashSet[ISynset](); 96 | // } 97 | // } 98 | // allSynsets; 99 | // } 100 | // 101 | // private def getWordSynset(stemmedWord: String) = { 102 | // val idxWord: IIndexWord = dict.getIndexWord(stemmedWord, POS.NOUN); 103 | // if (idxWord != null) { 104 | // val wordID: IWordID = idxWord.getWordIDs().get(0); 105 | // val word: IWord = dict.getWord(wordID); 106 | // word.getSynset(); 107 | // } else { 108 | // null; 109 | // } 110 | // } 111 | // 112 | // private def getNounStemSet(head: String): Set[String] = { 113 | // require(head != null && !head.isEmpty()); 114 | // var toReturn = Set[String](); 115 | // try { 116 | // toReturn = wns.findStems(head, POS.NOUN).asScala.toSet; 117 | // } catch { 118 | // case e: IllegalArgumentException => Logger.logss("IllegalArgumentException on " + head); 119 | // case _ => Logger.logss("Badness"); System.exit(0); 120 | // } 121 | // toReturn; 122 | // } 123 | // 124 | //} 125 | // 126 | //object WordNetInterfacer { 127 | // 128 | // 129 | // 130 | // 131 | // def main(args: Array[String]) = { 132 | // val path = "/Users/gdurrett/Documents/Berkeley/Utils/WNdb-3.0/dict/"; 133 | // val url = new URL("file", null, path); 134 | // 135 | // val dict = new Dictionary(url); 136 | // dict.open(); 137 | // val idxWord: IIndexWord = dict.getIndexWord("dog", POS.NOUN); 138 | // val wordID: IWordID = idxWord.getWordIDs().get(0); 139 | // val word: IWord = dict.getWord(wordID); 140 | // println("Id = " + wordID); 141 | // println("Lemma = " + word.getLemma()); 142 | // println("Gloss = " + word.getSynset().getGloss()); 143 | // 144 | // val synset: ISynset = word.getSynset(); 145 | // // iterate over words associated with the synset 146 | // println("Synonyms"); 147 | // synset.getWords().asScala.foreach(word => println(word.getLemma())) 148 | // 149 | // val hypernyms = synset.getRelatedSynsets(Pointer.HYPERNYM); 150 | // println("Hypernyms"); 151 | // for(sid <- hypernyms.asScala){ 152 | // println(sid + ": " + dict.getSynset(sid).getWords().asScala.map(_.getLemma())); 153 | // } 154 | // 155 | // val wns = new WordnetStemmer(dict); 156 | // println(wns.findStems("dogs", POS.NOUN)); 157 | // println(wns.findStems("DOGS", POS.NOUN)); 158 | // println(wns.findStems("Presidents", POS.NOUN)); 159 | // 160 | // 161 | // println("==============="); 162 | // val wordNetInterfacer = new WordNetInterfacer(path); 163 | // println("Synonyms: dog cat? 
(should be false) " + wordNetInterfacer.areSynonyms("dog", "cat")); 164 | // println("Synonyms: dog domestic_dog? (should be true) " + wordNetInterfacer.areSynonyms("dog", "domestic_dog")); 165 | // 166 | // 167 | // println("Hypernyms: dog domestic_dog? (should be false) " + wordNetInterfacer.areHypernyms("dog", "domestic_dog")); 168 | // println("Hypernyms: dog canine? (should be true) " + wordNetInterfacer.areHypernyms("dog", "canine")); 169 | // println("Hypernyms: canine dog? (should be false) " + wordNetInterfacer.areHypernyms("canine", "dog")); 170 | // 171 | // 172 | // println("==============="); 173 | // println(wordNetInterfacer.getLemmas("dog")); 174 | // println(wordNetInterfacer.getSynonyms("dog")); 175 | // println(wordNetInterfacer.getSynonyms("cat")); 176 | // println(wordNetInterfacer.getHypernyms("cat")); 177 | // 178 | // } 179 | //} -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/bp/Domain.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.bp 2 | 3 | case class Domain[T](val entries: Array[T]) { 4 | def size = entries.size 5 | 6 | def indexOf(entry: T) = entries.indexOf(entry); 7 | 8 | def value(idx: Int): T = entries(idx); 9 | 10 | override def toString() = entries.foldLeft("")((str, entry) => str + entry + " ").dropRight(1); 11 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/bp/Node.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.bp 2 | import scala.collection.mutable.ArrayBuffer 3 | import edu.berkeley.nlp.futile.util.Logger 4 | import edu.berkeley.nlp.coref.GUtil 5 | 6 | class Node[T](val domain: Domain[T]) { 7 | var factors = new ArrayBuffer[Factor](); 8 | var receivedMessages: Array[Array[Double]] = null; 9 | var sentMessages: Array[Array[Double]] = null; 10 | var cachedBeliefsOrMarginals: Array[Double] = Array.fill(domain.size)(0.0); 11 | 12 | def registerFactor(factor: Factor) { 13 | factors += factor; 14 | } 15 | 16 | // TODO: Do I need this null thing? 17 | def initializeReceivedMessagesUniform() { 18 | if (receivedMessages == null) { 19 | receivedMessages = new Array[Array[Double]](factors.size); 20 | } else { 21 | for (i <- 0 until receivedMessages.size) { 22 | receivedMessages(i) = null; 23 | } 24 | } 25 | } 26 | 27 | // This is just here so we can let things be null...At some point, it was a problem because 28 | // the received messages remember which factors sent them, so clearing them for some reason 29 | // caused problems (maybe writing the value 1.0 was problematic when we weren't clearing the 30 | // received messages on the other end?). Can probably get rid of this somehow and just do the 31 | // obvious thing of initializing messages to 1.0. 
32 | def receivedMessageValue(i: Int, j: Int): Double = { 33 | if (receivedMessages(i) == null) { 34 | 1.0; 35 | } else { 36 | receivedMessages(i)(j); 37 | } 38 | } 39 | 40 | def receiveMessage(factor: Factor, message: Array[Double]) { 41 | require(receivedMessages != null); 42 | require(!GUtil.containsNaN(message)); 43 | val idx = factors.indexOf(factor); 44 | require(idx != -1 && idx < receivedMessages.size); 45 | if (message.toSeq.contains(0.0)) { 46 | Logger.logss("For domain: " + domain + ", bad received message: " + message.toSeq + " from " + factor.getClass()); 47 | Logger.logss("Previous message: " + receivedMessages(factors.indexOf(factor)).toSeq); 48 | require(false); 49 | } 50 | if (message.reduce(_ + _) == 0) { 51 | Logger.logss("For domain: " + domain + ", bad received message: " + message.toSeq + " from " + factor.getClass()); 52 | Logger.logss("Previous message: " + receivedMessages(factors.indexOf(factor)).toSeq); 53 | require(false); 54 | } 55 | require(message.size == domain.size); 56 | receivedMessages(factors.indexOf(factor)) = message; 57 | } 58 | 59 | def sendMessages() { 60 | // sendMessagesUseRealSpace(); 61 | sendMessagesUseLogSpace(); 62 | } 63 | 64 | def sendMessagesUseRealSpace() { 65 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 66 | cachedBeliefsOrMarginals(i) = 1.0; 67 | } 68 | require(receivedMessages.size == factors.size); 69 | for (i <- 0 until receivedMessages.size) { 70 | var j = 0; 71 | while (j < cachedBeliefsOrMarginals.size) { 72 | cachedBeliefsOrMarginals(j) *= receivedMessageValue(i, j); 73 | j += 1; 74 | } 75 | } 76 | // Normalize beliefs 77 | val normalizedNonzero = GUtil.normalizeiSoft(cachedBeliefsOrMarginals); 78 | if (!normalizedNonzero) { 79 | Logger.logss("For domain: " + domain + ", received messages:" + receivedMessages.foldLeft("")((currStr, msg) => currStr + "\n" + msg.toSeq.toString)) 80 | require(false); 81 | } 82 | if (sentMessages == null) { 83 | sentMessages = new Array[Array[Double]](factors.size); 84 | } 85 | for (i <- 0 until factors.length) { 86 | sentMessages(i) = new Array[Double](domain.size); 87 | var j = 0; 88 | while (j < domain.size) { 89 | val rmVal = receivedMessageValue(i, j); 90 | if (rmVal == 0) { 91 | sentMessages(i)(j) = 0; 92 | } else { 93 | sentMessages(i)(j) = cachedBeliefsOrMarginals(j)/rmVal; 94 | } 95 | j += 1; 96 | } 97 | factors(i).receiveMessage(this, sentMessages(i)); 98 | } 99 | } 100 | 101 | def sendMessagesUseLogSpace() { 102 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 103 | cachedBeliefsOrMarginals(i) = 0.0; 104 | } 105 | require(receivedMessages.size == factors.size); 106 | for (i <- 0 until receivedMessages.size) { 107 | var j = 0; 108 | while (j < cachedBeliefsOrMarginals.size) { 109 | cachedBeliefsOrMarginals(j) += Math.log(receivedMessageValue(i, j)); 110 | j += 1; 111 | } 112 | } 113 | GUtil.logNormalizei(cachedBeliefsOrMarginals); 114 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 115 | cachedBeliefsOrMarginals(i) = Math.exp(cachedBeliefsOrMarginals(i)); 116 | } 117 | if (sentMessages == null) { 118 | sentMessages = new Array[Array[Double]](factors.size); 119 | } 120 | for (i <- 0 until factors.length) { 121 | sentMessages(i) = new Array[Double](domain.size); 122 | var j = 0; 123 | while (j < domain.size) { 124 | val rmVal = receivedMessageValue(i, j); 125 | if (rmVal == 0) { 126 | sentMessages(i)(j) = 0; 127 | } else { 128 | sentMessages(i)(j) = cachedBeliefsOrMarginals(j)/rmVal; 129 | } 130 | j += 1; 131 | } 132 | factors(i).receiveMessage(this, sentMessages(i)); 133 | 
} 134 | } 135 | 136 | def getMarginals(): Array[Double] = { 137 | getMarginalsUseLogSpace(); 138 | } 139 | 140 | def getMarginalsUseLogSpace(): Array[Double] = { 141 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 142 | cachedBeliefsOrMarginals(i) = 0.0; 143 | } 144 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 145 | for (j <- 0 until receivedMessages.size) { 146 | cachedBeliefsOrMarginals(i) += Math.log(receivedMessageValue(j, i)); 147 | } 148 | } 149 | GUtil.logNormalizei(cachedBeliefsOrMarginals); 150 | for (i <- 0 until cachedBeliefsOrMarginals.size) { 151 | cachedBeliefsOrMarginals(i) = Math.exp(cachedBeliefsOrMarginals(i)); 152 | } 153 | cachedBeliefsOrMarginals 154 | } 155 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/lang/ArabicTreebankLanguagePack.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.lang; 2 | 3 | import edu.berkeley.nlp.futile.treebank.AbstractTreebankLanguagePack; 4 | 5 | 6 | public class ArabicTreebankLanguagePack extends AbstractTreebankLanguagePack { 7 | private static final String[] collinsPunctTags = {"PUNC"}; 8 | 9 | private static final String[] pennPunctTags = {"PUNC"}; 10 | 11 | private static final String[] pennPunctWords = {".","\"",",","-LRB-","-RRB-","-",":","/","?","_","*","%","!",">","-PLUS-","...",";","..","&","=","ر","'","\\","`","......"}; 12 | 13 | private static final String[] pennSFPunctTags = {"PUNC"}; 14 | 15 | private static final String[] pennSFPunctWords = {".", "!", "?"}; 16 | 17 | /** 18 | * The first 3 are used by the Penn Treebank; # is used by the 19 | * BLLIP corpus, and ^ and ~ are used by Klein's lexparser. 20 | * Chris deleted '_' for Arabic as it appears in tags (NO_FUNC). 21 | * June 2006: CDM tested _ again with true (new) Treebank tags to see if it 22 | * was useful for densening up the tag space, but the results were negative. 23 | * Roger added + for Arabic but Chris deleted it again, since unless you've 24 | * recoded determiners, it screws up DET+NOUN, etc. (That is, it would only be useful if 25 | * you always wanted to cut at the first '+', but in practice that is not viable, certainly 26 | * not with the IBM ATB processing either.) 27 | */ 28 | private static final char[] annotationIntroducingChars = {'-', '=', '|', '#', '^', '~'}; 29 | 30 | /** 31 | * This is valid for "BobChrisTreeNormalizer" conventions only. 32 | * wsg: "ROOT" should always be the first value. See {@link #startSymbol} in 33 | * the parent class. 34 | */ 35 | private static final String[] pennStartSymbols = {"ROOT"}; 36 | 37 | 38 | /** 39 | * Returns a String array of punctuation tags for this treebank/language. 40 | * 41 | * @return The punctuation tags 42 | */ 43 | @Override 44 | public String[] punctuationTags() { 45 | return pennPunctTags; 46 | } 47 | 48 | 49 | /** 50 | * Returns a String array of punctuation words for this treebank/language. 51 | * 52 | * @return The punctuation words 53 | */ 54 | @Override 55 | public String[] punctuationWords() { 56 | return pennPunctWords; 57 | } 58 | 59 | 60 | /** 61 | * Returns a String array of sentence final punctuation tags for this 62 | * treebank/language. 
63 | * 64 | * @return The sentence final punctuation tags 65 | */ 66 | @Override 67 | public String[] sentenceFinalPunctuationTags() { 68 | return pennSFPunctTags; 69 | } 70 | 71 | /** 72 | * Returns a String array of sentence final punctuation words for this 73 | * treebank/language. 74 | * 75 | * @return The sentence final punctuation tags 76 | */ 77 | public String[] sentenceFinalPunctuationWords() { 78 | return pennSFPunctWords; 79 | } 80 | 81 | /** 82 | * Returns a String array of treebank start symbols. 83 | * 84 | * @return The start symbols 85 | */ 86 | @Override 87 | public String[] startSymbols() { 88 | return pennStartSymbols; 89 | } 90 | 91 | /** 92 | * Returns the extension of treebank files for this treebank. 93 | * This is "tree". 94 | */ 95 | public String treebankFileExtension() { 96 | return "tree"; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/lang/CorefLanguagePack.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.lang 2 | 3 | trait CorefLanguagePack { 4 | def getMentionConstituentTypes: Seq[String]; 5 | def getPronominalTags: Seq[String]; 6 | def getProperTags: Seq[String]; 7 | } 8 | 9 | class EnglishCorefLanguagePack extends CorefLanguagePack { 10 | def getMentionConstituentTypes: Seq[String] = Seq("NP"); 11 | def getPronominalTags: Seq[String] = Seq("PRP", "PRP$"); 12 | def getProperTags: Seq[String] = Seq("NNP"); 13 | } 14 | 15 | class ChineseCorefLanguagePack extends CorefLanguagePack { 16 | def getMentionConstituentTypes: Seq[String] = Seq("NP"); 17 | def getPronominalTags: Seq[String] = Seq("PN"); 18 | def getProperTags: Seq[String] = Seq("NR"); 19 | } 20 | 21 | class ArabicCorefLanguagePack extends CorefLanguagePack { 22 | def getMentionConstituentTypes: Seq[String] = Seq("NP"); 23 | def getPronominalTags: Seq[String] = Seq("PRP", "PRP$"); 24 | def getProperTags: Seq[String] = Seq("NNP"); 25 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/lang/Language.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.lang; 2 | 3 | 4 | public enum Language { 5 | ENGLISH, ARABIC, CHINESE; 6 | } 7 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/preprocess/NerDriver.java: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.preprocess; 2 | 3 | import edu.berkeley.nlp.futile.util.Logger; 4 | import edu.berkeley.nlp.futile.fig.basic.Option; 5 | import edu.berkeley.nlp.futile.fig.exec.Execution; 6 | 7 | 8 | public class NerDriver implements Runnable { 9 | @Option(gloss = "") 10 | public static Mode mode = Mode.TRAIN; 11 | 12 | @Option(gloss = "Path to read/write the model") 13 | public static String modelPath = ""; 14 | 15 | // TRAINING_OPTIONS 16 | @Option(gloss = "Path to CoNLL training set") 17 | public static String trainPath = ""; 18 | @Option(gloss = "Training set size, -1 for all") 19 | public static int trainSize = -1; 20 | @Option(gloss = "Path to CoNLL test set") 21 | public static String testPath = ""; 22 | @Option(gloss = "Test set size, -1 for all") 23 | public static int testSize = -1; 24 | 25 | public static enum Mode { 26 | TRAIN, RUN; 27 | } 28 | 29 | public static void main(String[] args) { 
30 | NerDriver main = new NerDriver(); 31 | Execution.run(args, main); // add .class here if that class should receive command-line args 32 | } 33 | 34 | public void run() { 35 | Logger.setFig(); 36 | switch (mode) { 37 | case TRAIN: NerSystem.trainNerSystem(); 38 | break; 39 | case RUN: 40 | // Read trees 41 | // PennTreeReader.berkeleyParserBadTree 42 | // Extract words and POS 43 | 44 | // 45 | break; 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/preprocess/NerExample.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.preprocess 2 | import edu.berkeley.nlp.futile.fig.basic.Indexer 3 | import scala.collection.mutable.ArrayBuffer 4 | 5 | case class NerExample(val words: Seq[String], val poss: Seq[String], val goldLabels: Seq[String], val labelIndexer: Indexer[String]) { 6 | 7 | def featurize(featureIndexer: Indexer[String], addToIndexer: Boolean): Array[Array[Array[Int]]] = { 8 | Array.tabulate(words.size, labelIndexer.size)((tokIdx, labelIdx) => { 9 | require(labelIndexer.size > labelIdx); 10 | val labelName = labelIndexer.getObject(labelIdx); 11 | // Extract word and word shape features 12 | val wordAt = (i: Int) => if (tokIdx + i < 0) "<>" else if (tokIdx + i >= words.size) "<>" else words(tokIdx + i); 13 | val wordShapeAt = (i: Int) => NerExample.shapeFor(wordAt(i)); 14 | val wordClassAt = (i: Int) => NerExample.classFor(wordAt(i)); 15 | val posAt = (i: Int) => if (tokIdx + i < 0) "<>" else if (tokIdx + i >= words.size) "<>" else poss(tokIdx + i); 16 | 17 | val feats = new ArrayBuffer[Int](); 18 | val maybeAddFeat = (feat: String) => { 19 | val labeledFeat = labelName + ":" + feat; 20 | if (addToIndexer || featureIndexer.contains(labeledFeat)) feats += featureIndexer.getIndex(labeledFeat) 21 | } 22 | // Words 23 | maybeAddFeat("-1W=" + wordAt(-2)); 24 | maybeAddFeat("-1W=" + wordAt(-1)); 25 | maybeAddFeat("0W=" + wordAt(0)); 26 | maybeAddFeat("1W=" + wordAt(1)); 27 | maybeAddFeat("2W=" + wordAt(2)); 28 | // POS 29 | maybeAddFeat("-2P=" + posAt(-2)); 30 | maybeAddFeat("-1P=" + posAt(-1)); 31 | maybeAddFeat("0P=" + posAt(0)); 32 | maybeAddFeat("1P=" + posAt(1)); 33 | maybeAddFeat("2P=" + posAt(2)); 34 | // Shape 35 | maybeAddFeat("-2S=" + wordShapeAt(-2)); 36 | maybeAddFeat("-1S=" + wordShapeAt(-1)); 37 | maybeAddFeat("0S=" + wordShapeAt(0)); 38 | maybeAddFeat("1S=" + wordShapeAt(1)); 39 | maybeAddFeat("2S=" + wordShapeAt(2)); 40 | // Class 41 | maybeAddFeat("-2C=" + wordClassAt(-1)); 42 | maybeAddFeat("-1C=" + wordClassAt(-1)); 43 | maybeAddFeat("0C=" + wordClassAt(0)); 44 | maybeAddFeat("1C=" + wordClassAt(1)); 45 | maybeAddFeat("2C=" + wordClassAt(1)); 46 | // POS-POS conjunctions 47 | maybeAddFeat("-2-1P=" + posAt(-2) + "," + posAt(-1)); 48 | maybeAddFeat("-10P=" + posAt(-1) + "," + posAt(0)); 49 | maybeAddFeat("01P=" + posAt(0) + "," + posAt(1)); 50 | maybeAddFeat("12P=" + posAt(1) + "," + posAt(2)); 51 | // // Word-word conjunctions 52 | // maybeAddFeat("-2-1W=" + wordAt(-2) + "," + wordAt(-1)); 53 | // maybeAddFeat("-10W=" + wordAt(-1) + "," + wordAt(0)); 54 | // maybeAddFeat("01W=" + wordAt(0) + "," + wordAt(1)); 55 | // maybeAddFeat("12W=" + wordAt(1) + "," + wordAt(2)); 56 | // Word-POS conjunctions 57 | maybeAddFeat("-2-1PW=" + posAt(-2) + "," + wordAt(-1)); 58 | maybeAddFeat("-10PW=" + posAt(-1) + "," + wordAt(0)); 59 | maybeAddFeat("01PW=" + posAt(0) + "," + wordAt(1)); 60 | 
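// Illustrative values for the shape/class helpers used above (traced through shapeFor/classFor,
// defined below in the companion object):
//   shapeFor("Obama") == "Xxxe"      -- repeats of a character class beyond two are collapsed to 'e'
//   classFor("Obama") == "-INITC"    -- word-initial capital with exactly one uppercase letter
//   shapeFor("1990s") == "ddex";  classFor("1990s") == "-LC-NUM-s"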
maybeAddFeat("12PW=" + posAt(1) + "," + wordAt(2)); 61 | maybeAddFeat("-2-1WP=" + wordAt(-2) + "," + posAt(-1)); 62 | maybeAddFeat("-10WP=" + wordAt(-1) + "," + posAt(0)); 63 | maybeAddFeat("01WP=" + wordAt(0) + "," + posAt(1)); 64 | maybeAddFeat("12WP=" + wordAt(1) + "," + posAt(2)); 65 | // Word-class conjunctions 66 | maybeAddFeat("-2-1CW=" + wordClassAt(-2) + "," + wordAt(-1)); 67 | maybeAddFeat("-10CW=" + wordClassAt(-1) + "," + wordAt(0)); 68 | maybeAddFeat("01CW=" + wordClassAt(0) + "," + wordAt(1)); 69 | maybeAddFeat("12CW=" + wordClassAt(1) + "," + wordAt(2)); 70 | maybeAddFeat("-2-1WC=" + wordAt(-2) + "," + wordClassAt(-1)); 71 | maybeAddFeat("-10WC=" + wordAt(-1) + "," + wordClassAt(0)); 72 | maybeAddFeat("01WC=" + wordAt(0) + "," + wordClassAt(1)); 73 | maybeAddFeat("12WC=" + wordAt(1) + "," + wordClassAt(2)); 74 | feats.toArray; 75 | }); 76 | } 77 | } 78 | 79 | object NerExample { 80 | 81 | def shapeFor(word: String) = { 82 | val result = new StringBuilder(word.length); 83 | var i = 0; 84 | while (i < word.length) { 85 | val c = word(i); 86 | val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c; 87 | if (result.length > 1 && (result.last == x) && result(result.length - 2) == x) { 88 | result += 'e' 89 | } else if (result.length > 1 && result.last == 'e' && result(result.length - 2) == x) { 90 | () // nothing 91 | } else { 92 | result += x; 93 | } 94 | i += 1; 95 | } 96 | result.toString 97 | } 98 | 99 | def classFor(word: String) = { 100 | val sb = new StringBuilder; 101 | val wlen = word.length(); 102 | val numCaps = (word: Seq[Char]).count(_.isUpper); 103 | val hasDigit = word.exists(_.isDigit); 104 | val hasDash = word.contains('-'); 105 | val hasLower = numCaps < wlen; 106 | val ch0 = word.charAt(0); 107 | val lowered = word.toLowerCase(); 108 | if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) { 109 | if (numCaps == 1) { 110 | sb.append("-INITC"); 111 | } else { 112 | sb.append("-CAPS"); 113 | } 114 | } else if (!Character.isLetter(ch0) && numCaps > 0) { 115 | sb.append("-CAPS"); 116 | } else if (hasLower) { 117 | sb.append("-LC"); 118 | } 119 | 120 | if (hasDigit) { 121 | sb.append("-NUM"); 122 | } 123 | if (hasDash) { 124 | sb.append("-DASH"); 125 | } 126 | if (lowered.endsWith("s") && wlen >= 3) { 127 | // here length 3, so you don't miss out on ones like 80s 128 | val ch2 = lowered.charAt(wlen - 2); 129 | // not -ess suffixes or greek/latin -us, -is 130 | if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') { 131 | sb.append("-s"); 132 | } 133 | } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) { 134 | if (lowered.endsWith("ed")) { 135 | sb.append("-ed"); 136 | } else if (lowered.endsWith("ing")) { 137 | sb.append("-ing"); 138 | } else if (lowered.endsWith("ion")) { 139 | sb.append("-ion"); 140 | } else if (lowered.endsWith("er")) { 141 | sb.append("-er"); 142 | } else if (lowered.endsWith("est")) { 143 | sb.append("-est"); 144 | } else if (lowered.endsWith("ly")) { 145 | sb.append("-ly"); 146 | } else if (lowered.endsWith("ity")) { 147 | sb.append("-ity"); 148 | } else if (lowered.endsWith("y")) { 149 | sb.append("-y"); 150 | } else if (lowered.endsWith("al")) { 151 | sb.append("-al"); 152 | } 153 | } 154 | sb.toString; 155 | } 156 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/preprocess/Reprocessor.scala: -------------------------------------------------------------------------------- 1 | package 
edu.berkeley.nlp.coref.preprocess 2 | 3 | import edu.berkeley.nlp.PCFGLA.CoarseToFineMaxRuleParser 4 | import edu.berkeley.nlp.coref.ConllDoc 5 | import scala.collection.JavaConverters._ 6 | import scala.collection.mutable.ArrayBuffer 7 | import java.io.PrintWriter 8 | import edu.berkeley.nlp.coref.ConllDocReader 9 | import edu.berkeley.nlp.syntax.Tree 10 | import edu.berkeley.nlp.futile.util.Logger 11 | import java.util.Arrays 12 | import edu.berkeley.nlp.futile.fig.basic.IOUtils 13 | import edu.berkeley.nlp.coref.Chunk 14 | import edu.berkeley.nlp.coref.ConllDocWriter 15 | 16 | object Reprocessor { 17 | 18 | def redoConllDocument(parser: CoarseToFineMaxRuleParser, backoffParser: CoarseToFineMaxRuleParser, nerSystem: NerSystem, docReader: ConllDocReader, inputPath: String, outputPath: String) { 19 | val writer = IOUtils.openOutHard(outputPath); 20 | val docs = docReader.readConllDocs(inputPath); 21 | for (doc <- docs) { 22 | Logger.logss("Reprocessing: " + doc.docID + " part " + doc.docPartNo); 23 | val newPos = new ArrayBuffer[Seq[String]](); 24 | val newParses = new ArrayBuffer[edu.berkeley.nlp.futile.syntax.Tree[String]](); 25 | val newNerChunks = new ArrayBuffer[Seq[Chunk[String]]](); 26 | for (sentIdx <- 0 until doc.words.size) { 27 | if (sentIdx % 10 == 0) { 28 | Logger.logss("Sentence " + sentIdx); 29 | } 30 | val sent = doc.words(sentIdx); 31 | var parse = PreprocessingDriver.parse(parser, backoffParser, sent.asJava); 32 | parse = if (parse.getYield().size() != sent.length) { 33 | Logger.logss("Couldn't parse sentence: " + sent.toSeq); 34 | Logger.logss("Using default parse"); 35 | convertFromFutileTree(doc.trees(sentIdx).constTree); 36 | } else { 37 | parse; 38 | } 39 | val posTags = parse.getPreTerminalYield().asScala.toArray; 40 | newPos += posTags; 41 | newParses += convertToFutileTree(parse); 42 | val nerBioLabels = nerSystem.runNerSystem(sent.toArray, posTags); 43 | newNerChunks += convertBioToChunks(nerBioLabels); 44 | } 45 | ConllDocWriter.writeIncompleteConllDoc(writer, doc.docID, doc.docPartNo, doc.words, newPos, newParses, doc.speakers, newNerChunks, doc.corefChunks); 46 | } 47 | writer.close(); 48 | } 49 | 50 | def convertBioToChunks(nerBioLabels: Seq[String]): Seq[Chunk[String]] = { 51 | var lastNerStart = -1; 52 | val chunks = new ArrayBuffer[Chunk[String]](); 53 | for (i <- 0 until nerBioLabels.size) { 54 | if (nerBioLabels(i).startsWith("B")) { 55 | if (lastNerStart != -1) { 56 | chunks += new Chunk[String](lastNerStart, i, "MISC"); 57 | } 58 | lastNerStart = i; 59 | } else if (nerBioLabels(i).startsWith("O")) { 60 | if (lastNerStart != -1) { 61 | chunks += new Chunk[String](lastNerStart, i, "MISC"); 62 | lastNerStart = -1; 63 | } 64 | } 65 | } 66 | chunks; 67 | } 68 | 69 | def convertToFutileTree(slavTree: edu.berkeley.nlp.syntax.Tree[String]): edu.berkeley.nlp.futile.syntax.Tree[String] = { 70 | new edu.berkeley.nlp.futile.syntax.Tree[String](slavTree.getLabel(), slavTree.getChildren().asScala.map(convertToFutileTree(_)).asJava); 71 | } 72 | 73 | def convertFromFutileTree(myTree: edu.berkeley.nlp.futile.syntax.Tree[String]): edu.berkeley.nlp.syntax.Tree[String] = { 74 | new edu.berkeley.nlp.syntax.Tree[String](myTree.getLabel(), myTree.getChildren().asScala.map(convertFromFutileTree(_)).asJava); 75 | } 76 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/preprocess/SentenceSplitterTokenizerDriver.java: 
-------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.preprocess; 2 | 3 | import java.io.PrintWriter; 4 | import java.util.List; 5 | 6 | import edu.berkeley.nlp.futile.tokenizer.PTBLineLexer; 7 | import edu.berkeley.nlp.futile.util.Logger; 8 | import edu.berkeley.nlp.futile.fig.basic.IOUtils; 9 | import edu.berkeley.nlp.futile.fig.basic.Option; 10 | import edu.berkeley.nlp.futile.fig.exec.Execution; 11 | 12 | 13 | public class SentenceSplitterTokenizerDriver implements Runnable { 14 | @Option(gloss = "") 15 | public static Mode mode = Mode.TRAIN; 16 | 17 | @Option(gloss = "Raw text input") 18 | public static String inputPath = ""; 19 | @Option(gloss = "") 20 | public static String outputPath = ""; 21 | @Option(gloss = "") 22 | public static boolean respectInputLineBreaks = false; 23 | @Option(gloss = "") 24 | public static boolean respectInputTwoLineBreaks = true; 25 | 26 | @Option(gloss = "Path to read/write the model") 27 | public static String modelPath = ""; 28 | 29 | // TRAINING OPTIONS 30 | @Option(gloss = "Train the sentence splitter from the CoNLL data. If false, you " + 31 | "must provide your own data in the format\n" + 32 | ". <0 or 1>\n" + 33 | "where 0 indicates not a boundary and 1 indicates a boundary.") 34 | public static boolean trainFromConll = true; 35 | 36 | @Option(gloss = "Path to training set") 37 | public static String trainPath = ""; 38 | @Option(gloss = "Path to test set") 39 | public static String testPath = ""; 40 | @Option(gloss = "Path to CoNLL training set") 41 | public static String conllTrainPath = ""; 42 | @Option(gloss = "Training set size, -1 for all") 43 | public static int conllTrainSize = -1; 44 | @Option(gloss = "Path to CoNLL test set") 45 | public static String conllTestPath = ""; 46 | @Option(gloss = "Test set size, -1 for all") 47 | public static int conllTestSize = -1; 48 | 49 | public static enum Mode { 50 | TRAIN, RUN; 51 | } 52 | 53 | public static void main(String[] args) { 54 | SentenceSplitterTokenizerDriver main = new SentenceSplitterTokenizerDriver(); 55 | Execution.run(args, main); // add .class here if that class should receive command-line args 56 | } 57 | 58 | public void run() { 59 | Logger.setFig(); 60 | switch (mode) { 61 | case TRAIN: SentenceSplitter.trainSentenceSplitter(); 62 | break; 63 | case RUN: 64 | SentenceSplitter splitter = SentenceSplitter.loadSentenceSplitter(modelPath); 65 | String[] lines = IOUtils.readLinesHard(inputPath).toArray(new String[0]); 66 | String[] canonicalizedParagraphs = splitter.formCanonicalizedParagraphs(lines, respectInputLineBreaks, respectInputTwoLineBreaks); 67 | String[] sentences = splitter.splitSentences(canonicalizedParagraphs); 68 | String[][] tokenizedSentences = splitter.tokenize(sentences); 69 | PrintWriter writer = IOUtils.openOutHard(outputPath); 70 | for (String[] sentence : tokenizedSentences) { 71 | for (int i = 0; i < sentence.length; i++) { 72 | writer.print(sentence[i]); 73 | if (i < sentence.length - 1) { 74 | writer.print(" "); 75 | } 76 | } 77 | writer.println(); 78 | } 79 | writer.close(); 80 | break; 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/sem/QueryCountAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.sem 2 | 3 | import edu.berkeley.nlp.coref.DocumentGraph 4 | import edu.berkeley.nlp.coref.MentionType 5 | import 
edu.berkeley.nlp.util.Counter 6 | import edu.berkeley.nlp.coref.PronounDictionary 7 | import edu.berkeley.nlp.futile.util.Logger 8 | import edu.berkeley.nlp.coref.Mention 9 | 10 | object QueryCountAnalyzer { 11 | 12 | def renderSomeQueries(docGraph: DocumentGraph, predBackptrs: Seq[Int], queryCounts: QueryCountsBundle) = { 13 | var rendered = ""; 14 | for (mentIdx <- 0 until docGraph.size) { 15 | if (isReferring(docGraph, mentIdx) && 16 | isIncorrect(docGraph, mentIdx, predBackptrs(mentIdx)) && 17 | hasReferringAntecedents(docGraph, mentIdx) && 18 | !hasHeadMatchWithAntecedent(docGraph, mentIdx)) { 19 | val myHeadTc = docGraph.getMention(mentIdx).headString; 20 | val antIndicesCounts = (0 until mentIdx).filter(idx => isReferring(docGraph, idx)).map(idx => (idx, queryCounts.pairCounts.getCount(myHeadTc, docGraph.getMention(idx).headString))) 21 | // Top five scores and whether they're in 22 | val topAntIndicesCounts = antIndicesCounts.sortBy(_._2).reverse.slice(0, Math.min(5, antIndicesCounts.size)); 23 | val goldRefAntecedents = docGraph.getGoldAntecedentsUnderCurrentPruning(mentIdx).filter(docGraph.getMention(_).mentionType != MentionType.PRONOMINAL); 24 | def renderMentionAndCount = (idx: Int, count: Double) => "[" + idx + ": " + renderMentionWithHead(docGraph.getMention(idx)) + ", " + count + ", " + 25 | (if (docGraph.getGoldAntecedentsUnderCurrentPruning(mentIdx).contains(idx)) "corr" else "wrong") + "] "; 26 | rendered += docGraph.corefDoc.rawDoc.docID + " " + mentIdx + ": " + renderMentionWithHead(docGraph.getMention(mentIdx)) + "\n correct = "; 27 | for (goldRefAntecedent <- goldRefAntecedents) { 28 | val antHeadTc = docGraph.getMention(goldRefAntecedent).headString; 29 | rendered += renderMentionAndCount(goldRefAntecedent, queryCounts.pairCounts.getCount(myHeadTc, antHeadTc)); 30 | } 31 | rendered += "\n top five = "; 32 | for (i <- 0 until topAntIndicesCounts.size) { 33 | rendered += renderMentionAndCount(topAntIndicesCounts(i)._1, topAntIndicesCounts(i)._2); 34 | } 35 | rendered += "\n"; 36 | } 37 | } 38 | rendered; 39 | } 40 | 41 | private def renderMentionWithHead(mention: Mention) = { 42 | val startIdx = Math.max(mention.headIdx - mention.startIdx - 2, 0); 43 | val endIdx = Math.min(mention.headIdx - mention.startIdx + 3, mention.words.size); 44 | val str = mention.words.slice(startIdx, mention.headIdx - mention.startIdx).foldLeft("")(_ + " " + _) + " _" + mention.words(mention.headIdx - mention.startIdx) + 45 | "_" + mention.words.slice(mention.headIdx - mention.startIdx + 1, endIdx).foldLeft("")(_ + " " + _) 46 | str.trim; 47 | } 48 | 49 | def renderQueryCountStats(docGraphs: Seq[DocumentGraph], allPredBackptrs: Seq[Seq[Int]], queryCounts: QueryCountsBundle) = { 50 | // var numTop = 0.0; 51 | // var numUnseen = 0.0; 52 | // for (i <- 0 until docGraphs.size) { 53 | // val docGraph = docGraphs(i); 54 | // for (j <- 0 until docGraph.size) { 55 | // if (isReferring(docGraph, j) && 56 | // isIncorrect(docGraph, j, allPredBackptrs(i)(j)) && 57 | // hasReferringAntecedents(docGraph, j) && 58 | // !hasHeadMatchWithAntecedent(docGraph, j)) { 59 | // val myHeadTc = docGraph.getMention(j).headString; 60 | // val antHeads = (0 until j).filter(isReferring(docGraph, _)).map(docGraph.getMention(_).headString); 61 | // val topCountScore = queryCounts.pairCounts.getCount(myHeadTc, ) 62 | // } 63 | // } 64 | // } 65 | "" 66 | } 67 | 68 | def renderTopFailedRecallHeadPairs(docGraphs: Seq[DocumentGraph], allPredBackptrs: Array[Array[Int]]) = { 69 | val headCounter = new Counter[String](); 70 | 
val headCounterMislead = new Counter[String](); 71 | for (i <- 0 until docGraphs.size) { 72 | val docGraph = docGraphs(i); 73 | for (j <- 0 until docGraph.size) { 74 | if (isReferring(docGraph, j) && 75 | isIncorrect(docGraph, j, allPredBackptrs(i)(j)) && 76 | hasReferringAntecedents(docGraph, j) && 77 | hasHeadMatchWithAntecedent(docGraph, j)) { 78 | if (hasHeadMatchWithPrediction(docGraph, j, allPredBackptrs(i)(j))) { 79 | headCounterMislead.incrementCount(docGraph.getMention(j).headStringLc, 1.0); 80 | } 81 | headCounter.incrementCount(docGraph.getMention(j).headStringLc, 1.0); 82 | } 83 | } 84 | } 85 | var rendered = headCounter.size + " heads missed, " + headCounterMislead.size + " heads mislead\n"; 86 | headCounter.keepTopNKeys(100); 87 | rendered += headCounter.toString + "\n"; 88 | headCounterMislead.keepTopNKeys(100); 89 | rendered += headCounterMislead.toString + "\n"; 90 | rendered; 91 | } 92 | 93 | // N.B. Referring here means nominal or proper, not coreferent 94 | def isReferring(docGraph: DocumentGraph, idx: Int) = { 95 | docGraph.getMention(idx).mentionType != MentionType.PRONOMINAL; 96 | } 97 | 98 | def isIncorrect(docGraph: DocumentGraph, idx: Int, backptr: Int) = { 99 | !docGraph.getGoldAntecedentsUnderCurrentPruning(idx).contains(backptr); 100 | } 101 | 102 | def isPredictedNewCluster(docGraph: DocumentGraph, idx: Int, backptr: Int) = { 103 | backptr == idx; 104 | } 105 | 106 | def hasReferringAntecedents(docGraph: DocumentGraph, idx: Int) = { 107 | val goldAntecedents = docGraph.getGoldAntecedentsUnderCurrentPruning(idx); 108 | goldAntecedents.filter(i => docGraph.getMention(i).mentionType != MentionType.PRONOMINAL).size > 0; 109 | } 110 | 111 | def hasHeadMatchWithAntecedent(docGraph: DocumentGraph, idx: Int) = { 112 | val goldAntecedents = docGraph.getGoldAntecedentsUnderCurrentPruning(idx); 113 | goldAntecedents.filter(i => docGraph.getMention(i).headStringLc == docGraph.getMention(idx).headStringLc).size > 0; 114 | } 115 | 116 | def hasHeadMatchWithPrediction(docGraph: DocumentGraph, idx: Int, backptr: Int) = { 117 | backptr != idx && docGraph.getMention(idx).headStringLc == docGraph.getMention(backptr).headStringLc; 118 | } 119 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/berkeley/nlp/coref/sem/QueryCountsBundle.scala: -------------------------------------------------------------------------------- 1 | package edu.berkeley.nlp.coref.sem 2 | import edu.berkeley.nlp.futile.fig.basic.IOUtils 3 | import edu.berkeley.nlp.futile.util.Counter 4 | import edu.berkeley.nlp.futile.util.Logger 5 | import java.io.File 6 | 7 | @SerialVersionUID(1L) 8 | class QueryCountsBundle(val wordCounts: Counter[String], 9 | val pairCounts: Counter[(String,String)]) extends Serializable { 10 | } 11 | 12 | object QueryCountsBundle { 13 | 14 | def createFromFile(path: String) = { 15 | val wordCounts = new Counter[String]; 16 | val pairCounts = new Counter[(String,String)]; 17 | val cleanedPath = if (path != path.trim) { 18 | Logger.logss("WARNING: queryCountsFile has spurious spaces for some inexplicable reason; trimming"); 19 | path.trim; 20 | } else { 21 | path; 22 | } 23 | val lineItr = IOUtils.lineIterator(cleanedPath); 24 | while (lineItr.hasNext) { 25 | val line = lineItr.next; 26 | val fields = line.split("\\s+"); 27 | if (fields.size == 2) { 28 | wordCounts.incrementCount(fields(0), fields(1).toDouble); 29 | } else if (fields.size == 3) { 30 | pairCounts.incrementCount(fields(0) -> fields(1), fields(2).toDouble); 31 
| } 32 | } 33 | Logger.logss("Loaded " + pairCounts.size + " query counts from " + path); 34 | new QueryCountsBundle(wordCounts, pairCounts); 35 | } 36 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/harvard/nlp/moarcoref/AnimacyHelper.java: -------------------------------------------------------------------------------- 1 | package edu.harvard.nlp.moarcoref; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileInputStream; 5 | import java.io.IOException; 6 | import java.io.InputStreamReader; 7 | import java.util.Arrays; 8 | import java.util.HashSet; 9 | import java.util.Set; 10 | 11 | import edu.berkeley.nlp.coref.Mention; 12 | import edu.berkeley.nlp.coref.MentionType; 13 | 14 | public class AnimacyHelper { 15 | 16 | public static Set animates; 17 | public static Set inanimates; 18 | 19 | static { 20 | try { 21 | animates = getWordsFromFile(MiniDriver.animacyPath, false); 22 | inanimates = getWordsFromFile(MiniDriver.inanimacyPath, false); 23 | } catch (IOException e) { 24 | e.printStackTrace(); 25 | System.exit(1); 26 | } 27 | 28 | } 29 | 30 | 31 | ////////////////////////////////////////////////////// 32 | // implementation of some recasens features 33 | ///////////////////////////////////////////////////// 34 | 35 | public static String getAnimacy(Mention ment) { 36 | String animacy = "UNKNOWN"; 37 | String headString = ment.headString(); 38 | String nerString = ment.nerString(); 39 | Set inanimateNers = new HashSet(Arrays.asList( 40 | "LOCATION", "MONEY", "NUMBER", "PERCENT", "DATE", "TIME", 41 | "FAC", "GPE", "WEA", "ORG")); 42 | if (ment.mentionType() == MentionType.PRONOMINAL) { 43 | if (animates.contains(headString)) { 44 | animacy = "ANIMATE"; 45 | } else if (inanimates.contains(headString)) { 46 | animacy = "INANIMATE"; 47 | } 48 | } else if (nerString.equals("PERSON") || nerString.startsWith("PER")) { 49 | animacy = "ANIMATE"; 50 | } else if (inanimateNers.contains(nerString) 51 | || nerString.startsWith("LOC")) { 52 | animacy = "INANIMATE"; 53 | } 54 | // if still unknown, use list 55 | if (ment.mentionType() != MentionType.PRONOMINAL 56 | && animacy.equals("UNKNOWN")) { 57 | if (animates.contains(headString)) { 58 | animacy = "ANIMATE"; 59 | } else if (inanimates.contains(headString)) { 60 | animacy = "INANIMATE"; 61 | } 62 | } 63 | return animacy; 64 | } 65 | 66 | // mostly stolen from Dictionaries.java in stanfordcorenlp.dcoref 67 | public static Set getWordsFromFile(String filename, boolean lowercase) throws IOException{ 68 | BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename))); 69 | Set words = new HashSet(); 70 | while (reader.ready()){ 71 | if (lowercase){ 72 | words.add(reader.readLine().toLowerCase()); // readLine strips the trailing '\n' etc 73 | } else { 74 | words.add(reader.readLine()); 75 | } 76 | } 77 | reader.close(); 78 | return words; 79 | } 80 | } -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/harvard/nlp/moarcoref/FeatureExtractor.scala: -------------------------------------------------------------------------------- 1 | package edu.harvard.nlp.moarcoref; 2 | 3 | import java.io.PrintWriter 4 | 5 | import scala.collection.JavaConverters.mapAsScalaMapConverter 6 | import scala.collection.immutable.TreeMap 7 | 8 | import edu.berkeley.nlp.coref.CorefFeaturizerTrainer 9 | import edu.berkeley.nlp.coref.CorefSystem 10 | import edu.berkeley.nlp.coref.DocumentGraph 11 | import 
edu.berkeley.nlp.coref.NumberGenderComputer 12 | import edu.berkeley.nlp.coref.PairwiseIndexingFeaturizerJoint 13 | import edu.berkeley.nlp.coref.PairwiseIndexingFeaturizer 14 | import edu.berkeley.nlp.coref.sem.QueryCountsBundle 15 | import edu.berkeley.nlp.futile.fig.basic.Indexer 16 | import edu.berkeley.nlp.futile.util.Logger 17 | 18 | object FeatureExtractor { 19 | 20 | def writeSeparatedFeatsAndOraclePredClustering(smaller:Boolean) { 21 | var pfx = (if (smaller) "SMALL" else "BIG"); 22 | Logger.logss("Using conjType = " + MiniDriver.conjType); 23 | val numberGenderComputer = NumberGenderComputer.readBergsmaLinData(MiniDriver.numberGenderDataPath); 24 | // require(!MiniDriver.trainOnGold); 25 | 26 | var trainDocs = CorefSystem.loadCorefDocs(MiniDriver.trainPath, MiniDriver.trainSize, numberGenderComputer, MiniDriver.useGoldMentions); 27 | var trainDocGraphsOrigOrder = trainDocs.map(new DocumentGraph(_, true)); 28 | var trainDocGraphs = if (MiniDriver.randomizeTrain) new scala.util.Random(0).shuffle(trainDocGraphsOrigOrder.sortBy(_.corefDoc.rawDoc.printableDocName)) else trainDocGraphsOrigOrder; 29 | 30 | Logger.logss(trainDocGraphs.size + " many train docs"); 31 | val totalMents = trainDocGraphs.foldLeft(0)((total, curr) => total + curr.size); 32 | val lexicalCounts = MoarLexicalCountsBundle.countLexicalItems(trainDocs, MiniDriver.lexicalFeatCutoff, MiniDriver.bilexicalFeatCutoff); 33 | val queryCounts: QueryCountsBundle = null; 34 | val featurizerTrainer = new CorefFeaturizerTrainer(); 35 | 36 | // extract anaphoricity features 37 | var anaphFeatureIndexer = new Indexer[String](); 38 | anaphFeatureIndexer.getIndex(SeparatingFeaturizer.UnkFeatName); 39 | // last true parameter to function below means it's in anaphoricity mode 40 | var anaphFeaturizer = new SmallerSeparatingFeaturizer(anaphFeatureIndexer, MiniDriver.pairwiseFeats, MiniDriver.conjType, lexicalCounts, queryCounts, true); //anaphoricityMode=true 41 | featurizerTrainer.featurizeBasic(trainDocGraphs, anaphFeaturizer); 42 | anaphFeaturizer.printFeatureTemplateCounts(); 43 | // write our features to a file 44 | TextPickler.writeAnaphFeats(trainDocGraphs, pfx + "-" + MiniDriver.pairwiseFeats + "-" + "anaphTrainFeats.txt"); 45 | 46 | // write anaph feature mapping 47 | val printerAnaph = new PrintWriter(pfx+"-"+MiniDriver.pairwiseFeats + "-" + "anaphMapping.txt"); 48 | var invMap = anaphFeatureIndexer.getMap().asScala.map(_.swap); // asScala is magic 49 | var tmap = TreeMap(invMap.toSeq:_*); // sort the map 50 | for ((idx,str) <- tmap){ 51 | printerAnaph.println(idx + " : " + str); 52 | } 53 | printerAnaph.flush(); 54 | printerAnaph.close(); 55 | 56 | // write oracle pred clustering for train 57 | TextPickler.writePredOracleClusterings(trainDocGraphs, pfx+"TrainOPCs.txt"); 58 | 59 | // now do pairwise features 60 | trainDocGraphsOrigOrder = trainDocs.map(new DocumentGraph(_, true)); 61 | trainDocGraphs = if (MiniDriver.randomizeTrain) new scala.util.Random(0).shuffle(trainDocGraphsOrigOrder.sortBy(_.corefDoc.rawDoc.printableDocName)) else trainDocGraphsOrigOrder; 62 | var pwFeatureIndexer = new Indexer[String](); 63 | pwFeatureIndexer.getIndex(PairwiseIndexingFeaturizerJoint.UnkFeatName); 64 | // below we set anaphoricityMode = false 65 | var pwFeaturizer:PairwiseIndexingFeaturizer = null; 66 | if (smaller){ 67 | pwFeaturizer = new SmallerSeparatingFeaturizer(pwFeatureIndexer, MiniDriver.pairwiseFeats, MiniDriver.conjType, lexicalCounts, queryCounts, false); //anaphoricityMode=false 68 | } else{ 69 | pwFeaturizer = new 
SeparatingFeaturizer(pwFeatureIndexer, MiniDriver.pairwiseFeats, MiniDriver.conjType, lexicalCounts, queryCounts, false); //anaphoricityMode=false 70 | } 71 | //var pwFeaturizer = new SeparatingFeaturizer(pwFeatureIndexer, MiniDriver.pairwiseFeats, MiniDriver.conjType, lexicalCounts, queryCounts, false); //anaphoricityMode=false 72 | featurizerTrainer.featurizeBasic(trainDocGraphs, pwFeaturizer); 73 | pwFeaturizer.printFeatureTemplateCounts; 74 | // write pairwise train features 75 | TextPickler.writePWFeats(trainDocGraphs, pwFeatureIndexer.size(), pfx + "-" + MiniDriver.pairwiseFeats + "-" + "pwTrainFeats.txt"); 76 | 77 | // write pw feature mapping 78 | val printerPW = new PrintWriter(pfx+"-"+ MiniDriver.pairwiseFeats + "-" + "pwMapping.txt"); 79 | invMap = pwFeatureIndexer.getMap().asScala.map(_.swap); // asScala is magic 80 | tmap = TreeMap(invMap.toSeq:_*); // sort the map 81 | for ((idx,str) <- tmap){ 82 | printerPW.println(idx + " : " + str); 83 | } 84 | printerPW.flush(); 85 | printerPW.close(); 86 | 87 | // hopefully helps with gc 88 | trainDocs = null; 89 | trainDocGraphsOrigOrder = null; 90 | trainDocGraphs = null; 91 | 92 | var devDocs = CorefSystem.loadCorefDocs(MiniDriver.devPath, MiniDriver.devSize, numberGenderComputer, MiniDriver.useGoldMentions); 93 | var devDocGraphs = devDocs.map(new DocumentGraph(_, false)).sortBy(_.corefDoc.rawDoc.printableDocName); 94 | featurizerTrainer.featurizeBasic(devDocGraphs, anaphFeaturizer); // dev docs already know they are dev docs so they don't add features 95 | TextPickler.writeAnaphFeats(devDocGraphs, pfx + "-" + MiniDriver.pairwiseFeats + "-" + "anaphDevFeats.txt"); 96 | devDocGraphs = devDocs.map(new DocumentGraph(_, false)).sortBy(_.corefDoc.rawDoc.printableDocName); 97 | featurizerTrainer.featurizeBasic(devDocGraphs,pwFeaturizer); 98 | TextPickler.writePWFeats(devDocGraphs, pwFeatureIndexer.size(), pfx + "-" + MiniDriver.pairwiseFeats + "-" + "pwDevFeats.txt"); 99 | 100 | // write dev oracle predicted clustering 101 | TextPickler.writePredOracleClusterings(devDocGraphs, pfx+"DevOPCs.txt"); 102 | 103 | // do test docs 104 | devDocs = null; 105 | devDocGraphs = null; 106 | var testDocs = CorefSystem.loadCorefDocs(MiniDriver.testPath, MiniDriver.testSize, numberGenderComputer, MiniDriver.useGoldMentions); 107 | 108 | var testDocGraphs = testDocs.map(new DocumentGraph(_, false)).sortBy(_.corefDoc.rawDoc.printableDocName); 109 | featurizerTrainer.featurizeBasic(testDocGraphs, anaphFeaturizer); // test docs already know they are test docs so they don't add features 110 | TextPickler.writeAnaphFeats(testDocGraphs, pfx + "-" + MiniDriver.pairwiseFeats + "-" + "anaphTestFeats.txt"); 111 | testDocGraphs = testDocs.map(new DocumentGraph(_, false)).sortBy(_.corefDoc.rawDoc.printableDocName); 112 | featurizerTrainer.featurizeBasic(testDocGraphs,pwFeaturizer); 113 | TextPickler.writePWFeats(testDocGraphs, pwFeatureIndexer.size(), pfx + "-" + MiniDriver.pairwiseFeats + "-" + "pwTestFeats.txt"); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/harvard/nlp/moarcoref/MiniDriver.java: -------------------------------------------------------------------------------- 1 | package edu.harvard.nlp.moarcoref; 2 | 3 | import edu.berkeley.nlp.coref.lang.Language; 4 | import edu.berkeley.nlp.futile.util.Logger; 5 | import edu.berkeley.nlp.futile.fig.basic.Option; 6 | import edu.berkeley.nlp.futile.fig.exec.Execution; 7 | import edu.berkeley.nlp.coref.ConjType; 8 | 9 | /* 10 | * 
A minimal version of BCS's Driver.java 11 | */ 12 | public class MiniDriver implements Runnable { 13 | 14 | @Option(gloss = "Which experiment to run?") 15 | public static Mode mode = Mode.SMALLER; 16 | @Option(gloss = "Language choice") 17 | public static Language lang = Language.ENGLISH; 18 | 19 | // DATA AND PATHS 20 | @Option(gloss = "Path to number/gender data") 21 | public static String numberGenderDataPath = "gender.data"; 22 | @Option(gloss = "Path to Stanford Coref's animate unigrams") 23 | public static String animacyPath = "animate.unigrams.txt"; 24 | @Option(gloss = "Path to Stanford Coref's inanimate unigrams") 25 | public static String inanimacyPath = "inanimate.unigrams.txt"; 26 | @Option(gloss = "Path to training set") 27 | public static String trainPath = "flat_train_2012"; 28 | @Option(gloss = "Training set size, -1 for all") 29 | public static int trainSize = -1; 30 | @Option(gloss = "Path to dev set") 31 | public static String devPath = "flat_dev_2012"; 32 | @Option(gloss = "Dev set size, -1 for all") 33 | public static int devSize = -1; 34 | @Option(gloss = "Path to test set") 35 | public static String testPath = "flat_test_2012"; 36 | @Option(gloss = "Test set size, -1 for all") 37 | public static int testSize = -1; 38 | @Option(gloss = "Suffix to use for documents") 39 | public static String docSuffix = "auto_conll"; 40 | @Option(gloss = "Randomize the order of train documents") 41 | public static boolean randomizeTrain = true; 42 | 43 | @Option(gloss = "True if we should train on the documents with gold annotations, false if we should use auto annotations") 44 | public static boolean trainOnGold = false; 45 | @Option(gloss = "Use gold mentions.") 46 | public static boolean useGoldMentions = false; 47 | 48 | @Option(gloss = "Features to use; default is SURFACE, write \"+FINAL\" for FINAL") 49 | public static String pairwiseFeats = ""; 50 | @Option(gloss = "Conjunction type") 51 | public static ConjType conjType = ConjType.CANONICAL; 52 | @Option(gloss = "Cutoff below which lexical features fire POS tags instead") 53 | public static int lexicalFeatCutoff = 20; 54 | @Option(gloss = "Cutoff below which bilexical features fire backoff indicator feature") 55 | public static int bilexicalFeatCutoff = 10; 56 | 57 | 58 | public static enum Mode { 59 | SMALLER; 60 | } 61 | 62 | public static void main(String[] args) { 63 | MiniDriver main = new MiniDriver(); 64 | Execution.run(args, main); // add .class here if that class should receive command-line args 65 | } 66 | 67 | public void run() { 68 | Logger.setFig(); 69 | if (mode.equals(Mode.SMALLER)) { 70 | FeatureExtractor.writeSeparatedFeatsAndOraclePredClustering(true); 71 | } else { 72 | FeatureExtractor.writeSeparatedFeatsAndOraclePredClustering(false); 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /modifiedBCS/src/main/java/edu/harvard/nlp/moarcoref/TextPickler.scala: -------------------------------------------------------------------------------- 1 | package edu.harvard.nlp.moarcoref 2 | 3 | import edu.berkeley.nlp.coref.DocumentGraph 4 | import java.io.PrintWriter 5 | import scala.collection.mutable.HashSet 6 | import scala.collection.mutable.TreeSet 7 | 8 | object TextPickler { 9 | 10 | // we'll write in the following fmt. each doc will be on its own line. the line will start with the number of mentions 11 | // then will be feats_j0| .. |feats_jj|feats_{j+1}0|..|feats_{j+1}{j+1} etc. 
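// As a concrete illustration of the format described above (the feature indices below are
// made up), a document with three mentions would be serialized on one line as
//   3|12 47|5 9 23|12 88|7 41|3 61|12 47 90
// i.e. the mention count, then one '|'-delimited block of space-separated feature indices
// for each (mention, antecedent) pair in the order (0,0), (1,0), (1,1), (2,0), (2,1), (2,2),
// where each (i,i) block corresponds to the non-anaphoric (new-cluster) option for mention i.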
12 | def writeFeats(docGraphs: Seq[DocumentGraph], fiName: String){ 13 | val pw = new PrintWriter(fiName); 14 | for (dg <- docGraphs){ 15 | pw.print(dg.size); 16 | var i = 0; 17 | while (i < dg.size){ 18 | var j = 0; 19 | while (j <= i){ 20 | pw.print('|'); 21 | val feats = dg.cachedFeats(i)(j); 22 | var k = 0; 23 | while (k < feats.length){ 24 | pw.print(feats(k)); 25 | if (k < feats.length - 1){ 26 | pw.print(' '); 27 | } 28 | k += 1; 29 | } 30 | j += 1; 31 | } 32 | i += 1; 33 | } 34 | pw.println(); 35 | } 36 | pw.close(); 37 | } 38 | 39 | def writeFeats(docGraphs: Seq[DocumentGraph], fiName: String, prunedIndices: TreeSet[(Int, Int, Int)]) { 40 | val pw = new PrintWriter(fiName); 41 | for ((dg, d) <- docGraphs.zipWithIndex) { 42 | pw.print(dg.size); 43 | var i = 0; 44 | while (i < dg.size) { 45 | var j = 0; 46 | while (j <= i) { 47 | pw.print('|'); 48 | if (!prunedIndices.contains((d, i, j))) { 49 | val feats = dg.cachedFeats(i)(j); 50 | var k = 0; 51 | while (k < feats.length) { 52 | pw.print(feats(k)); 53 | if (k < feats.length - 1) { 54 | pw.print(' '); 55 | } 56 | k += 1; 57 | } 58 | } 59 | j += 1; 60 | } 61 | i += 1; 62 | } 63 | pw.println(); 64 | } 65 | pw.close(); 66 | } 67 | 68 | // just writes anaphoric feats for each mention 69 | def writeAnaphFeats(docGraphs: Seq[DocumentGraph], fiName: String){ 70 | val pw = new PrintWriter(fiName); 71 | for (dg <- docGraphs){ 72 | pw.print(dg.size); 73 | var i = 0; 74 | while (i < dg.size){ 75 | pw.print('|'); 76 | val feats = dg.cachedFeats(i)(i).sorted; 77 | var k = 0; 78 | while (k < feats.length){ 79 | pw.print(feats(k)); 80 | if (k < feats.length - 1){ 81 | pw.print(' '); 82 | } 83 | k += 1; 84 | } 85 | i += 1; 86 | } 87 | pw.println(); 88 | } 89 | pw.close(); 90 | } 91 | 92 | def writePWFeats(docGraphs: Seq[DocumentGraph], biasFeatIdx:Int, fiName: String){ 93 | val pw = new PrintWriter(fiName); 94 | for (dg <- docGraphs){ 95 | pw.print(dg.size); 96 | var i = 0; 97 | while (i < dg.size){ 98 | var j = 0; 99 | while (j < i){ 100 | pw.print('|'); 101 | val feats = dg.cachedFeats(i)(j); 102 | var k = 0; 103 | while (k < feats.length){ 104 | pw.print(feats(k)); 105 | if (k < feats.length - 1){ 106 | pw.print(' '); 107 | } 108 | k += 1; 109 | } 110 | j += 1; 111 | } 112 | // now just write one bias feature for non-anaphoric option 113 | pw.print('|'); 114 | pw.print(biasFeatIdx); // don't really use biasFeat anymore, but it indicates total number of features 115 | i += 1; 116 | } 117 | pw.println(); 118 | } 119 | pw.close(); 120 | } 121 | 122 | 123 | // format will be a sequence of clusters separated by '|'. this can be used during training 124 | // and also for loss fcns. 
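// For example (the mention indices here are hypothetical), a document whose oracle
// predicted clustering groups mentions {0, 3, 5}, {1, 2} and {4, 6, 7} would be written
// on a single line as
//   0 3 5|1 2|4 6 7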
125 | def writePredOracleClusterings(docGraphs: Seq[DocumentGraph], fiName: String){ 126 | val pw = new PrintWriter(fiName); 127 | for (dg <- docGraphs){ 128 | for ((clust,clustIdx) <- dg.getOraclePredClustering.clusters.zipWithIndex){ 129 | if (clustIdx > 0){ 130 | pw.print('|'); 131 | } 132 | val clustSize = clust.size; 133 | for ((ment,mentIdx) <- clust.zipWithIndex){ 134 | pw.print(ment); 135 | if (mentIdx < clustSize - 1){ 136 | pw.print(' '); 137 | } 138 | } 139 | } 140 | pw.println(); 141 | } 142 | pw.close(); 143 | } 144 | 145 | def writeMentHeads(docGraphs: Seq[DocumentGraph],fiName: String, lowercase:Boolean = false){ 146 | val pw = new PrintWriter(fiName); 147 | for (dg <- docGraphs){ 148 | pw.print(dg.size); 149 | var i = 0; 150 | while (i < dg.size){ 151 | pw.print('|'); 152 | if (lowercase){ 153 | pw.print(dg.corefDoc.predMentions(i).headStringLc); 154 | } else { 155 | pw.print(dg.corefDoc.predMentions(i).headString); 156 | } 157 | i += 1; 158 | } 159 | pw.println(); 160 | } 161 | pw.close(); 162 | } 163 | 164 | 165 | def writeFullMentandCtx(docGraphs: Seq[DocumentGraph],fiName: String, lowercase:Boolean = false){ 166 | val pw = new PrintWriter(fiName); 167 | for (dg <- docGraphs){ 168 | pw.print(dg.size); 169 | var i = 0; 170 | while (i < dg.size){ 171 | pw.print('|'); 172 | val ment = dg.corefDoc.predMentions(i); 173 | if (lowercase){ 174 | pw.print(ment.contextWordOrPlaceholder(-1).toLowerCase() + " ["); 175 | pw.print(ment.spanToString.toLowerCase() + "] "); 176 | pw.print(ment.contextWordOrPlaceholder(ment.words.size).toLowerCase); 177 | } else { 178 | pw.print(ment.contextWordOrPlaceholder(-1) + " ["); 179 | pw.print(dg.corefDoc.predMentions(i).spanToString + "] "); 180 | pw.print(ment.contextWordOrPlaceholder(ment.words.size)); 181 | } 182 | i += 1; 183 | } 184 | pw.println(); 185 | } 186 | pw.close(); 187 | } 188 | 189 | } 190 | -------------------------------------------------------------------------------- /nn/ante_model.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'coref_utils' 3 | require 'sparse_doc_data' 4 | local mu = require 'model_utils' 5 | 6 | torch.manualSeed(2) 7 | 8 | do 9 | local AnteModel = torch.class('AnteModel') 10 | 11 | function AnteModel:__init(pwD, hiddenPW, cuda, dop) 12 | torch.manualSeed(2) 13 | if cuda then 14 | cutorch.manualSeed(2) 15 | end 16 | self.hiddenPW = hiddenPW 17 | 18 | local pwNet = nn.Sequential() 19 | pwNet:add(nn.LookupTable(pwD,hiddenPW)) 20 | pwNet:add(nn.Sum(2)) 21 | pwNet:add(nn.Add(hiddenPW)) 22 | pwNet:add(nn.Tanh()) 23 | pwNet:add(nn.Dropout(dop)) 24 | pwNet:add(nn.Linear(hiddenPW,1)) 25 | 26 | -- make sure contiguous, and do sparse init while we're at it 27 | recSutsInit(pwNet,15) 28 | pwNet:get(1).weight[-1]:fill(0) -- assume last feature is a dummy, padding feature 29 | self.pwNet = cuda and pwNet:cuda() or pwNet 30 | collectgarbage() 31 | end 32 | 33 | function AnteModel:docGrad(d,batch,clust,deltTensor,numMents) 34 | for m = 2, numMents do -- ignore first guy; always NA 35 | if clust:anaphoric(m) then 36 | local start = ((m-2)*(m-1))/2 -- one behind first pair for mention m 37 | local scores = self.pwNet:forward(batch:sub(start+1,start+m-1)):squeeze(2) 38 | local late = maxGoldAnt(clust,scores,m,0) 39 | local pred = simpleAnteLAArgmax(clust.m2c,scores,m,late,0) 40 | if clust.m2c[pred] ~= clust.m2c[late] then 41 | self.pwNet:forward(batch:sub(start+pred,start+pred)) 42 | self.pwNet:backward(batch:sub(start+pred,start+pred),deltTensor) 43 | 
self.pwNet:forward(batch:sub(start+late,start+late)) 44 | self.pwNet:backward(batch:sub(start+late,start+late),-deltTensor) 45 | end 46 | end 47 | end 48 | end 49 | 50 | 51 | function AnteModel:getDevAcc(pwDevData,devClusts,cuda) 52 | assert(self.pwNet:get(1).weight[-1]:abs():sum() == 0) 53 | assert(self.pwNet.train == false) 54 | local total = 0 55 | local correct = 0 56 | for d = 1, pwDevData.numDocs do 57 | if d % 100 == 0 then 58 | print("dev doc " .. tostring(d)) 59 | collectgarbage() 60 | end 61 | local numMents = pwDevData:numMents(d) 62 | local docBatch = pwDevData:getDocBatch(d) 63 | if cuda then 64 | docBatch = docBatch:cuda() 65 | end 66 | for m = 2, numMents do 67 | if devClusts[d]:anaphoric(m) then 68 | local start = ((m-2)*(m-1))/2 69 | local scores = self.pwNet:forward(docBatch:sub(start+1,start+m-1)):squeeze(2) 70 | local _, pred = torch.max(scores,1) 71 | total = total + 1 72 | if devClusts[d].m2c[m] == devClusts[d].m2c[pred[1]] then 73 | correct = correct + 1 74 | end 75 | end 76 | end 77 | end 78 | return correct/total 79 | end 80 | 81 | function AnteModel:docLoss(d,batch,clust,numMents) 82 | local loss = 0 83 | for m = 2, numMents do -- ignore first guy; always NA 84 | if clust:anaphoric(m) then 85 | local start = ((m-1)*(m-2))/2 -- index one behind first antecedent for this mention (in pwData) 86 | local scores = self.pwNet:forward(batch:sub(start+1,start+m-1)):squeeze(2) 87 | local late = maxGoldAnt(clust,scores,m,0) 88 | local pred = simpleAnteLAArgmax(clust.m2c,scores,m,late,0) 89 | if clust.m2c[pred] ~= clust.m2c[late] then 90 | loss = loss + (1 + scores[pred] - scores[late]) 91 | end 92 | end 93 | end 94 | return loss 95 | end 96 | 97 | end 98 | 99 | function train(pwData,clusts,pwDevData,devClusts,cuda) 100 | local anteModel = AnteModel(pwData.maxFeat+1, opts.H, cuda, opts.dop) 101 | local serFi = string.format("models/%s_%d.model", opts.savePrefix, opts.H) 102 | local params, gradParams = anteModel.pwNet:getParameters() 103 | local optState = {} 104 | local deltTensor = cuda and torch.ones(1,1):cuda() or torch.ones(1,1) 105 | for t = 1, opts.nEpochs do 106 | print("epoch: " .. tostring(t)) 107 | anteModel.pwNet:training() 108 | -- use document sized minibatches 109 | for d = 1, pwData.numDocs do 110 | if d % 200 == 0 then 111 | print("doc " .. tostring(d)) 112 | collectgarbage() 113 | end 114 | local batch = pwData:getDocBatch(d) 115 | if cuda then 116 | batch = batch:cuda() 117 | end 118 | gradParams:zero() 119 | anteModel:docGrad(d,batch,clusts[d],deltTensor,pwData:numMents(d)) 120 | -- do pw gradients 121 | mu.adagradStep(params,gradParams,opts.eta,optState) 122 | end 123 | 124 | print("evaluating on dev...") 125 | anteModel.pwNet:evaluate() 126 | local currAcc = anteModel:getDevAcc(pwDevData,devClusts,cuda) 127 | print("Acc " .. 
tostring(currAcc)) 128 | print("") 129 | end 130 | if opts.save then 131 | print("overwriting params...") 132 | torch.save(serFi..string.format("-pw-%f",opts.eta), anteModel.pwNet) 133 | end 134 | end 135 | 136 | 137 | cmd = torch.CmdLine() 138 | cmd:text() 139 | cmd:text() 140 | cmd:text('Training ante model') 141 | cmd:text() 142 | cmd:text('Options') 143 | cmd:option('-H', 700, 'Hidden layer size') 144 | cmd:option('-trainClustFile', '../SMALLTrainOPCs.txt', 'Train Oracle Predicted Clustering File') 145 | cmd:option('-devClustFile', '../SMALLDevOPCs.txt', 'Dev Oracle Predicted Clustering File') 146 | cmd:option('-pwTrFeatPrefix', 'train_small', 'Expects train pairwise features in -pw-*.h5') 147 | cmd:option('-pwDevFeatPrefix', 'dev_small', 'Expects dev pairwise features in -pw-*.h5') 148 | cmd:option('-nEpochs', 20, 'Number of epochs to train') 149 | cmd:option('-save', false, 'Save best model') 150 | cmd:option('-savePrefix', 'small', 'Prefixes saved model with this') 151 | cmd:option('-gpuid', -1, 'if >= 0, gives idx of gpu to use') 152 | cmd:option('-eta', 0.1, 'adagrad learning rate') 153 | cmd:option('-dop', 0.5, 'dropout rate') 154 | cmd:text() 155 | 156 | -- Parse input options 157 | opts = cmd:parse(arg) 158 | 159 | if opts.gpuid >= 0 then 160 | print('using cuda on gpu ' .. opts.gpuid) 161 | require 'cutorch' 162 | require 'cunn' 163 | cutorch.manualSeed(2) 164 | cutorch.setDevice(opts.gpuid+1) 165 | end 166 | 167 | function main() 168 | local pwTrData = SpDMPWData.loadFromH5(opts.pwTrFeatPrefix) 169 | print("read pw train data") 170 | print("max pw feature is: " .. pwTrData.maxFeat) 171 | local pwDevData = SpDMPWData.loadFromH5(opts.pwDevFeatPrefix) 172 | print("read pw dev data") 173 | local trClusts = getOPCs(opts.trainClustFile,pwTrData) 174 | print("read train clusters") 175 | local devClusts = getOPCs(opts.devClustFile,pwDevData) 176 | print("read dev clusters") 177 | 178 | train(pwTrData,trClusts,pwDevData,devClusts,opts.gpuid >= 0) 179 | end 180 | 181 | main() 182 | -------------------------------------------------------------------------------- /nn/model_utils.lua: -------------------------------------------------------------------------------- 1 | local model_utils = {} 2 | 3 | function model_utils.adagradStep(x,dfdx,eta,state) 4 | if not state.var then 5 | state.var = torch.Tensor():typeAs(x):resizeAs(x):zero() 6 | state.std = torch.Tensor():typeAs(x):resizeAs(x) 7 | end 8 | state.var:addcmul(1,dfdx,dfdx) 9 | state.std:sqrt(state.var) 10 | x:addcdiv(-eta, dfdx, state.std:add(1e-10)) 11 | end 12 | 13 | function model_utils.make_sp_mlp(D,H,zeroLast,justFirstLayer,dop) 14 | local mlp = nn.Sequential() 15 | mlp:add(nn.LookupTable(D,H)) 16 | mlp:add(nn.Sum(2)) 17 | mlp:add(nn.Add(H)) -- add a bias 18 | mlp:add(nn.Tanh()) 19 | if not justFirstLayer then 20 | mlp:add(nn.Dropout(dop or 0.5)) 21 | mlp:add(nn.Linear(H,1)) 22 | end 23 | -- make sure contiguous, and do sparse sutskever init while we're at it 24 | recSutsInit(mlp,15) 25 | if zeroLast then 26 | mlp:get(1).weight[-1]:fill(0) 27 | end 28 | return mlp 29 | end 30 | 31 | 32 | function model_utils.make_sp_and_dense_mlp(spD,dD,H,zeroLast,justFirstLayer,dop) 33 | local mlp = nn.Sequential() 34 | local parLayer = nn.ParallelTable() 35 | local left = nn.Sequential() 36 | left:add(nn.LookupTable(spD,H)) 37 | left:add(nn.Sum(2)) -- after this pt, will have totalNumMents x H 38 | local right = nn.Sequential() 39 | right:add(nn.Linear(dD,H)) -- just handles the distance feature (and the bias, conveniently) 40 | 
parLayer:add(left) 41 | parLayer:add(right) 42 | mlp:add(parLayer) 43 | mlp:add(nn.CAddTable()) 44 | mlp:add(nn.Tanh()) 45 | if not justFirstLayer then 46 | mlp:add(nn.Dropout(dop or 0.5)) 47 | mlp:add(nn.Linear(H,1)) 48 | end 49 | recSutsInit(mlp,15) 50 | if zeroLast then 51 | mlp:get(1):get(1):get(1).weight[-1]:fill(0) 52 | end 53 | return mlp 54 | end 55 | 56 | 57 | function sparseSutsMatInit(W,numNZ,scale) 58 | local numNZ = numNZ or 15 59 | local scale = scale or 0.25 60 | local m = W:size(1) 61 | local n = W:size(2) 62 | -- zero everything out 63 | W:fill(0) 64 | if n >= m then -- assume columns are features and rows are hidden dims 65 | numNZ = math.min(numNZ,n) 66 | for i = 1, m do 67 | local perm = torch.randperm(n) 68 | -- probably better ways of doing this 69 | local r = torch.randn(numNZ)*scale 70 | for j = 1, numNZ do 71 | W[i][perm[j]] = r[j] 72 | end 73 | end 74 | else -- assume rows are features and columns hidden dims 75 | numNZ = math.min(numNZ,m) 76 | for j = 1, n do 77 | local perm = torch.randperm(m) 78 | local r = torch.randn(numNZ)*scale 79 | for i = 1, numNZ do 80 | W[perm[i]][j] = r[i] 81 | end 82 | end 83 | end 84 | end 85 | 86 | function recSutsInit(net,numNZ) -- assuming no module can have weight and children 87 | local numNZ = numNZ or 15 88 | if net.weight and net.bias then 89 | sparseSutsMatInit(net.weight, math.min(numNZ,net.weight:size(1),net.weight:size(2))) 90 | net.bias:fill(0.5) 91 | elseif net.weight then 92 | sparseSutsMatInit(net.weight, math.min(numNZ,net.weight:size(1),net.weight:size(2))) 93 | elseif net.bias then 94 | net.bias:fill(0.5) 95 | elseif net.modules and #net.modules > 0 then 96 | for layer, subnet in ipairs(net.modules) do 97 | recSutsInit(subnet, numNZ) 98 | end 99 | end 100 | end 101 | 102 | -- stolen from https://github.com/karpathy/char-rnn/blob/master/util/model_utils.lua 103 | function model_utils.combine_all_parameters(...) 
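-- Illustrative usage (the two network names below are hypothetical): flattening the
-- parameters of several modules into single tensors for a joint optimizer step, e.g.
--   local params, gradParams = model_utils.combine_all_parameters(naNet, pwNet)
-- after which params and gradParams are flat views over the weights and gradients of
-- both networks.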
104 | --[[ like module:getParameters, but operates on many modules ]]-- 105 | 106 | -- get parameters 107 | local networks = {...} 108 | local parameters = {} 109 | local gradParameters = {} 110 | for i = 1, #networks do 111 | local tn = torch.typename(layer) 112 | local net_params, net_grads = networks[i]:parameters() 113 | if net_params then 114 | for _, p in pairs(net_params) do 115 | parameters[#parameters + 1] = p 116 | end 117 | for _, g in pairs(net_grads) do 118 | gradParameters[#gradParameters + 1] = g 119 | end 120 | end 121 | end 122 | 123 | local function storageInSet(set, storage) 124 | local storageAndOffset = set[torch.pointer(storage)] 125 | if storageAndOffset == nil then 126 | return nil 127 | end 128 | local _, offset = unpack(storageAndOffset) 129 | return offset 130 | end 131 | 132 | -- this function flattens arbitrary lists of parameters, 133 | -- even complex shared ones 134 | local function flatten(parameters) 135 | if not parameters or #parameters == 0 then 136 | return torch.Tensor() 137 | end 138 | local Tensor = parameters[1].new 139 | 140 | local storages = {} 141 | local nParameters = 0 142 | for k = 1,#parameters do 143 | local storage = parameters[k]:storage() 144 | if not storageInSet(storages, storage) then 145 | storages[torch.pointer(storage)] = {storage, nParameters} 146 | nParameters = nParameters + storage:size() 147 | end 148 | end 149 | 150 | local flatParameters = Tensor(nParameters):fill(1) 151 | local flatStorage = flatParameters:storage() 152 | 153 | for k = 1,#parameters do 154 | local storageOffset = storageInSet(storages, parameters[k]:storage()) 155 | parameters[k]:set(flatStorage, 156 | storageOffset + parameters[k]:storageOffset(), 157 | parameters[k]:size(), 158 | parameters[k]:stride()) 159 | parameters[k]:zero() 160 | end 161 | 162 | local maskParameters= flatParameters:float():clone() 163 | local cumSumOfHoles = flatParameters:float():cumsum(1) 164 | local nUsedParameters = nParameters - cumSumOfHoles[#cumSumOfHoles] 165 | local flatUsedParameters = Tensor(nUsedParameters) 166 | local flatUsedStorage = flatUsedParameters:storage() 167 | 168 | for k = 1,#parameters do 169 | local offset = cumSumOfHoles[parameters[k]:storageOffset()] 170 | parameters[k]:set(flatUsedStorage, 171 | parameters[k]:storageOffset() - offset, 172 | parameters[k]:size(), 173 | parameters[k]:stride()) 174 | end 175 | 176 | for _, storageAndOffset in pairs(storages) do 177 | local k, v = unpack(storageAndOffset) 178 | flatParameters[{{v+1,v+k:size()}}]:copy(Tensor():set(k)) 179 | end 180 | 181 | if cumSumOfHoles:sum() == 0 then 182 | flatUsedParameters:copy(flatParameters) 183 | else 184 | local counter = 0 185 | for k = 1,flatParameters:nElement() do 186 | if maskParameters[k] == 0 then 187 | counter = counter + 1 188 | flatUsedParameters[counter] = flatParameters[counter+cumSumOfHoles[k]] 189 | end 190 | end 191 | assert (counter == nUsedParameters) 192 | end 193 | return flatUsedParameters 194 | end 195 | 196 | -- flatten parameters and gradients 197 | local flatParameters = flatten(parameters) 198 | local flatGradParameters = flatten(gradParameters) 199 | 200 | -- return new flat vector that contains all discrete parameters 201 | return flatParameters, flatGradParameters 202 | end 203 | 204 | 205 | 206 | return model_utils 207 | -------------------------------------------------------------------------------- /nn/sparse_doc_data.lua: -------------------------------------------------------------------------------- 1 | require 'hdf5' 2 | 3 | SpDMPWData = {} -- 
for pairwise mention data 4 | SpDMPWData.__index = SpDMPWData 5 | 6 | function SpDMPWData.loadFromH5(featPfx) 7 | spdmmd = {} 8 | setmetatable(spdmmd,SpDMPWData) 9 | local featfi = assert(hdf5.open(featPfx .. "-pw-feats.h5")) 10 | spdmmd.feats = featfi:read("feats"):all() 11 | featfi:close() 12 | local offsetfi = assert(hdf5.open(featPfx .. "-pw-offsets.h5")) 13 | spdmmd.docStarts = offsetfi:read("doc_starts"):all() 14 | spdmmd.mentStarts = offsetfi:read("ment_starts"):all() 15 | offsetfi:close() 16 | spdmmd.numDocs = spdmmd.docStarts:size(1)-1 17 | spdmmd.maxFeat = spdmmd.feats:max() 18 | collectgarbage() 19 | assert(spdmmd.feats:isContiguous()) 20 | -- below only works if every pair actually has same number of features... 21 | spdmmd.numNZ = spdmmd.mentStarts[2] - spdmmd.mentStarts[1] 22 | return spdmmd 23 | end 24 | 25 | 26 | function SpDMPWData.makeFromTensors(feats,docStarts,mentStarts) -- for debugging 27 | spdmmd = {} 28 | setmetatable(spdmmd,SpDMPWData) 29 | spdmmd.feats = feats 30 | spdmmd.docStarts = docStarts 31 | spdmmd.mentStarts = mentStarts 32 | spdmmd.numDocs = spdmmd.docStarts:size(1)-1 33 | spdmmd.maxFeat = spdmmd.feats:max() 34 | spdmmd.numNZ = spdmmd.mentStarts[2] - spdmmd.mentStarts[1] 35 | collectgarbage() 36 | assert(spdmmd.feats:isContiguous()) 37 | return spdmmd 38 | end 39 | 40 | 41 | -- d1m2m1, d1m3m1, d1m3m2, d1m4m1, d1m4m2, d1m4m3 42 | -- this assumes self.docStarts and self.mentStarts begin at 0 rather than 1 43 | function SpDMPWData:getFeats(d,m,a) 44 | local docStartIdx = self.docStarts[d] --idx within self.MentStarts 1 behind where this doc starts 45 | local mentAntOffset = (m-2)*(m-1)/2 + a 46 | return self.feats:sub(self.mentStarts[docStartIdx+mentAntOffset]+1, self.mentStarts[docStartIdx+mentAntOffset+1]) 47 | end 48 | 49 | 50 | function SpDMPWData:numMents(d) -- solve the quadratic equation (for the positive root) 51 | -- we want m such that m*(m-1)/2 = numPairs => m^2 -m -2*numPairs = 0 52 | local numPairs = self.docStarts[d+1] - self.docStarts[d] 53 | return (1 + math.sqrt(1 + 4*2*numPairs))/2 54 | end 55 | 56 | function SpDMPWData:getMentBatch(d,m) 57 | local docStartIdx = self.docStarts[d] --idx within self.MentStarts 1 behind where this doc starts 58 | --local mentAntOffset = (m-2)*(m-1)/2 + a 59 | local mentOffset = (m-2)*(m-1)/2 60 | return self.feats:sub(self.mentStarts[docStartIdx+mentOffset+1]+1, self.mentStarts[docStartIdx+mentOffset+m]):view(m-1,self.numNZ) 61 | end 62 | 63 | function SpDMPWData:getDocBatch(d) 64 | local docStartIdx = self.docStarts[d] 65 | local nextDocStartIdx = self.docStarts[d+1] 66 | local feats = self.feats:sub(self.mentStarts[docStartIdx+1]+1, self.mentStarts[nextDocStartIdx+1]) 67 | local numRows = feats:size(1)/self.numNZ 68 | return feats:view(numRows, self.numNZ) 69 | end 70 | 71 | SpDMData = {} -- for just mention data 72 | SpDMData.__index = SpDMData 73 | 74 | function SpDMData.loadFromH5(featPfx) 75 | spdmd = {} 76 | setmetatable(spdmd,SpDMData) 77 | local featfi = assert(hdf5.open(featPfx .. "-na-feats.h5")) 78 | spdmd.feats = featfi:read("feats"):all() 79 | featfi:close() 80 | local offsetfi = assert(hdf5.open(featPfx .. 
"-na-offsets.h5")) 81 | spdmd.docStarts = offsetfi:read("doc_starts"):all() 82 | spdmd.mentStarts = offsetfi:read("ment_starts"):all() 83 | spdmd.numDocs = spdmd.docStarts:size(1)-1 84 | offsetfi:close() 85 | spdmd.maxFeat = spdmd.feats:max() 86 | collectgarbage() 87 | assert(spdmd.feats:isContiguous()) 88 | return spdmd 89 | end 90 | 91 | function SpDMData.makeFromTensors(feats,docStarts,mentStarts) 92 | spdmd = {} 93 | setmetatable(spdmd,SpDMData) 94 | spdmd.feats = feats 95 | spdmd.docStarts = docStarts 96 | spdmd.mentStarts = mentStarts 97 | spdmd.numDocs = spdmd.docStarts:size(1)-1 98 | spdmd.maxFeat = spdmd.feats:max() 99 | collectgarbage() 100 | assert(spdmd.feats:isContiguous()) 101 | return spdmd 102 | end 103 | 104 | -- d1m2, d1m3,... 105 | -- assumes mentStarts and docStarts start at 0 106 | function SpDMData:getFeats(d,m) 107 | local docStartIdx = self.docStarts[d] --idx within self.MentStarts that this doc starts 108 | return self.feats:sub(self.mentStarts[docStartIdx+m-1]+1, self.mentStarts[docStartIdx+m]) 109 | end 110 | 111 | 112 | function SpDMData:numMents(d) 113 | return (self.docStarts[d+1] - self.docStarts[d]) + 1 114 | end 115 | 116 | function SpDMData:docMiniBatch(d) -- this will only work if each mention has same # of features 117 | local docStartIdx = self.docStarts[d] 118 | local numMents = self:numMents(d) 119 | local docFeats = self.feats:sub(self.mentStarts[docStartIdx+1]+1,self.mentStarts[docStartIdx+numMents]) 120 | local numCols = docFeats:size(1)/(numMents-1) 121 | return docFeats:view(numMents-1,numCols) 122 | end 123 | 124 | do 125 | local SpKFDMData = torch.class('SpKFDMData') 126 | 127 | -- just gonna do a hacky thing for 2 constructors 128 | function SpKFDMData:__init(featPfx,docStarts,numNZ,feats) 129 | if featPfx ~= nil then 130 | local featfi = assert(hdf5.open(featPfx .. "-na-feats.h5")) 131 | local allfeats = featfi:read("feats"):all():long() 132 | featfi:close() 133 | local offsetfi = assert(hdf5.open(featPfx .. 
"-na-offsets.h5")) 134 | self.docStarts = offsetfi:read("doc_starts"):all() 135 | local mentStarts = offsetfi:read("ment_starts"):all() 136 | self.numNZ = mentStarts[2] - mentStarts[1] 137 | self.numDocs = self.docStarts:size(1)-1 138 | offsetfi:close() 139 | self.maxFeat = allfeats:max() 140 | self.feats = allfeats:view(allfeats:size(1)/self.numNZ, self.numNZ) 141 | else 142 | self.docStarts = docStarts 143 | self.numNZ = numNZ 144 | self.numDocs = self.docStarts:size(1)-1 145 | self.maxFeat = feats:max() 146 | self.feats = feats:view(feats:size(1)/self.numNZ, self.numNZ) 147 | end 148 | collectgarbage() 149 | assert(self.feats:isContiguous()) 150 | end 151 | 152 | function SpKFDMData:getFeats(d,m) 153 | return self.feats[self.docStarts[d]+m] 154 | end 155 | 156 | function SpKFDMData:docBatch(d) 157 | return self.feats:sub(self.docStarts[d]+1,self.docStarts[d+1]) 158 | end 159 | 160 | function SpKFDMData:numMents(d) 161 | return (self.docStarts[d+1] - self.docStarts[d]) 162 | end 163 | 164 | function SpKFDMData:cudify() 165 | self.feats = self.feats:cuda() 166 | self.docStarts = self.docStarts:cuda() 167 | collectgarbage() 168 | assert(self.feats:getDevice() ~= nil) 169 | assert(self.docStarts:getDevice() ~= nil) 170 | end 171 | 172 | end -------------------------------------------------------------------------------- /nncoref_acl15_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiseman/nn_coref/e29b16deecd0d87d4b7c145e07e2908266fe63d6/nncoref_acl15_slides.pdf -------------------------------------------------------------------------------- /nncoref_naacl16_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swiseman/nn_coref/e29b16deecd0d87d4b7c145e07e2908266fe63d6/nncoref_naacl16_slides.pdf -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/README.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | CorScorer: Perl package for scoring coreference resolution systems 3 | using different metrics. 4 | 5 | 6 | VERSION 7 | v8.01 -- reference implementations of MUC, B-cubed, CEAF and BLANC metrics. 8 | 9 | 10 | CHANGES SINCE v8.0 11 | - fixed a bug that crashed the BLANC scorer when a duplicate singleton 12 | mention was present in the response. 13 | 14 | INSTALLATION 15 | Requirements: 16 | 1. Perl: downloadable from http://perl.org 17 | 2. Algorithm-Munkres: included in this package and downloadable 18 | from CPAN http://search.cpan.org/~tpederse/Algorithm-Munkres-0.08 19 | 20 | USE 21 | This package is distributed with two scripts to execute the scorer from 22 | the command line. 
23 | 24 | Windows (tm): scorer.bat 25 | Linux: scorer.pl 26 | 27 | 28 | SYNOPSIS 29 | use CorScorer; 30 | 31 | $metric = 'ceafm'; 32 | 33 | # Scores the whole dataset 34 | &CorScorer::Score($metric, $keys_file, $response_file); 35 | 36 | # Scores one file 37 | &CorScorer::Score($metric, $keys_file, $response_file, $name); 38 | 39 | 40 | INPUT 41 | metric: the metric desired to score the results: 42 | muc: MUCScorer (Vilain et al, 1995) 43 | bcub: B-Cubed (Bagga and Baldwin, 1998) 44 | ceafm: CEAF (Luo et al., 2005) using mention-based similarity 45 | ceafe: CEAF (Luo et al., 2005) using entity-based similarity 46 | blanc: BLANC (Luo et al., 2014) BLANC metric for gold and predicted mentions 47 | all: uses all the metrics to score 48 | 49 | keys_file: file with expected coreference chains in CoNLL-2011/2012 format 50 | 51 | response_file: file with output of coreference system (CoNLL-2011/2012 format) 52 | 53 | name: [optional] the name of the document to score. If name is not 54 | given, all the documents in the dataset will be scored. If given 55 | name is "none" then all the documents are scored but only total 56 | results are shown. 57 | 58 | 59 | OUTPUT 60 | The score subroutine returns an array with four values in this order: 61 | 1) Recall numerator 62 | 2) Recall denominator 63 | 3) Precision numerator 64 | 4) Precision denominator 65 | 66 | Also recall, precision and F1 are printed in the standard output when variable 67 | $VERBOSE is not null. 68 | 69 | Final scores: 70 | Recall = recall_numerator / recall_denominator 71 | Precision = precision_numerator / precision_denominator 72 | F1 = 2 * Recall * Precision / (Recall + Precision) 73 | 74 | Identification of mentions 75 | An scorer for identification of mentions (recall, precision and F1) is also included. 76 | Mentions from system response are compared with key mentions. This version performs 77 | strict mention matching as was used in the CoNLL-2011 and 2012 shared tasks. 78 | 79 | AUTHORS 80 | Emili Sapena, Universitat Politècnica de Catalunya, http://www.lsi.upc.edu/~esapena, esapena lsi.upc.edu 81 | Sameer Pradhan, sameer.pradhan childrens.harvard.edu 82 | Sebastian Martschat, sebastian.martschat h-its.org 83 | Xiaoqiang Luo, xql google.com 84 | 85 | COPYRIGHT AND LICENSE 86 | Copyright (C) 2009-2011, Emili Sapena esapena lsi.upc.edu 87 | 2011-2014, Sameer Pradhan sameer.pradhan childrens.harvard.edu 88 | 89 | This program is free software; you can redistribute it and/or modify it 90 | under the terms of the GNU General Public License as published by the 91 | Free Software Foundation; either version 2 of the License, or (at your 92 | option) any later version. This program is distributed in the hope that 93 | it will be useful, but WITHOUT ANY WARRANTY; without even the implied 94 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 95 | GNU General Public License for more details. 96 | 97 | You should have received a copy of the GNU General Public License along 98 | with this program; if not, write to the Free Software Foundation, Inc., 99 | 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
100 | 101 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/scorer.bat: -------------------------------------------------------------------------------- 1 | @rem = '--*-Perl-*-- 2 | @echo off 3 | if "%OS%" == "Windows_NT" goto WinNT 4 | perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9 5 | goto endofperl 6 | :WinNT 7 | perl -x -S %0 %* 8 | if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl 9 | if %errorlevel% == 9009 echo You do not have Perl in your PATH. 10 | if errorlevel 1 goto script_failed_so_exit_with_non_zero_val 2>nul 11 | goto endofperl 12 | @rem '; 13 | #!perl 14 | #line 15 15 | 16 | BEGIN { 17 | $d = $0; 18 | $d =~ s/\/[^\/][^\/]*$//g; 19 | push(@INC, $d."/lib"); 20 | } 21 | 22 | use strict; 23 | use CorScorer; 24 | 25 | if (@ARGV < 3) { 26 | print q| 27 | use: scorer.bat [name] 28 | 29 | metric: the metric desired to score the results: 30 | muc: MUCScorer (Vilain et al, 1995) 31 | bcub: B-Cubed (Bagga and Baldwin, 1998) 32 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity 33 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity 34 | all: uses all the metrics to score 35 | 36 | keys_file: file with expected coreference chains in SemEval format 37 | 38 | response_file: file with output of coreference system (SemEval format) 39 | 40 | name: [optional] the name of the document to score. If name is not 41 | given, all the documents in the dataset will be scored. If given 42 | name is "none" then all the documents are scored but only total 43 | results are shown. 44 | 45 | |; 46 | exit; 47 | } 48 | 49 | my $metric = shift (@ARGV); 50 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|all)/i) { 51 | print "Invalid metric\n"; 52 | exit; 53 | } 54 | 55 | 56 | if ($metric eq 'all') { 57 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe') { 58 | print "\nMETRIC $m:\n"; 59 | &CorScorer::Score( $m, @ARGV ); 60 | } 61 | } 62 | else { 63 | &CorScorer::Score( $metric, @ARGV ); 64 | } 65 | 66 | __END__ 67 | :endofperl 68 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/scorer.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | BEGIN { 4 | $d = $0; 5 | $d =~ s/\/[^\/][^\/]*$//g; 6 | 7 | if ($d eq $0) { 8 | unshift(@INC, "lib"); 9 | } 10 | else { 11 | unshift(@INC, $d . "/lib"); 12 | } 13 | } 14 | 15 | use strict; 16 | use CorScorer; 17 | 18 | if (@ARGV < 3) { 19 | print q| 20 | use: scorer.pl [name] 21 | 22 | metric: the metric desired to score the results: 23 | muc: MUCScorer (Vilain et al, 1995) 24 | bcub: B-Cubed (Bagga and Baldwin, 1998) 25 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity 26 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity 27 | blanc: BLANC 28 | all: uses all the metrics to score 29 | 30 | keys_file: file with expected coreference chains in SemEval format 31 | 32 | response_file: file with output of coreference system (SemEval format) 33 | 34 | name: [optional] the name of the document to score. If name is not 35 | given, all the documents in the dataset will be scored. If given 36 | name is "none" then all the documents are scored but only total 37 | results are shown. 
38 | 39 | |; 40 | exit; 41 | } 42 | 43 | my $metric = shift(@ARGV); 44 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|blanc|all)/i) { 45 | print "Invalid metric\n"; 46 | exit; 47 | } 48 | 49 | if ($metric eq 'all') { 50 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc') { 51 | print "\nMETRIC $m:\n"; 52 | &CorScorer::Score($m, @ARGV); 53 | } 54 | } 55 | else { 56 | &CorScorer::Score($metric, @ARGV); 57 | } 58 | 59 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/CorefMetricTest.pm: -------------------------------------------------------------------------------- 1 | package CorefMetricTest; 2 | use strict; 3 | use warnings; 4 | use Exporter; 5 | 6 | our @ISA= qw(Exporter); 7 | our @EXPORT = qw(ComputeScoreFromCounts DiffExpectedAndActual); 8 | 9 | ################################################################################ 10 | # Compute recall, precision and F1. 11 | # 12 | # Input: (numerator_counts_for_recall, denominator_counts_for_recall, 13 | # numerator_counts_for_precision, denominator_counts_for_precision) 14 | # Output: (recall, precision, F1) 15 | ################################################################################ 16 | sub ComputeScoreFromCounts { 17 | # The first 4 are also coref link counts when using BLANC. 18 | my ($recall_numerator, $recall_denominator, 19 | $precision_numerator, $precision_denominator, @noncoref_counts) = @_; 20 | # The coref recall, precision, and F1 when using BLANC. 21 | my ($recall, $precision, $F1) = 22 | RPFFromCounts($recall_numerator, $recall_denominator, 23 | $precision_numerator, $precision_denominator); 24 | 25 | # BLANC: @noncoref_counts= 26 | # (noncoref_numerator_recall, noncoref_denominator_recall, 27 | # noncoref_numerator_precision, noncoref_denominator_precision) 28 | if (scalar(@noncoref_counts) == 4) { 29 | ($recall, $precision, $F1) = CorScorer::ComputeBLANCFromCounts( 30 | $recall_numerator, $recall_denominator, $precision_denominator, 31 | $noncoref_counts[0], $noncoref_counts[1], $noncoref_counts[3]); 32 | } 33 | $recall = ($recall < 0) ? 0 : $recall; 34 | $precision = ($precision < 0) ? 0 : $precision; 35 | $F1 = ($F1 < 0) ? 0 : $F1; 36 | return ($recall, $precision, $F1); 37 | } 38 | 39 | sub RPFFromCounts 40 | { 41 | my ($recall_numerator, $recall_denominator, 42 | $precision_numerator, $precision_denominator, @nonCorefCounts) = @_; 43 | my ($recall, $precision, $F1) = (-1, -1, 0); 44 | if ($recall_denominator > 0) { 45 | $recall = $recall_numerator / $recall_denominator; 46 | } 47 | if ($precision_denominator > 0) { 48 | $precision = $precision_numerator / $precision_denominator; 49 | } 50 | 51 | if (($recall + $precision) > 0) { 52 | $F1 = 2 * $recall * $precision / ($recall + $precision); 53 | } 54 | 55 | return ($recall, $precision, $F1); 56 | } 57 | 58 | # deprecated -- see CorScorer::ComputeBLANCFromCounts(). 59 | sub ComputeBLANCRPF 60 | { 61 | my ($coref_recall, $coref_precision, $coref_F1, 62 | $noncoref_recall, $noncoref_precision, $noncoref_F1) = @_; 63 | 64 | my ($recall, $precision, $F1); 65 | 66 | if ($coref_recall < 0 && $noncoref_recall < 0) { 67 | # no key mention. 68 | $recall = $precision = $F1 = 0; 69 | } elsif ($coref_recall < 0) { 70 | # key: all links are non-coref (mentions are all singltons). 71 | $recall = $noncoref_recall; 72 | $precision = ($noncoref_precision < 0) ? 
0 : $noncoref_precision; 73 | $F1 = $noncoref_F1; 74 | } elsif ($noncoref_recall < 0) { 75 | # key: all links are coref (all mentions are in one entity). 76 | $recall = $coref_recall; 77 | $precision = ($coref_precision < 0) ? 0 : $coref_precision; 78 | $F1 = $coref_F1; 79 | } else { 80 | #key contains both coref and non-coref links. 81 | if ($coref_precision < 0 && $noncoref_precision < 0) { 82 | # no response. 83 | $recall = $precision = $F1 = 0; 84 | } else { 85 | if ($coref_precision < 0) { 86 | # response: all links are non-coref, or response mentions are all 87 | # singletons. 88 | $coref_precision = 0; 89 | } elsif ($noncoref_precision < 0) { 90 | # response: all links are coref, or all mentions are in one entity. 91 | $noncoref_precision = 0; 92 | } 93 | $recall = ($coref_recall + $noncoref_recall)/2; 94 | $precision = ($coref_precision + $noncoref_precision)/2; 95 | $F1 = ($coref_F1 + $noncoref_F1)/2; 96 | } 97 | } 98 | 99 | return ($recall, $precision, $F1); 100 | } 101 | 102 | ############################################################################## 103 | # Compute the sum of the duifference between the expected recall, precision, 104 | # F1 and the actual one. 105 | ############################################################################## 106 | sub DiffExpectedAndActual { 107 | my ($expected, $actual) = @_; 108 | if (scalar(@$expected) != scalar(@$actual)) { 109 | print STDERR "Expected and actual have diff dimensions: \n"; 110 | print STDERR " Expected: ", join(" ", @$expected), "\n"; 111 | print STDERR " Actual: ", join(" ", @$actual), "\n"; 112 | return 1.0e5; 113 | } 114 | my $sum = 0.0; 115 | my $i = 0; 116 | foreach my $e (@$expected) { 117 | $sum += abs($e - $actual->[$i]); 118 | ++$i; 119 | } 120 | return $sum; 121 | } 122 | 123 | 1; 124 | 125 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-10.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 x - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 z - 17 | test2 0 5 e (4) 18 | test2 0 6 y - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-11.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 x - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 z - 17 | test2 0 5 e (0) 18 | test2 0 6 y - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-12.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 1) 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk (2) 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (3) 13 | test2 0 1 x - 14 | test2 0 2 d1 (4 15 | test2 0 3 d2 4) 16 | test2 0 4 z - 17 | test2 0 5 e (5) 18 | test2 0 6 y - 19 | test2 0 7 f1 (6) 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-13.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 0) 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk (0) 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 x - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 z - 17 | test2 0 5 e (0) 18 | test2 0 6 y - 19 | test2 0 7 f1 (0) 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 - 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c - 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 y (2) 17 | test2 0 5 e (2) 18 | test2 0 6 z (3) 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 x (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 (1 7 | test1 0 5 b3 1) 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 (3 7 | test1 0 5 b3 3) 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-7.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-8.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(3 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 3)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A-9.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(3(3(3(3(3(3(3(3(3(3 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 3)3)3)3)3)3)3)3)3)3)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-A.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-B-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10043 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10043) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 (10043 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 10043) 61 | 
nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 - 72 | nw/xinhua/00/chtb_0009 - 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-B.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (10043 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 10043) 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10054 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10054) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 - 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 - 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 - 72 | nw/xinhua/00/chtb_0009 - 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-C-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | 
nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10043 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10043) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 (10043 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 10043) 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 (10060) 72 | nw/xinhua/00/chtb_0009 (10060) 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-C.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (10043 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 10043) 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10054 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10054) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 
50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 - 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 - 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 (10060) 72 | nw/xinhua/00/chtb_0009 (10060) 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-D-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-D.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-E-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | 
nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (1) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (1) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (1) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (1) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (1) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-E.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-F-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-F.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | 
nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-G-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-G.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-H-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | 
nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-H.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-I-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-I.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | 
nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-J-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-J.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-K-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (2) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (2) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | 
nw/xinhua/00/chtb_0009 (3) 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-K.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (1) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (1) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (1) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-L-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (3) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-L.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (2) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (2) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 
- 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (1 15 | test2 0 3 d2 1) 16 | test2 0 4 jnk - 17 | test2 0 5 e (1) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk (0) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (0) 17 | test2 0 5 e - 18 | test2 0 6 jnk (0) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . 
- 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk (3) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (4) 17 | test2 0 5 e - 18 | test2 0 6 jnk (5) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk (1) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (1) 17 | test2 0 5 e - 18 | test2 0 6 jnk (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-M.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . 
- 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (1 15 | test2 0 3 d2 1) 16 | test2 0 4 jnk - 17 | test2 0 5 e (1) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk (3) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (4) 17 | test2 0 5 e - 18 | test2 0 6 jnk (5) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk (0) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (0) 17 | test2 0 5 e - 18 | test2 0 6 jnk (0) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk (1) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (1) 17 | test2 0 5 e - 18 | test2 0 6 jnk (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/DataFiles/TC-N.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /reference-coreference-scorers/v8.01/test/test.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | BEGIN { 4 | $d = $0; 5 | $d =~ s/\/[^\/][^\/]*$//g; 6 | push(@INC, $d); 7 | push(@INC, $d . 
"/../lib"); 8 | } 9 | 10 | use strict; 11 | use CorScorer; 12 | use CorefMetricTest; 13 | use CorefMetricTestConfig; 14 | 15 | my $error_tolerance = 1.e-4; 16 | my $script_dir = $0; 17 | $script_dir =~ s/\/[^\/][^\/]*$//g; 18 | 19 | foreach my $test_case (@CorefMetricTestConfig::TestCases) { 20 | my $id = $test_case->{'id'}; 21 | my @key_response_files = ($script_dir . "/" . $test_case->{'key_file'}, 22 | $script_dir . "/" . $test_case->{'response_file'}); 23 | print "\nTesting case ($id): keyFile=", $key_response_files[0], 24 | " responseFile=", $key_response_files[1], "\n"; 25 | my $expected_metrics = $test_case->{'expected_metrics'}; 26 | foreach my $metric_name (sort keys %$expected_metrics) { 27 | my $expected_values = $expected_metrics->{$metric_name}; 28 | *::SAVED_STDOUT = *STDOUT; 29 | *STDOUT = *::SUPRRES_STDOUT; 30 | my @actual_counts = &CorScorer::Score($metric_name, @key_response_files); 31 | # Compute R,P,and F1 from raw counts. 32 | my @actual_values = CorefMetricTest::ComputeScoreFromCounts(@actual_counts); 33 | *STDOUT = *::SAVED_STDOUT; 34 | my $diff = CorefMetricTest::DiffExpectedAndActual($expected_values, \@actual_values); 35 | printf " metric: %+10s", $metric_name; 36 | if ($diff < $error_tolerance) { 37 | print " => PASS\n"; 38 | } else { 39 | print " => FAIL\n"; 40 | print " Expected (recall, prec, F1) = (", join(" ", @$expected_values), ")\n"; 41 | print " Actual (recall, prec, F1) = (", join(" ", @actual_values), ")\n"; 42 | #exit(1); 43 | } 44 | } 45 | } 46 | 47 | --------------------------------------------------------------------------------