├── ANALYSIS.md ├── COREFERENCE.md ├── LICENSE ├── MANIFEST.in ├── MULTIGRAPH.md ├── README.md ├── bin ├── cort-predict-conll ├── cort-predict-raw ├── cort-train ├── cort-visualize └── run-multigraph ├── cort ├── __init__.py ├── analysis │ ├── __init__.py │ ├── data_structures.py │ ├── error_extractors.py │ ├── plotting.py │ ├── spanning_tree_algorithms.py │ ├── visualization.py │ └── visualization │ │ ├── TODO │ │ ├── lib │ │ ├── cort-for-raw.js │ │ ├── cort.js │ │ ├── jquery-2.1.1.min.js │ │ └── jquery.jsPlumb-1.6.4.js │ │ └── style.css ├── config_files │ ├── corenlp.ini │ └── corenlp_with_coref.ini ├── core │ ├── __init__.py │ ├── corpora.py │ ├── documents.py │ ├── external_data.py │ ├── head_finders.py │ ├── mention_extractor.py │ ├── mention_property_computer.py │ ├── mentions.py │ ├── mixins.py │ ├── singletons.py │ ├── spans.py │ └── util.py ├── coreference │ ├── __init__.py │ ├── approaches │ │ ├── __init__.py │ │ ├── antecedent_trees.py │ │ ├── mention_pairs.py │ │ └── mention_ranking.py │ ├── clusterer.py │ ├── cost_functions.py │ ├── experiments.py │ ├── features.py │ ├── instance_extractors.py │ ├── multigraph │ │ ├── __init__.py │ │ ├── decoders.py │ │ ├── features.py │ │ ├── multigraphs.py │ │ └── weighting_functions.py │ └── perceptrons.pyx ├── preprocessing │ ├── __init__.py │ └── pipeline.py ├── reference-coreference-scorers │ └── v8.01 │ │ ├── README.txt │ │ ├── lib │ │ ├── Algorithm │ │ │ ├── Munkres.pm │ │ │ └── README.Munkres │ │ ├── CorScorer.pm │ │ ├── Cwd.pm │ │ ├── Data │ │ │ └── Dumper.pm │ │ └── Math │ │ │ └── Combinatorics.pm │ │ ├── scorer.bat │ │ ├── scorer.pl │ │ └── test │ │ ├── CorefMetricTest.pm │ │ ├── CorefMetricTestConfig.pm │ │ ├── DataFiles │ │ ├── TC-A-1.response │ │ ├── TC-A-10.response │ │ ├── TC-A-11.response │ │ ├── TC-A-12.response │ │ ├── TC-A-13.response │ │ ├── TC-A-2.response │ │ ├── TC-A-3.response │ │ ├── TC-A-4.response │ │ ├── TC-A-5.response │ │ ├── TC-A-6.response │ │ ├── TC-A-7.response │ │ ├── TC-A-8.response │ │ ├── TC-A-9.response │ │ ├── TC-A.key │ │ ├── TC-B-1.response │ │ ├── TC-B.key │ │ ├── TC-C-1.response │ │ ├── TC-C.key │ │ ├── TC-D-1.response │ │ ├── TC-D.key │ │ ├── TC-E-1.response │ │ ├── TC-E.key │ │ ├── TC-F-1.response │ │ ├── TC-F.key │ │ ├── TC-G-1.response │ │ ├── TC-G.key │ │ ├── TC-H-1.response │ │ ├── TC-H.key │ │ ├── TC-I-1.response │ │ ├── TC-I.key │ │ ├── TC-J-1.response │ │ ├── TC-J.key │ │ ├── TC-K-1.response │ │ ├── TC-K.key │ │ ├── TC-L-1.response │ │ ├── TC-L.key │ │ ├── TC-M-1.response │ │ ├── TC-M-2.response │ │ ├── TC-M-3.response │ │ ├── TC-M-4.response │ │ ├── TC-M-5.response │ │ ├── TC-M-6.response │ │ ├── TC-M.key │ │ ├── TC-N-1.response │ │ ├── TC-N-2.response │ │ ├── TC-N-3.response │ │ ├── TC-N-4.response │ │ ├── TC-N-5.response │ │ ├── TC-N-6.response │ │ └── TC-N.key │ │ ├── TestCases.README │ │ └── test.pl ├── resources │ ├── coreferent_pairs.obj │ ├── female.list │ ├── male.list │ ├── neutral.list │ ├── plural.list │ └── singletons_not_cleaned.obj ├── test │ ├── __init__.py │ ├── analysis │ │ ├── __init__.py │ │ ├── test_data_structures.py │ │ ├── test_error_extractors.py │ │ └── test_spanning_tree_algorithms.py │ ├── core │ │ ├── __init__.py │ │ ├── resources │ │ │ └── input.conll │ │ ├── test_corpora.py │ │ ├── test_documents.py │ │ ├── test_external_data.py │ │ ├── test_head_finders.py │ │ ├── test_mention_extractor.py │ │ ├── test_mention_property_computer.py │ │ ├── test_mentions.py │ │ ├── test_spans.py │ │ └── test_util.py │ └── multigraph │ │ ├── __init__.py │ │ └── test_features.py 
└── util │ ├── __init__.py │ └── import_helper.py ├── plot.png ├── scripts ├── acl15demo.py ├── naacl15-demo.py └── train-and-predict-all.py ├── setup.py ├── stanford_corenlp_pywrapper ├── __init__.py ├── javasrc │ ├── corenlp │ │ ├── JsonPipeline.java │ │ ├── PipeRunner.java │ │ └── SocketServer.java │ └── util │ │ ├── Arr.java │ │ ├── BasicFileIO.java │ │ ├── JsonUtil.java │ │ ├── U.java │ │ └── misc │ │ ├── Pair.java │ │ └── Triple.java ├── lib │ ├── corenlpwrapper.jar │ ├── guava-13.0.1.jar │ └── jackson-all-1.9.11.jar ├── rcorenlp.r └── sockwrap.py ├── tree.png └── visualization.png /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2015 Sebastian Martschat 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include cort/resources/* 2 | include cort/config_files/* 3 | include cort/analysis/visualization/* 4 | include cort/analysis/visualization/lib/* 5 | include cort/coreference/perceptrons.pyx 6 | include stanford_corenlp_pywrapper/rcorenlp.r 7 | include stanford_corenlp_pywrapper/lib/* 8 | include stanford_corenlp_pywrapper/javasrc/corenlp/* 9 | include stanford_corenlp_pywrapper/javasrc/util/misc/* 10 | include stanford_corenlp_pywrapper/javasrc/util/* 11 | include cort/reference-coreference-scorers/v8.01/* 12 | include cort/reference-coreference-scorers/v8.01/lib/* 13 | include cort/reference-coreference-scorers/v8.01/lib/Algorithm/* 14 | include cort/reference-coreference-scorers/v8.01/lib/Data/* 15 | include cort/reference-coreference-scorers/v8.01/lib/Math/* -------------------------------------------------------------------------------- /MULTIGRAPH.md: -------------------------------------------------------------------------------- 1 | # Running cort's multigraph system 2 | 3 | **cort** ships with a deterministic coreference resolution system based on 4 | multigraph clustering. The input must follow [the 5 | format from the CoNLL shared tasks on coreference resolution](http://conll.cemantix.org/2012/data.html). 
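Under the hood, `run-multigraph` reads in the corpus, extracts system mentions, builds a coreference multigraph and decodes it into entities. A minimal Python sketch of the same steps (all names are taken from `bin/run-multigraph` in this repository; the shortened relation lists here are for illustration only):

```python
from cort.core import corpora, mention_extractor
from cort.coreference.multigraph import decoders, features, multigraphs, \
    weighting_functions

# read in a CoNLL corpus and extract system mentions
corpus = corpora.Corpus.from_file("my corpus", open("my_data.data"))
for doc in corpus:
    doc.system_mentions = mention_extractor.extract_system_mentions(doc)

# build the multigraph from positive and negative relations
# (bin/run-multigraph uses longer relation lists)
cmc = multigraphs.CorefMultigraphCreator(
    [features.head_match, features.alias],            # positive relations
    [features.not_compatible, features.not_speaker],  # negative relations
    weighting_functions.for_each_relation_with_distance,
    {})
cmc.relation_weights = {features.head_match: 1, features.alias: 1}

# cluster mentions and write the output in CoNLL format
decoders.MultigraphDecoder(cmc).decode(corpus)
corpus.write_to_file(open("out.data", "w"))
```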
6 | 7 | To run the multigraph system, use 8 | 9 | ```shell 10 | run-multigraph -in my_data.data -out out.data 11 | ``` 12 | 13 | With the optional argument `-ante`, antecedent decisions are also written to a 14 | file: 15 | 16 | ```shell 17 | run-multigraph -in my_data.data -out out.data -ante antecedents_out.data 18 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cort 2 | 3 | __cort__ is a coreference resolution toolkit. It consists 4 | of two parts: the *coreference resolution* component implements a framework for 5 | coreference resolution based on latent variables, which allows you to rapidly 6 | devise approaches to coreference resolution, while the *error analysis* component 7 | provides extensive functionality for analyzing and visualizing errors made by 8 | coreference resolution systems. 9 | 10 | If you have any questions or comments, drop me an e-mail at 11 | [sebastian.martschat@gmail.com](mailto:sebastian.martschat@gmail.com). 12 | 13 | ## Branches/Forks 14 | 15 | * the [kbest branch](https://github.com/smartschat/cort/tree/kbest) contains code for kbest extraction of coreference information, as described in Ji et al. (2017) 16 | * the [v03 branch](https://github.com/smartschat/cort/tree/v03) contains a version of __cort__ with more models and a better train/dev/test workflow. For more details on the models see Martschat (2017). 17 | * [Nafise Moosavi's fork of __cort__](https://github.com/ns-moosavi/cort/tree/singleton_feature) implements search space pruning on top of __cort__, as described in Moosavi and Strube (2016) 18 | 19 | ## Documentation 20 | 21 | * coreference resolution with cort 22 | * error analysis with cort 23 | * running the multigraph system 24 | 25 | ## Installation 26 | 27 | __cort__ is available on PyPi. You can install it via 28 | 29 | ``` 30 | pip install cort 31 | ``` 32 | Dependencies (automatically installed by pip) are 33 | [nltk](http://www.nltk.org/), [numpy](http://www.numpy.org/), 34 | [matplotlib](http://matplotlib.org), 35 | [mmh3](https://pypi.python.org/pypi/mmh3), 36 | [PyStanfordDependencies](https://github.com/dmcc/PyStanfordDependencies), 37 | [cython](http://cython.org/), 38 | [future](https://pypi.python.org/pypi/future), 39 | [jpype](https://pypi.python.org/pypi/jpype1) and 40 | [beautifulsoup](https://pypi.python.org/pypi/beautifulsoup4). It ships with 41 | [stanford_corenlp_pywrapper](https://github.com/brendano/stanford_corenlp_pywrapper) 42 | and [the reference implementation of the CoNLL scorer](https://github.com/conll/reference-coreference-scorers). 43 | 44 | __cort__ is written for use on Linux with Python 3.3+. While __cort__ also runs under 45 | Python 2.7, I strongly recommend running __cort__ with Python 3, since the Python 3 46 | version is much more efficient. 47 | 48 | ## References 49 | 50 | Yangfeng Ji, Chenhao Tan, Sebastian Martschat, Yejin Choi and Noah A. Smith (2017). **Dynamic Entity Representations in Neural Language Models.** To appear in *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP), Copenhagen, Denmark, 7-11 September 2017*. 51 | [PDF](https://arxiv.org/abs/1708.00781) 52 | 53 | Sebastian Martschat (2017). **Structured Representations for Coreference Resolution.** PhD thesis, Heidelberg University. 54 | [PDF](http://www.ub.uni-heidelberg.de/archiv/23305) 55 | 56 | Nafise Sadat Moosavi and Michael Strube (2016). 
**Search space pruning: A 57 | simple solution for better coreference resolvers**. In *Proceedings of the 2016 58 | Conference of the North American Chapter of the Association for Computational 59 | Linguistics: Human Language Technologies*, San Diego, Cal., 12-17 June 2016, 60 | pages 1005-1011. 61 | [PDF](http://www.aclweb.org/anthology/N16-1115.pdf) 62 | 63 | Sebastian Martschat and Michael Strube (2015). **Latent Structures for 64 | Coreference Resolution**. *Transactions of the Association for 65 | Computational Linguistics*, 3, pages 405-418. 66 | [PDF](http://www.aclweb.org/anthology/Q/Q15/Q15-1029.pdf) 67 | 68 | Sebastian Martschat, Patrick Claus and Michael Strube (2015). **Plug Latent 69 | Structures and Play Coreference Resolution**. In *Proceedings of 70 | the ACL-IJCNLP 2015 System Demonstrations*, Beijing, China, 71 | 26-31 July 2015, pages 61-66. 72 | [PDF](http://www.aclweb.org/anthology/P/P15/P15-4011.pdf) 73 | 74 | Sebastian Martschat, Thierry Göckel and Michael Strube (2015). **Analyzing and 75 | Visualizing Coreference Resolution Errors**. In *Proceedings of the 2015 76 | Conference of the North American Chapter of the Association for Computational 77 | Linguistics: Demonstrations*, Denver, Colorado, USA, 31 May-5 June 2015, 78 | pages 6-10. 79 | [PDF](https://aclweb.org/anthology/N/N15/N15-3002.pdf) 80 | 81 | Sebastian Martschat and Michael Strube (2014). **Recall Error Analysis for 82 | Coreference Resolution**. In *Proceedings of the 2014 Conference on Empirical 83 | Methods in Natural Language Processing (EMNLP)*, Doha, Qatar, 25-29 October 84 | 2014, pages 2070-2081. 85 | [PDF](http://aclweb.org/anthology/D/D14/D14-1221.pdf) 86 | 87 | Sebastian Martschat (2013). **Multigraph Clustering for Unsupervised 88 | Coreference Resolution**. In *Proceedings of the Student Research Workshop 89 | at the 51st Annual Meeting of the Association for Computational Linguistics*, 90 | Sofia, Bulgaria, 5-7 August 2013, pages 81-88. 91 | [PDF](http://aclweb.org/anthology/P/P13/P13-3012.pdf) 92 | 93 | If you use the error analysis component in your research, please cite the 94 | [EMNLP'14 paper](http://aclweb.org/anthology/D/D14/D14-1221.pdf). If you use 95 | the coreference component in your research, please cite the 96 | [TACL paper](http://www.aclweb.org/anthology/Q/Q15/Q15-1029.pdf). If you use 97 | the multigraph system, please cite the 98 | [ACL'13-SRW paper](http://aclweb.org/anthology/P/P13/P13-3012.pdf). 99 | 100 | ## Changelog 101 | 102 | __Wednesday, 4 November 2015__ 103 | Now supports numeric features. Due to a different feature representation the models changed, 104 | hence I have updated the downloadable models. 105 | 106 | __Friday, 9 October 2015__ 107 | Now supports label-dependent cost functions. 108 | 109 | __Tuesday, 15 September 2015__ 110 | Minor bugfixes. 111 | 112 | __Monday, 27 July 2015__ 113 | Can now perform coreference resolution on raw text. 114 | 115 | __Tuesday, 21 July 2015__ 116 | Updated to status of TACL paper. 117 | 118 | __Wednesday, 3 June 2015__ 119 | Improvements to visualization (mention highlighting and scrolling). 120 | 121 | __Monday, 1 June 2015__ 122 | Fixed a bug in mention highlighting for visualization. 123 | 124 | __Sunday, 31 May 2015__ 125 | Updated to status of NAACL'15 demo paper. 126 | 127 | __Wednesday, 13 May 2015__ 128 | Fixed another bug in the documentation regarding format of antecedent data. 129 | 130 | __Tuesday, 3 February 2015__ 131 | Fixed a bug in the documentation: part numbers
in the antecedent file must be written with trailing 0s. 132 | 133 | __Thursday, 30 October 2014__ 134 | Fixed data structure bug in documents.py. The results from the paper are not affected by this bug. 135 | 136 | __Wednesday, 22 October 2014__ 137 | Initial release. 138 | -------------------------------------------------------------------------------- /bin/cort-predict-conll: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from __future__ import print_function 5 | import argparse 6 | import codecs 7 | import logging 8 | import os 9 | import pickle 10 | import subprocess 11 | import sys 12 | 13 | 14 | import cort 15 | from cort.core import corpora 16 | from cort.core import mention_extractor 17 | from cort.coreference import cost_functions 18 | from cort.coreference import experiments 19 | from cort.coreference import features 20 | from cort.coreference import instance_extractors 21 | from cort.util import import_helper 22 | 23 | 24 | __author__ = 'smartschat' 25 | 26 | logging.basicConfig(level=logging.INFO, 27 | format='%(asctime)s %(levelname)s %(message)s') 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser(description='Predict coreference ' 32 | 'relations.') 33 | parser.add_argument('-in', 34 | required=True, 35 | dest='input_filename', 36 | help='The input file. Must follow the format of the ' 37 | 'CoNLL shared tasks on coreference resolution ' 38 | '(see http://conll.cemantix.org/2012/data.html).') 39 | parser.add_argument('-model', 40 | required=True, 41 | dest='model', 42 | help='The model learned via cort-train.') 43 | parser.add_argument('-out', 44 | dest='output_filename', 45 | required=True, 46 | help='The output file the predictions will be stored ' 47 | 'in (in the CoNLL format).') 48 | parser.add_argument('-ante', 49 | dest='ante', 50 | help='The file where antecedent predictions will be ' 51 | 'stored to.') 52 | parser.add_argument('-extractor', 53 | dest='extractor', 54 | required=True, 55 | help='The function to extract instances.') 56 | parser.add_argument('-perceptron', 57 | dest='perceptron', 58 | required=True, 59 | help='The perceptron to use.') 60 | parser.add_argument('-clusterer', 61 | dest='clusterer', 62 | required=True, 63 | help='The clusterer to use.') 64 | parser.add_argument('-gold', 65 | dest='gold', 66 | help='Gold data (in the CoNLL format) for evaluation.') 67 | parser.add_argument('-features', 68 | dest='features', 69 | help='The file containing the list of features.
If not ' 70 | 'provided, defaults to a standard set of ' 71 | 'features.') 72 | 73 | return parser.parse_args() 74 | 75 | 76 | def get_scores(output_data, gold_data): 77 | scorer_output = subprocess.check_output([ 78 | "perl", 79 | cort.__path__[0] + "/reference-coreference-scorers/v8.01/scorer.pl", 80 | "all", 81 | gold_data, 82 | os.getcwd() + "/" + output_data, 83 | "none"]).decode() 84 | 85 | metrics = ['muc', 'bcub', 'ceafm', 'ceafe', 'blanc'] 86 | 87 | metrics_results = {} 88 | 89 | metric = None 90 | 91 | results_formatted = "" 92 | 93 | for line in scorer_output.split("\n"): 94 | if not line: 95 | continue 96 | 97 | splitted = line.split() 98 | 99 | if splitted[0] == "METRIC": 100 | metric = line.split()[1][:-1] 101 | 102 | if (metric != 'blanc' and line.startswith("Coreference:")) \ 103 | or (metric == 'blanc' and line.startswith("BLANC:")): 104 | metrics_results[metric] = ( 105 | float(splitted[5][:-1]), 106 | float(splitted[10][:-1]), 107 | float(splitted[12][:-1]), 108 | ) 109 | 110 | results_formatted += "\tR\tP\tF1\n" 111 | 112 | for metric in metrics: 113 | results_formatted += metric + "\t" + \ 114 | "\t".join([str(val) for val in metrics_results[metric]]) + "\n" 115 | results_formatted += "\n" 116 | average = (metrics_results["muc"][2] + metrics_results["bcub"][2] + 117 | metrics_results["ceafe"][2])/3 118 | results_formatted += "conll\t\t\t" + format(average, '.2f') + "\n" 119 | 120 | return results_formatted 121 | 122 | 123 | logging.basicConfig(level=logging.INFO, 124 | format='%(asctime)s %(levelname)s %(message)s') 125 | 126 | if sys.version_info[0] == 2: 127 | logging.warning("You are running cort under Python 2. cort is much more " 128 | "efficient under Python 3.3+.") 129 | args = parse_args() 130 | 131 | if args.features: 132 | mention_features, pairwise_features = import_helper.get_features( 133 | args.features) 134 | else: 135 | mention_features = [ 136 | features.fine_type, 137 | features.gender, 138 | features.number, 139 | features.sem_class, 140 | features.deprel, 141 | features.head_ner, 142 | features.length, 143 | features.head, 144 | features.first, 145 | features.last, 146 | features.preceding_token, 147 | features.next_token, 148 | features.governor, 149 | features.ancestry 150 | ] 151 | 152 | pairwise_features = [ 153 | features.exact_match, 154 | features.head_match, 155 | features.same_speaker, 156 | features.alias, 157 | features.sentence_distance, 158 | features.embedding, 159 | features.modifier, 160 | features.tokens_contained, 161 | features.head_contained, 162 | features.token_distance 163 | ] 164 | 165 | logging.info("Loading model.") 166 | priors, weights = pickle.load(open(args.model, "rb")) 167 | 168 | perceptron = import_helper.import_from_path(args.perceptron)( 169 | priors=priors, 170 | weights=weights, 171 | cost_scaling=0 172 | ) 173 | 174 | extractor = instance_extractors.InstanceExtractor( 175 | import_helper.import_from_path(args.extractor), 176 | mention_features, 177 | pairwise_features, 178 | cost_functions.null_cost, 179 | perceptron.get_labels() 180 | ) 181 | 182 | logging.info("Reading in data.") 183 | testing_corpus = corpora.Corpus.from_file( 184 | "testing", 185 | codecs.open(args.input_filename, "r", "utf-8")) 186 | 187 | logging.info("Extracting system mentions.") 188 | for doc in testing_corpus: 189 | doc.system_mentions = mention_extractor.extract_system_mentions(doc) 190 | 191 | mention_entity_mapping, antecedent_mapping = experiments.predict( 192 | testing_corpus, 193 | extractor, 194 | perceptron, 195 |
import_helper.import_from_path(args.clusterer) 196 | ) 197 | 198 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping) 199 | 200 | 201 | logging.info("Write corpus to file.") 202 | testing_corpus.write_to_file(codecs.open(args.output_filename, "w", "utf-8")) 203 | 204 | if args.ante: 205 | logging.info("Write antecedent decisions to file") 206 | testing_corpus.write_antecedent_decisions_to_file(open(args.ante, "w")) 207 | 208 | if args.gold: 209 | logging.info("Evaluate.") 210 | print(get_scores(args.output_filename, args.gold)) 211 | 212 | logging.info("Done.") 213 | -------------------------------------------------------------------------------- /bin/cort-predict-raw: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from __future__ import print_function 5 | import argparse 6 | import codecs 7 | import logging 8 | import pickle 9 | import sys 10 | 11 | 12 | from cort.preprocessing import pipeline 13 | from cort.core import mention_extractor 14 | from cort.coreference import cost_functions 15 | from cort.coreference import experiments 16 | from cort.coreference import features 17 | from cort.coreference import instance_extractors 18 | from cort.util import import_helper 19 | 20 | 21 | __author__ = 'smartschat' 22 | 23 | logging.basicConfig(level=logging.INFO, 24 | format='%(asctime)s %(levelname)s %(message)s') 25 | 26 | 27 | def parse_args(): 28 | parser = argparse.ArgumentParser(description='Predict coreference ' 29 | 'relations.') 30 | parser.add_argument('-in', 31 | required=True, 32 | dest='input_filename', 33 | help='The raw text input files.', 34 | nargs="*") 35 | parser.add_argument('-model', 36 | required=True, 37 | dest='model', 38 | help='The model learned via cort-train.') 39 | parser.add_argument('-suffix', 40 | dest='suffix', 41 | default="out", 42 | help='Suffix for output files. Defaults to "out".') 43 | parser.add_argument('-extractor', 44 | dest='extractor', 45 | required=True, 46 | help='The function to extract instances.') 47 | parser.add_argument('-perceptron', 48 | dest='perceptron', 49 | required=True, 50 | help='The perceptron to use.') 51 | parser.add_argument('-clusterer', 52 | dest='clusterer', 53 | required=True, 54 | help='The clusterer to use.') 55 | parser.add_argument('-features', 56 | dest='features', 57 | help='The file containing the list of features. If not ' 58 | 'provided, defaults to a standard set of ' 59 | 'features.') 60 | parser.add_argument('-corenlp', 61 | dest='corenlp', 62 | required=True, 63 | help='Location of CoreNLP jars.') 64 | 65 | return parser.parse_args() 66 | 67 | 68 | logging.basicConfig(level=logging.INFO, 69 | format='%(asctime)s %(levelname)s %(message)s') 70 | 71 | if sys.version_info[0] == 2: 72 | logging.warning("You are running cort under Python 2.
cort is much more " 73 | "efficient under Python 3.3+.") 74 | 75 | args = parse_args() 76 | 77 | if args.features: 78 | mention_features, pairwise_features = import_helper.get_features( 79 | args.features) 80 | else: 81 | mention_features = [ 82 | features.fine_type, 83 | features.gender, 84 | features.number, 85 | features.sem_class, 86 | features.deprel, 87 | features.head_ner, 88 | features.length, 89 | features.head, 90 | features.first, 91 | features.last, 92 | features.preceding_token, 93 | features.next_token, 94 | features.governor, 95 | features.ancestry 96 | ] 97 | 98 | pairwise_features = [ 99 | features.exact_match, 100 | features.head_match, 101 | features.same_speaker, 102 | features.alias, 103 | features.sentence_distance, 104 | features.embedding, 105 | features.modifier, 106 | features.tokens_contained, 107 | features.head_contained, 108 | features.token_distance 109 | ] 110 | 111 | 112 | logging.info("Loading model.") 113 | priors, weights = pickle.load(open(args.model, "rb")) 114 | 115 | perceptron = import_helper.import_from_path(args.perceptron)( 116 | priors=priors, 117 | weights=weights, 118 | cost_scaling=0 119 | ) 120 | 121 | extractor = instance_extractors.InstanceExtractor( 122 | import_helper.import_from_path(args.extractor), 123 | mention_features, 124 | pairwise_features, 125 | cost_functions.null_cost, 126 | perceptron.get_labels() 127 | ) 128 | 129 | logging.info("Reading in and preprocessing data.") 130 | p = pipeline.Pipeline(args.corenlp) 131 | 132 | testing_corpus = p.run_on_docs("corpus", args.input_filename) 133 | 134 | logging.info("Extracting system mentions.") 135 | for doc in testing_corpus: 136 | doc.system_mentions = mention_extractor.extract_system_mentions(doc) 137 | 138 | mention_entity_mapping, antecedent_mapping = experiments.predict( 139 | testing_corpus, 140 | extractor, 141 | perceptron, 142 | import_helper.import_from_path(args.clusterer) 143 | ) 144 | 145 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping) 146 | 147 | logging.info("Write output to file.") 148 | 149 | for doc in testing_corpus: 150 | output = doc.to_simple_output() 151 | my_file = codecs.open(doc.identifier + "." + args.suffix, "w", "utf-8") 152 | my_file.write(output) 153 | my_file.close() 154 | 155 | logging.info("Done.") 156 | -------------------------------------------------------------------------------- /bin/cort-train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import codecs 5 | import logging 6 | import pickle 7 | import sys 8 | 9 | 10 | from cort.core import corpora 11 | from cort.core import mention_extractor 12 | from cort.coreference import experiments 13 | from cort.coreference import features 14 | from cort.coreference import instance_extractors 15 | from cort.util import import_helper 16 | 17 | 18 | __author__ = 'smartschat' 19 | 20 | 21 | logging.basicConfig(level=logging.INFO, 22 | format='%(asctime)s %(levelname)s %(''message)s') 23 | 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser(description='Train coreference resolution ' 27 | 'models.') 28 | parser.add_argument('-in', 29 | required=True, 30 | dest='input_filename', 31 | help='The input file. 
Must follow the format of the ' 32 | 'CoNLL shared tasks on coreference resolution ' 33 | '(see http://conll.cemantix.org/2012/data.html).') 34 | parser.add_argument('-out', 35 | dest='output_filename', 36 | required=True, 37 | help='The output file the learned model will be saved ' 38 | 'to.') 39 | parser.add_argument('-extractor', 40 | dest='extractor', 41 | required=True, 42 | help='The function to extract instances.') 43 | parser.add_argument('-perceptron', 44 | dest='perceptron', 45 | required=True, 46 | help='The perceptron to use.') 47 | parser.add_argument('-cost_function', 48 | dest='cost_function', 49 | required=True, 50 | help='The cost function to use.') 51 | parser.add_argument('-n_iter', 52 | dest='n_iter', 53 | default=5, 54 | help='Number of perceptron iterations. Defaults to 5.') 55 | parser.add_argument('-cost_scaling', 56 | dest='cost_scaling', 57 | default=1, 58 | help='Scaling factor of the cost function. Defaults ' 59 | 'to 1.') 60 | parser.add_argument('-random_seed', 61 | dest='seed', 62 | default=23, 63 | help='Random seed for training data shuffling. ' 64 | 'Defaults to 23.') 65 | parser.add_argument('-features', 66 | dest='features', 67 | help='The file containing the list of features. If not ' 68 | 'provided, defaults to a standard set of ' 69 | 'features.') 70 | 71 | return parser.parse_args() 72 | 73 | 74 | if sys.version_info[0] == 2: 75 | logging.warning("You are running cort under Python 2. cort is much more " 76 | "efficient under Python 3.3+.") 77 | 78 | args = parse_args() 79 | 80 | if args.features: 81 | mention_features, pairwise_features = import_helper.get_features( 82 | args.features) 83 | else: 84 | mention_features = [ 85 | features.fine_type, 86 | features.gender, 87 | features.number, 88 | features.sem_class, 89 | features.deprel, 90 | features.head_ner, 91 | features.length, 92 | features.head, 93 | features.first, 94 | features.last, 95 | features.preceding_token, 96 | features.next_token, 97 | features.governor, 98 | features.ancestry 99 | ] 100 | 101 | pairwise_features = [ 102 | features.exact_match, 103 | features.head_match, 104 | features.same_speaker, 105 | features.alias, 106 | features.sentence_distance, 107 | features.embedding, 108 | features.modifier, 109 | features.tokens_contained, 110 | features.head_contained, 111 | features.token_distance 112 | ] 113 | 114 | 115 | perceptron = import_helper.import_from_path(args.perceptron)( 116 | cost_scaling=int(args.cost_scaling), 117 | n_iter=int(args.n_iter), 118 | seed=int(args.seed) 119 | ) 120 | 121 | extractor = instance_extractors.InstanceExtractor( 122 | import_helper.import_from_path(args.extractor), 123 | mention_features, 124 | pairwise_features, 125 | import_helper.import_from_path(args.cost_function), 126 | perceptron.get_labels() 127 | ) 128 | 129 | logging.info("Reading in data.") 130 | training_corpus = corpora.Corpus.from_file("training", 131 | codecs.open(args.input_filename, 132 | "r", "utf-8")) 133 | 134 | logging.info("Extracting system mentions.") 135 | for doc in training_corpus: 136 | doc.system_mentions = mention_extractor.extract_system_mentions(doc) 137 | 138 | model = experiments.learn( 139 | training_corpus, 140 | extractor, 141 | perceptron 142 | ) 143 | 144 | logging.info("Writing model to file.") 145 | pickle.dump(model, open(args.output_filename, "wb"), protocol=2) 146 | 147 | logging.info("Done.") 148 | -------------------------------------------------------------------------------- /bin/cort-visualize:
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from __future__ import print_function 5 | import argparse 6 | import codecs 7 | import logging 8 | 9 | 10 | from cort.preprocessing import pipeline 11 | from cort.analysis import visualization, error_extractors, spanning_tree_algorithms 12 | from cort.core import corpora 13 | 14 | 15 | __author__ = 'smartschat' 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser(description='Visualize output.') 20 | parser.add_argument('input_filename', 21 | help='The files to visualize', 22 | nargs='*') 23 | parser.add_argument('-corenlp', 24 | required=True, 25 | dest='corenlp', 26 | help='Location of CoreNLP jars.') 27 | 28 | return parser.parse_args() 29 | 30 | 31 | logging.basicConfig(level=logging.INFO, 32 | format='%(asctime)s %(levelname)s %(message)s') 33 | 34 | args = parse_args() 35 | 36 | p = pipeline.Pipeline(args.corenlp, with_coref=True) 37 | 38 | corpus_to_visualize = p.run_on_docs("corpus", args.input_filename) 39 | 40 | ex = error_extractors.ErrorExtractor(corpus_to_visualize, 41 | spanning_tree_algorithms.recall_accessibility, 42 | spanning_tree_algorithms.precision_system_output) 43 | 44 | ex.add_system(corpus_to_visualize) 45 | 46 | decisions = ex.get_errors() 47 | 48 | visualizer = visualization.Visualizer(decisions, "corpus", 49 | for_raw_input=True) 50 | 51 | visualizer.run() -------------------------------------------------------------------------------- /bin/run-multigraph: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | 6 | from cort.core import corpora 7 | from cort.core import mention_extractor 8 | from cort.coreference.multigraph import multigraphs, features, decoders, \ 9 | weighting_functions 10 | 11 | 12 | logging.basicConfig(level=logging.INFO, 13 | format='%(asctime)s %(levelname)s %(message)s') 14 | 15 | parser = argparse.ArgumentParser(description='Run the multigraph coreference ' 16 | 'resolution system.') 17 | parser.add_argument('-in', 18 | required=True, 19 | dest='input_filename', 20 | help='The input file. Must follow the format of the CoNLL ' 21 | 'shared tasks on coreference resolution (see ' 22 | 'http://conll.cemantix.org/2012/data.html).') 23 | parser.add_argument('-out', 24 | dest='output_filename', 25 | required=True, 26 | help='The output file.') 27 | parser.add_argument('-ante', 28 | dest='antecedents_output_filename', 29 | default=None, 30 | help='The file where antecedent information should be ' 31 | 'written to.
Defaults to None.') 32 | 33 | args = parser.parse_args() 34 | 35 | logging.info("Reading in corpus") 36 | 37 | corpus = corpora.Corpus.from_file("my corpus", 38 | open(args.input_filename)) 39 | 40 | logging.info("Extracting system mentions") 41 | for doc in corpus: 42 | doc.system_mentions = mention_extractor.extract_system_mentions(doc) 43 | 44 | negative_features = [features.not_modifier, 45 | features.not_compatible, 46 | features.not_embedding, 47 | features.not_speaker, 48 | features.not_singleton, 49 | features.not_pronoun_distance, 50 | features.not_anaphoric] 51 | 52 | positive_features = [features.alias, 53 | features.non_pronominal_string_match, 54 | features.head_match, 55 | features.pronoun_same_canonical_form, 56 | features.anaphor_pronoun, 57 | features.speaker, 58 | features.antecedent_is_subject, 59 | features.antecedent_is_object, 60 | features.substring, 61 | features.lexical] 62 | 63 | cmc = multigraphs.CorefMultigraphCreator( 64 | positive_features, 65 | negative_features, 66 | weighting_functions.for_each_relation_with_distance, 67 | {}) 68 | 69 | relation_weights = {} 70 | 71 | for relation in positive_features: 72 | relation_weights[relation] = 1 73 | 74 | relation_weights[features.antecedent_is_object] = 0.5 75 | 76 | cmc.relation_weights = relation_weights 77 | 78 | logging.info("Decoding") 79 | 80 | decoder = decoders.MultigraphDecoder(cmc) 81 | 82 | decoder.decode(corpus) 83 | 84 | logging.info("Writing coreference to file") 85 | 86 | corpus.write_to_file(open(args.output_filename, 'w')) 87 | 88 | if args.antecedents_output_filename: 89 | logging.info("Writing antecedent decisions to file") 90 | corpus.write_antecedent_decisions_to_file( 91 | open(args.antecedents_output_filename, 'w')) 92 | 93 | logging.info("Finished") 94 | -------------------------------------------------------------------------------- /cort/__init__.py: -------------------------------------------------------------------------------- 1 | """ cort - a toolkit for coreference resolution and error analysis. """ 2 | 3 | __author__ = 'martscsn' 4 | -------------------------------------------------------------------------------- /cort/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | """ Classes and functions for coreference resolution error analysis and 2 | visualisation. """ 3 | 4 | __author__ = 'smartschat' 5 | -------------------------------------------------------------------------------- /cort/analysis/error_extractors.py: -------------------------------------------------------------------------------- 1 | """ Extract errors made by systems w.r.t. a reference corpus. """ 2 | 3 | 4 | from cort.analysis import data_structures 5 | 6 | 7 | __author__ = 'smartschat' 8 | 9 | 10 | class ErrorExtractor: 11 | """ Extract, manage and store recall and precision errors. 12 | 13 | Error extraction for recall errors works as follows: 14 | 15 | Go through each document. For each reference entity e in the document, 16 | construct an entity graph g_e for e and compute a partition of g_e by the 17 | system entity graphs. Then compute a spanning tree t_e of g_e and take 18 | every edge in t_e that does not appear in the partition as an error. 19 | 20 | For computing precision errors, switch the roles of reference and system 21 | entities. 22 | 23 | Attributes: 24 | reference_corpus (Corpus): The reference corpus with the gold 25 | information concerning the coreference relation. 
26 | recall_spanning_tree_algorithm (function): A function mapping an 27 | entity graph and one of its partitions to a list of mention pairs, 28 | which represent a spanning tree of the entity graph. This 29 | function is used to compute recall errors. 30 | precision_spanning_tree_algorithm (function): Same as above, but for 31 | precision errors. 32 | errors (dict): A mapping of error descriptions to sets containing the 33 | respective errors. 34 | """ 35 | def __init__(self, 36 | reference_corpus, 37 | recall_spanning_tree_algorithm, 38 | precision_spanning_tree_algorithm, 39 | ): 40 | """ Initialize the error analysis. 41 | 42 | Args: 43 | reference_corpus (Corpus): The reference corpus with the gold 44 | information concerning the coreference relation. 45 | recall_spanning_tree_algorithm (function): A function mapping an 46 | entity graph and one of its partitions to a list of mention pairs, 47 | which represent a spanning tree of the entity graph. This 48 | function is used to compute recall errors. 49 | precision_spanning_tree_algorithm (function): Same as above, but for 50 | precision errors. 51 | """ 52 | 53 | self.reference_corpus = reference_corpus 54 | self.recall_spanning_tree_algorithm = recall_spanning_tree_algorithm 55 | self.precision_spanning_tree_algorithm = \ 56 | precision_spanning_tree_algorithm 57 | self.errors = {} 58 | self.corpora = {} 59 | 60 | def add_system(self, system_corpus, which_mentions="annotated"): 61 | """ Add a system to the error analysis. 62 | 63 | Error extraction for recall errors works as follows: 64 | 65 | Go through each document. For each reference entity e in the document, 66 | construct an entity graph g_e for e and compute a partition of g_e by 67 | the system entity graphs. Then compute a spanning tree t_e of g_e and 68 | take every edge in t_e that does not appear in the partition as an 69 | error. 70 | 71 | For computing precision errors, switch the roles of reference and system 72 | entities. 73 | 74 | Also extracts all pairwise decisions (if available). 75 | 76 | Args: 77 | system_corpus (Corpus): A corpus obtained from system output. 78 | which_mentions (str): Either "annotated" or "extracted", 79 | defaults to "annotated". Specifies from which mentions in 80 | the system corpus coreference information should be 81 | obtained, either annotated mentions or system mentions. 82 | """ 83 | if which_mentions not in ["annotated", "extracted"]: 84 | raise ValueError("which_mentions must be " 85 | "either 'annotated' or 'extracted'.") 86 | 87 | recall_errors, precision_errors = self.__compute_errors(system_corpus, 88 | which_mentions) 89 | 90 | self.errors[system_corpus.description] = { 91 | "recall_errors": {}, 92 | "precision_errors": {}, 93 | "decisions": {} 94 | } 95 | 96 | self.errors[system_corpus.description]["recall_errors"]["all"] = \ 97 | recall_errors 98 | self.errors[ 99 | system_corpus.description]["precision_errors"]["all"] = \ 100 | precision_errors 101 | self.errors[ 102 | system_corpus.description]["decisions"]["all"] = \ 103 | system_corpus.get_antecedent_decisions()[ 104 | system_corpus.description]["decisions"]["all"] 105 | 106 | self.corpora[system_corpus.description] = system_corpus 107 | 108 | def get_errors(self): 109 | """ Get errors for all systems managed by this ErrorExtractor. 110 | 111 | The errors are stored in a ``StructuredCoreferenceAnalysis``, 112 | which can be accessed like a dict.
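        Example (a sketch following the usage in ``bin/cort-visualize``;
        the spanning tree algorithms are those shipped in
        ``cort.analysis.spanning_tree_algorithms``)::

            ex = ErrorExtractor(
                reference_corpus,
                spanning_tree_algorithms.recall_accessibility,
                spanning_tree_algorithms.precision_system_output)
            ex.add_system(system_corpus)
            errors = ex.get_errors()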
113 | 114 | If a corpus with the description 115 | ``ranking`` was added via ``self.add_system``, 116 | ``self.errors["ranking"]["recall_errors"]["all"]`` is an ``EnhancedSet`` 117 | containing all recall errors of the system. Errors of other systems 118 | and precision errors can be accessed analogously. 119 | 120 | Returns: 121 | StructuredCoreferenceAnalysis: A StructuredCoreferenceAnalysis 122 | containing the errors. 123 | """ 124 | return data_structures.StructuredCoreferenceAnalysis( 125 | self.errors, corpora=self.corpora, 126 | reference=self.reference_corpus) 127 | 128 | def __compute_errors(self, system_corpus, which_mentions): 129 | gold_graphs = [data_structures.EntityGraph.from_mentions( 130 | doc.annotated_mentions, "annotated_set_id") 131 | for doc in self.reference_corpus.documents] 132 | 133 | if which_mentions == 'annotated': 134 | system_graphs = [data_structures.EntityGraph.from_mentions( 135 | doc.annotated_mentions, "annotated_set_id") 136 | for doc in system_corpus.documents] 137 | else: 138 | system_graphs = [data_structures.EntityGraph.from_mentions( 139 | doc.system_mentions, "set_id") 140 | for doc in system_corpus.documents] 141 | 142 | recall_errors = [] 143 | precision_errors = [] 144 | 145 | for doc_gold_graphs, doc_system_graphs in zip(gold_graphs, 146 | system_graphs): 147 | recall_errors.extend( 148 | self.__compute_errors_for_doc( 149 | doc_gold_graphs, 150 | doc_system_graphs, 151 | self.recall_spanning_tree_algorithm)) 152 | precision_errors.extend( 153 | self.__compute_errors_for_doc( 154 | doc_system_graphs, 155 | doc_gold_graphs, 156 | self.precision_spanning_tree_algorithm)) 157 | 158 | return (data_structures.EnhancedSet(recall_errors), 159 | data_structures.EnhancedSet(precision_errors)) 160 | 161 | @staticmethod 162 | def __compute_errors_for_doc(base_graphs, 163 | partitioning_graphs, 164 | spanning_tree_algorithm): 165 | errors = [] 166 | 167 | for graph in base_graphs: 168 | errors.extend( 169 | ErrorExtractor.__compute_errors_for_graph( 170 | graph, partitioning_graphs, spanning_tree_algorithm)) 171 | 172 | return errors 173 | 174 | @staticmethod 175 | def __compute_errors_for_graph(graph, 176 | partitioning_graphs, 177 | spanning_tree_algorithm): 178 | partitioned_graph = graph.partition(partitioning_graphs) 179 | spanning_tree = spanning_tree_algorithm(graph, partitioned_graph) 180 | extra_pairs = [ 181 | (anaphor, antecedent) for anaphor, antecedent in spanning_tree 182 | if anaphor not in partitioned_graph.edges or 183 | antecedent not in partitioned_graph.edges[anaphor] 184 | ] 185 | 186 | return [(anaphor, antecedent) for anaphor, antecedent in sorted( 187 | extra_pairs)] 188 | -------------------------------------------------------------------------------- /cort/analysis/plotting.py: -------------------------------------------------------------------------------- 1 | """ Plot error analysis statistics. """ 2 | 3 | from __future__ import division 4 | 5 | 6 | from matplotlib import pyplot 7 | from matplotlib import cm 8 | 9 | import numpy 10 | 11 | from pylab import rcParams 12 | 13 | 14 | __author__ = 'martscsn' 15 | 16 | 17 | def plot(data, 18 | title, 19 | xlabel, 20 | ylabel, 21 | filename=None): 22 | """ Plot error analysis statistics. 23 | 24 | In particular, plot a bar chart for the numbers described in ``data``. 25 | 26 | Args: 27 | data (list(str, list((str,int)))): The data to be plotted. The ith entry 28 | of this list contains the name which will appear in the legend, 29 | and a list of (category, count) pairs.
These are the individual 30 | data points which will be plotted. 31 | title (str): Title of the plot. 32 | xlabel (str): Label of the x axis. 33 | ylabel (str): Label of the y axis. 34 | filename (str, optional): If set, write plot to ``filename``. 35 | 36 | Example:: 37 | pair_errs = errors["pair"]["recall_errors"]["all"] 38 | tree_errs = errors["tree"]["recall_errors"]["all"] 39 | 40 | plot( 41 | [("pair", [(cat, len(pair_errs[cat])) for cat in pair_errs.keys()]), 42 | ("tree", [(cat, len(tree_errs[cat])) for cat in tree_errs.keys()])], 43 | "Recall Errors", 44 | "Type of anaphor", 45 | "Number of Errors") 46 | """ 47 | 48 | rcParams['xtick.major.pad'] = '12' 49 | rcParams['ytick.major.pad'] = '12' 50 | 51 | fig, ax = pyplot.subplots() 52 | 53 | systems = [] 54 | categories = [] 55 | 56 | colors = cm.Accent(numpy.linspace(0, 1, len(data))) 57 | 58 | bars_for_legend = [] 59 | 60 | for i, system_data in enumerate(data): 61 | system_name, categories_and_numbers = system_data 62 | systems.append(system_name) 63 | 64 | for j, cat_and_number in enumerate(categories_and_numbers): 65 | category, number = cat_and_number 66 | 67 | if category not in categories: 68 | categories.append(category) 69 | 70 | bar = ax.bar(2*j + i*(1/len(data)), number, color=colors[i], 71 | width=1/len(data), label=system_name) 72 | 73 | if j == 0: 74 | bars_for_legend.append(bar) 75 | 76 | xticks = [2*k + 0.5 for k in range(0, len(categories))] 77 | 78 | pyplot.title(title, fontsize=28) 79 | pyplot.xlabel(xlabel, fontsize=24) 80 | pyplot.ylabel(ylabel, fontsize=24) 81 | 82 | ax.spines["top"].set_visible(False) 83 | ax.spines["right"].set_visible(False) 84 | 85 | ax.get_xaxis().tick_bottom() 86 | ax.get_yaxis().tick_left() 87 | 88 | ax.set_xticklabels(categories) 89 | ax.set_xticks(xticks) 90 | 91 | pyplot.tick_params(axis='both', which='major', labelsize=20) 92 | 93 | if filename: 94 | legend = ax.legend(bars_for_legend, systems, 95 | loc='upper right', bbox_to_anchor=(1.2, 1.2)) 96 | 97 | fig.savefig(filename, bbox_extra_artists=(legend,), bbox_inches='tight') 98 | else: 99 | legend = ax.legend(bars_for_legend, systems, loc='upper right') 100 | legend.draggable() 101 | 102 | fig.show() 103 | -------------------------------------------------------------------------------- /cort/analysis/spanning_tree_algorithms.py: -------------------------------------------------------------------------------- 1 | """ Algorithms for computing spanning trees of entity graphs. """ 2 | 3 | 4 | __author__ = 'smartschat' 5 | 6 | 7 | def precision_system_output(entity, partitioned_entity): 8 | """ Compute a spanning tree from antecedent information. 9 | 10 | All edges in the spanning tree correspond to anaphor-antecedent pairs. In 11 | order to access this antecedent information, the attribute "antecedent" of 12 | the mentions in the entity must be set. 13 | 14 | Args: 15 | entity (EntityGraph): The EntityGraph for the entity for which the 16 | spanning tree should be computed. 17 | partitioned_entity (EntityGraph): A partition of the entity -- not 18 | used for this algorithm. 19 | 20 | Returns: 21 | list(Mention, Mention): A list of mention pairs, which constitute the 22 | edges of the spanning tree. For a pair (m, n), n appears later in 23 | the text than m. 
24 | """ 25 | edges = [] 26 | for mention in entity.edges: 27 | # just look at system output 28 | if ("antecedent" in mention.attributes 29 | and mention.attributes["antecedent"] in entity.edges[mention]): 30 | edges.append((mention, mention.attributes["antecedent"])) 31 | 32 | return sorted(edges) 33 | 34 | 35 | def recall_closest(entity, partitioned_entity): 36 | """ Compute a spanning tree by always taking the closest mention in the same 37 | entity. 38 | 39 | Args: 40 | entity (EntityGraph): The EntityGraph for the entity for which the 41 | spanning tree should be computed. 42 | partitioned_entity (EntityGraph): A partition of the entity -- not 43 | used for this algorithm. 44 | 45 | Returns: 46 | list(Mention, Mention): A list of mention pairs, which constitute the 47 | edges of the spanning tree. For a pair (m, n), n appears later in 48 | the text than m. 49 | """ 50 | edges = [] 51 | for mention in entity.edges: 52 | # always take closest (except for first mention in entity, which does 53 | # not have any antecedent) 54 | if entity.edges[mention]: 55 | if mention in partitioned_entity.edges: 56 | antecedent = sorted(partitioned_entity.edges[mention], 57 | reverse=True)[0] 58 | else: 59 | antecedent = sorted(entity.edges[mention], reverse=True)[0] 60 | edges.append((mention, antecedent)) 61 | 62 | return sorted(edges) 63 | 64 | 65 | def recall_accessibility(entity, partitioned_entity): 66 | """ Compute a spanning tree by choosing edges according to the accessibility 67 | of the antecedent. 68 | 69 | First, if a mention has an out-degree of at least one in the partitioned 70 | entity, take the edge with the closest mention distance as an edge for 71 | the spanning tree. Otherwise, proceed as follows. 72 | 73 | If a mention m is a proper name or a common noun, choose an antecedent as 74 | follows: 75 | 76 | - if a proper name antecedent exists, take the closest and output this 77 | pair as an edge 78 | - else if a common noun antecedent exists, take the closest and output 79 | this pair as an edge 80 | - else take the closest preceding mention and output this pair as an 81 | edge 82 | 83 | For all other mentions, take the closest preceding mention and output 84 | this pair as an edge. 85 | 86 | Args: 87 | entity (EntityGraph): The EntityGraph for the entity for which the 88 | spanning tree should be computed. 89 | partitioned_entity (EntityGraph): A partition of the entity -- not 90 | used for this algorithm. 91 | 92 | Returns: 93 | list(Mention, Mention): A list of mention pairs, which constitute the 94 | edges of the spanning tree. For a pair (m, n), n appears later in 95 | the text than m. 96 | """ 97 | edges = [] 98 | for mention in entity.edges: 99 | if entity.edges[mention]: 100 | # mention is not the first in subentity? take closest! 101 | if mention in partitioned_entity.edges: 102 | antecedent = sorted(partitioned_entity.edges[mention], 103 | reverse=True)[0] 104 | else: 105 | antecedent = __get_antecedent_by_type(mention, 106 | entity.edges[mention]) 107 | 108 | edges.append((mention, antecedent)) 109 | 110 | return sorted(edges) 111 | 112 | 113 | def __get_antecedent_by_type(mention, candidates): 114 | # make sure... 115 | candidates_reversed = sorted(candidates, reverse=True) 116 | # mention is (demonstrative) pronoun? take closest! 
117 | if (mention.attributes["type"] == "PRO" or 118 | mention.attributes["type"] == "DEM"): 119 | return candidates_reversed[0] 120 | # otherwise choose by type, back off to closest 121 | elif __get_by_pos(candidates_reversed, "NAM"): 122 | return __get_by_pos(candidates_reversed, "NAM") 123 | elif __get_by_pos(candidates_reversed, "NOM"): 124 | return __get_by_pos(candidates_reversed, "NOM") 125 | else: 126 | return candidates_reversed[0] 127 | 128 | 129 | def __get_by_pos(candidates, pos): 130 | for mention in candidates: 131 | if mention.attributes["type"] == pos: 132 | return mention 133 | -------------------------------------------------------------------------------- /cort/analysis/visualization/TODO: -------------------------------------------------------------------------------- 1 | Python: 2 | - use python http server in order to avoid multi-megabyte html blobs (simplehttpserver) 3 | !- use discernible colours: https://github.com/gtaylor/python-colormath 4 | 5 | jQuery/javascript: 6 | - improve mentionhead tooltip behaviour 7 | - Dynamic computation of heights, etc. in scroll() -------------------------------------------------------------------------------- /cort/analysis/visualization/style.css: -------------------------------------------------------------------------------- 1 | html, body { 2 | margin: 0; 3 | font-family: Sans-Serif; 4 | } 5 | 6 | h1 { 7 | padding: 5px; 8 | margin: 0; 9 | text-align: left; 10 | } 11 | 12 | h3 { 13 | margin: 0 0 0 10px; 14 | padding: 0; 15 | font-family: Sans-Serif; 16 | font-size: 1em; 17 | } 18 | 19 | #header { 20 | background-color: rgb(1,70,153); 21 | margin: 0; 22 | padding: 5px; 23 | height: 50px; 24 | width: 100%; 25 | color: white; 26 | position: fixed; 27 | top: 0; 28 | z-index: 25; 29 | } 30 | 31 | #documentsNavi { 32 | margin: 10px 0 0 0; 33 | padding: 0; 34 | width: 225px; /* Must be same as .navcontainer*/ 35 | position: fixed; 36 | top: 60px; 37 | float: left; 38 | } 39 | 40 | #documentsNavi ul { 41 | margin: 5px 0 0 10px; 42 | padding: 5px 0; 43 | list-style-type: none; 44 | height: 100px; 45 | overflow: auto; 46 | font-size: .8em; 47 | background-color: #bbbbbb; 48 | } 49 | 50 | #documentsNavi ul li { 51 | margin: 0; 52 | padding: 5px; 53 | cursor: pointer; 54 | } 55 | 56 | #documentsNavi li:nth-child(even) { 57 | background-color: #bbbbbb; 58 | } 59 | 60 | #documentsNavi li:nth-child(odd) { 61 | background-color: #cccccc; 62 | } 63 | 64 | #documentsNavi ul li:hover, #documentsNavi ul li:active { 65 | background-color: gray; 66 | } 67 | 68 | #documentsNavi ul li.highlight { 69 | font-weight: bolder; 70 | } 71 | 72 | /* Contains navigation bars and the document text itself */ 73 | .document { 74 | margin: 80px 10px 0 0; 75 | padding: 0; 76 | display: none; 77 | min-height: 600px; 78 | } 79 | 80 | #documentsNavi + .document { 81 | display: block; 82 | } 83 | 84 | .navcontainer { 85 | margin: 0; 86 | padding: 0; 87 | position: fixed; 88 | top: 200px; 89 | width: 225px; 90 | } 91 | 92 | .navcontainer > div { 93 | margin-top: 20px; 94 | padding: 0; 95 | } 96 | 97 | .tease { 98 | display: none; 99 | opacity: .8; 100 | margin: 0 0 0 5px; 101 | padding: 0; 102 | font-family: Sans-Serif; 103 | font-size: .8em; 104 | } 105 | 106 | .navcontainer > div h3:hover { 107 | display: inline-block; 108 | cursor: pointer; 109 | } 110 | 111 | .navcontainer > div h3:hover + .tease { 112 | display: inline-block; 113 | } 114 | 115 | /* Gold and system navigation boxes */ 116 | .navcontainer > div > ul { 117 | margin: 5px 0 0 10px; 118 | padding: 5px 0;
119 | list-style-type: none; 120 | overflow-y: auto; 121 | max-height: 80px; 122 | font-size: .8em; 123 | background-color: #bbbbbb; 124 | } 125 | 126 | div.navcontainer div ul li:nth-child(even) { 127 | background-color: #bbbbbb; 128 | } 129 | 130 | div.navcontainer div ul li:nth-child(odd) { 131 | background-color: #cccccc; 132 | } 133 | 134 | .navcontainer > div ul li { 135 | margin: 0; 136 | padding: 2px; 137 | } 138 | 139 | .navcontainer > div ul li:hover { 140 | cursor: pointer; 141 | } 142 | 143 | /* Errors navigation box */ 144 | div.errorsNavi { 145 | 146 | } 147 | 148 | div.errorsNavi h4 { 149 | margin: 0; 150 | padding: 2px; 151 | font-size: .9em; 152 | font-weight: light; 153 | } 154 | 155 | div.errorsNavi h4:hover { 156 | display: inline-block; 157 | cursor: pointer; 158 | } 159 | 160 | div.errorsNavi h4:hover + .tease { 161 | display: inline-block; 162 | } 163 | 164 | div.errorsNavi > div { 165 | margin: 0 0 0 10px; 166 | padding: 5px 0; 167 | background-color: #eeeeee; 168 | } 169 | 170 | .precisionErrors, .recallErrors { 171 | margin: 0; 172 | padding: 5px 0; 173 | list-style-type: none; 174 | font-size: .8em; 175 | height: 80px; 176 | overflow: auto; 177 | } 178 | 179 | ol.text { 180 | margin: 10px 0 0 250px; 181 | padding: 5px; 182 | line-height: 250%; 183 | font-family: Sans-Serif; 184 | font-size: .9em; 185 | background-color: #eeeeee; 186 | } 187 | 188 | ol.text { 189 | counter-reset: li; 190 | display: table; 191 | } 192 | 193 | ol.text li.sentence { 194 | margin: 0; 195 | padding: 0; 196 | } 197 | 198 | ol.text > li { 199 | margin: 0 0 6px 2em; 200 | padding: 4px 8px; 201 | list-style: none; 202 | counter-increment: li; 203 | display: table-row; 204 | } 205 | 206 | ol.text > li:before { 207 | content: counter(li) "."; 208 | font-size: .7em; 209 | color: gray; 210 | display: inline-block; 211 | width: 20px; 212 | text-align: right; 213 | padding-right: 5px; 214 | display: table-cell; 215 | } 216 | 217 | ol.text span.mention { 218 | margin: 0; 219 | display: inline; 220 | border-radius: 0.5em; 221 | } 222 | 223 | ol.text span.mention:hover { 224 | cursor: pointer; 225 | } 226 | 227 | div ol.text .goldBorder { 228 | border: 3px solid gold; 229 | } 230 | 231 | div ol.text .blueBorder { 232 | border: 3px solid blue; 233 | } 234 | 235 | ol.text *[class^='system']{ 236 | border: 1px solid blue; 237 | padding: 5px; 238 | } 239 | 240 | ol.text *[class^='gold']{ 241 | border: 1px solid gold; 242 | padding: 2px; 243 | } 244 | /* 245 | span.transparentBg, .goldNavi ul li.transparentBg, .systemNavi ul li.transparentBg { 246 | background-color: transparent; 247 | border: 3px solid transparent; 248 | } 249 | */ 250 | .label { 251 | line-height: 100%; 252 | background-color: #F1F101; 253 | z-index: 24; 254 | opacity: .9; 255 | box-shadow: 2px 2px 13px #aaa; 256 | } 257 | 258 | .label:hover { 259 | display: block; 260 | } -------------------------------------------------------------------------------- /cort/config_files/corenlp.ini: -------------------------------------------------------------------------------- 1 | annotators = tokenize,ssplit,pos,lemma,parse,ner -------------------------------------------------------------------------------- /cort/config_files/corenlp_with_coref.ini: -------------------------------------------------------------------------------- 1 | annotators = tokenize,ssplit,pos,lemma,parse,ner 2 | tokenize.whitespace = true 3 | ssplit.eolonly = true -------------------------------------------------------------------------------- /cort/core/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ Includes core functionality for managing documents and mentions.""" 2 | 3 | __author__ = 'martscsn' 4 | -------------------------------------------------------------------------------- /cort/core/external_data.py: -------------------------------------------------------------------------------- 1 | """ Read in and access data from external resources such as gender lists.""" 2 | 3 | import os 4 | import pickle 5 | 6 | 7 | import cort 8 | from cort.core import singletons 9 | from cort.core import util 10 | 11 | 12 | __author__ = 'smartschat' 13 | 14 | 15 | @singletons.Singleton 16 | class GenderData: 17 | """ Read in and access data from lists with gender information. 18 | 19 | Attributes: 20 | word_to_gender (dict(str, str)): A mapping from lower-case strings 21 | to one of four genders: 'MALE', 'FEMALE', 'NEUTRAL' and 'PLURAL'. 22 | """ 23 | def __init__(self): 24 | """ Initialize the word-to-gender mapping from gender lists. 25 | """ 26 | self.word_to_gender = {} 27 | 28 | directory = cort.__path__[0] + "/resources/" 29 | 30 | lists = [ 31 | open(directory + "male.list"), 32 | open(directory + "female.list"), 33 | open(directory + "neutral.list"), 34 | open(directory + "plural.list") 35 | ] 36 | 37 | genders = ["MALE", "FEMALE", "NEUTRAL", "PLURAL"] 38 | 39 | for gender, gender_list in zip(genders, lists): 40 | for word in gender_list.readlines(): 41 | self.word_to_gender[word.strip()] = gender 42 | 43 | def look_up(self, attributes): 44 | """ Look up the gender of a mention described by the input attributes. 45 | 46 | Args: 47 | attributes (dict(str,object)): A dict describing attributes of 48 | mentions. Must contain "tokens" and "head", which have lists 49 | of strings as values. 50 | 51 | Returns: 52 | (str): None or one of the four genders 'MALE', 'FEMALE', 53 | 'NEUTRAL' or 'PLURAL'. 54 | """ 55 | # whole string 56 | if " ".join(attributes["tokens"]).lower() in self.word_to_gender: 57 | return self.word_to_gender[" ".join(attributes["tokens"]).lower()] 58 | # head 59 | elif " ".join(attributes["head"]).lower() in self.word_to_gender: 60 | return self.word_to_gender[" ".join(attributes["head"]).lower()] 61 | # head token by token 62 | elif self.__look_up_token_by_token(attributes["head"]): 63 | return self.__look_up_token_by_token(attributes["head"]) 64 | 65 | def __look_up_token_by_token(self, tokens): 66 | for token in tokens: 67 | if token[0].isupper() and token.lower() in self.word_to_gender: 68 | return self.word_to_gender[token.lower()] 69 | 70 | 71 | @singletons.Singleton 72 | class LexicalData: 73 | """ Read in and access data containing pairs of coreferent mention strings. 74 | 75 | Attributes: 76 | pairs (set((str, str))): A set of string pairs, which represent strings 77 | of potentially coreferent mentions. 78 | """ 79 | def __init__(self): 80 | """ Initialize the set of pairs from 81 | package_root/resources/coreferent_pairs.obj. 82 | """ 83 | directory = cort.__path__[0] + "/resources/" 84 | 85 | self.pairs = pickle.load( 86 | open(directory + "coreferent_pairs.obj", "rb")) 87 | 88 | def look_up(self, anaphor, antecedent): 89 | """ Look up strings of the mentions in the pair list. 90 | 91 | Args: 92 | anaphor (Mention): A mention. 93 | antecedent (Mention): Another mention, the candidate antecedent 94 | for anaphor. 
95 |
96 |         Returns:
97 |             True if the pair of strings corresponding to anaphor and
98 |             antecedent, stripped of determiners and possessive 's', can be
99 |             found in the list of pairs.
100 |         """
101 |         # whole string
102 |         anaphor_cleaned = " ".join(
103 |             util.clean_via_pos(anaphor.attributes["tokens"],
104 |                                anaphor.attributes["pos"]))
105 |         antecedent_cleaned = " ".join(
106 |             util.clean_via_pos(antecedent.attributes["tokens"],
107 |                                antecedent.attributes["pos"]))
108 |
109 |         return (
110 |             (anaphor_cleaned, antecedent_cleaned) in self.pairs
111 |             or (antecedent_cleaned, anaphor_cleaned) in self.pairs
112 |         )
113 |
114 |
115 | @singletons.Singleton
116 | class SingletonMentions:
117 |     """ Read in and access strings of potential singleton mentions.
118 |
119 |     Attributes:
120 |         singletons (set(str)): A set of strings, which represent strings
121 |             of potential singleton mentions.
122 |     """
123 |     def __init__(self):
124 |         """ Initialize the set of singleton mention strings from
125 |         package_root/resources/singletons_not_cleaned.obj.
126 |         """
127 |         directory = cort.__path__[0] + "/resources/"
128 |
129 |         self.singletons = pickle.load(
130 |             open(directory + "singletons_not_cleaned.obj", "rb"))
131 |
-------------------------------------------------------------------------------- /cort/core/mixins.py: --------------------------------------------------------------------------------
1 | """ Mixins. """
2 |
3 |
4 | __author__ = 'smartschat'
5 |
6 |
7 | class ComparableMixin:
8 |     """ A mixin for deducing comparison operators from __lt__. """
9 |     def __eq__(self, other):
10 |         if other is None:
11 |             return False
12 |         return not self < other and not other < self
13 |
14 |     def __ne__(self, other):
15 |         return not self.__eq__(other)
16 |
17 |     def __gt__(self, other):
18 |         return other < self
19 |
20 |     def __ge__(self, other):
21 |         return not self < other
22 |
23 |     def __le__(self, other):
24 |         return not other < self
25 |
-------------------------------------------------------------------------------- /cort/core/singletons.py: --------------------------------------------------------------------------------
1 | """ Implements the singleton pattern. """
2 |
3 |
4 | __author__ = 'smartschat'
5 |
6 |
7 | class Singleton:
8 |     """
9 |     A non-thread-safe helper class to ease implementing singletons.
10 |     This should be used as a decorator -- not a metaclass -- to the
11 |     class that should be a singleton.
12 |
13 |     The decorated class can define one `__init__` function that
14 |     takes only the `self` argument. Other than that, there are
15 |     no restrictions that apply to the decorated class.
16 |
17 |     To get the singleton instance, use the `get_instance` method. Trying
18 |     to use `__call__` will result in a `TypeError` being raised.
19 |
20 |     Limitations: The decorated class cannot be inherited from.
21 |
22 |     Source:
23 |     http://stackoverflow.com/questions/42558/python-and-the-singleton-pattern
24 |
25 |     """
26 |
27 |     def __init__(self, decorated):
28 |         self._decorated = decorated
29 |         self._instance = None
30 |
31 |     def get_instance(self):
32 |         """
33 |         Returns the singleton instance. Upon its first call, it creates a
34 |         new instance of the decorated class and calls its `__init__` method.
35 |         On all subsequent calls, the already created instance is returned.
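        Example (illustrative; GenderData in cort.core.external_data is
        decorated with @singletons.Singleton)::

            gender_data = external_data.GenderData.get_instance()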
36 | 37 | """ 38 | if self._instance: 39 | return self._instance 40 | else: 41 | self._instance = self._decorated() 42 | return self._instance 43 | 44 | def __call__(self): 45 | raise TypeError('Singletons must be accessed through ' 46 | '`get_instance()`.') 47 | 48 | def __instancecheck__(self, inst): 49 | return isinstance(inst, self._decorated) 50 | -------------------------------------------------------------------------------- /cort/core/spans.py: -------------------------------------------------------------------------------- 1 | """ Manage spans in documents. """ 2 | 3 | from cort.core import mixins 4 | 5 | 6 | __author__ = 'smartschat' 7 | 8 | 9 | class Span(mixins.ComparableMixin): 10 | """ Manage and compare spans in documents. 11 | 12 | Attributes: 13 | begin (int): The begin of the span. 14 | end (int): The end of the span (inclusive). 15 | """ 16 | def __init__(self, begin, end): 17 | """ Initialize a span from a begin and an end position. 18 | 19 | Args: 20 | begin (int): The begin of the span. 21 | end (int): The end of the span. 22 | """ 23 | self.begin = begin 24 | self.end = end 25 | 26 | def __str__(self): 27 | return "(" + str(self.begin) + ", " + str(self.end) + ")" 28 | 29 | def __repr__(self): 30 | return "(" + str(self.begin) + ", " + str(self.end) + ")" 31 | 32 | def __lt__(self, other): 33 | """ Check whether this span is less than another span. 34 | 35 | (a,b) < (c,d) if and only if a < c or a = c and b < d 36 | 37 | Args: 38 | other (Span): A span. 39 | 40 | Returns: 41 | True if this span is less than other, False otherwise. 42 | """ 43 | if self.begin < other.begin: 44 | return True 45 | elif self.begin > other.begin: 46 | return False 47 | elif self.end < other.end: 48 | return True 49 | else: 50 | return False 51 | 52 | def embeds(self, other): 53 | """ Check whether this span embeds another span. 54 | 55 | Args: 56 | other (Span): A span. 57 | 58 | Returns: 59 | True if this span embeds other, False otherwise. 60 | """ 61 | return self.begin <= other.begin and self.end >= other.end 62 | 63 | def __hash__(self): 64 | return hash((self.begin, self.end)) 65 | 66 | @staticmethod 67 | def parse(span_string): 68 | """ Parse a string specification of a span to a Span object. 69 | 70 | Valid representations are for example "(1, 2)" or "(1,2)". 71 | 72 | Args: 73 | span_string (str): A string representation of a span. 74 | 75 | Returns: 76 | Span: The span corresponding to the string representation. 77 | """ 78 | without_brackets = span_string.strip()[1:-1] 79 | splitted_and_stripped = [token.strip() for token 80 | in without_brackets.split(",")] 81 | return Span( 82 | int(splitted_and_stripped[0]), 83 | int(splitted_and_stripped[1])) 84 | -------------------------------------------------------------------------------- /cort/core/util.py: -------------------------------------------------------------------------------- 1 | """ Utility functions. """ 2 | 3 | __author__ = 'smartschat' 4 | 5 | 6 | def clean_via_pos(tokens, pos): 7 | """ Clean a list of tokens according to their part-of-speech tags. 8 | 9 | In particular, retain only tokens which do not have the part-of-speech tag 10 | DT (determiner) or POS (possessive 's'). 11 | 12 | Args: 13 | tokens (list(str)): A list of tokens. 14 | pos (list(str)): A list of corresponding part-of-speech tags. 15 | 16 | Returns: 17 | list(str): The list of tokens which do not have part-of-speech tag 18 | DT or POS. 
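    Example (illustrative):
        clean_via_pos(["the", "wife", "'s"], ["DT", "NN", "POS"])
        returns ["wife"], since the determiner and the possessive 's'
        are filtered out.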
19 |     """
20 |     return [token for token, pos in zip(tokens, pos)
21 |             if pos not in ["DT", "POS"]]
22 |
-------------------------------------------------------------------------------- /cort/coreference/__init__.py: --------------------------------------------------------------------------------
1 | """ Includes a unified framework for representation and learning of coreference
2 | resolution approaches."""
3 |
4 | __author__ = 'martscsn'
5 |
-------------------------------------------------------------------------------- /cort/coreference/approaches/__init__.py: --------------------------------------------------------------------------------
1 | ''' Contains implementations of various coreference resolution approaches in
2 | the unified framework.
3 | '''
4 |
5 | __author__ = 'martscsn'
6 |
-------------------------------------------------------------------------------- /cort/coreference/approaches/antecedent_trees.py: --------------------------------------------------------------------------------
1 | """ Implements instance extraction and decoding for antecedent trees.
2 |
3 | This module implements antecedent trees (Fernandes et al., 2014) within a
4 | framework that expresses coreference resolution as predicting latent structures,
5 | while performing learning using a latent structured perceptron with
6 | cost-augmented inference.
7 |
8 | Hence, antecedent trees are expressed as predicting a latent graph.
9 | In particular, let m_1, ..., m_n be all mentions in a document. Let m_0 be a
10 | dummy mention for anaphoricity determination. We predict
11 | the graph with nodes m_0, ..., m_n and with arcs (m_j, m_i) which correspond to
12 | antecedent decisions. In particular, for each j there exists exactly one i < j
13 | such that (m_j, m_i) is in the graph. Such a graph is called a *substructure*
14 | (for antecedent trees, substructures and structures coincide).
15 |
16 | To implement antecedent trees, this module contains a function that defines the
17 | search space for the graphs, and a decoder that computes the best-scoring tree
18 | of antecedent decisions, and the best-scoring tree of antecedent decisions
19 | consistent with the gold annotation (i.e. only having pairs of coreferent
20 | mentions as arcs).
21 |
22 | Reference:
23 |
24 |     - Eraldo Fernandes, Cicero dos Santos, and Ruy Milidiu. 2014. Latent trees
25 |       for coreference resolution. *Computational Linguistics*, 40(4):801-835.
26 |       http://www.aclweb.org/anthology/J14-4004
27 | """
28 |
29 | from __future__ import division
30 |
31 |
32 | import array
33 |
34 |
35 | from cort.coreference import perceptrons
36 |
37 |
38 | __author__ = 'martscsn'
39 |
40 |
41 | def extract_substructures(doc):
42 |     """ Extract the search space for the antecedent tree model.
43 |
44 |     The antecedent tree model consists of computing the optimal antecedent for
45 |     each anaphor. These decisions are represented as edges in a tree of
46 |     anaphor-antecedent decisions. This function extracts the search space for
47 |     the tree.
48 |
49 |     The search space is represented as a nested list of mention pairs. The
50 |     mention pairs are candidate arcs in the graph. The nested list contains
51 |     only one list, since antecedent trees have only one substructure for
52 |     each document.
53 |
54 |     The list contains all potential (anaphor, antecedent) pairs in the
55 |     following order: (m_1, m_0), (m_2, m_1), (m_2, m_0), (m_3, m_2), ...,
56 |     where m_j is the jth mention in the document.
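    For example (illustrative), for a document with mentions m_0 (the dummy
    mention), m_1, m_2 and m_3, the full list is (m_1, m_0), (m_2, m_1),
    (m_2, m_0), (m_3, m_2), (m_3, m_1), (m_3, m_0).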
57 |
58 |     Args:
59 |         doc (CoNLLDocument): The document to extract substructures from.
60 |
61 |     Returns:
62 |         (list(list(Mention, Mention))): The nested list of mention pairs
63 |             describing the search space for the substructures.
64 |     """
65 |     substructure = []
66 |
67 |     # iterate over mentions
68 |     for i, ana in enumerate(doc.system_mentions):
69 |
70 |         # iterate in reversed order over candidate antecedents
71 |         for ante in sorted(doc.system_mentions[:i], reverse=True):
72 |             substructure.append((ana, ante))
73 |
74 |     return [substructure]
75 |
76 |
77 | class AntecedentTreePerceptron(perceptrons.Perceptron):
78 |     """ A perceptron for antecedent trees. """
79 |     def argmax(self, substructure, arc_information):
80 |         """ Decoder for antecedent trees.
81 |
82 |         Compute highest-scoring antecedent tree and highest-scoring antecedent
83 |         tree consistent with the gold annotation.
84 |
85 |         Args:
86 |             substructure (list((Mention, Mention))): The list of mention pairs
87 |                 which define the search space for one substructure. For
88 |                 antecedent trees, this list contains all potential
89 |                 anaphor-antecedent pairs in the following order:
90 |                 (m_1, m_0), (m_2, m_1), (m_2, m_0), (m_3, m_2), ...
91 |             arc_information (dict((Mention, Mention),
92 |                 ((array, array, array), list(int), bool))):
93 |                 A mapping of arcs (= mention pairs) to information about these
94 |                 arcs. The information consists of the features, the costs for
95 |                 the arc (for each label), and whether predicting the arc to be
96 |                 coreferent is consistent with the gold annotation. The features
97 |                 are divided into three arrays: the first array contains the non-
98 |                 numeric features, the second array the numeric features, and the
99 |                 third array the values for the numeric features. The features
100 |                 are represented as integers via feature hashing.
101 |
102 |         Returns:
103 |             A 7-tuple describing the highest-scoring antecedent tree, and the
104 |             highest-scoring antecedent tree consistent with the gold
105 |             annotation. The tuple consists of:
106 |
107 |             - **best_arcs** (*list((Mention, Mention))*): the arcs
108 |               constituting the highest-scoring antecedent tree,
109 |             - **best_labels** (*list(str)*): empty, the antecedent tree
110 |               approach does not employ any labels,
111 |             - **best_scores** (*list(float)*): the scores of the
112 |               arcs in the highest-scoring antecedent tree,
113 |             - **best_cons_arcs** (*list((Mention, Mention))*): the arcs
114 |               constituting the highest-scoring antecedent tree consistent
115 |               with the gold annotation,
116 |             - **best_cons_labels** (*list(str)*): empty, the antecedent
117 |               tree approach does not employ any labels,
118 |             - **best_cons_scores** (*list(float)*): the scores of the
119 |               arcs in the highest-scoring antecedent tree consistent with
120 |               the gold annotation,
121 |             - **is_consistent** (*bool*): whether the highest-scoring
122 |               antecedent tree is consistent with the gold annotation.
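        Note (illustrative): since the candidate arcs are listed anaphor by
        anaphor, the arcs for the jth mention m_j occupy positions
        j*(j-1)/2 through j*(j-1)/2 + j - 1 of ``substructure``, which is
        exactly the slice the loop below considers for each anaphor.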
123 |         """
124 |         if not substructure:
125 |             return [], [], [], [], [], [], True
126 |
127 |         number_mentions = len(substructure[0][0].document.system_mentions)
128 |
129 |         arcs = []
130 |         arcs_scores = []
131 |         coref_arcs = []
132 |         coref_arcs_scores = []
133 |
134 |         is_consistent = True
135 |
136 |         for ana_index in range(1, number_mentions):
137 |
138 |             first_arc = ana_index*(ana_index-1)//2
139 |             last_arc = first_arc + ana_index
140 |
141 |             best, max_val, best_cons, max_cons, best_is_consistent = \
142 |                 self.find_best_arcs(substructure[first_arc:last_arc],
143 |                                     arc_information)
144 |
145 |             arcs.append(best)
146 |             arcs_scores.append(max_val)
147 |             coref_arcs.append(best_cons)
148 |             coref_arcs_scores.append(max_cons)
149 |
150 |             is_consistent &= best_is_consistent
151 |
152 |         return (
153 |             arcs,
154 |             [],
155 |             arcs_scores,
156 |             coref_arcs,
157 |             [],
158 |             coref_arcs_scores,
159 |             is_consistent
160 |         )
161 |
-------------------------------------------------------------------------------- /cort/coreference/clusterer.py: --------------------------------------------------------------------------------
1 | """ Extract coreference information from pairwise predictions."""
2 |
3 | __author__ = 'smartschat'
4 |
5 |
6 | def best_first(substructures, labels, scores, coref_labels):
7 |     """ Extract coreference clusters from coreference predictions via best-first
8 |     clustering.
9 |
10 |     In particular, go through a list of anaphor-antecedent pairs, where
11 |     pairs with the same anaphor are consecutive. Then, for each anaphor, the
12 |     best-scoring antecedent is selected (this is also called best-first
13 |     clustering). Ties are broken by position in the list: earlier items are
14 |     preferred.
15 |
16 |     Args:
17 |         substructures (list(list((Mention, Mention)))): A list of substructures.
18 |             For this clusterer, each substructure should contain only one
19 |             (anaphor, antecedent) pair. If two substructures have the same
20 |             anaphor, they should be consecutive.
21 |         labels (list(list(str))): A list of arc labels. This list should
22 |             have the same length as the list of substructures, and each inner
23 |             list should contain only one element (as in ``substructures``).
24 |             Each entry describes the label of an arc.
25 |         scores (list(list(float))): A list of arc scores. This list should
26 |             have the same length as the list of substructures, and each inner
27 |             list should contain only one element (as in ``substructures``).
28 |             Each entry describes the score of an arc.
29 |         coref_labels (set(str)): A set of labels that indicate that mentions
30 |             connected via an arc that has one of these labels are coreferent.
31 |
32 |     Returns:
33 |         A tuple containing two dicts. The components are
34 |
35 |         - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
36 |           mentions to entity identifiers.
37 |         - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
38 |           mentions to their antecedent.
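        Example (illustrative): if the pairs (m_3, m_2) and (m_3, m_1) have
        scores 0.5 and 1.2 respectively, and both arcs carry a label in
        ``coref_labels``, then m_1 is chosen as the antecedent of m_3.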
39 |     """
40 |
41 |     anaphor = None
42 |     best = None
43 |     max_val = float('-inf')
44 |
45 |     mention_entity_mapping = {}
46 |     antecedent_mapping = {}
47 |
48 |     for substructure, substructure_label, substructure_score in zip(
49 |             substructures, labels, scores):
50 |         # each substructure consists of one pair
51 |         pair = substructure[0]
52 |         label = substructure_label[0]
53 |         score = substructure_score[0]
54 |         current_anaphor, current_antecedent = pair
55 |         if current_anaphor != anaphor:
56 |             # change in anaphor: set coreference information based on
57 |             # best-scoring antecedent
58 |             if anaphor and best and not best.is_dummy():
59 |                 antecedent_mapping[anaphor] = best
60 |                 if best not in mention_entity_mapping:
61 |                     mention_entity_mapping[best] = \
62 |                         best.document.system_mentions.index(best)
63 |
64 |                 mention_entity_mapping[anaphor] = \
65 |                     mention_entity_mapping[best]
66 |
67 |             best = None
68 |             max_val = float('-inf')
69 |
70 |         if score > max_val and label in coref_labels:
71 |             max_val = score
72 |             best = current_antecedent
73 |
74 |         anaphor = current_anaphor
75 |
76 |     if anaphor and best and not best.is_dummy():
77 |         antecedent_mapping[anaphor] = best
78 |         if best not in mention_entity_mapping:
79 |             mention_entity_mapping[best] = \
80 |                 best.document.system_mentions.index(best)
81 |
82 |         mention_entity_mapping[anaphor] = \
83 |             mention_entity_mapping[best]
84 |
85 |     return mention_entity_mapping, antecedent_mapping
86 |
87 |
88 | def all_ante(substructures, labels, scores, coref_labels):
89 |     """ Extract coreference clusters from coreference predictions via transitive
90 |     closure.
91 |
92 |     In particular, go through all (anaphor, antecedent) pairs contained in
93 |     ``substructures``, and obtain coreference clusters by transitive closure.
94 |
95 |     Args:
96 |         substructures (list(list((Mention, Mention)))): A list of substructures.
97 |         labels (list(list(str))): Not used by this function.
98 |         scores (list(list(float))): Not used by this function.
99 |         coref_labels (set(str)): Not used by this function.
100 |
101 |     Returns:
102 |         A tuple containing two dicts. The components are
103 |
104 |         - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
105 |           mentions to entity identifiers.
106 |         - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
107 |           mentions to their antecedent.
108 |     """
109 |     mention_entity_mapping = {}
110 |     antecedent_mapping = {}
111 |
112 |     for substructure in substructures:
113 |         for pair in substructure:
114 |             anaphor, antecedent = pair
115 |
116 |             # skip dummy antecedents
117 |             if antecedent.is_dummy():
118 |                 continue
119 |
120 |             antecedent_mapping[anaphor] = antecedent
121 |
122 |             # antecedent is not in the mapping: we initialize a new coreference
123 |             # chain
124 |             if antecedent not in mention_entity_mapping:
125 |                 # chain id: index of antecedent in system mentions
126 |                 mention_entity_mapping[antecedent] = \
127 |                     antecedent.document.system_mentions.index(antecedent)
128 |
129 |             # assign id based on antecedent
130 |             mention_entity_mapping[anaphor] = \
131 |                 mention_entity_mapping[antecedent]
132 |
133 |     return mention_entity_mapping, antecedent_mapping
134 |
-------------------------------------------------------------------------------- /cort/coreference/cost_functions.py: --------------------------------------------------------------------------------
1 | """ Cost functions used during learning of coreference predictors. """
""" 2 | 3 | __author__ = 'martscsn' 4 | 5 | 6 | def cost_based_on_consistency(arc, label="+"): 7 | """ Assign cost to arcs based on consistency of decision and anaphoricity. 8 | 9 | An anaphor-antecedent decision is consistent if either 10 | (a) the mentions are coreferent, or 11 | (b) the antecedent is the dummy mention, and the anaphor does not have 12 | any preceding coreferent mention among all extracted mentions. 13 | 14 | Note that (b) also contains cases where the mention has an antecedent in the 15 | gold data, but we were unable to extract this antecedent due to errors in 16 | mention detection. 17 | 18 | If the anaphor-antecedent decision represented by ``arc``is consistent, it 19 | gets cost 0. If the the decision is not consistent, and the antecedent is 20 | the dummy mention, it gets cost 2. Otherwise, it gets cost 1. 21 | 22 | Args: 23 | arc ((Mention, Mention)): A pair of mentions. 24 | label (str): The label to predict for the arc. Defaults to '+'. 25 | 26 | Return: 27 | (int): The cost of predicting the arc. 28 | """ 29 | ana, ante = arc 30 | 31 | consistent = ana.decision_is_consistent(ante) 32 | 33 | # false new 34 | if not consistent and ante.is_dummy(): 35 | return 2 36 | # wrong link 37 | elif not consistent: 38 | return 1 39 | else: 40 | return 0 41 | 42 | 43 | def null_cost(arc, label="+"): 44 | """ Dummy cost function which always returns 0 (corresponding to not using 45 | a cost function at all). 46 | 47 | Args: 48 | arc ((Mention, Mention)): A pair of mentions. 49 | label (str): The label to predict for the arc. Defaults to '+' 50 | 51 | Return: 52 | 0 53 | """ 54 | return 0 -------------------------------------------------------------------------------- /cort/coreference/experiments.py: -------------------------------------------------------------------------------- 1 | """ Manage learning from training data and making predictions on test data. """ 2 | 3 | 4 | import logging 5 | 6 | 7 | __author__ = 'smartschat' 8 | 9 | 10 | def learn(training_corpus, instance_extractor, perceptron): 11 | """ Learn a model for coreference resolution from training data. 12 | 13 | In particular, apply an instance/feature extractor to a training corpus and 14 | employ a machine learning model to learn a weight vector from these 15 | instances. 16 | 17 | Args: 18 | training_corpus (Corpus): The corpus to learn from. 19 | instance_extractor (InstanceExtracor): The instance extractor that 20 | defines the features and the structure of instances that are 21 | extracted during training. 22 | perceptron (Perceptron): A perceptron (including a decoder) that 23 | learns from the instances extracted by ``instance_extractor``. 24 | 25 | Returns: 26 | A tuple consisting of 27 | - **priors** (*dict(str,float)*): A prior weight for each label 28 | in the graphs representing the instances, 29 | - **weights** (*dict(str, array)*): A mapping of labels to weight 30 | vectors. For each label ``l``, ``weights[l]`` contains weights 31 | for each feature seen during training (for representing the 32 | features we employ *feature hashing*). If the graphs employed are 33 | not labeled, ``l`` is set to "+". 
34 |     """
35 |     logging.info("Learning.")
36 |
37 |     logging.info("\tExtracting instances and features.")
38 |     substructures, arc_information = instance_extractor.extract(
39 |         training_corpus)
40 |
41 |     logging.info("\tFitting model parameters.")
42 |
43 |     perceptron.fit(substructures, arc_information)
44 |
45 |     return perceptron.get_model()
46 |
47 |
48 | def predict(testing_corpus,
49 |             instance_extractor,
50 |             perceptron,
51 |             coref_extractor):
52 |     """ According to a learned model, predict coreference information.
53 |
54 |     Args:
55 |         testing_corpus (Corpus): The corpus to predict coreference on.
56 |         instance_extractor (InstanceExtractor): The instance extractor that
57 |             defines the features and the structure of instances that are
58 |             extracted during testing.
59 |         perceptron (Perceptron): A perceptron learned from training data.
60 |         coref_extractor (function): An extractor for consolidating pairwise
61 |             predictions into coreference clusters.
62 |
63 |     Returns:
64 |         A tuple containing two dicts. The components are
65 |
66 |         - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
67 |           mentions to entity identifiers.
68 |         - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
69 |           mentions to their antecedent (as determined by the
70 |           ``coref_extractor``).
71 |     """
72 |     logging.info("Predicting.")
73 |
74 |     logging.info("\tRemoving coreference annotations from corpus.")
75 |     for doc in testing_corpus:
76 |         doc.antecedent_decisions = {}
77 |         for mention in doc.system_mentions:
78 |             mention.attributes["antecedent"] = None
79 |             mention.attributes["set_id"] = None
80 |
81 |     logging.info("\tExtracting instances and features.")
82 |     substructures, arc_information = instance_extractor.extract(testing_corpus)
83 |
84 |     logging.info("\tDoing predictions.")
85 |     arcs, labels, scores = perceptron.predict(substructures, arc_information)
86 |
87 |     logging.info("\tClustering results.")
88 |
89 |     return coref_extractor(arcs, labels, scores, perceptron.get_coref_labels())
90 |
-------------------------------------------------------------------------------- /cort/coreference/multigraph/__init__.py: --------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
-------------------------------------------------------------------------------- /cort/coreference/multigraph/decoders.py: --------------------------------------------------------------------------------
1 | __author__ = 'smartschat'
2 |
3 |
4 | class MultigraphDecoder:
5 |     def __init__(self, multigraph_creator):
6 |         self.coref_multigraph_creator = multigraph_creator
7 |
8 |     def decode(self, corpus):
9 |         for doc in corpus:
10 |             for mention in doc.system_mentions:
11 |                 mention.attributes["set_id"] = None
12 |
13 |             # discard dummy mention
14 |             self.decode_for_one_document(doc.system_mentions[1:])
15 |
16 |     def decode_for_one_document(self, mentions):
17 |         multigraph = \
18 |             self.coref_multigraph_creator.construct_graph_from_mentions(
19 |                 mentions)
20 |
21 |         for mention in mentions:
22 |             antecedent = self.compute_antecedent(mention, multigraph)
23 |
24 |             if antecedent is not None:
25 |                 if antecedent.attributes["set_id"] is None:
26 |                     antecedent.attributes["set_id"] = \
27 |                         mentions.index(antecedent)
28 |
29 |                 mention.attributes["set_id"] = antecedent.attributes["set_id"]
30 |                 mention.document.antecedent_decisions[mention.span] = \
31 |                     antecedent.span
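    # A brief note on the decoding scheme (added for clarity): decoding is
    # greedy. For each mention, compute_antecedent below considers only
    # candidates connected without negative relations and picks the one with
    # the highest positive weight; ties are broken by the sort over
    # (weight, antecedent) pairs, which favors later, i.e. closer,
    # antecedents, assuming mentions compare by document position. Mentions
    # with no positively weighted candidate keep set_id None and stay
    # unresolved.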
32 | 33 | @staticmethod 34 | def compute_antecedent(mention, multigraph): 35 | weights = [] 36 | for antecedent in multigraph.edges[mention]: 37 | if not multigraph.edges[mention][antecedent]["negative_relations"]: 38 | weights.append( 39 | (multigraph.get_weight(mention, antecedent), antecedent)) 40 | 41 | # get antecedent with highest positive weight, break ties by distance 42 | if len(weights) > 0 and sorted(weights)[-1][0] > 0: 43 | return sorted(weights)[-1][1] 44 | -------------------------------------------------------------------------------- /cort/coreference/multigraph/multigraphs.py: -------------------------------------------------------------------------------- 1 | __author__ = 'smartschat' 2 | 3 | 4 | class CorefMultigraphCreator: 5 | def __init__(self, 6 | positive_features, 7 | negative_features, 8 | weighting_function, 9 | relation_weights, 10 | construct_when_negative=False): 11 | self.positive_features = positive_features 12 | self.negative_features = negative_features 13 | self.weighting_function = weighting_function 14 | self.relation_weights = relation_weights 15 | self.construct_when_negative = construct_when_negative 16 | 17 | def construct_graph_from_mentions(self, mentions): 18 | nodes = [] 19 | edges = {} 20 | 21 | for i in range(0, len(mentions)): 22 | anaphor = mentions[i] 23 | 24 | nodes.append(anaphor) 25 | 26 | edges[anaphor] = self.construct_for_one_mention(mentions, i) 27 | 28 | return CorefMultigraph(nodes, 29 | edges, 30 | self.weighting_function, 31 | self.relation_weights) 32 | 33 | def construct_for_one_mention(self, mentions, i): 34 | anaphor = mentions[i] 35 | 36 | edges = {} 37 | 38 | # do not include dummy mention 39 | for j in range(i-1, 0, -1): 40 | antecedent = mentions[j] 41 | if self.construct_when_negative: 42 | edges[antecedent] = self.get_edge_relations(anaphor, antecedent) 43 | else: 44 | if not self.has_negative(anaphor, antecedent): 45 | edges[antecedent] = { 46 | "negative_relations": [], 47 | "positive_relations": self.get_positive_relations( 48 | anaphor, antecedent) 49 | } 50 | 51 | return edges 52 | 53 | def get_edge_relations(self, anaphor, antecedent): 54 | relations = { 55 | "negative_relations": 56 | self.get_negative_relations(anaphor, antecedent), 57 | "positive_relations": 58 | self.get_positive_relations(anaphor, antecedent) 59 | } 60 | 61 | return relations 62 | 63 | def has_negative(self, anaphor, antecedent): 64 | for r in self.negative_features: 65 | if r(anaphor, antecedent): 66 | return True 67 | 68 | def get_negative_relations(self, anaphor, antecedent): 69 | negative_relations = [] 70 | 71 | for r in self.negative_features: 72 | if r(anaphor, antecedent): 73 | negative_relations.append(r) 74 | 75 | return negative_relations 76 | 77 | def get_positive_relations(self, anaphor, antecedent): 78 | positive_relations = [] 79 | 80 | for r in self.positive_features: 81 | if r(anaphor, antecedent): 82 | positive_relations.append(r) 83 | 84 | return positive_relations 85 | 86 | 87 | class CorefMultigraph: 88 | def __init__(self, nodes, edges, weighting_function, relation_weights): 89 | self.nodes = nodes 90 | self.edges = edges 91 | self.weighting_function = weighting_function 92 | self.relation_weights = relation_weights 93 | 94 | def get_weight(self, anaphor, antecedent): 95 | return self.weighting_function( 96 | anaphor, 97 | antecedent, 98 | self.edges[anaphor][antecedent], 99 | self.relation_weights) 100 | -------------------------------------------------------------------------------- 
/cort/coreference/multigraph/weighting_functions.py: -------------------------------------------------------------------------------- 1 | __author__ = 'smartschat' 2 | 3 | 4 | def for_each_relation_with_distance(anaphor, 5 | antecedent, 6 | relations, 7 | relation_weights): 8 | weight = 0.0 9 | 10 | if len(relations["negative_relations"]) > 0: 11 | return float("-inf") 12 | 13 | if len(relations["positive_relations"]) == 0: 14 | return 0 15 | 16 | for relation in relations["positive_relations"]: 17 | weight += relation_weights[relation] 18 | 19 | weight /= (anaphor.attributes["sentence_id"] - 20 | antecedent.attributes["sentence_id"] 21 | + 1) 22 | 23 | return weight 24 | -------------------------------------------------------------------------------- /cort/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | -------------------------------------------------------------------------------- /cort/preprocessing/pipeline.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | 3 | import cort 4 | 5 | import codecs 6 | 7 | import stanford_corenlp_pywrapper 8 | 9 | from StanfordDependencies import CoNLL 10 | 11 | from cort.core import corpora, documents, spans 12 | 13 | import bs4 14 | 15 | 16 | class Pipeline(): 17 | def __init__(self, corenlp_location, with_coref=False): 18 | package_dir = cort.__path__[0] 19 | 20 | if with_coref: 21 | self.proc = stanford_corenlp_pywrapper.CoreNLP( 22 | configfile=package_dir + "/config_files/corenlp_with_coref.ini", 23 | corenlp_jars=[corenlp_location + "/*"] 24 | ) 25 | else: 26 | self.proc = stanford_corenlp_pywrapper.CoreNLP( 27 | configfile=package_dir + "/config_files/corenlp.ini", 28 | corenlp_jars=[corenlp_location + "/*"] 29 | ) 30 | 31 | self.with_coref = with_coref 32 | 33 | def run_on_docs(self, identifier, docs): 34 | processed_documents = [] 35 | 36 | for doc in docs: 37 | processed_documents.append(self.run_on_doc( 38 | codecs.open(doc, "r", "utf-8") 39 | )) 40 | 41 | return corpora.Corpus(identifier, processed_documents) 42 | 43 | def run_on_doc(self, doc_file, name=None): 44 | if self.with_coref: 45 | soup = bs4.BeautifulSoup(doc_file.read()) 46 | preprocessed = self.proc.parse_doc(soup.text) 47 | else: 48 | data = doc_file.read() 49 | preprocessed = self.proc.parse_doc(data) 50 | 51 | sentences = [] 52 | 53 | for sentence in preprocessed["sentences"]: 54 | processed_ner = [] 55 | for ner in sentence["ner"]: 56 | if ner == "O" or ner == "MISC": 57 | processed_ner.append("NONE") 58 | else: 59 | processed_ner.append(ner) 60 | 61 | processed_dep = [] 62 | 63 | index_to_dep_info = {} 64 | for dep_info in sentence["deps_basic"]: 65 | label, head, in_sent_index = dep_info 66 | index_to_dep_info[in_sent_index] = label, head 67 | 68 | for i in range(0, len(sentence["tokens"])): 69 | if i in index_to_dep_info.keys(): 70 | label, head = index_to_dep_info[i] 71 | processed_dep.append( 72 | CoNLL.Token( 73 | form=sentence["tokens"][i], 74 | lemma=sentence["lemmas"][i], 75 | pos=sentence["pos"][i], 76 | index=i+1, 77 | head=head+1, 78 | deprel=label, 79 | cpos=None, 80 | feats=None, 81 | phead=None, 82 | pdeprel=None, 83 | extra=None 84 | ) 85 | ) 86 | else: 87 | processed_dep.append( 88 | CoNLL.Token( 89 | form=sentence["tokens"][i], 90 | lemma=sentence["lemmas"][i], 91 | pos=sentence["pos"][i], 92 | index=i+1, 93 | head=0, 94 | deprel="punc", 95 | cpos=None, 96 | feats=None, 97 | phead=None, 98 | 
pdeprel=None, 99 | extra=None 100 | ) 101 | ) 102 | 103 | sentences.append( 104 | (sentence["tokens"], 105 | sentence["pos"], 106 | processed_ner, 107 | ["-"]*len(sentence["tokens"]), 108 | sentence["parse"], 109 | processed_dep, 110 | ) 111 | ) 112 | 113 | if not name: 114 | name = doc_file.name 115 | 116 | if self.with_coref: 117 | antecedent_decisions = {} 118 | coref = {} 119 | 120 | mention_id_to_spans = {} 121 | 122 | max_entity = 0 123 | 124 | for mention in soup.findAll("mention"): 125 | if mention.get("entity"): 126 | max_entity = max(max_entity, int(mention.get("entity"))) 127 | 128 | for mention in soup.findAll("mention"): 129 | mention_id = int(mention.get("id")) 130 | 131 | span = spans.Span(int(mention.get("span_start")), 132 | int(mention.get("span_end"))) 133 | 134 | mention_id_to_spans[mention_id] = span 135 | 136 | if mention.get("entity"): 137 | annotated_set_id = int(mention.get("entity")) 138 | else: 139 | annotated_set_id = max_entity + 1 + mention_id 140 | 141 | coref[span] = annotated_set_id 142 | 143 | if mention.get("antecedent"): 144 | antecedent_decisions[span] = mention_id_to_spans[ 145 | int(mention.get("antecedent")) 146 | ] 147 | 148 | doc = documents.Document( 149 | name, 150 | sentences, 151 | coref) 152 | 153 | spans_to_annotated_mentions = {} 154 | 155 | for mention in doc.annotated_mentions: 156 | spans_to_annotated_mentions[mention.span] = mention 157 | 158 | for span in antecedent_decisions: 159 | ante_span = antecedent_decisions[span] 160 | ana = spans_to_annotated_mentions[span] 161 | ante = spans_to_annotated_mentions[ante_span] 162 | ana.attributes["antecedent"] = ante 163 | else: 164 | doc = documents.Document( 165 | name, 166 | sentences, 167 | {}) 168 | 169 | return doc 170 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/README.txt: -------------------------------------------------------------------------------- 1 | NAME 2 | CorScorer: Perl package for scoring coreference resolution systems 3 | using different metrics. 4 | 5 | 6 | VERSION 7 | v8.01 -- reference implementations of MUC, B-cubed, CEAF and BLANC metrics. 8 | 9 | 10 | CHANGES SINCE v8.0 11 | - fixed a bug that crashed the BLANC scorer when a duplicate singleton 12 | mention was present in the response. 13 | 14 | INSTALLATION 15 | Requirements: 16 | 1. Perl: downloadable from http://perl.org 17 | 2. Algorithm-Munkres: included in this package and downloadable 18 | from CPAN http://search.cpan.org/~tpederse/Algorithm-Munkres-0.08 19 | 20 | USE 21 | This package is distributed with two scripts to execute the scorer from 22 | the command line. 
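  A typical invocation (illustrative; key and response files are in the
  CoNLL-2011/2012 format described under INPUT below) is:

      perl scorer.pl muc key_file response_file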
23 | 24 | Windows (tm): scorer.bat 25 | Linux: scorer.pl 26 | 27 | 28 | SYNOPSIS 29 | use CorScorer; 30 | 31 | $metric = 'ceafm'; 32 | 33 | # Scores the whole dataset 34 | &CorScorer::Score($metric, $keys_file, $response_file); 35 | 36 | # Scores one file 37 | &CorScorer::Score($metric, $keys_file, $response_file, $name); 38 | 39 | 40 | INPUT 41 | metric: the metric desired to score the results: 42 | muc: MUCScorer (Vilain et al, 1995) 43 | bcub: B-Cubed (Bagga and Baldwin, 1998) 44 | ceafm: CEAF (Luo et al., 2005) using mention-based similarity 45 | ceafe: CEAF (Luo et al., 2005) using entity-based similarity 46 | blanc: BLANC (Luo et al., 2014) BLANC metric for gold and predicted mentions 47 | all: uses all the metrics to score 48 | 49 | keys_file: file with expected coreference chains in CoNLL-2011/2012 format 50 | 51 | response_file: file with output of coreference system (CoNLL-2011/2012 format) 52 | 53 | name: [optional] the name of the document to score. If name is not 54 | given, all the documents in the dataset will be scored. If given 55 | name is "none" then all the documents are scored but only total 56 | results are shown. 57 | 58 | 59 | OUTPUT 60 | The score subroutine returns an array with four values in this order: 61 | 1) Recall numerator 62 | 2) Recall denominator 63 | 3) Precision numerator 64 | 4) Precision denominator 65 | 66 | Also recall, precision and F1 are printed in the standard output when variable 67 | $VERBOSE is not null. 68 | 69 | Final scores: 70 | Recall = recall_numerator / recall_denominator 71 | Precision = precision_numerator / precision_denominator 72 | F1 = 2 * Recall * Precision / (Recall + Precision) 73 | 74 | Identification of mentions 75 | An scorer for identification of mentions (recall, precision and F1) is also included. 76 | Mentions from system response are compared with key mentions. This version performs 77 | strict mention matching as was used in the CoNLL-2011 and 2012 shared tasks. 78 | 79 | AUTHORS 80 | Emili Sapena, Universitat Politècnica de Catalunya, http://www.lsi.upc.edu/~esapena, esapena lsi.upc.edu 81 | Sameer Pradhan, sameer.pradhan childrens.harvard.edu 82 | Sebastian Martschat, sebastian.martschat h-its.org 83 | Xiaoqiang Luo, xql google.com 84 | 85 | COPYRIGHT AND LICENSE 86 | Copyright (C) 2009-2011, Emili Sapena esapena lsi.upc.edu 87 | 2011-2014, Sameer Pradhan sameer.pradhan childrens.harvard.edu 88 | 89 | This program is free software; you can redistribute it and/or modify it 90 | under the terms of the GNU General Public License as published by the 91 | Free Software Foundation; either version 2 of the License, or (at your 92 | option) any later version. This program is distributed in the hope that 93 | it will be useful, but WITHOUT ANY WARRANTY; without even the implied 94 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 95 | GNU General Public License for more details. 96 | 97 | You should have received a copy of the GNU General Public License along 98 | with this program; if not, write to the Free Software Foundation, Inc., 99 | 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
100 | 101 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/lib/Algorithm/README.Munkres: -------------------------------------------------------------------------------- 1 | NAME 2 | Algorithm-Munkres : Perl extension for Munkres' solution to 3 | classical Assignment problem for square and rectangular matrices 4 | This module extends the solution of Assignment problem for square 5 | matrices to rectangular matrices by padding zeros. Thus a rectangular 6 | matrix is converted to square matrix by padding necessary zeros. 7 | 8 | SYNOPSIS 9 | use Algorithm::Munkres; 10 | 11 | @mat = ( 12 | [2, 4, 7, 9], 13 | [3, 9, 5, 1], 14 | [8, 2, 9, 7], 15 | ); 16 | 17 | assign(\@mat,\@out_mat); 18 | 19 | Then the @out_mat array will have the output as: (0,3,1,2), 20 | where 21 | 0th element indicates that 0th row is assigned 0th column i.e value=2 22 | 1st element indicates that 1st row is assigned 3rd column i.e.value=1 23 | 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 24 | 3rd element indicates that 3rd row is assigned 2nd column.i.e.value=0 25 | 26 | DESCRIPTION 27 | Assignment Problem: Given N jobs, N workers and the time taken by 28 | each worker to complete a job then how should the assignment of a 29 | Worker to a Job be done, so as to minimize the time taken. 30 | 31 | Thus if we have 3 jobs p,q,r and 3 workers x,y,z such that: 32 | x y z 33 | p 2 4 7 34 | q 3 9 5 35 | r 8 2 9 36 | 37 | where the cell values of the above matrix give the time required 38 | for the worker(given by column name) to complete the job(given by 39 | the row name) 40 | 41 | then possible solutions are: 42 | Total 43 | 1. 2, 9, 9 20 44 | 2. 2, 2, 5 9 45 | 3. 3, 4, 9 16 46 | 4. 3, 2, 7 12 47 | 5. 8, 9, 7 24 48 | 6. 8, 4, 5 17 49 | 50 | Thus (2) is the optimal solution for the above problem. 51 | This kind of brute-force approach of solving Assignment problem 52 | quickly becomes slow and bulky as N grows, because the number of 53 | possible solution are N! and thus the task is to evaluate each 54 | and then find the optimal solution.(If N=10, number of possible 55 | solutions: 3628800 !) 56 | Munkres' gives us a solution to this problem, which is implemented 57 | in this module. 58 | 59 | This module also solves Assignment problem for rectangular matrices 60 | (M x N) by converting them to square matrices by padding zeros. ex: 61 | If input matrix is: 62 | [2, 4, 7, 9], 63 | [3, 9, 5, 1], 64 | [8, 2, 9, 7] 65 | i.e 3 x 4 then we will convert it to 4 x 4 and the modified input 66 | matrix will be: 67 | [2, 4, 7, 9], 68 | [3, 9, 5, 1], 69 | [8, 2, 9, 7], 70 | [0, 0, 0, 0] 71 | 72 | EXPORT 73 | "assign" function by default. 74 | 75 | INPUT 76 | The input matrix should be in a two dimensional array(array of 77 | array) and the 'assign' subroutine expects a reference to this 78 | array and not the complete array. 79 | eg:assign(\@inp_mat, \@out_mat); 80 | The second argument to the assign subroutine is the reference 81 | to the output array. 82 | 83 | OUTPUT 84 | The assign subroutine expects references to two arrays as its 85 | input paramenters. The second parameter is the reference to the 86 | output array. This array is populated by assign subroutine. This 87 | array is single dimensional Nx1 matrix. 
88 | For above example the output array returned will be: 89 | (0, 90 | 2, 91 | 1) 92 | 93 | where 94 | 0th element indicates that 0th row is assigned 0th column i.e value=2 95 | 1st element indicates that 1st row is assigned 2nd column i.e.value=5 96 | 2nd element indicates that 2nd row is assigned 1st column.i.e.value=2 97 | 98 | SEE ALSO 99 | 1. http://216.249.163.93/bob.pilgrim/445/munkres.html 100 | 101 | 2. Munkres, J. Algorithms for the assignment and transportation 102 | Problems. J. Siam 5 (Mar. 1957), 32-38 103 | 104 | 3. François Bourgeois and Jean-Claude Lassalle. 1971. 105 | An extension of the Munkres algorithm for the assignment 106 | problem to rectangular matrices. 107 | Communication ACM, 14(12):802-804 108 | 109 | AUTHOR 110 | Anagha Kulkarni, University of Minnesota Duluth 111 | kulka020 d.umn.edu 112 | 113 | Ted Pedersen, University of Minnesota Duluth 114 | tpederse d.umn.edu 115 | 116 | COPYRIGHT AND LICENSE 117 | Copyright (C) 2007-2008, Ted Pedersen and Anagha Kulkarni 118 | 119 | This program is free software; you can redistribute it and/or modify it 120 | under the terms of the GNU General Public License as published by the 121 | Free Software Foundation; either version 2 of the License, or (at your 122 | option) any later version. This program is distributed in the hope that 123 | it will be useful, but WITHOUT ANY WARRANTY; without even the implied 124 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 125 | GNU General Public License for more details. 126 | 127 | You should have received a copy of the GNU General Public License along 128 | with this program; if not, write to the Free Software Foundation, Inc., 129 | 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 130 | 131 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/scorer.bat: -------------------------------------------------------------------------------- 1 | @rem = '--*-Perl-*-- 2 | @echo off 3 | if "%OS%" == "Windows_NT" goto WinNT 4 | perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9 5 | goto endofperl 6 | :WinNT 7 | perl -x -S %0 %* 8 | if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl 9 | if %errorlevel% == 9009 echo You do not have Perl in your PATH. 10 | if errorlevel 1 goto script_failed_so_exit_with_non_zero_val 2>nul 11 | goto endofperl 12 | @rem '; 13 | #!perl 14 | #line 15 15 | 16 | BEGIN { 17 | $d = $0; 18 | $d =~ s/\/[^\/][^\/]*$//g; 19 | push(@INC, $d."/lib"); 20 | } 21 | 22 | use strict; 23 | use CorScorer; 24 | 25 | if (@ARGV < 3) { 26 | print q| 27 | use: scorer.bat [name] 28 | 29 | metric: the metric desired to score the results: 30 | muc: MUCScorer (Vilain et al, 1995) 31 | bcub: B-Cubed (Bagga and Baldwin, 1998) 32 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity 33 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity 34 | all: uses all the metrics to score 35 | 36 | keys_file: file with expected coreference chains in SemEval format 37 | 38 | response_file: file with output of coreference system (SemEval format) 39 | 40 | name: [optional] the name of the document to score. If name is not 41 | given, all the documents in the dataset will be scored. If given 42 | name is "none" then all the documents are scored but only total 43 | results are shown. 
44 | 45 | |; 46 | exit; 47 | } 48 | 49 | my $metric = shift (@ARGV); 50 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|all)/i) { 51 | print "Invalid metric\n"; 52 | exit; 53 | } 54 | 55 | 56 | if ($metric eq 'all') { 57 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe') { 58 | print "\nMETRIC $m:\n"; 59 | &CorScorer::Score( $m, @ARGV ); 60 | } 61 | } 62 | else { 63 | &CorScorer::Score( $metric, @ARGV ); 64 | } 65 | 66 | __END__ 67 | :endofperl 68 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/scorer.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | BEGIN { 4 | $d = $0; 5 | $d =~ s/\/[^\/][^\/]*$//g; 6 | 7 | if ($d eq $0) { 8 | unshift(@INC, "lib"); 9 | } 10 | else { 11 | unshift(@INC, $d . "/lib"); 12 | } 13 | } 14 | 15 | use strict; 16 | use CorScorer; 17 | 18 | if (@ARGV < 3) { 19 | print q| 20 | use: scorer.pl [name] 21 | 22 | metric: the metric desired to score the results: 23 | muc: MUCScorer (Vilain et al, 1995) 24 | bcub: B-Cubed (Bagga and Baldwin, 1998) 25 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity 26 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity 27 | blanc: BLANC 28 | all: uses all the metrics to score 29 | 30 | keys_file: file with expected coreference chains in SemEval format 31 | 32 | response_file: file with output of coreference system (SemEval format) 33 | 34 | name: [optional] the name of the document to score. If name is not 35 | given, all the documents in the dataset will be scored. If given 36 | name is "none" then all the documents are scored but only total 37 | results are shown. 38 | 39 | |; 40 | exit; 41 | } 42 | 43 | my $metric = shift(@ARGV); 44 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|blanc|all)/i) { 45 | print "Invalid metric\n"; 46 | exit; 47 | } 48 | 49 | if ($metric eq 'all') { 50 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc') { 51 | print "\nMETRIC $m:\n"; 52 | &CorScorer::Score($m, @ARGV); 53 | } 54 | } 55 | else { 56 | &CorScorer::Score($metric, @ARGV); 57 | } 58 | 59 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/CorefMetricTest.pm: -------------------------------------------------------------------------------- 1 | package CorefMetricTest; 2 | use strict; 3 | use warnings; 4 | use Exporter; 5 | 6 | our @ISA= qw(Exporter); 7 | our @EXPORT = qw(ComputeScoreFromCounts DiffExpectedAndActual); 8 | 9 | ################################################################################ 10 | # Compute recall, precision and F1. 11 | # 12 | # Input: (numerator_counts_for_recall, denominator_counts_for_recall, 13 | # numerator_counts_for_precision, denominator_counts_for_precision) 14 | # Output: (recall, precision, F1) 15 | ################################################################################ 16 | sub ComputeScoreFromCounts { 17 | # The first 4 are also coref link counts when using BLANC. 18 | my ($recall_numerator, $recall_denominator, 19 | $precision_numerator, $precision_denominator, @noncoref_counts) = @_; 20 | # The coref recall, precision, and F1 when using BLANC. 
21 | my ($recall, $precision, $F1) = 22 | RPFFromCounts($recall_numerator, $recall_denominator, 23 | $precision_numerator, $precision_denominator); 24 | 25 | # BLANC: @noncoref_counts= 26 | # (noncoref_numerator_recall, noncoref_denominator_recall, 27 | # noncoref_numerator_precision, noncoref_denominator_precision) 28 | if (scalar(@noncoref_counts) == 4) { 29 | ($recall, $precision, $F1) = CorScorer::ComputeBLANCFromCounts( 30 | $recall_numerator, $recall_denominator, $precision_denominator, 31 | $noncoref_counts[0], $noncoref_counts[1], $noncoref_counts[3]); 32 | } 33 | $recall = ($recall < 0) ? 0 : $recall; 34 | $precision = ($precision < 0) ? 0 : $precision; 35 | $F1 = ($F1 < 0) ? 0 : $F1; 36 | return ($recall, $precision, $F1); 37 | } 38 | 39 | sub RPFFromCounts 40 | { 41 | my ($recall_numerator, $recall_denominator, 42 | $precision_numerator, $precision_denominator, @nonCorefCounts) = @_; 43 | my ($recall, $precision, $F1) = (-1, -1, 0); 44 | if ($recall_denominator > 0) { 45 | $recall = $recall_numerator / $recall_denominator; 46 | } 47 | if ($precision_denominator > 0) { 48 | $precision = $precision_numerator / $precision_denominator; 49 | } 50 | 51 | if (($recall + $precision) > 0) { 52 | $F1 = 2 * $recall * $precision / ($recall + $precision); 53 | } 54 | 55 | return ($recall, $precision, $F1); 56 | } 57 | 58 | # deprecated -- see CorScorer::ComputeBLANCFromCounts(). 59 | sub ComputeBLANCRPF 60 | { 61 | my ($coref_recall, $coref_precision, $coref_F1, 62 | $noncoref_recall, $noncoref_precision, $noncoref_F1) = @_; 63 | 64 | my ($recall, $precision, $F1); 65 | 66 | if ($coref_recall < 0 && $noncoref_recall < 0) { 67 | # no key mention. 68 | $recall = $precision = $F1 = 0; 69 | } elsif ($coref_recall < 0) { 70 | # key: all links are non-coref (mentions are all singltons). 71 | $recall = $noncoref_recall; 72 | $precision = ($noncoref_precision < 0) ? 0 : $noncoref_precision; 73 | $F1 = $noncoref_F1; 74 | } elsif ($noncoref_recall < 0) { 75 | # key: all links are coref (all mentions are in one entity). 76 | $recall = $coref_recall; 77 | $precision = ($coref_precision < 0) ? 0 : $coref_precision; 78 | $F1 = $coref_F1; 79 | } else { 80 | #key contains both coref and non-coref links. 81 | if ($coref_precision < 0 && $noncoref_precision < 0) { 82 | # no response. 83 | $recall = $precision = $F1 = 0; 84 | } else { 85 | if ($coref_precision < 0) { 86 | # response: all links are non-coref, or response mentions are all 87 | # singletons. 88 | $coref_precision = 0; 89 | } elsif ($noncoref_precision < 0) { 90 | # response: all links are coref, or all mentions are in one entity. 91 | $noncoref_precision = 0; 92 | } 93 | $recall = ($coref_recall + $noncoref_recall)/2; 94 | $precision = ($coref_precision + $noncoref_precision)/2; 95 | $F1 = ($coref_F1 + $noncoref_F1)/2; 96 | } 97 | } 98 | 99 | return ($recall, $precision, $F1); 100 | } 101 | 102 | ############################################################################## 103 | # Compute the sum of the duifference between the expected recall, precision, 104 | # F1 and the actual one. 
105 | ############################################################################## 106 | sub DiffExpectedAndActual { 107 | my ($expected, $actual) = @_; 108 | if (scalar(@$expected) != scalar(@$actual)) { 109 | print STDERR "Expected and actual have diff dimensions: \n"; 110 | print STDERR " Expected: ", join(" ", @$expected), "\n"; 111 | print STDERR " Actual: ", join(" ", @$actual), "\n"; 112 | return 1.0e5; 113 | } 114 | my $sum = 0.0; 115 | my $i = 0; 116 | foreach my $e (@$expected) { 117 | $sum += abs($e - $actual->[$i]); 118 | ++$i; 119 | } 120 | return $sum; 121 | } 122 | 123 | 1; 124 | 125 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-10.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 x - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 z - 17 | test2 0 5 e (4) 18 | test2 0 6 y - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-11.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 x - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 z - 17 | test2 0 5 e (0) 18 | test2 0 6 y - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-12.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 1) 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk (2) 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (3) 13 | test2 0 1 x - 14 | test2 0 2 d1 (4 15 | test2 0 3 d2 4) 16 | test2 0 4 z - 17 | test2 0 5 e (5) 18 | test2 0 6 y - 19 | test2 0 7 f1 (6) 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-13.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 0) 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk (0) 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 x - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 z - 17 | test2 0 5 e (0) 18 | test2 0 6 y - 19 | test2 0 7 f1 (0) 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 - 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 - 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c - 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 y (2) 17 | test2 0 5 e (2) 18 | test2 0 6 z (3) 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 x (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 (1 7 | test1 0 5 b3 1) 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 (3 7 | test1 0 5 b3 3) 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-7.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-8.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(3 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 3)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-9.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1(3(3(3(3(3(3(3(3(3(3 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 3)3)3)3)3)3)3)3)3)3)1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 x (1) 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 z (3) 17 | test2 0 5 e - 18 | test2 0 6 y (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . 
- 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (2 15 | test2 0 3 d2 2) 16 | test2 0 4 jnk - 17 | test2 0 5 e (2) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-B-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10043 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10043) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 (10043 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 10043) 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 - 72 | nw/xinhua/00/chtb_0009 - 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-B.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (10043 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 10043) 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | 
nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10054 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10054) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 - 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 - 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 - 72 | nw/xinhua/00/chtb_0009 - 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-C-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10043 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10043) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | 
nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 (10043 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 10043) 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 (10060) 72 | nw/xinhua/00/chtb_0009 (10060) 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-C.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (10043 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 - 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 10043) 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | nw/xinhua/00/chtb_0009 - 31 | nw/xinhua/00/chtb_0009 (10054 32 | nw/xinhua/00/chtb_0009 - 33 | nw/xinhua/00/chtb_0009 10054) 34 | nw/xinhua/00/chtb_0009 - 35 | nw/xinhua/00/chtb_0009 - 36 | nw/xinhua/00/chtb_0009 - 37 | nw/xinhua/00/chtb_0009 - 38 | nw/xinhua/00/chtb_0009 - 39 | nw/xinhua/00/chtb_0009 - 40 | nw/xinhua/00/chtb_0009 - 41 | nw/xinhua/00/chtb_0009 - 42 | nw/xinhua/00/chtb_0009 - 43 | nw/xinhua/00/chtb_0009 - 44 | nw/xinhua/00/chtb_0009 - 45 | nw/xinhua/00/chtb_0009 - 46 | nw/xinhua/00/chtb_0009 - 47 | nw/xinhua/00/chtb_0009 - 48 | nw/xinhua/00/chtb_0009 - 49 | nw/xinhua/00/chtb_0009 (10043) 50 | nw/xinhua/00/chtb_0009 - 51 | nw/xinhua/00/chtb_0009 - 52 | nw/xinhua/00/chtb_0009 - 53 | nw/xinhua/00/chtb_0009 - 54 | nw/xinhua/00/chtb_0009 - 55 | nw/xinhua/00/chtb_0009 - 56 | nw/xinhua/00/chtb_0009 - 57 | nw/xinhua/00/chtb_0009 - 58 | nw/xinhua/00/chtb_0009 - 59 | nw/xinhua/00/chtb_0009 - 60 | nw/xinhua/00/chtb_0009 - 61 | nw/xinhua/00/chtb_0009 - 62 | nw/xinhua/00/chtb_0009 - 63 | nw/xinhua/00/chtb_0009 - 64 | nw/xinhua/00/chtb_0009 (10054 65 | nw/xinhua/00/chtb_0009 10054) 66 | nw/xinhua/00/chtb_0009 - 67 | nw/xinhua/00/chtb_0009 - 68 | nw/xinhua/00/chtb_0009 (10054) 69 | nw/xinhua/00/chtb_0009 - 70 | nw/xinhua/00/chtb_0009 - 71 | nw/xinhua/00/chtb_0009 (10060) 72 | nw/xinhua/00/chtb_0009 (10060) 73 | 74 | #end document 75 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-D-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | 
nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-D.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-E-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (1) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (1) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (1) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (1) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (1) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-E.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | 
nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (2) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (3) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 (3) 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 (3) 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 (3) 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 (3) 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-F-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-F.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-G-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | 
nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-G.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-H-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-H.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 
12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-I-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-I.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-J-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 - 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | 
nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-J.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 - 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 - 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-K-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (2) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (2) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 (3) 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-K.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 - 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (1) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (1) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 - 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (1) 16 | 
nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (1) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 (1) 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-L-1.response: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (2) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 - 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (3) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (3) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 (3) 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-L.key: -------------------------------------------------------------------------------- 1 | #begin document (nw/xinhua/00/chtb_0009); part 000 2 | nw/xinhua/00/chtb_0009 - 3 | nw/xinhua/00/chtb_0009 (1) 4 | nw/xinhua/00/chtb_0009 - 5 | nw/xinhua/00/chtb_0009 (1) 6 | nw/xinhua/00/chtb_0009 - 7 | nw/xinhua/00/chtb_0009 (1) 8 | nw/xinhua/00/chtb_0009 - 9 | nw/xinhua/00/chtb_0009 (2) 10 | nw/xinhua/00/chtb_0009 - 11 | nw/xinhua/00/chtb_0009 (2) 12 | nw/xinhua/00/chtb_0009 - 13 | nw/xinhua/00/chtb_0009 (2) 14 | nw/xinhua/00/chtb_0009 - 15 | nw/xinhua/00/chtb_0009 (2) 16 | nw/xinhua/00/chtb_0009 - 17 | nw/xinhua/00/chtb_0009 - 18 | nw/xinhua/00/chtb_0009 - 19 | nw/xinhua/00/chtb_0009 - 20 | nw/xinhua/00/chtb_0009 - 21 | nw/xinhua/00/chtb_0009 - 22 | nw/xinhua/00/chtb_0009 - 23 | nw/xinhua/00/chtb_0009 - 24 | nw/xinhua/00/chtb_0009 - 25 | nw/xinhua/00/chtb_0009 - 26 | nw/xinhua/00/chtb_0009 - 27 | nw/xinhua/00/chtb_0009 - 28 | nw/xinhua/00/chtb_0009 - 29 | nw/xinhua/00/chtb_0009 - 30 | 31 | #end document 32 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (1 15 | test2 0 3 d2 1) 16 | test2 0 4 jnk - 17 | test2 0 5 e (1) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk (0) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (0) 17 | test2 0 5 e - 18 | test2 0 6 jnk (0) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk (3) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (4) 17 | test2 0 5 e - 18 | test2 0 6 jnk (5) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk (1) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (1) 17 | test2 0 5 e - 18 | test2 0 6 jnk (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-1.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-2.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (0 15 | test2 0 3 d2 0) 16 | test2 0 4 jnk - 17 | test2 0 5 e (0) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (0 20 | test2 0 8 f2 - 21 | test2 0 9 f3 0) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-3.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (1 15 | test2 0 3 d2 1) 16 | test2 0 4 jnk - 17 | test2 0 5 e (1) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (2 20 | test2 0 8 f2 - 21 | test2 0 9 f3 2) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-4.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk (3) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (4) 17 | test2 0 5 e - 18 | test2 0 6 jnk (5) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . 
- 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-5.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (0) 13 | test2 0 1 jnk (0) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (0) 17 | test2 0 5 e - 18 | test2 0 6 jnk (0) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-6.response: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (0 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 0) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (1) 13 | test2 0 1 jnk (1) 14 | test2 0 2 d1 - 15 | test2 0 3 d2 - 16 | test2 0 4 jnk (1) 17 | test2 0 5 e - 18 | test2 0 6 jnk (2) 19 | test2 0 7 f1 - 20 | test2 0 8 f2 - 21 | test2 0 9 f3 - 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N.key: -------------------------------------------------------------------------------- 1 | #begin document (LuoTestCase); 2 | test1 0 0 a1 (0 3 | test1 0 1 a2 0) 4 | test1 0 2 junk - 5 | test1 0 3 b1 (1 6 | test1 0 4 b2 - 7 | test1 0 5 b3 - 8 | test1 0 6 b4 1) 9 | test1 0 7 jnk - 10 | test1 0 8 . - 11 | 12 | test2 0 0 c (2) 13 | test2 0 1 jnk - 14 | test2 0 2 d1 (3 15 | test2 0 3 d2 3) 16 | test2 0 4 jnk - 17 | test2 0 5 e (4) 18 | test2 0 6 jnk - 19 | test2 0 7 f1 (5 20 | test2 0 8 f2 - 21 | test2 0 9 f3 5) 22 | test2 0 10 . - 23 | #end document 24 | -------------------------------------------------------------------------------- /cort/reference-coreference-scorers/v8.01/test/test.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | BEGIN { 4 | $d = $0; 5 | $d =~ s/\/[^\/][^\/]*$//g; 6 | push(@INC, $d); 7 | push(@INC, $d . "/../lib"); 8 | } 9 | 10 | use strict; 11 | use CorScorer; 12 | use CorefMetricTest; 13 | use CorefMetricTestConfig; 14 | 15 | my $error_tolerance = 1.e-4; 16 | my $script_dir = $0; 17 | $script_dir =~ s/\/[^\/][^\/]*$//g; 18 | 19 | foreach my $test_case (@CorefMetricTestConfig::TestCases) { 20 | my $id = $test_case->{'id'}; 21 | my @key_response_files = ($script_dir . "/" . $test_case->{'key_file'}, 22 | $script_dir . "/" . $test_case->{'response_file'}); 23 | print "\nTesting case ($id): keyFile=", $key_response_files[0], 24 | " responseFile=", $key_response_files[1], "\n"; 25 | my $expected_metrics = $test_case->{'expected_metrics'}; 26 | foreach my $metric_name (sort keys %$expected_metrics) { 27 | my $expected_values = $expected_metrics->{$metric_name}; 28 | *::SAVED_STDOUT = *STDOUT; 29 | *STDOUT = *::SUPRRES_STDOUT; 30 | my @actual_counts = &CorScorer::Score($metric_name, @key_response_files); 31 | # Compute R,P,and F1 from raw counts. 
32 | my @actual_values = CorefMetricTest::ComputeScoreFromCounts(@actual_counts); 33 | *STDOUT = *::SAVED_STDOUT; 34 | my $diff = CorefMetricTest::DiffExpectedAndActual($expected_values, \@actual_values); 35 | printf " metric: %+10s", $metric_name; 36 | if ($diff < $error_tolerance) { 37 | print " => PASS\n"; 38 | } else { 39 | print " => FAIL\n"; 40 | print " Expected (recall, prec, F1) = (", join(" ", @$expected_values), ")\n"; 41 | print " Actual (recall, prec, F1) = (", join(" ", @actual_values), ")\n"; 42 | #exit(1); 43 | } 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /cort/resources/coreferent_pairs.obj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/cort/resources/coreferent_pairs.obj -------------------------------------------------------------------------------- /cort/resources/singletons_not_cleaned.obj: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/cort/resources/singletons_not_cleaned.obj -------------------------------------------------------------------------------- /cort/test/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | -------------------------------------------------------------------------------- /cort/test/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | -------------------------------------------------------------------------------- /cort/test/analysis/test_data_structures.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import unittest 3 | 4 | from cort.analysis import data_structures 5 | from cort.core import documents 6 | from cort.core import mentions 7 | from cort.core import spans 8 | 9 | 10 | __author__ = 'smartschat' 11 | 12 | 13 | class TestCorefStructures(unittest.TestCase): 14 | def setUp(self): 15 | self.complicated_mention_example = """#begin document (test2); part 000 16 | test2 0 0 This NN (NP* - - - - - (0) 17 | test2 0 1 is NN * - - - - - - 18 | test2 0 2 just NN * - - - - - - 19 | test2 0 3 a NN * - - - - - (0|(1) 20 | test2 0 4 test NN * - - - - - 0) 21 | test2 0 5 . NN *) - - - - - - 22 | 23 | test2 0 0 It NN (NP* - - - - - (1)|(0 24 | test2 0 1 shows NN * - - - - - - 25 | test2 0 2 that NN * - - - - - (2) 26 | test2 0 3 the NN * - - - - - (2|(3 27 | test2 0 4 scorer NN * - - - - - 2)|0) 28 | test2 0 5 works NN * - - - - - 3) 29 | test2 0 6 . 
NN *) - - - - - - 30 | 31 | #end document""" 32 | 33 | self.complicated_mention_document = documents.CoNLLDocument( 34 | self.complicated_mention_example) 35 | 36 | def test_entity_graph_from_mentions(self): 37 | annotated_mentions = \ 38 | self.complicated_mention_document.annotated_mentions 39 | 40 | first_graph = data_structures.EntityGraph({ 41 | annotated_mentions[4]: [annotated_mentions[2], 42 | annotated_mentions[0]], 43 | annotated_mentions[2]: [annotated_mentions[0]] 44 | }) 45 | 46 | second_graph = data_structures.EntityGraph({ 47 | annotated_mentions[3]: [annotated_mentions[1]] 48 | }) 49 | 50 | third_graph = data_structures.EntityGraph({ 51 | annotated_mentions[6]: [annotated_mentions[5]] 52 | }) 53 | 54 | self.assertEqual( 55 | [first_graph, second_graph, third_graph], 56 | data_structures.EntityGraph.from_mentions(annotated_mentions, 57 | "annotated_set_id")) 58 | 59 | def test_entity_graph_partition(self): 60 | annotated_mentions = \ 61 | self.complicated_mention_document.annotated_mentions 62 | 63 | graph = data_structures.EntityGraph({ 64 | annotated_mentions[4]: [annotated_mentions[2], 65 | annotated_mentions[0]], 66 | annotated_mentions[2]: [annotated_mentions[0]] 67 | }) 68 | 69 | system_output = [ 70 | mentions.Mention( 71 | self.complicated_mention_document, 72 | spans.Span(0, 0), 73 | {"set_id": 0}), 74 | mentions.Mention( 75 | self.complicated_mention_document, 76 | spans.Span(2, 3), 77 | {"set_id": 1}), 78 | mentions.Mention( 79 | self.complicated_mention_document, 80 | spans.Span(6, 10), 81 | {"set_id": 0}), 82 | mentions.Mention( 83 | self.complicated_mention_document, 84 | spans.Span(5, 5), 85 | {"set_id": 0}) 86 | ] 87 | 88 | expected_edges = defaultdict(list) 89 | expected_edges[annotated_mentions[4]].append(annotated_mentions[0]) 90 | expected = data_structures.EntityGraph(expected_edges) 91 | 92 | self.assertEqual(expected, 93 | graph.partition( 94 | data_structures.EntityGraph.from_mentions( 95 | system_output, "set_id"))) 96 | 97 | 98 | if __name__ == '__main__': 99 | unittest.main() -------------------------------------------------------------------------------- /cort/test/analysis/test_error_extractors.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import unittest 3 | 4 | from cort.analysis import data_structures 5 | from cort.analysis import error_extractors 6 | from cort.analysis import spanning_tree_algorithms 7 | from cort.core import corpora 8 | from cort.core import mentions 9 | from cort.core import spans 10 | 11 | __author__ = 'smartschat' 12 | 13 | 14 | class TestErrorExtractor(unittest.TestCase): 15 | def setUp(self): 16 | self.first_cluster = [ 17 | mentions.Mention( 18 | None, 19 | spans.Span(0, 0), 20 | {"tokens": ["a"], "annotated_set_id": 0}), 21 | 22 | mentions.Mention( 23 | None, 24 | spans.Span(1, 1), 25 | {"tokens": ["b"], "annotated_set_id": 0}), 26 | 27 | mentions.Mention( 28 | None, 29 | spans.Span(2, 3), 30 | {"tokens": ["c", "d"], "annotated_set_id": 0}), 31 | 32 | mentions.Mention( 33 | None, 34 | spans.Span(4, 5), 35 | {"tokens": ["e", "f"], "annotated_set_id": 0}), 36 | 37 | mentions.Mention( 38 | None, 39 | spans.Span(5, 6), 40 | {"tokens": ["f", "g"], "annotated_set_id": 0}), 41 | 42 | mentions.Mention( 43 | None, 44 | spans.Span(7, 7), 45 | {"tokens": ["h"], "annotated_set_id": 0}), 46 | ] 47 | 48 | self.second_cluster = [ 49 | mentions.Mention( 50 | None, 51 | spans.Span(3, 4), 52 | {"tokens": ["d", "e"], "annotated_set_id": 1}), 53 | 54 | 
mentions.Mention( 55 | None, 56 | spans.Span(7, 8), 57 | {"tokens": ["h", "i"], "annotated_set_id": 1}), 58 | 59 | mentions.Mention( 60 | None, 61 | spans.Span(10, 10), 62 | {"tokens": ["k"], "annotated_set_id": 1}) 63 | ] 64 | 65 | self.system_cluster = [ 66 | mentions.Mention( 67 | None, 68 | spans.Span(0, 0), 69 | {"tokens": ["a"], "annotated_set_id": 0}), 70 | 71 | mentions.Mention( 72 | None, 73 | spans.Span(2, 3), 74 | {"tokens": ["c", "d"], "annotated_set_id": 0}), 75 | 76 | mentions.Mention( 77 | None, 78 | spans.Span(4, 5), 79 | {"tokens": ["e", "f"], "annotated_set_id": 2}), 80 | 81 | mentions.Mention( 82 | None, 83 | spans.Span(5, 6), 84 | {"tokens": ["f", "g"], "annotated_set_id": 2}), 85 | 86 | mentions.Mention( 87 | None, 88 | spans.Span(7, 7), 89 | {"tokens": ["h"], "annotated_set_id": 1}), 90 | 91 | mentions.Mention( 92 | None, 93 | spans.Span(10, 10), 94 | {"tokens": ["k"], "annotated_set_id": 1}) 95 | ] 96 | 97 | self.maxDiff = None 98 | 99 | def test_compute_errors(self): 100 | # fake document using a named tuple 101 | document = namedtuple("Document", "annotated_mentions") 102 | doc_gold = document(self.first_cluster + self.second_cluster) 103 | doc_system = document(self.system_cluster) 104 | corpus_gold = corpora.Corpus("fake gold", [doc_gold]) 105 | corpus_system = corpora.Corpus("fake system", [doc_system]) 106 | 107 | ex = error_extractors.ErrorExtractor( 108 | corpus_gold, 109 | spanning_tree_algorithms.recall_closest, 110 | spanning_tree_algorithms.precision_system_output 111 | ) 112 | 113 | ex.add_system(corpus_system) 114 | 115 | self.assertEqual( 116 | data_structures.EnhancedSet([ 117 | (self.first_cluster[1], self.first_cluster[0]), 118 | (self.first_cluster[3], self.first_cluster[2]), 119 | (self.first_cluster[5], self.first_cluster[4]), 120 | (self.second_cluster[1], self.second_cluster[0]), 121 | (self.second_cluster[2], self.second_cluster[1]), 122 | ]), 123 | ex.get_errors()["fake system"]["recall_errors"]["all"] 124 | ) 125 | 126 | if __name__ == '__main__': 127 | unittest.main() 128 | -------------------------------------------------------------------------------- /cort/test/analysis/test_spanning_tree_algorithms.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from cort.analysis import data_structures 4 | from cort.analysis import spanning_tree_algorithms 5 | from cort.core import mentions 6 | from cort.core import spans 7 | 8 | 9 | __author__ = 'smartschat' 10 | 11 | 12 | class TestSpanningTreeAlgorithms(unittest.TestCase): 13 | def setUp(self): 14 | self.gold_first_cluster = [ 15 | mentions.Mention( 16 | None, 17 | spans.Span(0, 0), 18 | {"tokens": ["a"], "type": "NOM", "annotated_set_id": 0}), 19 | 20 | mentions.Mention( 21 | None, 22 | spans.Span(1, 1), 23 | {"tokens": ["US"], "type": "NAM", "annotated_set_id": 0}), 24 | 25 | mentions.Mention( 26 | None, 27 | spans.Span(2, 3), 28 | {"tokens": ["angry", "salesman"], "type": "PRO", "annotated_set_id": 0}), 29 | 30 | mentions.Mention( 31 | None, 32 | spans.Span(4, 5), 33 | {"tokens": ["the", "rainbow"], "type": "NAM", 34 | "annotated_set_id": 0}), 35 | 36 | mentions.Mention( 37 | None, 38 | spans.Span(5, 6), 39 | {"tokens": ["and", "far"], "type": "NOM", 40 | "annotated_set_id": 0}), 41 | 42 | mentions.Mention( 43 | None, 44 | spans.Span(7, 7), 45 | {"tokens": ["neypmd"], "type": "NOM", "annotated_set_id": 0}), 46 | ] 47 | 48 | self.gold_second_cluster = [ 49 | mentions.Mention( 50 | None, 51 | spans.Span(7, 8), 52 | {"type": "NOM", 
"annotated_set_id": 1}), 53 | 54 | mentions.Mention( 55 | None, 56 | spans.Span(9, 9), 57 | {"type": "NAM", "annotated_set_id": 1}), 58 | 59 | mentions.Mention( 60 | None, 61 | spans.Span(10, 10), 62 | {"type": "PRO", "annotated_set_id": 1}), 63 | ] 64 | 65 | self.system1_mentions = [ 66 | mentions.Mention(None, spans.Span(0, 0), {"set_id": 0}), 67 | mentions.Mention(None, spans.Span(2, 3), {"set_id": 0}), 68 | mentions.Mention(None, spans.Span(4, 5), {"set_id": 2}), 69 | mentions.Mention(None, spans.Span(5, 6), {"set_id": 2}), 70 | mentions.Mention(None, spans.Span(3, 4), {"set_id": 1}), 71 | mentions.Mention(None, spans.Span(7, 8), {"set_id": 1}), 72 | ] 73 | 74 | self.system2_cluster = [ 75 | mentions.Mention( 76 | None, 77 | spans.Span(0, 0), 78 | {"tokens": ["a"], "set_id": 0}), 79 | 80 | mentions.Mention( 81 | None, 82 | spans.Span(2, 3), 83 | {"tokens": ["angry", "salesman"], "set_id": 0}), 84 | 85 | mentions.Mention( 86 | None, 87 | spans.Span(7, 8), 88 | {"tokens": ["snafu", "foo"], "set_id": 0}), 89 | 90 | mentions.Mention( 91 | None, 92 | spans.Span(9, 9), 93 | {"tokens": ["bar"], "set_id": 0}), 94 | ] 95 | self.system2_cluster[1].attributes["antecedent"] = \ 96 | self.system2_cluster[0] 97 | self.system2_cluster[2].attributes["antecedent"] = \ 98 | self.system2_cluster[0] 99 | self.system2_cluster[3].attributes["antecedent"] = \ 100 | self.system2_cluster[2] 101 | 102 | self.maxDiff = None 103 | 104 | def test_recall_closest(self): 105 | gold_graph = data_structures.EntityGraph.from_mentions( 106 | self.gold_first_cluster, "annotated_set_id")[0] 107 | 108 | spanning_tree_edges = [ 109 | (self.gold_first_cluster[1], self.gold_first_cluster[0]), 110 | (self.gold_first_cluster[2], self.gold_first_cluster[0]), 111 | (self.gold_first_cluster[3], self.gold_first_cluster[2]), 112 | (self.gold_first_cluster[4], self.gold_first_cluster[3]), 113 | (self.gold_first_cluster[5], self.gold_first_cluster[4]) 114 | ] 115 | 116 | self.assertEqual( 117 | spanning_tree_edges, 118 | spanning_tree_algorithms.recall_closest( 119 | gold_graph, 120 | gold_graph.partition( 121 | data_structures.EntityGraph.from_mentions( 122 | self.system1_mentions, "set_id")))) 123 | 124 | def test_recall_type(self): 125 | gold_graph = data_structures.EntityGraph.from_mentions( 126 | self.gold_first_cluster, "annotated_set_id")[0] 127 | 128 | spanning_tree_edges = [ 129 | (self.gold_first_cluster[1], self.gold_first_cluster[0]), 130 | (self.gold_first_cluster[2], self.gold_first_cluster[0]), 131 | (self.gold_first_cluster[3], self.gold_first_cluster[1]), 132 | (self.gold_first_cluster[4], self.gold_first_cluster[3]), 133 | (self.gold_first_cluster[5], self.gold_first_cluster[3]) 134 | ] 135 | 136 | self.assertEqual( 137 | spanning_tree_edges, 138 | spanning_tree_algorithms.recall_accessibility( 139 | gold_graph, 140 | gold_graph.partition( 141 | data_structures.EntityGraph.from_mentions( 142 | self.system1_mentions, "set_id")))) 143 | 144 | def test_precision_system_output(self): 145 | gold_graph = data_structures.EntityGraph.from_mentions( 146 | self.system2_cluster, "set_id")[0] 147 | 148 | spanning_tree_edges = [ 149 | (self.system2_cluster[1], self.system2_cluster[0]), 150 | (self.system2_cluster[2], self.system2_cluster[0]), 151 | (self.system2_cluster[3], self.system2_cluster[2]) 152 | ] 153 | 154 | self.assertEqual( 155 | spanning_tree_edges, 156 | spanning_tree_algorithms.precision_system_output( 157 | gold_graph, 158 | gold_graph.partition( 159 | data_structures.EntityGraph.from_mentions( 160 | 
self.gold_first_cluster, "annotated_set_id")))) 161 | 162 | 163 | if __name__ == '__main__': 164 | unittest.main() 165 | -------------------------------------------------------------------------------- /cort/test/core/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | -------------------------------------------------------------------------------- /cort/test/core/test_corpora.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from cort.core.corpora import Corpus 5 | 6 | 7 | __author__ = 'smartschat' 8 | 9 | 10 | class TestCorpora(unittest.TestCase): 11 | def setUp(self): 12 | directory = os.path.dirname(os.path.realpath(__file__)) + "/resources/" 13 | self.input_data = open(directory + "input.conll", "r") 14 | 15 | def test_conll_reader(self): 16 | corpus = Corpus.from_file("test", self.input_data) 17 | self.assertEqual(5, len(corpus.documents)) 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /cort/test/core/test_external_data.py: -------------------------------------------------------------------------------- 1 | from cort.core.external_data import GenderData 2 | 3 | __author__ = 'smartschat' 4 | 5 | import unittest 6 | 7 | 8 | class TestGenderData(unittest.TestCase): 9 | def setUp(self): 10 | self.gender_data = GenderData.get_instance() 11 | 12 | def test_look_up(self): 13 | self.assertEqual("NEUTRAL", 14 | self.gender_data.look_up({"tokens": ["snafu"]})) 15 | 16 | self.assertEqual("FEMALE", 17 | self.gender_data.look_up( 18 | {"tokens": ["Barbara", "Bush"], 19 | "head": ["Barbara", "Bush"]})) 20 | 21 | self.assertEqual("MALE", 22 | self.gender_data.look_up({ 23 | "tokens": ["Footballer", "Zidane"], 24 | "head": ["Zidane"]})) 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /cort/test/core/test_spans.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from cort.core.spans import Span 4 | 5 | 6 | __author__ = 'smartschat' 7 | 8 | 9 | class TestSpan(unittest.TestCase): 10 | def test_span(self): 11 | span = Span(0, 1) 12 | self.assertEqual(0, span.begin) 13 | self.assertEqual(1, span.end) 14 | 15 | def test_parse(self): 16 | self.assertEqual(Span(10, 12), Span.parse("(10, 12)")) 17 | self.assertEqual(Span(10, 12), Span.parse("(10,12)")) 18 | 19 | if __name__ == '__main__': 20 | unittest.main() 21 | -------------------------------------------------------------------------------- /cort/test/core/test_util.py: -------------------------------------------------------------------------------- 1 | from cort.core.util import clean_via_pos 2 | 3 | __author__ = 'smartschat' 4 | 5 | import unittest 6 | 7 | 8 | class TestUtil(unittest.TestCase): 9 | def test_clean_via_pos(self): 10 | self.assertEqual( 11 | ["newly-elect", "leader", "wife"], 12 | clean_via_pos( 13 | ["the", "newly-elect", "leader", "'s", "wife"], 14 | ["DT", "JJ", "NN", "POS", "NN"])) 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /cort/test/multigraph/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'martscsn' 2 | 
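3 | # Editor's note (hedged, not part of the original file): the unittest 4 | # modules above follow standard discovery naming, so the whole suite can 5 | # be run with e.g. `python -m unittest discover cort/test`.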
-------------------------------------------------------------------------------- /cort/util/__init__.py: -------------------------------------------------------------------------------- 1 | "Utility functions." 2 | 3 | __author__ = 'sebastian' 4 | -------------------------------------------------------------------------------- /cort/util/import_helper.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import pyximport 3 | pyximport.install(setup_args={"include_dirs": numpy.get_include()}) 4 | 5 | import importlib 6 | import inspect 7 | 8 | 9 | __author__ = 'martscsn' 10 | 11 | 12 | def import_from_path(name): 13 | splitted = name.split(".") 14 | package_name = ".".join(splitted[:-1]) 15 | cls = splitted[-1] 16 | 17 | package = importlib.import_module(package_name) 18 | 19 | imported = getattr(package, cls) 20 | 21 | return imported 22 | 23 | 24 | def get_features(filename): 25 | mention_features = [] 26 | pairwise_features = [] 27 | 28 | for line in open(filename).readlines(): 29 | feature = import_from_path(line.strip()) 30 | number_of_arguments = len(inspect.getargspec(feature)[0]) 31 | 32 | if number_of_arguments == 1: 33 | mention_features.append(feature) 34 | elif number_of_arguments == 2: 35 | pairwise_features.append(feature) 36 | else: 37 | raise ValueError("Features must have one or two arguments, " 38 | "feature " + line.strip() + " has " + 39 | str(number_of_arguments) + " arguments.") 40 | 41 | return mention_features, pairwise_features 42 | -------------------------------------------------------------------------------- /plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/plot.png -------------------------------------------------------------------------------- /scripts/acl15demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | import io 5 | import logging 6 | import pickle 7 | import numpy 8 | 9 | import pyximport 10 | pyximport.install(setup_args={"include_dirs": numpy.get_include()}) 11 | 12 | from cort.preprocessing import pipeline 13 | from cort.core import mention_extractor 14 | from cort.coreference.approaches import mention_ranking 15 | from cort.coreference import cost_functions, clusterer 16 | from cort.coreference import experiments 17 | from cort.coreference import features 18 | from cort.coreference import instance_extractors 19 | from cort.core import corpora 20 | from cort.analysis import visualization, error_extractors, spanning_tree_algorithms 21 | 22 | try: 23 | import tkinter as tki 24 | except ImportError: 25 | import Tkinter as tki 26 | 27 | __author__ = 'smartschat' 28 | 29 | logging.basicConfig(level=logging.INFO, 30 | format='%(asctime)s %(levelname)s %(''message)s') 31 | 32 | class LiveDemo(): 33 | def __init__(self): 34 | mention_features = [ 35 | features.fine_type, 36 | features.gender, 37 | features.number, 38 | features.sem_class, 39 | features.deprel, 40 | features.head_ner, 41 | features.length, 42 | features.head, 43 | features.first, 44 | features.last, 45 | features.preceding_token, 46 | features.next_token, 47 | features.governor, 48 | features.ancestry 49 | ] 50 | 51 | pairwise_features = [ 52 | features.exact_match, 53 | features.head_match, 54 | features.same_speaker, 55 | features.alias, 56 | features.sentence_distance, 57 | features.embedding, 58 
| features.modifier, 59 | features.tokens_contained, 60 | features.head_contained, 61 | features.token_distance 62 | ] 63 | 64 | self.extractor = instance_extractors.InstanceExtractor( 65 | mention_ranking.extract_substructures, 66 | mention_features, 67 | pairwise_features, 68 | cost_functions.null_cost 69 | ) 70 | 71 | logging.info("Loading model.") 72 | 73 | priors, weights = pickle.load(open("latent-model-train.obj", "rb")) 74 | 75 | self.perceptron = mention_ranking.RankingPerceptron( 76 | priors=priors, 77 | weights=weights, 78 | cost_scaling=0 79 | ) 80 | 81 | logging.info("Loading CoreNLP models.") 82 | self.p = pipeline.Pipeline( 83 | "/home/sebastian/Downloads/stanford-corenlp-full-2015-04-20") 84 | 85 | self.root = tki.Tk() 86 | self.root.title("cort Demo") 87 | 88 | # create a Frame for the Text and Scrollbar 89 | self.txt_frm = tki.Frame(self.root, width=400, height=200) 90 | self.txt_frm.pack(fill="both", expand=True) 91 | 92 | # ensure a consistent GUI size 93 | self.txt_frm.grid_propagate(False) 94 | 95 | # implement stretchability 96 | self.txt_frm.grid_rowconfigure(0, weight=1) 97 | self.txt_frm.grid_columnconfigure(0, weight=1) 98 | 99 | # create a Text widget 100 | self.txt = tki.Text(self.txt_frm, borderwidth=3, relief="sunken") 101 | self.txt.config(font=("consolas", 12), undo=True, wrap='word') 102 | self.txt.grid(row=0, column=0, sticky="nsew", padx=2, pady=2) 103 | 104 | # create a Scrollbar and associate it with txt 105 | scrollb = tki.Scrollbar(self.txt_frm, command=self.txt.yview) 106 | scrollb.grid(row=0, column=1, sticky='nsew') 107 | self.txt['yscrollcommand'] = scrollb.set 108 | 109 | self.button = tki.Button(self.root, text='Resolve Coreference', 110 | command=self.do_coreference) 111 | 112 | self.button.pack() 113 | 114 | def run(self): 115 | self.root.mainloop() 116 | 117 | def do_coreference(self): 118 | testing_corpus = corpora.Corpus("input", [self.p.run_on_doc( 119 | io.StringIO(self.txt.get("0.0", tki.END)), "input")]) 120 | 121 | logging.info("Extracting system mentions.") 122 | for doc in testing_corpus: 123 | doc.system_mentions = mention_extractor.extract_system_mentions(doc) 124 | 125 | mention_entity_mapping, antecedent_mapping = experiments.predict( 126 | testing_corpus, 127 | self.extractor, 128 | self.perceptron, 129 | clusterer.all_ante 130 | ) 131 | 132 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping) 133 | 134 | logging.info("Visualize") 135 | 136 | for doc in testing_corpus: 137 | max_id = 0 138 | 139 | for mention in doc.system_mentions[1:]: 140 | set_id = mention.attributes["set_id"] 141 | 142 | if set_id: 143 | max_id = max(set_id, max_id) 144 | 145 | max_id += 1 146 | 147 | doc.annotated_mentions = [] 148 | 149 | for i, mention in enumerate(doc.system_mentions[1:]): 150 | if mention.attributes["set_id"]: 151 | mention.attributes["annotated_set_id"] = mention.attributes[ 152 | "set_id"] 153 | else: 154 | mention.attributes["annotated_set_id"] = max_id + i 155 | doc.annotated_mentions.append(mention) 156 | 157 | ex = error_extractors.ErrorExtractor(testing_corpus, 158 | spanning_tree_algorithms.recall_accessibility, 159 | spanning_tree_algorithms.precision_system_output) 160 | 161 | ex.add_system(testing_corpus) 162 | 163 | decisions = ex.get_errors() 164 | 165 | visualizer = visualization.Visualizer(decisions, "input", 166 | for_raw_input=True) 167 | 168 | visualizer.run() 169 | 170 | demo = LiveDemo() 171 | 172 | demo.run() 173 | 
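174 | # Editor's note (a hedged sketch, not part of the original script): running 175 | # this demo assumes a trained model pickled as "latent-model-train.obj" in the 176 | # working directory and a CoreNLP distribution at the path hard-coded in 177 | # LiveDemo.__init__ above, e.g.: 178 | # 179 | #     $ python scripts/acl15demo.py 180 | # 181 | # Text typed into the Tk window is preprocessed by the CoreNLP pipeline, 182 | # resolved by the ranking perceptron, and rendered via visualization.Visualizer.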
-------------------------------------------------------------------------------- /scripts/naacl15-demo.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | 4 | from cort.analysis import error_extractors 5 | from cort.analysis import plotting 6 | from cort.analysis import spanning_tree_algorithms 7 | from cort.core import corpora 8 | 9 | 10 | __author__ = 'smartschat' 11 | 12 | 13 | # read in corpora 14 | reference = corpora.Corpus.from_file("reference", codecs.open("dev.gold", "r", 15 | "utf-8")) 16 | pair = corpora.Corpus.from_file("pair", codecs.open("pair-dev.out", "r", "utf-8")) 17 | tree = corpora.Corpus.from_file("tree", codecs.open("tree-dev.out", "r", "utf-8")) 18 | 19 | # optional -- not needed when you only want to compute recall errors 20 | pair.read_antecedents(open('pair-dev.antecedents')) 21 | tree.read_antecedents(open('tree-dev.antecedents')) 22 | 23 | # define error extractor 24 | extractor = error_extractors.ErrorExtractor( 25 | reference, 26 | spanning_tree_algorithms.recall_accessibility, 27 | spanning_tree_algorithms.precision_system_output 28 | ) 29 | 30 | # extract errors 31 | extractor.add_system(pair) 32 | extractor.add_system(tree) 33 | 34 | errors = extractor.get_errors() 35 | 36 | # categorize by mention type of anaphor 37 | by_type = errors.categorize( 38 | lambda err: err[0].attributes["type"] 39 | ) 40 | 41 | 42 | # visualize 43 | by_type.visualize("pair") 44 | 45 | # filter by distance 46 | by_type_filtered = by_type.filter( 47 | lambda err: err[0].attributes["sentence_id"] - err[1].attributes[ 48 | "sentence_id"] <= 3 49 | ) 50 | 51 | # plot 52 | pair_errs = by_type_filtered["pair"]["recall_errors"]["all"] 53 | tree_errs = by_type_filtered["tree"]["recall_errors"]["all"] 54 | 55 | plotting.plot( 56 | [("pair", [(cat, len(errs)) for cat, errs in pair_errs.items()]), 57 | ("tree", [(cat, len(errs)) for cat, errs in tree_errs.items()])], 58 | "Recall Errors", 59 | "Type of anaphor", 60 | "Number of Errors") 61 | 62 | # more advanced features 63 | 64 | # is anaphor a gold mention? 
65 | all_gold = set() 66 | for doc in reference: 67 | for mention in doc.annotated_mentions: 68 | all_gold.add(mention) 69 | 70 | 71 | def is_anaphor_gold(mention): 72 | if mention in all_gold: 73 | return "is_gold" 74 | else: 75 | return "is_not_gold" 76 | 77 | is_ana_gold = by_type.categorize(lambda err: is_anaphor_gold(err[0])) 78 | 79 | # head statistics for NOM errors 80 | from collections import Counter 81 | 82 | for system in ["pair", "tree"]: 83 | nom_rec_errs = by_type[system]["recall_errors"]["all"]["NOM"] 84 | all_heads = [" ".join(err[0].attributes["head"]).lower() for err in nom_rec_errs] 85 | most_common = Counter(all_heads).most_common(10) 86 | print(system, most_common) 87 | 88 | # common errors: 89 | common = { 90 | "common": { 91 | "recall_errors": {}, 92 | "precision_errors": {} 93 | } 94 | } 95 | 96 | common["common"]["recall_errors"]["all"] = errors["pair"]["recall_errors"][ 97 | "all"].intersection(errors["tree"]["recall_errors"]["all"]) 98 | 99 | common["common"]["precision_errors"]["all"] = errors["pair"]["precision_errors"][ 100 | "all"].intersection(errors["tree"]["precision_errors"]["all"]) 101 | 102 | from cort.analysis import data_structures 103 | common = data_structures.StructuredCoreferenceAnalysis( 104 | common, errors.reference, errors.corpora 105 | ) 106 | 107 | # plot decisions 108 | decs = by_type_filtered["pair"]["decisions"]["all"] 109 | prec_errs = by_type_filtered["pair"]["precision_errors"]["all"] 110 | 111 | plotting.plot( 112 | [("decisions", [(cat, len(errs)) for cat, errs in decs.items()]), 113 | ("errors", [(cat, len(errs)) for cat, errs in prec_errs.items()])], 114 | "Decisions and Errors", 115 | "Type of anaphor", 116 | "Number") -------------------------------------------------------------------------------- /scripts/train-and-predict-all.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | 4 | import subprocess 5 | 6 | 7 | __author__ = 'smartschat' 8 | 9 | 10 | def get_extractor(data_set, system): 11 | if system == "closest" or system == "latent": 12 | return "cort.coreference.approaches.mention_ranking.extract_substructures" 13 | elif system == "tree": 14 | return "cort.coreference.approaches.antecedent_trees.extract_substructures" 15 | elif system == "pair": 16 | if data_set == "train": 17 | return "cort.coreference.approaches.mention_pairs" \ 18 | ".extract_training_substructures" 19 | else: 20 | return "cort.coreference.approaches.mention_pairs" \ 21 | ".extract_testing_substructures" 22 | 23 | 24 | def get_perceptron(system): 25 | if system == "pair": 26 | return "cort.coreference.approaches.mention_pairs.MentionPairsPerceptron" 27 | elif system == "closest": 28 | return "cort.coreference.approaches.mention_ranking.RankingPerceptronClosest" 29 | elif system == "latent": 30 | return "cort.coreference.approaches.mention_ranking.RankingPerceptron" 31 | elif system == "tree": 32 | return "cort.coreference.approaches.antecedent_trees.AntecedentTreePerceptron" 33 | 34 | 35 | def get_cost_function(system): 36 | if system == "pair": 37 | return "cort.coreference.cost_functions.null_cost" 38 | else: 39 | return "cort.coreference.cost_functions.cost_based_on_consistency" 40 | 41 | 42 | def get_clusterer(system): 43 | if system == "pair": 44 | return "cort.coreference.clusterer.best_first" 45 | else: 46 | return "cort.coreference.clusterer.all_ante" 47 | 48 | 49 | systems = ["pair", "closest", "latent", "tree"] 50 | data_sets = ["dev", "test"] 51 | 52 | for system 
in systems: 53 | print("Training", system, "on train.") 54 | subprocess.call([ 55 | "cort-train", 56 | "-in", "/data/nlp/martscsn/thesis/data/input/train.auto", 57 | "-out", "model-" + system + "-train.obj", 58 | "-extractor", get_extractor("train", system), 59 | "-perceptron", get_perceptron(system), 60 | "-cost_function", get_cost_function(system), 61 | "-cost_scaling", "100"]) 62 | 63 | print("Training", system, "on dev+train.") 64 | subprocess.call([ 65 | "cort-train", 66 | "-in", "/data/nlp/martscsn/thesis/data/input/train+dev.auto", 67 | "-out", "model-" + system + "-train+dev.obj", 68 | "-extractor", get_extractor("train", system), 69 | "-perceptron", get_perceptron(system), 70 | "-cost_function", get_cost_function(system), 71 | "-cost_scaling", "100"]) 72 | 73 | for data_set in data_sets: 74 | print("Predicting", system, "on", data_set) 75 | if data_set == "dev": 76 | model = "model-" + system + "-train.obj" 77 | else: 78 | model = "model-" + system + "-train+dev.obj" 79 | 80 | subprocess.call([ 81 | "cort-predict-conll", 82 | "-in", "/data/nlp/martscsn/thesis/data/input/" + data_set + 83 | ".auto", 84 | "-model", model, 85 | "-out", system + "-" + data_set + ".out", 86 | "-ante", system + "-" + data_set + ".antecedents", 87 | "-gold", "/data/nlp/martscsn/thesis/data/input/" + data_set + 88 | ".gold", 89 | "-extractor", get_extractor(data_set, system), 90 | "-perceptron", get_perceptron(system), 91 | "-clusterer", get_clusterer(system)]) 92 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | setup( 5 | name='cort', 6 | version='0.2.4.5', 7 | packages=['cort', 8 | 'cort.analysis', 9 | 'cort.core', 10 | 'cort.test', 11 | 'cort.coreference', 12 | 'cort.test.multigraph', 13 | 'cort.test.analysis', 14 | 'cort.test.core', 15 | 'cort.coreference.multigraph', 16 | 'cort.coreference.approaches', 17 | 'cort.util', 18 | 'cort.preprocessing', 19 | 'stanford_corenlp_pywrapper'], 20 | 21 | url='http://github.com/smartschat/cort', 22 | license='MIT', 23 | author='Sebastian Martschat, Thierry Goeckel, Patrick Claus', 24 | author_email='sebastian.martschat@gmail.com', 25 | description='A coreference resolution research toolkit.', 26 | keywords = ['NLP', 'CL', 'natural language processing', 27 | 'computational linguistics', 'coreference resolution', 28 | 'text analytics'], 29 | classifiers = [ 30 | 'Intended Audience :: Science/Research', 31 | 'Programming Language :: Python :: 2.7', 32 | 'Programming Language :: Python :: 3.3', 33 | 'Topic :: Scientific/Engineering', 34 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 35 | 'Topic :: Text Processing', 36 | ], 37 | install_requires=['nltk >= 3.0.1', 'numpy', 'matplotlib', 'mmh3', 'cython', 38 | 'future', 'jpype1', 'beautifulsoup4', 39 | 'pystanforddependencies >= 0.3.1'], 40 | package_data={ 41 | 'cort': ['analysis/visualization/style.css', 42 | 'analysis/visualization/lib/*', 43 | 'resources/*', 44 | 'config_files/*', 45 | 'coreference/perceptrons.pyx', 46 | "reference-coreference-scorers/v8.01/*.*", 47 | "reference-coreference-scorers/v8.01/lib/*.pm", 48 | "reference-coreference-scorers/v8.01/lib/Algorithm/*", 49 | "reference-coreference-scorers/v8.01/lib/Data/*", 50 | "reference-coreference-scorers/v8.01/lib/Math/*"], 51 | 'stanford_corenlp_pywrapper': ['rcorenlp.r', 52 | 'lib/*', 53 | 'javasrc/corenlp/*', 54 | 'javasrc/util/misc/*', 55 | 'javasrc/util/*.java'], 56 | 
}, 57 | scripts=['bin/cort-train', 'bin/cort-predict-conll', 58 | 'bin/cort-predict-raw', 'bin/cort-visualize', 59 | 'bin/run-multigraph'] 60 | ) 61 | -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .sockwrap import * 2 | -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/javasrc/corenlp/PipeRunner.java: -------------------------------------------------------------------------------- 1 | package corenlp; 2 | 3 | import org.codehaus.jackson.JsonNode; 4 | 5 | import util.Arr; 6 | import util.BasicFileIO; 7 | import util.JsonUtil; 8 | import util.U; 9 | 10 | /** 11 | * stdin/stdout commandline pipe mode that lightly wraps JsonPipeline. 12 | * 13 | * INPUT: one line per document. 14 | * docid \t TextAsJsonStringOrObjectWithTextField 15 | * OUTPUT: as JSON, one doc per line ("jdoc"). 16 | * docid \t {sentences: [ {sentobj}, {sentobj}, ... ]} 17 | * where each sentobj is 18 | * {tokens: [...], char_offsets: [...], ....} 19 | * 20 | */ 21 | public class PipeRunner { 22 | ProcessingMode mode; 23 | JsonPipeline parse; 24 | 25 | static enum InputFormat { 26 | DETECT_JSON_VARIANT, 27 | RAW_TEXT 28 | }; 29 | 30 | /** the pre-baked processing modes, that define annotators and outputs. */ 31 | static enum ProcessingMode { 32 | NOMODE, 33 | SSPLIT, 34 | POS, 35 | NER, 36 | PARSE, 37 | NERPARSE; 38 | } 39 | static ProcessingMode modeFromString(String _mode) { 40 | return 41 | _mode.equals("nomode") ? ProcessingMode.NOMODE : 42 | _mode.equals("ssplit") ? ProcessingMode.SSPLIT : 43 | _mode.equals("pos") ? ProcessingMode.POS : 44 | _mode.equals("ner") ? ProcessingMode.NER : 45 | _mode.equals("parse") ? ProcessingMode.PARSE : 46 | _mode.equals("nerparse") ? ProcessingMode.NERPARSE : 47 | null; 48 | } 49 | 50 | 51 | static void usage() { 52 | U.p("corenlp.Parse [options] <mode>\n" + 53 | "Processes document texts on stdin and outputs NLP-annotated versions.\n" + 54 | "Both input and output formats are one document per line.\n" + 55 | "\n" + 56 | "Input format can be either\n" + 57 | " one column: TextField\n" + 58 | " two columns: docid \\t TextField\n" + 59 | "Where TextField could be either\n" + 60 | " * a JSON string, or\n" + 61 | " * a JSON object with field 'text'.\n" + 62 | "--raw-input allows the text field to be raw text, interpreted as UTF-8 encoded.\n" + 63 | "Note that JSON strings can be preferable, since they can contain any type of whitespace.\n" + 64 | "\n" + 65 | "In all cases, the output mode is two-column: docid \\t NLPInfoAsJson\n" + 66 | ""); 67 | System.exit(1); 68 | } 69 | 70 | public void runStdinStdout(InputFormat inputFormat) { 71 | for (String line : BasicFileIO.STDIN_LINES) { 72 | System.err.print("."); 73 | 74 | String[] parts = line.split("\t"); 75 | String docid, doctext; 76 | JsonNode payload = null; 77 | if (inputFormat == InputFormat.DETECT_JSON_VARIANT) { 78 | payload = JsonUtil.parse(parts[parts.length-1]); 79 | doctext = 80 | payload.isTextual() ? payload.asText() : 81 | payload.has("text") ? payload.get("text").asText() : 82 | null; 83 | } 84 | else if (inputFormat == InputFormat.RAW_TEXT) { 85 | doctext = parts[parts.length-1]; 86 | } 87 | else { throw new RuntimeException("wtf"); } 88 | 89 | docid = parts.length >= 2 ? parts[0] : 90 | payload != null && payload.has("docid") ? 
payload.get("docid").getTextValue() : 91 | "doc" + parse.numDocs; 92 | 93 | assert docid != null : "inconsistent 'docid' key"; 94 | if (doctext == null) throw new RuntimeException("Couldn't interpret JSON payload: should be string, or else object with a 'text' field."); 95 | 96 | JsonNode outDoc = parse.processTextDocument(doctext); 97 | U.pf("%s\t%s\n", docid, JsonUtil.toJson(outDoc)); 98 | } 99 | 100 | double elapsedSec = 1.0*(System.currentTimeMillis() - parse.startMilli) / 1000; 101 | System.err.print("\n"); 102 | System.err.printf("%d docs, %d tokens, %.1f tok/sec, %.1f byte/sec\n", parse.numDocs, parse.numTokens, parse.numTokens*1.0/elapsedSec, parse.numChars*1.0/elapsedSec); 103 | } 104 | 105 | public static void main(String[] args) { 106 | if (args.length < 1) { 107 | usage(); 108 | } 109 | InputFormat inputFormat = InputFormat.DETECT_JSON_VARIANT; 110 | 111 | while (args.length > 1) { 112 | String flag = args[0]; 113 | if (flag.equals("--raw-input")) { 114 | inputFormat = InputFormat.RAW_TEXT; 115 | args = Arr.subArray(args, 1, args.length); 116 | } 117 | else { throw new RuntimeException("bad flag: " + flag); } 118 | } 119 | 120 | 121 | throw new RuntimeException("TODO need to handle mode parsing; in the meantime this is broken"); 122 | 123 | // PipeRunner runner = new PipeRunner(); 124 | // String _mode = args[0]; 125 | // ProcessingMode mode = modeFromString(_mode); 126 | // if (runner.mode==null) { 127 | // U.pf("Bad mode '%s' ... to disable a mode, use 'nomode'\n", _mode); 128 | // usage(); 129 | // } 130 | // runner.runStdinStdout(inputFormat); 131 | } 132 | 133 | 134 | 135 | } 136 | -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/javasrc/util/JsonUtil.java: -------------------------------------------------------------------------------- 1 | package util; 2 | 3 | import java.io.IOException; 4 | import java.util.*; 5 | import org.codehaus.jackson.JsonNode; 6 | import org.codehaus.jackson.JsonProcessingException; 7 | import org.codehaus.jackson.map.ObjectMapper; 8 | import org.codehaus.jackson.map.type.TypeFactory; 9 | import org.codehaus.jackson.node.*; 10 | 11 | import com.google.common.collect.Multiset; 12 | 13 | import util.misc.Pair; 14 | 15 | /** simplified wrapper functions for the Jackson JSON library 16 | * this is half-baked, still learning the right way to use the library 17 | */ 18 | public class JsonUtil { 19 | 20 | public static ObjectMapper om; 21 | static { 22 | om = new ObjectMapper(); 23 | } 24 | 25 | public static void main(String args[]) { 26 | List x = toList(args[0], String.class); 27 | U.p(x); 28 | } 29 | 30 | public static String getTextDefault(JsonNode ob, String keyname, String defaultValue) { 31 | return ob.has(keyname) ? 
ob.get(keyname).asText() : defaultValue; 32 | } 33 | 34 | ////////////////////////////////////// 35 | 36 | // toList() derived from 37 | // http://stackoverflow.com/questions/9942475/convert-json-to-multiple-objects-using-jackson 38 | 39 | public static <T> ArrayList<T> toList(String jsonString, final Class<T> type) { 40 | try { 41 | return om.readValue(jsonString, TypeFactory.defaultInstance().constructCollectionType(ArrayList.class, type)); 42 | } catch (IOException e) { 43 | return null; 44 | } 45 | } 46 | 47 | public static <T> ArrayList<T> toList(JsonNode jsonNode, final Class<T> type) { 48 | try { 49 | return om.readValue(jsonNode, TypeFactory.defaultInstance().constructCollectionType(ArrayList.class, type)); 50 | } catch (IOException e) { 51 | return null; 52 | } 53 | } 54 | 55 | public static ObjectNode toJson(Multiset<?> counts) { 56 | ObjectNode jmap = newObject(); 57 | for (Multiset.Entry<?> e : counts.entrySet()) { 58 | jmap.put(e.getElement().toString(), e.getCount()); 59 | } 60 | return jmap; 61 | } 62 | 63 | public static <T> JsonNode toJson(final List<T> data) { 64 | ArrayNode jlist = new ObjectMapper().createArrayNode(); 65 | for (T elt : data) { 66 | jlist.add( toJson(elt) ); 67 | } 68 | return jlist; 69 | } 70 | 71 | public static JsonNode toJson(final Pair<?, ?> pair) { 72 | try { 73 | List<Object> x = new ArrayList<>(); 74 | x.add( (Object) pair.first); 75 | x.add( (Object) pair.second); 76 | return new ObjectMapper().valueToTree(x); 77 | } catch(Exception e) { 78 | throw new RuntimeException(e); 79 | } 80 | } 81 | 82 | ///////// from Play framework below 83 | 84 | /** 85 | * Convert an object to JsonNode. 86 | * 87 | * @param data Value to convert in Json. 88 | */ 89 | public static JsonNode toJson(final Object data) { 90 | try { 91 | return om.valueToTree(data); 92 | } catch(Exception e) { 93 | throw new RuntimeException(e); 94 | } 95 | } 96 | 97 | /** 98 | * Convert a JsonNode to a Java value 99 | * 100 | * @param json Json value to convert. 101 | * @param clazz Expected Java value type. 102 | */ 103 | public static <A> A fromJson(JsonNode json, Class<A> clazz) { 104 | try { 105 | return om.treeToValue(json, clazz); 106 | } catch(Exception e) { 107 | throw new RuntimeException(e); 108 | } 109 | } 110 | 111 | /** 112 | * Creates a new empty ObjectNode. 113 | */ 114 | public static ObjectNode newObject() { 115 | return om.createObjectNode(); 116 | } 117 | 118 | /** 119 | * Convert a JsonNode to its string representation. 120 | */ 121 | public static String stringify(JsonNode json) { 122 | return json.toString(); 123 | } 124 | 125 | /** 126 | * Parse a String representing a json, and return it as a JsonNode. 
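* (Editor's note) Failures here surface as an unchecked RuntimeException, unlike readJson below, which throws checked exceptions; readJsonNX returns null on failure instead.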
127 | */ 128 | public static JsonNode parse(String src) { 129 | try { 130 | return om.readValue(src, JsonNode.class); 131 | } catch(Throwable t) { 132 | throw new RuntimeException(t); 133 | } 134 | } 135 | 136 | public static JsonNode readJson(String jsonStr) throws JsonProcessingException, IOException { 137 | return om.readTree(jsonStr); 138 | } 139 | 140 | public static JsonNode readJsonNX(String jsonStr) { 141 | try { 142 | return om.readTree(jsonStr); 143 | } catch (IOException e) { 144 | e.printStackTrace(); 145 | return null; 146 | } 147 | } 148 | 149 | } 150 | -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/javasrc/util/misc/Triple.java: -------------------------------------------------------------------------------- 1 | package util.misc; 2 | 3 | /** 4 | * borrowed from berkeley nlp libraries which we were told was apache licensed 5 | */ 6 | public class Triple<S, T, U> { 7 | public S first; 8 | public T second; 9 | public U third; 10 | 11 | public Triple(S first, T second, U third) { 12 | this.first = first; 13 | this.second = second; 14 | this.third = third; 15 | } 16 | 17 | @Override 18 | public int hashCode() { 19 | final int prime = 31; 20 | int result = 1; 21 | result = prime * result + ((first == null) ? 0 : first.hashCode()); 22 | result = prime * result + ((second == null) ? 0 : second.hashCode()); 23 | result = prime * result + ((third == null) ? 0 : third.hashCode()); 24 | return result; 25 | } 26 | 27 | @Override 28 | public boolean equals(Object obj) { 29 | if (this == obj) 30 | return true; 31 | if (obj == null) 32 | return false; 33 | if (getClass() != obj.getClass()) 34 | return false; 35 | final Triple other = (Triple) obj; 36 | if (first == null) { 37 | if (other.first != null) 38 | return false; 39 | } else if (!first.equals(other.first)) 40 | return false; 41 | if (second == null) { 42 | if (other.second != null) 43 | return false; 44 | } else if (!second.equals(other.second)) 45 | return false; 46 | if (third == null) { 47 | if (other.third != null) 48 | return false; 49 | } else if (!third.equals(other.third)) 50 | return false; 51 | return true; 52 | } 53 | 54 | public String toString() { 55 | return String.format("(%s,%s,%s)",first,second,third); 56 | } 57 | 58 | public static <S, T, U> Triple<S, T, U> makeTriple(S s, T t, U u) { 59 | // TODO Auto-generated method stub 60 | return new Triple<S, T, U>(s,t,u); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/lib/corenlpwrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/stanford_corenlp_pywrapper/lib/corenlpwrapper.jar -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/lib/guava-13.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/stanford_corenlp_pywrapper/lib/guava-13.0.1.jar -------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/lib/jackson-all-1.9.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/stanford_corenlp_pywrapper/lib/jackson-all-1.9.11.jar 
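[Editor's addition, not a repository file] A minimal sketch of a round trip through util.JsonUtil above, assuming the Jackson 1.9 jar from lib/ is on the classpath; the example class name and literal values are illustrative only.

package util;

import org.codehaus.jackson.JsonNode;

public class JsonUtilExample {
    public static void main(String[] args) {
        // parse() wraps any failure in an unchecked RuntimeException
        JsonNode doc = JsonUtil.parse("{\"docid\": \"d1\", \"text\": \"Barbara Bush spoke.\"}");
        // getTextDefault() falls back to the default when the key is absent
        String docid = JsonUtil.getTextDefault(doc, "docid", "doc0"); // "d1"
        String lang = JsonUtil.getTextDefault(doc, "lang", "en");     // "en"
        System.out.println(docid + "\t" + lang + "\t" + JsonUtil.stringify(doc));
    }
}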
-------------------------------------------------------------------------------- /stanford_corenlp_pywrapper/rcorenlp.r: -------------------------------------------------------------------------------- 1 | # R wrapper for the Java JSON pipe server for CoreNLP 2 | library(rjson) 3 | library(stringr) 4 | 5 | # paste from python sockwrap.py modes_json 6 | MODES = rjson::fromJSON( 7 | "{\"ssplit\":{\"annotators\":\"tokenize, ssplit\",\"description\":\"tokenization and sentence splitting (included in all subsequent ones)\"},\"coref\":{\"annotators\":\"tokenize, ssplit, pos, lemma, ner, entitymentions, parse, dcoref\",\"description\":\"Coreference, including constituent parsing.\"},\"pos\":{\"annotators\":\"tokenize, ssplit, pos, lemma\",\"description\":\"POS (and lemmas)\"},\"parse\":{\"annotators\":\"tokenize, ssplit, pos, lemma, parse\",\"description\":\"fairly basic parsing with POS, lemmas, trees, dependencies\"},\"nerparse\":{\"annotators\":\"tokenize, ssplit, pos, lemma, ner, entitymentions, parse\",\"description\":\"parsing with NER, POS, lemmas, dependencies.\"},\"ner\":{\"annotators\":\"tokenize, ssplit, pos, lemma, ner, entitymentions\",\"description\":\"POS and NER (and lemmas)\"}}" 8 | ) 9 | 10 | CoreNLP = function( 11 | mode=NULL, 12 | configdict=list(annotators="tokenize, ssplit"), 13 | corenlp_jars=c( 14 | "/home/sw/corenlp/stanford-corenlp-full-2015-04-20/*", 15 | "/home/sw/stanford-srparser-2014-10-23-models.jar"), 16 | java_command="java", 17 | java_options="-Xmx4g -XX:ParallelGCThreads=1", 18 | outpipe_filename_prefix="/tmp/corenlp_rwrap_pipe", 19 | ... 20 | ) { 21 | 22 | # If a mode is specified, set the annotators on the configdict. 23 | if (!is.null(mode)) { 24 | stopifnot(mode %in% names(MODES)) 25 | configdict[['annotators']] = MODES[[mode]][['annotators']] 26 | } 27 | 28 | # Extra arguments are put into the configdict. 29 | 30 | moreargs = list(...) 31 | for (k in names(moreargs)) { 32 | configdict[[k]] = moreargs[[k]] 33 | } 34 | 35 | corenlp = list() 36 | corenlp$outpipe_filename = sprintf("%s_rpid=%s_time=%s", outpipe_filename_prefix, Sys.getpid(), as.numeric(Sys.time())) 37 | 38 | cmd = "exec JAVA_COMMAND JAVA_OPTIONS -cp 'CLASSPATH' \ 39 | corenlp.SocketServer COMM_INFO MORE_CONFIG" 40 | cmd = str_replace(cmd, "JAVA_COMMAND", java_command) 41 | cmd = str_replace(cmd, "JAVA_OPTIONS", java_options) 42 | # How to specify location of resources in R? there's no __FILE__ equivalent 43 | # Packages are the only way? Too bad. 44 | jars = c("lib/corenlpwrapper.jar", "lib/*") 45 | jars = c(jars, corenlp_jars) 46 | cmd = str_replace(cmd, "CLASSPATH", str_join(jars, collapse=":")) 47 | cmd = str_replace(cmd, "COMM_INFO", sprintf("--outpipe %s", corenlp$outpipe_filename)) 48 | cmd = str_replace(cmd, "MORE_CONFIG", sprintf(" --configdict '%s'", rjson::toJSON(configdict))) 49 | 50 | cmd = str_replace_all(cmd, "\n", " ") 51 | logmessage(sprintf("Starting with command: %s\n", cmd)) 52 | 53 | # - I'm not sure how R encodings work 54 | # - pipe() in write mode seems to block until the subprocess tries to read 55 | # from stdin. Perfect, so we don't need to check for that. 
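# (Editor's note) readresult() below expects each reply on the named pipe to be
# framed as an 8-byte big-endian length followed by that many bytes of JSON,
# which it decodes with rjson::fromJSON.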
56 | corenlp$pipe = pipe(cmd, "wb", encoding="UTF-8") 57 | system(sprintf("mkfifo %s", corenlp$outpipe_filename)) 58 | corenlp$outpipe = file(corenlp$outpipe_filename, "rb", encoding="UTF-8", raw=TRUE) 59 | 60 | class(corenlp) = "corenlp_wrapper" 61 | corenlp 62 | } 63 | 64 | logmessage = function(msg) cat(sprintf("INFO:CoreNLP_RWrapper:%s", msg), file=stderr()) 65 | 66 | readresult = function(outpipe) { 67 | # TESTING 68 | # readresult(file("return.bin","rb", raw=TRUE)) 69 | size = readBin(outpipe, 'integer', n=1, endian='big', size=8) 70 | cat(sprintf("Returned size %s\n", size)) 71 | stopifnot(size > 0) 72 | # does useBytes=TRUE circumvent the encoding declaration earlier? 73 | result = readChar(outpipe, size, useBytes=TRUE) 74 | result = rjson::fromJSON(result) 75 | result 76 | } 77 | 78 | 79 | parsedoc = function(corenlp, string) { 80 | command = sprintf("PARSEDOC\t%s", rjson::toJSON(string)) 81 | writeLines(command, corenlp$pipe) 82 | flush(corenlp$pipe) 83 | readresult(corenlp$outpipe) 84 | } 85 | 86 | close.corenlp_wrapper = function(corenlp) { 87 | close(corenlp$outpipe) 88 | close(corenlp$pipe) 89 | system(sprintf("rm -f %s", corenlp$outpipe_filename)) 90 | } 91 | -------------------------------------------------------------------------------- /tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/tree.png -------------------------------------------------------------------------------- /visualization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/visualization.png --------------------------------------------------------------------------------