├── ANALYSIS.md
├── COREFERENCE.md
├── LICENSE
├── MANIFEST.in
├── MULTIGRAPH.md
├── README.md
├── bin
│   ├── cort-predict-conll
│   ├── cort-predict-raw
│   ├── cort-train
│   ├── cort-visualize
│   └── run-multigraph
├── cort
│   ├── __init__.py
│   ├── analysis
│   │   ├── __init__.py
│   │   ├── data_structures.py
│   │   ├── error_extractors.py
│   │   ├── plotting.py
│   │   ├── spanning_tree_algorithms.py
│   │   ├── visualization.py
│   │   └── visualization
│   │       ├── TODO
│   │       ├── lib
│   │       │   ├── cort-for-raw.js
│   │       │   ├── cort.js
│   │       │   ├── jquery-2.1.1.min.js
│   │       │   └── jquery.jsPlumb-1.6.4.js
│   │       └── style.css
│   ├── config_files
│   │   ├── corenlp.ini
│   │   └── corenlp_with_coref.ini
│   ├── core
│   │   ├── __init__.py
│   │   ├── corpora.py
│   │   ├── documents.py
│   │   ├── external_data.py
│   │   ├── head_finders.py
│   │   ├── mention_extractor.py
│   │   ├── mention_property_computer.py
│   │   ├── mentions.py
│   │   ├── mixins.py
│   │   ├── singletons.py
│   │   ├── spans.py
│   │   └── util.py
│   ├── coreference
│   │   ├── __init__.py
│   │   ├── approaches
│   │   │   ├── __init__.py
│   │   │   ├── antecedent_trees.py
│   │   │   ├── mention_pairs.py
│   │   │   └── mention_ranking.py
│   │   ├── clusterer.py
│   │   ├── cost_functions.py
│   │   ├── experiments.py
│   │   ├── features.py
│   │   ├── instance_extractors.py
│   │   ├── multigraph
│   │   │   ├── __init__.py
│   │   │   ├── decoders.py
│   │   │   ├── features.py
│   │   │   ├── multigraphs.py
│   │   │   └── weighting_functions.py
│   │   └── perceptrons.pyx
│   ├── preprocessing
│   │   ├── __init__.py
│   │   └── pipeline.py
│   ├── reference-coreference-scorers
│   │   └── v8.01
│   │       ├── README.txt
│   │       ├── lib
│   │       │   ├── Algorithm
│   │       │   │   ├── Munkres.pm
│   │       │   │   └── README.Munkres
│   │       │   ├── CorScorer.pm
│   │       │   ├── Cwd.pm
│   │       │   ├── Data
│   │       │   │   └── Dumper.pm
│   │       │   └── Math
│   │       │       └── Combinatorics.pm
│   │       ├── scorer.bat
│   │       ├── scorer.pl
│   │       └── test
│   │           ├── CorefMetricTest.pm
│   │           ├── CorefMetricTestConfig.pm
│   │           ├── DataFiles
│   │           │   ├── TC-A-1.response
│   │           │   ├── TC-A-10.response
│   │           │   ├── TC-A-11.response
│   │           │   ├── TC-A-12.response
│   │           │   ├── TC-A-13.response
│   │           │   ├── TC-A-2.response
│   │           │   ├── TC-A-3.response
│   │           │   ├── TC-A-4.response
│   │           │   ├── TC-A-5.response
│   │           │   ├── TC-A-6.response
│   │           │   ├── TC-A-7.response
│   │           │   ├── TC-A-8.response
│   │           │   ├── TC-A-9.response
│   │           │   ├── TC-A.key
│   │           │   ├── TC-B-1.response
│   │           │   ├── TC-B.key
│   │           │   ├── TC-C-1.response
│   │           │   ├── TC-C.key
│   │           │   ├── TC-D-1.response
│   │           │   ├── TC-D.key
│   │           │   ├── TC-E-1.response
│   │           │   ├── TC-E.key
│   │           │   ├── TC-F-1.response
│   │           │   ├── TC-F.key
│   │           │   ├── TC-G-1.response
│   │           │   ├── TC-G.key
│   │           │   ├── TC-H-1.response
│   │           │   ├── TC-H.key
│   │           │   ├── TC-I-1.response
│   │           │   ├── TC-I.key
│   │           │   ├── TC-J-1.response
│   │           │   ├── TC-J.key
│   │           │   ├── TC-K-1.response
│   │           │   ├── TC-K.key
│   │           │   ├── TC-L-1.response
│   │           │   ├── TC-L.key
│   │           │   ├── TC-M-1.response
│   │           │   ├── TC-M-2.response
│   │           │   ├── TC-M-3.response
│   │           │   ├── TC-M-4.response
│   │           │   ├── TC-M-5.response
│   │           │   ├── TC-M-6.response
│   │           │   ├── TC-M.key
│   │           │   ├── TC-N-1.response
│   │           │   ├── TC-N-2.response
│   │           │   ├── TC-N-3.response
│   │           │   ├── TC-N-4.response
│   │           │   ├── TC-N-5.response
│   │           │   ├── TC-N-6.response
│   │           │   └── TC-N.key
│   │           ├── TestCases.README
│   │           └── test.pl
│   ├── resources
│   │   ├── coreferent_pairs.obj
│   │   ├── female.list
│   │   ├── male.list
│   │   ├── neutral.list
│   │   ├── plural.list
│   │   └── singletons_not_cleaned.obj
│   ├── test
│   │   ├── __init__.py
│   │   ├── analysis
│   │   │   ├── __init__.py
│   │   │   ├── test_data_structures.py
│   │   │   ├── test_error_extractors.py
│   │   │   └── test_spanning_tree_algorithms.py
│   │   ├── core
│   │   │   ├── __init__.py
│   │   │   ├── resources
│   │   │   │   └── input.conll
│   │   │   ├── test_corpora.py
│   │   │   ├── test_documents.py
│   │   │   ├── test_external_data.py
│   │   │   ├── test_head_finders.py
│   │   │   ├── test_mention_extractor.py
│   │   │   ├── test_mention_property_computer.py
│   │   │   ├── test_mentions.py
│   │   │   ├── test_spans.py
│   │   │   └── test_util.py
│   │   └── multigraph
│   │       ├── __init__.py
│   │       └── test_features.py
│   └── util
│       ├── __init__.py
│       └── import_helper.py
├── plot.png
├── scripts
│   ├── acl15demo.py
│   ├── naacl15-demo.py
│   └── train-and-predict-all.py
├── setup.py
├── stanford_corenlp_pywrapper
│   ├── __init__.py
│   ├── javasrc
│   │   ├── corenlp
│   │   │   ├── JsonPipeline.java
│   │   │   ├── PipeRunner.java
│   │   │   └── SocketServer.java
│   │   └── util
│   │       ├── Arr.java
│   │       ├── BasicFileIO.java
│   │       ├── JsonUtil.java
│   │       ├── U.java
│   │       └── misc
│   │           ├── Pair.java
│   │           └── Triple.java
│   ├── lib
│   │   ├── corenlpwrapper.jar
│   │   ├── guava-13.0.1.jar
│   │   └── jackson-all-1.9.11.jar
│   ├── rcorenlp.r
│   └── sockwrap.py
├── tree.png
└── visualization.png
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014-2015 Sebastian Martschat
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include cort/resources/*
2 | include cort/config_files/*
3 | include cort/analysis/visualization/*
4 | include cort/analysis/visualization/lib/*
5 | include cort/coreference/perceptrons.pyx
6 | include stanford_corenlp_pywrapper/rcorenlp.r
7 | include stanford_corenlp_pywrapper/lib/*
8 | include stanford_corenlp_pywrapper/javasrc/corenlp/*
9 | include stanford_corenlp_pywrapper/javasrc/util/misc/*
10 | include stanford_corenlp_pywrapper/javasrc/util/*
11 | include cort/reference-coreference-scorers/v8.01/*
12 | include cort/reference-coreference-scorers/v8.01/lib/*
13 | include cort/reference-coreference-scorers/v8.01/lib/Algorithm/*
14 | include cort/reference-coreference-scorers/v8.01/lib/Data/*
15 | include cort/reference-coreference-scorers/v8.01/lib/Math/*
--------------------------------------------------------------------------------
/MULTIGRAPH.md:
--------------------------------------------------------------------------------
1 | # Running cort's multigraph system
2 |
3 | **cort** ships with a deterministic coreference resolution system based on
4 | multigraph clustering. The input must follow [the
5 | format from the CoNLL shared tasks on coreference resolution](http://conll.cemantix.org/2012/data.html).
6 |
7 | To run the multigraph system, use
8 |
9 | ```shell
10 | run-multigraph -in my_data.data -out out.data
11 | ```
12 |
13 | With the optional argument `-ante`, antecedent decisions are also written to a
14 | file:
15 |
16 | ```shell
17 | run-multigraph -in my_data.data -out out.data -ante antecedents_out.data
18 | ```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cort
2 |
3 | __cort__ is a coreference resolution toolkit. It consists
4 | of two parts: the *coreference resolution* component implements a framework for
5 | coreference resolution based on latent variables, which allows you to rapidly
6 | devise approaches to coreference resolution, while the *error analysis* component
7 | provides extensive functionality for analyzing and visualizing errors made by
8 | coreference resolution systems.
9 |
10 | If you have any questions or comments, drop me an e-mail at
11 | [sebastian.martschat@gmail.com](mailto:sebastian.martschat@gmail.com).
12 |
13 | ## Branches/Forks
14 |
15 | * the [kbest branch](https://github.com/smartschat/cort/tree/kbest) contains code for kbest extraction of coreference information, as described in Ji et al. (2017)
16 | * the [v03 branch](https://github.com/smartschat/cort/tree/v03) contains a version of __cort__ with more models and a better train/dev/test workflow. For more details on the models see Martschat (2017).
17 | * [Nafise Moosavi's fork of __cort__](https://github.com/ns-moosavi/cort/tree/singleton_feature) implements search space pruning on top of __cort__, as described in Moosavi and Strube (2016)
18 |
19 | ## Documentation
20 |
21 | * [coreference resolution with cort](COREFERENCE.md)
22 | * [error analysis with cort](ANALYSIS.md)
23 | * [running the multigraph system](MULTIGRAPH.md)
24 |
25 | ## Installation
26 |
27 | __cort__ is available on PyPI. You can install it via
28 |
29 | ```
30 | pip install cort
31 | ```
32 | Dependencies (automatically installed by pip) are
33 | [nltk](http://www.nltk.org/), [numpy](http://www.numpy.org/),
34 | [matplotlib](http://matplotlib.org),
35 | [mmh3](https://pypi.python.org/pypi/mmh3),
36 | [PyStanfordDependencies](https://github.com/dmcc/PyStanfordDependencies),
37 | [cython](http://cython.org/),
38 | [future](https://pypi.python.org/pypi/future),
39 | [jpype](https://pypi.python.org/pypi/jpype1) and
40 | [beautifulsoup](https://pypi.python.org/pypi/beautifulsoup4). It ships with
41 | [stanford_corenlp_pywrapper](https://github.com/brendano/stanford_corenlp_pywrapper)
42 | and [the reference implementation of the CoNLL scorer](https://github.com/conll/reference-coreference-scorers).
43 |
44 | __cort__ is written for use on Linux with Python 3.3+. While __cort__ also runs under
45 | Python 2.7, I strongly recommend running __cort__ with Python 3, since the Python 3
46 | version is much more efficient.
47 |
48 | ## References
49 |
50 | Yangfeng Ji, Chenhao Tan, Sebastian Martschat, Yejin Choi and Noah A. Smith (2017). **Dynamic Entity Representations in Neural Language Models.** To appear in *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing (EMNLP), Copenhagen, Denmark, 7-11 September 2017*.
51 | [PDF](https://arxiv.org/abs/1708.00781)
52 |
53 | Sebastian Martschat (2017). **Structured Representations for Coreference Resolution.** PhD thesis, Heidelberg University.
54 | [PDF](http://www.ub.uni-heidelberg.de/archiv/23305)
55 |
56 | Nafise Sadat Moosavi and Michael Strube (2016). **Search space pruning: A
57 | simple solution for better coreference resolvers**. In *Proceedings of the 2016
58 | Conference of the North American Chapter of the Association for Computational
59 | Linguistics: Human Language Technologies*, San Diego, Cal., 12-17 June 2016,
60 | pages 1005-1011.
61 | [PDF](http://www.aclweb.org/anthology/N16-1115.pdf)
62 |
63 | Sebastian Martschat and Michael Strube (2015). **Latent Structures for
64 | Coreference Resolution**. *Transactions of the Association for
65 | Computational Linguistics*, 3, pages 405-418.
66 | [PDF](http://www.aclweb.org/anthology/Q/Q15/Q15-1029.pdf)
67 |
68 | Sebastian Martschat, Patrick Claus and Michael Strube (2015). **Plug Latent
69 | Structures and Play Coreference Resolution**. In *Proceedings of
70 | the ACL-IJCNLP 2015 System Demonstrations*, Beijing, China,
71 | 26-31 July 2015, pages 61-66.
72 | [PDF](http://www.aclweb.org/anthology/P/P15/P15-4011.pdf)
73 |
74 | Sebastian Martschat, Thierry Göckel and Michael Strube (2015). **Analyzing and
75 | Visualizing Coreference Resolution Errors**. In *Proceedings of the 2015
76 | Conference of the North American Chapter of the Association for Computational
77 | Linguistics: Demonstrations*, Denver, Colorado, USA, 31 May-5 June 2015,
78 | pages 6-10.
79 | [PDF](https://aclweb.org/anthology/N/N15/N15-3002.pdf)
80 |
81 | Sebastian Martschat and Michael Strube (2014). **Recall Error Analysis for
82 | Coreference Resolution**. In *Proceedings of the 2014 Conference on Empirical
83 | Methods in Natural Language Processing (EMNLP)*, Doha, Qatar, 25-29 October
84 | 2014, pages 2070-2081.
85 | [PDF](http://aclweb.org/anthology/D/D14/D14-1221.pdf)
86 |
87 | Sebastian Martschat (2013). **Multigraph Clustering for Unsupervised
88 | Coreference Resolution**. In *Proceedings of the Student Research Workshop
89 | at the 51st Annual Meeting of the Association for Computational Linguistics*,
90 | Sofia, Bulgaria, 5-7 August 2013, pages 81-88.
91 | [PDF](http://aclweb.org/anthology/P/P13/P13-3012.pdf)
92 |
93 | If you use the error analysis component in your research, please cite the
94 | [EMNLP'14 paper](http://aclweb.org/anthology/D/D14/D14-1221.pdf). If you use
95 | the coreference component in your research, please cite the
96 | [TACL paper](http://www.aclweb.org/anthology/Q/Q15/Q15-1029.pdf). If you use
97 | the multigraph system, please cite the
98 | [ACL'13-SRW paper](http://aclweb.org/anthology/P/P13/P13-3012.pdf).
99 |
100 | ## Changelog
101 |
102 | __Wednesday, 4 November 2015__
103 | Support numeric features. Due to a different feature representation the models changed,
104 | hence I have updated the downloadable models.
105 |
106 | __Friday, 9 October 2015__
107 | Now supports label-dependent cost functions.
108 |
109 | __Tuesday, 15 September 2015__
110 | Minor bugfixes.
111 |
112 | __Monday, 27 July 2015__
113 | Now can perform coreference resolution on raw text.
114 |
115 | __Tuesday, 21 July 2015__
116 | Updated to status of TACL paper.
117 |
118 | __Wednesday, 3 June 2015__
119 | Improvements to visualization (mention highlighting and scrolling).
120 |
121 | __Monday, 1 June 2015__
122 | Fixed a bug in mention highlighting for visualization.
123 |
124 | __Sunday, 31 May 2015__
125 | Updated to status of NAACL'15 demo paper.
126 |
127 | __Wednesday, 13 May 2015__
128 | Fixed another bug in the documentation regarding format of antecedent data.
129 |
130 | __Tuesday, 3 February 2015__
131 | Fixed a bug in the documentation: part no. in antecedent file must be with trailing 0s.
132 |
133 | __Thursday, 30 October 2014__
134 | Fixed data structure bug in documents.py. The results from the paper are not affected by this bug.
135 |
136 | __Wednesday, 22 October 2014__
137 | Initial release.
138 |
--------------------------------------------------------------------------------
/bin/cort-predict-conll:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from __future__ import print_function
5 | import argparse
6 | import codecs
7 | import logging
8 | import os
9 | import pickle
10 | import subprocess
11 | import sys
12 |
13 |
14 | import cort
15 | from cort.core import corpora
16 | from cort.core import mention_extractor
17 | from cort.coreference import cost_functions
18 | from cort.coreference import experiments
19 | from cort.coreference import features
20 | from cort.coreference import instance_extractors
21 | from cort.util import import_helper
22 |
23 |
24 | __author__ = 'smartschat'
25 |
26 | logging.basicConfig(level=logging.INFO,
27 |                     format='%(asctime)s %(levelname)s %(message)s')
28 |
29 |
30 | def parse_args():
31 | parser = argparse.ArgumentParser(description='Predict coreference '
32 | 'relations.')
33 | parser.add_argument('-in',
34 | required=True,
35 | dest='input_filename',
36 | help='The input file. Must follow the format of the '
37 | 'CoNLL shared tasks on coreference resolution '
38 |                          '(see http://conll.cemantix.org/2012/data.html).')
39 | parser.add_argument('-model',
40 | required=True,
41 | dest='model',
42 | help='The model learned via cort-train.')
43 | parser.add_argument('-out',
44 | dest='output_filename',
45 | required=True,
46 |                         help='The output file the predictions will be stored '
47 |                              'in (in the CoNLL format).')
48 | parser.add_argument('-ante',
49 | dest='ante',
50 |                         help='The file where antecedent predictions will be '
51 | 'stored to.')
52 | parser.add_argument('-extractor',
53 | dest='extractor',
54 | required=True,
55 | help='The function to extract instances.')
56 | parser.add_argument('-perceptron',
57 | dest='perceptron',
58 | required=True,
59 | help='The perceptron to use.')
60 | parser.add_argument('-clusterer',
61 | dest='clusterer',
62 | required=True,
63 | help='The clusterer to use.')
64 | parser.add_argument('-gold',
65 | dest='gold',
66 | help='Gold data (in the CoNLL format) for evaluation.')
67 | parser.add_argument('-features',
68 | dest='features',
69 |                         help='The file containing the list of features. If not '
70 |                              'provided, defaults to a standard set of '
71 | 'features.')
72 |
73 | return parser.parse_args()
74 |
75 |
76 | def get_scores(output_data, gold_data):
77 | scorer_output = subprocess.check_output([
78 | "perl",
79 | cort.__path__[0] + "/reference-coreference-scorers/v8.01/scorer.pl",
80 | "all",
81 | gold_data,
82 | os.getcwd() + "/" + output_data,
83 | "none"]).decode()
84 |
85 | metrics = ['muc', 'bcub', 'ceafm', 'ceafe', 'blanc']
86 |
87 | metrics_results = {}
88 |
89 | metric = None
90 |
91 | results_formatted = ""
92 |
93 | for line in scorer_output.split("\n"):
94 | if not line:
95 | continue
96 |
97 | splitted = line.split()
98 |
99 | if splitted[0] == "METRIC":
100 | metric = line.split()[1][:-1]
101 |
102 | if (metric != 'blanc' and line.startswith("Coreference:")) \
103 | or (metric == 'blanc' and line.startswith("BLANC:")):
104 | metrics_results[metric] = (
105 | float(splitted[5][:-1]),
106 | float(splitted[10][:-1]),
107 | float(splitted[12][:-1]),
108 | )
109 |
110 | results_formatted += "\tR\tP\tF1\n"
111 |
112 | for metric in metrics:
113 | results_formatted += metric + "\t" + \
114 | "\t".join([str(val) for val in metrics_results[metric]]) + "\n"
115 | results_formatted += "\n"
116 | average = (metrics_results["muc"][2] + metrics_results["bcub"][2] +
117 | metrics_results["ceafe"][2])/3
118 | results_formatted += "conll\t\t\t" + format(average, '.2f') + "\n"
119 |
120 | return results_formatted
121 |
122 |
123 | logging.basicConfig(level=logging.INFO,
124 |                     format='%(asctime)s %(levelname)s %(message)s')
125 |
126 | if sys.version_info[0] == 2:
127 | logging.warning("You are running cort under Python 2. cort is much more "
128 | "efficient under Python 3.3+.")
129 | args = parse_args()
130 |
131 | if args.features:
132 | mention_features, pairwise_features = import_helper.get_features(
133 | args.features)
134 | else:
135 | mention_features = [
136 | features.fine_type,
137 | features.gender,
138 | features.number,
139 | features.sem_class,
140 | features.deprel,
141 | features.head_ner,
142 | features.length,
143 | features.head,
144 | features.first,
145 | features.last,
146 | features.preceding_token,
147 | features.next_token,
148 | features.governor,
149 | features.ancestry
150 | ]
151 |
152 | pairwise_features = [
153 | features.exact_match,
154 | features.head_match,
155 | features.same_speaker,
156 | features.alias,
157 | features.sentence_distance,
158 | features.embedding,
159 | features.modifier,
160 | features.tokens_contained,
161 | features.head_contained,
162 | features.token_distance
163 | ]
164 |
165 | logging.info("Loading model.")
166 | priors, weights = pickle.load(open(args.model, "rb"))
167 |
168 | perceptron = import_helper.import_from_path(args.perceptron)(
169 | priors=priors,
170 | weights=weights,
171 | cost_scaling=0
172 | )
173 |
174 | extractor = instance_extractors.InstanceExtractor(
175 | import_helper.import_from_path(args.extractor),
176 | mention_features,
177 | pairwise_features,
178 | cost_functions.null_cost,
179 | perceptron.get_labels()
180 | )
181 |
182 | logging.info("Reading in data.")
183 | testing_corpus = corpora.Corpus.from_file(
184 | "testing",
185 | codecs.open(args.input_filename, "r", "utf-8"))
186 |
187 | logging.info("Extracting system mentions.")
188 | for doc in testing_corpus:
189 | doc.system_mentions = mention_extractor.extract_system_mentions(doc)
190 |
191 | mention_entity_mapping, antecedent_mapping = experiments.predict(
192 | testing_corpus,
193 | extractor,
194 | perceptron,
195 | import_helper.import_from_path(args.clusterer)
196 | )
197 |
198 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping)
199 |
200 |
201 | logging.info("Write corpus to file.")
202 | testing_corpus.write_to_file(codecs.open(args.output_filename, "w", "utf-8"))
203 |
204 | if args.ante:
205 | logging.info("Write antecedent decisions to file")
206 | testing_corpus.write_antecedent_decisions_to_file(open(args.ante, "w"))
207 |
208 | if args.gold:
209 | logging.info("Evaluate.")
210 | print(get_scores(args.output_filename, args.gold))
211 |
212 | logging.info("Done.")
213 |
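214 | # A minimal example invocation (a sketch -- the file names are placeholders,
215 | # and the extractor/perceptron/clusterer paths are assumptions based on the
216 | # module layout under cort/coreference/):
217 | #
218 | # cort-predict-conll -in dev.conll -model model.obj -out dev.predicted \
219 | #     -extractor cort.coreference.approaches.mention_ranking.extract_substructures \
220 | #     -perceptron cort.coreference.approaches.mention_ranking.RankingPerceptron \
221 | #     -clusterer cort.coreference.clusterer.all_ante \
222 | #     -gold dev.gold.conll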
--------------------------------------------------------------------------------
/bin/cort-predict-raw:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from __future__ import print_function
5 | import argparse
6 | import codecs
7 | import logging
8 | import pickle
9 | import sys
10 |
11 |
12 | from cort.preprocessing import pipeline
13 | from cort.core import mention_extractor
14 | from cort.coreference import cost_functions
15 | from cort.coreference import experiments
16 | from cort.coreference import features
17 | from cort.coreference import instance_extractors
18 | from cort.util import import_helper
19 |
20 |
21 | __author__ = 'smartschat'
22 |
23 | logging.basicConfig(level=logging.INFO,
24 |                     format='%(asctime)s %(levelname)s %(message)s')
25 |
26 |
27 | def parse_args():
28 | parser = argparse.ArgumentParser(description='Predict coreference '
29 | 'relations.')
30 | parser.add_argument('-in',
31 | required=True,
32 | dest='input_filename',
33 | help='The raw text input files.',
34 | nargs="*")
35 | parser.add_argument('-model',
36 | required=True,
37 | dest='model',
38 | help='The model learned via cort-train.')
39 | parser.add_argument('-suffix',
40 | dest='suffix',
41 | default="out",
42 |                         help='Suffix for output files. Defaults to "out".')
43 | parser.add_argument('-extractor',
44 | dest='extractor',
45 | required=True,
46 | help='The function to extract instances.')
47 | parser.add_argument('-perceptron',
48 | dest='perceptron',
49 | required=True,
50 | help='The perceptron to use.')
51 | parser.add_argument('-clusterer',
52 | dest='clusterer',
53 | required=True,
54 | help='The clusterer to use.')
55 | parser.add_argument('-features',
56 | dest='features',
57 |                         help='The file containing the list of features. If not '
58 |                              'provided, defaults to a standard set of '
59 | 'features.')
60 | parser.add_argument('-corenlp',
61 | dest='corenlp',
62 | required=True,
63 | help='Location of CoreNLP jars.')
64 |
65 | return parser.parse_args()
66 |
67 |
68 | logging.basicConfig(level=logging.INFO,
69 |                     format='%(asctime)s %(levelname)s %(message)s')
70 |
71 | if sys.version_info[0] == 2:
72 | logging.warning("You are running cort under Python 2. cort is much more "
73 | "efficient under Python 3.3+.")
74 |
75 | args = parse_args()
76 |
77 | if args.features:
78 | mention_features, pairwise_features = import_helper.get_features(
79 | args.features)
80 | else:
81 | mention_features = [
82 | features.fine_type,
83 | features.gender,
84 | features.number,
85 | features.sem_class,
86 | features.deprel,
87 | features.head_ner,
88 | features.length,
89 | features.head,
90 | features.first,
91 | features.last,
92 | features.preceding_token,
93 | features.next_token,
94 | features.governor,
95 | features.ancestry
96 | ]
97 |
98 | pairwise_features = [
99 | features.exact_match,
100 | features.head_match,
101 | features.same_speaker,
102 | features.alias,
103 | features.sentence_distance,
104 | features.embedding,
105 | features.modifier,
106 | features.tokens_contained,
107 | features.head_contained,
108 | features.token_distance
109 | ]
110 |
111 |
112 | logging.info("Loading model.")
113 | priors, weights = pickle.load(open(args.model, "rb"))
114 |
115 | perceptron = import_helper.import_from_path(args.perceptron)(
116 | priors=priors,
117 | weights=weights,
118 | cost_scaling=0
119 | )
120 |
121 | extractor = instance_extractors.InstanceExtractor(
122 | import_helper.import_from_path(args.extractor),
123 | mention_features,
124 | pairwise_features,
125 | cost_functions.null_cost,
126 | perceptron.get_labels()
127 | )
128 |
129 | logging.info("Reading in and preprocessing data.")
130 | p = pipeline.Pipeline(args.corenlp)
131 |
132 | testing_corpus = p.run_on_docs("corpus", args.input_filename)
133 |
134 | logging.info("Extracting system mentions.")
135 | for doc in testing_corpus:
136 | doc.system_mentions = mention_extractor.extract_system_mentions(doc)
137 |
138 | mention_entity_mapping, antecedent_mapping = experiments.predict(
139 | testing_corpus,
140 | extractor,
141 | perceptron,
142 | import_helper.import_from_path(args.clusterer)
143 | )
144 |
145 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping)
146 |
147 | logging.info("Write output to file.")
148 |
149 | for doc in testing_corpus:
150 | output = doc.to_simple_output()
151 | my_file = codecs.open(doc.identifier + "." + args.suffix, "w", "utf-8")
152 | my_file.write(output)
153 | my_file.close()
154 |
155 | logging.info("Done.")
156 |
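157 | # A minimal example invocation (a sketch -- the names are placeholders, the
158 | # extractor/perceptron/clusterer paths are assumptions based on the module
159 | # layout under cort/coreference/, and -corenlp points to the CoreNLP jars):
160 | #
161 | # cort-predict-raw -in doc1.txt doc2.txt -model model.obj \
162 | #     -extractor cort.coreference.approaches.mention_ranking.extract_substructures \
163 | #     -perceptron cort.coreference.approaches.mention_ranking.RankingPerceptron \
164 | #     -clusterer cort.coreference.clusterer.all_ante \
165 | #     -corenlp /path/to/corenlp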
--------------------------------------------------------------------------------
/bin/cort-train:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import codecs
5 | import logging
6 | import pickle
7 | import sys
8 |
9 |
10 | from cort.core import corpora
11 | from cort.core import mention_extractor
12 | from cort.coreference import experiments
13 | from cort.coreference import features
14 | from cort.coreference import instance_extractors
15 | from cort.util import import_helper
16 |
17 |
18 | __author__ = 'smartschat'
19 |
20 |
21 | logging.basicConfig(level=logging.INFO,
22 |                     format='%(asctime)s %(levelname)s %(message)s')
23 |
24 |
25 | def parse_args():
26 | parser = argparse.ArgumentParser(description='Train coreference resolution '
27 | 'models.')
28 | parser.add_argument('-in',
29 | required=True,
30 | dest='input_filename',
31 | help='The input file. Must follow the format of the '
32 | 'CoNLL shared tasks on coreference resolution '
33 |                              '(see http://conll.cemantix.org/2012/data.html).')
34 | parser.add_argument('-out',
35 | dest='output_filename',
36 | required=True,
37 | help='The output file the learned model will be saved '
38 | 'to.')
39 | parser.add_argument('-extractor',
40 | dest='extractor',
41 | required=True,
42 | help='The function to extract instances.')
43 | parser.add_argument('-perceptron',
44 | dest='perceptron',
45 | required=True,
46 | help='The perceptron to use.')
47 | parser.add_argument('-cost_function',
48 | dest='cost_function',
49 | required=True,
50 | help='The cost function to use.')
51 | parser.add_argument('-n_iter',
52 | dest='n_iter',
53 | default=5,
54 | help='Number of perceptron iterations. Defaults to 5.')
55 | parser.add_argument('-cost_scaling',
56 | dest='cost_scaling',
57 | default=1,
58 | help='Scaling factor of the cost function. Defaults '
59 |                              'to 1.')
60 | parser.add_argument('-random_seed',
61 | dest='seed',
62 | default=23,
63 | help='Random seed for training data shuffling. '
64 | 'Defaults to 23.')
65 | parser.add_argument('-features',
66 | dest='features',
67 |                         help='The file containing the list of features. If not '
68 |                              'provided, defaults to a standard set of '
69 | 'features.')
70 |
71 | return parser.parse_args()
72 |
73 |
74 | if sys.version_info[0] == 2:
75 | logging.warning("You are running cort under Python 2. cort is much more "
76 | "efficient under Python 3.3+.")
77 |
78 | args = parse_args()
79 |
80 | if args.features:
81 | mention_features, pairwise_features = import_helper.get_features(
82 | args.features)
83 | else:
84 | mention_features = [
85 | features.fine_type,
86 | features.gender,
87 | features.number,
88 | features.sem_class,
89 | features.deprel,
90 | features.head_ner,
91 | features.length,
92 | features.head,
93 | features.first,
94 | features.last,
95 | features.preceding_token,
96 | features.next_token,
97 | features.governor,
98 | features.ancestry
99 | ]
100 |
101 | pairwise_features = [
102 | features.exact_match,
103 | features.head_match,
104 | features.same_speaker,
105 | features.alias,
106 | features.sentence_distance,
107 | features.embedding,
108 | features.modifier,
109 | features.tokens_contained,
110 | features.head_contained,
111 | features.token_distance
112 | ]
113 |
114 |
115 | perceptron = import_helper.import_from_path(args.perceptron)(
116 | cost_scaling=int(args.cost_scaling),
117 | n_iter=int(args.n_iter),
118 | seed=int(args.seed)
119 | )
120 |
121 | extractor = instance_extractors.InstanceExtractor(
122 | import_helper.import_from_path(args.extractor),
123 | mention_features,
124 | pairwise_features,
125 | import_helper.import_from_path(args.cost_function),
126 | perceptron.get_labels()
127 | )
128 |
129 | logging.info("Reading in data.")
130 | training_corpus = corpora.Corpus.from_file("training",
131 | codecs.open(args.input_filename,
132 | "r", "utf-8"))
133 |
134 | logging.info("Extracting system mentions.")
135 | for doc in training_corpus:
136 | doc.system_mentions = mention_extractor.extract_system_mentions(doc)
137 |
138 | model = experiments.learn(
139 | training_corpus,
140 | extractor,
141 | perceptron
142 | )
143 |
144 | logging.info("Writing model to file.")
145 | pickle.dump(model, open(args.output_filename, "wb"), protocol=2)
146 |
147 | logging.info("Done.")
148 |
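149 | # A minimal example invocation (a sketch -- the file names are placeholders,
150 | # and the extractor/perceptron/cost function paths are assumptions based on
151 | # the module layout under cort/coreference/):
152 | #
153 | # cort-train -in train.conll -out model.obj \
154 | #     -extractor cort.coreference.approaches.mention_ranking.extract_substructures \
155 | #     -perceptron cort.coreference.approaches.mention_ranking.RankingPerceptron \
156 | #     -cost_function cort.coreference.cost_functions.cost_based_on_consistency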
--------------------------------------------------------------------------------
/bin/cort-visualize:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | from __future__ import print_function
5 | import argparse
6 | import codecs
7 | import logging
8 |
9 |
10 | from cort.preprocessing import pipeline
11 | from cort.analysis import visualization, error_extractors, spanning_tree_algorithms
12 | from cort.core import corpora
13 |
14 |
15 | __author__ = 'smartschat'
16 |
17 |
18 | def parse_args():
19 | parser = argparse.ArgumentParser(description='Visualize output.')
20 | parser.add_argument('input_filename',
21 | help='The files to visualize',
22 | nargs='*')
23 | parser.add_argument('-corenlp',
24 | required=True,
25 | dest='corenlp',
26 |                         help='Location of the CoreNLP jars.')
27 |
28 | return parser.parse_args()
29 |
30 |
31 | logging.basicConfig(level=logging.INFO,
32 |                     format='%(asctime)s %(levelname)s %(message)s')
33 |
34 | args = parse_args()
35 |
36 | p = pipeline.Pipeline(args.corenlp, with_coref=True)
37 |
38 | corpus_to_visualize = p.run_on_docs("corpus", args.input_filename)
39 |
40 | ex = error_extractors.ErrorExtractor(corpus_to_visualize,
41 | spanning_tree_algorithms.recall_accessibility,
42 | spanning_tree_algorithms.precision_system_output)
43 |
44 | ex.add_system(corpus_to_visualize)
45 |
46 | decisions = ex.get_errors()
47 |
48 | visualizer = visualization.Visualizer(decisions, "corpus",
49 | for_raw_input=True)
50 |
51 | visualizer.run()
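52 |
53 | # A minimal example invocation (a sketch -- file names are placeholders):
54 | #
55 | # cort-visualize doc1.out doc2.out -corenlp /path/to/corenlp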
--------------------------------------------------------------------------------
/bin/run-multigraph:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import logging
5 |
6 | from cort.core import corpora
7 | from cort.core import mention_extractor
8 | from cort.coreference.multigraph import multigraphs, features, decoders, \
9 | weighting_functions
10 |
11 |
12 | logging.basicConfig(level=logging.INFO,
13 | format='%(asctime)s %(levelname)s %(message)s')
14 |
15 | parser = argparse.ArgumentParser(description='Run the multigraph coreference '
16 |                                              'resolution system.')
17 | parser.add_argument('-in',
18 | required=True,
19 | dest='input_filename',
20 | help='The input file. Must follow the format of the CoNLL '
21 | 'shared tasks on coreference resolution (see '
22 |                     'http://conll.cemantix.org/2012/data.html).')
23 | parser.add_argument('-out',
24 | dest='output_filename',
25 | required=True,
26 | help='The output file.')
27 | parser.add_argument('-ante',
28 | dest='antecedents_output_filename',
29 | default=None,
30 |                     help='The file where antecedent information should be '
31 | 'written to. Defaults to None.')
32 |
33 | args = parser.parse_args()
34 |
35 | logging.info("Reading in corpus")
36 |
37 | corpus = corpora.Corpus.from_file("my corpus",
38 | open(args.input_filename))
39 |
40 | logging.info("Extracting system mentions")
41 | for doc in corpus:
42 | doc.system_mentions = mention_extractor.extract_system_mentions(doc)
43 |
44 | negative_features = [features.not_modifier,
45 | features.not_compatible,
46 | features.not_embedding,
47 | features.not_speaker,
48 | features.not_singleton,
49 | features.not_pronoun_distance,
50 | features.not_anaphoric]
51 |
52 | positive_features = [features.alias,
53 | features.non_pronominal_string_match,
54 | features.head_match,
55 | features.pronoun_same_canonical_form,
56 | features.anaphor_pronoun,
57 | features.speaker,
58 | features.antecedent_is_subject,
59 | features.antecedent_is_object,
60 | features.substring,
61 | features.lexical]
62 |
63 | cmc = multigraphs.CorefMultigraphCreator(
64 | positive_features,
65 | negative_features,
66 | weighting_functions.for_each_relation_with_distance,
67 | {})
68 |
69 | relation_weights = {}
70 |
71 | for relation in positive_features:
72 | relation_weights[relation] = 1
73 |
74 | relation_weights[features.antecedent_is_object] = 0.5
75 |
76 | cmc.relation_weights = relation_weights
77 |
78 | logging.info("Decoding")
79 |
80 | decoder = decoders.MultigraphDecoder(cmc)
81 |
82 | decoder.decode(corpus)
83 |
84 | logging.info("Writing coreference to file")
85 |
86 | corpus.write_to_file(open(args.output_filename, 'w'))
87 |
88 | if args.antecedents_output_filename:
89 | logging.info("Writing antecedent decisions to file")
90 | corpus.write_antecedent_decisions_to_file(
91 | open(args.antecedents_output_filename, 'w'))
92 |
93 | logging.info("Finished")
94 |
--------------------------------------------------------------------------------
/cort/__init__.py:
--------------------------------------------------------------------------------
1 | """ cort - a toolkit for coreference resolution and error analysis. """
2 |
3 | __author__ = 'martscsn'
4 |
--------------------------------------------------------------------------------
/cort/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | """ Classes and functions for coreference resolution error analysis and
2 | visualisation. """
3 |
4 | __author__ = 'smartschat'
5 |
--------------------------------------------------------------------------------
/cort/analysis/error_extractors.py:
--------------------------------------------------------------------------------
1 | """ Extract errors made by systems w.r.t. a reference corpus. """
2 |
3 |
4 | from cort.analysis import data_structures
5 |
6 |
7 | __author__ = 'smartschat'
8 |
9 |
10 | class ErrorExtractor:
11 | """ Extract, manage and store recall and precision errors.
12 |
13 | Error extraction for recall errors works as follows:
14 |
15 | Go through each document. For each reference entity e in the document,
16 | construct an entity graph g_e for e and compute a partition of g_e by the
17 | system entity graphs. Then compute a spanning tree t_e of g_e and take
18 | every edge in t_e that does not appear in the partition as an error.
19 |
20 | For computing precision errors, switch the roles of reference and system
21 | entities.
22 |
23 | Attributes:
24 | reference_corpus (Corpus): The reference corpus with the gold
25 | information concerning the coreference relation.
26 | recall_spanning_tree_algorithm (function): A function mapping an
27 |             entity graph and one of its partitions to a list of mention pairs,
28 |             which represents a spanning tree of the entity graph. This
29 | function is used to compute recall errors.
30 | precision_spanning_tree_algorithm (function): Same as above, but for
31 | precision errors.
32 | errors (dict): A mapping of error descriptions to sets containing the
33 | respective errors.
34 | """
35 | def __init__(self,
36 | reference_corpus,
37 | recall_spanning_tree_algorithm,
38 | precision_spanning_tree_algorithm,
39 | ):
40 | """ Initialize the error analysis.
41 |
42 | Args:
43 | reference_corpus (Corpus): The reference corpus with the gold
44 | information concerning the coreference relation.
45 | recall_spanning_tree_algorithm (function): A function mapping an
46 |                 entity graph and one of its partitions to a list of mention pairs,
47 |                 which represents a spanning tree of the entity graph. This
48 | function is used to compute recall errors.
49 | precision_spanning_tree_algorithm (function): Same as above, but for
50 | precision errors.
51 | """
52 |
53 | self.reference_corpus = reference_corpus
54 | self.recall_spanning_tree_algorithm = recall_spanning_tree_algorithm
55 | self.precision_spanning_tree_algorithm = \
56 | precision_spanning_tree_algorithm
57 | self.errors = {}
58 | self.corpora = {}
59 |
60 | def add_system(self, system_corpus, which_mentions="annotated"):
61 | """ Add a system to the error analysis.
62 |
63 | Error extraction for recall errors works as follows:
64 |
65 | Go through each document. For each reference entity e in the document,
66 | construct an entity graph g_e for e and compute a partition of g_e by
67 | the system entity graphs. Then compute a spanning tree t_e of g_e and
68 | take every edge in t_e that does not appear in the partition as an
69 | error.
70 |
71 | For computing precision errors, switch the roles of reference and system
72 | entities.
73 |
74 | Also extracts all pairwise decisions (if available).
75 |
76 | Args:
77 | system_corpus (Corpus): A corpus obtained from system output.
78 | which_mentions (str): Either "annotated" or "extracted",
79 | defaults to "annotated". Specifies from which mentions in
80 | the system corpus coreference information should be
81 | obtained, either annotated mentions or system mentions.
82 | """
83 | if which_mentions not in ["annotated", "extracted"]:
84 |             raise ValueError("which_mentions must be "
85 | "either 'annotated' or 'extracted'.")
86 |
87 | recall_errors, precision_errors = self.__compute_errors(system_corpus,
88 | which_mentions)
89 |
90 | self.errors[system_corpus.description] = {
91 | "recall_errors": {},
92 | "precision_errors": {},
93 | "decisions": {}
94 | }
95 |
96 | self.errors[system_corpus.description]["recall_errors"]["all"] = \
97 | recall_errors
98 | self.errors[
99 | system_corpus.description]["precision_errors"]["all"] = \
100 | precision_errors
101 | self.errors[
102 | system_corpus.description]["decisions"]["all"] = \
103 | system_corpus.get_antecedent_decisions()[
104 | system_corpus.description]["decisions"]["all"]
105 |
106 | self.corpora[system_corpus.description] = system_corpus
107 |
108 | def get_errors(self):
109 |         """ Get errors for all systems managed by this ErrorExtractor.
110 |
111 |         The errors are stored in a ``StructuredCoreferenceAnalysis``,
112 |         which can be accessed like a dict.
113 |
114 |         If a corpus with the description
115 |         ``ranking`` was added via ``self.add_system``,
116 |         ``self.errors["ranking"]["recall_errors"]["all"]`` is an ``EnhancedSet``
117 | containing all recall errors of the system. Errors of other systems
118 | and precision errors can be accessed analogously.
119 |
120 | Returns:
121 | StructuredCoreferenceAnalysis: A StructuredCoreferenceAnalysis
122 | containing the errors.
123 | """
124 | return data_structures.StructuredCoreferenceAnalysis(
125 | self.errors, corpora=self.corpora,
126 | reference=self.reference_corpus)
127 |
128 | def __compute_errors(self, system_corpus, which_mentions):
129 | gold_graphs = [data_structures.EntityGraph.from_mentions(
130 | doc.annotated_mentions, "annotated_set_id")
131 | for doc in self.reference_corpus.documents]
132 |
133 | if which_mentions == 'annotated':
134 | system_graphs = [data_structures.EntityGraph.from_mentions(
135 | doc.annotated_mentions, "annotated_set_id")
136 | for doc in system_corpus.documents]
137 | else:
138 | system_graphs = [data_structures.EntityGraph.from_mentions(
139 | doc.system_mentions, "set_id")
140 | for doc in system_corpus.documents]
141 |
142 | recall_errors = []
143 | precision_errors = []
144 |
145 | for doc_gold_graphs, doc_system_graphs in zip(gold_graphs,
146 | system_graphs):
147 | recall_errors.extend(
148 | self.__compute_errors_for_doc(
149 | doc_gold_graphs,
150 | doc_system_graphs,
151 | self.recall_spanning_tree_algorithm))
152 | precision_errors.extend(
153 | self.__compute_errors_for_doc(
154 | doc_system_graphs,
155 | doc_gold_graphs,
156 | self.precision_spanning_tree_algorithm))
157 |
158 | return (data_structures.EnhancedSet(recall_errors),
159 | data_structures.EnhancedSet(precision_errors))
160 |
161 | @staticmethod
162 | def __compute_errors_for_doc(base_graphs,
163 | partitioning_graphs,
164 | spanning_tree_algorithm):
165 | errors = []
166 |
167 | for graph in base_graphs:
168 | errors.extend(
169 | ErrorExtractor.__compute_errors_for_graph(
170 | graph, partitioning_graphs, spanning_tree_algorithm))
171 |
172 | return errors
173 |
174 | @staticmethod
175 | def __compute_errors_for_graph(graph,
176 | partitioning_graphs,
177 | spanning_tree_algorithm):
178 | partitioned_graph = graph.partition(partitioning_graphs)
179 | spanning_tree = spanning_tree_algorithm(graph, partitioned_graph)
180 | extra_pairs = [
181 | (anaphor, antecedent) for anaphor, antecedent in spanning_tree
182 | if anaphor not in partitioned_graph.edges or
183 | antecedent not in partitioned_graph.edges[anaphor]
184 | ]
185 |
186 | return [(anaphor, antecedent) for anaphor, antecedent in sorted(
187 | extra_pairs)]
188 |
--------------------------------------------------------------------------------
/cort/analysis/plotting.py:
--------------------------------------------------------------------------------
1 | """ Plot error analysis statistics. """
2 |
3 | from __future__ import division
4 |
5 |
6 | from matplotlib import pyplot
7 | from matplotlib import cm
8 |
9 | import numpy
10 |
11 | from pylab import rcParams
12 |
13 |
14 | __author__ = 'martscsn'
15 |
16 |
17 | def plot(data,
18 | title,
19 | xlabel,
20 | ylabel,
21 | filename=None):
22 | """ Plot error analysis statistics.
23 |
24 | In particular, plot a bar chart for the numbers described in ``data``.
25 |
26 | Args:
27 | data (list(str, list((str,int)))): The data to be plotted. The ith entry
28 | of this list contains the name which will appear in the legend,
29 | and a list of (category, count) pairs. These are the individual
30 | data points which will be plotted.
31 | title (str): Title of the plot.
32 | xlabel (str): Label of the x axis.
33 | ylabel (str): Label of the y axis.
34 | filename (str, optional): If set, write plot to ``filename``.
35 |
36 | Example::
37 | pair_errs = errors["pair"]["recall_errors"]["all"]
38 | tree_errs = errors["tree"]["recall_errors"]["all"]
39 |
40 | plot(
41 | [("pair", [(cat, len(pair_errs[cat])) for cat in pair_errs.keys()]),
42 | ("tree", [(cat, len(tree_errs[cat])) for cat in tree_errs.keys()])],
43 | "Recall Errors",
44 | "Type of anaphor",
45 | "Number of Errors")
46 | """
47 |
48 | rcParams['xtick.major.pad'] = '12'
49 | rcParams['ytick.major.pad'] = '12'
50 |
51 | fig, ax = pyplot.subplots()
52 |
53 | systems = []
54 | categories = []
55 |
56 | colors = cm.Accent(numpy.linspace(0, 1, len(data)))
57 |
58 | bars_for_legend = []
59 |
60 | for i, system_data in enumerate(data):
61 | system_name, categories_and_numbers = system_data
62 | systems.append(system_name)
63 |
64 | for j, cat_and_number in enumerate(categories_and_numbers):
65 | category, number = cat_and_number
66 |
67 | if category not in categories:
68 | categories.append(category)
69 |
70 | bar = ax.bar(2*j + i*(1/len(data)), number, color=colors[i],
71 | width=1/len(data), label=system_name)
72 |
73 | if j == 0:
74 | bars_for_legend.append(bar)
75 |
76 | xticks = [2*k + 0.5 for k in range(0, len(categories))]
77 |
78 | pyplot.title(title, fontsize=28)
79 | pyplot.xlabel(xlabel, fontsize=24)
80 | pyplot.ylabel(ylabel, fontsize=24)
81 |
82 | ax.spines["top"].set_visible(False)
83 | ax.spines["right"].set_visible(False)
84 |
85 | ax.get_xaxis().tick_bottom()
86 | ax.get_yaxis().tick_left()
87 |
88 | ax.set_xticklabels(categories)
89 | ax.set_xticks(xticks)
90 |
91 | pyplot.tick_params(axis='both', which='major', labelsize=20)
92 |
93 | if filename:
94 | legend = ax.legend(bars_for_legend, systems,
95 | loc='upper right', bbox_to_anchor=(1.2, 1.2))
96 |
97 | fig.savefig(filename, bbox_extra_artists=(legend,), bbox_inches='tight')
98 | else:
99 | legend = ax.legend(bars_for_legend, systems, loc='upper right')
100 | legend.draggable()
101 |
102 | fig.show()
103 |
--------------------------------------------------------------------------------
/cort/analysis/spanning_tree_algorithms.py:
--------------------------------------------------------------------------------
1 | """ Algorithms for computing spanning trees of entity graphs. """
2 |
3 |
4 | __author__ = 'smartschat'
5 |
6 |
7 | def precision_system_output(entity, partitioned_entity):
8 | """ Compute a spanning tree from antecedent information.
9 |
10 | All edges in the spanning tree correspond to anaphor-antecedent pairs. In
11 | order to access this antecedent information, the attribute "antecedent" of
12 | the mentions in the entity must be set.
13 |
14 | Args:
15 | entity (EntityGraph): The EntityGraph for the entity for which the
16 | spanning tree should be computed.
17 | partitioned_entity (EntityGraph): A partition of the entity -- not
18 | used for this algorithm.
19 |
20 | Returns:
21 | list(Mention, Mention): A list of mention pairs, which constitute the
22 | edges of the spanning tree. For a pair (m, n), n appears later in
23 | the text than m.
24 | """
25 | edges = []
26 | for mention in entity.edges:
27 | # just look at system output
28 | if ("antecedent" in mention.attributes
29 | and mention.attributes["antecedent"] in entity.edges[mention]):
30 | edges.append((mention, mention.attributes["antecedent"]))
31 |
32 | return sorted(edges)
33 |
34 |
35 | def recall_closest(entity, partitioned_entity):
36 | """ Compute a spanning tree by always taking the closest mention in the same
37 | entity.
38 |
39 | Args:
40 | entity (EntityGraph): The EntityGraph for the entity for which the
41 | spanning tree should be computed.
42 |         partitioned_entity (EntityGraph): A partition of the entity. If a
43 |             mention has neighbors in the partition, the closest one is taken.
44 |
45 | Returns:
46 | list(Mention, Mention): A list of mention pairs, which constitute the
47 | edges of the spanning tree. For a pair (m, n), n appears later in
48 | the text than m.
49 | """
50 | edges = []
51 | for mention in entity.edges:
52 | # always take closest (except for first mention in entity, which does
53 | # not have any antecedent)
54 | if entity.edges[mention]:
55 | if mention in partitioned_entity.edges:
56 | antecedent = sorted(partitioned_entity.edges[mention],
57 | reverse=True)[0]
58 | else:
59 | antecedent = sorted(entity.edges[mention], reverse=True)[0]
60 | edges.append((mention, antecedent))
61 |
62 | return sorted(edges)
63 |
64 |
65 | def recall_accessibility(entity, partitioned_entity):
66 | """ Compute a spanning tree by choosing edges according to the accessibility
67 | of the antecedent.
68 |
69 | First, if a mention has an out-degree of at least one in the partitioned
70 | entity, take the edge with the closest mention distance as an edge for
71 | the spanning tree. Otherwise, proceed as follows.
72 |
73 | If a mention m is a proper name or a common noun, choose an antecedent as
74 | follows:
75 |
76 | - if a proper name antecedent exists, take the closest and output this
77 | pair as an edge
78 | - else if a common noun antecedent exists, take the closest and output
79 | this pair as an edge
80 | - else take the closest preceding mention and output this pair as an
81 | edge
82 |
83 | For all other mentions, take the closest preceding mention and output
84 | this pair as an edge.
85 |
86 | Args:
87 | entity (EntityGraph): The EntityGraph for the entity for which the
88 | spanning tree should be computed.
89 |         partitioned_entity (EntityGraph): A partition of the entity, used
90 |             to determine which mentions already have an antecedent.
91 |
92 | Returns:
93 | list(Mention, Mention): A list of mention pairs, which constitute the
94 | edges of the spanning tree. For a pair (m, n), n appears later in
95 | the text than m.
96 | """
97 | edges = []
98 | for mention in entity.edges:
99 | if entity.edges[mention]:
100 | # mention is not the first in subentity? take closest!
101 | if mention in partitioned_entity.edges:
102 | antecedent = sorted(partitioned_entity.edges[mention],
103 | reverse=True)[0]
104 | else:
105 | antecedent = __get_antecedent_by_type(mention,
106 | entity.edges[mention])
107 |
108 | edges.append((mention, antecedent))
109 |
110 | return sorted(edges)
111 |
112 |
113 | def __get_antecedent_by_type(mention, candidates):
114 |     # sort so that the closest candidate comes first
115 | candidates_reversed = sorted(candidates, reverse=True)
116 | # mention is (demonstrative) pronoun? take closest!
117 | if (mention.attributes["type"] == "PRO" or
118 | mention.attributes["type"] == "DEM"):
119 | return candidates_reversed[0]
120 |     # otherwise choose by type, back off to closest
121 | elif __get_by_pos(candidates_reversed, "NAM"):
122 | return __get_by_pos(candidates_reversed, "NAM")
123 | elif __get_by_pos(candidates_reversed, "NOM"):
124 | return __get_by_pos(candidates_reversed, "NOM")
125 | else:
126 | return candidates_reversed[0]
127 |
128 |
129 | def __get_by_pos(candidates, pos):
130 | for mention in candidates:
131 | if mention.attributes["type"] == pos:
132 | return mention
133 |
--------------------------------------------------------------------------------
/cort/analysis/visualization/TODO:
--------------------------------------------------------------------------------
1 | Python:
2 | - use python http server in order to avoid multi-megabyte html blobs (simplehttpserver)
3 | !- use discernible colours: https://github.com/gtaylor/python-colormath
4 |
5 | jQuery/javascript:
6 | - improve mentionhead tooltip behaviour
7 | - Dynamic computation of heights, etc. in scroll()
--------------------------------------------------------------------------------
/cort/analysis/visualization/style.css:
--------------------------------------------------------------------------------
1 | html, body {
2 | margin: 0;
3 | font-family: Sans-Serif;
4 | }
5 |
6 | h1 {
7 | padding: 5px;
8 | margin: 0;
9 | text-align: left;
10 | }
11 |
12 | h3 {
13 | margin: 0 0 0 10px;
14 | padding: 0;
15 | font-family: Sans-Serif;
16 | font-size: 1em;
17 | }
18 |
19 | #header {
20 | background-color: rgb(1,70,153);
21 | margin: 0;
22 | padding: 5px;
23 | height: 50px;
24 | width: 100%;
25 | color: white;
26 | position: fixed;
27 | top: 0;
28 | z-index: 25;
29 | }
30 |
31 | #documentsNavi {
32 | margin: 10px 0 0 0;
33 | padding: 0;
34 | width: 225px; /* Must be same as .navcontainer*/
35 | position: fixed;
36 | top: 60px;
37 | float: left;
38 | }
39 |
40 | #documentsNavi ul {
41 | margin: 5px 0 0 10px;
42 | padding: 5px 0;
43 | list-style-type: none;
44 | height: 100px;
45 | overflow: auto;
46 | font-size: .8em;
47 | background-color: #bbbbbb;
48 | }
49 |
50 | #documentsNavi ul li {
51 | margin: 0;
52 | padding: 5px;
53 | cursor: pointer;
54 | }
55 |
56 | #documentsNavi li:nth-child(even) {
57 | background-color: #bbbbbb;
58 | }
59 |
60 | #documentsNavi li:nth-child(odd) {
61 | background-color: #cccccc;
62 | }
63 |
64 | #documentsNavi ul li:hover, #documentsNavi ul li:active {
65 | background-color: gray;
66 | }
67 |
68 | #documentsNavi ul li.highlight {
69 | font-weight: bolder;
70 | }
71 |
72 | /* Contains navigation bars and the document text itself */
73 | .document {
74 | margin: 80px 10px 0 0;
75 | padding: 0;
76 | display: none;
77 | min-height: 600px;
78 | }
79 |
80 | #documentsNavi + .document {
81 | display: block;
82 | }
83 |
84 | .navcontainer {
85 | margin: 0;
86 | padding: 0;
87 | position: fixed;
88 | top: 200px;
89 | width: 225px;
90 | }
91 |
92 | .navcontainer > div {
93 | margin-top: 20px;
94 | padding: 0;
95 | }
96 |
97 | .tease {
98 | display: none;
99 | opacity: .8;
100 | margin: 0 0 0 5px;
101 | padding: 0;
102 | font-family: Sans-Serif;
103 | font-size: .8em;
104 | }
105 |
106 | .navcontainer > div h3:hover {
107 | display: inline-block;
108 | cursor: pointer;
109 | }
110 |
111 | .navcontainer > div h3:hover + .tease {
112 | display: inline-block;
113 | }
114 |
115 | /* Gold and system navigation boxes */
116 | .navcontainer > div > ul {
117 | margin: 5px 0 0 10px;
118 | padding: 5px 0;
119 | list-style-type: none;
120 | overflow-y: auto;
121 | max-height: 80px;
122 | font-size: .8em;
123 | background-color: #bbbbbb;
124 | }
125 |
126 | div.navcontainer div ul li:nth-child(even) {
127 | background-color: #bbbbbb;
128 | }
129 |
130 | div.navcontainer div ul li:nth-child(odd) {
131 | background-color: #cccccc;
132 | }
133 |
134 | .navcontainer > div ul li {
135 | margin: 0;
136 | padding: 2px;
137 | }
138 |
139 | .navcontainer > div ul li:hover {
140 | cursor: pointer;
141 | }
142 |
143 | /* Errors navigation box */
144 | div.errorsNavi {
145 |
146 | }
147 |
148 | div.errorsNavi h4 {
149 | margin: 0;
150 | padding: 2px;
151 | font-size: .9em;
152 |     font-weight: lighter;
153 | }
154 |
155 | div.errorsNavi h4:hover {
156 | display: inline-block;
157 | cursor: pointer;
158 | }
159 |
160 | div.errorsNavi h4:hover + .tease {
161 | display: inline-block;
162 | }
163 |
164 | div.errorsNavi > div {
165 | margin: 0 0 0 10px;
166 | padding: 5px 0;
167 | background-color: #eeeeee;
168 | }
169 |
170 | .precisionErrors, .recallErrors {
171 | margin: 0;
172 | padding: 5px 0;
173 | list-style-type: none;
174 | font-size: .8em;
175 | height: 80px;
176 | overflow: auto;
177 | }
178 |
179 | ol.text {
180 | margin: 10px 0 0 250px;
181 | padding: 5px;
182 | line-height: 250%;
183 | font-family: Sans-Serif;
184 | font-size: .9em;
185 | background-color: #eeeeee;
186 | }
187 |
188 | ol.text {
189 | counter-reset: li;
190 | display: table;
191 | }
192 |
193 | ol.text li.sentence {
194 | margin: 0;
195 | padding: 0;
196 | }
197 |
198 | ol.text > li {
199 | margin: 0 0 6px 2em;
200 | padding: 4px 8px;
201 | list-style: none;
202 | counter-increment: li;
203 | display: table-row;
204 | }
205 |
206 | ol.text > li:before {
207 | content: counter(li) ".";
208 | font-size: .7em;
209 | color: gray;
210 | display: inline-block;
211 | width: 20px;
212 | text-align: right;
213 | padding-right: 5px;
214 | display: table-cell;
215 | }
216 |
217 | ol.text span.mention {
218 | margin: 0;
219 | display: inline;
220 | border-radius: 0.5em;
221 | }
222 |
223 | ol.text span.mention:hover {
224 | cursor: pointer;
225 | }
226 |
227 | div ol.text .goldBorder {
228 | border: 3px solid gold;
229 | }
230 |
231 | div ol.text .blueBorder {
232 | border: 3px solid blue;
233 | }
234 |
235 | ol.text *[class^='system']{
236 | border: 1px solid blue;
237 | padding: 5px;
238 | }
239 |
240 | ol.text *[class^='gold']{
241 | border: 1px solid gold;
242 | padding: 2px;
243 | }
244 | /*
245 | span.transparentBg, .goldNavi ul li.transparentBg, .systemNavi ul li.transparentBg {
246 | background-color: transparent;
247 | border: 3px solid transparent;
248 | }
249 | */
250 | .label {
251 | line-height: 100%;
252 | background-color: #F1F101;
253 | z-index: 24;
254 | opacity: .9;
255 | box-shadow: 2px 2px 13px #aaa;
256 | }
257 |
258 | .label:hover {
259 | display: block;
260 | }
--------------------------------------------------------------------------------
/cort/config_files/corenlp.ini:
--------------------------------------------------------------------------------
1 | annotators = tokenize,ssplit,pos,lemma,parse,ner
--------------------------------------------------------------------------------
/cort/config_files/corenlp_with_coref.ini:
--------------------------------------------------------------------------------
1 | annotators = tokenize,ssplit,pos,lemma,parse,ner
2 | tokenize.whitespace = true
3 | ssplit.eolonly = true
--------------------------------------------------------------------------------
/cort/core/__init__.py:
--------------------------------------------------------------------------------
1 | """ Includes core functionality for managing documents and mentions."""
2 |
3 | __author__ = 'martscsn'
4 |
--------------------------------------------------------------------------------
/cort/core/external_data.py:
--------------------------------------------------------------------------------
1 | """ Read in and access data from external resources such as gender lists."""
2 |
3 | import os
4 | import pickle
5 |
6 |
7 | import cort
8 | from cort.core import singletons
9 | from cort.core import util
10 |
11 |
12 | __author__ = 'smartschat'
13 |
14 |
15 | @singletons.Singleton
16 | class GenderData:
17 | """ Read in and access data from lists with gender information.
18 |
19 | Attributes:
20 | word_to_gender (dict(str, str)): A mapping from lower-case strings
21 | to one of four genders: 'MALE', 'FEMALE', 'NEUTRAL' and 'PLURAL'.
22 | """
23 | def __init__(self):
24 | """ Initialize the word-to-gender mapping from gender lists.
25 | """
26 | self.word_to_gender = {}
27 |
28 | directory = cort.__path__[0] + "/resources/"
29 |
30 | lists = [
31 | open(directory + "male.list"),
32 | open(directory + "female.list"),
33 | open(directory + "neutral.list"),
34 | open(directory + "plural.list")
35 | ]
36 |
37 | genders = ["MALE", "FEMALE", "NEUTRAL", "PLURAL"]
38 |
39 | for gender, gender_list in zip(genders, lists):
40 | for word in gender_list.readlines():
41 | self.word_to_gender[word.strip()] = gender
42 |
43 | def look_up(self, attributes):
44 | """ Look up the gender of a mention described by the input attributes.
45 |
46 | Args:
47 | attributes (dict(str,object)): A dict describing attributes of
48 | mentions. Must contain "tokens" and "head", which have lists
49 | of strings as values.
50 |
51 | Returns:
52 | (str): None or one of the four genders 'MALE', 'FEMALE',
53 | 'NEUTRAL' or 'PLURAL'.
54 | """
55 | # whole string
56 | if " ".join(attributes["tokens"]).lower() in self.word_to_gender:
57 | return self.word_to_gender[" ".join(attributes["tokens"]).lower()]
58 | # head
59 | elif " ".join(attributes["head"]).lower() in self.word_to_gender:
60 | return self.word_to_gender[" ".join(attributes["head"]).lower()]
61 | # head token by token
62 |         else:
63 |             return self.__look_up_token_by_token(attributes["head"])
64 |
65 | def __look_up_token_by_token(self, tokens):
66 | for token in tokens:
67 | if token[0].isupper() and token.lower() in self.word_to_gender:
68 | return self.word_to_gender[token.lower()]
69 |
70 |
71 | @singletons.Singleton
72 | class LexicalData:
73 | """ Read in and access data containing pairs of coreferent mention strings.
74 |
75 | Attributes:
76 | pairs (set((str, str))): A set of string pairs, which represent strings
77 | of potentially coreferent mentions.
78 | """
79 | def __init__(self):
80 | """ Initialize the set of pairs from
81 | package_root/resources/coreferent_pairs.obj.
82 | """
83 | directory = cort.__path__[0] + "/resources/"
84 |
85 | self.pairs = pickle.load(
86 | open(directory + "coreferent_pairs.obj", "rb"))
87 |
88 | def look_up(self, anaphor, antecedent):
89 | """ Look up strings of the mentions in the pair list.
90 |
91 | Args:
92 | anaphor (Mention): A mention.
93 | antecedent (Mention): Another mention, the candidate antecedent
94 | for anaphor.
95 |
96 | Returns:
97 |             True if the pair of strings corresponding to anaphor and
98 |             antecedent, stripped of determiners and possessive 's', can
99 |             be found in the list of pairs.
100 | """
101 | # whole string
102 | anaphor_cleaned = " ".join(
103 | util.clean_via_pos(anaphor.attributes["tokens"],
104 | anaphor.attributes["pos"]))
105 | antecedent_cleaned = " ".join(
106 | util.clean_via_pos(antecedent.attributes["tokens"],
107 | antecedent.attributes["pos"]))
108 |
109 | return (
110 | (anaphor_cleaned, antecedent_cleaned) in self.pairs
111 | or (antecedent_cleaned, anaphor_cleaned) in self.pairs
112 | )
113 |
114 |
115 | @singletons.Singleton
116 | class SingletonMentions:
117 |     """ Read in and access strings of potential singleton mentions.
118 |
119 | Attributes:
120 |         singletons (set(str)): A set of strings, which represent strings
121 |             of potential singleton mentions.
122 | """
123 | def __init__(self):
124 |         """ Initialize the set of singleton mention strings from
125 | package_root/resources/singletons_not_cleaned.obj.
126 | """
127 | directory = cort.__path__[0] + "/resources/"
128 |
129 | self.singletons = pickle.load(
130 | open(directory + "singletons_not_cleaned.obj", "rb"))
131 |
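
A minimal usage sketch for the accessors above (the results depend on the
word lists and pickled resources shipped in cort/resources):

    from cort.core import external_data

    gender_data = external_data.GenderData.get_instance()
    # look_up consults the whole phrase, then the head, then the head
    # token by token; "actress" would come from female.list
    gender = gender_data.look_up({"tokens": ["the", "actress"],
                                  "head": ["actress"]})  # e.g. 'FEMALE'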
--------------------------------------------------------------------------------
/cort/core/mixins.py:
--------------------------------------------------------------------------------
1 | """ Mixins. """
2 |
3 |
4 | __author__ = 'smartschat'
5 |
6 |
7 | class ComparableMixin:
8 | """ A mixin for deducing comparison operators from __lt__. """
9 | def __eq__(self, other):
10 |         # ``self`` is never None when this method is invoked, so it
11 |         # suffices to guard against a None ``other``.
12 |         if other is None:
13 |             return False
14 |         else:
15 | return not self < other and not other < self
16 |
17 | def __ne__(self, other):
18 | return self < other or other < self
19 |
20 | def __gt__(self, other):
21 | return other < self
22 |
23 | def __ge__(self, other):
24 | return not self < other
25 |
26 | def __le__(self, other):
27 | return not other < self
28 |
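
A sketch of the mixin in action; the hypothetical Version class below
defines only __lt__ and inherits all other comparisons:

    from cort.core import mixins

    class Version(mixins.ComparableMixin):
        def __init__(self, number):
            self.number = number

        def __lt__(self, other):
            return self.number < other.number

    assert Version(1) < Version(2)   # defined directly
    assert Version(2) >= Version(2)  # derived: not self < other
    assert Version(1) != Version(3)  # derived: self < other or other < self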
--------------------------------------------------------------------------------
/cort/core/singletons.py:
--------------------------------------------------------------------------------
1 | """ Implements the singleton pattern. """
2 |
3 |
4 | __author__ = 'smartschat'
5 |
6 |
7 | class Singleton:
8 | """
9 | A non-thread-safe helper class to ease implementing singletons.
10 | This should be used as a decorator -- not a metaclass -- to the
11 | class that should be a singleton.
12 |
13 | The decorated class can define one `__init__` function that
14 | takes only the `self` argument. Other than that, there are
15 | no restrictions that apply to the decorated class.
16 |
17 | To get the singleton instance, use the `get_instance` method. Trying
18 | to use `__call__` will result in a `TypeError` being raised.
19 |
20 | Limitations: The decorated class cannot be inherited from.
21 |
22 | Source:
23 | http://stackoverflow.com/questions/42558/python-and-the-singleton-pattern
24 |
25 | """
26 |
27 | def __init__(self, decorated):
28 | self._decorated = decorated
29 | self._instance = None
30 |
31 | def get_instance(self):
32 | """
33 | Returns the singleton instance. Upon its first call, it creates a
34 | new instance of the decorated class and calls its `__init__` method.
35 | On all subsequent calls, the already created instance is returned.
36 |
37 | """
38 | if self._instance:
39 | return self._instance
40 | else:
41 | self._instance = self._decorated()
42 | return self._instance
43 |
44 | def __call__(self):
45 | raise TypeError('Singletons must be accessed through '
46 | '`get_instance()`.')
47 |
48 | def __instancecheck__(self, inst):
49 | return isinstance(inst, self._decorated)
50 |
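
Usage sketch matching the docstring above (Config is a hypothetical
example class):

    from cort.core import singletons

    @singletons.Singleton
    class Config:
        def __init__(self):
            self.value = 42

    first = Config.get_instance()
    second = Config.get_instance()
    assert first is second  # created once, then reused
    # Config() would raise TypeError: instances must be obtained
    # via get_instance()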
--------------------------------------------------------------------------------
/cort/core/spans.py:
--------------------------------------------------------------------------------
1 | """ Manage spans in documents. """
2 |
3 | from cort.core import mixins
4 |
5 |
6 | __author__ = 'smartschat'
7 |
8 |
9 | class Span(mixins.ComparableMixin):
10 | """ Manage and compare spans in documents.
11 |
12 | Attributes:
13 | begin (int): The begin of the span.
14 | end (int): The end of the span (inclusive).
15 | """
16 | def __init__(self, begin, end):
17 | """ Initialize a span from a begin and an end position.
18 |
19 | Args:
20 | begin (int): The begin of the span.
21 | end (int): The end of the span.
22 | """
23 | self.begin = begin
24 | self.end = end
25 |
26 | def __str__(self):
27 | return "(" + str(self.begin) + ", " + str(self.end) + ")"
28 |
29 | def __repr__(self):
30 | return "(" + str(self.begin) + ", " + str(self.end) + ")"
31 |
32 | def __lt__(self, other):
33 | """ Check whether this span is less than another span.
34 |
35 | (a,b) < (c,d) if and only if a < c or a = c and b < d
36 |
37 | Args:
38 | other (Span): A span.
39 |
40 | Returns:
41 | True if this span is less than other, False otherwise.
42 | """
43 | if self.begin < other.begin:
44 | return True
45 | elif self.begin > other.begin:
46 | return False
47 | elif self.end < other.end:
48 | return True
49 | else:
50 | return False
51 |
52 | def embeds(self, other):
53 | """ Check whether this span embeds another span.
54 |
55 | Args:
56 | other (Span): A span.
57 |
58 | Returns:
59 | True if this span embeds other, False otherwise.
60 | """
61 | return self.begin <= other.begin and self.end >= other.end
62 |
63 | def __hash__(self):
64 | return hash((self.begin, self.end))
65 |
66 | @staticmethod
67 | def parse(span_string):
68 | """ Parse a string specification of a span to a Span object.
69 |
70 | Valid representations are for example "(1, 2)" or "(1,2)".
71 |
72 | Args:
73 | span_string (str): A string representation of a span.
74 |
75 | Returns:
76 | Span: The span corresponding to the string representation.
77 | """
78 | without_brackets = span_string.strip()[1:-1]
79 | splitted_and_stripped = [token.strip() for token
80 | in without_brackets.split(",")]
81 | return Span(
82 | int(splitted_and_stripped[0]),
83 | int(splitted_and_stripped[1]))
84 |
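
A few examples of the ordering, containment and parsing semantics defined
above:

    from cort.core.spans import Span

    assert Span.parse("(1, 2)") == Span(1, 2)
    assert Span(0, 3) < Span(1, 2)        # earlier begin wins
    assert Span(1, 2) < Span(1, 4)        # same begin: smaller end wins
    assert Span(0, 5).embeds(Span(2, 3))  # both ends are inclusive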
--------------------------------------------------------------------------------
/cort/core/util.py:
--------------------------------------------------------------------------------
1 | """ Utility functions. """
2 |
3 | __author__ = 'smartschat'
4 |
5 |
6 | def clean_via_pos(tokens, pos):
7 | """ Clean a list of tokens according to their part-of-speech tags.
8 |
9 | In particular, retain only tokens which do not have the part-of-speech tag
10 | DT (determiner) or POS (possessive 's').
11 |
12 | Args:
13 | tokens (list(str)): A list of tokens.
14 | pos (list(str)): A list of corresponding part-of-speech tags.
15 |
16 | Returns:
17 | list(str): The list of tokens which do not have part-of-speech tag
18 | DT or POS.
19 | """
20 |     return [token for token, tag in zip(tokens, pos)
21 |             if tag not in ["DT", "POS"]]
22 |
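
For example:

    >>> clean_via_pos(["the", "guild", "'s", "president"],
    ...               ["DT", "NN", "POS", "NN"])
    ['guild', 'president']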
--------------------------------------------------------------------------------
/cort/coreference/__init__.py:
--------------------------------------------------------------------------------
1 | """ Includes a unified framework for representation and learning of coreference
2 | resolution approaches."""
3 |
4 | __author__ = 'martscsn'
5 |
--------------------------------------------------------------------------------
/cort/coreference/approaches/__init__.py:
--------------------------------------------------------------------------------
1 | ''' Contains implementations of various coreference resolution approaches in
2 | the unified framework.
3 | '''
4 |
5 | __author__ = 'martscsn'
6 |
--------------------------------------------------------------------------------
/cort/coreference/approaches/antecedent_trees.py:
--------------------------------------------------------------------------------
1 | """ Implements instance extraction and decoding for antecedent trees.
2 |
3 | This module implements antecedent trees (Fernandes et al., 2014) within a
4 | framework that expresses coreference resolution as predicting latent structures,
5 | while performing learning using a latent structured perceptron with
6 | cost-augmented inference.
7 |
8 | Hence, antecedent trees are expressed as predicting a latent graph.
9 | In particular, let m_1, ..., m_n be all mentions in a document. Let m_0 be a
10 | dummy mention for anaphoricity determination. We predict
11 | the graph with nodes m_0, ..., m_n and with arcs (m_j, m_i) which correspond to
12 | antecedent decisions. In particular, for each j there exists exactly one i < j
13 | such that (m_j, m_i) is in the graph. Such a graph is called a *substructure*
14 | (for antecedent trees, substructures and structures coincide).
15 |
16 | To implement antecedent trees, this module contains a function that defines the
17 | search space for the graphs, and a decoder that computes the best-scoring tree
18 | of antecedent decisions, and the best-scoring tree of antecedent decisions
19 | consistent with the gold annotation (i.e. only having pairs of coreferent
20 | mentions as arcs).
21 |
22 | Reference:
23 |
24 | - Eraldo Fernandes, Cicero dos Santos, and Ruy Milidiu. 2014. Latent trees
25 | for coreference resolution. *Computational Linguistics*, 40(4):801-835.
26 | http://www.aclweb.org/anthology/J14-4004
27 | """
28 |
29 | from __future__ import division
30 |
31 |
32 | import array
33 |
34 |
35 | from cort.coreference import perceptrons
36 |
37 |
38 | __author__ = 'martscsn'
39 |
40 |
41 | def extract_substructures(doc):
42 |     """ Extract the search space for the antecedent tree model.
43 |
44 |     The antecedent tree model consists of computing the optimal antecedent
45 |     for each anaphor. These decisions are represented as edges in a tree of
46 |     anaphor-antecedent decisions. This function extracts the search space for
47 | the tree.
48 |
49 | The search space is represented as a nested list of mention pairs. The
50 | mention pairs are candidate arcs in the graph. The nested list contains
51 | only one list, since antecedent trees have only one substructure for
52 | each document.
53 |
54 | The list contains all potential (anaphor, antecedent) pairs in the
55 | following order: (m_1, m_0), (m_2, m_1), (m_2, m_0), (m_3, m_2), ...,
56 | where m_j is the jth mention in the document.
57 |
58 | Args:
59 | doc (CoNLLDocument): The document to extract substructures from.
60 |
61 | Returns:
62 | (list(list(Mention, Mention))): The nested list of mention pairs
63 | describing the search space for the substructures.
64 | """
65 | substructure = []
66 |
67 | # iterate over mentions
68 | for i, ana in enumerate(doc.system_mentions):
69 |
70 | # iterate in reversed order over candidate antecedents
71 | for ante in sorted(doc.system_mentions[:i], reverse=True):
72 | substructure.append((ana, ante))
73 |
74 | return [substructure]
75 |
76 |
77 | class AntecedentTreePerceptron(perceptrons.Perceptron):
78 | """ A perceptron for antecedent trees. """
79 | def argmax(self, substructure, arc_information):
80 | """ Decoder for antecedent trees.
81 |
82 | Compute highest-scoring antecedent tree and highest-scoring antecedent
83 | tree consistent with the gold annotation.
84 |
85 | Args:
86 | substructure (list((Mention, Mention))): The list of mention pairs
87 | which define the search space for one substructure. For mention
88 | ranking, this list contains all potential anaphor-antecedent
89 | pairs in the following order:
90 | (m_1, m_0), (m_2, m_1), (m_2, m_0), (m_3, m_2), ...
91 | arc_information (dict((Mention, Mention),
92 | ((array, array, array), list(int), bool)):
93 | A mapping of arcs (= mention pairs) to information about these
94 | arcs. The information consists of the features, the costs for
95 | the arc (for each label), and whether predicting the arc to be
96 | coreferent is consistent with the gold annotation). The features
97 | are divided in three arrays: the first array contains the non-
98 | numeric features, the second array the numeric features, and the
99 | third array the values for the numeric features. The features
100 | are represented as integers via feature hashing.
101 |
102 | Returns:
103 | A 7-tuple describing the highest-scoring antecedent tree, and the
104 | highest-scoring antecedent tree consistent with the gold
105 | annotation. The tuple consists of:
106 |
107 | - **best_arcs** (*list((Mention, Mention))*): the arcs
108 | constituting the highest-scoring antecedent tree,
109 | - **best_labels** (*list(str)*): empty, the antecedent tree
110 | approach does not employ any labels,
111 | - **best_scores** (*list(float)*): the scores of the
112 | arcs in the highest-scoring antecedent tree,
113 | - **best_cons_arcs** (*list((Mention, Mention))*): the arcs
114 | constituting the highest-scoring antecedent tree consistent
115 |               with the gold annotation,
116 |             - **best_cons_labels** (*list(str)*): empty, the antecedent
117 |               tree approach does not employ any labels,
118 | - **best_cons_scores** (*list(float)*): the scores of the
119 | arcs in the highest-scoring antecedent tree consistent with
120 | the gold annotation,
121 | - **is_consistent** (*bool*): whether the highest-scoring
122 | antecedent tree is consistent with the gold annotation.
123 | """
124 | if not substructure:
125 | return [], [], [], [], [], [], True
126 |
127 | number_mentions = len(substructure[0][0].document.system_mentions)
128 |
129 | arcs = []
130 | arcs_scores = []
131 | coref_arcs = []
132 | coref_arcs_scores = []
133 |
134 | is_consistent = True
135 |
136 | for ana_index in range(1, number_mentions):
137 |
138 | first_arc = ana_index*(ana_index-1)//2
139 | last_arc = first_arc + ana_index
140 |
141 | best, max_val, best_cons, max_cons, best_is_consistent = \
142 | self.find_best_arcs(substructure[first_arc:last_arc],
143 | arc_information)
144 |
145 | arcs.append(best)
146 | arcs_scores.append(max_val)
147 | coref_arcs.append(best_cons)
148 | coref_arcs_scores.append(max_cons)
149 |
150 | is_consistent &= best_is_consistent
151 |
152 | return (
153 | arcs,
154 | [],
155 | arcs_scores,
156 | coref_arcs,
157 | [],
158 | coref_arcs_scores,
159 | is_consistent
160 | )
161 |
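
The slicing in argmax relies on the pair ordering produced by
extract_substructures: anaphor m_j contributes exactly j candidate arcs, so
the arcs of m_1, ..., m_{j-1} occupy the first 1 + 2 + ... + (j-1) =
j(j-1)/2 positions. A small check of that arithmetic:

    # for ana_index = 3, m_1 and m_2 contribute 1 + 2 = 3 arcs, so the
    # arcs of m_3, namely (m_3, m_2), (m_3, m_1), (m_3, m_0), are the
    # slice [3:6]
    ana_index = 3
    first_arc = ana_index * (ana_index - 1) // 2
    last_arc = first_arc + ana_index
    assert (first_arc, last_arc) == (3, 6)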
--------------------------------------------------------------------------------
/cort/coreference/clusterer.py:
--------------------------------------------------------------------------------
1 | """ Extract coreference information from pairwise predictions."""
2 |
3 | __author__ = 'smartschat'
4 |
5 |
6 | def best_first(substructures, labels, scores, coref_labels):
7 | """ Extract coreference clusters from coreference predictions via best-first
8 | clustering.
9 |
10 | In particular, go through a list of anaphor-antecedent pairs, where
11 | pairs with the same anaphor are consecutive. Then, for each anaphor, the
12 | best-scoring antecedent is selected (this is also called best-first
13 | clustering). Ties are broken by position in the list: earlier items are
14 | preferred.
15 |
16 | Args:
17 | substructures (list(list((Mention, Mention)))): A list of substructures.
18 | For this clusterer, each substructure should contain only one
19 | (anaphor, antecedent) pair. If two substructures have the same
20 | anaphor, they should be consecutive.
21 | labels (list(list(str))): A list of arc labels. This list should
22 | have the same length as the list of substructures, and each inner
23 | list should contain only one element (as in ``substructures``).
24 | Each entry describes the label of an arc.
25 |         scores (list(list(float))): A list of arc scores. This list should
26 | have the same length as the list of substructures, and each inner
27 | list should contain only one element (as in ``substructures``).
28 | Each entry describes the score of an arc.
29 | coref_labels (set(str)): A list of labels that indicate that mentions
30 | connected via an arc that has one of these labels are coreferent.
31 |
32 |     Returns:
33 | A tuple containing two dicts. The components are
34 |
35 | - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
36 | mentions to entity identifiers.
37 | - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
38 | mentions to their antecedent.
39 | """
40 |
41 | anaphor = None
42 | best = None
43 | max_val = float('-inf')
44 |
45 | mention_entity_mapping = {}
46 | antecedent_mapping = {}
47 |
48 | for substructure, substructure_label, substructure_score in zip(
49 | substructures, labels, scores):
50 | # each substructure consists of one pair
51 | pair = substructure[0]
52 | label = substructure_label[0]
53 | score = substructure_score[0]
54 | current_anaphor, current_antecedent = pair
55 | if current_anaphor != anaphor:
56 | # change in anaphor: set coreference information based on
57 | # best-scoring antecedent
58 | if anaphor and best and not best.is_dummy():
59 | antecedent_mapping[anaphor] = best
60 | if best not in mention_entity_mapping:
61 | mention_entity_mapping[best] = \
62 | best.document.system_mentions.index(best)
63 |
64 | mention_entity_mapping[anaphor] = \
65 | mention_entity_mapping[best]
66 |
67 | best = None
68 | max_val = float('-inf')
69 |
70 | if score > max_val and label in coref_labels:
71 | max_val = score
72 | best = current_antecedent
73 |
74 | anaphor = current_anaphor
75 |
76 | if anaphor and best and not best.is_dummy():
77 | antecedent_mapping[anaphor] = best
78 | if best not in mention_entity_mapping:
79 | mention_entity_mapping[best] = \
80 | best.document.system_mentions.index(best)
81 |
82 | mention_entity_mapping[anaphor] = \
83 | mention_entity_mapping[best]
84 |
85 | return mention_entity_mapping, antecedent_mapping
86 |
87 |
88 | def all_ante(substructures, labels, scores, coref_labels):
89 | """ Extract coreference clusters from coreference predictions via transitive
90 | closure.
91 |
92 | In particular, go through all (anaphor, antecedent) pairs contained in
93 | ``substructures``, and obtain coreference clusters by transitive closure.
94 |
95 | Args:
96 | substructures (list(list((Mention, Mention)))): A list of substructures.
97 | labels (list(list(str))): Not used by this function.
98 |         scores (list(list(float))): Not used by this function.
99 | coref_labels (set(str)): Not used by this function.
100 |
101 |     Returns:
102 | A tuple containing two dicts. The components are
103 |
104 | - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
105 | mentions to entity identifiers.
106 | - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
107 | mentions to their antecedent.
108 | """
109 | mention_entity_mapping = {}
110 | antecedent_mapping = {}
111 |
112 | for substructure in substructures:
113 | for pair in substructure:
114 | anaphor, antecedent = pair
115 |
116 | # skip dummy antecedents
117 | if antecedent.is_dummy():
118 | continue
119 |
120 | antecedent_mapping[anaphor] = antecedent
121 |
122 | # antecedent is not in the mapping: we initialize a new coreference
123 | # chain
124 | if antecedent not in mention_entity_mapping:
125 | # chain id: index of antecedent in system mentions
126 | mention_entity_mapping[antecedent] = \
127 | antecedent.document.system_mentions.index(antecedent)
128 |
129 | # assign id based on antecedent
130 | mention_entity_mapping[anaphor] = \
131 | mention_entity_mapping[antecedent]
132 |
133 | return mention_entity_mapping, antecedent_mapping
134 |
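
The tie-breaking in best_first comes from the strict '>' comparison: a
later candidate replaces the current best only if it scores strictly
higher, so among equal scores the earliest pair wins. A stripped-down
sketch with plain tuples in place of Mention objects:

    candidates = [("ante_a", 0.7), ("ante_b", 0.7), ("ante_c", 0.5)]
    best, max_val = None, float("-inf")
    for antecedent, score in candidates:
        if score > max_val:  # strict: the first of equal scores is kept
            best, max_val = antecedent, score
    assert best == "ante_a"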
--------------------------------------------------------------------------------
/cort/coreference/cost_functions.py:
--------------------------------------------------------------------------------
1 | """ Cost functions used during learning of coreference predictors. """
2 |
3 | __author__ = 'martscsn'
4 |
5 |
6 | def cost_based_on_consistency(arc, label="+"):
7 | """ Assign cost to arcs based on consistency of decision and anaphoricity.
8 |
9 | An anaphor-antecedent decision is consistent if either
10 | (a) the mentions are coreferent, or
11 | (b) the antecedent is the dummy mention, and the anaphor does not have
12 | any preceding coreferent mention among all extracted mentions.
13 |
14 | Note that (b) also contains cases where the mention has an antecedent in the
15 | gold data, but we were unable to extract this antecedent due to errors in
16 | mention detection.
17 |
18 |     If the anaphor-antecedent decision represented by ``arc`` is consistent,
19 |     it gets cost 0. If the decision is not consistent, and the antecedent is
20 | the dummy mention, it gets cost 2. Otherwise, it gets cost 1.
21 |
22 | Args:
23 | arc ((Mention, Mention)): A pair of mentions.
24 | label (str): The label to predict for the arc. Defaults to '+'.
25 |
26 |     Returns:
27 | (int): The cost of predicting the arc.
28 | """
29 | ana, ante = arc
30 |
31 | consistent = ana.decision_is_consistent(ante)
32 |
33 | # false new
34 | if not consistent and ante.is_dummy():
35 | return 2
36 | # wrong link
37 | elif not consistent:
38 | return 1
39 | else:
40 | return 0
41 |
42 |
43 | def null_cost(arc, label="+"):
44 | """ Dummy cost function which always returns 0 (corresponding to not using
45 | a cost function at all).
46 |
47 | Args:
48 | arc ((Mention, Mention)): A pair of mentions.
49 | label (str): The label to predict for the arc. Defaults to '+'
50 |
51 |     Returns:
52 | 0
53 | """
54 | return 0
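
The three cost cases, illustrated with stand-in objects (the real
arguments are Mention instances):

    class StubMention:
        def __init__(self, consistent=True, dummy=False):
            self.consistent, self.dummy = consistent, dummy
        def decision_is_consistent(self, antecedent):
            return self.consistent
        def is_dummy(self):
            return self.dummy

    inconsistent_ana = StubMention(consistent=False)
    assert cost_based_on_consistency(
        (inconsistent_ana, StubMention(dummy=True))) == 2   # false new
    assert cost_based_on_consistency(
        (inconsistent_ana, StubMention())) == 1             # wrong link
    assert cost_based_on_consistency(
        (StubMention(), StubMention())) == 0                # consistent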
--------------------------------------------------------------------------------
/cort/coreference/experiments.py:
--------------------------------------------------------------------------------
1 | """ Manage learning from training data and making predictions on test data. """
2 |
3 |
4 | import logging
5 |
6 |
7 | __author__ = 'smartschat'
8 |
9 |
10 | def learn(training_corpus, instance_extractor, perceptron):
11 | """ Learn a model for coreference resolution from training data.
12 |
13 | In particular, apply an instance/feature extractor to a training corpus and
14 | employ a machine learning model to learn a weight vector from these
15 | instances.
16 |
17 | Args:
18 | training_corpus (Corpus): The corpus to learn from.
19 |         instance_extractor (InstanceExtractor): The instance extractor that
20 | defines the features and the structure of instances that are
21 | extracted during training.
22 | perceptron (Perceptron): A perceptron (including a decoder) that
23 | learns from the instances extracted by ``instance_extractor``.
24 |
25 | Returns:
26 | A tuple consisting of
27 | - **priors** (*dict(str,float)*): A prior weight for each label
28 | in the graphs representing the instances,
29 | - **weights** (*dict(str, array)*): A mapping of labels to weight
30 | vectors. For each label ``l``, ``weights[l]`` contains weights
31 | for each feature seen during training (for representing the
32 | features we employ *feature hashing*). If the graphs employed are
33 | not labeled, ``l`` is set to "+".
34 | """
35 | logging.info("Learning.")
36 |
37 | logging.info("\tExtracting instances and features.")
38 | substructures, arc_information = instance_extractor.extract(
39 | training_corpus)
40 |
41 | logging.info("\tFitting model parameters.")
42 |
43 | perceptron.fit(substructures, arc_information)
44 |
45 | return perceptron.get_model()
46 |
47 |
48 | def predict(testing_corpus,
49 | instance_extractor,
50 | perceptron,
51 | coref_extractor):
52 | """ According to a learned model, predict coreference information.
53 |
54 | Args:
55 | testing_corpus (Corpus): The corpus to predict coreference on.
56 |         instance_extractor (InstanceExtractor): The instance extractor that
57 | defines the features and the structure of instances that are
58 | extracted during testing.
59 | perceptron (Perceptron): A perceptron learned from training data.
62 | coref_extractor (function): An extractor for consolidating pairwise
63 | predictions into coreference clusters.
64 |
65 | Returns:
66 | A tuple containing two dicts. The components are
67 |
68 | - **mention_entity_mapping** (*dict(Mention, int)*): A mapping of
69 | mentions to entity identifiers.
70 | - **antecedent_mapping** (*dict(Mention, Mention)*): A mapping of
71 | mentions to their antecedent (as determined by the
72 | ``coref_extractor``).
73 | """
74 | logging.info("Predicting.")
75 |
76 | logging.info("\tRemoving coreference annotations from corpus.")
77 | for doc in testing_corpus:
78 | doc.antecedent_decisions = {}
79 | for mention in doc.system_mentions:
80 | mention.attributes["antecedent"] = None
81 | mention.attributes["set_id"] = None
82 |
83 | logging.info("\tExtracting instances and features.")
84 | substructures, arc_information = instance_extractor.extract(testing_corpus)
85 |
86 | logging.info("\tDoing predictions.")
87 | arcs, labels, scores = perceptron.predict(substructures, arc_information)
88 |
89 | logging.info("\tClustering results.")
90 |
91 | return coref_extractor(arcs, labels, scores, perceptron.get_coref_labels())
92 |
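
A sketch of the intended workflow; the construction of the corpus,
extractor, perceptron and coref_extractor is elided here (see the
bin/cort-train and bin/cort-predict-conll entry points), and only the two
calls below match the signatures in this module:

    model = learn(training_corpus, instance_extractor, perceptron)
    mention_entity_mapping, antecedent_mapping = predict(
        testing_corpus, instance_extractor, perceptron, coref_extractor)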
--------------------------------------------------------------------------------
/cort/coreference/multigraph/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/coreference/multigraph/decoders.py:
--------------------------------------------------------------------------------
1 | __author__ = 'smartschat'
2 |
3 |
4 | class MultigraphDecoder:
5 | def __init__(self, multigraph_creator):
6 | self.coref_multigraph_creator = multigraph_creator
7 |
8 | def decode(self, corpus):
9 | for doc in corpus:
10 | for mention in doc.system_mentions:
11 | mention.attributes["set_id"] = None
12 |
13 | # discard dummy mention
14 | self.decode_for_one_document(doc.system_mentions[1:])
15 |
16 | def decode_for_one_document(self, mentions):
17 | multigraph = \
18 | self.coref_multigraph_creator.construct_graph_from_mentions(
19 | mentions)
20 |
21 | for mention in mentions:
22 | antecedent = self.compute_antecedent(mention, multigraph)
23 |
24 | if antecedent is not None:
25 | if antecedent.attributes["set_id"] is None:
26 | antecedent.attributes["set_id"] = \
27 | mentions.index(antecedent)
28 |
29 | mention.attributes["set_id"] = antecedent.attributes["set_id"]
30 | mention.document.antecedent_decisions[mention.span] = \
31 | antecedent.span
32 |
33 | @staticmethod
34 | def compute_antecedent(mention, multigraph):
35 | weights = []
36 | for antecedent in multigraph.edges[mention]:
37 | if not multigraph.edges[mention][antecedent]["negative_relations"]:
38 | weights.append(
39 | (multigraph.get_weight(mention, antecedent), antecedent))
40 |
41 |         # get antecedent with highest positive weight, break ties by distance
42 |         if weights:
43 |             weight, antecedent = max(weights)
44 |             if weight > 0:
45 |                 return antecedent
46 |
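
Since the collected entries are (weight, antecedent) tuples, equal weights
fall back to comparing the antecedents themselves, which are ordered by
their spans; a plain-tuple sketch of that behavior:

    weights = [(0.5, (3, 4)), (0.8, (1, 2)), (0.8, (5, 6))]
    # max picks the highest weight; among equal weights the tuple
    # comparison prefers the later (i.e. closer) span
    assert max(weights) == (0.8, (5, 6))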
--------------------------------------------------------------------------------
/cort/coreference/multigraph/multigraphs.py:
--------------------------------------------------------------------------------
1 | __author__ = 'smartschat'
2 |
3 |
4 | class CorefMultigraphCreator:
5 | def __init__(self,
6 | positive_features,
7 | negative_features,
8 | weighting_function,
9 | relation_weights,
10 | construct_when_negative=False):
11 | self.positive_features = positive_features
12 | self.negative_features = negative_features
13 | self.weighting_function = weighting_function
14 | self.relation_weights = relation_weights
15 | self.construct_when_negative = construct_when_negative
16 |
17 | def construct_graph_from_mentions(self, mentions):
18 | nodes = []
19 | edges = {}
20 |
21 | for i in range(0, len(mentions)):
22 | anaphor = mentions[i]
23 |
24 | nodes.append(anaphor)
25 |
26 | edges[anaphor] = self.construct_for_one_mention(mentions, i)
27 |
28 | return CorefMultigraph(nodes,
29 | edges,
30 | self.weighting_function,
31 | self.relation_weights)
32 |
33 | def construct_for_one_mention(self, mentions, i):
34 | anaphor = mentions[i]
35 |
36 | edges = {}
37 |
38 | # do not include dummy mention
39 | for j in range(i-1, 0, -1):
40 | antecedent = mentions[j]
41 | if self.construct_when_negative:
42 | edges[antecedent] = self.get_edge_relations(anaphor, antecedent)
43 | else:
44 | if not self.has_negative(anaphor, antecedent):
45 | edges[antecedent] = {
46 | "negative_relations": [],
47 | "positive_relations": self.get_positive_relations(
48 | anaphor, antecedent)
49 | }
50 |
51 | return edges
52 |
53 | def get_edge_relations(self, anaphor, antecedent):
54 | relations = {
55 | "negative_relations":
56 | self.get_negative_relations(anaphor, antecedent),
57 | "positive_relations":
58 | self.get_positive_relations(anaphor, antecedent)
59 | }
60 |
61 | return relations
62 |
63 | def has_negative(self, anaphor, antecedent):
64 | for r in self.negative_features:
65 | if r(anaphor, antecedent):
66 | return True
67 |
68 | def get_negative_relations(self, anaphor, antecedent):
69 | negative_relations = []
70 |
71 | for r in self.negative_features:
72 | if r(anaphor, antecedent):
73 | negative_relations.append(r)
74 |
75 | return negative_relations
76 |
77 | def get_positive_relations(self, anaphor, antecedent):
78 | positive_relations = []
79 |
80 | for r in self.positive_features:
81 | if r(anaphor, antecedent):
82 | positive_relations.append(r)
83 |
84 | return positive_relations
85 |
86 |
87 | class CorefMultigraph:
88 | def __init__(self, nodes, edges, weighting_function, relation_weights):
89 | self.nodes = nodes
90 | self.edges = edges
91 | self.weighting_function = weighting_function
92 | self.relation_weights = relation_weights
93 |
94 | def get_weight(self, anaphor, antecedent):
95 | return self.weighting_function(
96 | anaphor,
97 | antecedent,
98 | self.edges[anaphor][antecedent],
99 | self.relation_weights)
100 |
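
A sketch of how the creator is parameterized: features are plain
predicates over (anaphor, antecedent), and relation_weights maps each
predicate to a float (the toy predicate below is illustrative, not one of
the real features from multigraph/features.py):

    from cort.coreference.multigraph import weighting_functions

    def same_head(anaphor, antecedent):
        return (anaphor.attributes["head"] ==
                antecedent.attributes["head"])

    creator = CorefMultigraphCreator(
        positive_features=[same_head],
        negative_features=[],
        weighting_function=weighting_functions.for_each_relation_with_distance,
        relation_weights={same_head: 1.0})
    # graph = creator.construct_graph_from_mentions(mentions)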
--------------------------------------------------------------------------------
/cort/coreference/multigraph/weighting_functions.py:
--------------------------------------------------------------------------------
1 | __author__ = 'smartschat'
2 |
3 |
4 | def for_each_relation_with_distance(anaphor,
5 | antecedent,
6 | relations,
7 | relation_weights):
8 | weight = 0.0
9 |
10 | if len(relations["negative_relations"]) > 0:
11 | return float("-inf")
12 |
13 | if len(relations["positive_relations"]) == 0:
14 | return 0
15 |
16 | for relation in relations["positive_relations"]:
17 | weight += relation_weights[relation]
18 |
19 | weight /= (anaphor.attributes["sentence_id"] -
20 | antecedent.attributes["sentence_id"]
21 | + 1)
22 |
23 | return weight
24 |
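
Worked example: negative relations force the weight to -inf; otherwise the
positive relation weights are summed and divided by the sentence distance
plus one, e.g. weights 1.0 and 0.5 across two sentences give
(1.0 + 0.5) / (2 + 1) = 0.5:

    relation_a, relation_b = object(), object()

    class StubMention:  # stand-in carrying only a sentence_id
        def __init__(self, sentence_id):
            self.attributes = {"sentence_id": sentence_id}

    weight = for_each_relation_with_distance(
        StubMention(5), StubMention(3),
        {"negative_relations": [],
         "positive_relations": [relation_a, relation_b]},
        {relation_a: 1.0, relation_b: 0.5})
    assert weight == 0.5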
--------------------------------------------------------------------------------
/cort/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/preprocessing/pipeline.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
3 | import cort
4 |
5 | import codecs
6 |
7 | import stanford_corenlp_pywrapper
8 |
9 | from StanfordDependencies import CoNLL
10 |
11 | from cort.core import corpora, documents, spans
12 |
13 | import bs4
14 |
15 |
16 | class Pipeline:
17 | def __init__(self, corenlp_location, with_coref=False):
18 | package_dir = cort.__path__[0]
19 |
20 | if with_coref:
21 | self.proc = stanford_corenlp_pywrapper.CoreNLP(
22 | configfile=package_dir + "/config_files/corenlp_with_coref.ini",
23 | corenlp_jars=[corenlp_location + "/*"]
24 | )
25 | else:
26 | self.proc = stanford_corenlp_pywrapper.CoreNLP(
27 | configfile=package_dir + "/config_files/corenlp.ini",
28 | corenlp_jars=[corenlp_location + "/*"]
29 | )
30 |
31 | self.with_coref = with_coref
32 |
33 | def run_on_docs(self, identifier, docs):
34 | processed_documents = []
35 |
36 | for doc in docs:
37 | processed_documents.append(self.run_on_doc(
38 | codecs.open(doc, "r", "utf-8")
39 | ))
40 |
41 | return corpora.Corpus(identifier, processed_documents)
42 |
43 | def run_on_doc(self, doc_file, name=None):
44 | if self.with_coref:
45 | soup = bs4.BeautifulSoup(doc_file.read())
46 | preprocessed = self.proc.parse_doc(soup.text)
47 | else:
48 | data = doc_file.read()
49 | preprocessed = self.proc.parse_doc(data)
50 |
51 | sentences = []
52 |
53 | for sentence in preprocessed["sentences"]:
54 | processed_ner = []
55 | for ner in sentence["ner"]:
56 | if ner == "O" or ner == "MISC":
57 | processed_ner.append("NONE")
58 | else:
59 | processed_ner.append(ner)
60 |
61 | processed_dep = []
62 |
63 | index_to_dep_info = {}
64 | for dep_info in sentence["deps_basic"]:
65 | label, head, in_sent_index = dep_info
66 | index_to_dep_info[in_sent_index] = label, head
67 |
68 | for i in range(0, len(sentence["tokens"])):
69 |             if i in index_to_dep_info:
70 | label, head = index_to_dep_info[i]
71 | processed_dep.append(
72 | CoNLL.Token(
73 | form=sentence["tokens"][i],
74 | lemma=sentence["lemmas"][i],
75 | pos=sentence["pos"][i],
76 | index=i+1,
77 | head=head+1,
78 | deprel=label,
79 | cpos=None,
80 | feats=None,
81 | phead=None,
82 | pdeprel=None,
83 | extra=None
84 | )
85 | )
86 | else:
87 | processed_dep.append(
88 | CoNLL.Token(
89 | form=sentence["tokens"][i],
90 | lemma=sentence["lemmas"][i],
91 | pos=sentence["pos"][i],
92 | index=i+1,
93 | head=0,
94 | deprel="punc",
95 | cpos=None,
96 | feats=None,
97 | phead=None,
98 | pdeprel=None,
99 | extra=None
100 | )
101 | )
102 |
103 | sentences.append(
104 | (sentence["tokens"],
105 | sentence["pos"],
106 | processed_ner,
107 | ["-"]*len(sentence["tokens"]),
108 | sentence["parse"],
109 | processed_dep,
110 | )
111 | )
112 |
113 | if not name:
114 | name = doc_file.name
115 |
116 | if self.with_coref:
117 | antecedent_decisions = {}
118 | coref = {}
119 |
120 | mention_id_to_spans = {}
121 |
122 | max_entity = 0
123 |
124 | for mention in soup.findAll("mention"):
125 | if mention.get("entity"):
126 | max_entity = max(max_entity, int(mention.get("entity")))
127 |
128 | for mention in soup.findAll("mention"):
129 | mention_id = int(mention.get("id"))
130 |
131 | span = spans.Span(int(mention.get("span_start")),
132 | int(mention.get("span_end")))
133 |
134 | mention_id_to_spans[mention_id] = span
135 |
136 | if mention.get("entity"):
137 | annotated_set_id = int(mention.get("entity"))
138 | else:
139 | annotated_set_id = max_entity + 1 + mention_id
140 |
141 | coref[span] = annotated_set_id
142 |
143 | if mention.get("antecedent"):
144 | antecedent_decisions[span] = mention_id_to_spans[
145 | int(mention.get("antecedent"))
146 | ]
147 |
148 | doc = documents.Document(
149 | name,
150 | sentences,
151 | coref)
152 |
153 | spans_to_annotated_mentions = {}
154 |
155 | for mention in doc.annotated_mentions:
156 | spans_to_annotated_mentions[mention.span] = mention
157 |
158 | for span in antecedent_decisions:
159 | ante_span = antecedent_decisions[span]
160 | ana = spans_to_annotated_mentions[span]
161 | ante = spans_to_annotated_mentions[ante_span]
162 | ana.attributes["antecedent"] = ante
163 | else:
164 | doc = documents.Document(
165 | name,
166 | sentences,
167 | {})
168 |
169 | return doc
170 |
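
A usage sketch (the CoreNLP path and file names are placeholders; requires
stanford_corenlp_pywrapper and a local CoreNLP installation):

    pipeline = Pipeline("/path/to/corenlp")
    corpus = pipeline.run_on_docs("my-corpus", ["doc1.txt", "doc2.txt"])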
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/README.txt:
--------------------------------------------------------------------------------
1 | NAME
2 | CorScorer: Perl package for scoring coreference resolution systems
3 | using different metrics.
4 |
5 |
6 | VERSION
7 | v8.01 -- reference implementations of MUC, B-cubed, CEAF and BLANC metrics.
8 |
9 |
10 | CHANGES SINCE v8.0
11 | - fixed a bug that crashed the BLANC scorer when a duplicate singleton
12 | mention was present in the response.
13 |
14 | INSTALLATION
15 | Requirements:
16 | 1. Perl: downloadable from http://perl.org
17 | 2. Algorithm-Munkres: included in this package and downloadable
18 | from CPAN http://search.cpan.org/~tpederse/Algorithm-Munkres-0.08
19 |
20 | USE
21 | This package is distributed with two scripts to execute the scorer from
22 | the command line.
23 |
24 | Windows (tm): scorer.bat
25 | Linux: scorer.pl
26 |
27 |
28 | SYNOPSIS
29 | use CorScorer;
30 |
31 | $metric = 'ceafm';
32 |
33 | # Scores the whole dataset
34 | &CorScorer::Score($metric, $keys_file, $response_file);
35 |
36 | # Scores one file
37 | &CorScorer::Score($metric, $keys_file, $response_file, $name);
38 |
39 |
40 | INPUT
41 | metric: the metric desired to score the results:
42 | muc: MUCScorer (Vilain et al, 1995)
43 | bcub: B-Cubed (Bagga and Baldwin, 1998)
44 | ceafm: CEAF (Luo et al., 2005) using mention-based similarity
45 | ceafe: CEAF (Luo et al., 2005) using entity-based similarity
46 | blanc: BLANC (Luo et al., 2014) BLANC metric for gold and predicted mentions
47 | all: uses all the metrics to score
48 |
49 | keys_file: file with expected coreference chains in CoNLL-2011/2012 format
50 |
51 | response_file: file with output of coreference system (CoNLL-2011/2012 format)
52 |
53 | name: [optional] the name of the document to score. If name is not
54 | given, all the documents in the dataset will be scored. If given
55 | name is "none" then all the documents are scored but only total
56 | results are shown.
57 |
58 |
59 | OUTPUT
60 | The score subroutine returns an array with four values in this order:
61 | 1) Recall numerator
62 | 2) Recall denominator
63 | 3) Precision numerator
64 | 4) Precision denominator
65 |
66 | Also recall, precision and F1 are printed to standard output when the
67 | variable $VERBOSE is not null.
68 |
69 | Final scores:
70 | Recall = recall_numerator / recall_denominator
71 | Precision = precision_numerator / precision_denominator
72 | F1 = 2 * Recall * Precision / (Recall + Precision)
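   For example, counts of (3, 4, 3, 5) give Recall = 3/4 = 0.75,
   Precision = 3/5 = 0.60 and F1 = 2*0.75*0.60/(0.75+0.60) = 0.67 (rounded).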
73 |
74 | Identification of mentions
75 |    A scorer for identification of mentions (recall, precision and F1) is also included.
76 | Mentions from system response are compared with key mentions. This version performs
77 | strict mention matching as was used in the CoNLL-2011 and 2012 shared tasks.
78 |
79 | AUTHORS
80 | Emili Sapena, Universitat Politècnica de Catalunya, http://www.lsi.upc.edu/~esapena, esapena lsi.upc.edu
81 | Sameer Pradhan, sameer.pradhan childrens.harvard.edu
82 | Sebastian Martschat, sebastian.martschat h-its.org
83 | Xiaoqiang Luo, xql google.com
84 |
85 | COPYRIGHT AND LICENSE
86 | Copyright (C) 2009-2011, Emili Sapena esapena lsi.upc.edu
87 | 2011-2014, Sameer Pradhan sameer.pradhan childrens.harvard.edu
88 |
89 | This program is free software; you can redistribute it and/or modify it
90 | under the terms of the GNU General Public License as published by the
91 | Free Software Foundation; either version 2 of the License, or (at your
92 | option) any later version. This program is distributed in the hope that
93 | it will be useful, but WITHOUT ANY WARRANTY; without even the implied
94 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
95 | GNU General Public License for more details.
96 |
97 | You should have received a copy of the GNU General Public License along
98 | with this program; if not, write to the Free Software Foundation, Inc.,
99 | 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
100 |
101 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/lib/Algorithm/README.Munkres:
--------------------------------------------------------------------------------
1 | NAME
2 | Algorithm-Munkres : Perl extension for Munkres' solution to
3 | classical Assignment problem for square and rectangular matrices
4 | This module extends the solution of Assignment problem for square
5 | matrices to rectangular matrices by padding zeros. Thus a rectangular
6 | matrix is converted to square matrix by padding necessary zeros.
7 |
8 | SYNOPSIS
9 | use Algorithm::Munkres;
10 |
11 | @mat = (
12 | [2, 4, 7, 9],
13 | [3, 9, 5, 1],
14 | [8, 2, 9, 7],
15 | );
16 |
17 | assign(\@mat,\@out_mat);
18 |
19 | Then the @out_mat array will have the output as: (0,3,1,2),
20 | where
21 |     0th element indicates that 0th row is assigned 0th column, i.e. value=2
22 |     1st element indicates that 1st row is assigned 3rd column, i.e. value=1
23 |     2nd element indicates that 2nd row is assigned 1st column, i.e. value=2
24 |     3rd element indicates that 3rd row is assigned 2nd column, i.e. value=0
25 |
26 | DESCRIPTION
27 | Assignment Problem: Given N jobs, N workers and the time taken by
28 | each worker to complete a job then how should the assignment of a
29 | Worker to a Job be done, so as to minimize the time taken.
30 |
31 | Thus if we have 3 jobs p,q,r and 3 workers x,y,z such that:
32 | x y z
33 | p 2 4 7
34 | q 3 9 5
35 | r 8 2 9
36 |
37 | where the cell values of the above matrix give the time required
38 | for the worker(given by column name) to complete the job(given by
39 | the row name)
40 |
41 | then possible solutions are:
42 | Total
43 | 1. 2, 9, 9 20
44 | 2. 2, 2, 5 9
45 | 3. 3, 4, 9 16
46 | 4. 3, 2, 7 12
47 | 5. 8, 9, 7 24
48 | 6. 8, 4, 5 17
49 |
50 | Thus (2) is the optimal solution for the above problem.
51 |     This kind of brute-force approach to solving the Assignment problem
52 |     quickly becomes slow and bulky as N grows, because the number of
53 |     possible solutions is N! and thus the task is to evaluate each
54 |     and then find the optimal solution. (If N=10, the number of possible
55 |     solutions is 3628800!)
56 | Munkres' gives us a solution to this problem, which is implemented
57 | in this module.
58 |
59 | This module also solves Assignment problem for rectangular matrices
60 | (M x N) by converting them to square matrices by padding zeros. ex:
61 | If input matrix is:
62 | [2, 4, 7, 9],
63 | [3, 9, 5, 1],
64 | [8, 2, 9, 7]
65 | i.e 3 x 4 then we will convert it to 4 x 4 and the modified input
66 | matrix will be:
67 | [2, 4, 7, 9],
68 | [3, 9, 5, 1],
69 | [8, 2, 9, 7],
70 | [0, 0, 0, 0]
71 |
72 | EXPORT
73 | "assign" function by default.
74 |
75 | INPUT
76 | The input matrix should be in a two dimensional array(array of
77 | array) and the 'assign' subroutine expects a reference to this
78 | array and not the complete array.
79 | eg:assign(\@inp_mat, \@out_mat);
80 | The second argument to the assign subroutine is the reference
81 | to the output array.
82 |
83 | OUTPUT
84 | The assign subroutine expects references to two arrays as its
85 | input paramenters. The second parameter is the reference to the
86 | output array. This array is populated by assign subroutine. This
87 | array is single dimensional Nx1 matrix.
88 | For above example the output array returned will be:
89 | (0,
90 | 2,
91 | 1)
92 |
93 | where
94 |     0th element indicates that 0th row is assigned 0th column, i.e. value=2
95 |     1st element indicates that 1st row is assigned 2nd column, i.e. value=5
96 |     2nd element indicates that 2nd row is assigned 1st column, i.e. value=2
97 |
98 | SEE ALSO
99 | 1. http://216.249.163.93/bob.pilgrim/445/munkres.html
100 |
101 | 2. Munkres, J. Algorithms for the assignment and transportation
102 | Problems. J. Siam 5 (Mar. 1957), 32-38
103 |
104 | 3. François Bourgeois and Jean-Claude Lassalle. 1971.
105 | An extension of the Munkres algorithm for the assignment
106 | problem to rectangular matrices.
107 | Communication ACM, 14(12):802-804
108 |
109 | AUTHOR
110 | Anagha Kulkarni, University of Minnesota Duluth
111 | kulka020 d.umn.edu
112 |
113 | Ted Pedersen, University of Minnesota Duluth
114 | tpederse d.umn.edu
115 |
116 | COPYRIGHT AND LICENSE
117 | Copyright (C) 2007-2008, Ted Pedersen and Anagha Kulkarni
118 |
119 | This program is free software; you can redistribute it and/or modify it
120 | under the terms of the GNU General Public License as published by the
121 | Free Software Foundation; either version 2 of the License, or (at your
122 | option) any later version. This program is distributed in the hope that
123 | it will be useful, but WITHOUT ANY WARRANTY; without even the implied
124 | warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
125 | GNU General Public License for more details.
126 |
127 | You should have received a copy of the GNU General Public License along
128 | with this program; if not, write to the Free Software Foundation, Inc.,
129 | 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
130 |
131 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/scorer.bat:
--------------------------------------------------------------------------------
1 | @rem = '--*-Perl-*--
2 | @echo off
3 | if "%OS%" == "Windows_NT" goto WinNT
4 | perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
5 | goto endofperl
6 | :WinNT
7 | perl -x -S %0 %*
8 | if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
9 | if %errorlevel% == 9009 echo You do not have Perl in your PATH.
10 | if errorlevel 1 goto script_failed_so_exit_with_non_zero_val 2>nul
11 | goto endofperl
12 | @rem ';
13 | #!perl
14 | #line 15
15 |
16 | BEGIN {
17 | $d = $0;
18 | $d =~ s/\/[^\/][^\/]*$//g;
19 | push(@INC, $d."/lib");
20 | }
21 |
22 | use strict;
23 | use CorScorer;
24 |
25 | if (@ARGV < 3) {
26 | print q|
27 |   use: scorer.bat <metric> <keys_file> <response_file> [name]
28 |
29 | metric: the metric desired to score the results:
30 | muc: MUCScorer (Vilain et al, 1995)
31 | bcub: B-Cubed (Bagga and Baldwin, 1998)
32 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity
33 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity
34 | all: uses all the metrics to score
35 |
36 | keys_file: file with expected coreference chains in SemEval format
37 |
38 | response_file: file with output of coreference system (SemEval format)
39 |
40 | name: [optional] the name of the document to score. If name is not
41 | given, all the documents in the dataset will be scored. If given
42 | name is "none" then all the documents are scored but only total
43 | results are shown.
44 |
45 | |;
46 | exit;
47 | }
48 |
49 | my $metric = shift (@ARGV);
50 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|all)/i) {
51 | print "Invalid metric\n";
52 | exit;
53 | }
54 |
55 |
56 | if ($metric eq 'all') {
57 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe') {
58 | print "\nMETRIC $m:\n";
59 | &CorScorer::Score( $m, @ARGV );
60 | }
61 | }
62 | else {
63 | &CorScorer::Score( $metric, @ARGV );
64 | }
65 |
66 | __END__
67 | :endofperl
68 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/scorer.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | BEGIN {
4 | $d = $0;
5 | $d =~ s/\/[^\/][^\/]*$//g;
6 |
7 | if ($d eq $0) {
8 | unshift(@INC, "lib");
9 | }
10 | else {
11 | unshift(@INC, $d . "/lib");
12 | }
13 | }
14 |
15 | use strict;
16 | use CorScorer;
17 |
18 | if (@ARGV < 3) {
19 | print q|
20 |   use: scorer.pl <metric> <keys_file> <response_file> [name]
21 |
22 | metric: the metric desired to score the results:
23 | muc: MUCScorer (Vilain et al, 1995)
24 | bcub: B-Cubed (Bagga and Baldwin, 1998)
25 | ceafm: CEAF (Luo et al, 2005) using mention-based similarity
26 | ceafe: CEAF (Luo et al, 2005) using entity-based similarity
27 | blanc: BLANC
28 | all: uses all the metrics to score
29 |
30 | keys_file: file with expected coreference chains in SemEval format
31 |
32 | response_file: file with output of coreference system (SemEval format)
33 |
34 | name: [optional] the name of the document to score. If name is not
35 | given, all the documents in the dataset will be scored. If given
36 | name is "none" then all the documents are scored but only total
37 | results are shown.
38 |
39 | |;
40 | exit;
41 | }
42 |
43 | my $metric = shift(@ARGV);
44 | if ($metric !~ /^(muc|bcub|ceafm|ceafe|blanc|all)/i) {
45 | print "Invalid metric\n";
46 | exit;
47 | }
48 |
49 | if ($metric eq 'all') {
50 | foreach my $m ('muc', 'bcub', 'ceafm', 'ceafe', 'blanc') {
51 | print "\nMETRIC $m:\n";
52 | &CorScorer::Score($m, @ARGV);
53 | }
54 | }
55 | else {
56 | &CorScorer::Score($metric, @ARGV);
57 | }
58 |
59 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/CorefMetricTest.pm:
--------------------------------------------------------------------------------
1 | package CorefMetricTest;
2 | use strict;
3 | use warnings;
4 | use Exporter;
5 |
6 | our @ISA= qw(Exporter);
7 | our @EXPORT = qw(ComputeScoreFromCounts DiffExpectedAndActual);
8 |
9 | ################################################################################
10 | # Compute recall, precision and F1.
11 | #
12 | # Input: (numerator_counts_for_recall, denominator_counts_for_recall,
13 | # numerator_counts_for_precision, denominator_counts_for_precision)
14 | # Output: (recall, precision, F1)
15 | ################################################################################
16 | sub ComputeScoreFromCounts {
17 | # The first 4 are also coref link counts when using BLANC.
18 | my ($recall_numerator, $recall_denominator,
19 | $precision_numerator, $precision_denominator, @noncoref_counts) = @_;
20 | # The coref recall, precision, and F1 when using BLANC.
21 | my ($recall, $precision, $F1) =
22 | RPFFromCounts($recall_numerator, $recall_denominator,
23 | $precision_numerator, $precision_denominator);
24 |
25 | # BLANC: @noncoref_counts=
26 | # (noncoref_numerator_recall, noncoref_denominator_recall,
27 | # noncoref_numerator_precision, noncoref_denominator_precision)
28 | if (scalar(@noncoref_counts) == 4) {
29 | ($recall, $precision, $F1) = CorScorer::ComputeBLANCFromCounts(
30 | $recall_numerator, $recall_denominator, $precision_denominator,
31 | $noncoref_counts[0], $noncoref_counts[1], $noncoref_counts[3]);
32 | }
33 | $recall = ($recall < 0) ? 0 : $recall;
34 | $precision = ($precision < 0) ? 0 : $precision;
35 | $F1 = ($F1 < 0) ? 0 : $F1;
36 | return ($recall, $precision, $F1);
37 | }
38 |
39 | sub RPFFromCounts
40 | {
41 | my ($recall_numerator, $recall_denominator,
42 | $precision_numerator, $precision_denominator, @nonCorefCounts) = @_;
43 | my ($recall, $precision, $F1) = (-1, -1, 0);
44 | if ($recall_denominator > 0) {
45 | $recall = $recall_numerator / $recall_denominator;
46 | }
47 | if ($precision_denominator > 0) {
48 | $precision = $precision_numerator / $precision_denominator;
49 | }
50 |
51 | if (($recall + $precision) > 0) {
52 | $F1 = 2 * $recall * $precision / ($recall + $precision);
53 | }
54 |
55 | return ($recall, $precision, $F1);
56 | }
57 |
58 | # deprecated -- see CorScorer::ComputeBLANCFromCounts().
59 | sub ComputeBLANCRPF
60 | {
61 | my ($coref_recall, $coref_precision, $coref_F1,
62 | $noncoref_recall, $noncoref_precision, $noncoref_F1) = @_;
63 |
64 | my ($recall, $precision, $F1);
65 |
66 | if ($coref_recall < 0 && $noncoref_recall < 0) {
67 | # no key mention.
68 | $recall = $precision = $F1 = 0;
69 | } elsif ($coref_recall < 0) {
70 |     # key: all links are non-coref (mentions are all singletons).
71 | $recall = $noncoref_recall;
72 | $precision = ($noncoref_precision < 0) ? 0 : $noncoref_precision;
73 | $F1 = $noncoref_F1;
74 | } elsif ($noncoref_recall < 0) {
75 | # key: all links are coref (all mentions are in one entity).
76 | $recall = $coref_recall;
77 | $precision = ($coref_precision < 0) ? 0 : $coref_precision;
78 | $F1 = $coref_F1;
79 | } else {
80 | #key contains both coref and non-coref links.
81 | if ($coref_precision < 0 && $noncoref_precision < 0) {
82 | # no response.
83 | $recall = $precision = $F1 = 0;
84 | } else {
85 | if ($coref_precision < 0) {
86 | # response: all links are non-coref, or response mentions are all
87 | # singletons.
88 | $coref_precision = 0;
89 | } elsif ($noncoref_precision < 0) {
90 | # response: all links are coref, or all mentions are in one entity.
91 | $noncoref_precision = 0;
92 | }
93 | $recall = ($coref_recall + $noncoref_recall)/2;
94 | $precision = ($coref_precision + $noncoref_precision)/2;
95 | $F1 = ($coref_F1 + $noncoref_F1)/2;
96 | }
97 | }
98 |
99 | return ($recall, $precision, $F1);
100 | }
101 |
102 | ##############################################################################
103 | # Compute the sum of the differences between the expected recall, precision,
104 | # F1 and the actual ones.
105 | ##############################################################################
106 | sub DiffExpectedAndActual {
107 | my ($expected, $actual) = @_;
108 | if (scalar(@$expected) != scalar(@$actual)) {
109 | print STDERR "Expected and actual have diff dimensions: \n";
110 | print STDERR " Expected: ", join(" ", @$expected), "\n";
111 | print STDERR " Actual: ", join(" ", @$actual), "\n";
112 | return 1.0e5;
113 | }
114 | my $sum = 0.0;
115 | my $i = 0;
116 | foreach my $e (@$expected) {
117 | $sum += abs($e - $actual->[$i]);
118 | ++$i;
119 | }
120 | return $sum;
121 | }
122 |
123 | 1;
124 |
125 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-1.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 jnk -
17 | test2 0 5 e (2)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (2
20 | test2 0 8 f2 -
21 | test2 0 9 f3 2)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-10.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 x -
14 | test2 0 2 d1 (3
15 | test2 0 3 d2 3)
16 | test2 0 4 z -
17 | test2 0 5 e (4)
18 | test2 0 6 y -
19 | test2 0 7 f1 (5
20 | test2 0 8 f2 -
21 | test2 0 9 f3 5)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-11.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 x -
14 | test2 0 2 d1 (0
15 | test2 0 3 d2 0)
16 | test2 0 4 z -
17 | test2 0 5 e (0)
18 | test2 0 6 y -
19 | test2 0 7 f1 (0
20 | test2 0 8 f2 -
21 | test2 0 9 f3 0)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-12.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 1)
7 | test1 0 5 b3 -
8 | test1 0 6 b4 -
9 | test1 0 7 jnk (2)
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (3)
13 | test2 0 1 x -
14 | test2 0 2 d1 (4
15 | test2 0 3 d2 4)
16 | test2 0 4 z -
17 | test2 0 5 e (5)
18 | test2 0 6 y -
19 | test2 0 7 f1 (6)
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-13.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 0)
7 | test1 0 5 b3 -
8 | test1 0 6 b4 -
9 | test1 0 7 jnk (0)
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 x -
14 | test2 0 2 d1 (0
15 | test2 0 3 d2 0)
16 | test2 0 4 z -
17 | test2 0 5 e (0)
18 | test2 0 6 y -
19 | test2 0 7 f1 (0)
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-2.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 -
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 -
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c -
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 jnk -
17 | test2 0 5 e (2)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-3.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 y (2)
17 | test2 0 5 e (2)
18 | test2 0 6 z (3)
19 | test2 0 7 f1 (2
20 | test2 0 8 f2 -
21 | test2 0 9 f3 2)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-4.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 x (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-5.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 (1
7 | test1 0 5 b3 1)
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 z (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-6.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 (3
7 | test1 0 5 b3 3)
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 z (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-7.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1(1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 z (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-8.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1(3
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 3)1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 z (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A-9.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1(3(3(3(3(3(3(3(3(3(3
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 3)3)3)3)3)3)3)3)3)3)1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 x (1)
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 z (3)
17 | test2 0 5 e -
18 | test2 0 6 y (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-A.key:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (2
15 | test2 0 3 d2 2)
16 | test2 0 4 jnk -
17 | test2 0 5 e (2)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (2
20 | test2 0 8 f2 -
21 | test2 0 9 f3 2)
22 | test2 0 10 . -
23 | #end document
24 |
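The final column of these TC-A fixtures uses the CoNLL-2012 coreference notation: "(0" opens a mention of entity 0, "0)" closes it, "(0)" is a single-token mention, "-" carries no annotation, and adjacent brackets such as "(1(1" stack several mentions on one token (TC-A-7 through TC-A-9 deliberately stress nested and duplicated brackets). A minimal sketch of a reader for this one-column variant, assuming numeric entity ids; parse_coref_column is a hypothetical helper, not the scorer's own parser:

import re

def parse_coref_column(tags):
    # Returns (entity_id, start, end) token spans from a list of
    # coref tags, one tag per token. "|" separates stacked mentions.
    open_spans = {}                       # entity id -> stack of starts
    spans = []
    for i, tag in enumerate(tags):
        if tag == "-":
            continue
        for part in tag.split("|"):
            for eid in re.findall(r"\((\d+)", part):
                open_spans.setdefault(eid, []).append(i)
            for eid in re.findall(r"(\d+)\)", part):
                spans.append((eid, open_spans[eid].pop(), i))
    return spans

# parse_coref_column(["(0", "0)", "-", "(1", "-", "-", "1)"])
# -> [("0", 0, 1), ("1", 3, 6)]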
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-B-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 -
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 -
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 -
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 | nw/xinhua/00/chtb_0009 -
31 | nw/xinhua/00/chtb_0009 (10043
32 | nw/xinhua/00/chtb_0009 -
33 | nw/xinhua/00/chtb_0009 10043)
34 | nw/xinhua/00/chtb_0009 -
35 | nw/xinhua/00/chtb_0009 -
36 | nw/xinhua/00/chtb_0009 -
37 | nw/xinhua/00/chtb_0009 -
38 | nw/xinhua/00/chtb_0009 -
39 | nw/xinhua/00/chtb_0009 -
40 | nw/xinhua/00/chtb_0009 -
41 | nw/xinhua/00/chtb_0009 -
42 | nw/xinhua/00/chtb_0009 -
43 | nw/xinhua/00/chtb_0009 -
44 | nw/xinhua/00/chtb_0009 -
45 | nw/xinhua/00/chtb_0009 -
46 | nw/xinhua/00/chtb_0009 -
47 | nw/xinhua/00/chtb_0009 -
48 | nw/xinhua/00/chtb_0009 -
49 | nw/xinhua/00/chtb_0009 (10043)
50 | nw/xinhua/00/chtb_0009 -
51 | nw/xinhua/00/chtb_0009 -
52 | nw/xinhua/00/chtb_0009 -
53 | nw/xinhua/00/chtb_0009 -
54 | nw/xinhua/00/chtb_0009 -
55 | nw/xinhua/00/chtb_0009 -
56 | nw/xinhua/00/chtb_0009 (10043
57 | nw/xinhua/00/chtb_0009 -
58 | nw/xinhua/00/chtb_0009 -
59 | nw/xinhua/00/chtb_0009 -
60 | nw/xinhua/00/chtb_0009 10043)
61 | nw/xinhua/00/chtb_0009 -
62 | nw/xinhua/00/chtb_0009 -
63 | nw/xinhua/00/chtb_0009 -
64 | nw/xinhua/00/chtb_0009 (10054
65 | nw/xinhua/00/chtb_0009 10054)
66 | nw/xinhua/00/chtb_0009 -
67 | nw/xinhua/00/chtb_0009 -
68 | nw/xinhua/00/chtb_0009 (10054)
69 | nw/xinhua/00/chtb_0009 -
70 | nw/xinhua/00/chtb_0009 -
71 | nw/xinhua/00/chtb_0009 -
72 | nw/xinhua/00/chtb_0009 -
73 |
74 | #end document
75 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-B.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (10043
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 -
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 -
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 10043)
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 | nw/xinhua/00/chtb_0009 -
31 | nw/xinhua/00/chtb_0009 (10054
32 | nw/xinhua/00/chtb_0009 -
33 | nw/xinhua/00/chtb_0009 10054)
34 | nw/xinhua/00/chtb_0009 -
35 | nw/xinhua/00/chtb_0009 -
36 | nw/xinhua/00/chtb_0009 -
37 | nw/xinhua/00/chtb_0009 -
38 | nw/xinhua/00/chtb_0009 -
39 | nw/xinhua/00/chtb_0009 -
40 | nw/xinhua/00/chtb_0009 -
41 | nw/xinhua/00/chtb_0009 -
42 | nw/xinhua/00/chtb_0009 -
43 | nw/xinhua/00/chtb_0009 -
44 | nw/xinhua/00/chtb_0009 -
45 | nw/xinhua/00/chtb_0009 -
46 | nw/xinhua/00/chtb_0009 -
47 | nw/xinhua/00/chtb_0009 -
48 | nw/xinhua/00/chtb_0009 -
49 | nw/xinhua/00/chtb_0009 (10043)
50 | nw/xinhua/00/chtb_0009 -
51 | nw/xinhua/00/chtb_0009 -
52 | nw/xinhua/00/chtb_0009 -
53 | nw/xinhua/00/chtb_0009 -
54 | nw/xinhua/00/chtb_0009 -
55 | nw/xinhua/00/chtb_0009 -
56 | nw/xinhua/00/chtb_0009 -
57 | nw/xinhua/00/chtb_0009 -
58 | nw/xinhua/00/chtb_0009 -
59 | nw/xinhua/00/chtb_0009 -
60 | nw/xinhua/00/chtb_0009 -
61 | nw/xinhua/00/chtb_0009 -
62 | nw/xinhua/00/chtb_0009 -
63 | nw/xinhua/00/chtb_0009 -
64 | nw/xinhua/00/chtb_0009 (10054
65 | nw/xinhua/00/chtb_0009 10054)
66 | nw/xinhua/00/chtb_0009 -
67 | nw/xinhua/00/chtb_0009 -
68 | nw/xinhua/00/chtb_0009 (10054)
69 | nw/xinhua/00/chtb_0009 -
70 | nw/xinhua/00/chtb_0009 -
71 | nw/xinhua/00/chtb_0009 -
72 | nw/xinhua/00/chtb_0009 -
73 |
74 | #end document
75 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-C-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 -
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 -
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 -
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 | nw/xinhua/00/chtb_0009 -
31 | nw/xinhua/00/chtb_0009 (10043
32 | nw/xinhua/00/chtb_0009 -
33 | nw/xinhua/00/chtb_0009 10043)
34 | nw/xinhua/00/chtb_0009 -
35 | nw/xinhua/00/chtb_0009 -
36 | nw/xinhua/00/chtb_0009 -
37 | nw/xinhua/00/chtb_0009 -
38 | nw/xinhua/00/chtb_0009 -
39 | nw/xinhua/00/chtb_0009 -
40 | nw/xinhua/00/chtb_0009 -
41 | nw/xinhua/00/chtb_0009 -
42 | nw/xinhua/00/chtb_0009 -
43 | nw/xinhua/00/chtb_0009 -
44 | nw/xinhua/00/chtb_0009 -
45 | nw/xinhua/00/chtb_0009 -
46 | nw/xinhua/00/chtb_0009 -
47 | nw/xinhua/00/chtb_0009 -
48 | nw/xinhua/00/chtb_0009 -
49 | nw/xinhua/00/chtb_0009 (10043)
50 | nw/xinhua/00/chtb_0009 -
51 | nw/xinhua/00/chtb_0009 -
52 | nw/xinhua/00/chtb_0009 -
53 | nw/xinhua/00/chtb_0009 -
54 | nw/xinhua/00/chtb_0009 -
55 | nw/xinhua/00/chtb_0009 -
56 | nw/xinhua/00/chtb_0009 (10043
57 | nw/xinhua/00/chtb_0009 -
58 | nw/xinhua/00/chtb_0009 -
59 | nw/xinhua/00/chtb_0009 -
60 | nw/xinhua/00/chtb_0009 10043)
61 | nw/xinhua/00/chtb_0009 -
62 | nw/xinhua/00/chtb_0009 -
63 | nw/xinhua/00/chtb_0009 -
64 | nw/xinhua/00/chtb_0009 (10054
65 | nw/xinhua/00/chtb_0009 10054)
66 | nw/xinhua/00/chtb_0009 -
67 | nw/xinhua/00/chtb_0009 -
68 | nw/xinhua/00/chtb_0009 (10054)
69 | nw/xinhua/00/chtb_0009 -
70 | nw/xinhua/00/chtb_0009 -
71 | nw/xinhua/00/chtb_0009 (10060)
72 | nw/xinhua/00/chtb_0009 (10060)
73 |
74 | #end document
75 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-C.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (10043
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 -
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 -
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 10043)
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 | nw/xinhua/00/chtb_0009 -
31 | nw/xinhua/00/chtb_0009 (10054
32 | nw/xinhua/00/chtb_0009 -
33 | nw/xinhua/00/chtb_0009 10054)
34 | nw/xinhua/00/chtb_0009 -
35 | nw/xinhua/00/chtb_0009 -
36 | nw/xinhua/00/chtb_0009 -
37 | nw/xinhua/00/chtb_0009 -
38 | nw/xinhua/00/chtb_0009 -
39 | nw/xinhua/00/chtb_0009 -
40 | nw/xinhua/00/chtb_0009 -
41 | nw/xinhua/00/chtb_0009 -
42 | nw/xinhua/00/chtb_0009 -
43 | nw/xinhua/00/chtb_0009 -
44 | nw/xinhua/00/chtb_0009 -
45 | nw/xinhua/00/chtb_0009 -
46 | nw/xinhua/00/chtb_0009 -
47 | nw/xinhua/00/chtb_0009 -
48 | nw/xinhua/00/chtb_0009 -
49 | nw/xinhua/00/chtb_0009 (10043)
50 | nw/xinhua/00/chtb_0009 -
51 | nw/xinhua/00/chtb_0009 -
52 | nw/xinhua/00/chtb_0009 -
53 | nw/xinhua/00/chtb_0009 -
54 | nw/xinhua/00/chtb_0009 -
55 | nw/xinhua/00/chtb_0009 -
56 | nw/xinhua/00/chtb_0009 -
57 | nw/xinhua/00/chtb_0009 -
58 | nw/xinhua/00/chtb_0009 -
59 | nw/xinhua/00/chtb_0009 -
60 | nw/xinhua/00/chtb_0009 -
61 | nw/xinhua/00/chtb_0009 -
62 | nw/xinhua/00/chtb_0009 -
63 | nw/xinhua/00/chtb_0009 -
64 | nw/xinhua/00/chtb_0009 (10054
65 | nw/xinhua/00/chtb_0009 10054)
66 | nw/xinhua/00/chtb_0009 -
67 | nw/xinhua/00/chtb_0009 -
68 | nw/xinhua/00/chtb_0009 (10054)
69 | nw/xinhua/00/chtb_0009 -
70 | nw/xinhua/00/chtb_0009 -
71 | nw/xinhua/00/chtb_0009 (10060)
72 | nw/xinhua/00/chtb_0009 (10060)
73 |
74 | #end document
75 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-D-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (1)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (3)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (3)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 (3)
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 (3)
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 (3)
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 (3)
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 (3)
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-D.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (1)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (2)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (2)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 (3)
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 (3)
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 (3)
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 (3)
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 (3)
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-E-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (1)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (2)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (2)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 (1)
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 (1)
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 (1)
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 (1)
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 (1)
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-E.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (1)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (2)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (2)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 (3)
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 (3)
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 (3)
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 (3)
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 (3)
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-F-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (2)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-F.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-G-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-G.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (2)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-H-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-H.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-I-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (2)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-I.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-J-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 -
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-J.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 -
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 -
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-K-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (2)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 (2)
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (3)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (3)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 (3)
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-K.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 -
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (1)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (1)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 -
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (1)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (1)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 (1)
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-L-1.response:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (2)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 -
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 (3)
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (3)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 (3)
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-L.key:
--------------------------------------------------------------------------------
1 | #begin document (nw/xinhua/00/chtb_0009); part 000
2 | nw/xinhua/00/chtb_0009 -
3 | nw/xinhua/00/chtb_0009 (1)
4 | nw/xinhua/00/chtb_0009 -
5 | nw/xinhua/00/chtb_0009 (1)
6 | nw/xinhua/00/chtb_0009 -
7 | nw/xinhua/00/chtb_0009 (1)
8 | nw/xinhua/00/chtb_0009 -
9 | nw/xinhua/00/chtb_0009 (2)
10 | nw/xinhua/00/chtb_0009 -
11 | nw/xinhua/00/chtb_0009 (2)
12 | nw/xinhua/00/chtb_0009 -
13 | nw/xinhua/00/chtb_0009 (2)
14 | nw/xinhua/00/chtb_0009 -
15 | nw/xinhua/00/chtb_0009 (2)
16 | nw/xinhua/00/chtb_0009 -
17 | nw/xinhua/00/chtb_0009 -
18 | nw/xinhua/00/chtb_0009 -
19 | nw/xinhua/00/chtb_0009 -
20 | nw/xinhua/00/chtb_0009 -
21 | nw/xinhua/00/chtb_0009 -
22 | nw/xinhua/00/chtb_0009 -
23 | nw/xinhua/00/chtb_0009 -
24 | nw/xinhua/00/chtb_0009 -
25 | nw/xinhua/00/chtb_0009 -
26 | nw/xinhua/00/chtb_0009 -
27 | nw/xinhua/00/chtb_0009 -
28 | nw/xinhua/00/chtb_0009 -
29 | nw/xinhua/00/chtb_0009 -
30 |
31 | #end document
32 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-1.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (0
15 | test2 0 3 d2 0)
16 | test2 0 4 jnk -
17 | test2 0 5 e (0)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (0
20 | test2 0 8 f2 -
21 | test2 0 9 f3 0)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-2.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (3
15 | test2 0 3 d2 3)
16 | test2 0 4 jnk -
17 | test2 0 5 e (4)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (5
20 | test2 0 8 f2 -
21 | test2 0 9 f3 5)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-3.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (1
15 | test2 0 3 d2 1)
16 | test2 0 4 jnk -
17 | test2 0 5 e (1)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (2
20 | test2 0 8 f2 -
21 | test2 0 9 f3 2)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-4.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 jnk (0)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (0)
17 | test2 0 5 e -
18 | test2 0 6 jnk (0)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-5.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 jnk (3)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (4)
17 | test2 0 5 e -
18 | test2 0 6 jnk (5)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M-6.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk (1)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (1)
17 | test2 0 5 e -
18 | test2 0 6 jnk (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-M.key:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (0
15 | test2 0 3 d2 0)
16 | test2 0 4 jnk -
17 | test2 0 5 e (0)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (0
20 | test2 0 8 f2 -
21 | test2 0 9 f3 0)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-1.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (3
15 | test2 0 3 d2 3)
16 | test2 0 4 jnk -
17 | test2 0 5 e (4)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (5
20 | test2 0 8 f2 -
21 | test2 0 9 f3 5)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-2.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (0
15 | test2 0 3 d2 0)
16 | test2 0 4 jnk -
17 | test2 0 5 e (0)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (0
20 | test2 0 8 f2 -
21 | test2 0 9 f3 0)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-3.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (1
15 | test2 0 3 d2 1)
16 | test2 0 4 jnk -
17 | test2 0 5 e (1)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (2
20 | test2 0 8 f2 -
21 | test2 0 9 f3 2)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-4.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 jnk (3)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (4)
17 | test2 0 5 e -
18 | test2 0 6 jnk (5)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-5.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (0)
13 | test2 0 1 jnk (0)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (0)
17 | test2 0 5 e -
18 | test2 0 6 jnk (0)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N-6.response:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (0
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 0)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (1)
13 | test2 0 1 jnk (1)
14 | test2 0 2 d1 -
15 | test2 0 3 d2 -
16 | test2 0 4 jnk (1)
17 | test2 0 5 e -
18 | test2 0 6 jnk (2)
19 | test2 0 7 f1 -
20 | test2 0 8 f2 -
21 | test2 0 9 f3 -
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/DataFiles/TC-N.key:
--------------------------------------------------------------------------------
1 | #begin document (LuoTestCase);
2 | test1 0 0 a1 (0
3 | test1 0 1 a2 0)
4 | test1 0 2 junk -
5 | test1 0 3 b1 (1
6 | test1 0 4 b2 -
7 | test1 0 5 b3 -
8 | test1 0 6 b4 1)
9 | test1 0 7 jnk -
10 | test1 0 8 . -
11 |
12 | test2 0 0 c (2)
13 | test2 0 1 jnk -
14 | test2 0 2 d1 (3
15 | test2 0 3 d2 3)
16 | test2 0 4 jnk -
17 | test2 0 5 e (4)
18 | test2 0 6 jnk -
19 | test2 0 7 f1 (5
20 | test2 0 8 f2 -
21 | test2 0 9 f3 5)
22 | test2 0 10 . -
23 | #end document
24 |
--------------------------------------------------------------------------------
/cort/reference-coreference-scorers/v8.01/test/test.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | BEGIN {
4 | $d = $0;
5 | $d =~ s/\/[^\/][^\/]*$//g;
6 | push(@INC, $d);
7 | push(@INC, $d . "/../lib");
8 | }
9 |
10 | use strict;
11 | use CorScorer;
12 | use CorefMetricTest;
13 | use CorefMetricTestConfig;
14 |
15 | my $error_tolerance = 1.e-4;
16 | my $script_dir = $0;
17 | $script_dir =~ s/\/[^\/][^\/]*$//g;
18 |
19 | foreach my $test_case (@CorefMetricTestConfig::TestCases) {
20 | my $id = $test_case->{'id'};
21 | my @key_response_files = ($script_dir . "/" . $test_case->{'key_file'},
22 | $script_dir . "/" . $test_case->{'response_file'});
23 | print "\nTesting case ($id): keyFile=", $key_response_files[0],
24 | " responseFile=", $key_response_files[1], "\n";
25 | my $expected_metrics = $test_case->{'expected_metrics'};
26 | foreach my $metric_name (sort keys %$expected_metrics) {
27 | my $expected_values = $expected_metrics->{$metric_name};
28 | *::SAVED_STDOUT = *STDOUT;
29 | *STDOUT = *::SUPPRESS_STDOUT;
30 | my @actual_counts = &CorScorer::Score($metric_name, @key_response_files);
31 | # Compute R, P, and F1 from raw counts.
32 | my @actual_values = CorefMetricTest::ComputeScoreFromCounts(@actual_counts);
33 | *STDOUT = *::SAVED_STDOUT;
34 | my $diff = CorefMetricTest::DiffExpectedAndActual($expected_values, \@actual_values);
35 | printf " metric: %+10s", $metric_name;
36 | if ($diff < $error_tolerance) {
37 | print " => PASS\n";
38 | } else {
39 | print " => FAIL\n";
40 | print " Expected (recall, prec, F1) = (", join(" ", @$expected_values), ")\n";
41 | print " Actual (recall, prec, F1) = (", join(" ", @actual_values), ")\n";
42 | #exit(1);
43 | }
44 | }
45 | }
46 |
47 |
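The harness loops over the cases configured in CorefMetricTestConfig, silences the scorer by pointing STDOUT at an unopened glob, recomputes recall/precision/F1 from the raw counts, and passes when the summed absolute deviation from the expected triple stays below 1e-4. The pass/fail rule, as a hypothetical Python rendering (a dimension mismatch fails, matching the 1.0e5 sentinel returned by DiffExpectedAndActual):

def within_tolerance(expected, actual, tol=1e-4):
    # Sum of absolute differences across (recall, precision, F1).
    if len(expected) != len(actual):
        return False
    return sum(abs(e - a) for e, a in zip(expected, actual)) < tol

# within_tolerance((0.5, 0.5, 0.5), (0.50004, 0.49997, 0.5)) -> True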
--------------------------------------------------------------------------------
/cort/resources/coreferent_pairs.obj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/cort/resources/coreferent_pairs.obj
--------------------------------------------------------------------------------
/cort/resources/singletons_not_cleaned.obj:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/cort/resources/singletons_not_cleaned.obj
--------------------------------------------------------------------------------
/cort/test/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/test/analysis/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/test/analysis/test_data_structures.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | import unittest
3 |
4 | from cort.analysis import data_structures
5 | from cort.core import documents
6 | from cort.core import mentions
7 | from cort.core import spans
8 |
9 |
10 | __author__ = 'smartschat'
11 |
12 |
13 | class TestCorefStructures(unittest.TestCase):
14 | def setUp(self):
15 | self.complicated_mention_example = """#begin document (test2); part 000
16 | test2 0 0 This NN (NP* - - - - - (0)
17 | test2 0 1 is NN * - - - - - -
18 | test2 0 2 just NN * - - - - - -
19 | test2 0 3 a NN * - - - - - (0|(1)
20 | test2 0 4 test NN * - - - - - 0)
21 | test2 0 5 . NN *) - - - - - -
22 |
23 | test2 0 0 It NN (NP* - - - - - (1)|(0
24 | test2 0 1 shows NN * - - - - - -
25 | test2 0 2 that NN * - - - - - (2)
26 | test2 0 3 the NN * - - - - - (2|(3
27 | test2 0 4 scorer NN * - - - - - 2)|0)
28 | test2 0 5 works NN * - - - - - 3)
29 | test2 0 6 . NN *) - - - - - -
30 |
31 | #end document"""
32 |
33 | self.complicated_mention_document = documents.CoNLLDocument(
34 | self.complicated_mention_example)
35 |
36 | def test_entity_graph_from_mentions(self):
37 | annotated_mentions = \
38 | self.complicated_mention_document.annotated_mentions
39 |
40 | first_graph = data_structures.EntityGraph({
41 | annotated_mentions[4]: [annotated_mentions[2],
42 | annotated_mentions[0]],
43 | annotated_mentions[2]: [annotated_mentions[0]]
44 | })
45 |
46 | second_graph = data_structures.EntityGraph({
47 | annotated_mentions[3]: [annotated_mentions[1]]
48 | })
49 |
50 | third_graph = data_structures.EntityGraph({
51 | annotated_mentions[6]: [annotated_mentions[5]]
52 | })
53 |
54 | self.assertEqual(
55 | [first_graph, second_graph, third_graph],
56 | data_structures.EntityGraph.from_mentions(annotated_mentions,
57 | "annotated_set_id"))
58 |
59 | def test_entity_graph_partition(self):
60 | annotated_mentions = \
61 | self.complicated_mention_document.annotated_mentions
62 |
63 | graph = data_structures.EntityGraph({
64 | annotated_mentions[4]: [annotated_mentions[2],
65 | annotated_mentions[0]],
66 | annotated_mentions[2]: [annotated_mentions[0]]
67 | })
68 |
69 | system_output = [
70 | mentions.Mention(
71 | self.complicated_mention_document,
72 | spans.Span(0, 0),
73 | {"set_id": 0}),
74 | mentions.Mention(
75 | self.complicated_mention_document,
76 | spans.Span(2, 3),
77 | {"set_id": 1}),
78 | mentions.Mention(
79 | self.complicated_mention_document,
80 | spans.Span(6, 10),
81 | {"set_id": 0}),
82 | mentions.Mention(
83 | self.complicated_mention_document,
84 | spans.Span(5, 5),
85 | {"set_id": 0})
86 | ]
87 |
88 | expected_edges = defaultdict(list)
89 | expected_edges[annotated_mentions[4]].append(annotated_mentions[0])
90 | expected = data_structures.EntityGraph(expected_edges)
91 |
92 | self.assertEqual(expected,
93 | graph.partition(
94 | data_structures.EntityGraph.from_mentions(
95 | system_output, "set_id")))
96 |
97 |
98 | if __name__ == '__main__':
99 | unittest.main()
--------------------------------------------------------------------------------
/cort/test/analysis/test_error_extractors.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | import unittest
3 |
4 | from cort.analysis import data_structures
5 | from cort.analysis import error_extractors
6 | from cort.analysis import spanning_tree_algorithms
7 | from cort.core import corpora
8 | from cort.core import mentions
9 | from cort.core import spans
10 |
11 | __author__ = 'smartschat'
12 |
13 |
14 | class TestErrorExtractor(unittest.TestCase):
15 | def setUp(self):
16 | self.first_cluster = [
17 | mentions.Mention(
18 | None,
19 | spans.Span(0, 0),
20 | {"tokens": ["a"], "annotated_set_id": 0}),
21 |
22 | mentions.Mention(
23 | None,
24 | spans.Span(1, 1),
25 | {"tokens": ["b"], "annotated_set_id": 0}),
26 |
27 | mentions.Mention(
28 | None,
29 | spans.Span(2, 3),
30 | {"tokens": ["c", "d"], "annotated_set_id": 0}),
31 |
32 | mentions.Mention(
33 | None,
34 | spans.Span(4, 5),
35 | {"tokens": ["e", "f"], "annotated_set_id": 0}),
36 |
37 | mentions.Mention(
38 | None,
39 | spans.Span(5, 6),
40 | {"tokens": ["f", "g"], "annotated_set_id": 0}),
41 |
42 | mentions.Mention(
43 | None,
44 | spans.Span(7, 7),
45 | {"tokens": ["h"], "annotated_set_id": 0}),
46 | ]
47 |
48 | self.second_cluster = [
49 | mentions.Mention(
50 | None,
51 | spans.Span(3, 4),
52 | {"tokens": ["d", "e"], "annotated_set_id": 1}),
53 |
54 | mentions.Mention(
55 | None,
56 | spans.Span(7, 8),
57 | {"tokens": ["h", "i"], "annotated_set_id": 1}),
58 |
59 | mentions.Mention(
60 | None,
61 | spans.Span(10, 10),
62 | {"tokens": ["k"], "annotated_set_id": 1})
63 | ]
64 |
65 | self.system_cluster = [
66 | mentions.Mention(
67 | None,
68 | spans.Span(0, 0),
69 | {"tokens": ["a"], "annotated_set_id": 0}),
70 |
71 | mentions.Mention(
72 | None,
73 | spans.Span(2, 3),
74 | {"tokens": ["c", "d"], "annotated_set_id": 0}),
75 |
76 | mentions.Mention(
77 | None,
78 | spans.Span(4, 5),
79 | {"tokens": ["e", "f"], "annotated_set_id": 2}),
80 |
81 | mentions.Mention(
82 | None,
83 | spans.Span(5, 6),
84 | {"tokens": ["f", "g"], "annotated_set_id": 2}),
85 |
86 | mentions.Mention(
87 | None,
88 | spans.Span(7, 7),
89 | {"tokens": ["h"], "annotated_set_id": 1}),
90 |
91 | mentions.Mention(
92 | None,
93 | spans.Span(10, 10),
94 | {"tokens": ["k"], "annotated_set_id": 1})
95 | ]
96 |
97 | self.maxDiff = None
98 |
99 | def test_compute_errors(self):
100 | # fake document using a named tuple
101 | document = namedtuple("Document", "annotated_mentions")
102 | doc_gold = document(self.first_cluster + self.second_cluster)
103 | doc_system = document(self.system_cluster)
104 | corpus_gold = corpora.Corpus("fake gold", [doc_gold])
105 | corpus_system = corpora.Corpus("fake system", [doc_system])
106 |
107 | ex = error_extractors.ErrorExtractor(
108 | corpus_gold,
109 | spanning_tree_algorithms.recall_closest,
110 | spanning_tree_algorithms.precision_system_output
111 | )
112 |
113 | ex.add_system(corpus_system)
114 |
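   |         # Recall errors are the gold spanning-tree edges whose anaphor and
   |         # antecedent the system did not place in the same entity; with
   |         # recall_closest, each gold mention links to its closest correctly
   |         # resolved antecedent, falling back to its closest gold antecedent.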
115 | self.assertEqual(
116 | data_structures.EnhancedSet([
117 | (self.first_cluster[1], self.first_cluster[0]),
118 | (self.first_cluster[3], self.first_cluster[2]),
119 | (self.first_cluster[5], self.first_cluster[4]),
120 | (self.second_cluster[1], self.second_cluster[0]),
121 | (self.second_cluster[2], self.second_cluster[1]),
122 | ]),
123 | ex.get_errors()["fake system"]["recall_errors"]["all"]
124 | )
125 |
126 | if __name__ == '__main__':
127 | unittest.main()
128 |
--------------------------------------------------------------------------------
/cort/test/analysis/test_spanning_tree_algorithms.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from cort.analysis import data_structures
4 | from cort.analysis import spanning_tree_algorithms
5 | from cort.core import mentions
6 | from cort.core import spans
7 |
8 |
9 | __author__ = 'smartschat'
10 |
11 |
12 | class TestSpanningTreeAlgorithms(unittest.TestCase):
13 | def setUp(self):
14 | self.gold_first_cluster = [
15 | mentions.Mention(
16 | None,
17 | spans.Span(0, 0),
18 | {"tokens": ["a"], "type": "NOM", "annotated_set_id": 0}),
19 |
20 | mentions.Mention(
21 | None,
22 | spans.Span(1, 1),
23 | {"tokens": ["US"], "type": "NAM", "annotated_set_id": 0}),
24 |
25 | mentions.Mention(
26 | None,
27 | spans.Span(2, 3),
28 | {"tokens": ["angry", "salesman"], "type": "PRO", "annotated_set_id": 0}),
29 |
30 | mentions.Mention(
31 | None,
32 | spans.Span(4, 5),
33 | {"tokens": ["the", "rainbow"], "type": "NAM",
34 | "annotated_set_id": 0}),
35 |
36 | mentions.Mention(
37 | None,
38 | spans.Span(5, 6),
39 | {"tokens": ["and", "far"], "type": "NOM",
40 | "annotated_set_id": 0}),
41 |
42 | mentions.Mention(
43 | None,
44 | spans.Span(7, 7),
45 | {"tokens": ["neypmd"], "type": "NOM", "annotated_set_id": 0}),
46 | ]
47 |
48 | self.gold_second_cluster = [
49 | mentions.Mention(
50 | None,
51 | spans.Span(7, 8),
52 | {"type": "NOM", "annotated_set_id": 1}),
53 |
54 | mentions.Mention(
55 | None,
56 | spans.Span(9, 9),
57 | {"type": "NAM", "annotated_set_id": 1}),
58 |
59 | mentions.Mention(
60 | None,
61 | spans.Span(10, 10),
62 | {"type": "PRO", "annotated_set_id": 1}),
63 | ]
64 |
65 | self.system1_mentions = [
66 | mentions.Mention(None, spans.Span(0, 0), {"set_id": 0}),
67 | mentions.Mention(None, spans.Span(2, 3), {"set_id": 0}),
68 | mentions.Mention(None, spans.Span(4, 5), {"set_id": 2}),
69 | mentions.Mention(None, spans.Span(5, 6), {"set_id": 2}),
70 | mentions.Mention(None, spans.Span(3, 4), {"set_id": 1}),
71 | mentions.Mention(None, spans.Span(7, 8), {"set_id": 1}),
72 | ]
73 |
74 | self.system2_cluster = [
75 | mentions.Mention(
76 | None,
77 | spans.Span(0, 0),
78 | {"tokens": ["a"], "set_id": 0}),
79 |
80 | mentions.Mention(
81 | None,
82 | spans.Span(2, 3),
83 | {"tokens": ["angry", "salesman"], "set_id": 0}),
84 |
85 | mentions.Mention(
86 | None,
87 | spans.Span(7, 8),
88 | {"tokens": ["snafu", "foo"], "set_id": 0}),
89 |
90 | mentions.Mention(
91 | None,
92 | spans.Span(9, 9),
93 | {"tokens": ["bar"], "set_id": 0}),
94 | ]
95 | self.system2_cluster[1].attributes["antecedent"] = \
96 | self.system2_cluster[0]
97 | self.system2_cluster[2].attributes["antecedent"] = \
98 | self.system2_cluster[0]
99 | self.system2_cluster[3].attributes["antecedent"] = \
100 | self.system2_cluster[2]
101 |
102 | self.maxDiff = None
103 |
104 | def test_recall_closest(self):
105 | gold_graph = data_structures.EntityGraph.from_mentions(
106 | self.gold_first_cluster, "annotated_set_id")[0]
107 |
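   |         # Mentions 2 and 4 have a correctly resolved antecedent in the
   |         # system output (spans (2,3)/(0,0) in set 0 and (5,6)/(4,5) in
   |         # set 2) and attach to it; all other mentions fall back to their
   |         # closest gold antecedent.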
108 | spanning_tree_edges = [
109 | (self.gold_first_cluster[1], self.gold_first_cluster[0]),
110 | (self.gold_first_cluster[2], self.gold_first_cluster[0]),
111 | (self.gold_first_cluster[3], self.gold_first_cluster[2]),
112 | (self.gold_first_cluster[4], self.gold_first_cluster[3]),
113 | (self.gold_first_cluster[5], self.gold_first_cluster[4])
114 | ]
115 |
116 | self.assertEqual(
117 | spanning_tree_edges,
118 | spanning_tree_algorithms.recall_closest(
119 | gold_graph,
120 | gold_graph.partition(
121 | data_structures.EntityGraph.from_mentions(
122 | self.system1_mentions, "set_id"))))
123 |
124 |     def test_recall_accessibility(self):
125 | gold_graph = data_structures.EntityGraph.from_mentions(
126 | self.gold_first_cluster, "annotated_set_id")[0]
127 |
128 | spanning_tree_edges = [
129 | (self.gold_first_cluster[1], self.gold_first_cluster[0]),
130 | (self.gold_first_cluster[2], self.gold_first_cluster[0]),
131 | (self.gold_first_cluster[3], self.gold_first_cluster[1]),
132 | (self.gold_first_cluster[4], self.gold_first_cluster[3]),
133 | (self.gold_first_cluster[5], self.gold_first_cluster[3])
134 | ]
135 |
136 | self.assertEqual(
137 | spanning_tree_edges,
138 | spanning_tree_algorithms.recall_accessibility(
139 | gold_graph,
140 | gold_graph.partition(
141 | data_structures.EntityGraph.from_mentions(
142 | self.system1_mentions, "set_id"))))
143 |
144 | def test_precision_system_output(self):
145 | gold_graph = data_structures.EntityGraph.from_mentions(
146 | self.system2_cluster, "set_id")[0]
147 |
148 | spanning_tree_edges = [
149 | (self.system2_cluster[1], self.system2_cluster[0]),
150 | (self.system2_cluster[2], self.system2_cluster[0]),
151 | (self.system2_cluster[3], self.system2_cluster[2])
152 | ]
153 |
154 | self.assertEqual(
155 | spanning_tree_edges,
156 | spanning_tree_algorithms.precision_system_output(
157 | gold_graph,
158 | gold_graph.partition(
159 | data_structures.EntityGraph.from_mentions(
160 | self.gold_first_cluster, "annotated_set_id"))))
161 |
162 |
163 | if __name__ == '__main__':
164 | unittest.main()
165 |
--------------------------------------------------------------------------------
/cort/test/core/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/test/core/test_corpora.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 |
4 | from cort.core.corpora import Corpus
5 |
6 |
7 | __author__ = 'smartschat'
8 |
9 |
10 | class TestCorpora(unittest.TestCase):
11 | def setUp(self):
12 | directory = os.path.dirname(os.path.realpath(__file__)) + "/resources/"
13 |         self.input_data = open(directory + "input.conll", "r")
14 |
15 |     def tearDown(self):
16 |         self.input_data.close()
17 |
15 | def test_conll_reader(self):
16 | corpus = Corpus.from_file("test", self.input_data)
17 | self.assertEqual(5, len(corpus.documents))
18 |
19 | if __name__ == '__main__':
20 | unittest.main()
21 |
--------------------------------------------------------------------------------
/cort/test/core/test_external_data.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from cort.core.external_data import GenderData
4 |
5 | __author__ = 'smartschat'
6 |
7 |
8 | class TestGenderData(unittest.TestCase):
9 | def setUp(self):
10 | self.gender_data = GenderData.get_instance()
11 |
12 | def test_look_up(self):
13 | self.assertEqual("NEUTRAL",
14 | self.gender_data.look_up({"tokens": ["snafu"]}))
15 |
16 | self.assertEqual("FEMALE",
17 | self.gender_data.look_up(
18 | {"tokens": ["Barbara", "Bush"],
19 | "head": ["Barbara", "Bush"]}))
20 |
21 | self.assertEqual("MALE",
22 | self.gender_data.look_up({
23 | "tokens": ["Footballer", "Zidane"],
24 | "head": ["Zidane"]}))
25 |
26 | if __name__ == '__main__':
27 | unittest.main()
28 |
--------------------------------------------------------------------------------
/cort/test/core/test_spans.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from cort.core.spans import Span
4 |
5 |
6 | __author__ = 'smartschat'
7 |
8 |
9 | class TestSpan(unittest.TestCase):
10 | def test_span(self):
11 | span = Span(0, 1)
12 | self.assertEqual(0, span.begin)
13 | self.assertEqual(1, span.end)
14 |
15 | def test_parse(self):
16 | self.assertEqual(Span(10, 12), Span.parse("(10, 12)"))
17 | self.assertEqual(Span(10, 12), Span.parse("(10,12)"))
18 |
19 | if __name__ == '__main__':
20 | unittest.main()
21 |
--------------------------------------------------------------------------------
/cort/test/core/test_util.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from cort.core.util import clean_via_pos
4 |
5 | __author__ = 'smartschat'
6 |
7 |
8 | class TestUtil(unittest.TestCase):
9 | def test_clean_via_pos(self):
10 | self.assertEqual(
11 | ["newly-elect", "leader", "wife"],
12 | clean_via_pos(
13 | ["the", "newly-elect", "leader", "'s", "wife"],
14 | ["DT", "JJ", "NN", "POS", "NN"]))
15 |
16 |
17 | if __name__ == '__main__':
18 | unittest.main()
19 |
--------------------------------------------------------------------------------
/cort/test/multigraph/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'martscsn'
2 |
--------------------------------------------------------------------------------
/cort/util/__init__.py:
--------------------------------------------------------------------------------
1 | "Utility functions."
2 |
3 | __author__ = 'sebastian'
4 |
--------------------------------------------------------------------------------
/cort/util/import_helper.py:
--------------------------------------------------------------------------------
1 | import numpy
2 | import pyximport
3 | pyximport.install(setup_args={"include_dirs": numpy.get_include()})
4 |
5 | import importlib
6 | import inspect
7 |
8 |
9 | __author__ = 'martscsn'
10 |
11 |
12 | def import_from_path(name):
13 |     parts = name.split(".")
14 |     package_name = ".".join(parts[:-1])
15 |     cls = parts[-1]
16 |
17 | package = importlib.import_module(package_name)
18 |
19 | imported = getattr(package, cls)
20 |
21 | return imported
22 |
23 |
24 | def get_features(filename):
25 | mention_features = []
26 | pairwise_features = []
27 |
28 |     # "with" ensures the feature file is closed after reading
29 |     with open(filename) as feature_file:
30 |         for line in feature_file:
31 |             feature = import_from_path(line.strip())
32 |             number_of_arguments = len(inspect.getargspec(feature)[0])
33 |
34 |             if number_of_arguments == 1:
35 |                 mention_features.append(feature)
36 |             elif number_of_arguments == 2:
37 |                 pairwise_features.append(feature)
38 |             else:
39 |                 raise ValueError("Features must have one or two arguments, "
40 |                                  "feature " + line.strip() + " has " +
41 |                                  str(number_of_arguments) + " arguments.")
42 |
43 |     return mention_features, pairwise_features
42 |
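   | # A (hypothetical) feature file for get_features lists one dotted path per
   | # line; one-argument features become mention features, two-argument ones
   | # pairwise features, e.g.:
   | #
   | #     cort.coreference.features.fine_type
   | #     cort.coreference.features.head_match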
--------------------------------------------------------------------------------
/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/smartschat/cort/2349f0308a4115acb89d442fe945533bdb3b70e2/plot.png
--------------------------------------------------------------------------------
/scripts/acl15demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import print_function
4 | import io
5 | import logging
6 | import pickle
7 | import numpy
8 |
9 | import pyximport
10 | pyximport.install(setup_args={"include_dirs": numpy.get_include()})
11 |
12 | from cort.preprocessing import pipeline
13 | from cort.core import mention_extractor
14 | from cort.coreference.approaches import mention_ranking
15 | from cort.coreference import cost_functions, clusterer
16 | from cort.coreference import experiments
17 | from cort.coreference import features
18 | from cort.coreference import instance_extractors
19 | from cort.core import corpora
20 | from cort.analysis import visualization, error_extractors, spanning_tree_algorithms
21 |
22 | try:
23 | import tkinter as tki
24 | except ImportError:
25 | import Tkinter as tki
26 |
27 | __author__ = 'smartschat'
28 |
29 | logging.basicConfig(level=logging.INFO,
30 |                     format='%(asctime)s %(levelname)s %(message)s')
31 |
32 | class LiveDemo(object):
33 | def __init__(self):
34 | mention_features = [
35 | features.fine_type,
36 | features.gender,
37 | features.number,
38 | features.sem_class,
39 | features.deprel,
40 | features.head_ner,
41 | features.length,
42 | features.head,
43 | features.first,
44 | features.last,
45 | features.preceding_token,
46 | features.next_token,
47 | features.governor,
48 | features.ancestry
49 | ]
50 |
51 | pairwise_features = [
52 | features.exact_match,
53 | features.head_match,
54 | features.same_speaker,
55 | features.alias,
56 | features.sentence_distance,
57 | features.embedding,
58 | features.modifier,
59 | features.tokens_contained,
60 | features.head_contained,
61 | features.token_distance
62 | ]
63 |
64 | self.extractor = instance_extractors.InstanceExtractor(
65 | mention_ranking.extract_substructures,
66 | mention_features,
67 | pairwise_features,
68 | cost_functions.null_cost
69 | )
70 |
71 | logging.info("Loading model.")
72 |
73 |         with open("latent-model-train.obj", "rb") as model_file:
74 |             priors, weights = pickle.load(model_file)
74 |
75 | self.perceptron = mention_ranking.RankingPerceptron(
76 | priors=priors,
77 | weights=weights,
78 | cost_scaling=0
79 | )
80 |
81 | logging.info("Loading CoreNLP models.")
82 | self.p = pipeline.Pipeline(
83 | "/home/sebastian/Downloads/stanford-corenlp-full-2015-04-20")
84 |
85 | self.root = tki.Tk()
86 | self.root.title("cort Demo")
87 |
88 | # create a Frame for the Text and Scrollbar
89 | self.txt_frm = tki.Frame(self.root, width=400, height=200)
90 | self.txt_frm.pack(fill="both", expand=True)
91 |
92 | # ensure a consistent GUI size
93 | self.txt_frm.grid_propagate(False)
94 |
95 | # implement stretchability
96 | self.txt_frm.grid_rowconfigure(0, weight=1)
97 | self.txt_frm.grid_columnconfigure(0, weight=1)
98 |
99 | # create a Text widget
100 | self.txt = tki.Text(self.txt_frm, borderwidth=3, relief="sunken")
101 | self.txt.config(font=("consolas", 12), undo=True, wrap='word')
102 | self.txt.grid(row=0, column=0, sticky="nsew", padx=2, pady=2)
103 |
104 | # create a Scrollbar and associate it with txt
105 | scrollb = tki.Scrollbar(self.txt_frm, command=self.txt.yview)
106 | scrollb.grid(row=0, column=1, sticky='nsew')
107 | self.txt['yscrollcommand'] = scrollb.set
108 |
109 | self.button = tki.Button(self.root, text='Resolve Coreference',
110 | command=self.do_coreference)
111 |
112 | self.button.pack()
113 |
114 | def run(self):
115 | self.root.mainloop()
116 |
117 | def do_coreference(self):
118 | testing_corpus = corpora.Corpus("input", [self.p.run_on_doc(
119 |             io.StringIO(self.txt.get("1.0", tki.END)), "input")])
120 |
121 | logging.info("Extracting system mentions.")
122 | for doc in testing_corpus:
123 | doc.system_mentions = mention_extractor.extract_system_mentions(doc)
124 |
125 | mention_entity_mapping, antecedent_mapping = experiments.predict(
126 | testing_corpus,
127 | self.extractor,
128 | self.perceptron,
129 | clusterer.all_ante
130 | )
131 |
132 | testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping)
133 |
134 |         logging.info("Visualizing.")
135 |
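   |         # Every mention handed to the visualizer needs an "annotated_set_id":
   |         # clustered mentions keep their system set id, unclustered ones
   |         # (set_id is None) receive fresh singleton ids above max_id.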
136 | for doc in testing_corpus:
137 | max_id = 0
138 |
139 | for mention in doc.system_mentions[1:]:
140 | set_id = mention.attributes["set_id"]
141 |
142 |                 if set_id is not None:
143 | max_id = max(set_id, max_id)
144 |
145 | max_id += 1
146 |
147 | doc.annotated_mentions = []
148 |
149 | for i, mention in enumerate(doc.system_mentions[1:]):
150 |                 if mention.attributes["set_id"] is not None:
151 | mention.attributes["annotated_set_id"] = mention.attributes[
152 | "set_id"]
153 | else:
154 | mention.attributes["annotated_set_id"] = max_id + i
155 | doc.annotated_mentions.append(mention)
156 |
157 | ex = error_extractors.ErrorExtractor(testing_corpus,
158 | spanning_tree_algorithms.recall_accessibility,
159 | spanning_tree_algorithms.precision_system_output)
160 |
161 | ex.add_system(testing_corpus)
162 |
163 | decisions = ex.get_errors()
164 |
165 | visualizer = visualization.Visualizer(decisions, "input",
166 | for_raw_input=True)
167 |
168 | visualizer.run()
169 |
170 | if __name__ == '__main__':
171 |     demo = LiveDemo()
172 |     demo.run()
173 |
--------------------------------------------------------------------------------
/scripts/naacl15-demo.py:
--------------------------------------------------------------------------------
1 | import codecs
2 |
3 |
4 | from cort.analysis import error_extractors
5 | from cort.analysis import plotting
6 | from cort.analysis import spanning_tree_algorithms
7 | from cort.core import corpora
8 |
9 |
10 | __author__ = 'smartschat'
11 |
12 |
13 | # read in corpora
14 | reference = corpora.Corpus.from_file("reference", codecs.open("dev.gold", "r",
15 | "utf-8"))
16 | pair = corpora.Corpus.from_file("pair", codecs.open("pair-dev.out", "r", "utf-8"))
17 | tree = corpora.Corpus.from_file("tree", codecs.open("tree-dev.out", "r", "utf-8"))
18 |
19 | # optional -- not needed when you only want to compute recall errors
20 | pair.read_antecedents(open('pair-dev.antecedents'))
21 | tree.read_antecedents(open('tree-dev.antecedents'))
22 |
23 | # define error extractor
24 | extractor = error_extractors.ErrorExtractor(
25 | reference,
26 | spanning_tree_algorithms.recall_accessibility,
27 | spanning_tree_algorithms.precision_system_output
28 | )
29 |
30 | # extract errors
31 | extractor.add_system(pair)
32 | extractor.add_system(tree)
33 |
34 | errors = extractor.get_errors()
35 |
36 | # categorize by mention type of anaphor
37 | by_type = errors.categorize(
38 | lambda err: err[0].attributes["type"]
39 | )
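   | # by_type keeps the per-system error structure and adds the category as a
   | # further key, e.g. by_type["pair"]["recall_errors"]["all"]["NOM"]
   | # (this layout is used for the head statistics below)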
40 |
41 |
42 | # visualize
43 | by_type.visualize("pair")
44 |
45 | # filter by distance
46 | by_type_filtered = by_type.filter(
47 | lambda err: err[0].attributes["sentence_id"] - err[1].attributes[
48 | "sentence_id"] <= 3
49 | )
50 |
51 | # plot
52 | pair_errs = by_type_filtered["pair"]["recall_errors"]["all"]
53 | tree_errs = by_type_filtered["tree"]["recall_errors"]["all"]
54 |
55 | plotting.plot(
56 | [("pair", [(cat, len(errs)) for cat, errs in pair_errs.items()]),
57 | ("tree", [(cat, len(errs)) for cat, errs in tree_errs.items()])],
58 | "Recall Errors",
59 | "Type of anaphor",
60 | "Number of Errors")
61 |
62 | # more advanced features
63 |
64 | # is anaphor a gold mention?
65 | all_gold = set()
66 | for doc in reference:
67 | for mention in doc.annotated_mentions:
68 | all_gold.add(mention)
69 |
70 |
71 | def is_anaphor_gold(mention):
72 | if mention in all_gold:
73 | return "is_gold"
74 | else:
75 | return "is_not_gold"
76 |
77 | is_ana_gold = by_type.categorize(lambda err: is_anaphor_gold(err[0]))
78 |
79 | # head statistics for NOM errors
80 | from collections import Counter
81 |
82 | for system in ["pair", "tree"]:
83 | nom_rec_errs = by_type[system]["recall_errors"]["all"]["NOM"]
84 | all_heads = [" ".join(err[0].attributes["head"]).lower() for err in nom_rec_errs]
85 | most_common = Counter(all_heads).most_common(10)
86 | print(system, most_common)
87 |
88 | # common errors:
89 | common = {
90 | "common": {
91 | "recall_errors": {},
92 | "precision_errors": {}
93 | }
94 | }
95 |
96 | common["common"]["recall_errors"]["all"] = errors["pair"]["recall_errors"][
97 | "all"].intersection(errors["tree"]["recall_errors"]["all"])
98 |
99 | common["common"]["precision_errors"]["all"] = errors["pair"]["precision_errors"][
100 | "all"].intersection(errors["tree"]["precision_errors"]["all"])
101 |
102 | from cort.analysis import data_structures
103 | common = data_structures.StructuredCoreferenceAnalysis(
104 | common, errors.reference, errors.corpora
105 | )
106 |
107 | # plot decisions
108 | decs = by_type_filtered["pair"]["decisions"]["all"]
109 | prec_errs = by_type_filtered["pair"]["precision_errors"]["all"]
110 |
111 | plotting.plot(
112 | [("decisions", [(cat, len(errs)) for cat, errs in decs.items()]),
113 | ("errors", [(cat, len(errs)) for cat, errs in prec_errs.items()])],
114 | "Decisions and Errors",
115 | "Type of anaphor",
116 | "Number")
--------------------------------------------------------------------------------
/scripts/train-and-predict-all.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 |
4 | import subprocess
5 |
6 |
7 | __author__ = 'smartschat'
8 |
9 |
10 | def get_extractor(data_set, system):
11 | if system == "closest" or system == "latent":
12 | return "cort.coreference.approaches.mention_ranking.extract_substructures"
13 | elif system == "tree":
14 | return "cort.coreference.approaches.antecedent_trees.extract_substructures"
15 | elif system == "pair":
16 | if data_set == "train":
17 | return "cort.coreference.approaches.mention_pairs" \
18 | ".extract_training_substructures"
19 | else:
20 | return "cort.coreference.approaches.mention_pairs" \
21 | ".extract_testing_substructures"
22 |
23 |
24 | def get_perceptron(system):
25 | if system == "pair":
26 | return "cort.coreference.approaches.mention_pairs.MentionPairsPerceptron"
27 | elif system == "closest":
28 | return "cort.coreference.approaches.mention_ranking.RankingPerceptronClosest"
29 | elif system == "latent":
30 | return "cort.coreference.approaches.mention_ranking.RankingPerceptron"
31 | elif system == "tree":
32 | return "cort.coreference.approaches.antecedent_trees.AntecedentTreePerceptron"
33 |
34 |
35 | def get_cost_function(system):
36 | if system == "pair":
37 | return "cort.coreference.cost_functions.null_cost"
38 | else:
39 | return "cort.coreference.cost_functions.cost_based_on_consistency"
40 |
41 |
42 | def get_clusterer(system):
43 | if system == "pair":
44 | return "cort.coreference.clusterer.best_first"
45 | else:
46 | return "cort.coreference.clusterer.all_ante"
47 |
48 |
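   | # For illustration, the first cort-train invocation below for system "pair"
   | # expands to:
   | #   cort-train -in /data/nlp/martscsn/thesis/data/input/train.auto
   | #     -out model-pair-train.obj
   | #     -extractor cort.coreference.approaches.mention_pairs.extract_training_substructures
   | #     -perceptron cort.coreference.approaches.mention_pairs.MentionPairsPerceptron
   | #     -cost_function cort.coreference.cost_functions.null_cost
   | #     -cost_scaling 100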
49 | systems = ["pair", "closest", "latent", "tree"]
50 | data_sets = ["dev", "test"]
51 |
52 | for system in systems:
53 | print("Training", system, "on train.")
54 | subprocess.call([
55 | "cort-train",
56 | "-in", "/data/nlp/martscsn/thesis/data/input/train.auto",
57 | "-out", "model-" + system + "-train.obj",
58 | "-extractor", get_extractor("train", system),
59 | "-perceptron", get_perceptron(system),
60 | "-cost_function", get_cost_function(system),
61 | "-cost_scaling", "100"])
62 |
63 | print("Training", system, "on dev+train.")
64 | subprocess.call([
65 | "cort-train",
66 | "-in", "/data/nlp/martscsn/thesis/data/input/train+dev.auto",
67 | "-out", "model-" + system + "-train+dev.obj",
68 | "-extractor", get_extractor("train", system),
69 | "-perceptron", get_perceptron(system),
70 | "-cost_function", get_cost_function(system),
71 | "-cost_scaling", "100"])
72 |
73 | for data_set in data_sets:
74 | print("Predicting", system, "on", data_set)
75 | if data_set == "dev":
76 | model = "model-" + system + "-train.obj"
77 | else:
78 | model = "model-" + system + "-train+dev.obj"
79 |
80 | subprocess.call([
81 | "cort-predict-conll",
82 | "-in", "/data/nlp/martscsn/thesis/data/input/" + data_set +
83 | ".auto",
84 | "-model", model,
85 | "-out", system + "-" + data_set + ".out",
86 | "-ante", system + "-" + data_set + ".antecedents",
87 | "-gold", "/data/nlp/martscsn/thesis/data/input/" + data_set +
88 | ".gold",
89 | "-extractor", get_extractor(data_set, system),
90 | "-perceptron", get_perceptron(system),
91 | "-clusterer", get_clusterer(system)])
92 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 |
4 | setup(
5 | name='cort',
6 | version='0.2.4.5',
7 | packages=['cort',
8 | 'cort.analysis',
9 | 'cort.core',
10 | 'cort.test',
11 | 'cort.coreference',
12 | 'cort.test.multigraph',
13 | 'cort.test.analysis',
14 | 'cort.test.core',
15 | 'cort.coreference.multigraph',
16 | 'cort.coreference.approaches',
17 | 'cort.util',
18 | 'cort.preprocessing',
19 | 'stanford_corenlp_pywrapper'],
20 |
21 | url='http://github.com/smartschat/cort',
22 | license='MIT',
23 | author='Sebastian Martschat, Thierry Goeckel, Patrick Claus',
24 | author_email='sebastian.martschat@gmail.com',
25 | description='A coreference resolution research toolkit.',
26 |     keywords=['NLP', 'CL', 'natural language processing',
27 |               'computational linguistics', 'coreference resolution',
28 |               'text analytics'],
29 |     classifiers=[
30 | 'Intended Audience :: Science/Research',
31 | 'Programming Language :: Python :: 2.7',
32 | 'Programming Language :: Python :: 3.3',
33 | 'Topic :: Scientific/Engineering',
34 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
35 | 'Topic :: Text Processing',
36 | ],
37 | install_requires=['nltk >= 3.0.1', 'numpy', 'matplotlib', 'mmh3', 'cython',
38 | 'future', 'jpype1', 'beautifulsoup4',
39 | 'pystanforddependencies >= 0.3.1'],
40 | package_data={
41 | 'cort': ['analysis/visualization/style.css',
42 | 'analysis/visualization/lib/*',
43 | 'resources/*',
44 | 'config_files/*',
45 | 'coreference/perceptrons.pyx',
46 | "reference-coreference-scorers/v8.01/*.*",
47 | "reference-coreference-scorers/v8.01/lib/*.pm",
48 | "reference-coreference-scorers/v8.01/lib/Algorithm/*",
49 | "reference-coreference-scorers/v8.01/lib/Data/*",
50 | "reference-coreference-scorers/v8.01/lib/Math/*"],
51 | 'stanford_corenlp_pywrapper': ['rcorenlp.r',
52 | 'lib/*',
53 | 'javasrc/corenlp/*',
54 | 'javasrc/util/misc/*',
55 | 'javasrc/util/*.java'],
56 | },
57 | scripts=['bin/cort-train', 'bin/cort-predict-conll',
58 | 'bin/cort-predict-raw', 'bin/cort-visualize',
59 | 'bin/run-multigraph']
60 | )
61 |
--------------------------------------------------------------------------------
/stanford_corenlp_pywrapper/__init__.py:
--------------------------------------------------------------------------------
1 | from .sockwrap import *
2 |
--------------------------------------------------------------------------------
/stanford_corenlp_pywrapper/javasrc/corenlp/PipeRunner.java:
--------------------------------------------------------------------------------
1 | package corenlp;
2 |
3 | import org.codehaus.jackson.JsonNode;
4 |
5 | import util.Arr;
6 | import util.BasicFileIO;
7 | import util.JsonUtil;
8 | import util.U;
9 |
10 | /**
11 | * stdin/stdout commandline pipe mode that lightly wraps JsonPipeline.
12 | *
13 | * INPUT: one line per document.
14 | * docid \t TextAsJsonStringOrObjectWithTextField
15 | * OUTPUT: as JSON, one doc per line ("jdoc").
16 | * docid \t {sentences: [ {sentobj}, {sentobj}, ... ]}
17 | * where each sentobj is
18 | * {tokens: [...], char_offsets: [...], ....}
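   |  * Example (hypothetical) line pair:
   |  *   INPUT:  doc1 \t "The scorer works."
   |  *   OUTPUT: doc1 \t {"sentences": [{"tokens": ["The","scorer","works","."], ...}]}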
19 | *
20 | */
21 | public class PipeRunner {
22 | ProcessingMode mode;
23 | JsonPipeline parse;
24 |
25 | static enum InputFormat {
26 | DETECT_JSON_VARIANT,
27 | RAW_TEXT
28 | };
29 |
30 | /** the pre-baked processing modes, that define annotators and outputs. */
31 | static enum ProcessingMode {
32 | NOMODE,
33 | SSPLIT,
34 | POS,
35 | NER,
36 | PARSE,
37 | NERPARSE;
38 | }
39 | static ProcessingMode modeFromString(String _mode) {
40 | return
41 | _mode.equals("nomode") ? ProcessingMode.NOMODE :
42 | _mode.equals("ssplit") ? ProcessingMode.SSPLIT :
43 | _mode.equals("pos") ? ProcessingMode.POS :
44 | _mode.equals("ner") ? ProcessingMode.NER :
45 | _mode.equals("parse") ? ProcessingMode.PARSE :
46 | _mode.equals("nerparse") ? ProcessingMode.NERPARSE :
47 | null;
48 | }
49 |
50 |
51 | static void usage() {
52 |         U.p("corenlp.Parse [options] <mode>\n" +
53 |             "Processes document texts on stdin and outputs NLP-annotated versions.\n" +
54 | "Both input and output formats are one document per line.\n" +
55 | "\n" +
56 | "Input format can be either\n" +
57 | " one column: TextField\n" +
58 | " two columns: docid \\t TextField\n" +
59 | "Where TextField could be either\n" +
60 | " * a JSON string, or\n" +
61 | " * a JSON object with field 'text'.\n" +
62 | "--raw-input allows the text field to be raw text, interpreted as UTF-8 encoded.\n" +
63 | "Note that JSON strings can be preferable, since they can contain any type of whitespace.\n" +
64 | "\n" +
65 | "In all cases, the output mode is two-column: docid \\t NLPInfoAsJson\n" +
66 | "");
67 | System.exit(1);
68 | }
69 |
70 | public void runStdinStdout(InputFormat inputFormat) {
71 | for (String line : BasicFileIO.STDIN_LINES) {
72 | System.err.print(".");
73 |
74 | String[] parts = line.split("\t");
75 | String docid, doctext;
76 | JsonNode payload = null;
77 | if (inputFormat == InputFormat.DETECT_JSON_VARIANT) {
78 |                 payload = JsonUtil.parse(parts[parts.length-1]);
79 | doctext =
80 | payload.isTextual() ? payload.asText() :
81 | payload.has("text") ? payload.get("text").asText() :
82 | null;
83 | }
84 | else if (inputFormat == InputFormat.RAW_TEXT) {
85 | doctext = parts[parts.length-1];
86 | }
87 |             else { throw new RuntimeException("unexpected input format: " + inputFormat); }
88 |
89 |             docid = parts.length >= 2 ? parts[0] :
90 |                     payload != null && payload.has("docid") ? payload.get("docid").getTextValue() :
91 |                     "doc" + parse.numDocs;
92 |
93 | assert docid != null : "inconsistent 'docid' key";
94 | if (doctext == null) throw new RuntimeException("Couldn't interpret JSON payload: should be string, or else object with a 'text' field.");
95 |
96 | JsonNode outDoc = parse.processTextDocument(doctext);
97 | U.pf("%s\t%s\n", docid, JsonUtil.toJson(outDoc));
98 | }
99 |
100 | double elapsedSec = 1.0*(System.currentTimeMillis() - parse.startMilli) / 1000;
101 | System.err.print("\n");
102 | System.err.printf("%d docs, %d tokens, %.1f tok/sec, %.1f byte/sec\n", parse.numDocs, parse.numTokens, parse.numTokens*1.0/elapsedSec, parse.numChars*1.0/elapsedSec);
103 | }
104 |
105 | public static void main(String[] args) {
106 | if (args.length < 1) {
107 | usage();
108 | }
109 | InputFormat inputFormat = InputFormat.DETECT_JSON_VARIANT;
110 |
111 | while (args.length > 1) {
112 | String flag = args[0];
113 | if (flag.equals("--raw-input")) {
114 | inputFormat = InputFormat.RAW_TEXT;
115 | args = Arr.subArray(args, 1, args.length);
116 | }
117 | else { throw new RuntimeException("bad flag: " + flag); }
118 | }
119 |
120 |
121 | throw new RuntimeException("TODO need to handle mode parsing; in the meantime this is broken");
122 |
123 | // PipeRunner runner = new PipeRunner();
124 | // String _mode = args[0];
125 | // ProcessingMode mode = modeFromString(_mode);
126 | // if (runner.mode==null) {
127 | // U.pf("Bad mode '%s' ... to disable a mode, use 'nomode'\n", _mode);
128 | // usage();
129 | // }
130 | // runner.runStdinStdout(inputFormat);
131 | }
132 |
133 |
134 |
135 | }
136 |
--------------------------------------------------------------------------------
/stanford_corenlp_pywrapper/javasrc/util/JsonUtil.java:
--------------------------------------------------------------------------------
1 | package util;
2 |
3 | import java.io.IOException;
4 | import java.util.*;
5 | import org.codehaus.jackson.JsonNode;
6 | import org.codehaus.jackson.JsonProcessingException;
7 | import org.codehaus.jackson.map.ObjectMapper;
8 | import org.codehaus.jackson.map.type.TypeFactory;
9 | import org.codehaus.jackson.node.*;
10 |
11 | import com.google.common.collect.Multiset;
12 |
13 | import util.misc.Pair;
14 |
15 | /** Simplified wrapper functions for the Jackson JSON library.
16 |  *  This is half-baked; still learning the right way to use the library.
17 |  */
18 | public class JsonUtil {
19 |
20 | public static ObjectMapper om;
21 | static {
22 | om = new ObjectMapper();
23 | }
24 |
25 |     public static void main(String[] args) {
26 |         List<String> x = toList(args[0], String.class);
27 | U.p(x);
28 | }
29 |
30 | public static String getTextDefault(JsonNode ob, String keyname, String defaultValue) {
31 | return ob.has(keyname) ? ob.get(keyname).asText() : defaultValue;
32 | }
33 |
34 | //////////////////////////////////////
35 |
36 | // toList() derived from
37 | // http://stackoverflow.com/questions/9942475/convert-json-to-multiple-objects-using-jackson
38 |
39 |     public static <T> ArrayList<T> toList(String jsonString, final Class<T> type) {
40 | try {
41 | return om.readValue(jsonString, TypeFactory.defaultInstance().constructCollectionType(ArrayList.class, type));
42 | } catch (IOException e) {
43 | return null;
44 | }
45 | }
46 |
47 |     public static <T> ArrayList<T> toList(JsonNode jsonNode, final Class<T> type) {
48 | try {
49 | return om.readValue(jsonNode, TypeFactory.defaultInstance().constructCollectionType(ArrayList.class, type));
50 | } catch (IOException e) {
51 | return null;
52 | }
53 | }
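   |     // Usage sketch (assumes Jackson 1.x as imported above; input is a
   |     // JSON array literal):
   |     //   ArrayList<String> xs = JsonUtil.toList("[\"a\", \"b\"]", String.class);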
54 |
55 |     public static <T> ObjectNode toJson(Multiset<T> counts) {
56 |         ObjectNode jmap = newObject();
57 |         for (Multiset.Entry<T> e : counts.entrySet()) {
58 | jmap.put(e.getElement().toString(), e.getCount());
59 | }
60 | return jmap;
61 | }
62 |
63 |     public static <T> JsonNode toJson(final List<T> data) {
64 | ArrayNode jlist = new ObjectMapper().createArrayNode();
65 | for (T elt : data) {
66 | jlist.add( toJson(elt) );
67 | }
68 | return jlist;
69 | }
70 |
71 | public static JsonNode toJson(final Pair pair) {
72 | try {
73 | List