├── _config.yml
├── LICENSE_notes.txt
├── _includes
└── head-custom-google-analytics.html
├── LICENSE.txt
├── trivial.py
├── span_dict.py
├── README.md
├── sample.gold_ptb
├── category.py
├── convert.py
├── sample.candc
├── analysis.py
├── sample.ccgbank
├── rule.py
├── trees.py
└── markup_convert.py
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: CCG to PST conversion
3 | description: Convert Combinatory Categorial Grammar derivations to Phrase Structure Trees.
4 | show_downloads: true
5 | google_analytics: G-JVV2VPL5CX
6 |
--------------------------------------------------------------------------------
/LICENSE_notes.txt:
--------------------------------------------------------------------------------
1 | The license on this software is the ISC license.
2 |
3 | "The ISC copyright is functionally equivalent to a two-term BSD copyright with
4 | language removed that is made unnecessary by the Berne convention. This is the
5 | preferred license for new code incorporated into OpenBSD."
6 | from http://www.openbsd.org/policy.html
7 |
--------------------------------------------------------------------------------
/_includes/head-custom-google-analytics.html:
--------------------------------------------------------------------------------
1 | {% if site.google_analytics %}
2 |
3 |
4 |
11 | {% endif %}
12 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014, Jonathan K Kummerfeld
2 |
3 | Permission to use, copy, modify, and/or distribute this software for any
4 | purpose with or without fee is hereby granted, provided that the above
5 | copyright notice and this permission notice appear in all copies.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
8 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
9 | FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
11 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
12 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
13 | PERFORMANCE OF THIS SOFTWARE.
14 |
15 |
--------------------------------------------------------------------------------
/trivial.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys
4 | import trees, category
5 |
def convert(source, argv=None, log=sys.stdout):
    """Build a PTB tree that reproduces the bracket structure of *source*.

    Labels are either the atomic category (with square-bracket features
    removed), or "VP" when the category contains a slash.  Nodes that carry
    a word copy the word and POS tag, and are labelled with the POS tag.

    When *argv* is None the converted tree alone is returned (this is the
    form used for the recursive calls); otherwise the result is the triple
    ``(True, tree, None)``.  *log* is accepted but not used here.
    """
    node = trees.PTB_Tree()
    has_slash = '\\' in source.category or '/' in source.category
    if has_slash:
        node.label = "VP"
    else:
        node.label = category.strip_square_brackets(source.category)

    if source.word is not None:
        node.word = source.word
        node.pos = source.pos
        # A node with a word is labelled by its POS tag, overriding the above.
        node.label = source.pos

    for child in source.subtrees:
        converted_child = convert(child)
        node.subtrees.append(converted_child)

    if argv is not None:
        return True, node, None
    return node
24 |
if __name__ == '__main__':
    # Interactive use: read one CCG tree per line from stdin and print the
    # trivially converted tree for each.
    print("Please enter CCG trees:")
    for raw_line in sys.stdin:
        ccg_tree = trees.CCG_Tree(raw_line.strip())
        print(convert(ccg_tree))
29 |
--------------------------------------------------------------------------------
/span_dict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import trees
4 |
# Leaves with one of these labels, or one of these words, do not advance the
# word position (they are not counted as part of any span).
labels_to_ignore = {"-NONE-", "TOP", "."}
words_to_ignore = {"'", "`", "''", "``", "--", ":", ";", "-", ",", "..."}


def span_dict(tree, ans, pos=0):
    """Record the labelled spans of *tree* in *ans* and return the end position.

    tree -- a node with ``label``, ``word`` and ``subtrees`` attributes.
    ans  -- dict filled in place, mapping ``(start, end)`` to a set of labels.
    pos  -- position of the first counted word under this node.

    A leaf advances the position by one unless its label or word is in the
    ignore sets above.  An internal node that covers no counted words adds
    nothing.  Otherwise an entry for its span is created (possibly left as an
    empty set) and its normalised label is added unless that label is empty
    or 'TOP'.
    """
    start = pos
    label = tree.label
    word = tree.word
    if not tree.subtrees:
        if label in labels_to_ignore or word in words_to_ignore:
            return pos
        return pos + 1

    for subtree in tree.subtrees:
        pos = span_dict(subtree, ans, pos)
    end = pos
    if start == end:
        return start

    labels_here = ans.setdefault((start, end), set())
    # Drop function tags ("NP-SBJ" -> "NP") but leave labels that themselves
    # start with '-' (e.g. "-NONE-") alone.  startswith() rather than
    # label[0] so that an empty label does not raise IndexError.
    if not label.startswith('-'):
        label = label.split('-')[0]
    label = label.split('=')[0]
    if label == 'PRT':
        label = 'ADVP'  # another collins change
    if label != '' and label != 'TOP':
        labels_here.add(label)
    return pos
30 |
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This software converts Combinatory Categorial Grammar (CCG) derivations to Phrase Structure Trees (PST). For a full description of the method, and discussion of results, see:
2 |
3 | [Robust Conversion of CCG Derivations to Phrase Structure Trees](https://aclweb.org/anthology/P/P12/P12-2021.pdf),
4 | Jonathan K. Kummerfeld, James R. Curran and Dan Klein,
5 | ACL (short) 2012
6 |
7 | To use the system, download it in one of these ways, and run it as shown below:
8 |
9 | - [Download .zip](https://github.com/jkkummerfeld/berkeley-ccg2pst/zipball/master)
10 | - [Download .tar.gz](https://github.com/jkkummerfeld/berkeley-ccg2pst/tarball/master)
11 | - `git clone https://github.com/jkkummerfeld/berkeley-ccg2pst.git`
12 |
13 | If you use my code in your own work, please cite the paper:
14 |
15 | ```
16 | @InProceedings{Kummerfeld-Klein-Curran:2012:ACL,
17 | author = {Jonathan K. Kummerfeld and Dan Klein and James R. Curran},
18 | title = {Robust Conversion of {CCG} Derivations to Phrase Structure Trees},
19 | booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
20 | month = {July},
21 | year = {2012},
22 | address = {Jeju Island, Korea},
23 | pages = {105--109},
24 | software = {http://github.com/jkkummerfeld/berkeley-ccg2pst/},
25 | url = {http://www.aclweb.org/anthology/P12-2021},
26 | }
27 | ```
28 |
29 | ## Running the code
30 |
31 | On a sample of CCGbank:
32 | ```
33 | ./convert.py sample.gold_ptb sample.ccgbank -print_comparison -prefix=sample.ccgbank -verbose -method=markedup ./markedup
34 | ```
35 |
36 | On a sample of C&C Parser output:
37 | ```
38 | ./convert.py sample.gold_ptb sample.candc -print_comparison -prefix=sample.candc -verbose -method=markedup ./markedup
39 | ```
40 |
41 | Conversion output will be in:
42 | ```
43 | sample.ccgbank.auto
44 | sample.candc.auto
45 | ```
46 |
47 | The code also comes with a sample of parses from the Penn Treebank section 00,
48 | the corresponding parses from CCGbank section 00, and the C&C parser output on
49 | the same sentences.
50 |
--------------------------------------------------------------------------------
/sample.gold_ptb:
--------------------------------------------------------------------------------
1 | (ROOT (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. .)))
2 | (ROOT (S (NP-SBJ (NNP Mr.) (NNP Vinken)) (VP (VBZ is) (NP-PRD (NP (NN chairman)) (PP (IN of) (NP (NP (NNP Elsevier) (NNP N.V.)) (, ,) (NP (DT the) (NNP Dutch) (VBG publishing) (NN group)))))) (. .)))
3 | (ROOT (S (NP-SBJ-1 (NP (NNP Rudolph) (NNP Agnew)) (, ,) (UCP (ADJP (NP (CD 55) (NNS years)) (JJ old)) (CC and) (NP (NP (JJ former) (NN chairman)) (PP (IN of) (NP (NNP Consolidated) (NNP Gold) (NNP Fields) (NNP PLC))))) (, ,)) (VP (VBD was) (VP (VBN named) (S (NP-SBJ (-NONE- *-1)) (NP-PRD (NP (DT a) (JJ nonexecutive) (NN director)) (PP (IN of) (NP (DT this) (JJ British) (JJ industrial) (NN conglomerate))))))) (. .)))
4 | (ROOT (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) (RRC (ADVP-TMP (RB once)) (VP (VBN used) (NP (-NONE- *)) (S-CLR (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB make) (NP (NNP Kent) (NN cigarette) (NNS filters)))))))) (VP (VBZ has) (VP (VBN caused) (NP (NP (DT a) (JJ high) (NN percentage)) (PP (IN of) (NP (NN cancer) (NNS deaths))) (PP-LOC (IN among) (NP (NP (DT a) (NN group)) (PP (IN of) (NP (NP (NNS workers)) (RRC (VP (VBN exposed) (NP (-NONE- *)) (PP-CLR (TO to) (NP (PRP it))) (ADVP-TMP (NP (QP (RBR more) (IN than) (CD 30)) (NNS years)) (IN ago)))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)))
5 | (ROOT (S (S-TPC-2 (NP-SBJ (NP (DT The) (NN asbestos) (NN fiber)) (, ,) (NP (NN crocidolite)) (, ,)) (VP (VBZ is) (ADJP-PRD (RB unusually) (JJ resilient)) (SBAR-TMP (IN once) (S (NP-SBJ (PRP it)) (VP (VBZ enters) (NP (DT the) (NNS lungs))))) (, ,) (PP (IN with) (S-NOM (NP-SBJ (NP (RB even) (JJ brief) (NNS exposures)) (PP (TO to) (NP (PRP it)))) (VP (VBG causing) (NP (NP (NNS symptoms)) (SBAR (WHNP-1 (WDT that)) (S (NP-SBJ (-NONE- *T*-1)) (VP (VBP show) (PRT (RP up)) (ADVP-TMP (NP (NNS decades)) (JJ later))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD said) (SBAR (-NONE- 0) (S (-NONE- *T*-2)))) (. .)))
6 | (ROOT (S (NP-SBJ (NP (NNP Lorillard) (NNP Inc.)) (, ,) (NP (NP (DT the) (NN unit)) (PP (IN of) (NP (ADJP (JJ New) (JJ York-based)) (NNP Loews) (NNP Corp.))) (SBAR (WHNP-2 (WDT that)) (S (NP-SBJ (-NONE- *T*-2)) (VP (VBZ makes) (NP (NNP Kent) (NNS cigarettes)))))) (, ,)) (VP (VBD stopped) (VP (VBG using) (NP (NN crocidolite)) (PP-LOC-CLR (IN in) (NP (PRP$ its) (NN Micronite) (NN cigarette) (NNS filters))) (PP-TMP (IN in) (NP (CD 1956))))) (. .)))
7 | (ROOT (S (SBAR-ADV (IN Although) (S (NP-SBJ-2 (JJ preliminary) (NNS findings)) (VP (VBD were) (VP (VBN reported) (NP (-NONE- *-2)) (ADVP-TMP (NP (QP (RBR more) (IN than) (DT a)) (NN year)) (IN ago)))))) (, ,) (NP-SBJ (DT the) (JJS latest) (NNS results)) (VP (VBP appear) (PP-LOC (IN in) (NP (NP (NP (NP (NN today) (POS 's)) (NNP New) (NNP England) (NNP Journal)) (PP (IN of) (NP (NNP Medicine)))) (, ,) (NP (NP (DT a) (NN forum)) (ADJP (JJ likely) (S (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB bring) (NP (JJ new) (NN attention)) (PP-DIR (TO to) (NP (DT the) (NN problem))))))))))) (. .)))
8 | (ROOT (S (NP-SBJ (DT A) (NNP Lorillard) (NN spokewoman)) (VP (VBD said) (, ,) (`` ``) (S (NP-SBJ (DT This)) (VP (VBZ is) (NP-PRD (DT an) (JJ old) (NN story))))) (. .)))
9 | (ROOT (S (NP-SBJ (PRP We)) (VP (VBP 're) (VP (VBG talking) (PP-CLR (IN about) (ADVP-TMP (ADVP (NP (NNS years)) (IN ago)) (SBAR (IN before) (S (NP-SBJ (NN anyone)) (VP (VBD heard) (PP-CLR (IN of) (S-NOM (NP-SBJ (NN asbestos)) (VP (VBG having) (NP (DT any) (JJ questionable) (NNS properties)))))))))))) (. .)))
10 | (ROOT (S (NP-SBJ (EX There)) (VP (VBZ is) (NP-PRD (DT no) (NN asbestos)) (PP-LOC (IN in) (NP (PRP$ our) (NNS products))) (ADVP-TMP (RB now))) (. .) ('' '')))
11 |
--------------------------------------------------------------------------------
/category.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Helpers for CCG category strings: stripping annotations, splitting a category at its main slash, and comparing categories
3 |
4 | import re
5 |
CURLY_BRACES_RE = re.compile('{[^}]*}')


def strip_braces(category):
    """Return *category* with every ``{...}`` group removed."""
    without_braces = re.sub(CURLY_BRACES_RE, '', category)
    return without_braces
9 |
# Raw string: '\[' and '\]' are not valid escapes in an ordinary string
# literal, which newer Python versions warn about.  The compiled pattern is
# unchanged.
SQUARE_BRACKETS_RE = re.compile(r'\[[^\]]*\]')


def strip_square_brackets(category):
    """Return *category* with every ``[...]`` group removed.

    None is passed through unchanged so callers can hand in a missing
    category.
    """
    if category is None:
        return None
    return SQUARE_BRACKETS_RE.sub('', category)
16 |
def remove_extra_brackets(category):
    """Strip one pair of outer parentheses from *category* when it is redundant.

    The category is returned unchanged when it does not both start with '('
    and end with ')', or when some non-parenthesis character sits at nesting
    depth zero (so the first '(' does not enclose the whole string).
    """
    wrapped = category[0] == '(' and category[-1] == ')'
    if not wrapped:
        return category
    if '\\' not in category and '/' not in category:
        return category[1:-1]

    nesting = 0
    for char in category:
        if char == '(':
            nesting += 1
        elif char == ')':
            nesting -= 1
        elif nesting == 0:
            # Something lies outside every bracket pair: keep the brackets.
            return category
    return category[1:-1]
35 |
def divide(category):
    """Split *category* at its first slash that is outside all parentheses.

    Returns a three-item list ``[left, slash, right]``.  When the category
    contains no slash, or none at depth zero, the result is
    ``[category, None, None]``.  Parentheses that wrap the whole of either
    side are removed.
    """
    if not ('\\' in category or '/' in category):
        return [category, None, None]
    category = remove_extra_brackets(category)

    sep = None
    nesting = 0
    for index, char in enumerate(category):
        if char == '(':
            nesting += 1
        elif char == ')':
            nesting -= 1
        elif nesting == 0 and char in '/\\':
            sep = index
            break
    if sep is None:
        return [category, None, None]

    parts = [category[:sep], category[sep], category[sep + 1:]]
    for side in (0, 2):
        # Peel outer brackets for as long as they enclose the whole side.
        while parts[side][0] == '(' and parts[side][-1] == ')':
            inner = parts[side][1:-1]
            nesting = 0
            balanced = True
            for char in inner:
                if char == '(':
                    nesting += 1
                elif char == ')':
                    nesting -= 1
                if nesting < 0:
                    # The first '(' closed before the end, e.g. "(A)/(B)".
                    balanced = False
            if not balanced:
                break
            parts[side] = inner
    return parts
72 |
def compare(cat0, cat1):
    """Return True if two categories match, treating missing features as wildcards.

    The categories must be identical once square-bracket features are
    removed.  Every NP and S without a feature is then given ``[X]``, and the
    bracketed features are compared position by position, with ``[X]``
    matching anything.  False is returned if either category is None.
    """
    if cat0 is None or cat1 is None:
        return False
    # Check the general structure matches
    if strip_square_brackets(cat0) != strip_square_brackets(cat1):
        return False

    def with_wildcards(cat):
        # remove [conj], which is present temporarily at the end
        cat = cat.split('[conj]')[0]
        for atom in ('NP', 'S'):
            # Tag every occurrence, then undo the tag where a real feature
            # already followed the atom.
            cat = cat.replace(atom, atom + '[X]')
            cat = cat.replace(atom + '[X][', atom + '[')
        return cat

    cat0 = with_wildcards(cat0)
    cat1 = with_wildcards(cat1)

    pairs0 = SQUARE_BRACKETS_RE.findall(cat0)
    pairs1 = SQUARE_BRACKETS_RE.findall(cat1)
    # Having no brackets indicates no S, so it's fine
    if not pairs0 or not pairs1:
        return True
    # For debugging
    if len(pairs0) != len(pairs1):
        print('confused by:')
        print(cat0, cat1)
    # Make sure they all match (with X as a wildcard)
    for index, feature0 in enumerate(pairs0):
        if feature0 != '[X]' and pairs1[index] != '[X]' and feature0 != pairs1[index]:
            return False
    return True
108 |
if __name__ == '__main__':
    # Running this module directly does nothing.
    pass
111 |
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import sys, re
4 | import trees, category, rule
5 | import analysis
6 | import span_dict
7 | import trivial, markup_convert
8 |
# Output destinations.  score_count and print_stats write to log_out (and
# score_count also to colour_out); the script body at the bottom of this file
# rebinds all five to files when -prefix= is given.
tree_out = None
gold_out = None
log_out = sys.stdout
colour_out = None
analysis_out = sys.stdout
14 |
def _crosses_any(span, others):
    """Return True if *span* partially overlaps (crosses) any span in *others*."""
    start, end = span
    return any(
        start < other[0] < end < other[1] or other[0] < start < other[1] < end
        for other in others
    )


def _report_span(parts, crossing=False):
    """Print one span message to log_out and colour_out.

    When *crossing* is true the message is prefixed with 'crossing' (in red
    on colour_out) on the same line.
    """
    if crossing:
        print('crossing', end=" ", file=log_out)
        print('\033[01;31mcrossing\033[00m', end=" ", file=colour_out)
    print(*parts, file=log_out)
    print(*parts, file=colour_out)


def score_count(target, auto):
    """Compare the labelled spans of *auto* against those of *target*.

    Both trees are reduced to span dictionaries with span_dict.span_dict.
    Differing label sets, missing spans and extra spans are written to
    log_out and colour_out, with crossing brackets flagged.

    Returns ``(gold_nodes, auto_nodes, match_brackets, match_labels)``:
    the number of labels in target, the number in auto, the number of
    span-matched labels, and the number of span-and-label matches.
    """
    target_spans = {}
    span_dict.span_dict(target, target_spans)
    auto_spans = {}
    span_dict.span_dict(auto, auto_spans)

    gold_nodes = 0
    auto_nodes = 0
    match_brackets = 0
    match_labels = 0
    print(target_spans.keys(), file=log_out)
    print(auto_spans.keys(), file=log_out)

    for key, gold_labels in target_spans.items():
        gold_nodes += len(gold_labels)
        if key in auto_spans:
            auto_labels = auto_spans[key]
            match_brackets += min(len(auto_labels), len(gold_labels))
            match_labels += len(auto_labels.intersection(gold_labels))
            if len(gold_labels.symmetric_difference(auto_labels)) != 0:
                words = target.word_yield(key)[1]
                _report_span(('different label sets: ', key, gold_labels, auto_labels, words))
        else:
            # Check for crossing brackets
            words = target.word_yield(key)[1]
            _report_span(('missing span: ', key, gold_labels, words),
                         crossing=_crosses_any(key, auto_spans))

    for key, auto_labels in auto_spans.items():
        auto_nodes += len(auto_labels)
        if key not in target_spans:
            # Check for crossing brackets
            words = target.word_yield(key)[1]
            _report_span(('extra span: ', key, auto_labels, words),
                         crossing=_crosses_any(key, target_spans))

    return gold_nodes, auto_nodes, match_brackets, match_labels
70 |
def calc_prf(overlap, auto, gold):
    """Return (precision, recall, f-score) for the given counts.

    overlap -- number of matching items.
    auto    -- number of items proposed.
    gold    -- number of items in the reference.

    With no gold items the result is all 1.0; with gold items but no
    proposed items it is all 0.0.  The f-score is 0 when precision plus
    recall is not above 1e-5.
    """
    if gold == 0:
        return 1.0, 1.0, 1.0
    if auto == 0:
        return 0.0, 0.0, 0.0
    precision = float(overlap) / auto
    recall = float(overlap) / gold
    total = precision + recall
    f_score = 2 * precision * recall / total if total > 1e-5 else 0
    return precision, recall, f_score
82 |
def compare_words(pwords, cwords):
    """Return the fraction of *cwords* found, in order, in *pwords*.

    A single cursor moves forward through pwords looking for each word of
    cwords in turn; it stops at the last element when a word is not found,
    so later words can then only match that last element.

    An empty cwords gives 1.0 (there is nothing that fails to match) and an
    empty pwords gives 0.0, instead of raising ZeroDivisionError and
    IndexError respectively.
    """
    if not cwords:
        return 1.0
    if not pwords:
        return 0.0
    last = len(pwords) - 1
    i = 0
    match = 0
    for word in cwords:
        while word != pwords[i] and i < last:
            i += 1
        if word == pwords[i]:
            match += 1
    return float(match) / len(cwords)
94 |
def print_stats(stats_name, gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences):
    """Write a four-line score summary, each line prefixed by *stats_name*, to log_out.

    The lines give the raw counts, bracket precision/recall/f-score, label
    precision/recall/f-score (all as percentages, via calc_prf), and the
    number and percentage of sentences that were entirely correct, both
    with labels and for brackets alone.
    """
    p_brac, r_brac, f_brac = calc_prf(match_brackets, auto_nodes, gold_nodes)
    p_labe, r_labe, f_labe = calc_prf(match_labels, auto_nodes, gold_nodes)
    if total_sentences:
        # Both percentages use the same factor of 100; the bracket-only one
        # was previously scaled by 102.
        pct_correct = correct_sentences * 100.0 / total_sentences
        pct_correct_brackets = correct_sentences_brackets * 100.0 / total_sentences
    else:
        # No sentences were scored: report 0.00 rather than dividing by zero.
        pct_correct = 0.0
        pct_correct_brackets = 0.0
    print(stats_name, "counts: ", gold_nodes, auto_nodes, ' ', match_brackets, match_labels, file=log_out)
    print(stats_name, "brackets: %.2f %.2f %.2f" % (p_brac * 100, r_brac * 100, f_brac * 100), file=log_out)
    print(stats_name, "labels: %.2f %.2f %.2f" % (p_labe * 100, r_labe * 100, f_labe * 100), file=log_out)
    print(stats_name, "sentences: %d of %d (i.e. %.2f), just brackets %d of %d (i.e. %.2f)" % (correct_sentences, total_sentences, pct_correct, correct_sentences_brackets, total_sentences, pct_correct_brackets), file=log_out)
102 |
if __name__ == '__main__':
    # Command-line driver: read CCG derivations (argv[2]) alongside gold PTB
    # trees (argv[1]), convert each derivation with the selected method,
    # score the result against the gold tree, and print per-sentence,
    # cumulative and final statistics.
    args = ' '.join(sys.argv)
    methods = {
        'trivial': trivial.convert,
        'markedup': markup_convert.convert
    }
    if len(sys.argv) < 3:
        print("Usage:\n%s <gold_ptb> <ccg_derivations> [options]" % sys.argv[0])
        print("Options:")
        print("\t-method=[%s]" % (','.join(methods.keys())))
        print("\t-print_comparison")
        print("\t-sents=<n>")
        print("\t-max_length=<n>")
        print("\t-prefix=<path>")
        print("\t-exclude_no_parse")
        sys.exit(1)

    only_parsed = '-exclude_no_parse' in args
    if '-prefix=' in args:
        # With a prefix, every output stream goes to its own file.
        prefix = args.split('-prefix=')[1].split(' ')[0]
        tree_out = open(prefix + '.auto', 'w')
        gold_out = open(prefix + '.gold', 'w')
        log_out = open(prefix + '.log', 'w')
        colour_out = open(prefix + '.colour', 'w')
        analysis_out = open(prefix + '.analysis', 'w')
        for output in [log_out, colour_out, analysis_out]:
            print("# this file was generated by the following command(s):", file=output)
            print("# " + args, file=output)
            print('', file=output)
    else:
        print("# this file was generated by the following command(s):")
        print("# " + args)
        print()

    # Each option is read from its own "-name=" marker; -max_length was
    # previously parsed from the -sents= value.
    total_sentences = 1000000 if "-sents=" not in args else int(args.split('-sents=')[1].split(' ')[0])
    max_sent_length = -1 if "-max_length=" not in args else int(args.split('-max_length=')[1].split(' ')[0])

    gold_nodes = 0
    auto_nodes = 0
    match_brackets = 0
    match_labels = 0
    correct_sentences = 0
    correct_sentences_brackets = 0
    print_trees = "-print_comparison" in args
    with open(sys.argv[1]) as ptb_source, open(sys.argv[2]) as ccg_source:
        for i in range(total_sentences):
            source = trees.read_CCG_tree(ccg_source)
            target = trees.read_PTB_tree(ptb_source)
            if source is None or target is None:
                # Ran out of input: remember how many sentences were read.
                total_sentences = i
                break

            if source.category is None:
                if not only_parsed and tree_out is not None:
                    print(target.one_line_repr(), file=gold_out)
                    print("", file=tree_out)
                # only evaluate on sentences that receive a parse
                continue

            pwords = target.get_words()
            cwords = source.get_words()
            if len(cwords) != 0:
                # Skip gold trees until one shares enough words with the
                # derivation, emitting a placeholder for each one skipped.
                while compare_words(pwords, cwords) < 0.7:
                    if not only_parsed and tree_out is not None:
                        print(target.one_line_repr(), file=gold_out)
                        print("", file=tree_out)
                    target = trees.read_PTB_tree(ptb_source)
                    if target is None:
                        print("Ran out of sentences trying to find a match", file=sys.stderr)
                        sys.exit(2)
                    pwords = target.get_words()

            if max_sent_length > 0 and len(pwords) > max_sent_length:
                continue

            if target.label == '':
                target.label = 'ROOT'

            if print_trees:
                print(source, file=log_out)
                print(target, file=log_out)
            use, auto_ptb, auto_schema = (False, None, None)
            if 'method' in args:
                method_name = args.split('method=')[1].split()[0]
                ans = methods[method_name](source, sys.argv, log_out)
            else:
                ans = trivial.convert(source, sys.argv, log_out)
            use, auto_ptb, auto_schema = ans

            if not use:
                print("Not being included", file=log_out)
            if auto_schema is not None:
                analysis.analyse(source, target, auto_ptb, auto_schema, analysis_out)
            if tree_out is not None:
                if use:
                    print(target.one_line_repr(), file=gold_out)
                    print(auto_ptb.one_line_repr(), file=tree_out)
                elif not only_parsed:
                    print(target.one_line_repr(), file=gold_out)
                    print("", file=tree_out)

            if print_trees:
                print(auto_ptb, file=log_out)
                if colour_out is not None:
                    print(source, file=colour_out)
                    print(auto_ptb.repr_with_corrections(target), file=colour_out)

            scores = score_count(target, auto_ptb)
            gold_nodes += scores[0]
            auto_nodes += scores[1]
            match_brackets += scores[2]
            match_labels += scores[3]
            if scores[0] == scores[1] == scores[2]:
                correct_sentences_brackets += 1
            if scores[0] == scores[1] == scores[3]:
                correct_sentences += 1
            print_stats('', scores[0], scores[1], scores[2], scores[3], correct_sentences, correct_sentences_brackets, i + 1)
            print_stats('cumulative', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, i + 1)
    print_stats('final', gold_nodes, auto_nodes, match_brackets, match_labels, correct_sentences, correct_sentences_brackets, total_sentences)

    # Close any files opened for -prefix=; sys.stdout is left alone.
    for handle in (tree_out, gold_out, log_out, colour_out, analysis_out):
        if handle is not None and handle is not sys.stdout:
            handle.close()
227 |
--------------------------------------------------------------------------------
/sample.candc:
--------------------------------------------------------------------------------
1 | ID=1 PARSER=GOLD NUMPARSE=0
2 | ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( () ( ( ( () ( () ())) ( () ( () ( () ())))) ( () ( () ()))))))
3 | ID=2 PARSER=GOLD NUMPARSE=0
4 | ( ( ( () ())) ( () ( ( ()) ( () ( ( ( () ())) ( () ( () ( () ( () ( () ()))))))))))
5 | ID=3 PARSER=GOLD NUMPARSE=0
6 | ( ( ( ( ( () ())) ( () ( ( ( ( () ())) ())))) ( () ( ( ( () ())) ( () ( ( () ( () ( () ())))))))) ( () ( () ( () ( ( () ( () ())) ( () ( () ( () ( () ( () ()))))))))))
7 | ID=4 PARSER=GOLD NUMPARSE=0
8 | ( ( ( ( () ()) ( () ( ( ()) ( ( () ( () ( () ( () ( ( () ( () ()))))))))))) ( () ( () ( ( () ( () ())) ( () ( ( ( () ())) ( () ( ( () ()) ( () ( ( ()) ( ( ( () ( () ())) ( ( ( ( ( () ()) ()) ())) ()))))))))))))) ( () ( ( ()) ( () ()))))
9 | ID=5 PARSER=GOLD NUMPARSE=0
10 | ( ( ( ( () ( () ())) ( () ( ()))) ( () ( ( () ( () ())) ( () ( () ( ( () ( () ())) ( () ( ( () ( ( ( () ( () ()))) ( () ()))) ( () (