├── .gitignore
├── README.md
├── bin
│   ├── add_sent_ids.py
│   ├── compute_transition_probs.py
│   ├── convert-bilou-single-field.py
│   ├── convert-bilou.sh
│   ├── convert-bio.sh
│   ├── download_clearnlp.sh
│   ├── extract_conll_parse_file.py
│   ├── extract_conll_prop_file.py
│   ├── filter-conll-2012.py
│   ├── jackknife_ontonotes_pos.py
│   ├── parse-conll09.sh
│   ├── parse-conll12.sh
│   ├── preprocess_conll2012.sh
│   ├── preprocess_conll2012_sdeps.sh
│   └── preprocess_conll2012_sdeps_old.sh
├── config_decode_pos.xml
├── conll05
│   ├── convert-bio.sh
│   ├── extract_dev_from_ptb.sh
│   ├── extract_test_from_brown.sh
│   ├── extract_test_from_ptb.sh
│   ├── extract_train_from_ptb.sh
│   ├── make-brown-test.sh
│   ├── make-devset.sh
│   ├── make-trainset.sh
│   ├── make-wsj-test.sh
│   ├── preprocess_conll05.sh
│   ├── preprocess_conll05_sdeps.sh
│   └── set_paths.env
├── headrule_en_stanford.txt
├── ptb
│   ├── convert_spos.sh
│   └── ptb2stanford.sh
└── testdata.conll

/.gitignore:
--------------------------------------------------------------------------------
*.jar
.idea

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# conll2012-preprocess-parsing
Scripts for pre-processing the CoNLL-2012 dataset for syntactic dependency parsing.

These scripts attempt to replicate the pre-processing of the [CoNLL-2012 subset](http://conll.cemantix.org/2012/data.html) of the OntoNotes corpus described in the paper
[It Depends: Dependency Parser Comparison Using A Web-based Evaluation Tool](http://www.aclweb.org/anthology/P/P15/P15-1038.pdf).

Specifically, they use [ClearNLP](https://github.com/clir/clearnlp) to (1) convert the
constituency parses to dependency structure (w/ head rules [described here](http://www.mathcs.emory.edu/~choi/doc/cu-2012-choi.pdf))
and (2) assign automatic part-of-speech tags.

The scripts assume you have already extracted the CoNLL-2012 split from the OntoNotes corpus, [as described here](http://conll.cemantix.org/2012/data.html).

Note that these scripts *do not* remove length-1 sentences, as was done in the paper cited above. We leave you to do that if you choose; a one-liner for doing so is sketched at the end of this section.

Requirements
--------------
- I ran these with Java 8

Currently these scripts write to the `/path/to/conll-2012` directories below. Happy to accept patches that will write files to
somewhere else.
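If you do want to drop length-1 sentences to match the paper, one option is to filter the combined files produced below after the fact. A minimal, untested sketch (adjust the file names to taste; it relies only on sentences being separated by blank lines):
```
awk 'BEGIN{RS=""; ORS="\n\n"} {if (split($0, lines, "\n") > 1) print}' conll2012-train.txt > conll2012-train.filtered.txt
```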

Download ClearNLP
--------------
```
./bin/download_clearnlp.sh
```

Do the pre-processing
--------------
```
./bin/preprocess_conll2012.sh /path/to/conll-2012/dev
./bin/preprocess_conll2012.sh /path/to/conll-2012/test
./bin/preprocess_conll2012.sh /path/to/conll-2012/train
```

Combine into single files
--------------
```
for f in `find /path/to/conll-2012/train -type f -name "*\.parse\.dep\.combined"`; do cat $f >> conll2012-train.txt; done
for f in `find /path/to/conll-2012/dev -type f -name "*\.parse\.dep\.combined"`; do cat $f >> conll2012-dev.txt; done
for f in `find /path/to/conll-2012/test -type f -name "*\.parse\.dep\.combined"`; do cat $f >> conll2012-test.txt; done
```

Convert segments to BILOU encoding
-------------
```
./bin/convert-bilou.sh /path/to/file
```

Extracting props
-------------
This is useful for producing the gold file expected by `srl-eval.pl`
```
python bin/extract_conll_prop_file.py --input_file /path/to/conll2012-test.txt --word_field 3 --first_prop_field 14 --pred_field 9 --pred_field_offset 1 --no_take_last
```

File format
-------------
One token per line, with blank lines between sentences. The columns are, roughly: document ID, part number, token index, word, gold POS tag, automatic POS tag, dependency head, dependency label, an unused `_` column, predicate lemma, predicate frameset ID, word sense, speaker/author, named entities, then one column of SRL arguments per predicate, with coreference in the final column.
```
nw/wsj/24/wsj_2437 0 0 For IN IN 7 prep _ - - - - * (ARGM-TMP* -
nw/wsj/24/wsj_2437 0 1 all DT DT 1 pobj _ - - - - * * (0)
nw/wsj/24/wsj_2437 0 2 of IN IN 2 prep _ - - - - * * (0
nw/wsj/24/wsj_2437 0 3 1988 CD CD 3 pobj _ - - - - (DATE) *) (0)|0)
nw/wsj/24/wsj_2437 0 4 , , , 7 punct _ - - - - * * -
nw/wsj/24/wsj_2437 0 5 Dassault NNP NNP 7 dep _ - - - - (ORG) (ARG0*) (1)
nw/wsj/24/wsj_2437 0 6 had VBD VBD 0 root _ have 03 1 - * (V*) -
nw/wsj/24/wsj_2437 0 7 group NN NN 9 compound _ group - - - * (ARG1* (1)
nw/wsj/24/wsj_2437 0 8 profit NN NN 7 dobj _ profit - 1 - * * -
nw/wsj/24/wsj_2437 0 9 of IN IN 9 prep _ - - - - * * -
nw/wsj/24/wsj_2437 0 10 428 CD CD 12 compound _ - - - - (MONEY* * -
nw/wsj/24/wsj_2437 0 11 million CD CD 13 nummod _ - - - - * * -
nw/wsj/24/wsj_2437 0 12 francs NNS NNS 10 pobj _ - - - - *) * -
nw/wsj/24/wsj_2437 0 13 on IN IN 9 prep _ - - - - * * -
nw/wsj/24/wsj_2437 0 14 revenue NN NN 14 pobj _ revenue - 1 - * * -
nw/wsj/24/wsj_2437 0 15 of IN IN 15 prep _ - - - - * * -
nw/wsj/24/wsj_2437 0 16 18.819 CD CD 18 compound _ - - - - (MONEY* * -
nw/wsj/24/wsj_2437 0 17 billion CD CD 19 nummod _ - - - - * * -
nw/wsj/24/wsj_2437 0 18 francs NNS NNS 16 pobj _ - - - - *) *) -
nw/wsj/24/wsj_2437 0 19 . . .
7 punct _ - - - - * * - 81 | ``` 82 | -------------------------------------------------------------------------------- /bin/add_sent_ids.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | 4 | arg_parser = argparse.ArgumentParser(description='Add sent ids to CoNLL-2012 file') 5 | arg_parser.add_argument('--input_file', type=str, help='File to process') 6 | 7 | args = arg_parser.parse_args() 8 | 9 | with open(args.input_file, 'r') as in_f: 10 | current_doc = '' 11 | sent_num = 0 12 | for line_num, line in enumerate(in_f): 13 | line = line.strip() 14 | # blank line means end of sentence 15 | if not line: 16 | sent_num += 1 17 | print() 18 | else: 19 | split_line = line.split('\t') 20 | this_doc = split_line[0] 21 | if this_doc != current_doc: 22 | current_doc = this_doc 23 | sent_num = 0 24 | split_line[1] = str(sent_num) 25 | print('\t'.join(split_line)) 26 | 27 | -------------------------------------------------------------------------------- /bin/compute_transition_probs.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import argparse 3 | 4 | arg_parser = argparse.ArgumentParser(description='Compute transition probabilities between SRL tags') 5 | arg_parser.add_argument('--in_file_name', type=str, help='File to process') 6 | 7 | args = arg_parser.parse_args() 8 | 9 | with open(args.in_file_name, 'r') as in_f: 10 | # keep track of label_a / label_b counts here 11 | label_counts = defaultdict(dict) 12 | current_sentence = [] 13 | current_domain = None 14 | for line_num, line in enumerate(in_f): 15 | line = line.strip() 16 | # blank line means end of sentence 17 | if not line: 18 | sentence_frames = [] 19 | # grab only columns corresponding to frames 20 | for token_parts in current_sentence: 21 | parts = token_parts.split('\t') 22 | # first 13 columns are not frames, last column is coref for some reason 23 | if len(parts) > 14: 24 | frames = parts[14:-1] 25 | sentence_frames.append(frames) 26 | # seperate out into list per frame sequence 27 | frame_sequences = zip(*sentence_frames) 28 | for frame in frame_sequences: 29 | if frame: 30 | for i in range(len(frame)-1): 31 | # keep track of counts for each label pair transition 32 | label_a = frame[i] 33 | label_b = frame[i+1] 34 | label_a_count_map = label_counts[label_a] 35 | if label_b in label_a_count_map: 36 | label_a_count_map[label_b] += 1 37 | else: 38 | label_a_count_map[label_b] = 1 39 | current_sentence = [] 40 | else: 41 | current_sentence.append(line) 42 | 43 | for label_a, transitions in label_counts.iteritems(): 44 | total_count = float(sum([count for label, count in transitions.iteritems()])) 45 | for label_b, count in transitions.iteritems(): 46 | print('%s\t%s\t%g' % (label_a, label_b, (count/total_count))) 47 | -------------------------------------------------------------------------------- /bin/convert-bilou-single-field.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | 4 | arg_parser = argparse.ArgumentParser(description='Convert a field in CoNLL-2012 to BIO/BILOU format') 5 | arg_parser.add_argument('--input_file', type=str, help='File to process') 6 | arg_parser.add_argument('--field', type=int, help='Field in the file to process') 7 | arg_parser.add_argument('--bilou', dest='bilou', help='Whether to use BILOU encoding (default BIO)', default=False, 
action='store_true') 8 | arg_parser.add_argument('--bio', dest='bilou', help='Whether to use BIO encoding (default)', default=False, action='store_false') 9 | arg_parser.add_argument('--take_last', dest='take_last', action='store_true') 10 | arg_parser.add_argument('--no_take_last', dest='take_last', action='store_false') 11 | 12 | args = arg_parser.parse_args() 13 | 14 | join_str = '/' 15 | # (R-ARG1*)) 16 | # (ARG1(ARG0*)) --> U-ARG1/U-ARG0 (U-ARG0/L-ARG1) 17 | 18 | with open(args.input_file, 'r') as f: 19 | label_stack = [] 20 | for line_num, line in enumerate(f): 21 | # for line_num, line in enumerate(lines): 22 | new_labels = [] 23 | split_line = line.strip().split() 24 | last_field = len(split_line) - (0 if args.take_last else 1) 25 | if not split_line: 26 | assert not label_stack, "There remains an unclosed paren (line %d) labels: %s" % (line_num, ','.join(label_stack)) 27 | elif args.field < last_field: 28 | field = split_line[args.field] 29 | output_labels = map(lambda s: "I-" + s, label_stack) 30 | if field == "*" and not label_stack: 31 | output_labels.append("O") 32 | else: 33 | split_field = field.split("(") 34 | # gather new labels introduced 35 | for label in split_field: 36 | stripped = label.strip("()*") 37 | if stripped: 38 | new_labels.append(stripped) 39 | 40 | # do we close labels? 41 | close_parens = field.count(")") 42 | unit_labels = [] 43 | close_labels = [] 44 | if close_parens > 0: 45 | # if there are new labels, close those first 46 | while new_labels and close_parens > 0: 47 | if args.bilou: 48 | unit_label = "U-" + new_labels.pop() 49 | else: 50 | unit_label = "B-" + new_labels.pop() 51 | close_parens -= 1 52 | unit_labels = [unit_label] + unit_labels 53 | 54 | # if there are additional close parens, close labels from label stack 55 | if args.bilou: 56 | close_labels = ["L-" + label_stack.pop(-1) for i in range(close_parens)][::-1] 57 | else: 58 | close_labels = ["I-" + label_stack.pop(-1) for i in range(close_parens)][::-1] 59 | 60 | # add unclosed new labels to label stack, and begin them 61 | start_labels = [] 62 | while new_labels: 63 | new_label = new_labels.pop() 64 | start_labels = ["B-" + new_label] + start_labels 65 | label_stack.append(new_label) 66 | 67 | output_labels = output_labels[:len(output_labels) - close_parens] + start_labels + unit_labels + close_labels 68 | 69 | new_label = join_str.join(output_labels) 70 | split_line[args.field] = new_label 71 | print('\t'.join(split_line)) -------------------------------------------------------------------------------- /bin/convert-bilou.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Converts the fields defined below in the given file (arg1) to BILOU format 4 | # from the CoNLL-2012 segment format. 
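#
# Illustrative example (added annotation): for a three-token argument span, a
# props column that reads
#     (ARG1*   *   *)
# across the three tokens comes out as
#     B-ARG1   I-ARG1   L-ARG1
# in BILOU mode, and a single-token span like (ARG0*) comes out as U-ARG0.
#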
5 | # 6 | 7 | input_file=$1 8 | 9 | bilou_arg="--bilou" 10 | 11 | max_field=`awk '{print NF}' $input_file | sort -n | tail -1` 12 | first_field=14 13 | fields_to_convert=`seq $first_field $(( max_field - 1 ))` 14 | 15 | tmpfile=`mktemp` 16 | 17 | bilou_file="$input_file.bilou" 18 | cp $input_file $bilou_file 19 | 20 | for field in $fields_to_convert; do 21 | echo "Converting field $field of $(( max_field - 1 ))" 22 | python bin/convert-bilou-single-field.py --input_file $bilou_file --field $((field - 1)) $bilou_arg > $tmpfile 23 | cp $tmpfile $bilou_file 24 | done 25 | 26 | rm $tmpfile 27 | 28 | -------------------------------------------------------------------------------- /bin/convert-bio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Converts the fields defined below in the given file (arg1) to BILOU format 4 | # from the CoNLL-2012 segment format. 5 | # 6 | 7 | input_file=$1 8 | 9 | bilou_arg="--bio" 10 | 11 | max_field=`awk '{print NF}' $input_file | sort -n | tail -1` 12 | first_field=14 13 | fields_to_convert=`seq $first_field $(( max_field - 1 ))` 14 | 15 | tmpfile=`mktemp` 16 | 17 | bilou_file="$input_file.bio" 18 | cp $input_file $bilou_file 19 | 20 | for field in $fields_to_convert; do 21 | echo "Converting field $field of $(( max_field - 1 ))" 22 | echo "bin/convert-bilou-single-field.py --input_file $bilou_file --field $((field - 1)) $bilou_arg" 23 | python bin/convert-bilou-single-field.py --input_file $bilou_file --field $((field - 1)) $bilou_arg > $tmpfile 24 | cp $tmpfile $bilou_file 25 | done 26 | 27 | rm $tmpfile 28 | 29 | -------------------------------------------------------------------------------- /bin/download_clearnlp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Grab all the jars needed for running ClearNLP POS tagger, constituencies -> dependencies conversion 4 | # As of 12/19/2017 5 | # 6 | 7 | mkdir lib 8 | cd lib 9 | 10 | # ClearNLP 11 | curl -LO "http://search.maven.org/remotecontent?filepath=edu/emory/clir/clearnlp/3.1.2/clearnlp-3.1.2.jar" 12 | 13 | # ClearNLP dependencies 14 | curl -LO "http://search.maven.org/remotecontent?filepath=args4j/args4j/2.0.29/args4j-2.0.29.jar" 15 | curl -LO "http://search.maven.org/remotecontent?filepath=log4j/log4j/1.2.17/log4j-1.2.17.jar" 16 | curl -LO "http://search.maven.org/remotecontent?filepath=com/carrotsearch/hppc/0.6.1/hppc-0.6.1.jar" 17 | curl -LO "http://search.maven.org/remotecontent?filepath=org/tukaani/xz/1.5/xz-1.5.jar" 18 | 19 | # Dictionaries, needed for converting constituencies -> dependencies 20 | curl -LO "http://search.maven.org/remotecontent?filepath=edu/emory/clir/clearnlp-dictionary/3.2/clearnlp-dictionary-3.2.jar" 21 | 22 | # POS tagger model 23 | curl -LO "http://search.maven.org/remotecontent?filepath=edu/emory/clir/clearnlp-general-en-pos/3.2/clearnlp-general-en-pos-3.2.jar" 24 | 25 | # word clusters 26 | curl -LO "http://search.maven.org/remotecontent?filepath=edu/emory/clir/clearnlp-global-lexica/3.1/clearnlp-global-lexica-3.1.jar" 27 | 28 | cd .. 
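# Added note: after this script finishes, lib/ should contain
# clearnlp-3.1.2.jar, args4j-2.0.29.jar, log4j-1.2.17.jar, hppc-0.6.1.jar, xz-1.5.jar,
# clearnlp-dictionary-3.2.jar, clearnlp-general-en-pos-3.2.jar and
# clearnlp-global-lexica-3.1.jar -- the same jars that preprocess_conll2012.sh puts
# on its CLASSPATH. A quick sanity check: ls lib/*.jar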
29 | -------------------------------------------------------------------------------- /bin/extract_conll_parse_file.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import gzip 4 | 5 | arg_parser = argparse.ArgumentParser(description='Convert a CoNLL-2012 file to CoNLL-X format') 6 | arg_parser.add_argument('--input_file', type=str, help='File to process') 7 | arg_parser.add_argument('--word_field', type=int, help='Field containing words') 8 | arg_parser.add_argument('--pos_field', type=int, help='Field containing gold part-of-speech tags') 9 | arg_parser.add_argument('--head_field', type=int, help='Field containing gold parse head') 10 | arg_parser.add_argument('--label_field', type=int, help='Field containing gold parse label') 11 | arg_parser.add_argument('--id_field', type=int, help='Field containing word id') 12 | arg_parser.add_argument('--drop_single_tokens', dest='drop_single_tokens', action='store_true') 13 | arg_parser.add_argument('--no_drop_single_tokens', dest='drop_single_tokens', action='store_false') 14 | arg_parser.set_defaults(drop_single_tokens=False) 15 | 16 | arg_parser.add_argument('--domain', dest='domain') 17 | arg_parser.set_defaults(domain='-') 18 | 19 | args = arg_parser.parse_args() 20 | 21 | # ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes. 22 | # FORM: Word form or punctuation symbol. 23 | # LEMMA: Lemma or stem of word form. 24 | # UPOSTAG: Universal part-of-speech tag. 25 | # XPOSTAG: Language-specific part-of-speech tag; underscore if not available. 26 | # FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available. 27 | # HEAD: Head of the current word, which is either a value of ID or zero (0). 28 | # DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one. 29 | # DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs. 30 | # MISC: Any other annotation. 
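# Added illustration of one emitted row (tab-separated in the real output):
#   1   For   _   IN   _   _   7   prep
# Only the ID, FORM, POS, HEAD and DEPREL columns are populated here; the
# remaining columns are written as '_'.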
31 | with gzip.open(args.input_file, 'r') if args.input_file.endswith('gz') else open(args.input_file, 'r') as f: 32 | # print_newline = False 33 | word_idx = 1 34 | buf = [] 35 | for line in f: 36 | line = line.strip() 37 | if line: 38 | split_line = line.strip().split() 39 | domain = split_line[0].split('/')[0] 40 | if args.domain == '-' or domain == args.domain: 41 | # print_newline = True 42 | word = split_line[args.word_field] 43 | id = split_line[args.id_field] 44 | if id == '_': 45 | id = word_idx 46 | else: 47 | id = int(split_line[args.id_field]) + 1 48 | pos = split_line[args.pos_field] 49 | head = split_line[args.head_field] 50 | label = split_line[args.label_field] 51 | new_fields = [str(id), word, '_', pos, '_', '_', head, label] 52 | new_line = '\t'.join(new_fields) 53 | buf.append(new_line) 54 | # print(new_line) 55 | word_idx += 1 56 | else: 57 | word_idx = 1 58 | if buf: 59 | sent_len = len(buf) 60 | if not args.drop_single_tokens or sent_len > 1: 61 | for tok in buf: 62 | print(tok) 63 | print() 64 | buf = [] 65 | # if print_newline: 66 | # print_newline = False 67 | # print() 68 | if buf: 69 | for tok in buf: 70 | print(tok) 71 | print() 72 | -------------------------------------------------------------------------------- /bin/extract_conll_prop_file.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import gzip 4 | 5 | arg_parser = argparse.ArgumentParser(description='Convert a CoNLL-2012 file to CoNLL-2005 prop format') 6 | arg_parser.add_argument('--input_file', type=str, help='File to process') 7 | arg_parser.add_argument('--word_field', type=int, help='Field containing words') 8 | arg_parser.add_argument('--pred_field', type=int, help='Field containing predicates') 9 | arg_parser.add_argument('--pred_field_offset', type=int, help='Offset for predicates field', default=1) 10 | arg_parser.add_argument('--take_last', dest='take_last', action='store_true') 11 | arg_parser.add_argument('--no_take_last', dest='take_last', action='store_false') 12 | arg_parser.set_defaults(take_last=False) 13 | arg_parser.add_argument('--domain', dest='domain') 14 | arg_parser.set_defaults(domain='-') 15 | 16 | arg_parser.add_argument('--first_prop_field', type=int, help='First field containing props') 17 | 18 | args = arg_parser.parse_args() 19 | 20 | 21 | with gzip.open(args.input_file, 'r') if args.input_file.endswith('gz') else open(args.input_file, 'r') as f: 22 | print_newline = False 23 | for line in f: 24 | line = line.strip() 25 | if line: 26 | split_line = line.strip().split() 27 | domain = split_line[0].split('/')[0] 28 | if args.domain == '-' or domain == args.domain: 29 | print_newline = True 30 | props = split_line[args.first_prop_field:] if args.take_last else split_line[args.first_prop_field:-1] 31 | word = split_line[args.word_field] if split_line[args.pred_field + args.pred_field_offset] != '-' else '-' 32 | new_fields = [word] + props 33 | new_line = '\t'.join(new_fields) 34 | print(new_line) 35 | else: 36 | if print_newline: 37 | print_newline = False 38 | print() 39 | -------------------------------------------------------------------------------- /bin/filter-conll-2012.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import gzip 4 | import os 5 | from glob import glob 6 | 7 | arg_parser = argparse.ArgumentParser(description='Filter CoNLL-12 files by docid and 
concatenate') 8 | arg_parser.add_argument('--input_dir', type=str, help='Directory to process') 9 | arg_parser.add_argument('--input_file', type=str, help='File to process') 10 | arg_parser.add_argument('--docid_file', type=str, default='', help='List of doc ids to keep') 11 | args = arg_parser.parse_args() 12 | 13 | docid_map = set() 14 | 15 | if args.docid_file != '': 16 | with open(args.docid_file, 'r') as f: 17 | for line in f: 18 | line = line.strip() 19 | docid_map.add(line) 20 | 21 | if args.input_dir: 22 | fnames = [d for f in os.walk(args.input_dir) for d in glob(os.path.join(f[0], '*.combined'))] 23 | else: 24 | fnames = [args.input_file] 25 | 26 | for fname in fnames: 27 | with open(fname, 'r') as f: 28 | last_print_empty = True 29 | for line in f: 30 | line = line.strip() 31 | if line: 32 | split_line = line.strip().split() 33 | docid = split_line[0].split('/')[-1] 34 | if docid in docid_map or not docid_map: 35 | print(line) 36 | last_print_empty = False 37 | else: 38 | if not last_print_empty: 39 | print() 40 | last_print_empty = True 41 | -------------------------------------------------------------------------------- /bin/jackknife_ontonotes_pos.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import os 3 | import argparse 4 | 5 | arg_parser = argparse.ArgumentParser(description='Split CoNLL-2012 data into splits for jackknifing') 6 | arg_parser.add_argument('--input_file', type=str, help='File to process') 7 | arg_parser.add_argument('--output_dir', type=str, help='Directory to write output files') 8 | arg_parser.add_argument('--num_splits', type=int, help='Number of splits to make') 9 | arg_parser.set_defaults(num_splits=10) 10 | 11 | args = arg_parser.parse_args() 12 | 13 | if not os.path.exists(args.output_dir): 14 | os.makedirs(args.output_dir) 15 | 16 | print('Reading in all the data and seperating by domain') 17 | with open(args.input_file, 'r') as in_f: 18 | domain_sentence_map = defaultdict(list) 19 | current_sentence = [] 20 | current_domain = None 21 | for line_num, line in enumerate(in_f): 22 | line = line.strip() 23 | # blank line means end of sentence 24 | if not line: 25 | sentence_str = '\n'.join(current_sentence) 26 | domain_sentence_map[current_domain].append(sentence_str) 27 | current_sentence = [] 28 | else: 29 | current_sentence.append(line) 30 | current_domain = line.split('/', 1)[0] 31 | 32 | # write num_splits train and test files 33 | for split_num in range(args.num_splits): 34 | print('writing split: %d' % split_num) 35 | train_file = open('%s/train_%d' % (args.output_dir, split_num), 'w') 36 | test_file = open('%s/test_%d' % (args.output_dir, split_num), 'w') 37 | for domain, sentences in domain_sentence_map.iteritems(): 38 | for sent_num, sentence in enumerate(sentences): 39 | if sent_num % 10 == split_num: 40 | test_file.write('%s\n\n' % sentence) 41 | else: 42 | train_file.write('%s\n\n' % sentence) 43 | train_file.close() 44 | test_file.close() 45 | print('Done') 46 | -------------------------------------------------------------------------------- /bin/parse-conll09.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Requirements: 4 | # - Download the Stanford parser: https://nlp.stanford.edu/software/lex-parser.shtml 5 | # - Make sure you set the STANFORD_PARSER environment variable, e.g: 6 | # export STANFORD_PARSER="$HOME/canvas/stanford-parser-full-2017-06-09" 7 | # - This script expects that you have 
already created $output_dir, and will fail otherwise 8 | 9 | 10 | STANFORD_CP="$STANFORD_PARSER/*:" 11 | 12 | dependencies_option="CCPropagatedDependencies" # "basic" 13 | 14 | input_file=$1 15 | output_dir=$2 16 | input_file_nopath=${input_file##*/} 17 | 18 | # Convert to one-sentence-per-line format and parse 19 | awk '{print $2}' $input_file | sed 's/)/-RRB-/' | sed 's/(/-LRB-/' | awk '{if($1 == ""){print ""} else {printf "%s ", $0}} END {print ""}' | \ 20 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.parser.lexparser.LexicalizedParser \ 21 | -sentences newline \ 22 | -outputFormat penn \ 23 | -tokenized \ 24 | -originalDependencies \ 25 | edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz \ 26 | - > $output_dir/$input_file_nopath.trees 27 | 28 | # Convert parses to dependencies 29 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 30 | -$dependencies_option \ 31 | -conllx \ 32 | -treeFile $output_dir/$input_file_nopath.trees \ 33 | > $output_dir/$input_file_nopath.deps 34 | 35 | # Finally, paste the original file together with the dependency parses and auto pos tags 36 | f_parsed="$output_dir/$input_file_nopath.deps" 37 | f_combined="$output_dir/$input_file_nopath.parsed" 38 | paste <(awk 'BEGIN{s=0} {if (NF != 0) {print $1"\t"$2"\t"$3"\t"$4"\t"$5} else {print ""; s++}}' $input_file) \ 39 | <(awk '{if(NF == 0){print ""} else {print $5"\t"$6"\t"$7"\t"$8}}' $f_parsed) \ 40 | <(awk '{if(NF == 0){print ""}}' $input_file | tr -s ' ' | cut -d' ' -f8- | sed 's/ /\t/g') \ 41 | > $f_combined 42 | -------------------------------------------------------------------------------- /bin/parse-conll12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Requirements: 4 | # - Download the Stanford parser: https://nlp.stanford.edu/software/lex-parser.shtml 5 | # - Make sure you set the STANFORD_PARSER environment variable, e.g: 6 | # export STANFORD_PARSER="$HOME/canvas/stanford-parser-full-2017-06-09" 7 | 8 | STANFORD_CP="$STANFORD_PARSER/*:" 9 | 10 | dependencies_option="CCPropagatedDependencies" # "basic" 11 | 12 | input_file=$1 13 | output_dir=$2 14 | input_file_nopath=${input_file##*/} 15 | 16 | # Convert to one-sentence-per-line format and parse 17 | awk '{print $4}' $input_file | awk '{if($1 == ""){print ""} else {printf "%s ", $0}} END {print ""}' | \ 18 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.parser.lexparser.LexicalizedParser \ 19 | -sentences newline \ 20 | -outputFormat penn \ 21 | -tokenized \ 22 | -originalDependencies \ 23 | edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz \ 24 | - > $output_dir/$input_file_nopath.trees 25 | 26 | # Convert parses to dependencies 27 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 28 | -$dependencies_option \ 29 | -conllx \ 30 | -treeFile $output_dir/$input_file_nopath.trees \ 31 | > $output_dir/$input_file_nopath.deps 32 | 33 | # Finally, paste the original file together with the dependency parses and auto pos tags 34 | f_parsed="$output_dir/$input_file_nopath.deps" 35 | f_combined="$output_dir/$input_file_nopath.parsed" 36 | paste <(awk 'BEGIN{s=0} {if (substr($1,1,1) !~ /#/ && NF != 0) {print $1"\t"s"\t"$3"\t"$4"\t"$5} else {print ""; s++}}' $input_file) \ 37 | <(awk '{if(NF == 0){print ""} else {print $5"\t"$7"\t"$8"\t_"}}' $f_parsed) \ 38 | <(awk '{if (substr($1,1,1) !~ /#/ ) {print $0}}' $input_file | cut -d$'\t' -f7- | sed 's/ /\t/g') \ 39 | > $f_combined 40 | 
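# Example invocation (added note; paths are placeholders):
#   mkdir -p /path/to/parsed
#   ./bin/parse-conll12.sh /path/to/some_file_conll /path/to/parsed
# The input is a CoNLL-2012 *_conll file (the words are read from column 4), and
# STANFORD_PARSER must be set as described at the top of this script.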
-------------------------------------------------------------------------------- /bin/preprocess_conll2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # You'll want to change this if you're not running from the project's root directory 4 | CLEARNLP=`pwd` 5 | CLEARLIB=$CLEARNLP/lib 6 | CLASSPATH=$CLEARLIB/clearnlp-3.1.2.jar:$CLEARLIB/args4j-2.0.29.jar:$CLEARLIB/log4j-1.2.17.jar:$CLEARLIB/hppc-0.6.1.jar:$CLEARLIB/xz-1.5.jar:$CLEARLIB/clearnlp-dictionary-3.2.jar:$CLEARLIB/clearnlp-general-en-pos-3.2.jar:$CLEARLIB/clearnlp-global-lexica-3.1.jar:. 7 | 8 | input_dir=$1 9 | headrules=$CLEARNLP/headrule_en_stanford.txt 10 | pos_config=$CLEARNLP/config_decode_pos.xml 11 | 12 | # First, convert the constituencies from the ontonotes files to the format expected 13 | # by the converter 14 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 15 | echo "Extracting trees from: $f" 16 | # word pos parse -> stick words, pos into parse as terminals 17 | awk '{if (substr($1,1,1) !~ /#/ ) print $5" "$4"\t"$6}' $f | \ 18 | sed 's/\(.*\)\t\(.*\)\*\(.*\)/\2(\1)\3/' | \ 19 | awk '{if(NF && substr($1,1,1) !~ /\(/){print "(TOP(INTJ(UH XX)))"} else {print}}' > "$f.parse" 20 | done 21 | 22 | # Now convert those parses to dependencies 23 | # Output will have the extension .dep 24 | for f in `find $input_dir/* -type d -not -path '*/\.*'`; do 25 | echo "Converting to dependencies: $f" 26 | java -cp $CLASSPATH edu.emory.clir.clearnlp.bin.C2DConvert \ 27 | -h $headrules \ 28 | -i $f \ 29 | -pe parse 30 | done 31 | 32 | # Now assign auto part-of-speech tags 33 | # Output will have extension .cnlp 34 | for f in `find $input_dir/* -type d -not -path '*/\.*'`; do 35 | echo "POS tagging: $f" 36 | java -cp $CLASSPATH edu.emory.clir.clearnlp.bin.NLPDecode \ 37 | -mode pos \ 38 | -c config_decode_pos.xml \ 39 | -i $f \ 40 | -ie dep 41 | done 42 | 43 | # Finally, paste the original file together with the dependency parses and auto pos tags 44 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 45 | f_converted="$f.parse.dep" 46 | f_pos="$f.parse.dep.cnlp" 47 | f_combined="$f_converted.combined" 48 | paste <(awk '{if (substr($1,1,1) !~ /#/ ) {print $1"\t"$2"\t"$3"\t"$4"\t"$5}}' $f) \ 49 | <(awk '{print $2}' $f_pos) \ 50 | <(awk '{print $6"\t"$7"\t"$9}' $f_converted) \ 51 | <(awk '{if (substr($1,1,1) !~ /#/ ) {print $0}}' $f | tr -s ' ' | cut -d' ' -f7- | sed 's/ /\t/g') \ 52 | > $f_combined 53 | done 54 | -------------------------------------------------------------------------------- /bin/preprocess_conll2012_sdeps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STANFORD_CP="$STANFORD_PARSER/*:$STANFORD_POS/*:" 4 | postagger_model="$STANFORD_POS/models/english-left3words-distsim.tagger" 5 | 6 | dependencies_option="basic" 7 | 8 | input_dir=$1 9 | output_dir=$2 10 | 11 | if [[ "$input_dir" =~ "dev" ]]; then 12 | data_split="dev" 13 | elif [[ "$input_dir" =~ "test" ]]; then 14 | data_split="test" 15 | elif [[ "$input_dir" =~ "train" ]]; then 16 | data_split="train" 17 | else 18 | echo "Unable to match data split (train|dev|test) in path." 
19 | exit 20 | fi 21 | 22 | output_dir=$output_dir/$data_split 23 | 24 | # First, convert the constituencies from the ontonotes files to the format expected 25 | # by the converter 26 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 27 | f_path=`sed 's|'${input_dir}'||' <<< $f` 28 | f_prefix=${f_path%/*} 29 | mkdir -p $output_dir/$f_prefix 30 | 31 | echo "Extracting trees from: $f_path" 32 | # word pos parse -> stick words, pos into parse as terminals 33 | awk '{if (substr($1,1,1) !~ /#/ ) print $5" "$4"\t"$6}' $f | \ 34 | sed 's/\/\([.?-]\)/\1/' | \ 35 | sed 's/\(.*\)\t\(.*\)\*\(.*\)/\2(\1)\3/' > "$output_dir/$f_path.parse" 36 | # awk '{if(NF && substr($1,1,1) !~ /\(/){print "(TOP(INTJ(UH XX)))"} else {print}}' > "$f.parse" 37 | done 38 | 39 | # Now convert those parses to dependencies 40 | # Output will have the extension .sdeps 41 | for f in `find $input_dir/* -type f -not -path '*/\.*' -name "*_conll"`; do 42 | f_path=`sed 's|'${input_dir}'||' <<< $f` 43 | echo "Converting to dependencies: $f_path" 44 | f=$output_dir/$f_path 45 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 46 | -treeFile "$f.parse" -$dependencies_option -conllx -keepPunct -makeCopulaHead > "$f.parse.sdeps" 47 | done 48 | 49 | # Now assign auto part-of-speech tags 50 | for f in `find $input_dir/* -type f -not -path '*/\.*' -name "*_conll"`; do 51 | f_path=`sed 's|'${input_dir}'||' <<< $f` 52 | echo "POS tagging: $f_path" 53 | f=$output_dir/$f_path 54 | awk '{if(NF){printf "%s ", $2} else{ print "" }}' "$f.parse.sdeps" > "$f.parse.sdeps.posonly" 55 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.tagger.maxent.MaxentTagger \ 56 | -model $postagger_model \ 57 | -textFile "$f.parse.sdeps.posonly" \ 58 | -tokenize false \ 59 | -outputFormat tsv \ 60 | -sentenceDelimiter newline \ 61 | > "$f.parse.sdeps.pos" 62 | done 63 | 64 | # Finally, paste the original file together with the dependency parses and auto pos tags 65 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 66 | f_path=`sed 's|'${input_dir}'||' <<< $f` 67 | f_converted="$output_dir/$f_path.parse.sdeps" 68 | f_pos="$output_dir/$f_path.parse.sdeps.pos" 69 | f_combined="$output_dir/$f_path.combined" 70 | paste <(awk 'BEGIN{s=0} {if (substr($1,1,1) !~ /#/ && NF != 0) {print $1"\t"s"\t"$3}else {print ""; s++}}' $f) \ 71 | <(awk '{print $2}' $f_converted) \ 72 | <(awk '{if (substr($1,1,1) !~ /#/ ) {print $5}}' $f) \ 73 | <(awk '{print $2}' $f_pos) \ 74 | <(awk '{if(NF == 0){print ""} else {print $7"\t"$8"\t_"}}' $f_converted) \ 75 | <(awk '{if (substr($1,1,1) !~ /#/ ) {print $0}}' $f | tr -s ' ' | cut -d' ' -f7- | sed 's/ /\t/g') \ 76 | > $f_combined 77 | done 78 | -------------------------------------------------------------------------------- /bin/preprocess_conll2012_sdeps_old.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STANFORD_CP="$STANFORD_PARSER/*:$STANFORD_POS/*:" 4 | postagger_model="$STANFORD_POS/models/english-left3words-distsim.tagger" 5 | 6 | input_dir=$1 7 | 8 | # First, convert the constituencies from the ontonotes files to the format expected 9 | # by the converter 10 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 11 | echo "Extracting trees from: $f" 12 | # word pos parse -> stick words, pos into parse as terminals 13 | awk '{if (substr($1,1,1) !~ /#/ ) print $5" "$4"\t"$6}' $f | \ 14 | sed 's/\/\([.?-]\)/\1/' | \ 15 | sed 's/\(.*\)\t\(.*\)\*\(.*\)/\2(\1)\3/' > "$f.parse" 
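  # Illustrative trace of the pipeline above (added comment): a token row with
  # word "Japan", POS "NNP" and parse bit "(TOP(NP*))" is emitted by awk as
  # "NNP Japan<TAB>(TOP(NP*))", and the final sed substitutes the word/POS pair
  # into the "*" slot, giving "(TOP(NP(NNP Japan)))".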
16 | # awk '{if(NF && substr($1,1,1) !~ /\(/){print "(TOP(INTJ(UH XX)))"} else {print}}' > "$f.parse" 17 | done 18 | 19 | # Now convert those parses to dependencies 20 | # Output will have the extension .dep 21 | for f in `find $input_dir/* -type f -not -path '*/\.*' -name "*_conll"`; do 22 | echo "Converting to dependencies: $f" 23 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 24 | -treeFile "$f.parse" -basic -conllx -keepPunct -makeCopulaHead > "$f.parse.sdeps" 25 | done 26 | 27 | # Now assign auto part-of-speech tags 28 | # Output will have extension .cnlp 29 | for f in `find $input_dir/* -type f -not -path '*/\.*' -name "*_conll"`; do 30 | echo "POS tagging: $f" 31 | awk '{if(NF){printf "%s ", $2} else{ print "" }}' "$f.parse.sdeps" > "$f.parse.sdeps.posonly" 32 | 33 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.tagger.maxent.MaxentTagger \ 34 | -model $postagger_model \ 35 | -textFile "$f.parse.sdeps.posonly" \ 36 | -tokenize false \ 37 | -outputFormat tsv \ 38 | -sentenceDelimiter newline \ 39 | > "$f.parse.sdeps.pos" 40 | done 41 | 42 | # Finally, paste the original file together with the dependency parses and auto pos tags 43 | for f in `find $input_dir -type f -not -path '*/\.*' -name "*_conll"`; do 44 | f_converted="$f.parse.sdeps" 45 | f_pos="$f.parse.sdeps.pos" 46 | f_combined="$f_converted.combined" 47 | paste <(awk '{if (substr($1,1,1) !~ /#/ ) {print $1"\t"$2"\t"$3"\t"$4"\t"$5}}' $f) \ 48 | <(awk '{print $2}' $f_pos) \ 49 | <(awk '{if(NF == 0){print ""} else {print $7"\t"$8"\t_"}}' $f_converted) \ 50 | <(awk '{if (substr($1,1,1) !~ /#/ ) {print $0}}' $f | tr -s ' ' | cut -d' ' -f7- | sed 's/ /\t/g') \ 51 | > $f_combined 52 | done 53 | -------------------------------------------------------------------------------- /config_decode_pos.xml: -------------------------------------------------------------------------------- 1 | 2 | english 3 | 4 | 5 | 6 | 7 | 8 | 9 | brown-rcv1.clean.tokenized-CoNLL03.txt-c1000-freq1.txt.xz 10 | 11 | 12 | 13 | general-en-pos.xz 14 | 15 | 16 | -------------------------------------------------------------------------------- /conll05/convert-bio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Converts the fields defined below in the given file (arg1) to BILOU format 4 | # from the CoNLL-2012 segment format. 
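# Added note: this variant passes --bio below, so the output uses BIO tags rather
# than BILOU; e.g. a span that reads (ARG1* * *) across three tokens comes out as
# B-ARG1 I-ARG1 I-ARG1.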
5 | # 6 | 7 | input_file=$1 8 | 9 | bilou_arg="--bio" 10 | 11 | max_field=`awk '{print NF}' $input_file | sort -n | tail -1` 12 | first_field=14 13 | fields_to_convert=`seq $first_field $(( max_field ))` 14 | 15 | tmpfile=`mktemp` 16 | 17 | bilou_file="$input_file.bio" 18 | cp $input_file $bilou_file 19 | 20 | for field in $fields_to_convert; do 21 | echo "Converting field $field of $(( max_field ))" 22 | echo "bin/convert-bilou-single-field.py --input_file $bilou_file --field $((field - 1)) --take_last $bilou_arg" 23 | python bin/convert-bilou-single-field.py --input_file $bilou_file --field $((field - 1)) --take_last $bilou_arg > $tmpfile 24 | cp $tmpfile $bilou_file 25 | done 26 | 27 | rm $tmpfile 28 | 29 | -------------------------------------------------------------------------------- /conll05/extract_dev_from_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Extract words and gold syntactic parses from PTB for the CoNLL-2005 dev set 3 | 4 | # dev sections 5 | SECTIONS="24" 6 | 7 | mkdir -p $CONLL05/devel/words 8 | mkdir -p $CONLL05/devel/synt 9 | 10 | for section in $SECTIONS; do 11 | cat $WSJ/$section/* | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 1 | awk '{print $1}' | gzip > $CONLL05/devel/words/devel.${section}.words.gz 12 | cat $WSJ/$section/* | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 0 -p 1 | gzip > $CONLL05/devel/synt/devel.${section}.synt.wsj.gz 13 | done 14 | -------------------------------------------------------------------------------- /conll05/extract_test_from_brown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Extract words and gold syntactic parses from PTB for the CoNLL-2005 dev set 3 | 4 | # dev sections 5 | SECTIONS="01 02 03" 6 | 7 | mkdir -p $CONLL05/test.brown/synt 8 | 9 | rm $CONLL05/test.brown/synt/test.brown.synt.gz 10 | 11 | for section in $SECTIONS; do 12 | cat $BROWN/CK/CK$section.MRG | awk '{if($1 !~ "*x*") print}' | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 0 -p 1 | gzip >> $CONLL05/test.brown/synt/test.brown.synt.gz 13 | done 14 | -------------------------------------------------------------------------------- /conll05/extract_test_from_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Extract words and gold syntactic parses from PTB for the CoNLL-2005 dev set 3 | 4 | # dev sections 5 | SECTIONS="23" 6 | 7 | mkdir -p $CONLL05/test.wsj/synt 8 | 9 | for section in $SECTIONS; do 10 | cat $WSJ/$section/* | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 0 -p 1 | gzip > $CONLL05/test.wsj/synt/test.wsj.${section}.synt.gz 11 | done 12 | -------------------------------------------------------------------------------- /conll05/extract_train_from_ptb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Extract words and gold syntactic parses from PTB for the CoNLL-2005 train set 3 | 4 | # train sections 5 | SECTIONS="02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21" 6 | 7 | mkdir -p $CONLL05/train/words 8 | mkdir -p $CONLL05/train/synt 9 | 10 | for section in $SECTIONS; do 11 | cat $WSJ/$section/* | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 1 | awk '{print $1}' | gzip > $CONLL05/train/words/train.${section}.words.gz 12 | cat $WSJ/$section/* | $SRLCONLL/bin/wsj-removetraces.pl | $SRLCONLL/bin/wsj-to-se.pl -w 
0 -p 1 | gzip > $CONLL05/train/synt/train.${section}.synt.wsj.gz 13 | done 14 | -------------------------------------------------------------------------------- /conll05/make-brown-test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/tcsh 2 | 3 | # section for development 4 | set SECTIONS = "brown" 5 | 6 | # name of the output file 7 | set FILE = "test.brown" 8 | 9 | foreach s ( $SECTIONS ) 10 | 11 | echo Processing section $s 12 | 13 | zcat test.$s/words/test.$s.words.gz > /tmp/$$.words 14 | zcat test.$s/props/test.$s.props.gz > /tmp/$$.props 15 | 16 | ## Choose syntax 17 | # zcat devel/synt.col2/devel.$s.synt.col2.gz > /tmp/$$.synt 18 | # zcat devel/synt.col2h/devel.$s.synt.col2h.gz > /tmp/$$.synt 19 | # zcat devel/synt.upc/devel.$s.synt.upc.gz > /tmp/$$.synt 20 | # zcat devel/synt.cha/devel.$s.synt.cha.gz > /tmp/$$.synt 21 | 22 | # no gold parse, set to auto 23 | # zcat test.$s/synt.cha/test.$s.synt.cha.gz > /tmp/$$.synt 24 | zcat test.$s/synt/test.$s.synt.gz > /tmp/$$.synt 25 | 26 | 27 | # no senses, set to null 28 | zcat test.$s/null/test.$s.null.gz > /tmp/$$.senses 29 | zcat test.$s/ne/test.$s.ne.gz > /tmp/$$.ne 30 | 31 | paste -d ' ' /tmp/$$.words /tmp/$$.synt /tmp/$$.ne /tmp/$$.senses /tmp/$$.props | gzip> /tmp/$$.section.$s.gz 32 | end 33 | 34 | echo Generating gzipped file $FILE.gz 35 | zcat /tmp/$$.section* | gzip -c > $FILE.gz 36 | 37 | echo Cleaning files 38 | rm -f /tmp/$$* 39 | 40 | -------------------------------------------------------------------------------- /conll05/make-devset.sh: -------------------------------------------------------------------------------- 1 | #! /bin/tcsh 2 | 3 | # section for development 4 | set SECTIONS = "24" 5 | 6 | # name of the output file 7 | set FILE = "dev-set" 8 | 9 | foreach s ( $SECTIONS ) 10 | 11 | echo Processing section $s 12 | 13 | zcat devel/words/devel.$s.words.gz > /tmp/$$.words 14 | zcat devel/props/devel.$s.props.gz > /tmp/$$.props 15 | 16 | ## Choose syntax 17 | # zcat devel/synt.col2/devel.$s.synt.col2.gz > /tmp/$$.synt 18 | # zcat devel/synt.col2h/devel.$s.synt.col2h.gz > /tmp/$$.synt 19 | # zcat devel/synt.upc/devel.$s.synt.upc.gz > /tmp/$$.synt 20 | # zcat devel/synt.cha/devel.$s.synt.cha.gz > /tmp/$$.synt 21 | 22 | # Use gold syntax 23 | zcat devel/synt/devel.$s.synt.wsj.gz > /tmp/$$.synt 24 | 25 | zcat devel/senses/devel.$s.senses.gz > /tmp/$$.senses 26 | zcat devel/ne/devel.$s.ne.gz > /tmp/$$.ne 27 | 28 | paste -d ' ' /tmp/$$.words /tmp/$$.synt /tmp/$$.ne /tmp/$$.senses /tmp/$$.props | gzip> /tmp/$$.section.$s.gz 29 | end 30 | 31 | echo Generating gzipped file $FILE.gz 32 | zcat /tmp/$$.section* | gzip -c > $FILE.gz 33 | 34 | echo Cleaning files 35 | rm -f /tmp/$$* 36 | 37 | -------------------------------------------------------------------------------- /conll05/make-trainset.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/tcsh 2 | 3 | # sections that are considered to generate training data; section numbers should be sorted 4 | set SECTIONS = "02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21" 5 | 6 | 7 | # if you feel that 4 sections is enough training data, use the following 8 | # set SECTIONS = "15 16 17 18" 9 | 10 | # name of the output file 11 | set FILE = "train-set" 12 | 13 | foreach s ( $SECTIONS ) 14 | 15 | echo Processing section $s 16 | 17 | zcat train/words/train.$s.words.gz > /tmp/$$.words 18 | zcat train/props/train.$s.props.gz > /tmp/$$.props 19 | 20 | ## Choose syntax 21 | # zcat train/synt.col2/train.$s.synt.col2.gz > /tmp/$$.synt 22 | # zcat train/synt.col2h/train.$s.synt.col2h.gz > /tmp/$$.synt 23 | # zcat train/synt.upc/train.$s.synt.upc.gz > /tmp/$$.synt 24 | # zcat train/synt.cha/train.$s.synt.cha.gz > /tmp/$$.synt 25 | 26 | # use gold syntax 27 | zcat train/synt/train.$s.synt.wsj.gz > /tmp/$$.synt 28 | 29 | zcat train/senses/train.$s.senses.gz > /tmp/$$.senses 30 | zcat train/ne/train.$s.ne.gz > /tmp/$$.ne 31 | 32 | paste -d ' ' /tmp/$$.words /tmp/$$.synt /tmp/$$.ne /tmp/$$.senses /tmp/$$.props | gzip > /tmp/$$.section.$s.gz 33 | end 34 | 35 | echo Generating gzipped file $FILE.gz 36 | zcat /tmp/$$.section* | gzip -c > $FILE.gz 37 | 38 | echo Cleaning files 39 | rm -f /tmp/$$* 40 | 41 | -------------------------------------------------------------------------------- /conll05/make-wsj-test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/tcsh 2 | 3 | # section for development 4 | set SECTIONS = "wsj" 5 | 6 | # name of the output file 7 | set FILE = "test.wsj" 8 | 9 | foreach s ( $SECTIONS ) 10 | 11 | echo Processing section $s 12 | 13 | zcat test.$s/words/test.$s.words.gz > /tmp/$$.words 14 | zcat test.$s/props/test.$s.props.gz > /tmp/$$.props 15 | 16 | ## Choose syntax 17 | # zcat devel/synt.col2/devel.$s.synt.col2.gz > /tmp/$$.synt 18 | # zcat devel/synt.col2h/devel.$s.synt.col2h.gz > /tmp/$$.synt 19 | # zcat devel/synt.upc/devel.$s.synt.upc.gz > /tmp/$$.synt 20 | # zcat devel/synt.cha/devel.$s.synt.cha.gz > /tmp/$$.synt 21 | 22 | # no gold parse, set to auto 23 | zcat test.$s/synt/test.$s.23.synt.gz > /tmp/$$.synt 24 | 25 | # no senses, set to null 26 | zcat test.$s/null/test.$s.null.gz > /tmp/$$.senses 27 | zcat test.$s/ne/test.$s.ne.gz > /tmp/$$.ne 28 | 29 | paste -d ' ' /tmp/$$.words /tmp/$$.synt /tmp/$$.ne /tmp/$$.senses /tmp/$$.props | gzip> /tmp/$$.section.$s.gz 30 | end 31 | 32 | echo Generating gzipped file $FILE.gz 33 | zcat /tmp/$$.section* | gzip -c > $FILE.gz 34 | 35 | echo Cleaning files 36 | rm -f /tmp/$$* 37 | 38 | -------------------------------------------------------------------------------- /conll05/preprocess_conll05.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # You'll want to change this if you're not running from the project's root directory 4 | CLEARNLP=`pwd` 5 | CLEARLIB=$CLEARNLP/lib 6 | CLASSPATH=$CLEARLIB/clearnlp-3.1.2.jar:$CLEARLIB/args4j-2.0.29.jar:$CLEARLIB/log4j-1.2.17.jar:$CLEARLIB/hppc-0.6.1.jar:$CLEARLIB/xz-1.5.jar:$CLEARLIB/clearnlp-dictionary-3.2.jar:$CLEARLIB/clearnlp-general-en-pos-3.2.jar:$CLEARLIB/clearnlp-global-lexica-3.1.jar:. 
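# Added note: $input_file is assumed to be one of the gzipped CoNLL-2005 files
# (e.g. train-set.gz, dev-set.gz or test.wsj.gz from the make-*.sh scripts in this
# directory), since it is read with zcat below.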
7 | 8 | input_file=$1 9 | headrules=$CLEARNLP/headrule_en_stanford.txt 10 | pos_config=$CLEARNLP/config_decode_pos.xml 11 | 12 | # First, convert the constituencies from the ontonotes files to the format expected 13 | # by the converter 14 | echo "Extracting trees from: $input_file" 15 | # word pos parse -> stick words, pos into parse as terminals 16 | zcat $input_file | \ 17 | awk 'gsub(/\(/, "-LRB-", $2); gsub(/\)/, "-RRB-", $2); print $2" "$1"\t"$3}' | \ 18 | sed 's/\(.*\)\t\(.*\)\*\(.*\)/\2(\1)\3/' > "$input_file.parse" 19 | 20 | # Now convert those parses to dependencies 21 | # Output will have the extension .dep 22 | echo "Converting to dependencies: $input_file.parse" 23 | java -cp $CLASSPATH edu.emory.clir.clearnlp.bin.C2DConvert \ 24 | -h $headrules \ 25 | -i "$input_file.parse" \ 26 | -pe parse 27 | 28 | # Now assign auto part-of-speech tags 29 | # Output will have extension .cnlp 30 | echo "POS tagging: $input_file.parse.dep" 31 | java -cp $CLASSPATH edu.emory.clir.clearnlp.bin.NLPDecode \ 32 | -mode pos \ 33 | -c config_decode_pos.xml \ 34 | -i "$input_file.parse.dep" \ 35 | -ie dep 36 | 37 | # Finally, paste the original file together with the dependency parses and auto pos tags 38 | f_converted="$input_file.parse.dep" 39 | f_pos="$input_file.parse.dep.cnlp" 40 | f_combined="$f_converted.combined" 41 | paste <(zcat $input_file | awk '{if(NF == 0){print ""} else {print "_\t_\t_\t"$1"\t"$2}}' ) \ 42 | <(awk '{print $2}' $f_pos) \ 43 | <(awk '{print $6"\t"$7"\t"$9}' $f_converted) \ 44 | <(zcat $input_file | awk '{if(NF == 0){print ""} else {print $5"\t"$6"\t-\t-\t"$4}}' ) \ 45 | <(zcat $input_file | awk '{if(NF == 0){print ""} else {print $0"\t_"}}' | tr -s ' ' | cut -d' ' -f7- | sed 's/ /\t/g') \ 46 | > $f_combined 47 | -------------------------------------------------------------------------------- /conll05/preprocess_conll05_sdeps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STANFORD_CP="$STANFORD_PARSER/*:$STANFORD_POS/*:" 4 | postagger_model="$STANFORD_POS/models/english-left3words-distsim.tagger" 5 | 6 | input_file=$1 7 | 8 | # First, convert the constituencies from the files to the format expected by the converter 9 | echo "Extracting trees from: $input_file" 10 | # word pos parse -> stick words, pos into parse as terminals 11 | # gsub(/#/, "$", $1); gsub(/#/, "$", $2); 12 | zcat $input_file | \ 13 | awk '{gsub(/\(/, "-LRB-", $1); gsub(/\)/, "-RRB-", $1); gsub(/\(/, "-LRB-", $2); gsub(/\)/, "-RRB-", $2); print $2" "$1"\t"$3}' | \ 14 | sed 's/\(.*\)\t\(.*\)\*\(.*\)/\2(\1)\3/' > "$input_file.parse" 15 | 16 | # Now convert those parses to dependencies 17 | # Output will have the extension .dep 18 | echo "Converting to dependencies: $input_file.parse" 19 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 20 | -treeFile "$input_file.parse" -basic -conllx -keepPunct -makeCopulaHead > "$input_file.parse.sdeps" 21 | 22 | # Now assign auto part-of-speech tags 23 | # Output will have extension .tagged 24 | echo "POS tagging: $input_file.parse.sdeps" 25 | 26 | # need to convert to text format Stanford likes 27 | awk '{if(NF){printf "%s ", $2} else{ print "" }}' "$input_file.parse.sdeps" > "$input_file.parse.sdeps.posonly" 28 | 29 | java -Xmx8g -cp $STANFORD_CP edu.stanford.nlp.tagger.maxent.MaxentTagger \ 30 | -model $postagger_model \ 31 | -textFile "$input_file.parse.sdeps.posonly" \ 32 | -tokenize false \ 33 | -outputFormat tsv \ 34 | -sentenceDelimiter newline \ 35 | > 
"$input_file.parse.sdeps.pos" 36 | 37 | echo "Combining: $input_file.parse.sdeps, $input_file.parse.sdeps.pos" 38 | 39 | # Finally, paste the original file together with the dependency parses and auto pos tags 40 | f_converted="$input_file.parse.sdeps" 41 | f_pos="$input_file.parse.sdeps.pos" 42 | f_combined="$f_converted.combined" 43 | 44 | # docid is conll05 45 | paste <(zcat $input_file | awk 'BEGIN{s=0;c=0}{if(NF == 0){print ""; c=0; s++} else {print "conll05\t"s"\t"c++"\t"$1}}' ) \ 46 | <(awk '{print $5}' $f_converted) \ 47 | <(awk '{print $2}' $f_pos) \ 48 | <(awk '{if(NF == 0){print ""} else {print $7"\t"$8"\t_"}}' $f_converted) \ 49 | <(zcat $input_file | awk '{if(NF == 0){print ""} else {print $5"\t"$6"\t-\t-\t"$4}}' ) \ 50 | <(zcat $input_file | awk '{if(NF == 0){print ""} else {print $0}}' | tr -s ' ' | cut -d' ' -f7- | sed 's/ /\t/g') \ 51 | > $f_combined 52 | -------------------------------------------------------------------------------- /conll05/set_paths.env: -------------------------------------------------------------------------------- 1 | export WSJ=/iesl/data/ptb/v1/combined/wsj 2 | export BROWN=/iesl/canvas/strubell/data/BROWN 3 | 4 | export CONLL05=/iesl/canvas/strubell/data/conll05st-release 5 | export SRLCONLL=/iesl/canvas/strubell/data/conll05st-release/srlconll-1.1 6 | export PERL5LIB=$SRLCONLL/lib:$PERL5LIB -------------------------------------------------------------------------------- /headrule_en_stanford.txt: -------------------------------------------------------------------------------- 1 | ADJP r JJ.*|VB.*|NN.*;ADJP;IN;RB|ADVP;CD|QP;FW|NP;.* 2 | ADVP r VB.*;RP;RB.*|JJ.*;ADJP;ADVP;QP;IN;NN;CD;NP;.* 3 | CAPTION l NNP.*;NN.*;NP;CD;.* 4 | CIT l NNP.*;NN.*;NP;CD;.* 5 | CONJP l CC;VB.*;NN.*;TO|IN;.* 6 | EDITED r VP;VB.*;NN.*|PRP|NP;IN|PP;S.*;.* 7 | EMBED r S.*;FRAG|NP;.* 8 | FRAG r VP;VB.*;-PRD;S|SQ|SINV|SBARQ;NN.*|NP;PP;SBAR;JJ.*|ADJP;RB|ADVP;INTJ;.* 9 | INTJ l VB.*;NN.*;UH;INTJ;.* 10 | LST l LS|CD;NN;.* 11 | META l NP;VP|S;.* 12 | NAC r NN.*;NP;S|SINV;.* 13 | NML r NN.*|NML;CD|NP|QP|JJ.*|VB.*;.* 14 | NP r NN.*|NML;NX;PRP;FW;CD;NP;-NOM;QP|JJ.*|VB.*;ADJP;S;SBAR;.* 15 | NX r NN.*;NX;NP;.* 16 | PP l RP;TO;IN;VB.*;PP;NN.*;JJ;RB;.* 17 | PRN r VP;NP;S|SBARQ|SINV|SQ;SBAR;.* 18 | PRT l RP;PRT;.* 19 | QP r CD;NN.*;JJ;DT|PDT;RB;NP|QP;.* 20 | RRC l VP;VB.*;-PRD;NP|NN.*;ADJP;PP;.* 21 | S r VP;VB.*;-PRD;S|SQ|SINV|SBARQ;SBAR;NP;PP;.* 22 | SBAR r VP;S|SQ|SINV;SBAR.*;FRAG|NP;.* 23 | SBARQ r VP;SQ|SBARQ;S|SINV;FRAG|NP;.* 24 | SINV r VP;VB.*;MD;S|SINV;NP;.* 25 | SQ r VP;VB.*;SQ;S;MD;NP;.* 26 | UCP r .* 27 | VP l VP;VB.*;MD|TO;JJ.*|NN.*|IN;-PRD;NP;ADJP|QP;S;.* 28 | WHADJP r JJ.*|VBN;WHADJP|ADJP;.* 29 | WHADVP r RB.*|WRB;WHADVP;.* 30 | WHNP r NN.*;WP|WHNP;NP|NML|CD;JJ.*|VBG;WHADJP|ADJP;DT;.* 31 | WHPP l IN|TO;.* 32 | X r .* -------------------------------------------------------------------------------- /ptb/convert_spos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## convert spos.conllu data to 15 column tsv format 4 | 5 | if [ "$#" -ne 1 ]; then 6 | echo "Must supply input directory containing wsj\*sdep.spos.conllu files." 
7 | exit 1 8 | fi 9 | 10 | in_dir=$1 11 | out_dir="${in_dir}/bio_format" 12 | mkdir -p $out_dir 13 | 14 | for in_f in wsj02-21-trn.sdep.spos.conllu wsj22-dev.sdep.spos.conllu wsj23-tst.sdep.spos.conllu; 15 | do 16 | awk '{if(NF){ printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \ 17 | "-", "-", ($1-1), $2, $4, $5, $7, $8, "-", "-", "-", "-", "-", "*", "-"} \ 18 | else {print} \ 19 | }' $in_dir/$in_f > ${out_dir}/${in_f}_BIO 20 | done 21 | -------------------------------------------------------------------------------- /ptb/ptb2stanford.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Convert the Penn TreeBank to Stanford dependencies 4 | # https://nlp.stanford.edu/software/stanford-dependencies.html 5 | # 6 | 7 | path_to_stanford_parser=/iesl/canvas/strubell/stanford-parser-full-2017-06-09 8 | PTB=/iesl/data/ptb/v1/combined/wsj 9 | 10 | declare -a train=(02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21) 11 | declare -a dev=(22) 12 | declare -a test=(23) 13 | declare -a altdev=(24) 14 | 15 | output_dir="wsj-parse-3.5.0" 16 | mkdir -p $output_dir 17 | 18 | train_output="$output_dir/wsj02-21-trn.sdep" 19 | dev_output="$output_dir/wsj22-dev.sdep" 20 | test_output="$output_dir/wsj23-tst.sdep" 21 | altdev_output="$output_dir/wsj24-altdev.sdep" 22 | 23 | # make sure output files are empty 24 | cat /dev/null > $train_output 25 | cat /dev/null > $test_output 26 | cat /dev/null > $dev_output 27 | cat /dev/null > $altdev_output 28 | 29 | #for sec in ${train[@]} 30 | # do 31 | # dir=$PTB/$sec 32 | # for f in $dir/* 33 | # do 34 | # echo "Writing $f to $train_output..." 35 | # java -mx150m -cp "$path_to_stanford_parser/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 36 | # -treeFile $f -basic -conllx -keepPunct -makeCopulaHead >> $train_output 37 | # done 38 | #done 39 | 40 | for sec in ${test[@]} 41 | do 42 | dir=$PTB/$sec 43 | for f in $dir/* 44 | do 45 | echo "Writing $f to $test_output ..." 46 | java -mx150m -cp "$path_to_stanford_parser/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 47 | -treeFile $f -basic -conllx -keepPunct -makeCopulaHead >> $test_output 48 | done 49 | done 50 | 51 | for sec in ${dev[@]} 52 | do 53 | dir=$PTB/$sec 54 | for f in $dir/* 55 | do 56 | echo "Writing $f to $dev_output ..." 57 | java -mx150m -cp "$path_to_stanford_parser/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 58 | -treeFile $f -basic -conllx -keepPunct -makeCopulaHead >> $dev_output 59 | done 60 | done 61 | 62 | for sec in ${altdev[@]} 63 | do 64 | dir=$PTB/$sec 65 | for f in $dir/* 66 | do 67 | echo "Writing $f to $altdev_output ..." 
68 | java -mx150m -cp "$path_to_stanford_parser/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \ 69 | -treeFile $f -basic -conllx -keepPunct -makeCopulaHead >> $altdev_output 70 | done 71 | done 72 | -------------------------------------------------------------------------------- /testdata.conll: -------------------------------------------------------------------------------- 1 | nw/wsj/24/wsj_2400 0 0 A DT DT 2 det _ - - - - * * (R-ARG0(ARG0* (R-ARG0(ARG0* (ARG1* (ARG0* - 2 | nw/wsj/24/wsj_2400 0 1 rebound NN NN 17 dep _ rebound 01 1 - * (V*) *) *) * * - 3 | nw/wsj/24/wsj_2400 0 2 in IN IN 2 prep _ - - - - * (ARG1* * * * * - 4 | nw/wsj/24/wsj_2400 0 3 energy NN NN 5 compound _ energy - 1 - * * * * * * - 5 | nw/wsj/24/wsj_2400 0 4 prices NNS NNS 3 pobj _ price - 1 - * *) *) *) * * - 6 | nw/wsj/24/wsj_2400 0 5 , , , 2 punct _ - - - - * * * * * * - 7 | nw/wsj/24/wsj_2400 0 6 which WDT WDT 9 dep 2:ref - - - - * (ARGM-ADJ* (R-ARG0*) (R-ARG0*) * * - 8 | nw/wsj/24/wsj_2400 0 7 helped VBD VBD 9 aux _ help 01 1 - * * (V*) * * * - 9 | nw/wsj/24/wsj_2400 0 8 push VB VB 2 relcl _ push 01 6.1 - * * (ARG1* (V*) * * - 10 | nw/wsj/24/wsj_2400 0 9 up RP RP 9 prt _ - - - - * * * (ARG2*) * * - 11 | nw/wsj/24/wsj_2400 0 10 the DT DT 14 det _ - - - - * * * (ARG1* * * (4 12 | nw/wsj/24/wsj_2400 0 11 producer NN NN 13 compound _ - - - - * * * * * * - 13 | nw/wsj/24/wsj_2400 0 12 price NN NN 14 compound _ price - - - * * * * * * - 14 | nw/wsj/24/wsj_2400 0 13 index NN NN 9 dobj _ index - - - * *) *) *) * * 4) 15 | nw/wsj/24/wsj_2400 0 14 , , , 2 punct _ - - - - * * * * *) *) - 16 | nw/wsj/24/wsj_2400 0 15 is VBZ VBZ 17 auxpass _ be - - - * * * * * * - 17 | nw/wsj/24/wsj_2400 0 16 expected VBN VBN 0 root _ expect 01 1 - * * * * (V*) * - 18 | nw/wsj/24/wsj_2400 0 17 to TO TO 19 aux _ - - - - * * * * (C-ARG1* * - 19 | nw/wsj/24/wsj_2400 0 18 do VB VB 17 xcomp _ do 02 1 - * * * * * (V*) - 20 | nw/wsj/24/wsj_2400 0 19 the DT DT 21 det _ - - - - * * * * * (ARG1* - 21 | nw/wsj/24/wsj_2400 0 20 same JJ JJ 19 dobj _ - - - - * * * * * *) - 22 | nw/wsj/24/wsj_2400 0 21 in IN IN 19 prep _ - - - - * * * * * (ARGM-LOC* - 23 | nw/wsj/24/wsj_2400 0 22 the DT DT 26 det _ - - - - * * * * * * - 24 | nw/wsj/24/wsj_2400 0 23 consumer NN NN 25 compound _ - - - - * * * * * * - 25 | nw/wsj/24/wsj_2400 0 24 price NN NN 26 compound _ price - - - * * * * * * - 26 | nw/wsj/24/wsj_2400 0 25 report NN NN 22 pobj _ report - - - * * * * *) *) - 27 | nw/wsj/24/wsj_2400 0 26 . . . 
17 punct _ - - - - * * * * * * - 28 | 29 | nw/wsj/01/wsj_0102 0 0 A DT 5 det _ - - - - * (ARG1* (R-ARG1* (ARG2* * - 30 | nw/wsj/01/wsj_0102 0 1 medium JJ 4 amod _ - - - - * * * * * - 31 | nw/wsj/01/wsj_0102 0 2 - HYPH 4 punct _ - - - - * * * * * - 32 | nw/wsj/01/wsj_0102 0 3 sized JJ 5 amod _ - - - - * * * * * - 33 | nw/wsj/01/wsj_0102 0 4 one CD 14 dep _ - - - - * * * * * - 34 | nw/wsj/01/wsj_0102 0 5 in IN 5 prep _ - - - - * * * * * - 35 | nw/wsj/01/wsj_0102 0 6 Brooklyn NNP 6 pobj _ - - - - (GPE) *) *) *) * - 36 | nw/wsj/01/wsj_0102 0 7 , , 10 punct _ - - - - * * (ARGM-DIS* * * - 37 | nw/wsj/01/wsj_0102 0 8 it PRP 10 dep _ - - - - * (ARG0*) * * * (19) 38 | nw/wsj/01/wsj_0102 0 9 says VBZ 14 parataxis _ say 01 1 - * (V*) * * * - 39 | nw/wsj/01/wsj_0102 0 10 , , 10 punct _ - - - - * * *) * * - 40 | nw/wsj/01/wsj_0102 0 11 could MD 14 aux _ - - - - * (C-ARG1* (ARGM-MOD(ARG1*) * * - 41 | nw/wsj/01/wsj_0102 0 12 be VB 14 auxpass _ be - - - * * * * * - 42 | nw/wsj/01/wsj_0102 0 13 altered VBN 0 root _ alter 01 1 - * * (V*) * * - 43 | nw/wsj/01/wsj_0102 0 14 to TO 16 aux _ - - - - * * (ARG2* * * - 44 | nw/wsj/01/wsj_0102 0 15 house VB 14 xcomp _ house 01 1 - * * * (V*) * - 45 | nw/wsj/01/wsj_0102 0 16 up IN 19 quantmod _ - - - - (CARDINAL* * * (ARG1* * - 46 | nw/wsj/01/wsj_0102 0 17 to TO 19 quantmod _ - - - - * * * * * - 47 | nw/wsj/01/wsj_0102 0 18 1,000 CD 20 nummod _ - - - - *) * * * * - 48 | nw/wsj/01/wsj_0102 0 19 inmates NNS 16 dobj _ - - - - * * * *) * - 49 | nw/wsj/01/wsj_0102 0 20 at IN 16 prep _ - - - - * * * (ARGM-MNR* * - 50 | nw/wsj/01/wsj_0102 0 21 a DT 24 det _ - - - - * * * * * - 51 | nw/wsj/01/wsj_0102 0 22 lower JJR 24 amod _ - - - - * * * * * - 52 | nw/wsj/01/wsj_0102 0 23 cost NN 21 pobj _ cost - 1 - * * * * * - 53 | nw/wsj/01/wsj_0102 0 24 than IN 24 prep _ - - - - * * * * * - 54 | nw/wsj/01/wsj_0102 0 25 building VBG 25 pcomp _ build 01 1 - * * * * (V*) - 55 | nw/wsj/01/wsj_0102 0 26 a DT 29 det _ - - - - * * * * (ARG1* - 56 | nw/wsj/01/wsj_0102 0 27 new JJ 29 amod _ - - - - * * * * * - 57 | nw/wsj/01/wsj_0102 0 28 prison NN 26 dobj _ - - - - * * * * *) - 58 | nw/wsj/01/wsj_0102 0 29 in IN 26 prep _ - - - - * * * * (ARGM-LOC* - 59 | nw/wsj/01/wsj_0102 0 30 upstate JJ 33 amod _ - - - - * * * * * - 60 | nw/wsj/01/wsj_0102 0 31 New NNP 33 compound _ - - - - (GPE* * * * * - 61 | nw/wsj/01/wsj_0102 0 32 York NNP 30 pobj _ - - - - *) *) *)) *) *) - 62 | nw/wsj/01/wsj_0102 0 33 . . 
14 punct _ - - - - * * * * * - 63 | 64 | bc/phoenix/00/phoenix_0000 1 0 I PRP PRP 4 dep _ - - - Ye_daying * * (ARG1(ARG0*)) (ARG0*) * * (108) 65 | bc/phoenix/00/phoenix_0000 1 1 did VBD VBD 4 aux _ do 01 - Ye_daying * (V*) * * * * - 66 | bc/phoenix/00/phoenix_0000 1 2 n't RB RB 4 neg _ - - - Ye_daying * * (ARGM-NEG*) * * * - 67 | bc/phoenix/00/phoenix_0000 1 3 dare VB VB 0 root _ dare 01 1 Ye_daying * * (V*) * * * - 68 | bc/phoenix/00/phoenix_0000 1 4 defend VB VB 4 xcomp _ defend 01 1 Ye_daying * * (ARG2* (V*) * * - 69 | bc/phoenix/00/phoenix_0000 1 5 myself PRP PRP 5 dobj _ - - - Ye_daying * * *) (ARG1*) * * (108) 70 | bc/phoenix/00/phoenix_0000 1 6 , , , 4 punct _ - - - Ye_daying * * * * * * - 71 | bc/phoenix/00/phoenix_0000 1 7 so CC CC 4 cc _ - - - Ye_daying * * * * * * - 72 | bc/phoenix/00/phoenix_0000 1 8 I PRP PRP 11 dep _ - - - Ye_daying * * * * * (ARG1*) (108) 73 | bc/phoenix/00/phoenix_0000 1 9 just RB RB 11 advmod _ - - - Ye_daying * * * * * * - 74 | bc/phoenix/00/phoenix_0000 1 10 had VBD VBD 4 conj _ have 02 12 Ye_daying * * * * (V*) * - 75 | bc/phoenix/00/phoenix_0000 1 11 to TO TO 13 aux _ - - - Ye_daying * * * * * * - 76 | bc/phoenix/00/phoenix_0000 1 12 endure VB VB 11 xcomp _ endure 01 2 Ye_daying * * * * * (V*) - 77 | bc/phoenix/00/phoenix_0000 1 13 it PRP PRP 13 dobj _ - - - Ye_daying * * * * * (ARG2*) (8) 78 | bc/phoenix/00/phoenix_0000 1 14 for IN IN 13 prep _ - - - Ye_daying * * * * * (ARGM-TMP* - 79 | bc/phoenix/00/phoenix_0000 1 15 quite RB PDT 18 advmod _ - - - Ye_daying * * * * * * - 80 | bc/phoenix/00/phoenix_0000 1 16 a DT DT 18 det _ - - - Ye_daying * * * * * * - 81 | bc/phoenix/00/phoenix_0000 1 17 while NN NN 15 pobj _ - - - Ye_daying * * * * * *) - 82 | bc/phoenix/00/phoenix_0000 1 18 . . . 11 punct _ - - - Ye_daying * * * * * * - 83 | 84 | mz/sinorama/10/ectb_1030 1 0 A DT DT 2 det _ - - - - * * (ARG1* (ARG0* (ARG0(ARG0* * - 85 | mz/sinorama/10/ectb_1030 1 1 crowd NN NN 13 dep _ crowd - - - * * * * * * - 86 | mz/sinorama/10/ectb_1030 1 2 of IN IN 2 prep _ - - - - * * * * * * - 87 | mz/sinorama/10/ectb_1030 1 3 500 CD CD 6 quantmod _ - - - - (CARDINAL) (ARG0* * * * * - 88 | mz/sinorama/10/ectb_1030 1 4 to IN TO 6 quantmod _ - - - - * * * * * * - 89 | mz/sinorama/10/ectb_1030 1 5 600 CD CD 7 nummod _ - - - - (CARDINAL) * * * * * - 90 | mz/sinorama/10/ectb_1030 1 6 Taiwanese JJ NNPS 3 pobj _ - - - - (NORP) *) * * * * - 91 | mz/sinorama/10/ectb_1030 1 7 living VBG VBG 7 acl _ live 01 2 - * (V*) * * * * - 92 | mz/sinorama/10/ectb_1030 1 8 in IN IN 8 prep _ - - - - * (ARGM-LOC* * * * * - 93 | mz/sinorama/10/ectb_1030 1 9 the DT DT 12 det _ - - - - (GPE* * * * * * (22 94 | mz/sinorama/10/ectb_1030 1 10 United NNP NNP 12 compound _ - - - - * * * * * * - 95 | mz/sinorama/10/ectb_1030 1 11 States NNP NNP 9 pobj _ - - - - *) *) *) *) *)) * 22) 96 | mz/sinorama/10/ectb_1030 1 12 showed VBD VBD 0 root _ show 02 2 - * * (V*) * * * - 97 | mz/sinorama/10/ectb_1030 1 13 up RP RP 13 prt _ - - - - * * * * * * - 98 | mz/sinorama/10/ectb_1030 1 14 to TO TO 16 aux _ - - - - * * (ARGM-PRP* * * * - 99 | mz/sinorama/10/ectb_1030 1 15 greet VB VB 13 xcomp _ greet 01 1 - * * * (V*) * * - 100 | mz/sinorama/10/ectb_1030 1 16 the DT DT 18 det _ - - - - * * * (ARG1* * * (39 101 | mz/sinorama/10/ectb_1030 1 17 president NN NN 16 dobj _ president - 2 - * * * *) * * 39) 102 | mz/sinorama/10/ectb_1030 1 18 at IN IN 16 prep _ - - - - * * * (ARGM-LOC* * * - 103 | mz/sinorama/10/ectb_1030 1 19 his PRP$ PRP$ 21 poss _ - - - - * * * * * * (39) 104 | mz/sinorama/10/ectb_1030 1 20 hotel NN NN 19 
pobj _ - - - - * * * *) * * - 105 | mz/sinorama/10/ectb_1030 1 21 and CC CC 16 cc _ - - - - * * * * * * - 106 | mz/sinorama/10/ectb_1030 1 22 protest VB NN 16 conj _ protest 01 1 - * * * * (V*) * - 107 | mz/sinorama/10/ectb_1030 1 23 against IN IN 23 prep _ - - - - * * * * (ARG1* * - 108 | mz/sinorama/10/ectb_1030 1 24 the DT DT 27 det _ - - - - (ORG* * * * * (ARG1* (22 109 | mz/sinorama/10/ectb_1030 1 25 US NNP NNP 27 compound _ - - - - * * * * * * - 110 | mz/sinorama/10/ectb_1030 1 26 government NN NN 30 dep _ government - 1 - * * * * * * - 111 | mz/sinorama/10/ectb_1030 1 27 's POS POS 27 case _ - - - - *) * * * * *) 22) 112 | mz/sinorama/10/ectb_1030 1 28 " `` `` 30 punct _ - - - - * * * * * * - 113 | mz/sinorama/10/ectb_1030 1 29 caving VBG VBG 24 pcomp _ cave 02 4 - * * * * * (V*) - 114 | mz/sinorama/10/ectb_1030 1 30 in RP RP 30 prt _ - - - - * * * * * * - 115 | mz/sinorama/10/ectb_1030 1 31 to IN IN 30 prep _ - - - - * * * * * (ARGM-GOL* - 116 | mz/sinorama/10/ectb_1030 1 32 Beijing NNP NNP 32 pobj _ - - - - (GPE) * * * * *) (12) 117 | mz/sinorama/10/ectb_1030 1 33 at IN IN 30 prep _ - - - - * * * * * (ARGM-MNR* - 118 | mz/sinorama/10/ectb_1030 1 34 the DT DT 36 det _ - - - - * * * * * * - 119 | mz/sinorama/10/ectb_1030 1 35 expense NN NN 34 pobj _ expense - 2 - * * * * * * - 120 | mz/sinorama/10/ectb_1030 1 36 of IN IN 36 prep _ - - - - * * * * * * - 121 | mz/sinorama/10/ectb_1030 1 37 Taiwan NNP NNP 37 pobj _ - - - - (GPE) * *) * *) *) (0) 122 | mz/sinorama/10/ectb_1030 1 38 . . . 13 punct _ - - - - * * * * * * - 123 | mz/sinorama/10/ectb_1030 1 39 " '' '' 13 punct _ - - - - * * * * * * - 124 | 125 | nw/xinhua/02/chtb_0240 0 14 to IN IN 10 prep _ - - - - * * * (ARGM-GOL* * * * * * * - 126 | nw/xinhua/02/chtb_0240 0 15 the DT DT 18 det _ - - - - (ORG* * * * * * * * * * (2 127 | nw/xinhua/02/chtb_0240 0 16 Straits NNPS NNPS 18 compound _ - - - - * * * * * * * * * * - 128 | nw/xinhua/02/chtb_0240 0 17 Foundation NNP NNP 15 pobj _ - - - - *) * * *) * * * * * * 2) 129 | nw/xinhua/02/chtb_0240 0 18 which WDT WDT 21 dep _ - - - - * * * (ARGM-ADV(R-ARG1*) (R-ARG0*) (R-ARG0*) * * * * - 130 | nw/xinhua/02/chtb_0240 0 19 seriously RB RB 21 advmod _ - - - - * * * * (ARGM-EXT*) (ARGM-MNR*) * * * * - 131 | nw/xinhua/02/chtb_0240 0 20 hurts VBZ VBZ 10 ccomp _ hurt 01 3 - * * * * (V*) * * * * * - 132 | nw/xinhua/02/chtb_0240 0 21 feelings NNS NNS 21 dobj _ feeling - 7 - * * * * (ARG1* * * * * * - 133 | nw/xinhua/02/chtb_0240 0 22 of IN IN 22 prep _ - - - - * * * * * * * * * * - 134 | nw/xinhua/02/chtb_0240 0 23 compatriots NNS NNS 23 pobj _ - - - - * * * * * * * * * * - 135 | nw/xinhua/02/chtb_0240 0 24 of IN IN 24 prep _ - - - - * * * * * * * * * * - 136 | nw/xinhua/02/chtb_0240 0 25 both DT DT 27 det _ - - - - * * * * * * * * * * (7 137 | nw/xinhua/02/chtb_0240 0 26 sides NNS NNS 25 pobj _ side - 3 - * * * * *) * * * * * 7) 138 | nw/xinhua/02/chtb_0240 0 27 and CC CC 21 cc _ - - - - * * * * * * * * * * - 139 | nw/xinhua/02/chtb_0240 0 28 breaks VBZ VBZ 21 conj _ break 01 3 - * * * * * (V*) * * * * - 140 | nw/xinhua/02/chtb_0240 0 29 the DT DT 32 det _ - - - - * * * * * (ARG1* * * * * - 141 | nw/xinhua/02/chtb_0240 0 30 harmonious JJ JJ 32 amod _ - - - - * * * * * * * * * * - 142 | nw/xinhua/02/chtb_0240 0 31 atmosphere NN NN 29 dobj _ - - - - * * * * * * * * * * - 143 | nw/xinhua/02/chtb_0240 0 32 on IN IN 32 prep _ - - - - * * * * * * * * * * - 144 | nw/xinhua/02/chtb_0240 0 33 both DT DT 35 det _ - - - - * * * * * * * * * * (7 145 | nw/xinhua/02/chtb_0240 0 34 sides NNS NNS 33 pobj _ side - 
3 - * * * *) * *) * * * * 7) 146 | nw/xinhua/02/chtb_0240 0 35 , , , 10 punct _ - - - - * * * * * * * * * * - 147 | nw/xinhua/02/chtb_0240 0 36 so IN IN 40 mark _ - - - - * * * (ARGM-PRP* * * * * * * - 148 | nw/xinhua/02/chtb_0240 0 37 as IN IN 40 mark _ - - - - * * * * * * * * * * - 149 | nw/xinhua/02/chtb_0240 0 38 to TO TO 40 aux _ - - - - * * * * * * * * * * - 150 | nw/xinhua/02/chtb_0240 0 39 attract VB VB 10 advcl _ attract 01 1 - * * * * * * (V*) * * * - 151 | nw/xinhua/02/chtb_0240 0 40 the DT DT 42 det _ - - - - * * * * * * (ARG1* * * * - 152 | nw/xinhua/02/chtb_0240 0 41 attention NN NN 40 dobj _ attention - 2 - * * * * * * * * * * - 153 | nw/xinhua/02/chtb_0240 0 42 of IN IN 42 prep _ - - - - * * * * * * * * * * - 154 | nw/xinhua/02/chtb_0240 0 43 relevant JJ JJ 46 amod _ - - - - * * * * * * * * * * - 155 | nw/xinhua/02/chtb_0240 0 44 Taiwanese JJ JJ 46 amod _ - - - - (NORP) * * * * * * * * * - 156 | nw/xinhua/02/chtb_0240 0 45 areas NNS NNS 43 pobj _ area - 1 - * * * * * * * * * * - 157 | nw/xinhua/02/chtb_0240 0 46 on IN IN 42 prep _ - - - - * * * * * * * * * * - 158 | nw/xinhua/02/chtb_0240 0 47 compensating VBG VBG 47 pcomp _ compensate 01 1 - * * * * * * * (V*) * * - 159 | nw/xinhua/02/chtb_0240 0 48 for IN IN 48 prep _ - - - - * * * * * * * (ARG1* * * - 160 | nw/xinhua/02/chtb_0240 0 49 losses NNS NNS 49 pobj _ loss - 3 - * * * * * * * *) * * - 161 | nw/xinhua/02/chtb_0240 0 50 , , , 48 punct _ - - - - * * * * * * * * * * - 162 | nw/xinhua/02/chtb_0240 0 51 finding VBG VBG 48 conj _ find 01 1 - * * * * * * * * (V*) * - 163 | nw/xinhua/02/chtb_0240 0 52 those DT DT 52 dobj _ - - - - * * * * * * * * (ARG1* * - 164 | nw/xinhua/02/chtb_0240 0 53 responsible JJ JJ 53 amod _ - - - - * * * * * * * * *) * - 165 | nw/xinhua/02/chtb_0240 0 54 , , , 52 punct _ - - - - * * * * * * * * * * - 166 | nw/xinhua/02/chtb_0240 0 55 and CC CC 52 cc _ - - - - * * * * * * * * * * - 167 | nw/xinhua/02/chtb_0240 0 56 severely RB RB 58 advmod _ - - - - * * * * * * * * * (ARGM-MNR*) - 168 | nw/xinhua/02/chtb_0240 0 57 punishing VBG VBG 52 conj _ punish 01 1 - * * * * * * * * * (V*) - 169 | nw/xinhua/02/chtb_0240 0 58 the DT DT 60 det _ - - - - * * * * * * * * * (ARG1* - 170 | nw/xinhua/02/chtb_0240 0 59 assailants NNS NNS 58 dobj _ - - - - * * * *) * * *) * * *) - 171 | nw/xinhua/02/chtb_0240 0 60 . . . 10 punct _ - - - - * * * * * * * * * * - 172 | 173 | nw/wsj/24/wsj_2412 0 0 But CC CC 3 cc _ - - - - * * - 174 | nw/wsj/24/wsj_2412 0 1 what WP WP 3 dep _ - - - - * * - 175 | nw/wsj/24/wsj_2412 0 2 about IN IN 0 root _ - - - - * * - 176 | nw/wsj/24/wsj_2412 0 3 those DT DT 3 pobj _ - - - - * * (33 177 | nw/wsj/24/wsj_2412 0 4 of IN IN 4 prep _ - - - - * * - 178 | nw/wsj/24/wsj_2412 0 5 us PRP PRP 5 pobj _ - - - - * * - 179 | nw/wsj/24/wsj_2412 0 6 whose WP$ WP$ 8 poss 4:ref - - - - * (ARG1* - 180 | nw/wsj/24/wsj_2412 0 7 views NNS NNS 11 dep _ view - 4 - * (R-ARG1*)) - 181 | nw/wsj/24/wsj_2412 0 8 are VBP VBP 11 auxpass _ are - - - * * - 182 | nw/wsj/24/wsj_2412 0 9 not RB RB 11 neg _ - - - - * (ARGM-NEG*) - 183 | nw/wsj/24/wsj_2412 0 10 predetermined VBN VBN 4 relcl _ predetermine 01 - - * (V*) - 184 | nw/wsj/24/wsj_2412 0 11 by IN IN 11 prep _ - - - - * (ARG0* - 185 | nw/wsj/24/wsj_2412 0 12 formula NN NN 12 pobj _ formula - 2 - * * - 186 | nw/wsj/24/wsj_2412 0 13 or CC CC 13 cc _ - - - - * * - 187 | nw/wsj/24/wsj_2412 0 14 ideology NN NN 13 conj _ - - - - * *) 33) 188 | nw/wsj/24/wsj_2412 0 15 ? . . 
3 punct _ - - - - * * - 189 | 190 | nw/wsj/01/wsj_0102 0 0 A DT DT 5 det _ - - - - * (ARG1* (R-ARG1* (ARG2* * - 191 | nw/wsj/01/wsj_0102 0 1 medium JJ JJ 4 amod _ - - - - * * * * * - 192 | nw/wsj/01/wsj_0102 0 2 - HYPH HYPH 4 punct _ - - - - * * * * * - 193 | nw/wsj/01/wsj_0102 0 3 sized JJ JJ 5 amod _ - - - - * * * * * - 194 | nw/wsj/01/wsj_0102 0 4 one CD CD 14 dep _ - - - - * * * * * - 195 | nw/wsj/01/wsj_0102 0 5 in IN IN 5 prep _ - - - - * * * * * - 196 | nw/wsj/01/wsj_0102 0 6 Brooklyn NNP NNP 6 pobj _ - - - - (GPE) *) *) *) * - 197 | nw/wsj/01/wsj_0102 0 7 , , , 10 punct _ - - - - * * (ARGM-DIS* * * - 198 | nw/wsj/01/wsj_0102 0 8 it PRP PRP 10 dep _ - - - - * (ARG0*) * * * (19) 199 | nw/wsj/01/wsj_0102 0 9 says VBZ VBZ 14 parataxis _ say 01 1 - * (V*) * * * - 200 | nw/wsj/01/wsj_0102 0 10 , , , 10 punct _ - - - - * * *) * * - 201 | nw/wsj/01/wsj_0102 0 11 could MD MD 14 aux _ - - - - * (C-ARG1* (ARGM-MOD(ARG1*) * * - 202 | nw/wsj/01/wsj_0102 0 12 be VB VB 14 auxpass _ be - - - * * * * * - 203 | nw/wsj/01/wsj_0102 0 13 altered VBN VBN 0 root _ alter 01 1 - * * (V*) * * - 204 | nw/wsj/01/wsj_0102 0 14 to TO TO 16 aux _ - - - - * * (ARG2* * * - 205 | nw/wsj/01/wsj_0102 0 15 house VB VB 14 xcomp _ house 01 1 - * * * (V*) * - 206 | nw/wsj/01/wsj_0102 0 16 up IN IN 19 quantmod _ - - - - (CARDINAL* * * (ARG1* * - 207 | nw/wsj/01/wsj_0102 0 17 to TO TO 19 quantmod _ - - - - * * * * * - 208 | nw/wsj/01/wsj_0102 0 18 1,000 CD CD 20 nummod _ - - - - *) * * * * - 209 | nw/wsj/01/wsj_0102 0 19 inmates NNS NNS 16 dobj _ - - - - * * * *) * - 210 | nw/wsj/01/wsj_0102 0 20 at IN IN 16 prep _ - - - - * * * (ARGM-MNR* * - 211 | nw/wsj/01/wsj_0102 0 21 a DT DT 24 det _ - - - - * * * * * - 212 | nw/wsj/01/wsj_0102 0 22 lower JJR JJR 24 amod _ - - - - * * * * * - 213 | nw/wsj/01/wsj_0102 0 23 cost NN NN 21 pobj _ cost - 1 - * * * * * - 214 | nw/wsj/01/wsj_0102 0 24 than IN IN 24 prep _ - - - - * * * * * - 215 | nw/wsj/01/wsj_0102 0 25 building VBG VBG 25 pcomp _ build 01 1 - * * * * (V*) - 216 | nw/wsj/01/wsj_0102 0 26 a DT DT 29 det _ - - - - * * * * (ARG1* - 217 | nw/wsj/01/wsj_0102 0 27 new JJ JJ 29 amod _ - - - - * * * * * - 218 | nw/wsj/01/wsj_0102 0 28 prison NN NN 26 dobj _ - - - - * * * * *) - 219 | nw/wsj/01/wsj_0102 0 29 in IN IN 26 prep _ - - - - * * * * (ARGM-LOC* - 220 | nw/wsj/01/wsj_0102 0 30 upstate JJ JJ 33 amod _ - - - - * * * * * - 221 | nw/wsj/01/wsj_0102 0 31 New NNP NNP 33 compound _ - - - - (GPE* * * * * - 222 | nw/wsj/01/wsj_0102 0 32 York NNP NNP 30 pobj _ - - - - *) *) *)) *) *) - 223 | nw/wsj/01/wsj_0102 0 33 . . . 14 punct _ - - - - * * * * * - 224 | 225 | pt/nt/46/nt_4610 1 0 Eat VB VB 0 root _ eat 01 - 1_Corinthians * (V*) * - 226 | pt/nt/46/nt_4610 1 1 any DT DT 3 det _ - - - 1_Corinthians * (ARG1* (ARG1* (18 227 | pt/nt/46/nt_4610 1 2 meat NN NN 1 dobj _ - - - 1_Corinthians * * *) - 228 | pt/nt/46/nt_4610 1 3 that WDT WDT 6 dep 3:ref - - - 1_Corinthians * * (R-ARG1(ARG0*)) - 229 | pt/nt/46/nt_4610 1 4 is VBZ VBZ 6 auxpass _ be - - 1_Corinthians * * * - 230 | pt/nt/46/nt_4610 1 5 sold VBN VBN 3 relcl _ sell 01 - 1_Corinthians * * (V*) - 231 | pt/nt/46/nt_4610 1 6 in IN IN 6 prep _ - - - 1_Corinthians * * (ARGM-LOC* - 232 | pt/nt/46/nt_4610 1 7 the DT DT 10 det _ - - - 1_Corinthians * * * - 233 | pt/nt/46/nt_4610 1 8 meat NN NN 10 compound _ - - - 1_Corinthians * * * - 234 | pt/nt/46/nt_4610 1 9 market NN NN 7 pobj _ market - - 1_Corinthians * *) *) 18) 235 | pt/nt/46/nt_4610 1 10 . . . 
1 punct _ - - - 1_Corinthians * * * - 236 | --------------------------------------------------------------------------------
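The rows in testdata.conll above are whitespace-delimited, one token per line, with blank lines separating sentences, and the number of columns varies between sentences (some blocks carry one part-of-speech column instead of two, and the trailing predicate/prop columns depend on how many predicates a sentence has). As a reading aid only, here is a minimal sketch, not part of the repository, of iterating over such sentences in Python. The helper name `read_sentences` is hypothetical; the only field positions it relies on are the word in field 3 (0-indexed) and the final field (which looks like the coreference column in this layout), since those are consistent across the sample.

```
# Hypothetical helper (not part of this repository): a minimal sketch of reading
# the whitespace-delimited, blank-line-separated rows shown in testdata.conll.
# Only the leading doc-id/part/index/word fields and the final field are relied
# on; everything else is treated as opaque because column counts vary per sentence.
from typing import Iterator, List


def read_sentences(path: str) -> Iterator[List[List[str]]]:
    """Yield one sentence at a time as a list of token rows (each a list of fields)."""
    sentence: List[List[str]] = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                # Blank line ends the current sentence.
                if sentence:
                    yield sentence
                    sentence = []
                continue
            sentence.append(line.split())
    if sentence:
        yield sentence


if __name__ == "__main__":
    for sent in read_sentences("testdata.conll"):
        words = [row[3] for row in sent]   # field 3 (0-indexed): word form
        last = [row[-1] for row in sent]   # final field: appears to be the coref column
        print(words, last)
```

Because the intermediate column count is not fixed across sentences in this sample, any positional access beyond the fixed leading fields (and the final field) should be decided per sentence rather than hard-coded.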