├── .DS_Store
├── README.md
├── exp-4.1-baseline
├── config.json
├── log.txt
├── model
│ ├── char_dict
│ ├── checkpoints.tsv
│ ├── config
│ ├── cons_label_dict
│ ├── dep_label_dict
│ ├── label_dict
│ ├── pos_dict
│ └── word_dict
├── predict.sh
└── train.sh
├── figures
├── model.jpg
└── model.pdf
├── scripts
├── convert_orl_conll_to_json.py
├── eval_averaged_metrics.py
├── eval_orl_conll_file.py
├── eval_orl_e2e_json_file.py
├── eval_orl_json_file.py
└── generate_constituent_trees_from_benepar.py
└── src
├── orl-4.1-ultimate-hard-e2e
├── __init__.py
├── analyze.py
├── neural_srl
│ ├── TreeLSTM
│ │ ├── Encoder.py
│ │ ├── Tree.py
│ │ ├── TreeGRU.py
│ │ └── __init__.py
│ ├── __init__.py
│ ├── gcn_model
│ │ ├── __init__.py
│ │ ├── gcn.py
│ │ ├── tree.py
│ │ └── various_gcn.py
│ ├── pytorch
│ │ ├── HBiLSTM.py
│ │ ├── HighWayLSTM.py
│ │ ├── __init__.py
│ │ ├── implicit_syntactic_representations.py
│ │ ├── layer.py
│ │ ├── model.py
│ │ ├── pre_trained_language_model.py
│ │ ├── tagger.py
│ │ └── util.py
│ └── shared
│ │ ├── __init__.py
│ │ ├── configuration.py
│ │ ├── conll_utils.py
│ │ ├── constants.py
│ │ ├── constituent_extraction.py
│ │ ├── constituent_reader.py
│ │ ├── dictionary.py
│ │ ├── evaluation.py
│ │ ├── features.py
│ │ ├── inference.py
│ │ ├── inference_utils.py
│ │ ├── io_utils.py
│ │ ├── measurements.py
│ │ ├── numpy_utils.py
│ │ ├── reader.py
│ │ ├── scores_pb2.py
│ │ ├── srl_eval_utils.py
│ │ ├── syntactic_extraction.py
│ │ ├── tagger_data.py
│ │ └── tensor_pb2.py
├── predict.py
└── train.py
└── orl-4.1
├── __init__.py
├── analyze.py
├── neural_srl
├── TreeLSTM
│ ├── Encoder.py
│ ├── Tree.py
│ ├── TreeGRU.py
│ └── __init__.py
├── __init__.py
├── __pycache__
│ └── __init__.cpython-37.pyc
├── gcn_model
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-37.pyc
│ │ ├── tree.cpython-37.pyc
│ │ └── various_gcn.cpython-37.pyc
│ ├── gcn.py
│ ├── tree.py
│ └── various_gcn.py
├── pytorch
│ ├── HBiLSTM.py
│ ├── HighWayLSTM.py
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── HighWayLSTM.cpython-37.pyc
│ │ ├── __init__.cpython-37.pyc
│ │ ├── implicit_syntactic_representations.cpython-37.pyc
│ │ ├── layer.cpython-37.pyc
│ │ ├── model.cpython-37.pyc
│ │ ├── pre_trained_language_model.cpython-37.pyc
│ │ ├── tagger.cpython-37.pyc
│ │ └── util.cpython-37.pyc
│ ├── implicit_syntactic_representations.py
│ ├── layer.py
│ ├── model.py
│ ├── pre_trained_language_model.py
│ ├── tagger.py
│ └── util.py
└── shared
│ ├── __init__.py
│ ├── __pycache__
│ ├── __init__.cpython-37.pyc
│ ├── configuration.cpython-37.pyc
│ ├── conll_utils.cpython-37.pyc
│ ├── constants.cpython-37.pyc
│ ├── constituent_extraction.cpython-37.pyc
│ ├── constituent_reader.cpython-37.pyc
│ ├── dictionary.cpython-37.pyc
│ ├── evaluation.cpython-37.pyc
│ ├── inference_utils.cpython-37.pyc
│ ├── measurements.cpython-37.pyc
│ ├── reader.cpython-37.pyc
│ ├── srl_eval_utils.cpython-37.pyc
│ ├── syntactic_extraction.cpython-37.pyc
│ └── tagger_data.cpython-37.pyc
│ ├── configuration.py
│ ├── conll_utils.py
│ ├── constants.py
│ ├── constituent_extraction.py
│ ├── constituent_reader.py
│ ├── dictionary.py
│ ├── evaluation.py
│ ├── features.py
│ ├── inference.py
│ ├── inference_utils.py
│ ├── io_utils.py
│ ├── measurements.py
│ ├── numpy_utils.py
│ ├── reader.py
│ ├── scores_pb2.py
│ ├── srl_eval_utils.py
│ ├── syntactic_extraction.py
│ ├── tagger_data.py
│ └── tensor_pb2.py
├── predict.py
└── train.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # opinion_mining_with_syn_cons
2 | This repository contains the code, configurations, and model for our paper "A Unified Span-Based Approach for Opinion Mining with Syntactic Constituents", published at NAACL 2021.
3 | The src directory contains our code, and the exp-4.1-baseline directory contains our experiment for "Baseline+BERT" (data0, the first fold of the five-fold cross-validation).
4 |
5 | 
6 |
7 | ## Environment
8 | Python3, Pytorch, Transformers 2.1.1 (for BERT)
9 |
10 | ### Data
11 | MPQA2.0 [url](http://mpqa.cs.pitt.edu/corpora/mpqa_corpus/mpqa_corpus_2_0/)
12 | PTB and OntoNotes can be downloaded from the LDC.
13 |
14 | ### Training
15 | Before running the code, please check and update the file paths in train.sh and config.json.
16 |
17 | ```
18 | sh train.sh GPU_ID
19 | ```
20 |
21 | ### Test
22 | To test the performance of the trained model, you should run the following script.
23 |
24 | ```
25 | sh predict.sh GPU_ID
26 | ```
27 | We release the sample model of the "exp-4.1-baseline" on the Google Drive, [url](https://drive.google.com/file/d/17u8ofyaBThb66qYPZe-60A2lyEnWCNil/view?usp=sharing).
28 | Important: use the offline evaluation script to evaluate the output file.
29 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "max_train_length": 100,
3 | "batch_size" : 32,
4 | "subbatch_size": 1,
5 | "max_tokens_per_batch" : 700,
6 | "features" : ["predicate"],
7 | "feature_sizes": [100],
8 | "dev_batch_size": 40,
9 |
10 | "use_bert": true,
11 | "bert_vocab_path": "bert-base-cased",
12 | "bert_path": "bert-base-cased",
13 | "bert_dim": 768,
14 |
15 | "mtl_cons": false,
16 | "use_cons_labels": false,
17 |
18 | "use_cons_gcn": false,
19 |
20 | "mtl_dep": false,
21 | "dep_prune_ratio": 0.8,
22 | "dep_num_lstm_layers": 3,
23 | "mlp_arc_size": 500,
24 | "mlp_rel_size": 100,
25 | "dropout_mlp": 0.33,
26 |
27 | "use_dep_gcn": false,
28 | "gcn_dep_num_layers": 2,
29 |
30 | "joint": true,
31 | "mtl": false,
32 | "analyze": false,
33 |
34 | "learning_rate":0.001,
35 | "input_dropout_prob":0.0,
36 | "feature_dropout": 0.5,
37 | "lexical_dropout" : 0.5,
38 | "dropout" : 0.3,
39 | "recurrent_dropout_prob":0.4,
40 | "mlp_dropout_prob": 0.2,
41 | "max_grad_norm": 5.0,
42 | "weight_decay": 1e-7,
43 | "decay_steps": 50,
44 | "fl_alpha": 1.0,
45 | "fl_gamma": 3.0,
46 | "pruning_by_arg_prob": false,
47 | "arg_boundary_prob_threshold": 0.0,
48 | "pruning_by_three_threshold": false,
49 | "arg_three_p_boundary_prob_threshold": 0.02,
50 | "neg_threshold": 80,
51 |
52 | "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt",
53 | "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt",
54 | "char_emb_size" : 8,
55 | "pos_emb_size" : 100,
56 | "cons_label_dim": 100,
57 | "span_width_feature_size" : 20,
58 | "num_attention_heads" : 1,
59 | "kernel_sizes" : [3, 4, 5],
60 | "output_channel" : 50,
61 | "argument_ratio" : 0.8,
62 | "predicate_ratio" : 0.4,
63 | "linear_projection_size" : 400,
64 | "cons_num_lstm_layers": 3,
65 | "num_lstm_layers" : 2,
66 | "lstm_hidden_size": 300,
67 | "max_arg_width" : 60,
68 | "lstm_cell":"highway",
69 | "mlp_label_size":100,
70 | "per_layer_dropout":true,
71 |
72 | "gcn_rnn": true,
73 | "gcn_rnn_hidden": 200,
74 | "gcn_rnn_layers": 1,
75 | "gcn_rnn_dropout": 0.4,
76 | "gcn_hidden_dim": 300,
77 | "gcn_num_layers": 3,
78 | "gcn_drop": 0.3,
79 |
80 | "pred_size": 300,
81 | "arg_start_size": 200,
82 | "arg_end_size": 200,
83 | "argu_size": 300,
84 | "argu_size_u": 400,
85 | "num_attention_heads" : 1,
86 | "ffnn_size" : 150,
87 | "ffnn_depth" : 1,
88 |
89 | "trainer" : "Adadelta",
90 | "max_epochs": 500,
91 | "checkpoint_every_x_epochs": 1,
92 |
93 | "enforce_srl_constraint": false,
94 | "use_gold_predicates": true,
95 | "use_gold_arguments": false
96 | }
97 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/char_dict:
--------------------------------------------------------------------------------
1 | *PAD*
2 | *UNKNOWN*
3 | @
4 | p
5 | Y
6 | s
7 | :
8 | P
9 | a
10 | _
11 | 4
12 | Z
13 | }
14 | o
15 | +
16 | w
17 | r
18 | 8
19 | #
20 | 0
21 | h
22 | R
23 | E
24 | g
25 | 2
26 | x
27 | U
28 | $
29 | d
30 | [
31 | ?
32 | F
33 | X
34 | \
35 | 7
36 | Q
37 | 9
38 | '
39 | z
40 | e
41 | t
42 | 3
43 | c
44 | "
45 | v
46 | ˙
47 | k
48 | ò
49 | *
50 | m
51 | ,
52 | %
53 | S
54 | `
55 | K
56 | A
57 | -
58 | .
59 | q
60 | L
61 | B
62 | J
63 | Ì
64 | j
65 | <
66 | i
67 | 1
68 | ö
69 | ’
70 | N
71 | &
72 | ]
73 | 5
74 | H
75 | T
76 | Û
77 | b
78 | y
79 | ;
80 | G
81 | V
82 | f
83 | !
84 | >
85 | /
86 | O
87 | W
88 | D
89 | u
90 | {
91 | M
92 | =
93 | 6
94 | l
95 | n
96 | C
97 | I
98 | Ê
99 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/config:
--------------------------------------------------------------------------------
1 | {
2 | "max_train_length": 100,
3 | "batch_size" : 32,
4 | "subbatch_size": 1,
5 | "max_tokens_per_batch" : 700,
6 | "features" : ["predicate"],
7 | "feature_sizes": [100],
8 | "dev_batch_size": 40,
9 |
10 | "use_bert": true,
11 | "bert_vocab_path": "bert-base-cased",
12 | "bert_path": "bert-base-cased",
13 | "bert_dim": 768,
14 |
15 | "mtl_cons": false,
16 | "use_cons_labels": false,
17 |
18 | "use_cons_gcn": false,
19 |
20 | "mtl_dep": false,
21 | "dep_prune_ratio": 0.8,
22 | "dep_num_lstm_layers": 3,
23 | "mlp_arc_size": 500,
24 | "mlp_rel_size": 100,
25 | "dropout_mlp": 0.33,
26 |
27 | "use_dep_gcn": false,
28 | "gcn_dep_num_layers": 2,
29 |
30 | "joint": true,
31 | "mtl": false,
32 | "analyze": false,
33 |
34 | "learning_rate":0.001,
35 | "input_dropout_prob":0.0,
36 | "feature_dropout": 0.5,
37 | "lexical_dropout" : 0.5,
38 | "dropout" : 0.3,
39 | "recurrent_dropout_prob":0.4,
40 | "mlp_dropout_prob": 0.2,
41 | "max_grad_norm": 5.0,
42 | "weight_decay": 1e-7,
43 | "decay_steps": 50,
44 | "fl_alpha": 1.0,
45 | "fl_gamma": 3.0,
46 | "pruning_by_arg_prob": false,
47 | "arg_boundary_prob_threshold": 0.0,
48 | "pruning_by_three_threshold": false,
49 | "arg_three_p_boundary_prob_threshold": 0.02,
50 | "neg_threshold": 80,
51 |
52 | "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt",
53 | "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt",
54 | "char_emb_size" : 8,
55 | "pos_emb_size" : 100,
56 | "cons_label_dim": 100,
57 | "span_width_feature_size" : 20,
58 | "num_attention_heads" : 1,
59 | "kernel_sizes" : [3, 4, 5],
60 | "output_channel" : 50,
61 | "argument_ratio" : 0.8,
62 | "predicate_ratio" : 0.4,
63 | "linear_projection_size" : 400,
64 | "cons_num_lstm_layers": 3,
65 | "num_lstm_layers" : 2,
66 | "lstm_hidden_size": 300,
67 | "max_arg_width" : 60,
68 | "lstm_cell":"highway",
69 | "mlp_label_size":100,
70 | "per_layer_dropout":true,
71 |
72 | "gcn_rnn": true,
73 | "gcn_rnn_hidden": 200,
74 | "gcn_rnn_layers": 1,
75 | "gcn_rnn_dropout": 0.4,
76 | "gcn_hidden_dim": 300,
77 | "gcn_num_layers": 3,
78 | "gcn_drop": 0.3,
79 |
80 | "pred_size": 300,
81 | "arg_start_size": 200,
82 | "arg_end_size": 200,
83 | "argu_size": 300,
84 | "argu_size_u": 400,
85 | "num_attention_heads" : 1,
86 | "ffnn_size" : 150,
87 | "ffnn_depth" : 1,
88 |
89 | "trainer" : "Adadelta",
90 | "max_epochs": 500,
91 | "checkpoint_every_x_epochs": 1,
92 |
93 | "enforce_srl_constraint": false,
94 | "use_gold_predicates": true,
95 | "use_gold_arguments": false
96 | }
97 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/cons_label_dict:
--------------------------------------------------------------------------------
1 | O
2 | WHNP
3 | NP
4 | PP
5 | SBARQ
6 | ADVP
7 | VP
8 | ADJP
9 | NML
10 | SINV
11 | PRT
12 | WHADVP
13 | SBAR
14 | INTJ
15 | SQ
16 | QP
17 | CONJP
18 | UCP
19 | X
20 | FRAG
21 | PRN
22 | WHPP
23 | WHADJP
24 | LST
25 | NAC
26 | RRC
27 | META
28 | NX
29 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/dep_label_dict:
--------------------------------------------------------------------------------
1 | Root
2 | prep
3 | det
4 | nn
5 | num
6 | pobj
7 | punct
8 | poss
9 | possessive
10 | amod
11 | nsubj
12 | dep
13 | dobj
14 | cc
15 | conj
16 | nsubjpass
17 | partmod
18 | auxpass
19 | advmod
20 | root
21 | ccomp
22 | aux
23 | cop
24 | xcomp
25 | quantmod
26 | tmod
27 | appos
28 | npadvmod
29 | neg
30 | infmod
31 | rcmod
32 | pcomp
33 | mark
34 | advcl
35 | predet
36 | mwe
37 | parataxis
38 | number
39 | acomp
40 | prt
41 | iobj
42 | expl
43 | csubj
44 | preconj
45 | discourse
46 | csubjpass
47 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/label_dict:
--------------------------------------------------------------------------------
1 | O
2 | AGENT
3 | TARGET
4 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/model/pos_dict:
--------------------------------------------------------------------------------
1 | *PAD*
2 | DT
3 | NNP
4 | JJ
5 | NN
6 | VBD
7 | PRP
8 | MD
9 | RB
10 | VB
11 | IN
12 | CD
13 | PRP$
14 | NNS
15 | .
16 | :
17 | VBG
18 | VBN
19 | TO
20 | ``
21 | VBZ
22 | ,
23 | VBP
24 | JJR
25 | ''
26 | -LRB-
27 | -RRB-
28 | POS
29 | FW
30 | CC
31 | WP
32 | $
33 | RP
34 | WDT
35 | EX
36 | RBS
37 | WRB
38 | NNPS
39 | UH
40 | RBR
41 | JJS
42 | PDT
43 | WP$
44 | LS
45 | SYM
46 | #
47 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/predict.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | export PATH=/usr/local/cuda/bin:$PATH
3 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib
4 |
5 | MODEL_PATH="./model"
6 |
7 | #INPUT_PATH="../data/aaai19srl.train0.conll.srl.json"
8 | #OUTPUT_PATH="../temp/orl.train0.out"
9 |
10 | INPUT_PATH="../data/aaai19srl.dev0.conll.json"
11 | GOLD_PATH="../data/conll_format/aaai19srl.dev0.conll"
12 | OUTPUT_PATH="../temp/orl.devel0.out"
13 |
14 | INPUT_PATH="../data/aaai19srl.test0.conll.json"
15 | GOLD_PATH="../data/conll_format/aaai19srl.test0.conll"
16 | OUTPUT_PATH="../temp/orl.test0.out"
17 |
18 | ORL_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt"
19 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt"
20 |
21 | CUDA_VISIBLE_DEVICES=$1 python3 ../src/orl-4.1/predict.py \
22 | --span="span" \
23 | --model="$MODEL_PATH" \
24 | --input="$INPUT_PATH" \
25 | --gold="$GOLD_PATH" \
26 | --orl_cons=$ORL_CONS \
27 | --auto_dep_trees=$SYS_DEP \
28 | --output="$OUTPUT_PATH" \
29 | --gpu=$1
30 |
31 |
--------------------------------------------------------------------------------
/exp-4.1-baseline/train.sh:
--------------------------------------------------------------------------------
1 | export PATH=/usr/local/cuda/bin:$PATH
2 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib
3 |
4 | CONFIG="config.json"
5 | MODEL="model"
6 |
7 | TRAIN_PATH="../data/aaai19srl.train0.conll.json"
8 | #TRAIN_PATH="../data/aaai19srl.dev0.conll.json"
9 | DEV_PATH="../data/aaai19srl.dev0.conll.json"
10 | GOLD_PATH="../data/english/srl/conll05/conll05.devel.props.gold.txt"
11 |
12 | CONS_PATH="../data/constituent_conll12/ontonote5.0.train.constituents.json"
13 | DEP_TREES="/data2/qrxia/SRL-w-Heterogenous-Dep/data/english/dependency/ptb_from_baidu_from_n171/ptb.english.conll.train.txt.opentest.tag.projective"
14 |
15 | SYS_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt"
16 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt"
17 |
18 | gpu_id=$1
19 | CUDA_VISIBLE_DEVICES=$gpu_id python3 ../src/orl-4.1/train.py \
20 | --info="orl baseline bert" \
21 | --config=$CONFIG \
22 | --span="span" \
23 | --model=$MODEL \
24 | --train=$TRAIN_PATH \
25 | --dev=$DEV_PATH \
26 | --gold=$GOLD_PATH \
27 | --cons_trees=$CONS_PATH \
28 | --dep_trees=$DEP_TREES \
29 | --auto_cons_trees=$SYS_CONS \
30 | --auto_dep_trees=$SYS_DEP \
31 | --gpu=$1
32 |
--------------------------------------------------------------------------------
/figures/model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.jpg
--------------------------------------------------------------------------------
/figures/model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.pdf
--------------------------------------------------------------------------------
/scripts/convert_orl_conll_to_json.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from collections import OrderedDict
4 |
5 |
# Span labels used in the ORL annotation.
DSE = "DSE"
TARGET = "TARGET"
AGENT = "AGENT"

# Largest (end - start) span widths seen so far; updated as a side effect of
# orl_data.a_complete_span.
max_dse_length, max_target_length, max_agent_length = 0, 0, 0


class orl_data():
    """One sentence together with its opinion-role annotations.

    ``tuples`` is the column-wise view of a CoNLL-style sentence:
    ``tuples[0]`` holds the token indices, ``tuples[1]`` the words and every
    further entry is one label sequence (one per expression). Labels use the
    prefixes ``B-``/``M-``/``E-``/``S-`` or the plain tag ``O``; a trailing
    ``-*`` marks the head word of the expression.

    After construction, ``self.orl`` contains one
    ``[dse_start, dse_end, span_start, span_end, label]`` list per span.
    """

    def __init__(self, tuples):
        self.idx = []
        self.words = []
        self.labels = []
        self.des = []        # spans labelled DSE
        self.des_head = []   # token indices of expression head words
        self.target = []     # spans labelled TARGET
        self.agent = []      # spans labelled AGENT
        self.orl = []        # every span, in reading order
        self.init_by_typles(tuples)

    def output_to_srl_json(self):
        """Return the sentence as an SRL-style dict keyed on head words.

        Every non-DSE span is attached to each expression head that lies
        inside its expression; each head additionally gets a ``"V"`` span.
        """
        srl_span = []
        for span in self.orl:
            s, e, a_s, a_e, label = span
            if label == DSE:
                continue
            count = 0
            for des_head in self.des_head:
                if s <= des_head <= e:
                    srl_span.append([des_head, a_s, a_e, label])
                    count += 1
            if count != 1:
                # Diagnostic only: an expression should contain exactly one
                # head word, but this is reported rather than enforced.
                print(self.words, self.des_head)
        for des_head in self.des_head:
            srl_span.append([des_head, des_head, des_head, "V"])
        output = {
            "speakers": [["-"] * len(self.words)],
            "doc_key": "S0",
            "sentences": [self.words],
            "srl": [srl_span],
            "constituents": [[]],
            "clusters": [],
            "ner": [[]]
        }
        return output

    def output_to_json(self):
        """Return ``{"sentences": words, "orl": spans}`` for JSON dumping."""
        output = {
            "sentences": self.words,
            "orl": self.orl
        }
        return output

    def a_complete_span(self, des, span, label):
        """Record ``des + span + [label]`` and update the global maxima.

        ``des`` and ``span`` are two-item ``[start, end]`` lists.
        """
        global max_dse_length, max_target_length, max_agent_length
        t = (des + span)
        t.append(label)
        assert len(t) == 5
        if t[-1] == DSE:
            max_dse_length = max(max_dse_length, t[1] - t[0])
            self.des.append(t)
        elif t[-1] == TARGET:
            max_target_length = max(max_target_length, t[3] - t[2])
            self.target.append(t)
        else:
            assert t[-1] == AGENT
            max_agent_length = max(max_agent_length, t[3] - t[2])
            self.agent.append(t)
        self.orl.append(t)

    @staticmethod
    def compose_a_span(des, spans, labels, span, l):
        """Append a finished span/label pair and return the expression span.

        When the label is DSE the span becomes the new expression span (at
        most one DSE is allowed per label sequence); otherwise ``des`` is
        returned unchanged.
        """
        assert len(span) == 2 and l != ""
        if l == DSE:
            assert len(des) == 0
            des = span
        spans.append(span)
        labels.append(l)
        return des

    def init_by_typles(self, tuples):
        """Decode every label sequence in ``tuples`` into spans."""
        self.idx, self.words, self.labels = tuples[0], tuples[1], tuples[2:]
        for expression_aware_label in self.labels:
            des, spans, labels = [], [], []
            span, l = [], ''
            for i, label in enumerate(expression_aware_label):
                # The "-*" suffix only marks the head word; strip it before
                # decoding the tag itself.
                is_head = label.endswith("-*")
                if is_head:
                    label = label[:-2]
                # A single-token DSE is its own head. Register the index
                # once, even when the token is written as "S-DSE-*"
                # (previously that form was appended twice).
                if is_head or label == "S-DSE":
                    self.des_head.append(i)

                if label.startswith("B"):
                    assert len(span) == 0
                    span.append(i)
                    l = label[2:]
                elif label.startswith("M"):
                    assert l == label[2:]
                elif label.startswith("E"):
                    assert l == label[2:]
                    span.append(i)
                    des = orl_data.compose_a_span(des, spans, labels, span, l)
                    span, l = [], ''
                elif label.startswith("S"):
                    span = [i, i]
                    l = label[2:]
                    des = orl_data.compose_a_span(des, spans, labels, span, l)
                    span, l = [], ''
                else:
                    assert label == 'O'

            assert len(spans) == len(labels)
            for s, l in zip(spans, labels):
                self.a_complete_span(des, s, l)

    def write_to_json(self):
        pass
139 |
140 |
if __name__ == "__main__":
    # Convert a CoNLL-style ORL file (argv[1]) into "<argv[1]>.json" with one
    # JSON object per unique sentence.
    input_filepath = sys.argv[1]

    # sentence text -> [index column, word column, label column, ...];
    # further label columns are appended when the same sentence occurs again
    # with a different labelling.
    input_data = OrderedDict()
    original_sentence_number, unique_sentence_number = 0, 0
    duplicate_sentence_number = 0
    duplicate_sentence_label_number = 0
    with open(input_filepath, 'r') as input_orl_file:
        lines = input_orl_file.readlines()
    # Sentinel blank line: flushes the last sentence even when the file does
    # not end with an empty line (it used to be dropped silently).
    lines.append("")

    sentence = []
    for line in lines:
        if line.strip() != "":
            sentence.append(line.strip().split())
            continue
        if not sentence:
            # Consecutive blank lines: nothing buffered, nothing to flush.
            continue
        original_sentence_number += 1
        tuples = list(zip(*sentence))
        sentence = []
        sen = ' '.join(tuples[1])
        if sen not in input_data:
            input_data[sen] = tuples
            unique_sentence_number += 1
        elif tuples[-1] not in input_data[sen][2:]:
            # Same sentence, new expression: keep only the extra label column.
            input_data[sen].append(tuples[-1])
            duplicate_sentence_number += 1
        else:
            print(tuples[-1], "already in previous sample", input_data[sen])
            duplicate_sentence_label_number += 1

    # Every sentence read must fall into exactly one of the three categories.
    assert original_sentence_number == unique_sentence_number + duplicate_sentence_number +\
        duplicate_sentence_label_number
    print("original sentence number:", original_sentence_number)
    print("unique_sentence_number:", unique_sentence_number)
    print("duplicate_sentence_number:", duplicate_sentence_number)
    print("duplicate_sentence_label_number", duplicate_sentence_label_number)

    # NOTE: the former "already appears!" check was removed. It compared the
    # dictionary keys against each other, and keys are unique by
    # construction, so it could never report anything.

    # generate orl data
    orl_dataset = [orl_data(columns) for columns in input_data.values()]
    print("max_dse_length", max_dse_length, "max_target_length", max_target_length,
          "max_agent_length", max_agent_length)

    # output to json
    json_filename = input_filepath + '.json'
    with open(json_filename, 'w') as output_json:
        for orl in orl_dataset:
            output_json.write(json.dumps(orl.output_to_json()) + '\n')
217 |
218 |
219 |
--------------------------------------------------------------------------------
/scripts/eval_averaged_metrics.py:
--------------------------------------------------------------------------------
1 | import sys
2 | # from eval_orl_conll_file import load_eval_data, analyze_error_prediction_matrix
3 | from eval_orl_e2e_json_file import load_eval_data, analyze_error_prediction_matrix
4 |
5 |
def average_fscore(all_metrics):
    """Print the F1 score of every metric averaged over several runs.

    ``all_metrics`` holds one 12-item row per run, ordered as
    agent/target/all for the binary, proportional and exact metrics,
    followed by the binary/proportional/exact expression metrics. Each item
    must expose its F1 score as ``.f``.
    """
    zipped_metrics = list(zip(*all_metrics))

    def avg(metrics):
        return sum(item.f for item in metrics) / len(metrics)

    # (section title, row labels). The third row of the first three sections
    # is the combined "All" metric; it was previously mislabelled "Agent".
    sections = (
        ("Binary F1", ("Agent", "Target", "All")),
        ("Proportional F1", ("Agent", "Target", "All")),
        ("Exact F1", ("Agent", "Target", "All")),
        ("Expression F1", ("Binary", "Proportional", "Exact")),
    )
    column = 0
    for title, row_labels in sections:
        print('=' * 10, title, '=' * 10)
        for row_label in row_labels:
            print(row_label, avg(zipped_metrics[column]))
            column += 1
31 |
32 |
if __name__ == "__main__":
    # Evaluate the five prediction files given on the command line and print
    # the F1 scores averaged over them.
    eval_files = sys.argv[1:]
    assert len(eval_files) == 5

    averaged_metric = []
    for file_path in eval_files:
        scores = analyze_error_prediction_matrix(load_eval_data(file_path))
        # Four groups of three metrics each; flatten them into one row.
        (bin_agent, bin_target, bin_all), \
            (prop_agent, prop_target, prop_all), \
            (exact_agent, exact_target, exact_all), \
            (expr_bin, expr_prop, expr_exact) = scores
        row = [
            bin_agent, bin_target, bin_all,
            prop_agent, prop_target, prop_all,
            exact_agent, exact_target, exact_all,
            expr_bin, expr_prop, expr_exact,
        ]
        averaged_metric.append(row)
    average_fscore(averaged_metric)
50 |
51 |
--------------------------------------------------------------------------------
/scripts/eval_orl_json_file.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | from collections import OrderedDict, Counter
5 |
6 |
7 | AGENT="AGENT"
8 | TARGET="TARGET"
9 |
10 |
class Sample():
    """One evaluation example decoded from a JSON line.

    Copies the ``sentence``, ``gold_orl`` and ``sys_orl`` entries of ``obj``
    onto attributes of the same name.
    """

    def __init__(self, obj):
        for field in ("sentence", "gold_orl", "sys_orl"):
            setattr(self, field, obj[field])
18 |
19 |
def load_eval_data(eval_path):
    """Load a JSON-lines evaluation file into a list of ``Sample`` objects.

    Each non-blank line of ``eval_path`` must be one JSON object. Blank
    lines are skipped instead of crashing ``json.loads``. The number of
    loaded examples is printed.
    """
    with open(eval_path, 'r') as f:
        eval_data = [Sample(json.loads(jsonline)) for jsonline in f if jsonline.strip()]
    print("Loaded {} eval examples.".format(len(eval_data)))
    return eval_data
26 |
27 |
class EvalMetric():
    """Accumulator for precision / recall / F1 of one metric.

    Callers increment ``matched``, ``sys`` (number of system predictions)
    and ``gold`` (number of gold items), then call ``compute_prf``.
    """

    def __init__(self, name="None"):
        self.name = name
        self.matched, self.sys, self.gold = 0, 0, 0
        self.p = self.r = self.f = 0.0

    def compute_prf(self):
        """Compute and print P, R and F1 as percentages.

        A zero denominator yields 0.0 for that score. This is checked
        explicitly rather than with a bare ``except``, so unrelated errors
        (e.g. a non-numeric counter) are no longer hidden.
        """
        self.p = 100.0 * self.matched / self.sys if self.sys else 0.0
        self.r = 100.0 * self.matched / self.gold if self.gold else 0.0
        pr_sum = self.p + self.r
        self.f = 2.0 * self.p * self.r / pr_sum if pr_sum else 0.0
        print("="*5, self.name, "="*5)
        print("Precision:", self.matched, '/', self.sys, '=', self.p)
        print("Recall:", self.matched, '/', self.gold, '=', self.r)
        print("F1 score:", self.f)
51 |
52 |
def analyze_error_prediction_matrix(samples):
    """Print binary, proportional and exact P/R/F1 for agents, targets and all.

    ``samples`` is an iterable of objects exposing ``gold_orl`` and
    ``sys_orl``; each is a sequence of 5-item entries
    ``(dse_start, dse_end, start, end, label)`` whose label is AGENT or
    TARGET. Arguments are compared only within the same expression, i.e.
    the same ``dse_start``/``dse_end`` pair.

    For a system argument that is not an exact match:
      * binary: counts 1 if any of its positions falls inside a gold
        argument with the same label;
      * proportional: counts the largest overlap fraction
        (overlapping positions / gold span length) over the gold arguments.

    Nothing is returned; the scores are printed via EvalMetric.compute_prf.
    """
    agent_binary, target_binary, all_binary = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
    agent_proportional, target_proportional, all_proportional = \
        EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
    agent_exact, target_exact, all_exact = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
    for sample in samples:
        gold_orl, sys_orl = sample.gold_orl, sample.sys_orl
        # Maps "dse_start-dse_end" -> list of (start, end, label) arguments.
        dict_gold_orl, dict_sys_orl = OrderedDict(), OrderedDict()
        for g_orl in gold_orl:  # group the gold arguments by expression
            dse_s, dse_e, s, e, label = g_orl
            expression = str(dse_s) + '-' + str(dse_e)
            argument = (s, e, label)
            if expression not in dict_gold_orl:
                dict_gold_orl[expression] = []
                dict_gold_orl[expression].append(argument)
            else:
                dict_gold_orl[expression].append(argument)
        for s_orl in sys_orl:  # group the system arguments by expression
            dse_s, dse_e, s, e, label = s_orl
            expression = str(dse_s) + '-' + str(dse_e)
            argument = (s, e, label)
            if expression not in dict_sys_orl:
                dict_sys_orl[expression] = []
                dict_sys_orl[expression].append(argument)
            else:
                dict_sys_orl[expression].append(argument)

        for expression in dict_gold_orl:  # count the gold arguments (recall denominators)
            for argument in dict_gold_orl[expression]:
                s, e, label = argument
                all_binary.gold += 1
                all_proportional.gold += 1
                all_exact.gold += 1
                if label == AGENT:
                    agent_binary.gold += 1
                    agent_proportional.gold += 1
                    agent_exact.gold += 1
                else:
                    assert label == TARGET
                    target_binary.gold += 1
                    target_proportional.gold += 1
                    target_exact.gold += 1

        for expression in dict_sys_orl:  # count the system arguments (precision denominators)
            for argument in dict_sys_orl[expression]:
                s, e, label = argument
                all_binary.sys += 1
                all_proportional.sys += 1
                all_exact.sys += 1
                if label == AGENT:
                    agent_binary.sys += 1
                    agent_proportional.sys += 1
                    agent_exact.sys += 1
                else:
                    assert label == TARGET
                    target_binary.sys += 1
                    target_proportional.sys += 1
                    target_exact.sys += 1

        for expression in dict_sys_orl:  # count the matches
            if expression not in dict_gold_orl:  # some gold expressions have no argument at all
                # print(sample.sentence)
                continue
            gold_arguments = dict_gold_orl[expression]
            for argument in dict_sys_orl[expression]:
                s, e, label = argument
                if argument in gold_arguments:  # exact match counts fully for all three metrics
                    all_binary.matched += 1
                    all_proportional.matched += 1
                    all_exact.matched += 1
                    if label == AGENT:
                        agent_binary.matched += 1
                        agent_proportional.matched += 1
                        agent_exact.matched += 1
                    else:
                        assert label == TARGET
                        target_binary.matched += 1
                        target_proportional.matched += 1
                        target_exact.matched += 1
                else:
                    # binary: one match as soon as any position of the system
                    # span lies inside a same-label gold span
                    find = False
                    for index in range(s, e + 1):
                        for gold_arg in gold_arguments:
                            g_s, g_e, g_label = gold_arg
                            if g_label == label:
                                if g_s <= index <= g_e:
                                    all_binary.matched += 1
                                    if label == AGENT:
                                        agent_binary.matched += 1
                                    else:
                                        target_binary.matched += 1
                                    find = True
                                    break
                        if find is True:
                            # stop after the first overlap so the argument is counted once
                            break
                    # proportional: overlap fraction w.r.t. each gold span
                    # (0 for gold spans with a different label); keep the best one
                    list_of_proportional = []
                    for gold_argument in dict_gold_orl[expression]:
                        g_s, g_e, g_label = gold_argument
                        matched_positions = 0
                        if label != g_label:
                            pass
                        else:
                            for position in range(g_s, g_e + 1):
                                if s <= position <= e:
                                    matched_positions += 1
                        list_of_proportional.append(1.0 * matched_positions / (g_e - g_s + 1))
                    if len(list_of_proportional) > 0:  # there was at least one gold argument to compare with
                        all_proportional.matched += max(list_of_proportional)
                        if label == AGENT:
                            agent_proportional.matched += max(list_of_proportional)
                        else:
                            target_proportional.matched += max(list_of_proportional)

    print("="*15, 'Binary Metric', "="*15)
    agent_binary.compute_prf()
    target_binary.compute_prf()
    all_binary.compute_prf()

    print("="*15, 'Proportional Metric', "="*15)
    agent_proportional.compute_prf()
    target_proportional.compute_prf()
    all_proportional.compute_prf()

    print("="*15, 'Exact Metric', "="*15)
    agent_exact.compute_prf()
    target_exact.compute_prf()
    all_exact.compute_prf()
183 |
184 |
if __name__ == "__main__":
    # Score the prediction file named by the first command-line argument.
    analyze_error_prediction_matrix(load_eval_data(sys.argv[1]))
189 |
--------------------------------------------------------------------------------
/scripts/generate_constituent_trees_from_benepar.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import sys
3 | import benepar
4 |
5 |
if __name__ == "__main__":
    # Parse every line of the input file (one space-tokenised sentence per
    # line) with benepar and write the trees, each followed by an empty
    # line, to "<input>.constituent.txt".
    filepath = sys.argv[1]
    with open(filepath, 'r') as input_file:
        sentences = [raw_line.strip().split(' ') for raw_line in input_file.readlines()]

    parser = benepar.Parser("benepar_en2")
    constituent_trees = [parser.parse(tokens) for tokens in sentences]

    with open(filepath + '.constituent.txt', 'w') as output_file:
        for parsed_tree in constituent_trees:
            output_file.write(str(parsed_tree) + '\n' + '\n')
24 |
25 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["neural_srl"]
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | from torch.nn.utils.rnn import pack_padded_sequence as pack
5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
6 | from .TreeGRU import DTTreeGRU, TDTreeGRU
7 | from .Tree import creatTree
8 |
9 |
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder followed by two dependency-tree GRUs.

    The GRU output is projected back to ``input_size`` and then passed
    through a ``DTTreeGRU`` and a ``TDTreeGRU``; the two tree outputs are
    concatenated on the feature axis.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)

        # Sequence encoder (time-major, i.e. batch_first is left at False).
        self.rnn = nn.GRU(input_size=input_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          bidirectional=True)
        # Maps the concatenated forward/backward states back to input_size.
        self.transform = nn.Linear(in_features=2 * hidden_size,
                                   out_features=input_size,
                                   bias=True)
        self.dt_tree = DTTreeGRU(input_size, hidden_size)
        self.td_tree = TDTreeGRU(input_size, hidden_size)

    def forward(self, input, heads, lengths=None, hidden=None):
        """Encode a batch of sentences with the GRU and the tree GRUs.

        :param input: embeddings, documented by the original author as
            [L, B, H] including the -ROOT- token
        :param heads: one sequence of head indices per sentence in the batch
        :param lengths: optional sentence lengths; when given, the input is
            packed before the GRU and unpacked afterwards
        :param hidden: optional initial GRU state
        :return: (outputs, output_t) -- the concatenated tree-GRU outputs
            with the first two axes swapped, and the concatenated tree-GRU
            hidden states with a new leading axis
        """
        use_packing = lengths is not None

        dropped = self.dropout(input)
        rnn_input = pack(dropped, lengths) if use_packing else dropped
        rnn_out, _ = self.rnn(rnn_input, hidden)
        if use_packing:
            rnn_out = unpack(rnn_out)[0]

        projected = self.dropout(self.transform(rnn_out))
        max_length, batch_size, _ = projected.size()

        # traversal_order[step, b] is the token visited at `step` when
        # traversing sentence b's tree; unused slots stay at -1.
        traversal_order = np.full((max_length, batch_size), -1, dtype=np.int32)
        trees = []
        for column, sentence_heads in enumerate(heads):
            root, nodes = creatTree(sentence_heads)
            root.traverse()
            for step, token_index in enumerate(root.order):
                traversal_order[step, column] = token_index
            trees.append(nodes)

        dt_outputs, dt_hidden_ts = self.dt_tree.forward(
            projected, traversal_order, trees)
        td_outputs, td_hidden_ts = self.td_tree.forward(
            projected, traversal_order, trees)

        outputs = torch.cat([dt_outputs, td_outputs], dim=2).transpose(0, 1)
        output_t = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0)
        return outputs, output_t
68 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Tree.py:
--------------------------------------------------------------------------------
class Tree(object):
    """A tree node that keeps its left and right children separately."""

    def __init__(self, index):
        self.index = index
        self.parent = None
        self.is_left = False
        self.left_children = list()
        self.left_num = 0
        self.right_children = list()
        self.right_num = 0
        self._depth = -1  # -1 means "not computed yet"
        self.order = []   # filled by traverse()

    def add_left(self, child):
        """
        :param child: a Tree object attached as a left child of this node
        :return:
        """
        self.left_children.append(child)
        self.left_num += 1
        child.is_left = True
        child.parent = self

    def add_right(self, child):
        """
        :param child: a Tree object attached as a right child of this node
        :return:
        """
        self.right_children.append(child)
        self.right_num += 1
        child.is_left = False
        child.parent = self

    def _children(self):
        # Left children first, then right children, in insertion order.
        return (self.left_children[:self.left_num]
                + self.right_children[:self.right_num])

    def size(self):
        """Return the number of nodes in this subtree (cached after first call)."""
        if not hasattr(self, '_size'):
            self._size = 1 + sum(child.size() for child in self._children())
        return self._size

    def depth(self):
        """Return the height of this subtree; a leaf has depth 0."""
        if self._depth > 0:
            return self._depth
        children = self._children()
        height = 0
        if children:
            height = max(child.depth() for child in children) + 1
        self._depth = height
        return self._depth

    def traverse(self):
        """Return the node indices in post-order (children before the node)."""
        if not self.order:
            for child in self._children():
                self.order.extend(child.traverse())
            self.order.append(self.index)  # the node itself comes last
        return self.order
73 |
74 |
def creatTree(heads):
    """Build a tree of ``Tree`` nodes from a sequence of head indices.

    :param heads: ``heads[i]`` is the index of token i's head; ``-1`` marks
        the root token
    :return: ``(root, nodes)`` where ``nodes[i]`` is the node of token i and
        ``root`` is the node whose head is ``-1`` (``None`` if there is none;
        the last one wins if there are several)
    """
    # current sentence has already been numberized [form, head, rel]
    tree = [Tree(idx) for idx, _ in enumerate(heads)]
    root = None

    for idx, head in enumerate(heads):
        if head == -1:  # -1 mszhang, 0 kiro
            root = tree[idx]
            continue
        if head < 0:
            print('error: multi roots')
            # A negative head is invalid. Do not fall through: tree[head]
            # would wrap around and attach this token to an unrelated node.
            continue
        if head == idx:
            print('error: head is it self.')
            continue
        if head > idx:
            # The head is to the right, so this token is a left child.
            tree[head].add_left(tree[idx])
        else:
            tree[head].add_right(tree[idx])

    return root, tree
96 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["Encoder", "Tree", "TreeGRU"]
2 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/gcn.py:
--------------------------------------------------------------------------------
1 | """
2 | GCN model for relation extraction.
3 | """
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.autograd import Variable
9 | from ..shared.constants import PAD_ID
10 | import numpy as np
11 |
12 |
class GCN(nn.Module):
    """Stack of graph-convolution layers with an optional BiLSTM front end.

    The configuration object is read for ``gcn_rnn``, ``gcn_rnn_hidden``,
    ``gcn_rnn_layers``, ``gcn_rnn_dropout``, ``gcn_input_dropout`` and
    ``gcn_gcn_dropout``.
    """

    def __init__(self, config, input_dim, mem_dim, num_layers):
        super(GCN, self).__init__()
        self.config = config
        self.input_dim = input_dim
        self.mem_dim = mem_dim
        self.layers = num_layers
        # Width of the GCN layers. Defaults to the raw input width so the
        # module can also be built when the RNN front end is disabled
        # (previously in_dim was left undefined in that case).
        self.in_dim = self.input_dim

        # rnn layer
        if self.config.gcn_rnn is True:
            input_size = self.input_dim
            self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True,
                               dropout=self.config.gcn_rnn_dropout, bidirectional=True)
            self.in_dim = self.config.gcn_rnn_hidden * 2
            self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout)  # use on last layer output

        self.in_drop = nn.Dropout(self.config.gcn_input_dropout)
        self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout)

        # gcn layer
        self.W = nn.ModuleList()
        self.layer_normalization = nn.ModuleList()

        for layer in range(self.layers):
            # Every layer maps in_dim -> in_dim (mem_dim is not used here).
            self.W.append(nn.Linear(self.in_dim, self.in_dim))
            self.layer_normalization.append(LayerNormalization(self.in_dim))

    def conv_l2(self):
        """Return the sum of squared weights and biases of all GCN layers."""
        conv_weights = []
        for w in self.W:
            conv_weights += [w.weight, w.bias]
        return sum([x.pow(2).sum() for x in conv_weights])

    def encode_with_rnn(self, rnn_inputs, masks, batch_size):
        """Run the BiLSTM over ``rnn_inputs`` and return its padded outputs.

        Sequence lengths are the per-row counts of entries equal to 1 in
        ``masks``. Rows are sorted by length for packing and restored to the
        original order before returning.
        """
        # reshape(-1) instead of squeeze(): squeeze() collapses a batch of
        # one sentence to a 0-d tensor, which cannot be sorted or packed.
        seq_lens = masks.data.eq(1).long().sum(1).reshape(-1)
        # Build the initial state on the CPU and move it next to the inputs,
        # so the module also works on CPU or on a non-default GPU.
        h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers,
                                use_cuda=False)
        h0, c0 = h0.to(rnn_inputs.device), c0.to(rnn_inputs.device)

        # SORT YOUR TENSORS BY LENGTH!
        seq_lens, perm_idx = seq_lens.sort(0, descending=True)

        rnn_inputs = rnn_inputs[perm_idx]
        # pack_padded_sequence expects the lengths on the CPU.
        rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens.cpu(), batch_first=True)
        rnn_outputs, _ = self.rnn(rnn_inputs, (h0, c0))
        rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)

        # Undo the length sort.
        _, unperm_idx = perm_idx.sort(0)
        rnn_outputs = rnn_outputs[unperm_idx]
        return rnn_outputs

    def forward(self, adj, embs, masks):
        """Apply the (optional) RNN and the GCN layers.

        :param adj: batched adjacency matrices (used with ``bmm``)
        :param embs: batched input representations
        :param masks: batched mask; its first dimension is the batch size
        :return: ``(gcn_outputs, mask)`` where ``mask`` is True for positions
            whose adjacency row and column are both all zero
        """
        batch_size = masks.size()[0]
        embs = self.in_drop(embs)
        # rnn layer
        if self.config.gcn_rnn is True:
            gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size))
        else:
            gcn_inputs = embs

        # gcn layer
        denom = adj.sum(2).unsqueeze(2) + 1  # +1 accounts for the self loop
        mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)

        for l in range(self.layers):
            x = gcn_inputs
            Ax = adj.bmm(gcn_inputs)
            AxW = self.W[l](Ax)
            AxW = AxW + self.W[l](gcn_inputs)  # self loop
            AxW = AxW / denom

            gAxW = F.relu(AxW)
            gcn_inputs = self.gcn_drop(gAxW)
            # NOTE(review): the residual + layer-norm result is computed but
            # never assigned, so it has no effect on the output. This looks
            # unintended, but assigning it would change the numerics of
            # already-trained checkpoints, so it is left as-is -- confirm
            # with the model owners before changing.
            self.layer_normalization[l].forward(gcn_inputs + x)

        return gcn_inputs, mask
92 |
93 |
def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
    """Return an all-zero ``(h0, c0)`` initial state for an LSTM.

    The state has shape ``(num_layers * directions, batch_size, hidden_dim)``
    where ``directions`` is 2 for a bidirectional RNN and 1 otherwise. Both
    returned values start from the same zero tensor; with ``use_cuda`` each
    is moved to the GPU.
    """
    directions = 2 if bidirectional else 1
    zeros = Variable(torch.zeros(num_layers * directions, batch_size, hidden_dim),
                     requires_grad=False)
    h0 = c0 = zeros
    if not use_cuda:
        return h0, c0
    return h0.cuda(), c0.cuda()
102 |
103 |
class LayerNormalization(nn.Module):
    """Layer normalization over the last axis with a learned gain and bias."""

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        # Gain starts at one and bias at zero.
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

    def forward(self, z):
        """Normalize ``z`` along its last axis.

        Inputs whose second axis has size 1 are returned unchanged. The
        epsilon is added to the standard deviation, not to the variance.
        """
        if z.size(1) == 1:
            return z
        centered = z - torch.mean(z, keepdim=True, dim=-1)
        spread = torch.std(z, keepdim=True, dim=-1) + self.eps
        normalized = centered / spread
        return normalized * self.a_2 + self.b_2
121 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic operations on trees.
3 | """
4 |
5 | import numpy as np
6 | from collections import defaultdict
7 |
8 |
class Tree(object):
    """
    Reused tree object from stanfordnlp/treelstm.

    A node keeps a reference to its parent and an ordered list of children.
    Iterating over a node yields the node itself and then its descendants
    in pre-order.
    """

    def __init__(self):
        self.parent = None
        # head probability
        self.phead = -1
        self.num_children = 0
        self.children = list()

    def add_child(self, child):
        """Attach ``child`` as the last child of this node."""
        child.parent = self
        self.num_children += 1
        self.children.append(child)

    def size(self):
        """Return the number of nodes in this subtree (cached after first call)."""
        # getattr needs a default: the attribute does not exist before the
        # first call, and a bare getattr(self, '_size') raised AttributeError.
        cached = getattr(self, '_size', None)
        if cached is not None:
            return cached
        self._size = 1 + sum(child.size() for child in self.children)
        return self._size

    def depth(self):
        """Return the height of this subtree; a leaf has depth 0 (cached)."""
        cached = getattr(self, '_depth', None)
        if cached is not None:
            return cached
        count = 0
        if self.num_children > 0:
            # range() replaces the Python 2-only xrange().
            count = max(child.depth() for child in self.children) + 1
        self._depth = count
        return self._depth

    def __iter__(self):
        yield self
        for c in self.children:
            for x in c:
                yield x
53 |
54 |
def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos):
    """
    Convert a sequence of head indexes into a tree object.

    :param head: head index of every token; only the first ``len_`` entries
        are used. Values are 1-based and 0 marks the root.
    :param tokens: sliced to ``len_`` and converted to a list, but not used
        afterwards.
    :param len_: number of real (unpadded) tokens.
    :param prune: if negative, the whole tree is built; otherwise only nodes
        whose distance to the subject-object path is at most ``prune`` are
        kept.
    :param subj_pos: per-token values; positions whose value is 0 are taken
        as the subject span (presumably a relative-position encoding --
        confirm against the caller).
    :param obj_pos: same as ``subj_pos`` for the object span.
    :return: the root ``Tree`` node. Every created node gets an ``idx``
        attribute (token position) and a ``dist`` attribute (distance to the
        path, or -1 when no pruning is done).
    """
    tokens = tokens[:len_].tolist()
    head = head[:len_].tolist()
    root = None

    if prune < 0:
        # No pruning: create a node per token and link it to its head.
        nodes = [Tree() for _ in head]

        for i in range(len(nodes)):
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = -1  # just a filler
            if h == 0:
                root = nodes[i]
            else:
                # heads are 1-based, node list is 0-based
                nodes[h - 1].add_child(nodes[i])
    else:
        # find dependency path
        subj_pos = [i for i in range(len_) if subj_pos[i] == 0]
        obj_pos = [i for i in range(len_) if obj_pos[i] == 0]

        # cas: set of common ancestors of all subject and object tokens.
        # NOTE(review): cas stays None when subj_pos is empty, which makes
        # the intersection_update / len() calls below fail -- confirm that
        # callers always supply a non-empty subject span.
        cas = None

        subj_ancestors = set(subj_pos)
        for s in subj_pos:
            # Walk from the token up to the root, collecting its ancestors.
            h = head[s]
            tmp = [s]
            while h > 0:
                tmp += [h - 1]
                subj_ancestors.add(h - 1)
                h = head[h - 1]

            if cas is None:
                cas = set(tmp)
            else:
                cas.intersection_update(tmp)

        obj_ancestors = set(obj_pos)
        for o in obj_pos:
            h = head[o]
            tmp = [o]
            while h > 0:
                tmp += [h - 1]
                obj_ancestors.add(h - 1)
                h = head[h - 1]
            cas.intersection_update(tmp)

        # find lowest common ancestor
        # NOTE(review): if cas ends up empty, neither branch assigns lca and
        # the code below raises a NameError.
        if len(cas) == 1:
            lca = list(cas)[0]
        else:
            # Count, for every common ancestor, how many of the other common
            # ancestors have it as their head.
            child_count = {k: 0 for k in cas}
            for ca in cas:
                if head[ca] > 0 and head[ca] - 1 in cas:
                    child_count[head[ca] - 1] += 1

            # the LCA has no child in the CA set
            for ca in cas:
                if child_count[ca] == 0:
                    lca = ca
                    break

        # Nodes on the subject-object path: all ancestors of either span
        # that are not common ancestors, plus the LCA itself.
        path_nodes = subj_ancestors.union(obj_ancestors).difference(cas)
        path_nodes.add(lca)

        # compute distance to path_nodes
        dist = [-1 if i not in path_nodes else 0 for i in range(len_)]

        for i in range(len_):
            if dist[i] < 0:
                # Climb the head chain until a path node is reached or the
                # chain leaves the tree (index -1, i.e. past the root).
                stack = [i]
                while stack[-1] >= 0 and stack[-1] not in path_nodes:
                    stack.append(head[stack[-1]] - 1)

                if stack[-1] in path_nodes:
                    # The path node gets 0 and each step away from it adds 1.
                    for d, j in enumerate(reversed(stack)):
                        dist[j] = d
                else:
                    # The chain never reaches the path: mark as unreachable.
                    for j in stack:
                        if j >= 0 and dist[j] < 0:
                            dist[j] = int(1e4)  # aka infinity

        highest_node = lca
        # Only tokens within the pruning distance get a node.
        nodes = [Tree() if dist[i] <= prune else None for i in range(len_)]

        for i in range(len(nodes)):
            if nodes[i] is None:
                continue
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = dist[i]
            # The LCA becomes the root of the pruned tree, so it is not
            # attached to its own head.
            if h > 0 and i != highest_node:
                assert nodes[h - 1] is not None
                nodes[h - 1].add_child(nodes[i])

        root = nodes[highest_node]

    assert root is not None
    return root
157 |
158 |
def tree_to_adj(sent_len, tree, directed=True, self_loop=False):
    """
    Convert a tree object to an (numpy) adjacency matrix.

    Entry ``[parent.idx, child.idx]`` is 1 for every edge reachable from
    ``tree``. With ``directed=False`` the matrix is made symmetric, and with
    ``self_loop=True`` the diagonal is set for every visited node.
    """
    adj = np.zeros((sent_len, sent_len), dtype=np.float32)

    # Breadth-first walk; `cursor` marks the next node still to be expanded.
    pending = [tree]
    visited = []
    cursor = 0
    while cursor < len(pending):
        node = pending[cursor]
        cursor += 1

        visited.append(node.idx)
        for child in node.children:
            adj[node.idx, child.idx] = 1
        pending.extend(node.children)

    if not directed:
        adj = adj + adj.T

    if self_loop:
        for position in visited:
            adj[position, position] = 1

    return adj
184 |
185 |
def tree_to_dist(sent_len, tree):
    """Return an int64 array holding each node's ``dist`` at its ``idx``.

    Positions that no node of ``tree`` refers to are left at -1.
    """
    distances = np.full(sent_len, -1, dtype=np.int64)
    for node in tree:
        distances[node.idx] = node.dist
    return distances
193 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/implicit_syntactic_representations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.nn.utils.rnn import pad_sequence
6 |
7 |
8 | from .model import drop_sequence_sharedmask, _model_var
9 | from .HighWayLSTM import Highway_Concat_BiLSTM
10 | from .layer import NonLinear, Biaffine
11 |
12 |
class ImplicitDependencyRepresentations(nn.Module):
    """Biaffine dependency parser whose BiLSTM layers serve as syntax features.

    ``forward`` computes the arc/relation loss for a batch, and ``get_reps``
    returns a learned softmax-weighted mix of the BiLSTM layer outputs
    together with the arc logits.
    """

    def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size):
        super(ImplicitDependencyRepresentations, self).__init__()
        self.config = config
        self.lstm_input_size = lstm_input_size
        self.lstm_hidden_size = lstm_hidden_size
        self.dep_label_space_size = dep_label_space_size
        # softmax weights
        self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0]))
        self.softmax_dep_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0]))
                                                     for _ in range(self.config.dep_num_lstm_layers)])
        # NOTE: the former `self.cuda = True` flag was removed. It shadowed
        # nn.Module.cuda(), so calling .cuda() on this module failed, and it
        # forced the masks onto the GPU even for a CPU model. Masks now
        # follow the device of the module's parameters.

        self.dep_bilstm = Highway_Concat_BiLSTM(
            input_size=self.lstm_input_size,
            hidden_size=self.lstm_hidden_size,  # // 2 for MyLSTM
            num_layers=self.config.dep_num_lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.input_dropout_prob,
            dropout_out=config.recurrent_dropout_prob
        )

        # dependency parsing module
        # NOTE(review): the MLP input width uses config.lstm_hidden_size and
        # not the lstm_hidden_size argument -- confirm the two always agree.
        self.mlp_arc_dep = NonLinear(
            input_size=2 * config.lstm_hidden_size,
            hidden_size=config.mlp_arc_size + config.mlp_rel_size,
            activation=nn.LeakyReLU(0.1))
        self.mlp_arc_head = NonLinear(
            input_size=2 * config.lstm_hidden_size,
            hidden_size=config.mlp_arc_size + config.mlp_rel_size,
            activation=nn.LeakyReLU(0.1))

        # The MLP output is split into chunks of 100 features; the first
        # arc_num chunks feed the arc scorer and the rest the label scorer.
        self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
        self.arc_num = int(config.mlp_arc_size / 100)
        self.rel_num = int(config.mlp_rel_size / 100)

        self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False))
        self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size,
                                     bias=(True, True))

    def init_masks(self, batch_size, lengths):
        """Return a float mask that is 1 for the first ``lengths[b]`` positions of row b.

        :param batch_size: number of sentences in the batch
        :param lengths: LongTensor holding the length of every sentence
        """
        max_sent_length = max(lengths)
        num_sentences = batch_size
        indices = torch.arange(0, max_sent_length).unsqueeze(0).expand(num_sentences, -1)
        masks = indices < lengths.unsqueeze(1)
        masks = masks.type(torch.FloatTensor)
        # Keep the mask on the same device as the model parameters.
        return masks.to(self.dep_gamma.device)

    def forward(self, num_sentences, context_embeddings, sent_lengths, dep):
        """Score arcs and labels for a batch and return the parsing loss.

        :param num_sentences: number of sentences in the batch
        :param context_embeddings: batch-first input to the BiLSTM
        :param sent_lengths: sentence lengths; must be accepted by
            ``torch.LongTensor`` and provide ``.tolist()``
        :param dep: pair ``(heads, rels)`` with one gold tensor per sentence
        :return: ``(loss, arc_logits)``
        """
        masks = self.init_masks(num_sentences, torch.LongTensor(sent_lengths))
        lstm_out, _ = self.dep_bilstm(context_embeddings, masks)

        if self.training:
            lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp)

        x_all_dep = self.mlp_arc_dep(lstm_out)
        x_all_head = self.mlp_arc_head(lstm_out)

        if self.training:
            x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
            x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)

        x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
        x_all_head_splits = torch.split(x_all_head, 100, dim=2)

        x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
        x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)

        arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
        arc_logit = torch.squeeze(arc_logit, dim=3)

        x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
        x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)

        rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)

        # compute_dep_loss reads the logits from these attributes.
        self.arc_logits, self.rel_logits = arc_logit, rel_logit_cond

        heads, rels = dep[0], dep[1]
        loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist())  # compute the dep loss
        return loss, self.arc_logits

    def compute_dep_loss(self, true_arcs, true_rels, lengths):
        """Return the summed cross-entropy of arcs plus that of labels.

        Uses ``self.arc_logits`` and ``self.rel_logits`` stored by
        ``forward``. Label logits are taken at the gold head of every token.
        """
        b, l1, l2 = self.arc_logits.size()
        # Two paddings of the gold heads: 0 so padded slots stay valid
        # indices, and -1 so cross_entropy ignores them.
        index_true_arcs = _model_var(
            self.parameters(),
            pad_sequence(true_arcs, padding_value=0, batch_first=True)
        )
        true_arcs = _model_var(
            self.parameters(),
            pad_sequence(true_arcs, padding_value=-1, batch_first=True)
        )

        # Add a large negative score to head candidates beyond each
        # sentence's length so they are never selected.
        masks = []
        for length in lengths:
            mask = torch.FloatTensor([0] * length + [-1000] * (l2 - length))
            mask = _model_var(self.parameters(), mask)
            mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
            masks.append(mask.transpose(0, 1))
        length_mask = torch.stack(masks, 0)
        arc_logits = self.arc_logits + length_mask

        arc_loss = F.cross_entropy(
            arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
            ignore_index=-1, reduction="sum")

        # For every token, pick the label scores at its gold head.
        size = self.rel_logits.size()
        output_logits = _model_var(self.parameters(), torch.zeros(size[0], size[1], size[3]))
        for batch_index, (logits, arcs) in enumerate(list(zip(self.rel_logits, index_true_arcs))):
            rel_probs = []
            for i in range(l1):
                rel_probs.append(logits[i][int(arcs[i])])
            rel_probs = torch.stack(rel_probs, dim=0)
            output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)

        b, l1, d = output_logits.size()
        true_rels = _model_var(self.parameters(), pad_sequence(true_rels, padding_value=-1, batch_first=True))

        rel_loss = F.cross_entropy(
            output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1, reduction="sum")

        loss = arc_loss + rel_loss
        return loss

    def get_reps(self, context_embeddings, masks):
        """Return the mixed BiLSTM layer representations and the arc logits.

        The representation is ``dep_gamma`` times the softmax-weighted sum
        of the per-layer BiLSTM outputs.
        """
        dep_lstm_out, dep_lstm_outputs = self.dep_bilstm.forward(context_embeddings, masks)
        normed_weights = F.softmax(torch.cat([param for param in self.softmax_dep_weights]), dim=0)
        normed_weights = torch.split(normed_weights, 1)
        dep_representations = self.dep_gamma * \
            sum([normed_weights[i] * dep_lstm_outputs[i] for i in
                 range(self.config.dep_num_lstm_layers)])
        if self.training:
            # Previously the dropped tensor was stored in an unused variable,
            # so this dropout never reached the MLPs; it now matches forward().
            # NOTE(review): this enables a dropout that was not active when
            # existing models were trained -- confirm before relying on
            # reproducing earlier training runs.
            dep_lstm_out = drop_sequence_sharedmask(dep_lstm_out, self.config.dropout_mlp)

        x_all_dep = self.mlp_arc_dep(dep_lstm_out)
        x_all_head = self.mlp_arc_head(dep_lstm_out)

        if self.training:
            x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
            x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)

        x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
        x_all_head_splits = torch.split(x_all_head, 100, dim=2)

        x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
        x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)

        arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
        arc_logit = torch.squeeze(arc_logit, dim=3)
        return dep_representations, arc_logit
166 |
167 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from .layer import MyLSTM, NonLinear, Biaffine
5 |
6 |
7 | def _model_var(parameters, x):
8 | p = next(iter(filter(lambda p: p.requires_grad, parameters)))
9 | if p.is_cuda:
10 | x = x.cuda(p.get_device())
11 | return torch.autograd.Variable(x)
12 |
13 |
def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb):
    """Drop whole word and tag embeddings independently and rescale the rest.

    One Bernoulli keep/drop decision is sampled per token for the words and
    one for the tags. The surviving embeddings are rescaled by
    ``3 / (2 * word_mask + tag_mask)`` so that a token that lost one of its
    two embeddings gets a larger weight on the other.

    :return: the masked ``(word_embeddings, tag_embeddings)``
    """
    batch_size, seq_length, _ = word_embeddings.size()
    keep_prob = 1 - dropout_emb

    def sample_mask(reference):
        # tensor.new: build a tensor with the same data type as `reference`
        probs = reference.data.new(batch_size, seq_length).fill_(keep_prob)
        sampled = torch.Tensor(torch.bernoulli(probs))
        sampled.requires_grad = False
        return sampled

    # The word mask is sampled before the tag mask.
    word_masks = sample_mask(word_embeddings)
    tag_masks = sample_mask(tag_embeddings)

    # The small constant avoids a division by zero when both are dropped.
    scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12)
    word_masks *= scale
    tag_masks *= scale

    # Add a trailing axis so the per-token mask broadcasts over features.
    masked_words = word_embeddings * word_masks.unsqueeze(dim=2)
    masked_tags = tag_embeddings * tag_masks.unsqueeze(dim=2)
    return masked_words, masked_tags
35 |
36 |
def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
    """Apply dropout with one mask per sentence shared across all time steps.

    :param inputs: 3-D tensor, ``(batch, seq, hidden)`` when ``batch_first``
        is true and ``(seq, batch, hidden)`` otherwise
    :param dropout: drop probability; kept features are scaled by
        ``1 / (1 - dropout)``
    :param batch_first: layout of ``inputs``
    :return: the masked tensor in the same layout as ``inputs``
    """
    if batch_first:
        inputs = inputs.transpose(0, 1)
    seq_length, batch_size, hidden_size = inputs.size()

    # One keep/drop decision per (sentence, feature), sampled on the CPU and
    # then moved next to the inputs.
    drop_masks = torch.Tensor(batch_size, hidden_size).fill_(1 - dropout)
    drop_masks = torch.bernoulli(drop_masks).to(device=inputs.device, dtype=inputs.dtype)
    drop_masks = drop_masks / (1 - dropout)

    # A leading axis lets the same mask broadcast over every time step.
    inputs = inputs * drop_masks.unsqueeze(0)

    # Only undo the transpose if one was applied. Previously the result was
    # always transposed, so batch_first=False returned the wrong layout.
    if batch_first:
        return inputs.transpose(1, 0)
    return inputs
51 |
52 |
class ParserModel(nn.Module):  # build a biaffine parser model
    """Biaffine dependency parser over word, pretrained-word and tag embeddings.

    Trainable word embeddings start at zero and are added to frozen
    pretrained embeddings; the sum is concatenated with tag embeddings and
    encoded by a BiLSTM before the biaffine arc and label scorers.
    """

    def __init__(self, vocab, config, pretrained_embedding):
        super(ParserModel, self).__init__()
        self.config = config
        self.word_embed = nn.Embedding(vocab.vocab_size,
                                       config.word_dims,
                                       padding_idx=0)
        self.extword_embed = nn.Embedding(vocab.extvocab_size,
                                          config.word_dims,
                                          padding_idx=0)
        self.tag_embed = nn.Embedding(vocab.tag_size,
                                      config.tag_dims,
                                      padding_idx=0)

        # Trainable word embeddings start from zero.
        word_init = np.zeros((vocab.vocab_size, config.word_dims),
                             dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        tag_init = np.random.randn(vocab.tag_size,
                                   config.tag_dims).astype(np.float32)
        self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))

        # Pretrained embeddings are loaded and kept frozen.
        self.extword_embed.weight.data.copy_(
            torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False

        self.lstm = MyLSTM(
            input_size=config.word_dims + config.tag_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens,
                                     hidden_size=config.mlp_arc_size +
                                     config.mlp_rel_size,
                                     activation=nn.LeakyReLU(0.1))
        self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens,
                                      hidden_size=config.mlp_arc_size +
                                      config.mlp_rel_size,
                                      activation=nn.LeakyReLU(0.1))

        # The MLP output is split into chunks of 100 features; the first
        # arc_num chunks feed the arc scorer and the rest the label scorer.
        self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
        self.arc_num = int(config.mlp_arc_size / 100)  # config: 500
        self.rel_num = int(config.mlp_rel_size / 100)  # config: 100

        self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size,
                                     1, bias=(True, False))
        self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size,
                                     vocab.rel_size, bias=(True, True))

    def forward(self, words, extwords, tags, masks):
        """Return ``(arc_logit, rel_logit_cond)`` for a batch.

        :param words: word ids, [batch, max_sentence_length], padded with zeros
        :param extwords: ids into the pretrained embedding table
        :param tags: tag ids
        :param masks: mask passed on to the LSTM
        """
        # x = (batch size, sequence length, dimension of embedding)
        x_word_embed = self.word_embed(words)
        x_extword_embed = self.extword_embed(extwords)
        x_embed = x_word_embed + x_extword_embed
        x_tag_embed = self.tag_embed(tags)

        if self.training:
            x_embed, x_tag_embed = drop_input_independent(
                x_embed, x_tag_embed, self.config.dropout_emb)

        x_lexical = torch.cat((x_embed, x_tag_embed), dim=2)

        outputs, _ = self.lstm(x_lexical, masks, None)
        outputs = outputs.transpose(1, 0)

        if self.training:
            outputs = drop_sequence_sharedmask(outputs,
                                               self.config.dropout_mlp)

        x_all_dep = self.mlp_arc_dep(outputs)
        x_all_head = self.mlp_arc_head(outputs)

        if self.training:
            x_all_dep = drop_sequence_sharedmask(x_all_dep,
                                                 self.config.dropout_mlp)
            x_all_head = drop_sequence_sharedmask(x_all_head,
                                                  self.config.dropout_mlp)

        # The chunk size is passed positionally: the old `split_size=`
        # keyword no longer exists in torch.split and raises a TypeError.
        x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
        x_all_head_splits = torch.split(x_all_head, 100, dim=2)

        x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
        x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)

        arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
        arc_logit = torch.squeeze(arc_logit, dim=3)

        x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
        x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)

        rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
        return arc_logit, rel_logit_cond
152 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.autograd import Variable
4 |
5 |
def block_orth_normal_initializer(input_size, output_size):
    """Stack orthogonally initialized blocks into a single weight matrix.

    One ``(o, i)`` block is created for every ``o`` in ``output_size`` and
    every ``i`` in ``input_size`` (outer loop over ``output_size``), and the
    blocks are concatenated along the first axis.
    """
    blocks = [
        torch.nn.init.orthogonal_(torch.FloatTensor(rows, cols))
        for rows in output_size
        for cols in input_size
    ]
    return torch.cat(blocks)
14 |
15 |
def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights):
    """Pad a batch of samples into fixed-size tensors.

    For every sample, ``s_words[1]`` holds the word ids, ``s_words[2]`` the
    predicate ids and ``s_answer[0]`` the label ids. ``batch_weights`` is
    only iterated alongside the other inputs; its values are not used.

    :return: ``(words, predicates, labels, lengths, masks, padding_answers)``
        where ``labels`` is a list of unpadded int32 arrays and every other
        item is a tensor padded with zeros to the longest sentence
    """
    batch_size = len(batch_x)  # batch size
    length = max(batch_lengths)

    def zeros(tensor_type):
        # Zero-filled (batch_size, length) tensor; zero doubles as padding.
        return Variable(tensor_type(batch_size, length).zero_(),
                        requires_grad=False)

    words = zeros(torch.LongTensor)
    predicates = zeros(torch.LongTensor)
    masks = zeros(torch.Tensor)
    padding_answers = zeros(torch.LongTensor)
    labels, lengths = [], []

    samples = zip(batch_x, batch_y, batch_lengths, batch_weights)
    for row, (s_words, s_answer, s_length, _) in enumerate(samples):
        lengths.append(s_length)
        rel = np.zeros((s_length), dtype=np.int32)
        for col in range(s_length):
            words[row, col] = s_words[1][col]  # word
            predicates[row, col] = s_words[2][col]  # predicate
            rel[col] = s_answer[0][col]
            padding_answers[row, col] = s_answer[0][col]
            masks[row, col] = 1
        labels.append(rel)

    return (words, predicates, labels, torch.LongTensor(lengths), masks,
            padding_answers)
49 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/configuration.py:
--------------------------------------------------------------------------------
1 | ''' Configuration for experiments.
2 | '''
3 | import json
4 | from argparse import Namespace
5 |
6 |
def get_config(config_filepath):
    """Load a JSON configuration file as nested ``Namespace`` objects.

    Every JSON object (at any depth) becomes an ``argparse.Namespace`` so its
    keys are reachable as attributes; lists and scalars are left as parsed.

    Args:
        config_filepath: path of the JSON file to read.

    Returns:
        The parsed configuration.
    """
    def _as_namespace(mapping):
        return Namespace(**mapping)

    with open(config_filepath, 'r') as handle:
        return json.load(handle, object_hook=_as_namespace)
11 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/conll_utils.py:
--------------------------------------------------------------------------------
def bio_to_se(labels):
    """Convert one column of BIO tags into CoNLL start/end bracket notation.

    ``'O'`` becomes ``'*'``.  A token that starts a span gets the prefix
    ``'(LABEL'`` and a token that ends a span gets the suffix ``')'``, e.g.
    ``['B-A0', 'I-A0', 'O', 'B-V']`` -> ``['(A0*', '*)', '*', '(V*)']``.

    Args:
        labels: sequence of BIO tag strings.

    Returns:
        A list of bracket strings, one per input tag.
    """
    last = len(labels) - 1
    converted = []
    unclosed = False
    for pos, tag in enumerate(labels):
        if tag == 'O':
            converted.append('*')
            continue
        # A span starts on an explicit B tag, at the first token, or when the
        # previous tag carries a different label.
        starts_span = (tag[0] == 'B' or pos == 0
                       or tag[1:] != labels[pos - 1][1:])
        # A span ends at the last token, before a B tag, or before a tag with
        # a different label.
        ends_span = (pos == last or labels[pos + 1][0] == 'B'
                     or tag[1:] != labels[pos + 1][1:])
        prefix = '(' + tag[2:] if starts_span else ''
        suffix = ')' if ends_span else ''
        if starts_span:
            unclosed = True
        if ends_span:
            unclosed = False
        converted.append(prefix + '*' + suffix)

    if unclosed:
        print("Has unclosed opening: {}".format(labels))
    return converted
23 |
24 |
def print_sentence_to_conll(fout, tokens, labels):
    """Write one sentence as fixed-width CoNLL rows followed by a blank line.

    Each row holds the token left-justified to 15 characters, then one cell
    per label column right-justified to 15 characters.

    Args:
        fout: writable text stream.
        tokens: the sentence tokens.
        labels: list of label columns, each as long as ``tokens``.

    Raises:
        AssertionError: if a label column's length differs from ``tokens``.
    """
    num_tokens = len(tokens)
    for column in labels:
        assert len(column) == num_tokens
    for row, token in enumerate(tokens):
        cells = [token.ljust(15)]
        cells.extend(column[row].rjust(15) for column in labels)
        fout.write("".join(cells))
        fout.write("\n")
    fout.write("\n")
34 |
35 |
def print_to_conll(pred_labels, gold_props_file, output_filename):
    """Write predicted label columns next to the tokens of a gold props file.

    The gold file is read sentence by sentence (sentences are separated by
    blank lines).  The first field of every line is used as the token and the
    number of remaining fields as the number of propositions of the sentence;
    that many columns are then consumed, in order, from ``pred_labels``.

    Args:
        pred_labels: flat list of label columns for all sentences, in the
            order the propositions appear in the gold file.
        gold_props_file: path of the CoNLL-formatted gold props file.
        output_filename: path of the file to write.
    """
    seq_ptr = 0
    num_props_for_sentence = 0
    tokens_buf = []

    # Context managers guarantee both handles are closed, even on error.
    with open(gold_props_file, 'r') as fin, \
            open(output_filename, 'w') as fout:
        for line in fin:
            line = line.strip()
            if line == "":
                # Repeated or leading blank lines carry no sentence; they
                # used to reach ``info[0]`` on an empty list and crash.
                if tokens_buf:
                    print_sentence_to_conll(
                        fout, tokens_buf,
                        pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
                    seq_ptr += num_props_for_sentence
                    tokens_buf = []
                    num_props_for_sentence = 0
                continue
            info = line.split()
            num_props_for_sentence = len(info) - 1
            tokens_buf.append(info[0])

        # Output last sentence when the file does not end with a blank line.
        if tokens_buf:
            print_sentence_to_conll(
                fout, tokens_buf,
                pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
61 |
62 |
def _write_gold_sentence(fout, tokens_buf, props_buf, labels_buf):
    """Write one buffered sentence, masking non-predicate tokens with '-'."""
    if not props_buf:
        return
    predicate_positions = set(props_buf)
    tokens = [w if i in predicate_positions else '-'
              for i, w in enumerate(tokens_buf)]
    print_sentence_to_conll(fout, tokens, labels_buf)


def print_gold_to_conll(data, word_dict, label_dict, output_filename):
    """Write gold labels in CoNLL props format, one block per sentence.

    Consecutive samples with the same words are treated as propositions of
    one sentence.  For every sample containing a ``'B-V'`` tag, the position
    of that tag is recorded as a predicate and its labels (converted with
    ``bio_to_se``) become one column.  Sentences without any predicate are
    not written.

    Args:
        data: 4-tuple ``(x, y, num_tokens, _)``; ``x`` holds per-token
            feature sequences whose first entry is the word id, ``y`` the
            label ids and ``num_tokens`` the sentence lengths.
        word_dict: dictionary providing ``idx2str`` for word ids.
        label_dict: dictionary providing ``idx2str`` for label ids.
        output_filename: path of the file to write.
    """
    props_buf = []
    labels_buf = []
    tokens_buf = []
    prev_words = None  # None: no sentence is buffered yet

    x, y, num_tokens, _ = data
    with open(output_filename, 'w') as fout:
        for (sent, gold, slen) in zip(x, y, num_tokens):
            words = [word_dict.idx2str[w[0]] for w in sent[:slen]]
            labels = [label_dict.idx2str[l] for l in gold[:slen]]

            concat_words = ' '.join(words)
            if concat_words != prev_words:
                # New sentence: emit the previous one (if it had predicates)
                # and always restart the buffers.  Previously the token
                # buffer was kept when the old sentence had no predicate, so
                # the next sentence's labels were paired with stale tokens.
                _write_gold_sentence(fout, tokens_buf, props_buf, labels_buf)
                props_buf = []
                labels_buf = []
                tokens_buf = list(words)
                prev_words = concat_words

            if 'B-V' in labels:
                props_buf.append(labels.index('B-V'))
                labels_buf.append(bio_to_se(labels))

        _write_gold_sentence(fout, tokens_buf, props_buf, labels_buf)
98 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constants.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 | import os
3 | import random
4 |
# Directory three levels above the directory that contains this file.
ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../')

# Importing this module seeds Python's global ``random`` generator.
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)

# Location of the CoNLL evaluation script, one level above ROOT_DIR.
SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh')

# NOTE(review): both markers are empty strings here; they may originally have
# been angle-bracket tokens lost in an export -- confirm against the upstream
# file.
START_MARKER = ''
END_MARKER = ''
PADDING_TOKEN = '*PAD*'
UNKNOWN_TOKEN = '*UNKNOWN*'
NULL_LABEL = 'O'

# Scratch directory, one level above ROOT_DIR.
TEMP_DIR = join(ROOT_DIR, '../temp')

# assert os.path.exists(SRL_CONLL_EVAL_SCRIPT)
# Import-time side effect: the scratch directory is created when missing.
if not os.path.exists(TEMP_DIR):
    os.makedirs(TEMP_DIR)
23 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_extraction.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import sys
3 | import numpy as np
4 | import random
5 |
6 | from .dictionary import Dictionary
7 | from collections import OrderedDict
8 | from nltk.tree import Tree
9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN
10 | # from .reader import list_of_words_to_ids
11 |
12 |
13 | PREFIX = "--PTB-CONS-LABEL--"
14 |
15 |
def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
    """Map a sequence of strings to ids through ``dictionary.add``.

    Args:
        list_of_words: iterable of strings; ``None`` entries map to ``-1``.
        dictionary: object whose ``add(str)`` returns the id of the string.
        lowercase: lower-case every string before the lookup.
        pretrained_embeddings: optional container of known words; strings
            not found in it are replaced by ``UNKNOWN_TOKEN`` first.

    Returns:
        A list of ids, one per input entry.
    """
    def _to_id(token):
        if token is None:
            return -1
        if lowercase:
            token = token.lower()
        if (pretrained_embeddings is not None) and (token not in pretrained_embeddings):
            token = UNKNOWN_TOKEN
        return dictionary.add(token)

    return [_to_id(token) for token in list_of_words]
29 |
30 |
class constituent_tree():
    """A constituency tree of one sentence, flattened in pre-order.

    ``generate_adjacent`` walks the tree once and fills the per-node lists
    (labels, ids, head pointers, node kinds) and the character-id matrices.
    """

    def __init__(self, sentence, words, tree):
        self.sentence = sentence
        self.words = words
        self.tree = tree

        self.heads = []  # parent position of every node, -1 for the root
        self.non_terminal_nodes = []  # cons labels, e.g., NP, VP
        self.terminal_nodes = []  # words
        self.indicator = []  # 1 = non-terminal node, 2 = terminal node

        self.non_terminal_nodes_idx = []
        self.non_terminal_nodes_char_idx = []
        self.terminal_node_idx = []
        self.terminal_node_char_idx = []

        self.sentence_length = len(words)
        self.input_length = -1
        self.sentence_index = -1

    def pos(self):
        """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]"""
        return self.tree.pos()

    def traverse_tree(self, tree,
                      non_terminal_nodes, terminal_nodes,
                      non_terminal_nodes_idx, terminal_nodes_idx,
                      indicator,
                      heads,
                      parent,
                      non_terminal_dict, word_dict, pos,
                      word_embeddings):
        """Append ``tree`` and its descendants, in pre-order, to the lists.

        Args:
            tree: the (sub)tree to visit; needs ``height()``, ``label()``,
                indexing and iteration over its children.
            non_terminal_nodes, non_terminal_nodes_idx: receive the label and
                the ``non_terminal_dict`` id of every node of height > 2.
            terminal_nodes, terminal_nodes_idx: receive the word and its
                ``word_dict`` id for every node of height <= 2.
            indicator: receives 1 for a non-terminal and 2 for a terminal.
            heads: receives ``parent - 1`` for every visited node.
            parent: 1-based position of the parent among all nodes visited so
                far (0 for the root).
            non_terminal_dict, word_dict: objects with an ``add`` method.
            pos: object with an ``add`` method that receives the label of
                every terminal node (its PoS tag).
            word_embeddings: container of known words, see ``add_word``.
        """
        if tree.height() <= 2:  # 2 == ["V", Tree("Chased")]
            terminal = tree[0]  # word
            terminal_nodes.append(terminal)
            terminal_nodes_idx.append(
                constituent_tree.add_word(terminal, word_dict, word_embeddings)
            )
            indicator.append(2)
            pos.add(tree.label())
            heads.append(parent - 1)
            return

        non_terminal = tree.label()
        non_terminal_nodes.append(non_terminal)
        non_terminal_nodes_idx.append(non_terminal_dict.add(non_terminal))
        indicator.append(1)
        heads.append(parent - 1)

        # This node was just appended, so the running node count is its
        # 1-based position; its children all point back to it.
        parent = len(non_terminal_nodes) + len(terminal_nodes)
        for subtree in tree:
            self.traverse_tree(subtree,
                               non_terminal_nodes, terminal_nodes,
                               non_terminal_nodes_idx, terminal_nodes_idx,
                               indicator,
                               heads, parent,
                               non_terminal_dict, word_dict, pos,
                               word_embeddings)

    @staticmethod
    def add_unknown_labels(label, word_embeddings):
        """Give ``label`` a small random embedding if it has none yet.

        The vector has the length of the ``PADDING_TOKEN`` embedding and is
        drawn from a Gaussian with mean 0 and standard deviation 0.01.
        """
        if label not in word_embeddings:
            embedding_size = len(word_embeddings[PADDING_TOKEN])
            word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)])

    @staticmethod
    def add_word(word, word_dict, word_embeddings):
        """Return the ``word_dict`` id of ``word``.

        Words missing from ``word_embeddings`` are looked up as
        ``UNKNOWN_TOKEN`` instead.
        """
        if word not in word_embeddings:
            word = UNKNOWN_TOKEN
        idx = word_dict.add(word)
        return idx

    @staticmethod
    def get_node_char_idx(words, char_dict, lowercase=False):
        """Return a zero-padded ``(len(words), width)`` matrix of char ids.

        ``width`` is the longest word length, but never less than 5.
        """
        max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in current NumPy releases; ``int`` yields the same dtype.
        single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=int)
        for i, word in enumerate(words):
            single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
        return single_sample_char_tokens

    def generate_adjacent(self, non_terminal_dict, word_dict, char_dict, pos, word_embeddings):
        """Flatten ``self.tree`` and fill every derived attribute.

        Populates the node lists via ``traverse_tree``, then sets
        ``input_length`` (total number of nodes), ``sentence_index``
        (``input_length - sentence_length - 1``) and the character-id
        matrices of the non-terminal labels and of the words.
        """
        self.traverse_tree(self.tree,
                           self.non_terminal_nodes, self.terminal_nodes,
                           self.non_terminal_nodes_idx, self.terminal_node_idx,
                           self.indicator,
                           self.heads, len(self.heads),
                           non_terminal_dict, word_dict, pos,
                           word_embeddings
                           )
        self.input_length = len(self.non_terminal_nodes) + len(self.terminal_nodes)
        self.sentence_index = self.input_length - self.sentence_length - 1

        self.non_terminal_nodes_char_idx = constituent_tree.get_node_char_idx(
            self.non_terminal_nodes, char_dict
        )
        self.terminal_node_char_idx = constituent_tree.get_node_char_idx(
            self.terminal_nodes, char_dict
        )
135 |
136 |
def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings):
    """Read bracketed constituency trees and index them by their sentence.

    Trees may span several lines and are separated by blank lines.  The lines
    of one tree are concatenated; a line without any space gets a leading
    space so it cannot fuse with the end of the previous line.

    Args:
        file_path: path of the tree file.
        word_dict: word dictionary, extended with the words of the trees.
        char_dict: character dictionary, extended likewise.
        word_embeddings: container of known words (others become unknown).

    Returns:
        ``(cons_trees, non_terminal_dict, pos_dict)`` where ``cons_trees`` is
        an ``OrderedDict`` from the space-joined sentence (bracket tokens
        replaced by ``'-'``) to its ``constituent_tree``.
    """
    data = []
    with open(file_path, 'r') as input_file:
        sentence = ""
        for line in input_file:
            line = line.strip()
            if line == "":
                # Several blank lines in a row must not yield empty trees.
                if sentence:
                    data.append(sentence)
                sentence = ""
                continue
            if ' ' not in line:  # avoid the split of leave node of it's PoS
                line = ' ' + line
            sentence += line
        # Keep the final tree when the file lacks a trailing blank line.
        if sentence:
            data.append(sentence)
    print("Read {} sentence from {}".format(len(data), file_path))

    bracket_tokens = frozenset(["[", "]", "(", ")", "{", "}",
                                "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"])

    cons_trees = OrderedDict()
    for sentence in data:
        tree = Tree.fromstring(sentence)
        words = ['-' if w in bracket_tokens else w for w in tree.leaves()]
        key = ' '.join(words)
        cons_trees[key] = constituent_tree(key, words, tree)

    pos_dict = Dictionary(padding_token=PADDING_TOKEN)
    non_terminal_dict = Dictionary(padding_token=PADDING_TOKEN)
    for tree in cons_trees.values():
        tree.generate_adjacent(non_terminal_dict, word_dict, char_dict, pos_dict, word_embeddings)

    return cons_trees, non_terminal_dict, pos_dict
172 |
173 |
174 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_reader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import codecs
3 | import numpy as np
4 |
5 |
6 | from sortedcontainers import SortedSet
7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL
8 | from .dictionary import Dictionary
9 |
10 |
def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
    """Translate strings into dictionary ids.

    ``None`` entries yield ``-1``.  Other entries are optionally lower-cased,
    replaced by ``UNKNOWN_TOKEN`` when ``pretrained_embeddings`` is given and
    does not contain them, and finally passed to ``dictionary.add``.

    Returns:
        The list of ids in input order.
    """
    restrict_vocab = pretrained_embeddings is not None
    indices = []
    for word in list_of_words:
        if word is None:
            indices.append(-1)
        else:
            key = word.lower() if lowercase else word
            if restrict_vocab and key not in pretrained_embeddings:
                key = UNKNOWN_TOKEN
            indices.append(dictionary.add(key))
    return indices
24 |
25 |
class constituent_sentence():
    """One sentence with its constituent spans, as read from a JSON record."""

    def __init__(self, obj):
        """Take ``obj["sentence"]`` and ``obj["constituents"]`` and normalise
        the tokens in place (see ``reset_sentence``)."""
        self.sentence = obj["sentence"]
        self.constituent_spans = obj["constituents"]
        self.max_span_width = 30
        self.reset_sentence()

    def reset_sentence(self):
        """Replace bracket tokens by ``'-'`` and un-escape ``\\/`` to ``/``.

        The token list is modified in place.
        """
        bracket_tokens = ("[", "]", "(", ")", "{", "}",
                          "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-")
        for idx, token in enumerate(self.sentence):
            if token in bracket_tokens:
                token = '-'
            self.sentence[idx] = token.replace("\\/", "/")

    def tokenize_cons_spans(self, dictionary, max_cons_width=60):
        """Convert the constituent spans into parallel id sequences.

        Spans labelled ``TOP`` or ``S``, spans at least ``max_cons_width``
        tokens wide and repeated ``(start, end)`` pairs (only the first one
        is kept) are dropped.

        Args:
            dictionary: label dictionary; ``add`` returns the label id.
            max_cons_width: spans of this width or more are ignored.

        Returns:
            ``(starts, ends, label_ids)`` as three tuples, or
            ``[[], [], []]`` when no span survives.
        """
        seen = set()
        kept = []
        for start, end, label in self.constituent_spans:
            if label in ["TOP", "S"]:  # todo: add some constrains here
                continue
            if end - start + 1 >= max_cons_width:
                continue
            boundary = (start, end)
            if boundary in seen:
                continue
            seen.add(boundary)
            kept.append([int(start), int(end), int(dictionary.add(label))])

        if not kept:
            return [[], [], []]
        starts, ends, label_ids = zip(*kept)
        return starts, ends, label_ids
59 |
60 |
def read_constituent_file(file_path):
    """Read a UTF-8 JSON-lines file into ``constituent_sentence`` objects.

    Every non-blank line must hold one JSON object.  Blank lines (for example
    a trailing empty line) are skipped instead of aborting the whole load
    with a JSON decoding error.

    Args:
        file_path: path of the JSON-lines file.

    Returns:
        The list of parsed sentences, in file order.
    """
    sentences = []
    with codecs.open(file_path, encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue
            sentences.append(constituent_sentence(json.loads(line)))
    print("{} total constituent sentences number {}".format(file_path, len(sentences)))
    return sentences
70 |
71 |
def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False):
    """Turn constituent sentences into id-based training samples.

    Args:
        samples: sentences exposing ``sentence`` (token list) and
            ``tokenize_cons_spans(label_dict)``.
        word_dict: dictionary used for the word ids.
        char_dict: dictionary used for the character ids.
        label_dict: dictionary used for the constituent labels.
        lowercase: lower-case words and characters before the lookup.
        pretrained_word_embedding: container of known words used to map
            unseen words to the unknown token; ``False`` (the default) or
            ``None`` disables that mapping.

    Returns:
        A list with one tuple per sentence:
        ``(length, tokens, word_ids, char_id_matrix, cons_spans)``.
    """
    # The historical default ``False`` was forwarded as the embedding
    # container and failed on the membership test; treat it as "none given".
    if pretrained_word_embedding is False:
        pretrained_word_embedding = None

    sample_word_tokens = [list_of_words_to_ids(
        sent.sentence, word_dict, lowercase, pretrained_word_embedding) for sent in samples]
    # for the character
    sample_char_tokens = []
    for sent in samples:
        words = sent.sentence
        max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
        # ``np.int`` was an alias of the builtin ``int`` and is gone from
        # current NumPy releases; ``int`` selects the same dtype.
        single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=int)
        for i, word in enumerate(words):
            single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
        # Add the sample char tokens into the sample_char_tokens
        sample_char_tokens.append(single_sample_char_tokens)
    sample_texts = [sent.sentence for sent in samples]
    sample_lengths = [len(sent.sentence) for sent in samples]
    sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples]
    return list(zip(sample_lengths, sample_texts, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens))
89 |
90 |
def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None):
    """Load a constituent file and tokenize it against the given vocabularies.

    Both vocabularies are switched to accept new entries (and are left that
    way), so words and characters of the file are added to them.  A fresh
    label dictionary is created with ``NULL_LABEL`` as its unknown token.

    Args:
        config: not used by this function.
        file_path: path of the JSON-lines constituent file.
        word_dict: word dictionary to extend; must not be ``None`` even
            though that is the declared default.
        char_dict: character dictionary to extend; same remark.
        word_embeddings: container of known words, forwarded to the
            tokenizer.

    Returns:
        ``(cons_samples, word_dict, char_dict, cons_label_dict)``.
    """
    raw_cons_sentences = read_constituent_file(file_path)
    cons_label_dict = Dictionary()
    cons_label_dict.set_unknown_token(NULL_LABEL)

    # tokenized the data
    for vocabulary in (word_dict, char_dict):
        if vocabulary.accept_new is False:
            vocabulary.accept_new = True
    cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict,
                                      False, word_embeddings)

    banner = "=" * 10
    print(banner, "Constituent Info", banner)
    print("Extract {} tags".format(cons_label_dict.size()))
    longest = max([sample[0] for sample in cons_samples])
    print("Max sentence length: {}".format(longest))
    return cons_samples, word_dict, char_dict, cons_label_dict
112 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/dictionary.py:
--------------------------------------------------------------------------------
1 | ''' Bidirectional dictionary that maps between words and ids.
2 | '''
3 |
4 |
class Dictionary(object):
    """Two-way mapping between strings and consecutive integer ids.

    Ids are handed out in insertion order starting at 0.  Once
    ``accept_new`` is set to a false value the dictionary is frozen and
    unseen strings resolve to the unknown id.
    """

    def __init__(self, padding_token=None, unknown_token=None):
        """Create an empty dictionary, optionally registering the padding
        token first and the unknown token second."""
        self.str2idx = {}
        self.idx2str = []

        self.accept_new = True
        self.padding_token = None
        self.padding_id = None
        self.unknown_token = None
        self.unknown_id = None
        if padding_token is not None:  # add the padding info into the dictionary
            self.set_padding_token(padding_token)
        if unknown_token is not None:
            self.set_unknown_token(unknown_token)

    def set_padding_token(self, padding_token):
        """Register ``padding_token`` and remember its id."""
        self.padding_token = padding_token
        self.padding_id = self.add(padding_token)

    def set_unknown_token(self, unknown_token):
        """Register ``unknown_token`` and remember its id."""
        self.unknown_token = unknown_token
        self.unknown_id = self.add(unknown_token)

    def add(self, new_str):
        """Return the id of ``new_str``, creating it when allowed.

        In a frozen dictionary an unseen ``"C-ADV"`` resolves to the id of
        ``"O"`` and any other unseen string to the unknown id.

        Raises:
            LookupError: frozen dictionary, unseen string, no unknown token.
        """
        known = self.str2idx.get(new_str)
        if known is not None:
            return known

        if not self.accept_new:
            if new_str == "C-ADV":
                return self.str2idx["O"]
            if self.unknown_id is None:
                raise LookupError(
                    'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str)
            return self.unknown_id

        fresh_id = len(self.idx2str)
        self.str2idx[new_str] = fresh_id
        self.idx2str.append(new_str)
        return fresh_id

    def add_all(self, str_list):
        """Return the ids of all strings in ``str_list`` (see ``add``)."""
        return [self.add(item) for item in str_list]

    def get_index(self, input_str):
        """Return the id of ``input_str`` or ``None``; never adds entries."""
        if input_str in self.str2idx:
            return self.str2idx[input_str]
        return None

    def size(self):
        """Return the number of stored strings."""
        return len(self.idx2str)

    def save(self, filename):
        """Write the strings to ``filename``, one per line, in id order."""
        with open(filename, 'w') as handle:
            handle.writelines(entry + '\n' for entry in self.idx2str)

    def load(self, filename):
        """Add every non-blank, whitespace-stripped line of ``filename``."""
        with open(filename, 'r') as handle:
            for raw in handle:
                entry = raw.strip()
                if entry != '':
                    self.add(entry)
67 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/evaluation.py:
--------------------------------------------------------------------------------
1 | ''' Framework independent evaluator. Not in use yet.
2 | '''
3 | import numpy
4 | import os
5 | from os.path import join
6 | # import subprocess
7 | from .constants import ROOT_DIR
8 | from .conll_utils import print_gold_to_conll
9 | # from .measurements import Timer
10 |
11 |
class TaggerEvaluator(object):
    """Token-level accuracy evaluator that remembers the best score so far."""

    def __init__(self, data):
        """Store the evaluation data.

        ``data`` is a sequence of samples; index 1 of a sample is read as a
        ``(1, length)`` array of gold ids and index 2 as the sentence length.
        """
        self.data = data
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Compute and print the percentage of correctly predicted tokens.

        Args:
            predictions: one id array per sample, in the order of the data.

        Raises:
            AssertionError: if a prediction's length differs from the length
                stored with its sample.
        """
        # the predication's order should be the origin
        expected_lengths = [sample[2] for sample in self.data]
        for predicted, expected in zip(predictions, expected_lengths):
            assert len(predicted) == expected

        flat_predictions = numpy.concatenate(predictions)
        flat_gold = numpy.concatenate(
            [sample[1].reshape(sample[1].shape[1]) for sample in self.data])
        num_correct = numpy.equal(flat_predictions, flat_gold).sum()
        num_total = flat_gold.shape[0]
        self.accuracy = (100.0 * num_correct) / num_total
        print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct,
                                                num_total))

    def evaluate(self, predictions):
        """Score ``predictions`` and update ``has_best``/``best_accuracy``."""
        self.compute_accuracy(predictions)
        improved = self.accuracy > self.best_accuracy
        self.has_best = improved
        if improved:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
40 |
41 |
class PropIdEvaluator(object):
    """F1 evaluator for predicate identification.

    Despite the attribute name, ``accuracy`` holds the F1 score (in percent)
    of the target label.
    """

    def __init__(self, data, label_dict, target_label='V',
                 use_se_marker=False):
        """Store the data and look up the id of ``target_label``.

        Args:
            data: 4-tuple whose second item holds the gold ids and whose
                fourth item holds the per-token weights.
            label_dict: dictionary exposing ``str2idx``.
            target_label: the label to score.
            use_se_marker: accepted for compatibility; not used.
        """
        self.data = data
        self.label_dict = label_dict
        self.target_label_id = label_dict.str2idx[target_label]
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Compute and print weighted precision, recall and F1 (percent).

        A score whose denominator is zero (nothing predicted, no gold target,
        or precision + recall equal to zero) is reported as 0.0 instead of
        NaN.

        Args:
            predictions: array of predicted ids, same shape as the gold ids.
        """
        _, y, _, weights = self.data
        identified = numpy.equal(predictions, self.target_label_id)
        num_correct = numpy.sum(
            numpy.logical_and(numpy.equal(predictions, y), identified) * weights)
        num_identified = numpy.sum(identified * weights)
        num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights)

        self.precision = (100.0 * num_correct / num_identified
                          if num_identified != 0 else 0.0)
        self.recall = (100.0 * num_correct / num_gold
                       if num_gold != 0 else 0.0)
        total = self.precision + self.recall
        self.accuracy = (2 * self.precision * self.recall / total
                         if total != 0 else 0.0)
        print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format(
            self.accuracy, self.precision, self.recall))

    def evaluate(self, predictions):
        """Score ``predictions`` and update ``has_best``/``best_accuracy``."""
        self.compute_accuracy(predictions)
        self.has_best = self.accuracy > self.best_accuracy
        if self.has_best:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
75 |
76 |
class SRLEvaluator(TaggerEvaluator):
    """Placeholder SRL evaluator; its accuracy computation is not implemented."""

    def __init__(self):
        # The parent initializer is not called, so ``self.data`` is never
        # set; the best accuracy starts at -1.0 rather than the parent's 0.0.
        self.best_accuracy = -1.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        # Stub: prints a marker and calls exit(), so the inherited
        # ``evaluate`` never gets past this call.
        print("exit()")
        exit()
85 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/features.py:
--------------------------------------------------------------------------------
def get_srl_features(sentences, config, feature_dicts=None):
    """Build the extra per-token features requested by ``config``.

    Only the ``"predicate"`` feature is supported: every token gets 2 when
    its position equals the sentence's predicate position (``sent[2]``) and 1
    otherwise, 0 being reserved for padding.

    Args:
        sentences: samples; index 1 is the token sequence and index 2 the
            predicate position.
        config: object with ``features``, ``feature_sizes`` and
            ``use_se_marker`` attributes.
        feature_dicts: not used.

    Returns:
        ``(features, feature_shapes)`` where ``features`` is a ``zip``
        iterator yielding, per sentence, a tuple with one id list per feature
        and ``feature_shapes`` holds ``[2, size]`` per feature.

    TODO: Support adding more features.
    """
    names = config.features
    sizes = config.feature_sizes
    se_marker = config.use_se_marker

    columns = []
    shapes = []
    for name, size in zip(names, sizes):
        if name != "predicate":
            continue
        shift = int(se_marker)  # legacy value, immediately superseded
        shift = 1  # pad is in the position 0
        column = []
        for sent in sentences:
            predicate_position = sent[2]
            column.append([int((position == predicate_position) + shift)
                           for position in range(len(sent[1]))])
        columns.append(column)
        shapes.append([2, size])
    return (zip(*columns), shapes)
17 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/inference.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
def get_transition_params(label_strs):
    """Construct transition scores (0 for allowed, -inf for invalid).

    A transition into an ``I-X`` tag is only allowed from ``B-X`` or from
    ``I-X`` itself; every other transition is allowed.

    Args:
        label_strs: A [num_tags,] sequence of BIO-tags.
    Returns:
        A [num_tags, num_tags] float32 matrix of transition scores, indexed
        as ``[previous_tag, tag]``.
    """
    num_tags = len(label_strs)
    transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32)
    for i, prev_label in enumerate(label_strs):
        for j, label in enumerate(label_strs):
            if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]:
                # ``numpy.NINF`` no longer exists in NumPy 2.x; ``-numpy.inf``
                # is the same value on every NumPy version.
                transition_params[i, j] = -numpy.inf
    return transition_params
18 |
19 |
def viterbi_decode(score, transition_params):
    """Find the highest scoring tag sequence with the Viterbi algorithm.

    Adapted from the TensorFlow implementation; meant for test time only.

    Args:
        score: A [seq_len, num_tags] matrix of unary potentials.
        transition_params: A [num_tags, num_tags] matrix of binary
            potentials, indexed as ``[previous_tag, tag]``.
    Returns:
        viterbi: A [seq_len] list of integers containing the highest scoring
            tag indices.
        viterbi_score: The score of that sequence.
    """
    num_steps = score.shape[0]
    trellis = numpy.zeros_like(score)
    backpointers = numpy.zeros_like(score, dtype=numpy.int32)

    # Forward pass: best score of any path ending in each tag at each step.
    trellis[0] = score[0]
    for step in range(1, num_steps):
        candidates = trellis[step - 1][:, numpy.newaxis] + transition_params
        trellis[step] = score[step] + candidates.max(axis=0)
        backpointers[step] = candidates.argmax(axis=0)

    # Backward pass: follow the back-pointers from the best final tag.
    final_scores = trellis[-1]
    tag = numpy.argmax(final_scores)
    reversed_path = [tag]
    for step in range(num_steps - 1, 0, -1):
        tag = backpointers[step][tag]
        reversed_path.append(tag)

    viterbi = reversed_path[::-1]
    viterbi_score = numpy.max(final_scores)
    return viterbi, viterbi_score
45 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/io_utils.py:
--------------------------------------------------------------------------------
1 | from google.protobuf.internal import encoder
2 |
3 | _EncodeVarint = encoder._VarintEncoder()
4 |
5 |
def write_delimited_to(out_file, message):
    """Write ``message`` to ``out_file`` prefixed by its varint-encoded size.

    Args:
        out_file: writable binary stream.
        message: object exposing ``ByteSize()`` and ``SerializeToString()``
            (presumably a protobuf message -- confirm against the callers).
    """
    size_prefix = []
    _EncodeVarint(size_prefix.append, message.ByteSize())
    out_file.write(b"".join(size_prefix))
    payload = message.SerializeToString()
    out_file.write(payload)
12 |
13 |
def read_gold_props(gold_props_file):
    """Read the first column of a CoNLL-formatted gold props file.

    Sentences are separated by blank lines; every blank line closes the
    current sentence (so two blank lines in a row yield an empty sentence).

    Args:
        gold_props_file: path of the file to read.

    Returns:
        A list of sentences, each a list with the first whitespace-separated
        field of every line.
    """
    gold_props = []
    current = []
    with open(gold_props_file, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped == '':
                gold_props.append(current)
                current = []
            else:
                current.append(stripped.split()[0])
    # A file without a trailing blank line still ends its last sentence.
    if len(current) > 0:
        gold_props.append(current)
    return gold_props
31 |
32 |
def write_predprops_to(predictions,
                       label_dict,
                       input_file,
                       output_file,
                       gold_props_file=None,
                       output_props_file=None):
    """ Write predicted predicate information to files.

    For every token predicted as ``'V'`` one line
    ``<position>\\t<sentence> ||| <tags>`` is written to ``output_file``,
    with ``B-V`` at the predicate position and ``O`` elsewhere.  When
    ``output_props_file`` is given, a CoNLL props block is written per
    sentence as well.

    The gold props are loaded and validated before any output file is opened,
    so a failed validation no longer leaves truncated output files behind,
    and all files are closed even when an error occurs.

    Arguments:
      predictions: Predictions from the predicate identification model.
        Is a numpy array of size [num_sentences, max_sentence_length].
      label_dict: Label dictionary.
      input_file: Input sequential tagging file.
      output_file: Output SRL file with identified predicates.
      gold_props_file: Input file with gold predicates in CoNLL format.
      output_props_file: Output SRL file with identified predicates, in CoNLL format.

    Raises:
      AssertionError: if the gold props file and ``predictions`` disagree on
        the number of sentences.
    """
    from contextlib import ExitStack

    if gold_props_file is not None and gold_props_file != '':
        gold_props = read_gold_props(gold_props_file)
        print(len(gold_props), len(predictions))
        assert len(gold_props) == len(predictions)
    else:
        gold_props = None

    write_props = output_props_file is not None and output_props_file != ''

    with ExitStack() as stack:
        fin = stack.enter_context(open(input_file, 'r'))
        fout = stack.enter_context(open(output_file, 'w'))
        fout_props = (stack.enter_context(open(output_props_file, 'w'))
                      if write_props else None)

        for sent_id, line in enumerate(fin):
            # Read original sentence from input file.
            raw_sent = line.split('|||')[0].strip()
            tokens = raw_sent.split(' ')
            slen = len(tokens)
            pred = predictions[sent_id, :slen]
            props = []

            for (t, p) in enumerate(pred):
                if label_dict.idx2str[p] == 'V':
                    out_tags = ['O'] * slen
                    out_tags[t] = 'B-V'
                    fout.write(str(t) + '\t' + raw_sent + ' ||| '
                               + ' '.join(out_tags) + '\n')
                    props.append(t)

            if fout_props is None:
                continue

            # Sentences are separated (not terminated) by a blank line.
            if sent_id > 0:
                fout_props.write('\n')
            for t in range(slen):
                lemma = 'P' + tokens[t].lower()
                # In order for CoNLL evaluation script to run, we need to output the same
                # lemma as the gold predicate in the CoNLL-formatted file.
                if gold_props is not None and gold_props[sent_id][t] != '-':
                    lemma = gold_props[sent_id][t]
                fout_props.write(lemma if t in props else '-')
                # One column per predicted predicate of this sentence.
                for p in props:
                    fout_props.write('\t(V*)' if t == p else '\t*')
                fout_props.write('\n')

    print('Predicted predicates in sequential-tagging format written to: {}.'.
          format(output_file))
    if write_props:
        print('CoNLL-formatted predicate information written to: {}.'.format(
            output_props_file))
112 |
113 |
def bio_to_spans(predictions, label_dict):
    """ Convert BIO-based predictions to a set of arguments.

    Arguments:
      predictions: A single integer array, already truncated to the original sequence lengths.
      label_dict: Label dictionary.
    Returns:
      A sequence of labeled arguments: [ ["ARG_LABEL", span_start, span_end], ... ],
      ordered by their positions; both span ends are inclusive.
    """
    tags = [label_dict.idx2str[p] for p in predictions]
    final_index = len(predictions) - 1
    spans = []
    for position, tag in enumerate(tags):
        if tag == 'O':
            continue
        role = tag[2:]
        # Open a span on a B tag, when none exists yet, or on a label change.
        opens = (tag[0] == 'B' or not spans
                 or role != tags[position - 1][2:])
        if opens:
            spans.append([role, position, -1])
        # Close it at the sequence end, before a B tag or a label change.
        closes = (position == final_index
                  or tags[position + 1][0] == 'B'
                  or role != tags[position + 1][2:])
        if closes:
            spans[-1][2] = position
    return spans
136 |
137 |
def print_to_readable(predictions, num_tokens, label_dict, input_path,
                      output_path):
    """ Print predictions to human-readable format.

    Every input line is expected to look like
    ``<predicate position> <tokens...> ||| ...``.  For each line the sentence,
    its predicate and one line per predicted argument span are written,
    followed by a blank line.  Both files are opened through context managers
    so their handles are released even when a line cannot be processed.

    Arguments:
      predictions: Per-sample sequences of predicted label ids.
      num_tokens: Not used.
      label_dict: Label dictionary.
      input_path: Input sequential tagging file.
      output_path: File to write the readable output to.
    """
    with open(input_path, 'r') as fin, open(output_path, 'w') as fout:
        for sample_id, line in enumerate(fin):
            info = line.split('|||')[0].strip().split()
            pid = int(info[0])
            sent = info[1:]
            fout.write(' '.join(sent) + '\n')
            fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid))

            tags = predictions[sample_id]
            for label, start, end in bio_to_spans(tags, label_dict):
                fout.write('\t\t{}: {}\n'.format(
                    label, " ".join(sent[start:end + 1])))
            fout.write('\n')
160 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/measurements.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 |
4 |
class Timer:
    """Context manager that reports wall-clock durations on stdout.

    ``tick`` may be called inside the ``with`` block to report intermediate
    timings; leaving the block prints the total duration unless the timer
    was created with ``active=False``.
    """

    def __init__(self, name, active=True):
        # An inactive timer keeps no name, which silences the exit report.
        if active:
            self.name = name
        else:
            self.name = None

    def __enter__(self):
        now = time.time()
        self.start = now
        self.last_tick = now
        return self

    def __exit__(self, *args):
        if self.name is None:
            return
        elapsed = time.time() - self.start
        print("{} duration was {}.".format(self.name, self.readable(elapsed)))

    def readable(self, seconds):
        """Format ``seconds`` (truncated to an integer) as ``H:MM:SS``."""
        whole_seconds = int(seconds)
        return str(datetime.timedelta(seconds=whole_seconds))

    def tick(self, message):
        """Print the time since the start and since the previous tick."""
        now = time.time()
        since_start = self.readable(now - self.start)
        since_tick = self.readable(now - self.last_tick)
        print("{} took {} ({} since last tick).".format(
            message, since_start, since_tick))
        self.last_tick = now
28 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/numpy_utils.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
def orth_normal_initializer(factor=1.0, seed=None):
    ''' Create an initializer that returns scaled (semi-)orthogonal matrices.

    Reference: Exact solutions to the nonlinear dynamics of learning in
                  deep linear neural networks
                Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf
    Adapted from the original implementation by Mingxuan Wang.

    The returned function takes ``(shape, dtype)`` with a 2-D ``shape``.  It
    builds a fresh ``RandomState(seed)`` on every call, so a fixed seed gives
    the same matrix each time for the same shape.
    '''
    def _initializer(shape, dtype):
        assert len(shape) == 2
        rng = numpy.random.RandomState(seed)
        rows, cols = shape[0], shape[1]

        def _random_orthogonal(size):
            # QR of a Gaussian matrix; multiplying by the signs of R's
            # diagonal removes the sign ambiguity of the factorisation.
            gaussian = rng.randn(size, size).astype(dtype)
            q, r = numpy.linalg.qr(gaussian)
            return q * numpy.sign(numpy.diag(r))

        if rows == cols:
            return _random_orthogonal(rows) * factor

        # Rectangular case: combine the leading parts of two square
        # orthogonal matrices (row-sized first, then column-sized).
        left = _random_orthogonal(rows)
        right = _random_orthogonal(cols)
        n_min = min(rows, cols)
        return numpy.dot(left[:, :n_min], right[:n_min, :]) * factor

    return _initializer
31 |
32 |
def block_orth_normal_initializer(input_shapes,
                                  output_shapes,
                                  factor=1.0,
                                  seed=None):
    ''' Initialize a gigantic weight matrix where each block is a normal orthogonal matrix.
    Input:
      - input_shapes: the sizes of each block along dimension 0.
      - output_shapes: the sizes of each block along dimension 1.
      for example input_shapes = [100, 128] output_shapes=[100,100,100,100]
        indicates eight blocks with shapes [100,100], [128,100], etc.

    The returned function takes ``(shape, dtype)``; ``shape`` is only checked
    to be 2-D, the size of the result follows from the block sizes.
    '''
    def _initializer(shape, dtype):
        assert len(shape) == 2
        make_block = orth_normal_initializer(factor, seed)
        block_rows = []
        for dim_in in input_shapes:
            row = [make_block([dim_in, dim_out], dtype)
                   for dim_out in output_shapes]
            # Blocks of one row share dim_in and are joined side by side.
            block_rows.append(numpy.concatenate(row, 1))
        return numpy.concatenate(block_rows, 0)

    return _initializer
56 |
57 |
def random_normal_initializer(mean=0.0, stddev=0.01, seed=None):
    ''' Build an initializer that samples from a normal distribution.
    Input:
    - mean, stddev: parameters of the normal distribution.
    - seed: seed of the numpy RandomState created on every call of the
      returned function.
    Output:
    - a function (shape, dtype) -> numpy array of samples cast to dtype.
    '''
    def _initializer(shape, dtype):
        generator = numpy.random.RandomState(seed)
        samples = generator.normal(mean, stddev, shape)
        return numpy.asarray(samples, dtype)

    return _initializer
64 |
65 |
def all_zero_initializer():
    ''' Build an initializer (shape, dtype) -> numpy array filled with zeros. '''
    def _initializer(shape, dtype):
        zeros = numpy.zeros(shape)
        return zeros.astype(dtype)

    return _initializer
71 |
72 |
def uniform_initializer(value=0.01):
    ''' Build an initializer (shape, dtype) -> numpy array in which every
    entry equals `value` (a constant fill, not a random uniform sample).
    '''
    def _initializer(shape, dtype):
        filled = numpy.full(shape, value)
        return filled.astype(dtype)

    return _initializer
78 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/scores_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: scores.proto
3 |
4 | import sys
5 | import tensor_pb2 as tensor__pb2
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | # from google.protobuf import descriptor_pb2
11 |
12 |
13 | # @@protoc_insertion_point(imports)
# Helper that turns the str literal below into bytes: identity on Python 2,
# latin-1 encoding on Python 3.
_b = sys.version_info[0] < 3 and (lambda x: x) or (
    lambda x: x.encode('latin1'))


# Default protobuf symbol database; descriptors and messages of this module
# are registered with it below.
_sym_db = _symbol_database.Default()


# File-level descriptor of scores.proto. `serialized_pb` is the serialized
# file descriptor and must stay byte-exact; the continuation line of the
# string literal deliberately starts at column 0 so that no whitespace is
# inserted into the payload.
DESCRIPTOR = _descriptor.FileDescriptor(
    name='scores.proto',
    package='',
    syntax='proto2',
    serialized_pb=_b(
        '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\
sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto'
    ),
    dependencies=[
        tensor__pb2.DESCRIPTOR,
    ])
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
33 |
# Descriptor of message `SentenceScoresProto`, which has two fields:
# `sentence_id` (field number 1) and `scores` (field number 2).
_SENTENCESCORESPROTO = _descriptor.Descriptor(
    name='SentenceScoresProto',
    full_name='SentenceScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # type=13 / cpp_type=3 / label=1 are protobuf FieldDescriptor codes
        # (presumably TYPE_UINT32 / CPPTYPE_UINT32 / LABEL_OPTIONAL -- confirm
        # against the protobuf reference).
        _descriptor.FieldDescriptor(
            name='sentence_id',
            full_name='SentenceScoresProto.sentence_id',
            index=0,
            number=1,
            type=13,
            cpp_type=3,
            label=1,
            has_default_value=False,
            default_value=0,
            message_type=None,
            enum_type=None,
            containing_type=None,
            is_extension=False,
            extension_scope=None,
            options=None),
        # type=11 / cpp_type=10 presumably denote a message-typed field; its
        # message_type is left as None here and filled in after both
        # descriptors exist (see the `fields_by_name` assignments below the
        # descriptors in this module).
        _descriptor.FieldDescriptor(name='scores',
                                    full_name='SentenceScoresProto.scores',
                                    index=1,
                                    number=2,
                                    type=11,
                                    cpp_type=10,
                                    label=1,
                                    has_default_value=False,
                                    default_value=None,
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Offsets of this message inside DESCRIPTOR's serialized_pb payload.
    serialized_start=30,
    serialized_end=102,
)
84 |
# Descriptor of message `ScoresProto`, which has a single field `sentences`
# (field number 1).
_SCORESPROTO = _descriptor.Descriptor(
    name='ScoresProto',
    full_name='ScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # label=3 with default_value=[] -- presumably LABEL_REPEATED; the
        # element message type is assigned after this descriptor is built.
        _descriptor.FieldDescriptor(name='sentences',
                                    full_name='ScoresProto.sentences',
                                    index=0,
                                    number=1,
                                    type=11,
                                    cpp_type=10,
                                    label=3,
                                    has_default_value=False,
                                    default_value=[],
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Offsets of this message inside DESCRIPTOR's serialized_pb payload.
    serialized_start=104,
    serialized_end=158,
)
119 |
# Resolve the message-typed fields now that all descriptors exist, then
# expose both message descriptors through the file descriptor.
_SENTENCESCORESPROTO.fields_by_name[
    'scores'].message_type = tensor__pb2._TENSORPROTO
_SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO
DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO
DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO

# Concrete message class for SentenceScoresProto, built from its descriptor
# and registered with the symbol database.
SentenceScoresProto = _reflection.GeneratedProtocolMessageType(
    'SentenceScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SENTENCESCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:SentenceScoresProto)
         ))
_sym_db.RegisterMessage(SentenceScoresProto)

# Concrete message class for ScoresProto, built and registered the same way.
ScoresProto = _reflection.GeneratedProtocolMessageType(
    'ScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:ScoresProto)
         ))
_sym_db.RegisterMessage(ScoresProto)
143 |
144 | # @@protoc_insertion_point(module_scope)
145 |
--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/syntactic_extraction.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import codecs
3 |
4 | from .dictionary import Dictionary
5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN
6 | from collections import OrderedDict
7 |
8 |
class SyntacticTree(object):
    """Per-sentence container for a dependency tree.

    The textual per-token lists (word forms, POS tags, heads, labels) start
    with an entry for an artificial "Root" token; the id lists start empty.
    """

    def __init__(self, sentence_id):
        self.sentence_id = sentence_id

        # Surface information, with the artificial root at position 0.
        self.word_forms = ["Root"]
        self.pos_forms = ["Root"]
        self.labels = ["Root"]
        self.heads = [0]

        # Numeric views, empty until they are filled in by other code.
        self.word_forms_ids = []
        self.labels_id = []
        self.char_ids = [[]]  # 2D
20 |
class SyntacticCONLL(object):
    """Reader and tokenizer for dependency trees stored in a CoNLL-style file.

    Attributes:
        file_name: path given to the last ``read_from_file`` call.
        trees: list of ``SyntacticTree`` objects that were kept.
        sample_dep_data: list of per-sentence tuples built by
            ``tokenize_dep_trees`` (``None`` until then).
    """

    def __init__(self):
        self.file_name = ""
        self.trees = []
        self.sample_dep_data = None

    @staticmethod
    def _dependency_probability(token):
        """Return the arc probability stored in a column, or 1.0 if absent.

        A non-numeric placeholder (for example "_") is treated as
        probability 1.0, i.e. the arc is never pruned.
        """
        try:
            return float(token)
        except ValueError:
            return 1.0

    def _keep_tree(self, a_tree, find_root, max_sentence_length):
        """Append ``a_tree`` to ``self.trees`` unless it is too long."""
        # Note: word_forms includes the artificial "Root" entry.
        if len(a_tree.word_forms) <= max_sentence_length:
            assert find_root is True
            self.trees.append(a_tree)

    def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8):
        """Read tab-separated dependency trees, one token per line.

        Sentences are separated by blank lines. Columns used (0-based):
        1 = word form, 3 = POS tag, 6 = head (0 means root; stored as
        head - 1), 7 = dependency label, 9 = arc probability. A token whose
        probability is below ``prune_ratio`` gets its head replaced by -1.

        Fixes over the previous version:
        * the file is closed even when parsing fails;
        * the probability column is actually parsed (on Python 3 every
          token is a ``str``, so the old ``isinstance`` test always
          produced 1.0 and nothing was ever pruned);
        * a final sentence that is not followed by a blank line is kept.

        Args:
            filename: path of the UTF-8 encoded file.
            max_sentence_length: sentences whose token list (including the
                artificial root) is longer than this are skipped.
            prune_ratio: probability threshold below which an arc is cut.
        """
        self.file_name = filename

        print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio))

        sentence_id = 0
        a_tree = SyntacticTree(sentence_id)
        find_root = False
        with codecs.open(self.file_name, 'r', encoding="utf8") as conll_file:
            for line in conll_file:
                if not line.strip():  # blank line: end of the sentence
                    sentence_id += 1
                    self._keep_tree(a_tree, find_root, max_sentence_length)
                    a_tree = SyntacticTree(sentence_id)
                    find_root = False
                    continue
                tokens = line.strip().split('\t')
                a_tree.word_forms.append(tokens[1])
                a_tree.pos_forms.append(tokens[3])
                head = int(tokens[6]) - 1  # root's head is 0 in the file
                if head == -1:
                    assert tokens[7] == "root"
                    find_root = True
                a_tree.heads.append(head)
                a_tree.labels.append(tokens[7])
                dep_prob = self._dependency_probability(tokens[9])
                if dep_prob < prune_ratio:
                    a_tree.heads[-1] = -1

        # Flush a trailing sentence that has no terminating blank line.
        if len(a_tree.word_forms) > 1:
            sentence_id += 1
            self._keep_tree(a_tree, find_root, max_sentence_length)

        print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees)))

    @staticmethod
    def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
        """Map each item to an id via ``dictionary.add``.

        ``None`` items map to -1. When ``pretrained_embeddings`` is given,
        items that are not contained in it are replaced by UNKNOWN_TOKEN
        before the lookup.
        """
        ids = []
        for s in list_of_words:
            if s is None:
                ids.append(-1)
                continue
            if lowercase:
                s = s.lower()
            if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
                s = UNKNOWN_TOKEN
            ids.append(dictionary.add(s))
        return ids

    def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None):
        """Fill the id fields of every tree and build ``sample_dep_data``.

        Each entry of ``sample_dep_data`` is the tuple
        (word forms, word ids, char-id matrix, heads array, label-id array).
        """
        for tree in self.trees:
            tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False,
                                                                      pretrained_word_embedding)
            words = tree.word_forms
            # The width is at least 5 -- compare with character cnn filter width.
            max_word_length = max([len(w) for w in words] + [3, 4, 5])
            single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
            for i, word in enumerate(words):
                # Iterating a word yields its characters.
                single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict)
            tree.char_ids = single_sample_char_tokens

            tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict)

        sample_word_texts = [tree.word_forms for tree in self.trees]
        sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees]
        sample_char_ids = [tree.char_ids for tree in self.trees]
        sample_heads = [np.asarray(tree.heads) for tree in self.trees]
        sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees]
        self.sample_dep_data = list(zip(sample_word_texts,
                                        sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids))

    def get_syntactic_label_dict(self, syn_label_dict=None):
        """Append the id of every dependency label to ``tree.labels_id``.

        A fresh ``Dictionary`` is created when none is passed; a passed
        dictionary must have ``accept_new`` set to False.

        Returns:
            The dictionary that was used.
        """
        if syn_label_dict is None:
            syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
        else:
            assert syn_label_dict.accept_new is False
        for tree in self.trees:
            for label in tree.labels:
                tree.labels_id.append(syn_label_dict.add(label))
        return syn_label_dict
112 |
113 |
def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings):
    """Read and tokenize the dependency trees stored in ``file_path``.

    The file is read with ``max_sentence_length=2000``. The result maps the
    space-joined word forms of a sentence (without the artificial "Root")
    to its tree; when two trees share the same text, the later one replaces
    the earlier value.

    Returns:
        An ``OrderedDict`` from sentence text to tree.
    """
    reader = SyntacticCONLL()
    reader.read_from_file(file_path, max_sentence_length=2000)
    reader.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings)

    # word_forms[0] is "Root", which is not part of the lookup key.
    return OrderedDict(
        (' '.join(parsed.word_forms[1:]), parsed) for parsed in reader.trees
    )
124 |
125 |
class SyntacticRepresentation(object):
    """Per-token vector representations loaded from a text file.

    Attributes:
        file_name: path given to the last ``read_from_file`` call.
        representations: list (one entry per sentence) of lists of
            ``float32`` numpy vectors (one per line of the file).
    """

    def __init__(self):
        self.file_name = ""
        self.representations = []

    def read_from_file(self, filename):
        """Load representations; sentences are separated by blank lines.

        Every non-blank line is ``<anything>\\t<v1> <v2> ...``; only the
        second tab-separated column is used. The file is now closed even if
        parsing fails, and a final sentence that is not followed by a blank
        line is kept instead of being silently dropped.
        """
        self.file_name = filename
        print("Reading lstm representations from {}".format(self.file_name))
        each_sentence_representations = []
        with open(self.file_name, 'r') as representation_file:
            for line in representation_file:
                if not line.strip():  # blank line: end of the sentence
                    self.representations.append(each_sentence_representations)
                    each_sentence_representations = []
                    continue
                columns = line.strip().split('\t')
                values = columns[1].split(' ')
                each_sentence_representations.append(np.asarray(values, dtype=np.float32))
        if each_sentence_representations:
            self.representations.append(each_sentence_representations)
        print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations)))

    def minus_by_the_predicate(self, corpus_tensors):
        """Replace every non-root vector by (predicate vector - vector).

        For each item, ``data[0][0][0]`` is used as the sentence id and the
        argmax of ``data[0][2]`` as the predicate position (presumably a
        one-hot predicate indicator -- confirm against the caller). Each
        sentence id is processed only once.

        The predicate vector is captured before the loop; previously it was
        overwritten with zeros when the loop reached the predicate position,
        so every later token received ``0 - vector``.
        """
        processed_sentence_ids = set()
        for data in corpus_tensors:
            sentence_id = data[0][0][0]
            predicate_id = data[0][2].argmax()
            if sentence_id in processed_sentence_ids:
                continue
            processed_sentence_ids.add(sentence_id)
            sentence_reps = self.representations[sentence_id]
            predicate_rep = np.array(sentence_reps[predicate_id], copy=True)
            for j in range(1, len(sentence_reps)):  # Root (index 0) is left untouched.
                sentence_reps[j] = predicate_rep - sentence_reps[j]

    def check_math_corpus(self, lengths):
        """Verify that sentence ``i`` has ``lengths[i] + 1`` vectors.

        The extra vector is the leading Root entry. On the first mismatch a
        diagnostic is printed and the process exits via ``exit()``. (The
        diagnostic used to raise IndexError because ``length`` was passed to
        ``print`` instead of ``format``.)
        """
        for i, length in enumerate(lengths):
            actual = len(self.representations[i])
            if actual != length + 1:  # 1 means the first one, Root.
                print(i, length, actual)
                print("sentence {} doesn't match: lstm representation {} vs corpus {}".format(i, actual, length))
                exit()
        print("LSTM representation match the corpus!")
172 |
--------------------------------------------------------------------------------
/src/orl-4.1/__init__.py:
--------------------------------------------------------------------------------
# Names exported by a star-import of this package.
__all__ = ["neural_srl"]
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/Encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | from torch.nn.utils.rnn import pack_padded_sequence as pack
5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
6 | from .TreeGRU import DTTreeGRU, TDTreeGRU
7 | from .Tree import creatTree
8 |
9 |
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder followed by two tree-structured GRUs.

    The GRU output is projected back to ``input_size`` and then fed to a
    ``DTTreeGRU`` and a ``TDTreeGRU``; their outputs are concatenated.
    """
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)

        # Sub-modules are created in a fixed order (GRU, projection, trees).
        self.rnn = nn.GRU(input_size=input_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          bidirectional=True)  # batch_first = False
        self.transform = nn.Linear(in_features=2 * hidden_size,
                                   out_features=input_size,
                                   bias=True)
        self.dt_tree = DTTreeGRU(input_size, hidden_size)
        self.td_tree = TDTreeGRU(input_size, hidden_size)

    def forward(self, input, heads, lengths=None, hidden=None):
        """Encode a batch of sentences.

        inputs: [L, B, H], including the -ROOT-
        heads: [heads] * B, one head sequence per sentence
        lengths: optional sequence lengths used to pack the GRU input
        hidden: optional initial GRU state

        Returns the pair (outputs, output_t): the concatenated tree-GRU
        outputs with dimensions 0 and 1 swapped, and the concatenated
        tree-GRU hidden states with a new leading dimension.
        """
        embedded = self.dropout(input)

        if lengths is None:
            rnn_out, hidden_t = self.rnn(embedded, hidden)
        else:
            # Lengths data is wrapped inside a Variable.
            rnn_out, hidden_t = self.rnn(pack(embedded, lengths), hidden)
            rnn_out = unpack(rnn_out)[0]

        projected = self.dropout(self.transform(rnn_out))
        max_length, batch_size, _ = projected.size()

        # Column b holds the traversal order of sentence b, padded with -1.
        indexes = np.full((max_length, batch_size), -1, dtype=np.int32)
        trees = []
        for column, sentence_heads in enumerate(heads):
            root, nodes = creatTree(sentence_heads)
            visit_order = root.traverse()
            for step, node_index in enumerate(visit_order):
                indexes[step, column] = node_index
            trees.append(nodes)

        dt_outputs, dt_hidden_ts = self.dt_tree.forward(projected, indexes, trees)
        td_outputs, td_hidden_ts = self.td_tree.forward(projected, indexes, trees)

        outputs = torch.cat([dt_outputs, td_outputs], dim=2).transpose(0, 1)
        output_t = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0)

        return outputs, output_t
68 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/Tree.py:
--------------------------------------------------------------------------------
class Tree(object):
    """Node of a dependency tree with separate left and right child lists."""

    def __init__(self, index):
        self.index = index
        self.parent = None
        self.is_left = False
        self.left_children = list()
        self.right_children = list()
        self.left_num = 0
        self.right_num = 0
        self._depth = -1
        self.order = []

    def add_left(self, child):
        """
        :param child: a Tree object to attach as a left child
        :return:
        """
        child.parent, child.is_left = self, True
        self.left_children.append(child)
        self.left_num += 1

    def add_right(self, child):
        """
        :param child: a Tree object to attach as a right child
        :return:
        """
        child.parent, child.is_left = self, False
        self.right_children.append(child)
        self.right_num += 1

    def size(self):
        """Return the number of nodes in this subtree (cached in _size)."""
        if not hasattr(self, '_size'):
            total = 1
            for child in self.left_children:
                total += child.size()
            for child in self.right_children:
                total += child.size()
            self._size = total
        return self._size

    def depth(self):
        """Return the height of this subtree; a node without children is 0."""
        if self._depth > 0:
            return self._depth
        deepest = 0
        has_children = self.left_num + self.right_num > 0
        if has_children:
            for child in self.left_children + self.right_children:
                deepest = max(deepest, child.depth())
            deepest += 1
        self._depth = deepest
        return self._depth

    def traverse(self):
        """Return node indexes in the order left subtrees, right subtrees, self.

        The result is stored in (and returned as) ``self.order``; a
        non-empty ``order`` is returned as-is.
        """
        if self.order:
            return self.order

        for child in self.left_children:
            self.order.extend(child.traverse())
        for child in self.right_children:
            self.order.extend(child.traverse())
        self.order.append(self.index)  # the node itself comes last
        return self.order
73 |
74 |
def creatTree(heads):
    """Build one Tree node per token and link the nodes through ``heads``.

    ``heads[idx]`` is the index of the head of token ``idx``; -1 marks the
    root (the "mszhang" convention; "kiro" uses 0). A child is attached on
    the left of its head when the head comes later in the sentence and on
    the right when it comes earlier.

    Invalid entries (a negative head other than -1, or a token that is its
    own head) are reported with a message and the token is left unattached.
    Previously a negative head was reported but the token was still attached
    to ``tree[head]`` through Python's negative indexing.

    :param heads: a sequence of head indexes for one sentence
    :return: (root, tree) -- the root node (None if no head equals -1; the
        last one if several do) and the list of all nodes
    """
    # current sentence has already been numberized [form, head, rel]
    tree = [Tree(idx) for idx, _ in enumerate(heads)]
    root = None

    for idx, head in enumerate(heads):
        if head == -1:  # -1 mszhang, 0 kiro
            root = tree[idx]
        elif head < 0:
            print('error: multi roots')
        elif head > idx:
            tree[head].add_left(tree[idx])
        elif head < idx:
            tree[head].add_right(tree[idx])
        else:
            print('error: head is it self.')

    return root, tree
96 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/__init__.py:
--------------------------------------------------------------------------------
# Submodules exported by a star-import of this package.
__all__ = ["Encoder", "Tree", "TreeGRU"]
2 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/gcn.py:
--------------------------------------------------------------------------------
1 | """
2 | GCN model for relation extraction.
3 | """
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.autograd import Variable
9 | from ..shared.constants import PAD_ID
10 | import numpy as np
11 |
12 |
class GCN(nn.Module):
    """Graph convolution stack with an optional BiLSTM in front of it.

    Args:
        config: object providing ``gcn_rnn``, ``gcn_input_dropout`` and
            ``gcn_gcn_dropout``; when ``gcn_rnn`` is True also
            ``gcn_rnn_hidden``, ``gcn_rnn_layers`` and ``gcn_rnn_dropout``.
        input_dim: feature size of the incoming embeddings.
        mem_dim: stored on the module; not used by the layers below.
        num_layers: number of graph convolution layers.
        apply_layer_norm: when True, each layer's output becomes
            ``LayerNormalization(output + layer_input)``. The default False
            reproduces the historical behaviour, in which the normalized
            residual was computed and then discarded.
    """

    def __init__(self, config, input_dim, mem_dim, num_layers, apply_layer_norm=False):
        super(GCN, self).__init__()
        self.config = config
        self.input_dim = input_dim
        self.mem_dim = mem_dim
        self.layers = num_layers
        self.apply_layer_norm = apply_layer_norm

        # rnn layer
        if self.config.gcn_rnn is True:
            input_size = self.input_dim
            self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True,
                               dropout=self.config.gcn_rnn_dropout, bidirectional=True)
            self.in_dim = self.config.gcn_rnn_hidden * 2
            self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout)  # use on last layer output
        else:
            # Without the BiLSTM the GCN consumes the embeddings directly.
            # (in_dim used to be undefined on this path, so construction
            # failed with AttributeError.)
            self.in_dim = self.input_dim

        self.in_drop = nn.Dropout(self.config.gcn_input_dropout)
        self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout)

        # gcn layer
        self.W = nn.ModuleList()
        self.layer_normalization = nn.ModuleList()

        for _ in range(self.layers):
            self.W.append(nn.Linear(self.in_dim, self.in_dim))
            self.layer_normalization.append(LayerNormalization(self.in_dim))

    def conv_l2(self):
        """Return the sum of squared weights and biases of all GCN layers."""
        conv_weights = []
        for w in self.W:
            conv_weights += [w.weight, w.bias]
        return sum([x.pow(2).sum() for x in conv_weights])

    def encode_with_rnn(self, rnn_inputs, masks, batch_size):
        """Run the BiLSTM over ``rnn_inputs`` (batch first).

        Sequence lengths are the per-row counts of entries equal to 1 in
        ``masks``. Rows are sorted by length for packing and restored to
        their original order afterwards.
        """
        # reshape(-1) instead of squeeze(): squeeze() yields a 0-d tensor for
        # a batch of one, which breaks the indexing below.
        seq_lens = masks.data.eq(1).long().sum(1).reshape(-1)
        # Create the zero state on CPU and move it to the inputs' device so
        # the module also runs without CUDA.
        h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers,
                                use_cuda=False)
        h0, c0 = h0.to(rnn_inputs.device), c0.to(rnn_inputs.device)

        # SORT YOUR TENSORS BY LENGTH!
        seq_lens, perm_idx = seq_lens.sort(0, descending=True)

        rnn_inputs = rnn_inputs[perm_idx]
        # pack_padded_sequence expects the lengths on CPU.
        rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens.cpu(), batch_first=True)
        rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0))
        rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)

        _, unperm_idx = perm_idx.sort(0)
        rnn_outputs = rnn_outputs[unperm_idx]
        return rnn_outputs

    def forward(self, adj, embs, masks):
        """Apply the (optional) BiLSTM and the GCN layers.

        Args:
            adj: batched adjacency matrices (used with ``bmm``).
            embs: input embeddings, batch first.
            masks: tensor whose first dimension is the batch size; entries
                equal to 1 are counted as real tokens by the BiLSTM path.

        Returns:
            (gcn_outputs, mask) where ``mask`` is True for positions whose
            adjacency row and column sums are both zero.
        """
        batch_size = masks.size()[0]
        embs = self.in_drop(embs)
        # rnn layer
        if self.config.gcn_rnn is True:
            gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size))
        else:
            gcn_inputs = embs

        # gcn layer
        denom = adj.sum(2).unsqueeze(2) + 1  # +1 accounts for the self loop
        mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)

        for layer_idx in range(self.layers):
            layer_input = gcn_inputs
            Ax = adj.bmm(gcn_inputs)
            AxW = self.W[layer_idx](Ax)
            AxW = AxW + self.W[layer_idx](gcn_inputs)  # self loop
            AxW = AxW / denom

            gcn_inputs = self.gcn_drop(F.relu(AxW))
            # NOTE(review): the original code called
            # layer_normalization[l].forward(gcn_inputs + x) and threw the
            # result away, so residual + layer norm never took effect. It is
            # opt-in here to keep existing checkpoints numerically unchanged.
            if self.apply_layer_norm:
                gcn_inputs = self.layer_normalization[layer_idx](gcn_inputs + layer_input)

        return gcn_inputs, mask
92 |
93 |
def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
    """Create all-zero initial hidden and cell states for an LSTM.

    Args:
        batch_size: number of sequences in the batch.
        hidden_dim: hidden size of the LSTM.
        num_layers: number of stacked layers.
        bidirectional: doubles the first dimension when True.
        use_cuda: move the states to the GPU. When CUDA is not available the
            states stay on the CPU instead of raising (the previous version
            crashed on CPU-only machines with the default arguments).

    Returns:
        (h0, c0), two distinct tensors of shape
        (num_layers * num_directions, batch_size, hidden_dim).
    """
    total_layers = num_layers * 2 if bidirectional else num_layers
    state_shape = (total_layers, batch_size, hidden_dim)
    # Two separate tensors, so h0 and c0 never alias the same storage.
    h0 = Variable(torch.zeros(*state_shape), requires_grad=False)
    c0 = Variable(torch.zeros(*state_shape), requires_grad=False)
    if use_cuda and torch.cuda.is_available():
        return h0.cuda(), c0.cuda()
    return h0, c0
102 |
103 |
class LayerNormalization(nn.Module):
    ''' Layer normalization over the last dimension with learnable gain and bias '''

    def __init__(self, d_hid, eps=1e-3):
        super().__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)   # gain
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)  # bias

    def forward(self, z):
        ''' Normalize z along its last dimension.

        The input is returned unchanged when its dimension 1 has size 1.
        eps is added to the standard deviation (not to the variance).
        '''
        if z.size(1) == 1:
            return z
        mean = torch.mean(z, dim=-1, keepdim=True)
        std = torch.std(z, dim=-1, keepdim=True)
        normalized = (z - mean) / (std + self.eps)
        return normalized * self.a_2 + self.b_2
121 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Basic operations on trees.
3 | """
4 |
5 | import numpy as np
6 | from collections import defaultdict
7 |
8 |
class Tree(object):
    """
    Reused tree object from stanfordnlp/treelstm.

    Each node keeps a reference to its parent and an ordered list of
    children. Iterating a node yields the node itself followed by all of its
    descendants in depth-first pre-order.
    """

    def __init__(self):
        self.parent = None
        # head probability
        self.phead = -1
        self.num_children = 0
        self.children = list()

    def add_child(self, child):
        """Attach ``child`` as the last child of this node."""
        child.parent = self
        self.num_children += 1
        self.children.append(child)

    def size(self):
        """Return the number of nodes in this subtree (cached after first use).

        The cache lookup now uses a default, so the first call no longer
        raises AttributeError, and ``range`` replaces the Python 2 ``xrange``.
        """
        cached = getattr(self, '_size', None)
        if cached is not None:
            return cached
        count = 1
        for i in range(self.num_children):
            count += self.children[i].size()
        self._size = count
        return self._size

    def depth(self):
        """Return the height of this subtree; a node without children is 0."""
        cached = getattr(self, '_depth', None)
        if cached is not None:
            return cached
        count = 0
        if self.num_children > 0:
            for i in range(self.num_children):
                child_depth = self.children[i].depth()
                if child_depth > count:
                    count = child_depth
            count += 1
        self._depth = count
        return self._depth

    def __iter__(self):
        yield self
        for c in self.children:
            for x in c:
                yield x
53 |
54 |
def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos):
    """
    Convert a sequence of head indexes into a tree object.

    ``head`` holds 1-based head positions, with 0 marking the root; only the
    first ``len_`` entries are used. ``tokens`` is truncated the same way but
    is not used afterwards.

    If ``prune`` is negative, the full tree is built. Otherwise the tokens
    whose ``subj_pos`` / ``obj_pos`` value is 0 are taken as the subject and
    object, the lowest common ancestor (LCA) of all of them is located, and
    only nodes whose distance to the subject-object path is at most ``prune``
    are kept; the LCA becomes the root.

    Every created node gets ``idx`` (token position) and ``dist`` (distance
    to the path, or -1 in the unpruned case).

    Returns the root ``Tree`` node (asserted to be not None).
    """
    tokens = tokens[:len_].tolist()
    head = head[:len_].tolist()
    root = None

    if prune < 0:
        # No pruning: one node per token, linked directly through `head`.
        nodes = [Tree() for _ in head]

        for i in range(len(nodes)):
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = -1  # just a filler
            if h == 0:
                root = nodes[i]
            else:
                nodes[h - 1].add_child(nodes[i])
    else:
        # find dependency path
        subj_pos = [i for i in range(len_) if subj_pos[i] == 0]
        obj_pos = [i for i in range(len_) if obj_pos[i] == 0]

        # Common ancestors of all subject/object tokens seen so far.
        cas = None

        subj_ancestors = set(subj_pos)
        for s in subj_pos:
            # Walk from the subject token up to the root, collecting the chain.
            h = head[s]
            tmp = [s]
            while h > 0:
                tmp += [h - 1]
                subj_ancestors.add(h - 1)
                h = head[h - 1]

            if cas is None:
                cas = set(tmp)
            else:
                cas.intersection_update(tmp)

        obj_ancestors = set(obj_pos)
        for o in obj_pos:
            h = head[o]
            tmp = [o]
            while h > 0:
                tmp += [h - 1]
                obj_ancestors.add(h - 1)
                h = head[h - 1]
            # NOTE(review): `cas` is still None here if subj_pos was empty.
            cas.intersection_update(tmp)

        # find lowest common ancestor
        if len(cas) == 1:
            lca = list(cas)[0]
        else:
            # Count, for every common ancestor, how many of its children are
            # themselves common ancestors.
            child_count = {k: 0 for k in cas}
            for ca in cas:
                if head[ca] > 0 and head[ca] - 1 in cas:
                    child_count[head[ca] - 1] += 1

            # the LCA has no child in the CA set
            for ca in cas:
                if child_count[ca] == 0:
                    lca = ca
                    break

        # Nodes on the subject-object path: all ancestors below the LCA,
        # plus the LCA itself.
        path_nodes = subj_ancestors.union(obj_ancestors).difference(cas)
        path_nodes.add(lca)

        # compute distance to path_nodes
        dist = [-1 if i not in path_nodes else 0 for i in range(len_)]

        for i in range(len_):
            if dist[i] < 0:
                # Climb towards the root until a path node is reached or the
                # root is passed (index -1).
                stack = [i]
                while stack[-1] >= 0 and stack[-1] not in path_nodes:
                    stack.append(head[stack[-1]] - 1)

                if stack[-1] in path_nodes:
                    # Distance grows by one per step away from the path node.
                    for d, j in enumerate(reversed(stack)):
                        dist[j] = d
                else:
                    for j in stack:
                        if j >= 0 and dist[j] < 0:
                            dist[j] = int(1e4)  # aka infinity

        highest_node = lca
        # Keep only the nodes that are close enough to the path.
        nodes = [Tree() if dist[i] <= prune else None for i in range(len_)]

        for i in range(len(nodes)):
            if nodes[i] is None:
                continue
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = dist[i]
            # The LCA is the new root, so it is not attached to its own head.
            if h > 0 and i != highest_node:
                assert nodes[h - 1] is not None
                nodes[h - 1].add_child(nodes[i])

        root = nodes[highest_node]

    assert root is not None
    return root
157 |
158 |
def tree_to_adj(sent_len, tree, directed=True, self_loop=False):
    """
    Convert a tree object to an (numpy) adjacency matrix.

    The result is a float32 matrix of shape (sent_len, sent_len) with
    ``ret[parent.idx, child.idx] = 1`` for every edge reachable from
    ``tree``. With ``directed=False`` the transpose is added; with
    ``self_loop=True`` the diagonal entry of every visited node is set to 1.
    """
    adjacency = np.zeros((sent_len, sent_len), dtype=np.float32)

    # Breadth-first walk: `pending` grows while `cursor` moves through it.
    pending = [tree]
    visited = []
    cursor = 0
    while cursor < len(pending):
        node = pending[cursor]
        cursor += 1

        visited.append(node.idx)

        for child in node.children:
            adjacency[node.idx, child.idx] = 1
        pending.extend(node.children)

    if not directed:
        adjacency = adjacency + adjacency.T

    if self_loop:
        for position in visited:
            adjacency[position, position] = 1

    return adjacency
184 |
185 |
def tree_to_dist(sent_len, tree):
    """
    Collect the ``dist`` attribute of every node yielded by iterating ``tree``.

    Returns an int64 array of length ``sent_len``; positions without a node
    keep the value -1.
    """
    distances = -1 * np.ones(sent_len, dtype=np.int64)

    for visited in tree:
        distances[visited.idx] = visited.dist

    return distances
193 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/implicit_syntactic_representations.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from torch.nn.utils.rnn import pad_sequence
6 |
7 |
8 | from .model import drop_sequence_sharedmask, _model_var
9 | from .HighWayLSTM import Highway_Concat_BiLSTM
10 | from .layer import NonLinear, Biaffine
11 |
12 |
class ImplicitDependencyRepresentations(nn.Module):
    """Biaffine dependency scorer built on a highway BiLSTM.

    The module serves two purposes:

    * ``forward`` scores arcs and relations for a batch and returns the summed
      arc + relation cross-entropy loss together with the arc logits.
    * ``get_reps`` returns a learned softmax-weighted mix of the per-layer
      BiLSTM outputs together with the arc logits.

    NOTE(review): tensor layouts are not stated in this file. The code indexes
    the LSTM output as (batch, seq, feature) and the relation logits as
    (batch, dependent, head, label) -- confirm against ``Biaffine`` and
    ``Highway_Concat_BiLSTM``.
    """

    def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size):
        super(ImplicitDependencyRepresentations, self).__init__()
        self.config = config
        self.lstm_input_size = lstm_input_size
        self.lstm_hidden_size = lstm_hidden_size
        self.dep_label_space_size = dep_label_space_size
        # Scalar-mix parameters: one softmax weight per BiLSTM layer plus a global scale.
        self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0]))
        self.softmax_dep_weights = nn.ParameterList(
            [nn.Parameter(torch.FloatTensor([0.0]))
             for _ in range(self.config.dep_num_lstm_layers)])
        # Deliberately no ``self.cuda = True`` flag: that attribute shadowed
        # ``nn.Module.cuda()``. The target device is read from the parameters.

        self.dep_bilstm = Highway_Concat_BiLSTM(
            input_size=self.lstm_input_size,
            hidden_size=self.lstm_hidden_size,  # // 2 for MyLSTM
            num_layers=self.config.dep_num_lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.input_dropout_prob,
            dropout_out=config.recurrent_dropout_prob
        )

        # dependency parsing module
        self.mlp_arc_dep = NonLinear(
            input_size=2 * config.lstm_hidden_size,
            hidden_size=config.mlp_arc_size + config.mlp_rel_size,
            activation=nn.LeakyReLU(0.1))
        self.mlp_arc_head = NonLinear(
            input_size=2 * config.lstm_hidden_size,
            hidden_size=config.mlp_arc_size + config.mlp_rel_size,
            activation=nn.LeakyReLU(0.1))

        # Kept for compatibility with code that may read these counters.
        self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
        self.arc_num = int(config.mlp_arc_size / 100)
        self.rel_num = int(config.mlp_rel_size / 100)

        self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False))
        self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size,
                                     bias=(True, True))

    def _module_device(self):
        """Return the device of the first parameter, or CPU if there is none."""
        first_param = next(self.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def init_masks(self, batch_size, lengths):
        """Build a float mask with 1.0 on real tokens and 0.0 on padding.

        Args:
            batch_size: number of sentences (rows of the mask).
            lengths: per-sentence lengths (tensor, array or list of ints).

        Returns:
            Float tensor of shape (batch_size, max(lengths)) placed on the
            same device as this module's parameters.
        """
        device = self._module_device()
        lengths = torch.as_tensor(lengths, dtype=torch.long, device=device)
        max_sent_length = int(lengths.max())
        indices = torch.arange(max_sent_length, device=device).unsqueeze(0).expand(batch_size, -1)
        return (indices < lengths.unsqueeze(1)).float()

    def _project(self, lstm_out):
        """Apply the dependent/head MLPs, with shared-mask dropout in training mode."""
        if self.training:
            lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp)

        x_all_dep = self.mlp_arc_dep(lstm_out)
        x_all_head = self.mlp_arc_head(lstm_out)

        if self.training:
            x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
            x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
        return x_all_dep, x_all_head

    def _score_arcs(self, x_all_dep, x_all_head):
        """Score arcs from the first ``mlp_arc_size`` features of both projections."""
        arc_size = self.config.mlp_arc_size
        x_arc_dep = x_all_dep[:, :, :arc_size].contiguous()
        x_arc_head = x_all_head[:, :, :arc_size].contiguous()
        arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
        return torch.squeeze(arc_logit, dim=3)

    def _score_rels(self, x_all_dep, x_all_head):
        """Score relations from the features that follow the arc slice."""
        arc_size = self.config.mlp_arc_size
        x_rel_dep = x_all_dep[:, :, arc_size:].contiguous()
        x_rel_head = x_all_head[:, :, arc_size:].contiguous()
        return self.rel_biaffine(x_rel_dep, x_rel_head)

    def forward(self, num_sentences, context_embeddings, sent_lengths, dep):
        """Score a batch and compute the dependency loss.

        Args:
            num_sentences: batch size.
            context_embeddings: input to the BiLSTM.
            sent_lengths: per-sentence lengths; must support ``.tolist()``.
            dep: pair ``(heads, rels)`` of per-sentence gold tensors.

        Returns:
            ``(loss, arc_logits)``. The logits are also stored on
            ``self.arc_logits`` / ``self.rel_logits``.
        """
        masks = self.init_masks(num_sentences, sent_lengths)
        lstm_out, _ = self.dep_bilstm(context_embeddings, masks)

        x_all_dep, x_all_head = self._project(lstm_out)
        self.arc_logits = self._score_arcs(x_all_dep, x_all_head)
        self.rel_logits = self._score_rels(x_all_dep, x_all_head)

        heads, rels = dep[0], dep[1]
        loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist())  # compute the dep loss
        return loss, self.arc_logits

    def compute_dep_loss(self, true_arcs, true_rels, lengths):
        """Return summed arc + relation cross-entropy for the stored logits.

        Args:
            true_arcs: sequence of 1-D tensors with the gold head index per token.
            true_rels: sequence of 1-D tensors with the gold relation id per token.
            lengths: per-sentence lengths (list of ints).

        Padded positions (target -1) are ignored. Head columns at or beyond a
        sentence's length receive a -1000 additive penalty before the softmax.
        """
        arc_logits, rel_logits = self.arc_logits, self.rel_logits
        device = arc_logits.device
        b, l1, l2 = arc_logits.size()

        # Two paddings of the same heads: 0 for safe indexing, -1 for ignore_index.
        head_index = pad_sequence(true_arcs, padding_value=0, batch_first=True).to(device).long()
        gold_heads = pad_sequence(true_arcs, padding_value=-1, batch_first=True).to(device).long()
        gold_rels = pad_sequence(true_rels, padding_value=-1, batch_first=True).to(device).long()

        # Penalise head positions that lie in the padding of each sentence.
        lengths_t = torch.as_tensor(lengths, dtype=torch.long, device=device)
        is_pad = torch.arange(l2, device=device).unsqueeze(0) >= lengths_t.unsqueeze(1)
        length_mask = is_pad.to(arc_logits.dtype) * -1000.0
        masked_arc_logits = arc_logits + length_mask.unsqueeze(1)

        arc_loss = F.cross_entropy(
            masked_arc_logits.reshape(b * l1, l2), gold_heads.reshape(b * l1),
            ignore_index=-1, reduction="sum")

        # Pick, for every dependent, the relation scores of its gold head.
        d = rel_logits.size(3)
        # Negative indices wrap like Python indexing, matching the loop this replaces.
        head_index = torch.where(head_index < 0, head_index + l2, head_index)
        gather_index = head_index.reshape(b, l1, 1, 1).expand(b, l1, 1, d)
        output_logits = torch.gather(rel_logits, 2, gather_index).squeeze(2)

        rel_loss = F.cross_entropy(
            output_logits.reshape(b * l1, d), gold_rels.reshape(b * l1),
            ignore_index=-1, reduction="sum")

        return arc_loss + rel_loss

    def get_reps(self, context_embeddings, masks):
        """Return the mixed BiLSTM layer representation and the arc logits.

        The representation is ``dep_gamma * sum_i softmax(w)_i * layer_i`` over
        the first ``dep_num_lstm_layers`` per-layer outputs of the BiLSTM.

        In training mode dropout is now applied to the LSTM output before the
        MLPs, as in ``forward``; previously the dropped tensor was computed and
        then discarded.
        """
        dep_lstm_out, dep_lstm_outputs = self.dep_bilstm(context_embeddings, masks)
        normed_weights = F.softmax(torch.cat(list(self.softmax_dep_weights)), dim=0)
        dep_representations = self.dep_gamma * sum(
            normed_weights[i] * dep_lstm_outputs[i]
            for i in range(self.config.dep_num_lstm_layers))

        x_all_dep, x_all_head = self._project(dep_lstm_out)
        arc_logit = self._score_arcs(x_all_dep, x_all_head)
        return dep_representations, arc_logit
166 |
167 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from .layer import MyLSTM, NonLinear, Biaffine
5 |
6 |
def _model_var(parameters, x):
    """Place ``x`` on the same CUDA device as the model and wrap it as a Variable.

    Args:
        parameters: iterable of model parameters (e.g. ``module.parameters()``).
        x: tensor to move.

    Returns:
        ``torch.autograd.Variable(x)``, moved to the device of the first
        trainable parameter when that parameter lives on CUDA.

    If no parameter is trainable, the first parameter of any kind decides the
    device; with no parameters at all ``x`` is left where it is. Previously
    both cases leaked a ``StopIteration``.
    """
    anchor = None
    for param in parameters:
        if anchor is None:
            anchor = param  # fallback when nothing is trainable
        if param.requires_grad:
            anchor = param
            break
    if anchor is not None and anchor.is_cuda:
        x = x.cuda(anchor.get_device())
    return torch.autograd.Variable(x)
12 |
13 |
def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb):
    """Drop word and tag embeddings independently per token, then rescale.

    A Bernoulli keep-mask is drawn for every (batch, position) of each input.
    Both masks are then multiplied by ``3 / (2 * word_mask + tag_mask + 1e-12)``
    before being applied, so a token with both inputs kept is scaled by 1, and
    a token that lost one input has the surviving one scaled up.

    Args:
        word_embeddings: tensor of size (batch, seq, word_dim).
        tag_embeddings: tensor whose first two sizes match ``word_embeddings``.
        dropout_emb: probability of dropping an embedding.

    Returns:
        ``(word_embeddings, tag_embeddings)`` after masking.

    The masks are created with ``new_full`` so they share dtype and device with
    the inputs. The former ``torch.Tensor(...)`` re-wrap of the Bernoulli
    sample was redundant and, on some PyTorch versions, rejects tensors whose
    type differs from the default tensor type.
    """
    batch_size, seq_length, _ = word_embeddings.size()
    keep_prob = 1 - dropout_emb

    word_masks = torch.bernoulli(
        word_embeddings.new_full((batch_size, seq_length), keep_prob))
    tag_masks = torch.bernoulli(
        tag_embeddings.new_full((batch_size, seq_length), keep_prob))

    # 1e-12 avoids a division by zero when both inputs of a token are dropped.
    scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12)
    # Add a trailing axis so each mask broadcasts over the embedding dimension.
    word_masks = (word_masks * scale).unsqueeze(dim=2)
    tag_masks = (tag_masks * scale).unsqueeze(dim=2)

    return word_embeddings * word_masks, tag_embeddings * tag_masks
35 |
36 |
def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
    """Apply dropout with one mask per (batch, feature) shared across time steps.

    Args:
        inputs: 3-D tensor, (batch, seq, hidden) when ``batch_first`` is true,
            otherwise (seq, batch, hidden).
        dropout: drop probability; kept values are scaled by ``1 / (1 - dropout)``.
        batch_first: whether ``inputs`` has the batch axis first.

    Returns:
        The masked tensor with its first two axes swapped relative to the
        sequence-major working layout.

    NOTE(review): the final transpose is unconditional, so with
    ``batch_first=False`` the result comes back as (batch, seq, hidden) rather
    than in the input layout. Kept as-is; confirm no caller relies on it before
    changing.
    """
    seq_major = inputs.transpose(0, 1) if batch_first else inputs
    seq_length, batch_size, hidden_size = seq_major.size()

    keep_prob = 1 - dropout
    # The mask is sampled with the default tensor type, then cast to the input's type.
    keep = torch.bernoulli(torch.Tensor(batch_size, hidden_size).fill_(keep_prob))
    keep = keep.type(seq_major.type()) / keep_prob

    # A leading singleton axis broadcasts the same mask over every time step.
    dropped = seq_major * keep.unsqueeze(0)
    return dropped.transpose(1, 0)
51 |
52 |
class ParserModel(nn.Module):  # build a biaffine parser model
    """Biaffine dependency parser over word, pretrained-word and tag embeddings.

    ``forward`` returns arc logits and head-conditioned relation logits.
    """

    def __init__(self, vocab, config, pretrained_embedding):
        """Create embeddings, the BiLSTM encoder, the MLPs and the biaffine scorers.

        Args:
            vocab: object exposing ``vocab_size``, ``extvocab_size``,
                ``tag_size`` and ``rel_size``.
            config: object exposing the dimension and dropout settings read below.
            pretrained_embedding: numpy array copied into the frozen
                ``extword_embed`` table.
        """
        super(ParserModel, self).__init__()
        self.config = config
        self.word_embed = nn.Embedding(vocab.vocab_size,
                                       config.word_dims,
                                       padding_idx=0)
        self.extword_embed = nn.Embedding(vocab.extvocab_size,
                                          config.word_dims,
                                          padding_idx=0)
        self.tag_embed = nn.Embedding(vocab.tag_size,
                                      config.tag_dims,
                                      padding_idx=0)

        # Trainable word embeddings start at zero; they are added to the
        # frozen pretrained embeddings in ``forward``.
        word_init = np.zeros((vocab.vocab_size, config.word_dims),
                             dtype=np.float32)
        self.word_embed.weight.data.copy_(torch.from_numpy(word_init))

        tag_init = np.random.randn(vocab.tag_size,
                                   config.tag_dims).astype(np.float32)
        self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))

        self.extword_embed.weight.data.copy_(
            torch.from_numpy(pretrained_embedding))
        self.extword_embed.weight.requires_grad = False

        self.lstm = MyLSTM(
            input_size=config.word_dims + config.tag_dims,
            hidden_size=config.lstm_hiddens,
            num_layers=config.lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout_in=config.dropout_lstm_input,
            dropout_out=config.dropout_lstm_hidden,
        )

        self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens,
                                     hidden_size=config.mlp_arc_size +
                                     config.mlp_rel_size,
                                     activation=nn.LeakyReLU(0.1))
        self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens,
                                      hidden_size=config.mlp_arc_size +
                                      config.mlp_rel_size,
                                      activation=nn.LeakyReLU(0.1))

        # The MLP output is cut into chunks of 100 features in ``forward``;
        # these counters say how many chunks belong to arcs and relations.
        self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
        self.arc_num = int(config.mlp_arc_size / 100)  # config: 500
        self.rel_num = int(config.mlp_rel_size / 100)  # config: 100

        self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size,
                                     1, bias=(True, False))
        self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size,
                                     vocab.rel_size, bias=(True, True))

    def forward(
            self, words, extwords, tags,
            masks):  # words [batch, max_sentence_length], padding with zeros
        """Return ``(arc_logit, rel_logit_cond)`` for a padded batch.

        Args:
            words, extwords, tags: index tensors of size (batch, max_len).
            masks: mask passed through to the LSTM.
        """
        # x = (batch size, sequence length, dimension of embedding)
        x_word_embed = self.word_embed(words)
        x_extword_embed = self.extword_embed(extwords)
        x_embed = x_word_embed + x_extword_embed
        x_tag_embed = self.tag_embed(tags)

        if self.training:
            x_embed, x_tag_embed = drop_input_independent(
                x_embed, x_tag_embed, self.config.dropout_emb)

        x_lexical = torch.cat((x_embed, x_tag_embed), dim=2)

        outputs, _ = self.lstm(x_lexical, masks, None)
        outputs = outputs.transpose(1, 0)

        if self.training:
            outputs = drop_sequence_sharedmask(outputs,
                                               self.config.dropout_mlp)

        x_all_dep = self.mlp_arc_dep(outputs)
        x_all_head = self.mlp_arc_head(outputs)

        if self.training:
            x_all_dep = drop_sequence_sharedmask(x_all_dep,
                                                 self.config.dropout_mlp)
            x_all_head = drop_sequence_sharedmask(x_all_head,
                                                  self.config.dropout_mlp)

        # The chunk size is passed positionally: the old ``split_size=``
        # keyword is not accepted by current ``torch.split``.
        x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
        x_all_head_splits = torch.split(x_all_head, 100, dim=2)

        x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
        x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)

        arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
        arc_logit = torch.squeeze(arc_logit, dim=3)

        x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
        x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)

        rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
        return arc_logit, rel_logit_cond
152 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/pre_trained_language_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.nn import Parameter
5 | from torch.nn.utils.rnn import pad_sequence
6 | from transformers import BertModel
7 | from transformers import BertTokenizer
8 |
9 |
class ScalarMix(torch.nn.Module):
    """Learned weighted sum of several tensors.

    Computes ``gamma * sum_k softmax(s)_k * layer_k`` where ``s`` holds one
    scalar per layer (initialised to 1) and ``gamma`` is a global scale
    (initialised to 1.0).
    """

    def __init__(self, mixture_size=4):
        super().__init__()
        self.mixture_size = mixture_size
        self.scalar_parameters = Parameter(torch.ones(mixture_size))
        self.gamma = Parameter(torch.tensor(1.0))

    def forward(self, layers):
        """Mix ``layers`` (a sequence of same-shaped tensors) into one tensor."""
        layer_weights = F.softmax(self.scalar_parameters, dim=0)
        mixed = 0
        for layer_weight, layer in zip(layer_weights, layers):
            mixed = mixed + layer_weight * layer
        return self.gamma * mixed
22 |
23 |
class Bert_Embedding(nn.Module):
    """Word-level features from a scalar mix of the last BERT layers.

    ``bert_dim`` is accepted for interface compatibility but is not used here.
    """

    def __init__(self, bert_path, bert_layer, bert_dim, freeze=True):
        """Load BERT from ``bert_path`` and mix its last ``bert_layer`` layers.

        When ``freeze`` is true, gradients are disabled for all BERT weights.
        """
        super(Bert_Embedding, self).__init__()
        self.bert_layer = bert_layer
        self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True)
        print(self.bert.config)
        self.scalar_mix = ScalarMix(bert_layer)

        if freeze:
            self.freeze()

    def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask):
        """Return one vector per word, averaged over that word's sub-words.

        Args:
            subword_idxs: padded sub-word ids passed to BERT.
            subword_masks: attention mask passed to BERT.
            token_starts_masks: unused; kept for interface compatibility.
            text_masks: boolean mask selecting the sub-word positions whose
                vectors are scattered into the word grid.
            subwords_mask: boolean tensor of size (batch, words, max_subwords)
                marking which sub-word slots of each word are filled.

        Returns:
            Tensor of size (batch, words, hidden). Words with no sub-words
            yield zeros.
        """
        # NOTE: this switches the whole module to eval mode on every call.
        self.eval()
        outputs = self.bert(
            subword_idxs,
            attention_mask=subword_masks
        )
        # Index rather than unpack: position 2 holds the per-layer hidden
        # states both for plain tuples and for dict-like model outputs.
        hidden_states = outputs[2]
        bert_outs = self.scalar_mix(hidden_states[-self.bert_layer:])

        # Scatter the selected sub-word vectors into a (batch, words, subwords, hidden) grid.
        word_grid = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1))
        word_grid.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks])

        # Clamp to 1 so that padding words divide 0.0 by 1 instead of by 0.
        subwords_lens = subwords_mask.sum(-1).clamp(min=1)
        return word_grid.sum(2) / subwords_lens.unsqueeze(-1)

    def freeze(self):
        """Disable gradients for every BERT parameter."""
        for para in self.bert.parameters():
            para.requires_grad = False
56 |
57 |
class Bert_Encoder(nn.Module):
    """Word-level features from the last BERT layer, optionally partly frozen."""

    def __init__(self, bert_path, bert_layer, freeze=False, fix_layer_number=None):
        """Load BERT from ``bert_path``.

        Args:
            bert_path: location handed to ``BertModel.from_pretrained``.
            bert_layer: stored on the instance; not read by ``forward``.
            freeze: disable gradients for every BERT parameter.
            fix_layer_number: if not ``None``, passed to ``fix_several_layers``.
        """
        super(Bert_Encoder, self).__init__()
        self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True)
        self.bert_layer = bert_layer

        if freeze:
            self.freeze()
        if fix_layer_number is not None:
            self.fix_several_layers(fix_layer_number)

    def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask):
        """Return one vector per word, averaged over that word's sub-words.

        Args:
            subword_idxs: padded sub-word ids passed to BERT.
            subword_masks: attention mask passed to BERT.
            token_starts_masks: unused; kept for interface compatibility.
            text_masks: boolean mask selecting the sub-word positions whose
                vectors are scattered into the word grid.
            subwords_mask: boolean tensor of size (batch, words, max_subwords)
                marking which sub-word slots of each word are filled.

        Returns:
            Tensor of size (batch, words, hidden). Words with no sub-words
            yield zeros.
        """
        outputs = self.bert(
            subword_idxs,
            token_type_ids=None,
            attention_mask=subword_masks,
        )
        # Index rather than unpack: position 2 holds the per-layer hidden
        # states both for plain tuples and for dict-like model outputs.
        bert_outs = outputs[2][-1]  # the last layer of BERT outputs

        # Scatter the selected sub-word vectors into a (batch, words, subwords, hidden) grid.
        # The grid is already dense, so no further padding step is needed.
        word_grid = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1))
        word_grid.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks])

        # Clamp to 1 so that padding words divide 0.0 by 1 instead of by 0.
        subwords_lens = subwords_mask.sum(-1).clamp(min=1)
        return word_grid.sum(2) / subwords_lens.unsqueeze(-1)

    def freeze(self):
        """Disable gradients for every BERT parameter."""
        for para in self.bert.parameters():
            para.requires_grad = False

    def fix_several_layers(self, layer_numer):
        """Freeze the embeddings and the first ``layer_numer`` encoder layers.

        A parameter is frozen when its name contains ``"embeddings"`` or
        ``"encoder.layer.<i>."`` for ``i < layer_numer``. The trailing dot keeps
        layer 1 from also matching layers 10, 11, and so on. A negative
        ``layer_numer`` freezes nothing.
        """
        fixed_layer_names = ["embeddings"] if layer_numer >= 0 else []
        fixed_layer_names.extend(
            "encoder.layer." + str(i) + '.' for i in range(layer_numer))
        print("{} will be fixed".format(fixed_layer_names))
        for name, para in self.bert.named_parameters():
            if any(layer_name in name for layer_name in fixed_layer_names):
                para.requires_grad = False
100 |
101 |
class Vocab(object):
    """Turns tokenised sentences into BERT sub-word ids and alignment masks."""

    def __init__(self, bert_vocab_path):
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_vocab_path, do_lower_case=False
        )

    def numericalize(self, seqs, training=True):
        """Convert sentences (lists of word strings) into sub-word tensors.

        Each sentence is wrapped in ``[CLS]`` / ``[SEP]``. A word for which the
        tokenizer returns no pieces is replaced by a single ``[PAD]`` piece.
        ``training`` is accepted but not used.

        Returns:
            A 5-tuple ``(subwords, masks, starts, text_masks, subwords_mask)``.
            The first four are lists with one 1-D tensor per sentence:
            sub-word ids, an all-true mask, a mask marking the first piece of
            every word, and a mask that is false only on ``[CLS]`` / ``[SEP]``.
            ``subwords_mask`` is a single float tensor of size
            (sentences, max_words, max_pieces) with 1 for every filled piece slot.
        """
        subwords, masks, starts = [], [], []
        text_masks, subwords_mask = [], []

        for seq in seqs:
            word_pieces = [self.tokenizer.tokenize(token) or ["[PAD]"] for token in seq]
            word_pieces = [["[CLS]"]] + word_pieces + [["[SEP]"]]
            # Running piece count: entry k is the offset where the k-th group starts.
            offsets = torch.tensor([0] + [len(pieces) for pieces in word_pieces]).cumsum(0)
            # flatten the word pieces
            tokens = [piece for pieces in word_pieces for piece in pieces]
            num_tokens = len(tokens)

            # subwords indexes
            subwords.append(torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)))
            # subword masks
            masks.append(torch.ones(num_tokens, dtype=torch.bool))
            # subword text mask: everything except [CLS] and [SEP]
            text_masks.append(torch.BoolTensor([0] + [1] * (num_tokens - 2) + [0]))

            # subword start masks: first piece of every real word
            start_mask = torch.zeros(num_tokens, dtype=torch.bool)
            start_mask[offsets[1:-2]] = 1  # bos:0 eos:-2
            starts.append(start_mask)

            # one row of ones per word, as long as the word has pieces
            word_bounds = offsets[1:-1].tolist()
            per_word = [torch.ones(end - begin)
                        for begin, end in zip(word_bounds[:-1], word_bounds[1:])]
            subwords_mask.append(pad_sequence(per_word, batch_first=True))

        max_subword_length = max(m.size(-1) for m in subwords_mask)
        max_sentence_length = max(m.size(0) for m in subwords_mask)
        padded = []
        for mask in subwords_mask:
            # F.pad order for the last two axes: [left, right, top, down]
            pad_spec = (0, max_subword_length - mask.size(1),
                        0, max_sentence_length - mask.size(0))
            padded.append(F.pad(mask, pad_spec))
        subwords_mask = torch.stack(padded)
        return subwords, masks, starts, text_masks, subwords_mask
149 |
150 |
class BERT_input(nn.Module):
    """Sentence-to-feature front end: tokenises with ``Vocab``, embeds with ``Bert_Embedding``."""

    def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim):
        super(BERT_input, self).__init__()
        self.vocab = Vocab(bert_vocab_path)
        self.bert_input = Bert_Embedding(bert_path, bert_layer, bert_dim)

    def _target_device(self):
        """Return the device of the first parameter, or CPU if there is none."""
        first_param = next(self.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def forward(self, seqs):
        """Return word-level BERT features for ``seqs`` (lists of word strings).

        Inputs are padded and moved to the device that holds this module's
        parameters, instead of unconditionally to CUDA, so the module also
        runs on CPU.
        """
        device = self._target_device()
        subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs)
        subwords = pad_sequence(subwords, batch_first=True).to(device)
        masks = pad_sequence(masks, batch_first=True).to(device)
        starts = pad_sequence(starts, batch_first=True).to(device)
        text_masks = pad_sequence(text_masks, batch_first=True).to(device=device, dtype=torch.bool)
        subwords_mask = subwords_mask.to(device=device, dtype=torch.bool)
        bert_outs = self.bert_input.forward(subwords, masks, starts, text_masks, subwords_mask)
        return bert_outs
166 |
167 |
class BERT_model(nn.Module):
    """Sentence-to-feature encoder: tokenises with ``Vocab``, encodes with ``Bert_Encoder``.

    ``bert_dim`` is accepted for interface compatibility but is not used here.
    """

    def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim, fix_layer_number=None):
        super(BERT_model, self).__init__()
        self.vocab = Vocab(bert_vocab_path)
        self.bert_encoder = Bert_Encoder(bert_path, bert_layer,
                                         freeze=False, fix_layer_number=fix_layer_number)

    def _target_device(self):
        """Return the device of the first parameter, or CPU if there is none."""
        first_param = next(self.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def forward(self, seqs):
        """Return word-level BERT features for ``seqs`` (lists of word strings).

        Inputs are padded and moved to the device that holds this module's
        parameters, instead of unconditionally to CUDA, so the module also
        runs on CPU.
        """
        device = self._target_device()
        subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs)
        subwords = pad_sequence(subwords, batch_first=True).to(device)
        masks = pad_sequence(masks, batch_first=True).to(device)
        starts = pad_sequence(starts, batch_first=True).to(device=device, dtype=torch.bool)
        text_masks = pad_sequence(text_masks, batch_first=True).to(device=device, dtype=torch.bool)
        subwords_mask = subwords_mask.to(device=device, dtype=torch.bool)
        bert_outs = self.bert_encoder.forward(subwords, masks, starts, text_masks, subwords_mask)
        return bert_outs
184 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.autograd import Variable
4 |
5 |
def block_orth_normal_initializer(input_size, output_size):
    """Stack orthogonally initialised weight blocks along the first axis.

    Args:
        input_size: iterable of block widths.
        output_size: iterable of block heights.

    Returns:
        The row-wise concatenation of one ``(o, i)`` float block per pair,
        iterating over ``output_size`` in the outer loop and ``input_size`` in
        the inner loop. Each block is filled by ``torch.nn.init.orthogonal_``.
    """
    blocks = [
        torch.nn.init.orthogonal_(torch.empty(rows, cols, dtype=torch.float32))
        for rows in output_size
        for cols in input_size
    ]
    return torch.cat(blocks)
14 |
15 |
def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights):
    """Pack a batch of variable-length samples into zero-padded tensors.

    Args:
        batch_x: per-sample features; index 1 holds word ids and index 2
            holds predicate ids.
        batch_y: per-sample answers; index 0 holds label ids.
        batch_lengths: per-sample lengths.
        batch_weights: iterated alongside the other inputs but otherwise unused.

    Returns:
        ``(words, predicates, labels, lengths, masks, padding_answers)`` where
        ``words``, ``predicates`` and ``padding_answers`` are long tensors of
        size (batch, max_len), ``masks`` is a float tensor with 1 on real
        positions, ``labels`` is a list of int32 numpy arrays (unpadded) and
        ``lengths`` is a ``LongTensor``.
    """
    batch_size = len(batch_x)  # batch size
    max_len = max(batch_lengths)

    # All padded tensors start out as zeros (padding with 0).
    words = torch.zeros(batch_size, max_len, dtype=torch.long)
    predicates = torch.zeros(batch_size, max_len, dtype=torch.long)
    masks = torch.zeros(batch_size, max_len)
    padding_answers = torch.zeros(batch_size, max_len, dtype=torch.long)
    labels, lengths = [], []

    samples = zip(batch_x, batch_y, batch_lengths, batch_weights)
    for row, (s_words, s_answer, s_length, _s_weights) in enumerate(samples):
        lengths.append(s_length)
        rel = np.zeros((s_length), dtype=np.int32)
        for col in range(s_length):
            answer = s_answer[0][col]
            words[row, col] = s_words[1][col]  # word
            predicates[row, col] = s_words[2][col]  # predicate
            rel[col] = answer
            padding_answers[row, col] = answer
            masks[row, col] = 1
        labels.append(rel)

    return words, predicates, labels, torch.LongTensor(lengths), masks, padding_answers
49 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__init__.py
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/configuration.py:
--------------------------------------------------------------------------------
1 | ''' Configuration for experiments.
2 | '''
3 | import json
4 | from argparse import Namespace
5 |
6 |
def get_config(config_filepath):
    """Load a JSON configuration file as nested ``Namespace`` objects.

    Every JSON object, at any depth, becomes an ``argparse.Namespace`` whose
    attributes are the object's keys; lists and scalars are left as parsed.
    """
    def _as_namespace(mapping):
        return Namespace(**mapping)

    with open(config_filepath, 'r') as config_file:
        return json.load(config_file, object_hook=_as_namespace)
11 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/conll_utils.py:
--------------------------------------------------------------------------------
def bio_to_se(labels):
    """Convert BIO tags into CoNLL bracket ("start/end") notation.

    ``'O'`` becomes ``'*'``. The first token of a span becomes ``'(TYPE*'``,
    the last one gets a closing ``')'``, and inner tokens are ``'*'``. A span
    starts at a ``B`` tag, at position 0, or where the tag type differs from
    the previous label; it ends at the last position, before a ``B`` tag, or
    where the tag type differs from the next label.
    """
    last = len(labels) - 1
    converted = []
    unclosed = False
    for pos, label in enumerate(labels):
        if label == 'O':
            converted.append('*')
            continue

        opens = label[0] == 'B' or pos == 0 or label[1:] != labels[pos - 1][1:]
        closes = (pos == last or labels[pos + 1][0] == 'B'
                  or label[1:] != labels[pos + 1][1:])

        piece = '*'
        if opens:
            piece = '(' + label[2:] + piece
            unclosed = True
        if closes:
            piece += ')'
            unclosed = False
        converted.append(piece)

    if unclosed:
        # logging
        print("Has unclosed opening: {}".format(labels))
    return converted
23 |
24 |
def print_sentence_to_conll(fout, tokens, labels):
    """Write one sentence as fixed-width columns followed by a blank line.

    Each output row holds the token left-justified to 15 characters and then
    one entry from every label column right-justified to 15 characters.

    Args:
        fout: Writable text stream.
        tokens: Token strings, one per row.
        labels: List of label columns; each must be as long as ``tokens``.
    """
    num_tokens = len(tokens)
    for column in labels:
        assert len(column) == num_tokens
    for row, token in enumerate(tokens):
        cells = [token.ljust(15)]
        cells.extend(column[row].rjust(15) for column in labels)
        cells.append("\n")
        fout.write("".join(cells))
    # Blank line terminates the sentence.
    fout.write("\n")
34 |
35 |
def print_to_conll(pred_labels, gold_props_file, output_filename):
    """Write predicted label columns next to the tokens of a gold props file.

    The gold file is read line by line; sentences are separated by blank
    lines. The first whitespace-separated field of each line is used as the
    token, and the number of remaining fields gives the number of label
    columns of that sentence. That many consecutive entries are taken from
    ``pred_labels`` for each sentence.

    Args:
        pred_labels: List of label columns, ordered to match the gold file.
        gold_props_file: Path of the gold props file to read.
        output_filename: Path of the file to write.
    """
    seq_ptr = 0
    num_props_for_sentence = 0
    tokens_buf = []

    # Context managers guarantee both handles are closed, also on errors.
    with open(output_filename, 'w') as fout, open(gold_props_file, 'r') as fin:
        for line in fin:
            line = line.strip()
            if line == "":
                # Repeated or leading blank lines carry no sentence; skipping
                # them avoids indexing into an empty field list.
                if tokens_buf:
                    print_sentence_to_conll(
                        fout, tokens_buf,
                        pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
                    seq_ptr += num_props_for_sentence
                    tokens_buf = []
                    num_props_for_sentence = 0
                continue
            info = line.split()
            num_props_for_sentence = len(info) - 1
            tokens_buf.append(info[0])

        # Output last sentence.
        if tokens_buf:
            print_sentence_to_conll(
                fout, tokens_buf,
                pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
61 |
62 |
def print_gold_to_conll(data, word_dict, label_dict, output_filename):
    """Write gold labels grouped by sentence in column format.

    Consecutive samples whose words are identical are treated as the same
    sentence; each sample containing a ``'B-V'`` tag contributes one label
    column. Tokens at predicate positions are printed, all others are
    replaced by ``'-'``. Sentences with no ``'B-V'`` sample are not written.

    Args:
        data: 4-tuple ``(x, y, num_tokens, _)``; ``x`` holds per-token feature
            sequences whose first entry is the word id, ``y`` the label ids
            and ``num_tokens`` the sentence lengths.
        word_dict: Object exposing ``idx2str`` for word ids.
        label_dict: Object exposing ``idx2str`` for label ids.
        output_filename: Path of the file to write.
    """
    def _write_group(fout, tokens_buf, props_buf, labels_buf):
        # Emit one sentence: keep only predicate tokens, mask the rest.
        tokens = [w if i in props_buf else '-' for i, w in enumerate(tokens_buf)]
        print_sentence_to_conll(fout, tokens, labels_buf)

    props_buf = []
    labels_buf = []
    tokens_buf = []
    prev_words = None  # no sentence seen yet

    with open(output_filename, 'w') as fout:
        x, y, num_tokens, _ = data
        for (sent, gold, slen) in zip(x, y, num_tokens):
            words = [word_dict.idx2str[w[0]] for w in sent[:slen]]
            labels = [label_dict.idx2str[l] for l in gold[:slen]]

            concat_words = ' '.join(words)
            if concat_words != prev_words:
                if props_buf:
                    _write_group(fout, tokens_buf, props_buf, labels_buf)
                # Always restart the buffers on a new sentence, even when the
                # previous one had no predicate; otherwise its tokens would be
                # paired with the labels of the current sentence.
                props_buf = []
                labels_buf = []
                tokens_buf = list(words)
                prev_words = concat_words

            if 'B-V' in labels:
                props_buf.append(labels.index('B-V'))
                labels_buf.append(bio_to_se(labels))

        if props_buf:
            _write_group(fout, tokens_buf, props_buf, labels_buf)
98 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constants.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 | import os
3 | import random
4 |
# Directory three levels above the directory containing this module.
ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../')

# Seed the standard-library `random` module at import time.
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)

# Evaluation script path, resolved one level above ROOT_DIR.
SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh')

# NOTE(review): both sentence markers are empty strings here -- confirm this
# is intended and not markup lost from the original values.
START_MARKER = ''
END_MARKER = ''
PADDING_TOKEN = '*PAD*'
UNKNOWN_TOKEN = '*UNKNOWN*'
NULL_LABEL = 'O'

# Scratch directory, resolved one level above ROOT_DIR.
TEMP_DIR = join(ROOT_DIR, '../temp')

# assert os.path.exists(SRL_CONLL_EVAL_SCRIPT)
# Import-time side effect: create the scratch directory when it is missing.
if not os.path.exists(TEMP_DIR):
    os.makedirs(TEMP_DIR)
23 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constituent_extraction.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import sys
3 | import numpy as np
4 | import random
5 |
6 | from .dictionary import Dictionary
7 | from collections import OrderedDict
8 | from nltk.tree import Tree
9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN
10 | # from .reader import list_of_words_to_ids
11 |
12 |
13 | PREFIX = "--PTB-CONS-LABEL--"
14 |
15 |
def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
    """Map each item of an iterable to an id obtained from ``dictionary.add``.

    Args:
        list_of_words: Iterable of strings; ``None`` entries map to ``-1``.
        dictionary: Object whose ``add(str)`` returns the id of the string.
        lowercase: When true, items are lower-cased before lookup.
        pretrained_embeddings: Optional container; when given, items not
            contained in it are replaced by ``UNKNOWN_TOKEN`` before lookup.

    Returns:
        List of ids, one per input item.
    """
    def _to_id(token):
        if token is None:
            return -1
        if lowercase:
            token = token.lower()
        if pretrained_embeddings is not None and token not in pretrained_embeddings:
            token = UNKNOWN_TOKEN
        return dictionary.add(token)

    return [_to_id(token) for token in list_of_words]
29 |
30 |
class constituent_tree():
    """A constituency tree flattened into a pre-order list of nodes.

    After ``generate_adjacent`` has run, ``nodes`` holds one entry per tree
    node (prefixed constituent labels for internal nodes, words for leaves),
    ``heads[i]`` is the index in ``nodes`` of the parent of node ``i`` (``-1``
    for the root) and ``indicator[i]`` is 1 for word nodes and 0 otherwise.
    """

    def __init__(self, sentence, words, tree):
        self.sentence = sentence
        self.words = words
        self.tree = tree
        self.heads = None  # filled by generate_adjacent
        self.nodes = None  # filled by generate_adjacent
        self.indicator = []  # 0 no terminal, 1 terminal
        self.word_position = []  # index in `nodes` of every word node
        self.node_idx = []
        self.node_char_idx = []

        self.sentence_length = len(words)
        self.input_length = -1
        self.sentence_index = -1

    def pos(self):
        """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]"""
        return self.tree.pos()

    def traverse_tree(self, tree, nodes, indicator, heads, parent, pos, label, word_embeddings):
        """Append ``tree`` and all its descendants to the flat lists, pre-order.

        Args:
            tree: Current subtree; a height of at most 2 is treated as a
                part-of-speech node directly above a word.
            nodes: Output list of node strings.
            indicator: Output list of 0/1 word flags.
            heads: Output list of parent indices.
            parent: ``len(nodes)`` right after the parent was appended, so
                ``parent - 1`` is the parent's index (0 for the root call).
            pos: Dictionary collecting part-of-speech tags via ``add``.
            label: Dictionary collecting node strings via ``add``.
            word_embeddings: Mapping used to decide which words are known and
                extended with vectors for new constituent labels.
        """
        if tree.height() <= 2:
            # Pre-terminal: record the PoS tag and emit the word itself.
            pos.add(tree.label())
            word = constituent_tree.add_word(tree[0], label, word_embeddings)
            nodes.append(word)
            indicator.append(1)
            self.word_position.append(len(nodes) - 1)
            heads.append(parent - 1)
            return

        subtree_label = PREFIX + tree.label()
        label.add(subtree_label)
        constituent_tree.add_unknown_labels(subtree_label, word_embeddings)
        nodes.append(subtree_label)
        indicator.append(0)
        heads.append(parent - 1)

        # Children see len(nodes) taken right after this node was appended.
        parent = len(nodes)
        for subtree in tree:
            self.traverse_tree(subtree, nodes, indicator, heads, parent, pos, label, word_embeddings)

    @staticmethod
    def add_unknown_labels(label, word_embeddings):
        """Give ``label`` a random Gaussian vector if it has no embedding yet.

        The vector length is taken from the ``PADDING_TOKEN`` embedding.
        """
        if label not in word_embeddings:
            embedding_size = len(word_embeddings[PADDING_TOKEN])
            word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)])

    @staticmethod
    def add_word(word, word_dict, word_embeddings):
        """Register ``word`` in ``word_dict`` and return the registered string.

        Words without an embedding are replaced by ``UNKNOWN_TOKEN`` first.
        """
        if word not in word_embeddings:
            word = UNKNOWN_TOKEN
        word_dict.add(word)
        return word

    @staticmethod
    def get_node_char_idx(words, char_dict, lowercase=False):
        """Return a zero-padded ``[len(words), max_len]`` matrix of character ids."""
        max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
        # np.int was removed from NumPy; use an explicit 64-bit integer type.
        single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
        for i, word in enumerate(words):
            single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
        return single_sample_char_tokens

    def generate_adjacent(self, pos, label_dict, char_dict, word_embeddings):
        """Flatten ``self.tree`` and fill the node, head and id attributes.

        May only be called once per instance (asserted through ``self.heads``).
        """
        assert self.heads is None
        nodes, heads = [], []  # TODO notice
        self.traverse_tree(self.tree, nodes, self.indicator, heads,
                           len(heads), pos, label_dict, word_embeddings)
        self.nodes = nodes
        self.heads = heads
        self.input_length = len(self.nodes)
        self.sentence_index = self.input_length - self.sentence_length - 1
        self.node_idx = [label_dict.get_index(node) for node in self.nodes]

        # Character ids for every node string (labels and words alike).
        self.node_char_idx = constituent_tree.get_node_char_idx(self.nodes, char_dict)
116 |
117 |
def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings):
    """Read bracketed constituency trees from a file and flatten them.

    Trees are separated by blank lines and may span several lines. The lines
    of one tree are concatenated; a line without any space gets a leading
    space so it does not fuse with the end of the previous line.

    Args:
        file_path: Path of the tree file.
        word_dict: Dictionary that receives node strings.
        char_dict: Dictionary that receives characters of node strings.
        word_embeddings: Mapping of known words to vectors; extended in place
            for new constituent labels.

    Returns:
        ``(cons_trees, pos_dict)`` where ``cons_trees`` is an ``OrderedDict``
        keyed by the space-joined leaves of each tree and ``pos_dict`` holds
        the part-of-speech tags that were seen.
    """
    data = []
    pieces = []
    with open(file_path, 'r') as input_file:
        for line in input_file:
            line = line.strip()
            if line == "":
                # Ignore repeated blank lines instead of storing empty trees.
                if pieces:
                    data.append(''.join(pieces))
                    pieces = []
                continue
            if ' ' not in line:  # avoid the split of leave node of it's PoS
                line = ' ' + line
            pieces.append(line)
    # Keep the last tree when the file does not end with a blank line.
    if pieces:
        data.append(''.join(pieces))
    print("Read {} sentence from {}".format(len(data), file_path))

    cons_trees = OrderedDict()
    for sentence in data:
        tree = Tree.fromstring(sentence)
        words = tree.leaves()
        sentence = ' '.join(words)
        cons_trees[sentence] = constituent_tree(sentence, words, tree)

    pos_dict = Dictionary(padding_token=PADDING_TOKEN)
    for tree in cons_trees.values():
        tree.generate_adjacent(pos_dict, word_dict, char_dict, word_embeddings)

    return cons_trees, pos_dict
146 |
147 |
148 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constituent_reader.py:
--------------------------------------------------------------------------------
1 | import json
2 | import codecs
3 | import numpy as np
4 |
5 |
6 | from sortedcontainers import SortedSet
7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL
8 | from .dictionary import Dictionary
9 |
10 |
def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
    """Translate an iterable of strings into ids from ``dictionary.add``.

    ``None`` items yield ``-1``. Items are optionally lower-cased, and when
    ``pretrained_embeddings`` is given, items it does not contain are looked
    up as ``UNKNOWN_TOKEN`` instead.

    Returns:
        List of ids in input order.
    """
    restrict_to_known = pretrained_embeddings is not None
    ids = []
    for token in list_of_words:
        if token is not None:
            key = token.lower() if lowercase else token
            if restrict_to_known and key not in pretrained_embeddings:
                key = UNKNOWN_TOKEN
            ids.append(dictionary.add(key))
        else:
            ids.append(-1)
    return ids
24 |
25 |
class constituent_sentence():
    """One sentence together with its labelled constituent spans."""

    def __init__(self, obj):
        # `obj` is a mapping with the keys "sentence" and "constituents".
        self.sentence = obj["sentence"]
        self.constituent_spans = obj["constituents"]
        self.max_span_width = 30

    def tokenize_cons_spans(self, dictionary, max_cons_width=60):
        """Convert the spans into parallel start / end / label-id sequences.

        Spans labelled ``"TOP"`` or ``"S"``, spans at least ``max_cons_width``
        tokens wide, and spans whose (start, end) pair was already seen are
        skipped. Labels are converted with ``dictionary.add``.

        Returns:
            ``[[], [], []]`` when nothing is kept, otherwise a 3-tuple of
            tuples ``(starts, ends, label_ids)``.
        """
        excluded_labels = ("TOP", "S")  # todo: add some constrains here
        seen_boundaries = set()
        starts, ends, label_ids = [], [], []
        for start, end, tag in self.constituent_spans:
            if tag in excluded_labels:
                continue
            if end - start + 1 >= max_cons_width:
                continue
            boundary = (start, end)
            if boundary in seen_boundaries:
                # Only the first label of a duplicated span is kept.
                continue
            seen_boundaries.add(boundary)
            starts.append(int(start))
            ends.append(int(end))
            label_ids.append(int(dictionary.add(tag)))
        if not starts:  # if the sentence has no arguments.
            return [[], [], []]
        return tuple(starts), tuple(ends), tuple(label_ids)
52 |
53 |
def read_constituent_file(file_path):
    """Read a UTF-8 JSON-lines file into ``constituent_sentence`` objects.

    Each non-blank line must hold one JSON object. Blank lines (for example a
    trailing empty line) are skipped rather than passed to the JSON parser.

    Args:
        file_path: Path of the JSON-lines file.

    Returns:
        List of ``constituent_sentence`` instances in file order.
    """
    sentences = []
    with codecs.open(file_path, encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue
            sentences.append(constituent_sentence(json.loads(line)))
    print("{} total constituent sentences number {}".format(file_path, len(sentences)))
    return sentences
63 |
64 |
def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False):
    """Convert constituent sentences into id-based training samples.

    Args:
        samples: Sequence of objects with a ``sentence`` word list and a
            ``tokenize_cons_spans(label_dict)`` method.
        word_dict: Dictionary used for word ids.
        char_dict: Dictionary used for character ids.
        label_dict: Dictionary used for constituent label ids.
        lowercase: Lower-case words and characters before lookup.
        pretrained_word_embedding: Container of known words, or ``False`` /
            ``None`` to keep every word as is.

    Returns:
        List of ``(length, word_ids, char_id_matrix, span_tokens)`` tuples,
        one per sample.
    """
    # The default False must not reach the membership test in
    # list_of_words_to_ids (``s not in False`` raises TypeError).
    embeddings = None if pretrained_word_embedding is False else pretrained_word_embedding

    sample_word_tokens = [list_of_words_to_ids(
        sent.sentence, word_dict, lowercase, embeddings) for sent in samples]
    # for the character
    sample_char_tokens = []
    for sent in samples:
        words = sent.sentence
        max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
        # np.int was removed from NumPy; use an explicit 64-bit integer type.
        single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
        for i, word in enumerate(words):
            single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
        # Add the sample char tokens into the sample_char_tokens
        sample_char_tokens.append(single_sample_char_tokens)
    sample_lengths = [len(sent.sentence) for sent in samples]
    sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples]
    return list(zip(sample_lengths, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens))
81 |
82 |
def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None):
    """Load a constituent file and tokenize it with the given dictionaries.

    ``word_dict`` and ``char_dict`` are switched to accept new entries before
    tokenization and are left in that state on return. A fresh label
    dictionary with ``NULL_LABEL`` as its unknown token is created.

    Args:
        config: Not used by this function.
        file_path: Path of the JSON-lines constituent file.
        word_dict: Word dictionary; must be provided despite the default.
        char_dict: Character dictionary; must be provided despite the default.
        word_embeddings: Optional container of known words.

    Returns:
        ``(cons_samples, word_dict, char_dict, cons_label_dict)``.
    """
    raw_cons_sentences = read_constituent_file(file_path)
    cons_label_dict = Dictionary()
    cons_label_dict.set_unknown_token(NULL_LABEL)

    # tokenized the data
    if word_dict.accept_new is False:
        word_dict.accept_new = True
    if char_dict.accept_new is False:
        char_dict.accept_new = True
    cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict,
                                      False, word_embeddings)

    print("="*10, "Constituent Info", "="*10)
    print("Extract {} tags".format(cons_label_dict.size()))
    # default=0 keeps this log line from raising on an empty file.
    print("Max sentence length: {}".format(max((s[0] for s in cons_samples), default=0)))
    return cons_samples, word_dict, char_dict, cons_label_dict
104 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/dictionary.py:
--------------------------------------------------------------------------------
1 | ''' Bidirectional dictionary that maps between words and ids.
2 | '''
3 |
4 |
class Dictionary(object):
    """Two-way mapping between strings and consecutive integer ids.

    ``str2idx`` maps strings to ids and ``idx2str`` lists the strings by id.
    While ``accept_new`` is true, unseen strings get the next free id;
    otherwise they resolve to the unknown id.
    """

    def __init__(self, padding_token=None, unknown_token=None):
        self.str2idx = {}
        self.idx2str = []

        self.accept_new = True
        self.padding_token = None
        self.padding_id = None
        self.unknown_token = None
        self.unknown_id = None
        # The padding token is registered first, then the unknown token.
        if padding_token is not None:
            self.set_padding_token(padding_token)
        if unknown_token is not None:
            self.set_unknown_token(unknown_token)

    def set_padding_token(self, padding_token):
        """Register ``padding_token`` and remember its id."""
        self.padding_token = padding_token
        self.padding_id = self.add(padding_token)

    def set_unknown_token(self, unknown_token):
        """Register ``unknown_token`` and remember its id."""
        self.unknown_token = unknown_token
        self.unknown_id = self.add(unknown_token)

    def add(self, new_str):
        """Return the id of ``new_str``, inserting it when allowed.

        For an unseen string in a frozen dictionary, ``"C-ADV"`` resolves to
        the id of ``"O"`` and everything else to the unknown id.

        Raises:
            LookupError: Unseen string, frozen dictionary, no unknown token.
        """
        existing = self.str2idx.get(new_str)
        if existing is not None:
            return existing
        if self.accept_new:
            fresh_id = len(self.idx2str)
            self.str2idx[new_str] = fresh_id
            self.idx2str.append(new_str)
            return fresh_id
        if new_str == "C-ADV":
            return self.str2idx["O"]
        if self.unknown_id is None:
            raise LookupError(
                'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str)
        return self.unknown_id

    def add_all(self, str_list):
        """Apply ``add`` to every item and return the list of ids."""
        ids = []
        for item in str_list:
            ids.append(self.add(item))
        return ids

    def get_index(self, input_str):
        """Return the id of ``input_str`` or ``None`` when it is not stored."""
        return self.str2idx.get(input_str)

    def size(self):
        """Return the number of stored strings."""
        return len(self.idx2str)

    def save(self, filename):
        """Write the stored strings to ``filename``, one per line, in id order."""
        with open(filename, 'w') as f:
            f.writelines(entry + '\n' for entry in self.idx2str)

    def load(self, filename):
        """Add every non-blank, whitespace-stripped line of ``filename``."""
        with open(filename, 'r') as f:
            for raw in f:
                entry = raw.strip()
                if entry != '':
                    self.add(entry)
67 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/evaluation.py:
--------------------------------------------------------------------------------
1 | ''' Framework independent evaluator. Not in use yet.
2 | '''
3 | import numpy
4 | import os
5 | from os.path import join
6 | # import subprocess
7 | from .constants import ROOT_DIR
8 | from .conll_utils import print_gold_to_conll
9 | # from .measurements import Timer
10 |
11 |
class TaggerEvaluator(object):
    """Tracks token-level accuracy of tagger predictions against gold data."""

    def __init__(self, data):
        # `data` is a sequence of samples; index 1 of a sample is the gold
        # label array and index 2 its length (see compute_accuracy).
        self.data = data
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Compute the percentage of correct labels and print it.

        Args:
            predictions: One label sequence per sample, in the same order as
                ``self.data`` and with matching lengths.
        """
        expected_lengths = [sample[2] for sample in self.data]
        # the predication's order should be the origin
        for predicted, length in zip(predictions, expected_lengths):
            assert len(predicted) == length
        flat_predictions = numpy.concatenate(predictions)
        # Each gold array is flattened to the size of its second dimension.
        gold_rows = [sample[1].reshape(sample[1].shape[1]) for sample in self.data]
        answer = numpy.concatenate(gold_rows)
        num_total = answer.shape[0]
        num_correct = numpy.equal(flat_predictions, answer).sum()
        self.accuracy = (100.0 * num_correct) / num_total
        print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct,
                                                num_total))

    def evaluate(self, predictions):
        """Score ``predictions`` and update the best accuracy seen so far."""
        self.compute_accuracy(predictions)
        self.has_best = self.accuracy > self.best_accuracy
        if not self.has_best:
            return
        print("Best accuracy so far: {:.3f}".format(self.accuracy))
        self.best_accuracy = self.accuracy
40 |
41 |
class PropIdEvaluator(object):
    """Scores identification of one target label with precision/recall/F1.

    The value stored in ``accuracy`` is the F1 score (in percent) of the
    target label, not a plain accuracy.
    """

    def __init__(self, data, label_dict, target_label='V',
                 use_se_marker=False):
        # `data` is a 4-tuple whose second item holds the gold labels and
        # whose fourth item holds per-position weights. `use_se_marker` is
        # accepted but not used.
        self.data = data
        self.label_dict = label_dict
        self.target_label_id = label_dict.str2idx[target_label]
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Compute weighted precision, recall and F1 for the target label.

        A metric whose denominator is zero is reported as 0.0 instead of
        NaN, so later comparisons against the best score stay meaningful.

        Args:
            predictions: Array of predicted label ids, shaped like the gold
                labels in ``self.data``.
        """
        _, y, _, weights = self.data
        identified = numpy.equal(predictions, self.target_label_id)
        num_correct = numpy.sum(
            numpy.logical_and(numpy.equal(predictions, y), identified) * weights)
        num_identified = numpy.sum(identified * weights)
        num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights)
        self.precision = 100.0 * num_correct / num_identified if num_identified > 0 else 0.0
        self.recall = 100.0 * num_correct / num_gold if num_gold > 0 else 0.0
        denominator = self.precision + self.recall
        self.accuracy = 2 * self.precision * self.recall / denominator if denominator > 0 else 0.0
        print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format(
            self.accuracy, self.precision, self.recall))

    def evaluate(self, predictions):
        """Score ``predictions`` and update the best score seen so far."""
        self.compute_accuracy(predictions)
        self.has_best = self.accuracy > self.best_accuracy
        if self.has_best:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
75 |
76 |
class SRLEvaluator(TaggerEvaluator):
    """Placeholder SRL evaluator whose accuracy computation is not implemented."""

    def __init__(self):
        # TaggerEvaluator.__init__ is not called, so self.data is not set here.
        self.best_accuracy = -1.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        # Stub: prints a marker and then stops the interpreter through exit().
        print("exit()")
        exit()
85 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/features.py:
--------------------------------------------------------------------------------
def get_srl_features(sentences, config, feature_dicts=None):
    """Build the extra per-token features requested by the configuration.

    Only the ``"predicate"`` feature is supported: every token gets 2 when its
    position equals the predicate index stored at ``sent[2]`` and 1 otherwise
    (0 is reserved for padding).

    TODO: Support adding more features.

    Args:
        sentences: Sequence of samples; ``sent[1]`` is the token sequence and
            ``sent[2]`` the predicate position.
        config: Object with ``features`` and ``feature_sizes`` sequences.
        feature_dicts: Not used.

    Returns:
        ``(features, feature_shapes)`` where ``features`` is a lazy ``zip``
        yielding one tuple of feature lists per sentence and
        ``feature_shapes`` holds ``[2, size]`` for each supported feature.
    """
    features = []
    feature_shapes = []
    for fname, fsize in zip(config.features, config.feature_sizes):
        if fname == "predicate":
            offset = 1  # pad is in the position 0
            features.append([[int((i == sent[2]) + offset) for i in range(len(sent[1]))] for sent in sentences])
            feature_shapes.append([2, fsize])
    return (zip(*features), feature_shapes)
17 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/inference.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
def get_transition_params(label_strs):
    """Construct transition scores (0 for allowed, -inf for invalid).

    A transition from tag ``i`` to a different tag ``j`` is invalid when tag
    ``j`` starts with ``'I'`` and tag ``i`` is not the matching ``'B'`` tag.
    Staying on the same tag is always allowed.

    Args:
        label_strs: A [num_tags,] sequence of BIO-tags.
    Returns:
        A [num_tags, num_tags] float32 matrix of transition scores, indexed
        as ``[previous_tag, tag]``.
    """
    num_tags = len(label_strs)
    transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32)
    for i, prev_label in enumerate(label_strs):
        for j, label in enumerate(label_strs):
            if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]:
                # numpy.NINF no longer exists in NumPy 2.x.
                transition_params[i, j] = -numpy.inf
    return transition_params
18 |
19 |
def viterbi_decode(score, transition_params):
    """Find the highest scoring tag sequence with the Viterbi algorithm.

    Adapted from the TensorFlow implementation; runs in NumPy.

    Args:
        score: A [seq_len, num_tags] matrix of unary potentials.
        transition_params: A [num_tags, num_tags] matrix of binary
            potentials, indexed as ``[previous_tag, tag]``.
    Returns:
        viterbi: A [seq_len] list of integers containing the highest scoring
            tag indices.
        viterbi_score: The score of that sequence.
    """
    seq_len = score.shape[0]
    trellis = numpy.zeros_like(score)
    backpointers = numpy.zeros_like(score, dtype=numpy.int32)
    trellis[0] = score[0]
    for step in range(1, seq_len):
        # candidates[p, c]: best score ending in tag p, then moving to tag c.
        candidates = numpy.expand_dims(trellis[step - 1], 1) + transition_params
        backpointers[step] = numpy.argmax(candidates, 0)
        trellis[step] = score[step] + numpy.max(candidates, 0)

    final_scores = trellis[-1]
    path = [numpy.argmax(final_scores)]
    # Follow the back-pointers from the last step down to step 1.
    for step in range(seq_len - 1, 0, -1):
        path.append(backpointers[step][path[-1]])
    path.reverse()
    return path, numpy.max(final_scores)
45 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/io_utils.py:
--------------------------------------------------------------------------------
1 | from google.protobuf.internal import encoder
2 |
3 | _EncodeVarint = encoder._VarintEncoder()
4 |
5 |
def write_delimited_to(out_file, message):
    """Write ``message`` to ``out_file`` prefixed by its varint-encoded size.

    Args:
        out_file: Binary stream with a ``write`` method.
        message: Object providing ``ByteSize()`` and ``SerializeToString()``.
    """
    size_prefix = []
    # The encoder emits the varint bytes through the supplied callback.
    _EncodeVarint(size_prefix.append, message.ByteSize())
    out_file.write(b"".join(size_prefix))
    payload = message.SerializeToString()
    out_file.write(payload)
12 |
13 |
def read_gold_props(gold_props_file):
    """Read the first column of a blank-line separated column file.

    Every blank line closes the current sentence, so consecutive blank lines
    produce empty sentences. A final sentence without a trailing blank line
    is included as well.

    Args:
        gold_props_file: Path of the file to read.

    Returns:
        A list of sentences, each a list of first-column strings.
    """
    sentences = []
    current = []
    with open(gold_props_file, 'r') as handle:
        for raw in handle:
            stripped = raw.strip()
            if stripped:
                current.append(stripped.split()[0])
            else:
                sentences.append(current)
                current = []
    if current:
        sentences.append(current)
    return sentences
31 |
32 |
def write_predprops_to(predictions,
                       label_dict,
                       input_file,
                       output_file,
                       gold_props_file=None,
                       output_props_file=None):
    """ Write predicted predicate information to files.

    For every input sentence, one line per predicted predicate is written to
    ``output_file`` in the form ``<index>\\t<sentence> ||| <tags>`` where the
    tag at the predicate index is ``B-V`` and all others are ``O``.
    All files are closed on exit, including when an error is raised.

    Arguments:
      predictions: Predictions from the predicate identification model.
        Is a numpy array of size [num_sentences, max_sentence_length].
      label_dict: Label dictionary.
      input_file: Input sequential tagging file.
      output_file: Output SRL file with identified predicates.
      gold_props_file: Input file with gold predicates in CoNLL format.
      output_props_file: Output SRL file with identified predicates, in CoNLL format.
    """
    write_props = output_props_file is not None and output_props_file != ''

    with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
        fout_props = open(output_props_file, 'w') if write_props else None
        try:
            if gold_props_file is not None and gold_props_file != '':
                gold_props = read_gold_props(gold_props_file)
                print(len(gold_props), len(predictions))
                assert len(gold_props) == len(predictions)
            else:
                gold_props = None

            for sent_id, line in enumerate(fin):
                # Read original sentence from input file.
                raw_sent = line.split('|||')[0].strip()
                tokens = raw_sent.split(' ')
                slen = len(tokens)
                pred = predictions[sent_id, :slen]
                props = []

                for (t, p) in enumerate(pred):
                    if label_dict.idx2str[p] == 'V':
                        out_tags = ['O' for _ in range(slen)]
                        out_tags[t] = 'B-V'
                        out_line = str(t) + '\t' + raw_sent + ' ||| ' + ' '.join(
                            out_tags) + '\n'
                        fout.write(out_line)
                        props.append(t)

                if fout_props is None:
                    continue
                # Sentences are separated by a blank line in the props file.
                if sent_id > 0:
                    fout_props.write('\n')
                for t in range(slen):
                    lemma = 'P' + tokens[t].lower()
                    # In order for CoNLL evaluation script to run, we need to output the same
                    # lemma as the gold predicate in the CoNLL-formatted file.
                    if gold_props is not None and gold_props[sent_id][t] != '-':
                        lemma = gold_props[sent_id][t]
                    if t in props:
                        fout_props.write(lemma)
                    else:
                        fout_props.write('-')
                    # One column per predicate of the sentence.
                    for p in props:
                        if t == p:
                            fout_props.write('\t(V*)')
                        else:
                            fout_props.write('\t*')
                    fout_props.write('\n')
        finally:
            if fout_props is not None:
                fout_props.close()

    print('Predicted predicates in sequential-tagging format written to: {}.'.
          format(output_file))
    if write_props:
        print('CoNLL-formatted predicate information written to: {}.'.format(
            output_props_file))
112 |
113 |
def bio_to_spans(predictions, label_dict):
    """ Convert BIO-based predictions to a set of arguments.

    Arguments:
      predictions: A single integer array, already truncated to the original sequence lengths.
      label_dict: Label dictionary; ``idx2str`` maps ids to tag strings.
    Returns:
      A list of labeled arguments: [ ["ARG_LABEL", span_start, span_end], ... ],
      ordered by their positions, with inclusive end indices.
    """
    tags = [label_dict.idx2str[p] for p in predictions]
    last = len(predictions) - 1
    spans = []
    for pos, tag in enumerate(tags):
        if tag == 'O':
            continue
        label = tag[2:]
        # Open a span on a B tag, when nothing is open yet, or on a label change.
        if tag[0] == 'B' or not spans or label != tags[pos - 1][2:]:
            spans.append([label, pos, -1])
        # Close it at the sequence end, before a B tag, or before a label change.
        if pos == last or tags[pos + 1][0] == 'B' or label != tags[pos + 1][2:]:
            spans[-1][2] = pos
    return spans
136 |
137 |
def print_to_readable(predictions, num_tokens, label_dict, input_path,
                      output_path):
    """ Print predictions to human-readable format.

    Each input line is expected to start with the predicate index followed by
    the sentence tokens, then ``|||``. For every line the sentence, its
    predicate and the text of each predicted argument span are written.
    Both files are closed on exit, including when an error is raised.

    Arguments:
      predictions: Sequence of per-sample tag id sequences, in input order.
      num_tokens: Not used.
      label_dict: Label dictionary.
      input_path: Path of the sequential tagging file to read.
      output_path: Path of the report to write.
    """
    with open(output_path, 'w') as fout, open(input_path, 'r') as fin:
        for sample_id, line in enumerate(fin):
            info = line.split('|||')[0].strip().split()
            pid = int(info[0])
            sent = info[1:]
            fout.write(' '.join(sent) + '\n')
            fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid))

            tags = predictions[sample_id]
            arg_spans = bio_to_spans(tags, label_dict)
            for arg in arg_spans:
                fout.write('\t\t{}: {}\n'.format(arg[0], " ".join(
                    sent[arg[1]:arg[2] + 1])))
            fout.write('\n')
160 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/measurements.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 |
4 |
class Timer:
    """Context manager that reports elapsed wall-clock time.

    When created with ``active=False`` the name is dropped and nothing is
    printed on exit; ``tick`` still prints.
    """

    def __init__(self, name, active=True):
        if active:
            self.name = name
        else:
            self.name = None

    def __enter__(self):
        now = time.time()
        self.start = now
        self.last_tick = now
        return self

    def __exit__(self, *args):
        if self.name is None:
            return
        elapsed = time.time() - self.start
        print("{} duration was {}.".format(self.name, self.readable(elapsed)))

    def readable(self, seconds):
        """Format a duration as ``H:MM:SS``, truncated to whole seconds."""
        whole_seconds = int(seconds)
        return str(datetime.timedelta(seconds=whole_seconds))

    def tick(self, message):
        """Print time since start and since the previous tick, then reset the tick."""
        current = time.time()
        since_start = self.readable(current - self.start)
        since_tick = self.readable(current - self.last_tick)
        print("{} took {} ({} since last tick).".format(
            message, since_start, since_tick))
        self.last_tick = current
28 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/numpy_utils.py:
--------------------------------------------------------------------------------
1 | import numpy
2 |
3 |
def orth_normal_initializer(factor=1.0, seed=None):
    ''' Build an initializer that returns scaled orthonormal 2-D matrices.

        Reference: Exact solutions to the nonlinear dynamics of learning in
                   deep linear neural networks
                   Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf
        Adapted from the original implementation by Mingxuan Wang.

        The returned callable takes (shape, dtype) and creates a fresh
        RandomState(seed) on every call.
    '''
    def _sign_fixed_q(matrix):
        # Q factor of the QR decomposition, columns scaled by the sign of
        # the diagonal of R.
        q, r = numpy.linalg.qr(matrix)
        return q * numpy.sign(numpy.diag(r))

    def _initializer(shape, dtype):
        assert len(shape) == 2
        rows, cols = shape[0], shape[1]
        rng = numpy.random.RandomState(seed)
        if rows == cols:
            square = rng.randn(*shape).astype(dtype)
            return _sign_fixed_q(square) * factor
        # Non-square: combine orthonormal factors of both side lengths.
        left = rng.randn(rows, rows).astype(dtype)
        right = rng.randn(cols, cols).astype(dtype)
        q_left = _sign_fixed_q(left)
        q_right = _sign_fixed_q(right)
        n_min = min(rows, cols)
        return numpy.dot(q_left[:, :n_min], q_right[:n_min, :]) * factor

    return _initializer
31 |
32 |
def block_orth_normal_initializer(input_shapes,
                                  output_shapes,
                                  factor=1.0,
                                  seed=None):
    ''' Initialize a gigantic weight matrix where each block is a normal orthogonal matrix.
        Input:
          - input_shapes: the sizes of each block along dimension 0.
          - output_shapes: the sizes of each block along dimension 1.
          for example input_shapes = [100, 128] output_shapes=[100,100,100,100]
            indicates eight blocks with shapes [100,100], [128,100], etc.

        The returned callable takes (shape, dtype); shape is only checked to
        be two-dimensional, the result size comes from the block sizes.
    '''
    def _initializer(shape, dtype):
        assert len(shape) == 2
        make_block = orth_normal_initializer(factor, seed)
        block_rows = []
        for dim_in in input_shapes:
            # Blocks sharing an input size are laid out side by side.
            row_blocks = [make_block([dim_in, dim_out], dtype) for dim_out in output_shapes]
            block_rows.append(numpy.concatenate(row_blocks, 1))
        # Rows of blocks are stacked vertically.
        return numpy.concatenate(block_rows, 0)

    return _initializer
56 |
57 |
def random_normal_initializer(mean=0.0, stddev=0.01, seed=None):
    """Build an initializer drawing values from a normal distribution.

    The returned callable takes (shape, dtype) and creates a fresh
    RandomState(seed) on every call.
    """
    def _initializer(shape, dtype):
        generator = numpy.random.RandomState(seed)
        samples = generator.normal(mean, stddev, shape)
        return numpy.asarray(samples, dtype)

    return _initializer
64 |
65 |
def all_zero_initializer():
    """Build an initializer returning an all-zero array of (shape, dtype)."""
    def _initializer(shape, dtype):
        zeros = numpy.zeros(shape)
        return zeros.astype(dtype)

    return _initializer
71 |
72 |
def uniform_initializer(value=0.01):
    """Build an initializer returning an array of (shape, dtype) filled with ``value``."""
    def _initializer(shape, dtype):
        filled = numpy.full(shape, value)
        return filled.astype(dtype)

    return _initializer
78 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/scores_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: scores.proto
3 |
4 | import sys
5 | import tensor_pb2 as tensor__pb2
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | # from google.protobuf import descriptor_pb2
11 |
12 |
13 | # @@protoc_insertion_point(imports)
14 | _b = sys.version_info[0] < 3 and (lambda x: x) or (
15 | lambda x: x.encode('latin1'))
16 |
17 |
18 | _sym_db = _symbol_database.Default()
19 |
20 |
21 | DESCRIPTOR = _descriptor.FileDescriptor(
22 | name='scores.proto',
23 | package='',
24 | syntax='proto2',
25 | serialized_pb=_b(
26 | '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\
27 | sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto'
28 | ),
29 | dependencies=[
30 | tensor__pb2.DESCRIPTOR,
31 | ])
32 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
33 |
# Message descriptor for SentenceScoresProto: two fields, `sentence_id`
# (field number 1) and `scores` (field number 2).
_SENTENCESCORESPROTO = _descriptor.Descriptor(
    name='SentenceScoresProto',
    full_name='SentenceScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # sentence_id: type=13 / cpp_type=3 / label=1 are the protobuf
        # FieldDescriptor codes for an optional uint32 scalar.
        _descriptor.FieldDescriptor(
            name='sentence_id',
            full_name='SentenceScoresProto.sentence_id',
            index=0,
            number=1,
            type=13,
            cpp_type=3,
            label=1,
            has_default_value=False,
            default_value=0,
            message_type=None,
            enum_type=None,
            containing_type=None,
            is_extension=False,
            extension_scope=None,
            options=None),
        # scores: type=11 / cpp_type=10 / label=1 are the codes for an
        # optional embedded message. message_type is None here and is filled
        # in after both descriptors exist (see the assignments further down).
        _descriptor.FieldDescriptor(name='scores',
                                    full_name='SentenceScoresProto.scores',
                                    index=1,
                                    number=2,
                                    type=11,
                                    cpp_type=10,
                                    label=1,
                                    has_default_value=False,
                                    default_value=None,
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Byte range of this message's definition inside DESCRIPTOR's
    # serialized_pb.
    serialized_start=30,
    serialized_end=102,
)
84 |
# Message descriptor for ScoresProto: a single field `sentences`
# (field number 1).
_SCORESPROTO = _descriptor.Descriptor(
    name='ScoresProto',
    full_name='ScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # sentences: type=11 / cpp_type=10 / label=3 are the protobuf
        # FieldDescriptor codes for a repeated embedded message, hence the
        # empty-list default. message_type is assigned after construction.
        _descriptor.FieldDescriptor(name='sentences',
                                    full_name='ScoresProto.sentences',
                                    index=0,
                                    number=1,
                                    type=11,
                                    cpp_type=10,
                                    label=3,
                                    has_default_value=False,
                                    default_value=[],
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Byte range of this message's definition inside DESCRIPTOR's
    # serialized_pb.
    serialized_start=104,
    serialized_end=158,
)
119 |
# Resolve the message-typed fields now that every descriptor object exists:
# SentenceScoresProto.scores holds a TensorProto (from tensor_pb2) and
# ScoresProto.sentences holds SentenceScoresProto messages.
_SENTENCESCORESPROTO.fields_by_name[
    'scores'].message_type = tensor__pb2._TENSORPROTO
_SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO
# Expose both message descriptors on the file descriptor by name.
DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO
DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO
125 |
# Concrete message class for SentenceScoresProto, built by reflection from its
# descriptor and then registered with the symbol database.
SentenceScoresProto = _reflection.GeneratedProtocolMessageType(
    'SentenceScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SENTENCESCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:SentenceScoresProto)
         ))
_sym_db.RegisterMessage(SentenceScoresProto)
134 |
# Concrete message class for ScoresProto, built by reflection from its
# descriptor and then registered with the symbol database.
ScoresProto = _reflection.GeneratedProtocolMessageType(
    'ScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:ScoresProto)
         ))
_sym_db.RegisterMessage(ScoresProto)
143 |
144 | # @@protoc_insertion_point(module_scope)
145 |
--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/syntactic_extraction.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import codecs
3 |
4 | from .dictionary import Dictionary
5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN
6 | from collections import OrderedDict
7 |
8 |
class SyntacticTree(object):
    """One sentence's dependency tree, stored as parallel per-token lists.

    Position 0 of ``word_forms``, ``pos_forms``, ``heads`` and ``labels`` is
    pre-filled with a "Root" placeholder entry; real tokens are appended after
    it. The ``*_ids`` / ``char_ids`` members start empty and are filled later
    by whoever converts the strings to dictionary ids.
    """

    def __init__(self, sentence_id):
        root = "Root"
        self.sentence_id = sentence_id

        # String-valued columns, each seeded with the placeholder root entry.
        self.word_forms, self.pos_forms, self.labels = [root], [root], [root]
        # Head index per token; the placeholder root points at 0.
        self.heads = [0]

        # Id-valued columns, populated after tokenization.
        self.word_forms_ids, self.labels_id = [], []
        self.char_ids = [[]]  # 2D: one row of character ids per token
19 |
20 |
class SyntacticCONLL(object):
    """Dependency trees loaded from a tab-separated CoNLL file.

    Attributes:
        file_name: Path of the most recently read file.
        trees: List of ``SyntacticTree`` objects kept by ``read_from_file``.
        sample_dep_data: List of ``(word_ids, char_ids, heads, label_ids)``
            tuples built by ``tokenize_dep_trees``; ``None`` until then.
    """

    def __init__(self):
        self.file_name = ""
        self.trees = []
        self.sample_dep_data = None

    def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8):
        """Parse ``filename`` and append one tree per sentence to ``self.trees``.

        Sentences are separated by blank lines. For each token line the
        columns used are: 1 (word form), 3 (POS), 6 (head index), 7
        (dependency label) and 9 (dependency probability).

        Args:
            filename: Path to the UTF-8 encoded CoNLL file.
            max_sentence_length: Sentences whose token list (including the
                "Root" placeholder) is longer than this are skipped.
            prune_ratio: Arcs whose probability is below this value get their
                head replaced by -1.

        Raises:
            AssertionError: If a kept sentence has no token with head 0, or a
                head-0 token is not labeled "root".
        """
        self.file_name = filename

        print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio))

        sentence_id = 0
        a_tree = SyntacticTree(sentence_id)
        find_root = False
        # codecs.open raises on failure, so no "is it open" check is needed;
        # the context manager guarantees the handle is released.
        with codecs.open(self.file_name, 'r', encoding="utf8") as conll_file:
            for line in conll_file:
                if line == '\n' or line == '\r\n':  # new sentence
                    sentence_id += 1
                    self._keep_tree(a_tree, find_root, max_sentence_length)
                    a_tree = SyntacticTree(sentence_id)
                    find_root = False
                    continue
                tokens = line.strip().split('\t')
                a_tree.word_forms.append(tokens[1])
                a_tree.pos_forms.append(tokens[3])
                head = int(tokens[6]) - 1  # a file head of 0 (the root) becomes -1
                if head == -1:
                    assert tokens[7] == "root"
                    find_root = True
                a_tree.heads.append(head)
                a_tree.labels.append(tokens[7])
                token_9 = tokens[9]  # or tokens 9 will be 'unicode' type
                # NOTE(review): on Python 3 every field is a `str`, so this
                # always yields 1.0 and the pruning below never triggers.
                # Kept as-is to avoid silently changing model inputs; confirm
                # whether column 9 is meant to be parsed as a float.
                dep_prob = 1.0 if isinstance(token_9, str) else float(token_9)
                if dep_prob < prune_ratio:
                    a_tree.heads[-1] = -1

        # A file that does not end with a blank line still holds one pending
        # sentence; keep it instead of silently dropping it.
        if len(a_tree.word_forms) > 1:
            sentence_id += 1
            self._keep_tree(a_tree, find_root, max_sentence_length)

        print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees)))

    def _keep_tree(self, a_tree, find_root, max_sentence_length):
        """Store a finished sentence unless it exceeds the length limit."""
        # The length counts the "Root" placeholder as well.
        if len(a_tree.word_forms) <= max_sentence_length:
            assert find_root is True
            self.trees.append(a_tree)

    @staticmethod
    def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
        """Map each string in ``list_of_words`` to an id via ``dictionary.add``.

        ``None`` entries map to -1. When ``pretrained_embeddings`` is given,
        strings not contained in it are replaced by ``UNKNOWN_TOKEN`` before
        the lookup. Passing a plain string iterates over its characters.

        Returns:
            A list of ids, one per input element.
        """
        ids = []
        for s in list_of_words:
            if s is None:
                ids.append(-1)
                continue
            if lowercase:
                s = s.lower()
            if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
                s = UNKNOWN_TOKEN
            ids.append(dictionary.add(s))
        return ids

    def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None):
        """Fill the id fields of every tree and build ``sample_dep_data``.

        For each tree this sets ``word_forms_ids``, ``char_ids`` (an int64
        matrix of shape ``[num_tokens, max_word_length]``, zero padded) and
        ``labels_id``.
        """
        for tree in self.trees:
            tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False,
                                                                      pretrained_word_embedding)
            words = tree.word_forms
            # Width is at least 5 so it is never narrower than the character
            # CNN filter widths (3, 4, 5).
            max_word_length = max([len(w) for w in words] + [3, 4, 5])
            single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
            for i, word in enumerate(words):
                single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict)
            tree.char_ids = single_sample_char_tokens

            tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict)

        sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees]
        sample_char_ids = [tree.char_ids for tree in self.trees]
        sample_heads = [np.asarray(tree.heads) for tree in self.trees]
        sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees]
        self.sample_dep_data = list(zip(sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids))

    def get_syntactic_label_dict(self, syn_label_dict=None):
        """Append label ids to every tree's ``labels_id`` and return the dict.

        Args:
            syn_label_dict: Existing dictionary to use; it must have
                ``accept_new`` set to False. When ``None`` a fresh
                ``Dictionary`` is created.

        Returns:
            The dictionary that was used for the lookups.
        """
        if syn_label_dict is None:
            syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
        else:
            assert syn_label_dict.accept_new is False
        for tree in self.trees:
            for label in tree.labels:
                tree.labels_id.append(syn_label_dict.add(label))
        return syn_label_dict
110 |
111 |
def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings):
    """Read and tokenize the dependency trees in ``file_path``.

    Sentences are read with a length limit of 2000 and converted to ids with
    the supplied dictionaries.

    Returns:
        An ``OrderedDict`` mapping each sentence (its word forms joined by a
        single space, without the leading "Root" placeholder) to its tree.
        When two trees share the same sentence text, the later one wins.
    """
    conll = SyntacticCONLL()
    conll.read_from_file(file_path, max_sentence_length=2000)
    conll.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings)

    # word_forms[0] is the "Root" placeholder, so it is left out of the key.
    return OrderedDict((' '.join(parsed.word_forms[1:]), parsed) for parsed in conll.trees)
122 |
123 |
class SyntacticRepresentation(object):
    """Per-token vector representations loaded from a text file.

    Attributes:
        file_name: Path of the most recently read file.
        representations: One list per sentence, each holding one float32
            ``numpy`` vector per line of that sentence.
    """

    def __init__(self):
        self.file_name = ""
        self.representations = []

    def read_from_file(self, filename):
        """Load representations from ``filename`` into ``self.representations``.

        Each non-blank line has tab-separated fields; the second field is a
        space-separated list of floats. Blank lines separate sentences.
        """
        self.file_name = filename
        print("Reading lstm representations from {}".format(self.file_name))
        each_sentence_representations = []
        # open() raises on failure, and the context manager closes the handle
        # even if a malformed line makes parsing raise.
        with open(self.file_name, 'r') as representation_file:
            for line in representation_file:
                if line == '\n' or line == "\r\n":  # new sentence
                    self.representations.append(each_sentence_representations)
                    each_sentence_representations = []
                    continue
                fields = line.strip().split('\t')
                rep = np.asarray(fields[1].split(' '), dtype=np.float32)
                each_sentence_representations.append(rep)
        # Keep the final sentence when the file lacks a trailing blank line.
        if each_sentence_representations:
            self.representations.append(each_sentence_representations)
        print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations)))

    def minus_by_the_predicate(self, corpus_tensors):
        """Replace each vector by ``predicate_vector - vector``, in place.

        For every item in ``corpus_tensors`` the sentence id is read from
        ``data[0][0][0]`` and the predicate position is the argmax of
        ``data[0][2]``. Each sentence is processed only once (first
        occurrence). Index 0 of a sentence is left untouched.
        """
        processed_sentence_ids = set()
        for data in corpus_tensors:
            sentence_id = data[0][0][0]
            if sentence_id in processed_sentence_ids:
                continue
            processed_sentence_ids.add(sentence_id)

            predicate_id = data[0][2].argmax()
            sentence_reps = self.representations[sentence_id]
            # Snapshot the predicate vector first: the loop also rewrites the
            # predicate's own slot, and later tokens must still be compared
            # against the original vector rather than the zeroed one.
            predicate_rep = np.array(sentence_reps[predicate_id])
            for j in range(1, len(sentence_reps)):  # Root doesn't use.
                sentence_reps[j] = predicate_rep - sentence_reps[j]

    def check_math_corpus(self, lengths):
        """Verify every sentence has ``length + 1`` representations.

        The extra one accounts for the leading Root entry. On the first
        mismatch the details are printed and the process exits.

        Raises:
            SystemExit: If a sentence's representation count does not match.
        """
        for i, length in enumerate(lengths):
            actual = len(self.representations[i])
            if actual != length + 1:  # 1 means the first one, Root. Actually never use it.
                print(i, length, actual)
                print("sentence {} doesn't match: lstm representation {} vs corpus {}".format(i, actual, length))
                # Non-zero status: a mismatch is an error condition.
                raise SystemExit(1)
        print("LSTM representation match the corpus!")
170 |
--------------------------------------------------------------------------------