├── .DS_Store ├── README.md ├── exp-4.1-baseline ├── config.json ├── log.txt ├── model │ ├── char_dict │ ├── checkpoints.tsv │ ├── config │ ├── cons_label_dict │ ├── dep_label_dict │ ├── label_dict │ ├── pos_dict │ └── word_dict ├── predict.sh └── train.sh ├── figures ├── model.jpg └── model.pdf ├── scripts ├── convert_orl_conll_to_json.py ├── eval_averaged_metrics.py ├── eval_orl_conll_file.py ├── eval_orl_e2e_json_file.py ├── eval_orl_json_file.py └── generate_constituent_trees_from_benepar.py └── src ├── orl-4.1-ultimate-hard-e2e ├── __init__.py ├── analyze.py ├── neural_srl │ ├── TreeLSTM │ │ ├── Encoder.py │ │ ├── Tree.py │ │ ├── TreeGRU.py │ │ └── __init__.py │ ├── __init__.py │ ├── gcn_model │ │ ├── __init__.py │ │ ├── gcn.py │ │ ├── tree.py │ │ └── various_gcn.py │ ├── pytorch │ │ ├── HBiLSTM.py │ │ ├── HighWayLSTM.py │ │ ├── __init__.py │ │ ├── implicit_syntactic_representations.py │ │ ├── layer.py │ │ ├── model.py │ │ ├── pre_trained_language_model.py │ │ ├── tagger.py │ │ └── util.py │ └── shared │ │ ├── __init__.py │ │ ├── configuration.py │ │ ├── conll_utils.py │ │ ├── constants.py │ │ ├── constituent_extraction.py │ │ ├── constituent_reader.py │ │ ├── dictionary.py │ │ ├── evaluation.py │ │ ├── features.py │ │ ├── inference.py │ │ ├── inference_utils.py │ │ ├── io_utils.py │ │ ├── measurements.py │ │ ├── numpy_utils.py │ │ ├── reader.py │ │ ├── scores_pb2.py │ │ ├── srl_eval_utils.py │ │ ├── syntactic_extraction.py │ │ ├── tagger_data.py │ │ └── tensor_pb2.py ├── predict.py └── train.py └── orl-4.1 ├── __init__.py ├── analyze.py ├── neural_srl ├── TreeLSTM │ ├── Encoder.py │ ├── Tree.py │ ├── TreeGRU.py │ └── __init__.py ├── __init__.py ├── __pycache__ │ └── __init__.cpython-37.pyc ├── gcn_model │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── tree.cpython-37.pyc │ │ └── various_gcn.cpython-37.pyc │ ├── gcn.py │ ├── tree.py │ └── various_gcn.py ├── pytorch │ ├── HBiLSTM.py │ ├── HighWayLSTM.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── HighWayLSTM.cpython-37.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── implicit_syntactic_representations.cpython-37.pyc │ │ ├── layer.cpython-37.pyc │ │ ├── model.cpython-37.pyc │ │ ├── pre_trained_language_model.cpython-37.pyc │ │ ├── tagger.cpython-37.pyc │ │ └── util.cpython-37.pyc │ ├── implicit_syntactic_representations.py │ ├── layer.py │ ├── model.py │ ├── pre_trained_language_model.py │ ├── tagger.py │ └── util.py └── shared │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── configuration.cpython-37.pyc │ ├── conll_utils.cpython-37.pyc │ ├── constants.cpython-37.pyc │ ├── constituent_extraction.cpython-37.pyc │ ├── constituent_reader.cpython-37.pyc │ ├── dictionary.cpython-37.pyc │ ├── evaluation.cpython-37.pyc │ ├── inference_utils.cpython-37.pyc │ ├── measurements.cpython-37.pyc │ ├── reader.cpython-37.pyc │ ├── srl_eval_utils.cpython-37.pyc │ ├── syntactic_extraction.cpython-37.pyc │ └── tagger_data.cpython-37.pyc │ ├── configuration.py │ ├── conll_utils.py │ ├── constants.py │ ├── constituent_extraction.py │ ├── constituent_reader.py │ ├── dictionary.py │ ├── evaluation.py │ ├── features.py │ ├── inference.py │ ├── inference_utils.py │ ├── io_utils.py │ ├── measurements.py │ ├── numpy_utils.py │ ├── reader.py │ ├── scores_pb2.py │ ├── srl_eval_utils.py │ ├── syntactic_extraction.py │ ├── tagger_data.py │ └── tensor_pb2.py ├── predict.py └── train.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # opinion_mining_with_syn_cons 2 | This repositry contains our code, configurations, and model for our work on "A Unified Span-Based Approach for Opinion Mining with Syntactic Constituents", which is published on NAACL-2021. 3 | The src directory contains our code and the exp-4.1-baseline contains our experiment for "Baseline+BERT" (data0, the first data of the five fold cross-validation). 4 | 5 | ![model](https://github.com/KiroSummer/opinion_mining_with_syn_cons/blob/main/figures/model.jpg) 6 | 7 | ## Environment 8 | Python3, Pytorch, Transformers 2.1.1 (for BERT) 9 | 10 | ### Data 11 | MPQA2.0 [url](http://mpqa.cs.pitt.edu/corpora/mpqa_corpus/mpqa_corpus_2_0/) 12 | PTB and OntoNotes can be download from LDC. 13 | 14 | ### Training 15 | Please reset and check the files in the train.sh and config.json when you want to run the code. 16 | 17 | ``` 18 | sh train.sh GPU\_ID 19 | ``` 20 | 21 | ### Test 22 | To test the performance of the trained model, you should run the following script. 23 | 24 | ``` 25 | sh predict.sh GPU\_ID 26 | ``` 27 | We release the sample model of the "exp-4.1-baseline" on the Google Drive, [url](https://drive.google.com/file/d/17u8ofyaBThb66qYPZe-60A2lyEnWCNil/view?usp=sharing). 28 | Important, use the offline evaluation script to eval the output file. 29 | -------------------------------------------------------------------------------- /exp-4.1-baseline/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_train_length": 100, 3 | "batch_size" : 32, 4 | "subbatch_size": 1, 5 | "max_tokens_per_batch" : 700, 6 | "features" : ["predicate"], 7 | "feature_sizes": [100], 8 | "dev_batch_size": 40, 9 | 10 | "use_bert": true, 11 | "bert_vocab_path": "bert-base-cased", 12 | "bert_path": "bert-base-cased", 13 | "bert_dim": 768, 14 | 15 | "mtl_cons": false, 16 | "use_cons_labels": false, 17 | 18 | "use_cons_gcn": false, 19 | 20 | "mtl_dep": false, 21 | "dep_prune_ratio": 0.8, 22 | "dep_num_lstm_layers": 3, 23 | "mlp_arc_size": 500, 24 | "mlp_rel_size": 100, 25 | "dropout_mlp": 0.33, 26 | 27 | "use_dep_gcn": false, 28 | "gcn_dep_num_layers": 2, 29 | 30 | "joint": true, 31 | "mtl": false, 32 | "analyze": false, 33 | 34 | "learning_rate":0.001, 35 | "input_dropout_prob":0.0, 36 | "feature_dropout": 0.5, 37 | "lexical_dropout" : 0.5, 38 | "dropout" : 0.3, 39 | "recurrent_dropout_prob":0.4, 40 | "mlp_dropout_prob": 0.2, 41 | "max_grad_norm": 5.0, 42 | "weight_decay": 1e-7, 43 | "decay_steps": 50, 44 | "fl_alpha": 1.0, 45 | "fl_gamma": 3.0, 46 | "pruning_by_arg_prob": false, 47 | "arg_boundary_prob_threshold": 0.0, 48 | "pruning_by_three_threshold": false, 49 | "arg_three_p_boundary_prob_threshold": 0.02, 50 | "neg_threshold": 80, 51 | 52 | "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt", 53 | "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt", 54 | "char_emb_size" : 8, 55 | "pos_emb_size" : 100, 56 | "cons_label_dim": 100, 57 | "span_width_feature_size" : 20, 58 | "num_attention_heads" : 1, 59 | "kernel_sizes" : [3, 4, 5], 60 | "output_channel" : 50, 61 | "argument_ratio" : 0.8, 62 | "predicate_ratio" : 0.4, 63 | "linear_projection_size" : 400, 64 | "cons_num_lstm_layers": 3, 65 | "num_lstm_layers" : 2, 66 | "lstm_hidden_size": 300, 67 | "max_arg_width" : 60, 68 | "lstm_cell":"highway", 69 | "mlp_label_size":100, 70 | "per_layer_dropout":true, 71 | 72 | "gcn_rnn": true, 73 | "gcn_rnn_hidden": 200, 74 | "gcn_rnn_layers": 1, 75 | "gcn_rnn_dropout": 0.4, 76 | "gcn_hidden_dim": 300, 77 | "gcn_num_layers": 3, 78 | "gcn_drop": 0.3, 79 | 80 | "pred_size": 300, 81 | "arg_start_size": 200, 82 | "arg_end_size": 200, 83 | "argu_size": 300, 84 | "argu_size_u": 400, 85 | "num_attention_heads" : 1, 86 | "ffnn_size" : 150, 87 | "ffnn_depth" : 1, 88 | 89 | "trainer" : "Adadelta", 90 | "max_epochs": 500, 91 | "checkpoint_every_x_epochs": 1, 92 | 93 | "enforce_srl_constraint": false, 94 | "use_gold_predicates": true, 95 | "use_gold_arguments": false 96 | } 97 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/char_dict: -------------------------------------------------------------------------------- 1 | *PAD* 2 | *UNKNOWN* 3 | @ 4 | p 5 | Y 6 | s 7 | : 8 | P 9 | a 10 | _ 11 | 4 12 | Z 13 | } 14 | o 15 | + 16 | w 17 | r 18 | 8 19 | # 20 | 0 21 | h 22 | R 23 | E 24 | g 25 | 2 26 | x 27 | U 28 | $ 29 | d 30 | [ 31 | ? 32 | F 33 | X 34 | \ 35 | 7 36 | Q 37 | 9 38 | ' 39 | z 40 | e 41 | t 42 | 3 43 | c 44 | " 45 | v 46 | ˙ 47 | k 48 | ò 49 | * 50 | m 51 | , 52 | % 53 | S 54 | ` 55 | K 56 | A 57 | - 58 | . 59 | q 60 | L 61 | B 62 | J 63 | Ì 64 | j 65 | < 66 | i 67 | 1 68 | ö 69 | ’ 70 | N 71 | & 72 | ] 73 | 5 74 | H 75 | T 76 | Û 77 | b 78 | y 79 | ; 80 | G 81 | V 82 | f 83 | ! 84 | > 85 | / 86 | O 87 | W 88 | D 89 | u 90 | { 91 | M 92 | = 93 | 6 94 | l 95 | n 96 | C 97 | I 98 | Ê 99 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/config: -------------------------------------------------------------------------------- 1 | { 2 | "max_train_length": 100, 3 | "batch_size" : 32, 4 | "subbatch_size": 1, 5 | "max_tokens_per_batch" : 700, 6 | "features" : ["predicate"], 7 | "feature_sizes": [100], 8 | "dev_batch_size": 40, 9 | 10 | "use_bert": true, 11 | "bert_vocab_path": "bert-base-cased", 12 | "bert_path": "bert-base-cased", 13 | "bert_dim": 768, 14 | 15 | "mtl_cons": false, 16 | "use_cons_labels": false, 17 | 18 | "use_cons_gcn": false, 19 | 20 | "mtl_dep": false, 21 | "dep_prune_ratio": 0.8, 22 | "dep_num_lstm_layers": 3, 23 | "mlp_arc_size": 500, 24 | "mlp_rel_size": 100, 25 | "dropout_mlp": 0.33, 26 | 27 | "use_dep_gcn": false, 28 | "gcn_dep_num_layers": 2, 29 | 30 | "joint": true, 31 | "mtl": false, 32 | "analyze": false, 33 | 34 | "learning_rate":0.001, 35 | "input_dropout_prob":0.0, 36 | "feature_dropout": 0.5, 37 | "lexical_dropout" : 0.5, 38 | "dropout" : 0.3, 39 | "recurrent_dropout_prob":0.4, 40 | "mlp_dropout_prob": 0.2, 41 | "max_grad_norm": 5.0, 42 | "weight_decay": 1e-7, 43 | "decay_steps": 50, 44 | "fl_alpha": 1.0, 45 | "fl_gamma": 3.0, 46 | "pruning_by_arg_prob": false, 47 | "arg_boundary_prob_threshold": 0.0, 48 | "pruning_by_three_threshold": false, 49 | "arg_three_p_boundary_prob_threshold": 0.02, 50 | "neg_threshold": 80, 51 | 52 | "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt", 53 | "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt", 54 | "char_emb_size" : 8, 55 | "pos_emb_size" : 100, 56 | "cons_label_dim": 100, 57 | "span_width_feature_size" : 20, 58 | "num_attention_heads" : 1, 59 | "kernel_sizes" : [3, 4, 5], 60 | "output_channel" : 50, 61 | "argument_ratio" : 0.8, 62 | "predicate_ratio" : 0.4, 63 | "linear_projection_size" : 400, 64 | "cons_num_lstm_layers": 3, 65 | "num_lstm_layers" : 2, 66 | "lstm_hidden_size": 300, 67 | "max_arg_width" : 60, 68 | "lstm_cell":"highway", 69 | "mlp_label_size":100, 70 | "per_layer_dropout":true, 71 | 72 | "gcn_rnn": true, 73 | "gcn_rnn_hidden": 200, 74 | "gcn_rnn_layers": 1, 75 | "gcn_rnn_dropout": 0.4, 76 | "gcn_hidden_dim": 300, 77 | "gcn_num_layers": 3, 78 | "gcn_drop": 0.3, 79 | 80 | "pred_size": 300, 81 | "arg_start_size": 200, 82 | "arg_end_size": 200, 83 | "argu_size": 300, 84 | "argu_size_u": 400, 85 | "num_attention_heads" : 1, 86 | "ffnn_size" : 150, 87 | "ffnn_depth" : 1, 88 | 89 | "trainer" : "Adadelta", 90 | "max_epochs": 500, 91 | "checkpoint_every_x_epochs": 1, 92 | 93 | "enforce_srl_constraint": false, 94 | "use_gold_predicates": true, 95 | "use_gold_arguments": false 96 | } 97 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/cons_label_dict: -------------------------------------------------------------------------------- 1 | O 2 | WHNP 3 | NP 4 | PP 5 | SBARQ 6 | ADVP 7 | VP 8 | ADJP 9 | NML 10 | SINV 11 | PRT 12 | WHADVP 13 | SBAR 14 | INTJ 15 | SQ 16 | QP 17 | CONJP 18 | UCP 19 | X 20 | FRAG 21 | PRN 22 | WHPP 23 | WHADJP 24 | LST 25 | NAC 26 | RRC 27 | META 28 | NX 29 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/dep_label_dict: -------------------------------------------------------------------------------- 1 | Root 2 | prep 3 | det 4 | nn 5 | num 6 | pobj 7 | punct 8 | poss 9 | possessive 10 | amod 11 | nsubj 12 | dep 13 | dobj 14 | cc 15 | conj 16 | nsubjpass 17 | partmod 18 | auxpass 19 | advmod 20 | root 21 | ccomp 22 | aux 23 | cop 24 | xcomp 25 | quantmod 26 | tmod 27 | appos 28 | npadvmod 29 | neg 30 | infmod 31 | rcmod 32 | pcomp 33 | mark 34 | advcl 35 | predet 36 | mwe 37 | parataxis 38 | number 39 | acomp 40 | prt 41 | iobj 42 | expl 43 | csubj 44 | preconj 45 | discourse 46 | csubjpass 47 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/label_dict: -------------------------------------------------------------------------------- 1 | O 2 | AGENT 3 | TARGET 4 | -------------------------------------------------------------------------------- /exp-4.1-baseline/model/pos_dict: -------------------------------------------------------------------------------- 1 | *PAD* 2 | DT 3 | NNP 4 | JJ 5 | NN 6 | VBD 7 | PRP 8 | MD 9 | RB 10 | VB 11 | IN 12 | CD 13 | PRP$ 14 | NNS 15 | . 16 | : 17 | VBG 18 | VBN 19 | TO 20 | `` 21 | VBZ 22 | , 23 | VBP 24 | JJR 25 | '' 26 | -LRB- 27 | -RRB- 28 | POS 29 | FW 30 | CC 31 | WP 32 | $ 33 | RP 34 | WDT 35 | EX 36 | RBS 37 | WRB 38 | NNPS 39 | UH 40 | RBR 41 | JJS 42 | PDT 43 | WP$ 44 | LS 45 | SYM 46 | # 47 | -------------------------------------------------------------------------------- /exp-4.1-baseline/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PATH=/usr/local/cuda/bin:$PATH 3 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib 4 | 5 | MODEL_PATH="./model" 6 | 7 | #INPUT_PATH="../data/aaai19srl.train0.conll.srl.json" 8 | #OUTPUT_PATH="../temp/orl.train0.out" 9 | 10 | INPUT_PATH="../data/aaai19srl.dev0.conll.json" 11 | GOLD_PATH="../data/conll_format/aaai19srl.dev0.conll" 12 | OUTPUT_PATH="../temp/orl.devel0.out" 13 | 14 | INPUT_PATH="../data/aaai19srl.test0.conll.json" 15 | GOLD_PATH="../data/conll_format/aaai19srl.test0.conll" 16 | OUTPUT_PATH="../temp/orl.test0.out" 17 | 18 | ORL_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt" 19 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt" 20 | 21 | CUDA_VISIBLE_DEVICES=$1 python3 ../src/orl-4.1/predict.py \ 22 | --span="span" \ 23 | --model="$MODEL_PATH" \ 24 | --input="$INPUT_PATH" \ 25 | --gold="$GOLD_PATH" \ 26 | --orl_cons=$ORL_CONS \ 27 | --auto_dep_trees=$SYS_DEP \ 28 | --output="$OUTPUT_PATH" \ 29 | --gpu=$1 30 | 31 | -------------------------------------------------------------------------------- /exp-4.1-baseline/train.sh: -------------------------------------------------------------------------------- 1 | export PATH=/usr/local/cuda/bin:$PATH 2 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib 3 | 4 | CONFIG="config.json" 5 | MODEL="model" 6 | 7 | TRAIN_PATH="../data/aaai19srl.train0.conll.json" 8 | #TRAIN_PATH="../data/aaai19srl.dev0.conll.json" 9 | DEV_PATH="../data/aaai19srl.dev0.conll.json" 10 | GOLD_PATH="../data/english/srl/conll05/conll05.devel.props.gold.txt" 11 | 12 | CONS_PATH="../data/constituent_conll12/ontonote5.0.train.constituents.json" 13 | DEP_TREES="/data2/qrxia/SRL-w-Heterogenous-Dep/data/english/dependency/ptb_from_baidu_from_n171/ptb.english.conll.train.txt.opentest.tag.projective" 14 | 15 | SYS_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt" 16 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt" 17 | 18 | gpu_id=$1 19 | CUDA_VISIBLE_DEVICES=$gpu_id python3 ../src/orl-4.1/train.py \ 20 | --info="orl baseline bert" \ 21 | --config=$CONFIG \ 22 | --span="span" \ 23 | --model=$MODEL \ 24 | --train=$TRAIN_PATH \ 25 | --dev=$DEV_PATH \ 26 | --gold=$GOLD_PATH \ 27 | --cons_trees=$CONS_PATH \ 28 | --dep_trees=$DEP_TREES \ 29 | --auto_cons_trees=$SYS_CONS \ 30 | --auto_dep_trees=$SYS_DEP \ 31 | --gpu=$1 32 | -------------------------------------------------------------------------------- /figures/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.jpg -------------------------------------------------------------------------------- /figures/model.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.pdf -------------------------------------------------------------------------------- /scripts/convert_orl_conll_to_json.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | from collections import OrderedDict 4 | 5 | 6 | DSE="DSE" 7 | TARGET="TARGET" 8 | AGENT="AGENT" 9 | 10 | max_dse_length, max_target_length, max_agent_length = 0, 0, 0 11 | 12 | 13 | class orl_data(): 14 | def __init__(self, tuples): 15 | self.idx = [] 16 | self.words = [] 17 | self.labels = [] 18 | self.des = [] 19 | self.des_head = [] 20 | self.target = [] 21 | self.agent = [] 22 | self.orl = [] 23 | self.init_by_typles(tuples) 24 | 25 | def output_to_srl_json(self): 26 | srl_span = [] 27 | for span in self.orl: 28 | s, e, a_s, a_e, label = span 29 | if label == "DSE": 30 | continue 31 | count = 0 32 | for des_head in self.des_head: 33 | if s <= des_head <= e: 34 | srl_span.append([des_head, a_s, a_e, label]) 35 | count += 1 36 | if count != 1: 37 | print(self.words, self.des_head) 38 | # assert count == 1 39 | for des_head in self.des_head: 40 | srl_span.append([des_head, des_head, des_head, "V"]) 41 | output = { 42 | "speakers": [["-"] * len(self.words)], 43 | "doc_key": "S0", 44 | "sentences": [self.words], 45 | "srl": [srl_span], 46 | "constituents": [[]], 47 | "clusters": [], 48 | "ner": [[]] 49 | 50 | } 51 | return output 52 | 53 | def output_to_json(self): 54 | output = { 55 | "sentences": self.words, 56 | "orl": self.orl 57 | } 58 | return output 59 | 60 | def a_complete_span(self, des, span, label): 61 | # print(des, span, label) 62 | t = (des + span) 63 | t.append(label) 64 | # print(t) 65 | assert len(t) == 5 66 | if t[-1] == DSE: 67 | global max_dse_length 68 | dse_length = t[1] - t[0] 69 | max_dse_length = max_dse_length if max_dse_length > dse_length else dse_length 70 | self.des.append(t) 71 | elif t[-1] == TARGET: 72 | global max_target_length 73 | target_length = t[3] - t[2] 74 | max_target_length = max_target_length if max_target_length > target_length else target_length 75 | self.target.append(t) 76 | else: 77 | assert t[-1] == AGENT 78 | global max_agent_length 79 | agent_length = t[3] - t[2] 80 | max_agent_length = max_agent_length if max_agent_length > agent_length else agent_length 81 | self.agent.append(t) 82 | self.orl.append(t) 83 | 84 | @staticmethod 85 | def compose_a_span(des, spans, labels, span, l): 86 | assert len(span) == 2 and l != "" 87 | if l == DSE: 88 | assert len(des) == 0 89 | des = span 90 | # print("DES!!!!") 91 | spans.append(des) 92 | labels.append(l) 93 | else: 94 | spans.append(span) 95 | labels.append(l) 96 | return des 97 | 98 | def init_by_typles(self, tuples): 99 | # print(tuples) 100 | self.idx, self.words, self.labels = tuples[0], tuples[1], tuples[2:] 101 | for expression_aware_label in self.labels: 102 | des, spans, labels = [], [], [] 103 | span, l = [], '' 104 | # print(self.label) 105 | for i, label in enumerate(expression_aware_label): 106 | if label.endswith("-*"): # we do n't need the * that marks the ``head word'' 107 | label = label[:-2] 108 | self.des_head.append(i) 109 | if label == "S-DSE": 110 | self.des_head.append(i) 111 | 112 | if label.startswith("B"): 113 | assert len(span) == 0 114 | span.append(i) 115 | l = label[2:] 116 | elif label.startswith("M"): 117 | assert l == label[2:] 118 | elif label.startswith("E"): 119 | assert l == label[2:] 120 | span.append(i) 121 | des = orl_data.compose_a_span(des, spans, labels, span, l) 122 | span, l = [], '' 123 | elif label.startswith("S"): 124 | span = [i, i] 125 | l = label[2:] 126 | # print("label", l) 127 | des = orl_data.compose_a_span(des, spans, labels, span, l) 128 | # print("XXX", des) 129 | span, l = [], '' 130 | else: 131 | assert label == 'O' 132 | 133 | assert len(spans) == len(labels) 134 | for s, l in list(zip(spans, labels)): 135 | self.a_complete_span(des, s, l) 136 | 137 | def write_to_json(self): 138 | pass 139 | 140 | 141 | if __name__ == "__main__": 142 | input_filepath = sys.argv[1] 143 | 144 | input_data = OrderedDict() 145 | original_sentence_number, unique_sentence_number = 0, 0 146 | duplicate_sentence_number = 0 147 | duplicate_sentence_label_number = 0 148 | with open(input_filepath, 'r') as input_orl_file: 149 | sentence = [] 150 | for line in input_orl_file.readlines(): 151 | if line.strip() == "": 152 | original_sentence_number += 1 153 | tuples = list(zip(*sentence)) 154 | sen = ' '.join(tuples[1]) 155 | # print(sen) 156 | if len(input_data) != 0: 157 | if sen in input_data.keys(): # if it is the same sentence 158 | # print("xx") 159 | if tuples[-1] not in input_data[sen][2:]: 160 | input_data[sen].append(tuples[-1]) 161 | duplicate_sentence_number += 1 162 | else: 163 | print(tuples[-1], "already in previous sample", input_data[sen]) 164 | duplicate_sentence_label_number += 1 165 | else: 166 | input_data[sen] = tuples 167 | unique_sentence_number += 1 168 | else: 169 | input_data[sen] = tuples 170 | unique_sentence_number += 1 171 | sentence = [] 172 | continue 173 | tokens = line.strip().split() 174 | # print(tokens) 175 | sentence.append(tokens) 176 | # check for sentences appear more than once 177 | assert original_sentence_number == unique_sentence_number + duplicate_sentence_number +\ 178 | duplicate_sentence_label_number 179 | print("original sentence number:", original_sentence_number) 180 | print("unique_sentence_number:", unique_sentence_number) 181 | print("duplicate_sentence_number:", duplicate_sentence_number) 182 | print("duplicate_sentence_label_number", duplicate_sentence_label_number) 183 | # generate_chars 184 | # with open(input_filepath + ".char.txt", 'w') as char_file: 185 | # char = set() 186 | # for sen in input_data.keys(): 187 | # words = sen.strip().split() 188 | # for word in words: 189 | # for c in word: 190 | # char.add(c) 191 | # for c in char: 192 | # char_file.write(c + '\n') 193 | 194 | sentences = set() 195 | for data in input_data.keys(): 196 | sen = ' '.join(data) 197 | if sen not in sentences: 198 | sentences.add(sen) 199 | else: 200 | print(sen, "already appears!") 201 | # pass 202 | # generate orl data 203 | orl_dataset = [] 204 | for data in input_data.keys(): 205 | orl_dataset.append(orl_data(input_data[data])) 206 | # global max_dse_length 207 | # global max_target_length 208 | # global max_agent_length 209 | print("max_dse_length", max_dse_length, "max_target_length", max_target_length, 210 | "max_agent_length", max_agent_length) 211 | # output to json 212 | json_filename = input_filepath + '.json' 213 | with open(json_filename, 'w') as output_json: 214 | for orl in orl_dataset: 215 | # print(orl.output_to_json()) 216 | output_json.write(json.dumps(orl.output_to_json()) + '\n') 217 | 218 | 219 | -------------------------------------------------------------------------------- /scripts/eval_averaged_metrics.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # from eval_orl_conll_file import load_eval_data, analyze_error_prediction_matrix 3 | from eval_orl_e2e_json_file import load_eval_data, analyze_error_prediction_matrix 4 | 5 | 6 | def average_fscore(all_metrics): 7 | zipped_metrics = list(zip(*all_metrics)) 8 | 9 | def avg(metrics): 10 | return sum([item.f for item in metrics]) / len(metrics) 11 | 12 | print('='*10, "Binary F1", '='*10) 13 | print("Agent", avg(zipped_metrics[0])) 14 | print("Target", avg(zipped_metrics[1])) 15 | print("Agent", avg(zipped_metrics[2])) 16 | 17 | print('='*10, "Proportional F1", '='*10) 18 | print("Agent", avg(zipped_metrics[3])) 19 | print("Target", avg(zipped_metrics[4])) 20 | print("Agent", avg(zipped_metrics[5])) 21 | 22 | print('='*10, "Exact F1", '='*10) 23 | print("Agent", avg(zipped_metrics[6])) 24 | print("Target", avg(zipped_metrics[7])) 25 | print("Agent", avg(zipped_metrics[8])) 26 | 27 | print('=' * 10, "Expression F1", '=' * 10) 28 | print("Binary", avg(zipped_metrics[9])) 29 | print("Proportional", avg(zipped_metrics[10])) 30 | print("Exact", avg(zipped_metrics[11])) 31 | 32 | 33 | if __name__ == "__main__": 34 | averaged_metric = [] 35 | assert len(sys.argv[1:]) == 5 36 | for file_path in sys.argv[1:]: 37 | result = load_eval_data(file_path) 38 | 39 | x = analyze_error_prediction_matrix(result) 40 | (agent_binary, target_binary, all_binary), \ 41 | (agent_proportional, target_proportional, all_proportional), \ 42 | (agent_exact, target_exact, all_exact), \ 43 | (exp_binary, exp_proportional, exp_exact) = x 44 | 45 | averaged_metric.append([agent_binary, target_binary, all_binary, 46 | agent_proportional, target_proportional, all_proportional, 47 | agent_exact, target_exact, all_exact, 48 | exp_binary, exp_proportional, exp_exact]) 49 | average_fscore(averaged_metric) 50 | 51 | -------------------------------------------------------------------------------- /scripts/eval_orl_json_file.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from collections import OrderedDict, Counter 5 | 6 | 7 | AGENT="AGENT" 8 | TARGET="TARGET" 9 | 10 | 11 | class Sample(): 12 | def __init__(self, obj): 13 | self.sentence = obj["sentence"] 14 | self.gold_orl = obj['gold_orl'] 15 | self.sys_orl = obj["sys_orl"] 16 | # self.sys_argus_constituents = obj['sys_argus_constituents'] 17 | # self.constituent = obj["constituent"] 18 | 19 | 20 | def load_eval_data(eval_path): 21 | eval_data = [] 22 | with open(eval_path, 'r') as f: 23 | eval_data = [Sample(json.loads(jsonline)) for jsonline in f.readlines()] 24 | print("Loaded {} eval examples.".format(len(eval_data))) 25 | return eval_data 26 | 27 | 28 | class EvalMetric(): 29 | def __init__(self, name="None"): 30 | self.name = name 31 | self.matched, self.sys, self.gold = 0, 0, 0 32 | self.p = self.r = self.f = 0.0 33 | 34 | def compute_prf(self): 35 | try: 36 | self.p = 100.0 * self.matched / self.sys 37 | except: 38 | self.p = 0.0 39 | try: 40 | self.r = 100.0 * self.matched / self.gold 41 | except: 42 | self.r = 0.0 43 | try: 44 | self.f = 2.0 * self.p * self.r / (self.p + self.r) 45 | except: 46 | self.f = 0.0 47 | print("="*5, self.name, "="*5) 48 | print("Precision:", self.matched, '/', self.sys, '=', self.p) 49 | print("Recall:", self.matched, '/', self.gold, '=', self.r) 50 | print("F1 score:", self.f) 51 | 52 | 53 | def analyze_error_prediction_matrix(samples): 54 | agent_binary, target_binary, all_binary = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All") 55 | agent_proportional, target_proportional, all_proportional = \ 56 | EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All") 57 | agent_exact, target_exact, all_exact = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All") 58 | for sample in samples: 59 | gold_orl, sys_orl = sample.gold_orl, sample.sys_orl 60 | # dict s-e: label 61 | dict_gold_orl, dict_sys_orl = OrderedDict(), OrderedDict() 62 | for g_orl in gold_orl: # construct the expression-argument tuples 63 | dse_s, dse_e, s, e, label = g_orl 64 | expression = str(dse_s) + '-' + str(dse_e) 65 | argument = (s, e, label) 66 | if expression not in dict_gold_orl: 67 | dict_gold_orl[expression] = [] 68 | dict_gold_orl[expression].append(argument) 69 | else: 70 | dict_gold_orl[expression].append(argument) 71 | for s_orl in sys_orl: # construct the expression-argument tuples 72 | dse_s, dse_e, s, e, label = s_orl 73 | expression = str(dse_s) + '-' + str(dse_e) 74 | argument = (s, e, label) 75 | if expression not in dict_sys_orl: 76 | dict_sys_orl[expression] = [] 77 | dict_sys_orl[expression].append(argument) 78 | else: 79 | dict_sys_orl[expression].append(argument) 80 | 81 | for expression in dict_gold_orl: # compute the gold 82 | for argument in dict_gold_orl[expression]: 83 | s, e, label = argument 84 | all_binary.gold += 1 85 | all_proportional.gold += 1 86 | all_exact.gold += 1 87 | if label == AGENT: 88 | agent_binary.gold += 1 89 | agent_proportional.gold += 1 90 | agent_exact.gold += 1 91 | else: 92 | assert label == TARGET 93 | target_binary.gold += 1 94 | target_proportional.gold += 1 95 | target_exact.gold += 1 96 | 97 | for expression in dict_sys_orl: # compute the sys 98 | for argument in dict_sys_orl[expression]: 99 | s, e, label = argument 100 | all_binary.sys += 1 101 | all_proportional.sys += 1 102 | all_exact.sys += 1 103 | if label == AGENT: 104 | agent_binary.sys += 1 105 | agent_proportional.sys += 1 106 | agent_exact.sys += 1 107 | else: 108 | assert label == TARGET 109 | target_binary.sys += 1 110 | target_proportional.sys += 1 111 | target_exact.sys += 1 112 | 113 | for expression in dict_sys_orl: # compute the sys 114 | if expression not in dict_gold_orl: # debug: some gold orl has no argument, only expression 115 | # print(sample.sentence) 116 | continue 117 | gold_arguments = dict_gold_orl[expression] 118 | for argument in dict_sys_orl[expression]: 119 | s, e, label = argument 120 | if argument in gold_arguments: # exact 121 | all_binary.matched += 1 122 | all_proportional.matched += 1 123 | all_exact.matched += 1 124 | if label == AGENT: 125 | agent_binary.matched += 1 126 | agent_proportional.matched += 1 127 | agent_exact.matched += 1 128 | else: 129 | assert label == TARGET 130 | target_binary.matched += 1 131 | target_proportional.matched += 1 132 | target_exact.matched += 1 133 | else: 134 | # binary 135 | find = False 136 | for index in range(s, e + 1): 137 | for gold_arg in gold_arguments: 138 | g_s, g_e, g_label = gold_arg 139 | if g_label == label: 140 | if g_s <= index <= g_e: 141 | all_binary.matched += 1 142 | if label == AGENT: 143 | agent_binary.matched += 1 144 | else: 145 | target_binary.matched += 1 146 | find = True 147 | break 148 | if find is True: 149 | break 150 | # proportional 151 | list_of_proportional = [] 152 | for gold_argument in dict_gold_orl[expression]: 153 | g_s, g_e, g_label = gold_argument 154 | matched_positions = 0 155 | if label != g_label: 156 | pass 157 | else: 158 | for position in range(g_s, g_e + 1): 159 | if s <= position <= e: 160 | matched_positions += 1 161 | list_of_proportional.append(1.0 * matched_positions / (g_e - g_s + 1)) 162 | if len(list_of_proportional) > 0: # matched a gold argument 163 | all_proportional.matched += max(list_of_proportional) 164 | if label == AGENT: 165 | agent_proportional.matched += max(list_of_proportional) 166 | else: 167 | target_proportional.matched += max(list_of_proportional) 168 | 169 | print("="*15, 'Binary Metric', "="*15) 170 | agent_binary.compute_prf() 171 | target_binary.compute_prf() 172 | all_binary.compute_prf() 173 | 174 | print("="*15, 'Proportional Metric', "="*15) 175 | agent_proportional.compute_prf() 176 | target_proportional.compute_prf() 177 | all_proportional.compute_prf() 178 | 179 | print("="*15, 'Exact Metric', "="*15) 180 | agent_exact.compute_prf() 181 | target_exact.compute_prf() 182 | all_exact.compute_prf() 183 | 184 | 185 | if __name__ == "__main__": 186 | input_file_path = sys.argv[1] 187 | data = load_eval_data(input_file_path) 188 | analyze_error_prediction_matrix(data) 189 | -------------------------------------------------------------------------------- /scripts/generate_constituent_trees_from_benepar.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import sys 3 | import benepar 4 | 5 | 6 | if __name__ == "__main__": 7 | filepath = sys.argv[1] 8 | sentences = [] 9 | with open(filepath, 'r') as input_file: 10 | for line in input_file.readlines(): 11 | sentence = line.strip() 12 | words = sentence.split(' ') 13 | sentences.append(words) 14 | 15 | parser = benepar.Parser("benepar_en2") 16 | constituent_trees = [] 17 | for sentence in sentences: 18 | tree = parser.parse(sentence) 19 | constituent_trees.append(tree) 20 | 21 | with open(filepath + '.constituent.txt', 'w') as output_file: 22 | for t in constituent_trees: 23 | output_file.write(str(t) + '\n' + '\n') 24 | 25 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["neural_srl"] -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.nn.utils.rnn import pack_padded_sequence as pack 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 6 | from .TreeGRU import DTTreeGRU, TDTreeGRU 7 | from .Tree import creatTree 8 | 9 | 10 | class EncoderRNN(nn.Module): 11 | """ The standard RNN encoder. 12 | """ 13 | def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1): 14 | super(EncoderRNN, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.num_layers = num_layers 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | self.rnn = nn.GRU(input_size=input_size, 20 | hidden_size=hidden_size, 21 | num_layers=num_layers, 22 | bidirectional=True) # batch_first = False 23 | self.transform = nn.Linear(in_features=2 * hidden_size, 24 | out_features=input_size, 25 | bias=True) 26 | self.dt_tree = DTTreeGRU(input_size, hidden_size) 27 | self.td_tree = TDTreeGRU(input_size, hidden_size) 28 | 29 | def forward(self, input, heads, lengths=None, hidden=None): 30 | """ See EncoderBase.forward() for description of args and returns. 31 | inputs: [L, B, H], including the -ROOT- 32 | heads: [heads] * B 33 | """ 34 | emb = self.dropout(input) 35 | 36 | packed_emb = emb 37 | if lengths is not None: 38 | # Lengths data is wrapped inside a Variable. 39 | packed_emb = pack(emb, lengths) 40 | 41 | outputs, hidden_t = self.rnn(packed_emb, hidden) 42 | 43 | if lengths is not None: 44 | outputs = unpack(outputs)[0] 45 | 46 | outputs = self.dropout(self.transform(outputs)) 47 | max_length, batch_size, input_dim = outputs.size() 48 | trees = [] 49 | indexes = np.full((max_length, batch_size), -1, 50 | dtype=np.int32) # a col is a sentence 51 | for b, head in enumerate(heads): 52 | root, tree = creatTree( 53 | head) # head: a sentence's heads; sentence base 54 | root.traverse() # traverse the tree 55 | for step, index in enumerate(root.order): 56 | indexes[step, b] = index 57 | trees.append(tree) 58 | 59 | dt_outputs, dt_hidden_ts = self.dt_tree.forward( 60 | outputs, indexes, trees) 61 | td_outputs, td_hidden_ts = self.td_tree.forward( 62 | outputs, indexes, trees) 63 | 64 | outputs = torch.cat([dt_outputs, td_outputs], dim=2).transpose(0, 1) 65 | output_t = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0) 66 | 67 | return outputs, output_t 68 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Tree.py: -------------------------------------------------------------------------------- 1 | class Tree(object): 2 | def __init__(self, index): 3 | self.parent = None 4 | self.is_left = False 5 | self.index = index 6 | self.left_children = list() 7 | self.left_num = 0 8 | self.right_children = list() 9 | self.right_num = 0 10 | self._depth = -1 11 | self.order = [] 12 | 13 | def add_left(self, child): 14 | """ 15 | :param child: a Tree object represent the child 16 | :return: 17 | """ 18 | child.parent = self 19 | child.is_left = True 20 | self.left_children.append(child) 21 | self.left_num += 1 22 | 23 | def add_right(self, child): 24 | """ 25 | :param child: a Tree object represent the child 26 | :return: 27 | """ 28 | child.parent = self 29 | child.is_left = False 30 | self.right_children.append(child) 31 | self.right_num += 1 32 | 33 | def size(self): # compute the total size of the Tree 34 | if hasattr(self, '_size'): 35 | return self._size 36 | count = 1 37 | for i in range(self.left_num): 38 | count += self.left_children[i].size() 39 | for i in range(self.right_num): 40 | count += self.right_children[i].size() 41 | self._size = count 42 | return self._size 43 | 44 | def depth(self): # compute the depth of the Tree 45 | if self._depth > 0: 46 | return self._depth 47 | count = 0 48 | if self.left_num + self.right_num > 0: 49 | for i in range(self.left_num): 50 | child_depth = self.left_children[i].depth() 51 | if child_depth > count: 52 | count = child_depth 53 | for i in range(self.right_num): 54 | child_depth = self.right_children[i].depth() 55 | if child_depth > count: 56 | count = child_depth 57 | count += 1 58 | self._depth = count 59 | return self._depth 60 | 61 | def traverse(self): # traverse the Tree 62 | if len(self.order) > 0: 63 | return self.order 64 | 65 | for i in range(self.left_num): 66 | left_order = self.left_children[i].traverse() 67 | self.order.extend(left_order) 68 | for i in range(self.right_num): 69 | right_order = self.right_children[i].traverse() 70 | self.order.extend(right_order) 71 | self.order.append(self.index) # append the root 72 | return self.order 73 | 74 | 75 | def creatTree(heads): 76 | tree = [] 77 | # current sentence has already been numberized [form, head, rel] 78 | root = None 79 | for idx, head in enumerate(heads): 80 | tree.append(Tree(idx)) 81 | 82 | for idx, head in enumerate(heads): 83 | if head == -1: # -1 mszhang, 0 kiro 84 | root = tree[idx] 85 | continue 86 | if head < 0: 87 | print('error: multi roots') 88 | if head > idx: 89 | tree[head].add_left(tree[idx]) 90 | if head < idx: 91 | tree[head].add_right(tree[idx]) 92 | if head == idx: 93 | print('error: head is it self.') 94 | 95 | return root, tree 96 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["Encoder", "Tree", "TreeGRU"] 2 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/gcn.py: -------------------------------------------------------------------------------- 1 | """ 2 | GCN model for relation extraction. 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | from ..shared.constants import PAD_ID 10 | import numpy as np 11 | 12 | 13 | class GCN(nn.Module): 14 | def __init__(self, config, input_dim, mem_dim, num_layers): 15 | super(GCN, self).__init__() 16 | self.config = config 17 | self.input_dim = input_dim 18 | self.mem_dim = mem_dim 19 | self.layers = num_layers 20 | 21 | # rnn layer 22 | if self.config.gcn_rnn is True: 23 | input_size = self.input_dim 24 | self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True, 25 | dropout=self.config.gcn_rnn_dropout, bidirectional=True) 26 | self.in_dim = self.config.gcn_rnn_hidden * 2 27 | self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout) # use on last layer output 28 | 29 | self.in_drop = nn.Dropout(self.config.gcn_input_dropout) 30 | self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout) 31 | 32 | # gcn layer 33 | self.W = nn.ModuleList() 34 | self.layer_normalization = nn.ModuleList() 35 | 36 | for layer in range(self.layers): 37 | # input_dim = self.in_dim if layer == 0 else self.mem_dim 38 | self.W.append(nn.Linear(self.in_dim, self.in_dim)) 39 | self.layer_normalization.append(LayerNormalization(self.in_dim)) 40 | 41 | def conv_l2(self): 42 | conv_weights = [] 43 | for w in self.W: 44 | conv_weights += [w.weight, w.bias] 45 | return sum([x.pow(2).sum() for x in conv_weights]) 46 | 47 | def encode_with_rnn(self, rnn_inputs, masks, batch_size): 48 | seq_lens = masks.data.eq(1).long().sum(1).squeeze() 49 | h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers) 50 | 51 | # SORT YOUR TENSORS BY LENGTH! 52 | seq_lens, perm_idx = seq_lens.sort(0, descending=True) 53 | 54 | rnn_inputs = rnn_inputs[perm_idx] 55 | rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True) 56 | rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0)) 57 | rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True) 58 | 59 | _, unperm_idx = perm_idx.sort(0) 60 | rnn_outputs = rnn_outputs[unperm_idx] 61 | return rnn_outputs 62 | 63 | def forward(self, adj, embs, masks): 64 | batch_size = masks.size()[0] 65 | embs = self.in_drop(embs) 66 | # rnn layer 67 | if self.config.gcn_rnn is True: 68 | gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size)) 69 | else: 70 | gcn_inputs = embs 71 | 72 | # gcn layer 73 | denom = adj.sum(2).unsqueeze(2) + 1 74 | mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2) 75 | # # zero out adj for ablation 76 | # if self.opt.get('no_adj', False): 77 | # adj = torch.zeros_like(adj) 78 | 79 | for l in range(self.layers): 80 | # print(gcn_inputs.size(), adj.size()) 81 | x = gcn_inputs 82 | Ax = adj.bmm(gcn_inputs) 83 | AxW = self.W[l](Ax) 84 | AxW = AxW + self.W[l](gcn_inputs) # self loop 85 | AxW = AxW / denom 86 | 87 | gAxW = F.relu(AxW) 88 | gcn_inputs = self.gcn_drop(gAxW) 89 | self.layer_normalization[l].forward(gcn_inputs + x) 90 | 91 | return gcn_inputs, mask 92 | 93 | 94 | def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True): 95 | total_layers = num_layers * 2 if bidirectional else num_layers 96 | state_shape = (total_layers, batch_size, hidden_dim) 97 | h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False) 98 | if use_cuda: 99 | return h0.cuda(), c0.cuda() 100 | else: 101 | return h0, c0 102 | 103 | 104 | class LayerNormalization(nn.Module): 105 | ''' Layer normalization module ''' 106 | 107 | def __init__(self, d_hid, eps=1e-3): # 108 | super(LayerNormalization, self).__init__() 109 | self.eps = eps 110 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 111 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 112 | 113 | def forward(self, z): 114 | if z.size(1) == 1: 115 | return z 116 | mu = torch.mean(z, keepdim=True, dim=-1) 117 | sigma = torch.std(z, keepdim=True, dim=-1) 118 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) # 1e-3 is ok, because variance and std. 119 | ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) 120 | return ln_out 121 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic operations on trees. 3 | """ 4 | 5 | import numpy as np 6 | from collections import defaultdict 7 | 8 | 9 | class Tree(object): 10 | """ 11 | Reused tree object from stanfordnlp/treelstm. 12 | """ 13 | 14 | def __init__(self): 15 | self.parent = None 16 | # head probability 17 | self.phead = -1 18 | self.num_children = 0 19 | self.children = list() 20 | 21 | def add_child(self, child): 22 | child.parent = self 23 | self.num_children += 1 24 | self.children.append(child) 25 | 26 | def size(self): 27 | if getattr(self, '_size'): 28 | return self._size 29 | count = 1 30 | for i in xrange(self.num_children): 31 | count += self.children[i].size() 32 | self._size = count 33 | return self._size 34 | 35 | def depth(self): 36 | if getattr(self, '_depth'): 37 | return self._depth 38 | count = 0 39 | if self.num_children > 0: 40 | for i in xrange(self.num_children): 41 | child_depth = self.children[i].depth() 42 | if child_depth > count: 43 | count = child_depth 44 | count += 1 45 | self._depth = count 46 | return self._depth 47 | 48 | def __iter__(self): 49 | yield self 50 | for c in self.children: 51 | for x in c: 52 | yield x 53 | 54 | 55 | def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos): 56 | """ 57 | Convert a sequence of head indexes into a tree object. 58 | """ 59 | tokens = tokens[:len_].tolist() 60 | head = head[:len_].tolist() 61 | root = None 62 | 63 | if prune < 0: 64 | nodes = [Tree() for _ in head] 65 | 66 | for i in range(len(nodes)): 67 | h = head[i] 68 | nodes[i].idx = i 69 | nodes[i].dist = -1 # just a filler 70 | if h == 0: 71 | root = nodes[i] 72 | else: 73 | nodes[h - 1].add_child(nodes[i]) 74 | else: 75 | # find dependency path 76 | subj_pos = [i for i in range(len_) if subj_pos[i] == 0] 77 | obj_pos = [i for i in range(len_) if obj_pos[i] == 0] 78 | 79 | cas = None 80 | 81 | subj_ancestors = set(subj_pos) 82 | for s in subj_pos: 83 | h = head[s] 84 | tmp = [s] 85 | while h > 0: 86 | tmp += [h - 1] 87 | subj_ancestors.add(h - 1) 88 | h = head[h - 1] 89 | 90 | if cas is None: 91 | cas = set(tmp) 92 | else: 93 | cas.intersection_update(tmp) 94 | 95 | obj_ancestors = set(obj_pos) 96 | for o in obj_pos: 97 | h = head[o] 98 | tmp = [o] 99 | while h > 0: 100 | tmp += [h - 1] 101 | obj_ancestors.add(h - 1) 102 | h = head[h - 1] 103 | cas.intersection_update(tmp) 104 | 105 | # find lowest common ancestor 106 | if len(cas) == 1: 107 | lca = list(cas)[0] 108 | else: 109 | child_count = {k: 0 for k in cas} 110 | for ca in cas: 111 | if head[ca] > 0 and head[ca] - 1 in cas: 112 | child_count[head[ca] - 1] += 1 113 | 114 | # the LCA has no child in the CA set 115 | for ca in cas: 116 | if child_count[ca] == 0: 117 | lca = ca 118 | break 119 | 120 | path_nodes = subj_ancestors.union(obj_ancestors).difference(cas) 121 | path_nodes.add(lca) 122 | 123 | # compute distance to path_nodes 124 | dist = [-1 if i not in path_nodes else 0 for i in range(len_)] 125 | 126 | for i in range(len_): 127 | if dist[i] < 0: 128 | stack = [i] 129 | while stack[-1] >= 0 and stack[-1] not in path_nodes: 130 | stack.append(head[stack[-1]] - 1) 131 | 132 | if stack[-1] in path_nodes: 133 | for d, j in enumerate(reversed(stack)): 134 | dist[j] = d 135 | else: 136 | for j in stack: 137 | if j >= 0 and dist[j] < 0: 138 | dist[j] = int(1e4) # aka infinity 139 | 140 | highest_node = lca 141 | nodes = [Tree() if dist[i] <= prune else None for i in range(len_)] 142 | 143 | for i in range(len(nodes)): 144 | if nodes[i] is None: 145 | continue 146 | h = head[i] 147 | nodes[i].idx = i 148 | nodes[i].dist = dist[i] 149 | if h > 0 and i != highest_node: 150 | assert nodes[h - 1] is not None 151 | nodes[h - 1].add_child(nodes[i]) 152 | 153 | root = nodes[highest_node] 154 | 155 | assert root is not None 156 | return root 157 | 158 | 159 | def tree_to_adj(sent_len, tree, directed=True, self_loop=False): 160 | """ 161 | Convert a tree object to an (numpy) adjacency matrix. 162 | """ 163 | ret = np.zeros((sent_len, sent_len), dtype=np.float32) 164 | 165 | queue = [tree] 166 | idx = [] 167 | while len(queue) > 0: 168 | t, queue = queue[0], queue[1:] 169 | 170 | idx += [t.idx] 171 | 172 | for c in t.children: 173 | ret[t.idx, c.idx] = 1 174 | queue += t.children 175 | 176 | if not directed: 177 | ret = ret + ret.T 178 | 179 | if self_loop: 180 | for i in idx: 181 | ret[i, i] = 1 182 | 183 | return ret 184 | 185 | 186 | def tree_to_dist(sent_len, tree): 187 | ret = -1 * np.ones(sent_len, dtype=np.int64) 188 | 189 | for node in tree: 190 | ret[node.idx] = node.dist 191 | 192 | return ret 193 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/implicit_syntactic_representations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.nn.utils.rnn import pad_sequence 6 | 7 | 8 | from .model import drop_sequence_sharedmask, _model_var 9 | from .HighWayLSTM import Highway_Concat_BiLSTM 10 | from .layer import NonLinear, Biaffine 11 | 12 | 13 | class ImplicitDependencyRepresentations(nn.Module): 14 | def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size): 15 | super(ImplicitDependencyRepresentations, self).__init__() 16 | self.config = config 17 | self.lstm_input_size = lstm_input_size 18 | self.lstm_hidden_size = lstm_hidden_size 19 | self.dep_label_space_size = dep_label_space_size 20 | # softmax weights 21 | self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0])) 22 | self.softmax_dep_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0])) 23 | for _ in range(self.config.dep_num_lstm_layers)]) 24 | self.cuda = True 25 | 26 | self.dep_bilstm = Highway_Concat_BiLSTM( 27 | input_size=self.lstm_input_size, 28 | hidden_size=self.lstm_hidden_size, # // 2 for MyLSTM 29 | num_layers=self.config.dep_num_lstm_layers, 30 | batch_first=True, 31 | bidirectional=True, 32 | dropout_in=config.input_dropout_prob, 33 | dropout_out=config.recurrent_dropout_prob 34 | ) 35 | 36 | # dependency parsing module 37 | self.mlp_arc_dep = NonLinear( 38 | input_size=2 * config.lstm_hidden_size, 39 | hidden_size=config.mlp_arc_size + config.mlp_rel_size, 40 | activation=nn.LeakyReLU(0.1)) 41 | self.mlp_arc_head = NonLinear( 42 | input_size=2 * config.lstm_hidden_size, 43 | hidden_size=config.mlp_arc_size + config.mlp_rel_size, 44 | activation=nn.LeakyReLU(0.1)) 45 | 46 | self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100) 47 | self.arc_num = int(config.mlp_arc_size / 100) 48 | self.rel_num = int(config.mlp_rel_size / 100) 49 | 50 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False)) 51 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size, 52 | bias=(True, True)) 53 | 54 | def init_masks(self, batch_size, lengths): 55 | max_sent_length = max(lengths) 56 | num_sentences = batch_size 57 | indices = torch.arange(0, max_sent_length).unsqueeze(0).expand(num_sentences, -1) 58 | masks = indices < lengths.unsqueeze(1) 59 | masks = masks.type(torch.FloatTensor) 60 | if self.cuda: 61 | masks = masks.cuda() 62 | return masks 63 | 64 | def forward(self, num_sentences, context_embeddings, sent_lengths, dep): 65 | masks = self.init_masks(num_sentences, torch.LongTensor(sent_lengths)) 66 | lstm_out, _ = self.dep_bilstm(context_embeddings, masks) 67 | 68 | if self.training: 69 | lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp) 70 | 71 | x_all_dep = self.mlp_arc_dep(lstm_out) 72 | x_all_head = self.mlp_arc_head(lstm_out) 73 | 74 | if self.training: 75 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 76 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 77 | 78 | x_all_dep_splits = torch.split(x_all_dep, 100, dim=2) 79 | x_all_head_splits = torch.split(x_all_head, 100, dim=2) 80 | 81 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 82 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 83 | 84 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 85 | arc_logit = torch.squeeze(arc_logit, dim=3) 86 | 87 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 88 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 89 | 90 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 91 | 92 | self.arc_logits, self.rel_logits = arc_logit, rel_logit_cond 93 | 94 | heads, rels = dep[0], dep[1] 95 | loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist()) # compute the dep loss 96 | return loss, self.arc_logits 97 | 98 | def compute_dep_loss(self, true_arcs, true_rels, lengths): 99 | b, l1, l2 = self.arc_logits.size() 100 | index_true_arcs = _model_var( 101 | self.parameters(), 102 | pad_sequence(true_arcs, padding_value=0, batch_first=True) 103 | ) 104 | true_arcs = _model_var( 105 | self.parameters(), 106 | pad_sequence(true_arcs, padding_value=-1, batch_first=True) 107 | ) 108 | 109 | masks = [] 110 | for length in lengths: 111 | mask = torch.FloatTensor([0] * length + [-1000] * (l2 - length)) 112 | mask = _model_var(self.parameters(), mask) 113 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 114 | masks.append(mask.transpose(0, 1)) 115 | length_mask = torch.stack(masks, 0) 116 | arc_logits = self.arc_logits + length_mask 117 | 118 | arc_loss = F.cross_entropy( 119 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 120 | ignore_index=-1, reduction="sum") 121 | 122 | size = self.rel_logits.size() 123 | output_logits = _model_var(self.parameters(), torch.zeros(size[0], size[1], size[3])) 124 | for batch_index, (logits, arcs) in enumerate(list(zip(self.rel_logits, index_true_arcs))): 125 | rel_probs = [] 126 | for i in range(l1): 127 | rel_probs.append(logits[i][int(arcs[i])]) 128 | rel_probs = torch.stack(rel_probs, dim=0) 129 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 130 | 131 | b, l1, d = output_logits.size() 132 | true_rels = _model_var(self.parameters(), pad_sequence(true_rels, padding_value=-1, batch_first=True)) 133 | 134 | rel_loss = F.cross_entropy( 135 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1, reduction="sum") 136 | 137 | loss = arc_loss + rel_loss 138 | return loss 139 | 140 | def get_reps(self, context_embeddings, masks): 141 | dep_lstm_out, dep_lstm_outputs = self.dep_bilstm.forward(context_embeddings, masks) 142 | normed_weights = F.softmax(torch.cat([param for param in self.softmax_dep_weights]), dim=0) 143 | normed_weights = torch.split(normed_weights, 1) # split_size_or_sections=1, split_size=1) # 0.3.0 144 | dep_representations = self.dep_gamma * \ 145 | sum([normed_weights[i] * dep_lstm_outputs[i] for i in 146 | range(self.config.dep_num_lstm_layers)]) 147 | if self.training: 148 | lstm_out = drop_sequence_sharedmask(dep_lstm_out, self.config.dropout_mlp) 149 | 150 | x_all_dep = self.mlp_arc_dep(dep_lstm_out) 151 | x_all_head = self.mlp_arc_head(dep_lstm_out) 152 | 153 | if self.training: 154 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 155 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 156 | 157 | x_all_dep_splits = torch.split(x_all_dep, 100, dim=2) 158 | x_all_head_splits = torch.split(x_all_head, 100, dim=2) 159 | 160 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 161 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 162 | 163 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 164 | arc_logit = torch.squeeze(arc_logit, dim=3) 165 | return dep_representations, arc_logit 166 | 167 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from .layer import MyLSTM, NonLinear, Biaffine 5 | 6 | 7 | def _model_var(parameters, x): 8 | p = next(iter(filter(lambda p: p.requires_grad, parameters))) 9 | if p.is_cuda: 10 | x = x.cuda(p.get_device()) 11 | return torch.autograd.Variable(x) 12 | 13 | 14 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb): 15 | batch_size, seq_length, _ = word_embeddings.size() 16 | # tensor.new: build a tensor with the same data type 17 | word_masks = word_embeddings.data.new(batch_size, 18 | seq_length).fill_(1 - dropout_emb) 19 | word_masks = torch.Tensor(torch.bernoulli(word_masks)) 20 | word_masks.requires_grad = False 21 | tag_masks = tag_embeddings.data.new(batch_size, 22 | seq_length).fill_(1 - dropout_emb) 23 | tag_masks = torch.Tensor(torch.bernoulli(tag_masks)) 24 | tag_masks.requires_grad = False 25 | scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12) 26 | word_masks *= scale 27 | tag_masks *= scale 28 | # unsqueeze: Returns a new tensor with a dimension of size one inserted at the specified position. 29 | word_masks = word_masks.unsqueeze(dim=2) # ? 30 | tag_masks = tag_masks.unsqueeze(dim=2) 31 | word_embeddings = word_embeddings * word_masks 32 | tag_embeddings = tag_embeddings * tag_masks 33 | 34 | return word_embeddings, tag_embeddings 35 | 36 | 37 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True): 38 | if batch_first: 39 | inputs = inputs.transpose(0, 1) 40 | seq_length, batch_size, hidden_size = inputs.size() 41 | drop_masks = torch.Tensor(batch_size, hidden_size).fill_(1 - dropout) 42 | drop_masks = torch.Tensor(torch.bernoulli(drop_masks)).type(inputs.type()) 43 | drop_masks.requires_grad = False 44 | drop_masks = drop_masks / (1 - dropout) 45 | drop_masks = torch.unsqueeze(drop_masks, 46 | dim=2).expand(-1, -1, 47 | seq_length).permute(2, 0, 1) 48 | inputs = inputs * drop_masks 49 | 50 | return inputs.transpose(1, 0) 51 | 52 | 53 | class ParserModel(nn.Module): # build a biaffine parser model 54 | def __init__(self, vocab, config, pretrained_embedding): 55 | super(ParserModel, self).__init__() 56 | self.config = config 57 | self.word_embed = nn.Embedding(vocab.vocab_size, 58 | config.word_dims, 59 | padding_idx=0) 60 | self.extword_embed = nn.Embedding(vocab.extvocab_size, 61 | config.word_dims, 62 | padding_idx=0) 63 | self.tag_embed = nn.Embedding(vocab.tag_size, 64 | config.tag_dims, 65 | padding_idx=0) 66 | 67 | word_init = np.zeros((vocab.vocab_size, config.word_dims), 68 | dtype=np.float32) 69 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init)) 70 | 71 | tag_init = np.random.randn(vocab.tag_size, 72 | config.tag_dims).astype(np.float32) 73 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init)) 74 | 75 | self.extword_embed.weight.data.copy_( 76 | torch.from_numpy(pretrained_embedding)) 77 | self.extword_embed.weight.requires_grad = False 78 | 79 | self.lstm = MyLSTM( 80 | input_size=config.word_dims + config.tag_dims, 81 | hidden_size=config.lstm_hiddens, 82 | num_layers=config.lstm_layers, 83 | batch_first=True, 84 | bidirectional=True, 85 | dropout_in=config.dropout_lstm_input, 86 | dropout_out=config.dropout_lstm_hidden, 87 | ) 88 | 89 | self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens, 90 | hidden_size=config.mlp_arc_size + 91 | config.mlp_rel_size, 92 | activation=nn.LeakyReLU(0.1)) 93 | self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens, 94 | hidden_size=config.mlp_arc_size + 95 | config.mlp_rel_size, 96 | activation=nn.LeakyReLU(0.1)) 97 | 98 | self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100) 99 | self.arc_num = int(config.mlp_arc_size / 100) # config: 500 100 | self.rel_num = int(config.mlp_rel_size / 100) # config: 100 101 | 102 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 103 | 1, bias=(True, False)) 104 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, 105 | vocab.rel_size, bias=(True, True)) 106 | 107 | def forward( 108 | self, words, extwords, tags, 109 | masks): # words [batch, max_sentence_length], padding with zeros 110 | # x = (batch size, sequence length, dimension of embedding) 111 | x_word_embed = self.word_embed(words) 112 | x_extword_embed = self.extword_embed(extwords) 113 | x_embed = x_word_embed + x_extword_embed 114 | x_tag_embed = self.tag_embed(tags) 115 | 116 | if self.training: 117 | x_embed, x_tag_embed = drop_input_independent( 118 | x_embed, x_tag_embed, self.config.dropout_emb) 119 | 120 | x_lexical = torch.cat((x_embed, x_tag_embed), dim=2) 121 | 122 | outputs, _ = self.lstm(x_lexical, masks, None) 123 | outputs = outputs.transpose(1, 0) 124 | 125 | if self.training: 126 | outputs = drop_sequence_sharedmask(outputs, 127 | self.config.dropout_mlp) 128 | 129 | x_all_dep = self.mlp_arc_dep(outputs) 130 | x_all_head = self.mlp_arc_head(outputs) 131 | 132 | if self.training: 133 | x_all_dep = drop_sequence_sharedmask(x_all_dep, 134 | self.config.dropout_mlp) 135 | x_all_head = drop_sequence_sharedmask(x_all_head, 136 | self.config.dropout_mlp) 137 | 138 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2) 139 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2) 140 | 141 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 142 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 143 | 144 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 145 | arc_logit = torch.squeeze(arc_logit, dim=3) 146 | 147 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 148 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 149 | 150 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 151 | return arc_logit, rel_logit_cond 152 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | 5 | 6 | def block_orth_normal_initializer(input_size, output_size): 7 | weight = [] 8 | for o in output_size: 9 | for i in input_size: 10 | param = torch.FloatTensor(o, i) 11 | torch.nn.init.orthogonal_(param) 12 | weight.append(param) 13 | return torch.cat(weight) 14 | 15 | 16 | def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights): 17 | batch_size = len(batch_x) # batch size 18 | length = max(batch_lengths) 19 | 20 | words = Variable(torch.LongTensor(batch_size, length).zero_(), 21 | requires_grad=False) # padding with 0 22 | predicates = Variable(torch.LongTensor(batch_size, length).zero_(), 23 | requires_grad=False) 24 | masks = Variable(torch.Tensor(batch_size, length).zero_(), 25 | requires_grad=False) 26 | padding_answers = Variable(torch.LongTensor(batch_size, length).zero_(), 27 | requires_grad=False) 28 | labels, lengths = [], [] 29 | 30 | b = 0 31 | for s_words, s_answer, s_length, s_weights in zip(batch_x, batch_y, 32 | batch_lengths, 33 | batch_weights): 34 | lengths.append(s_length) 35 | rel = np.zeros((s_length), dtype=np.int32) 36 | for i in range(s_length): 37 | words[b, i] = s_words[1][i] # word 38 | predicates[b, i] = s_words[2][i] # predicate 39 | rel[i] = s_answer[0][i] 40 | padding_answers[b, i] = s_answer[0][i] 41 | masks[b, i] = 1 42 | 43 | # sentence_id = s_words[0][0] # get the dep_labels_ids of each sentence 44 | b += 1 45 | labels.append(rel) 46 | 47 | return words, predicates, labels, torch.LongTensor( 48 | lengths), masks, padding_answers 49 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/configuration.py: -------------------------------------------------------------------------------- 1 | ''' Configuration for experiments. 2 | ''' 3 | import json 4 | from argparse import Namespace 5 | 6 | 7 | def get_config(config_filepath): 8 | with open(config_filepath, 'r') as config_file: 9 | conf = json.load(config_file, object_hook=lambda d: Namespace(**d)) 10 | return conf 11 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/conll_utils.py: -------------------------------------------------------------------------------- 1 | def bio_to_se(labels): 2 | slen = len(labels) 3 | new_labels = [] 4 | has_opening = False 5 | for i in range(slen): 6 | label = labels[i] 7 | if label == 'O': 8 | new_labels.append('*') 9 | continue 10 | new_label = '*' 11 | if label[0] == 'B' or i == 0 or label[1:] != labels[i - 1][1:]: 12 | new_label = '(' + label[2:] + new_label 13 | has_opening = True 14 | if i == slen - 1 or labels[i + 1][0] == 'B' or label[1:] != labels[i + 1][1:]: 15 | new_label = new_label + ')' 16 | has_opening = False 17 | new_labels.append(new_label) 18 | 19 | if has_opening: 20 | ''' logging ''' 21 | print("Has unclosed opening: {}".format(labels)) 22 | return new_labels 23 | 24 | 25 | def print_sentence_to_conll(fout, tokens, labels): 26 | for label_column in labels: 27 | assert len(label_column) == len(tokens) 28 | for i in range(len(tokens)): 29 | fout.write(tokens[i].ljust(15)) 30 | for label_column in labels: 31 | fout.write(label_column[i].rjust(15)) 32 | fout.write("\n") 33 | fout.write("\n") 34 | 35 | 36 | def print_to_conll(pred_labels, gold_props_file, output_filename): 37 | """ 38 | """ 39 | fout = open(output_filename, 'w') 40 | seq_ptr = 0 41 | num_props_for_sentence = 0 42 | tokens_buf = [] 43 | 44 | for line in open(gold_props_file, 'r'): 45 | line = line.strip() 46 | if line == "" and len(tokens_buf) > 0: 47 | print_sentence_to_conll(fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence]) 48 | seq_ptr += num_props_for_sentence 49 | tokens_buf = [] 50 | num_props_for_sentence = 0 51 | else: 52 | info = line.split() 53 | num_props_for_sentence = len(info) - 1 54 | tokens_buf.append(info[0]) 55 | 56 | # Output last sentence. 57 | if len(tokens_buf) > 0: 58 | print_sentence_to_conll(fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence]) 59 | 60 | fout.close() 61 | 62 | 63 | def print_gold_to_conll(data, word_dict, label_dict, output_filename): 64 | fout = open(output_filename, 'w') 65 | props_buf = [] 66 | labels_buf = [] 67 | tokens_buf = [] 68 | prev_words = '' 69 | 70 | x, y, num_tokens, _ = data 71 | for (sent, gold, slen) in zip(x, y, num_tokens): 72 | words = [word_dict.idx2str[w[0]] for w in sent[:slen]] 73 | labels = [label_dict.idx2str[l] for l in gold[:slen]] 74 | 75 | concat_words = ' '.join(words) 76 | if concat_words != prev_words and len(props_buf) > 0: 77 | tokens = [w if i in props_buf else '-' for i, w in enumerate(tokens_buf)] 78 | 79 | print_sentence_to_conll(fout, tokens, labels_buf) 80 | props_buf = [] 81 | tokens_buf = [] 82 | labels_buf = [] 83 | prev_words = '' 84 | 85 | if prev_words == '': 86 | prev_words = concat_words 87 | tokens_buf = [w for w in words] 88 | if 'B-V' in labels: 89 | prop_id = labels.index('B-V') 90 | props_buf.append(prop_id) 91 | labels_buf.append(bio_to_se(labels)) 92 | 93 | if len(props_buf) > 0: 94 | tokens = [w if i in props_buf else '-' for i, w in enumerate(tokens_buf)] 95 | print_sentence_to_conll(fout, tokens, labels_buf) 96 | 97 | fout.close() 98 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constants.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | import os 3 | import random 4 | 5 | ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../') 6 | 7 | RANDOM_SEED = 12345 8 | random.seed(RANDOM_SEED) 9 | 10 | SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh') 11 | 12 | START_MARKER = '' 13 | END_MARKER = '' 14 | PADDING_TOKEN = '*PAD*' 15 | UNKNOWN_TOKEN = '*UNKNOWN*' 16 | NULL_LABEL = 'O' 17 | 18 | TEMP_DIR = join(ROOT_DIR, '../temp') 19 | 20 | # assert os.path.exists(SRL_CONLL_EVAL_SCRIPT) 21 | if not os.path.exists(TEMP_DIR): 22 | os.makedirs(TEMP_DIR) 23 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_extraction.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import sys 3 | import numpy as np 4 | import random 5 | 6 | from .dictionary import Dictionary 7 | from collections import OrderedDict 8 | from nltk.tree import Tree 9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN 10 | # from .reader import list_of_words_to_ids 11 | 12 | 13 | PREFIX = "--PTB-CONS-LABEL--" 14 | 15 | 16 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 17 | ids = [] 18 | for s in list_of_words: 19 | # s = s.encode('utf-8') # unicode -> utf-8 20 | if s is None: 21 | ids.append(-1) 22 | continue 23 | if lowercase: 24 | s = s.lower() 25 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 26 | s = UNKNOWN_TOKEN 27 | ids.append(dictionary.add(s)) 28 | return ids 29 | 30 | 31 | class constituent_tree(): 32 | def __init__(self, sentence, words, tree): 33 | self.sentence = sentence 34 | self.words = words 35 | self.tree = tree 36 | 37 | self.heads = [] 38 | self.non_terminal_nodes = [] # cons labels, e.g., NP, VP 39 | self.terminal_nodes = [] # words 40 | self.indicator = [] # 1 no terminal, 2 terminal 41 | 42 | self.non_terminal_nodes_idx = [] 43 | self.non_terminal_nodes_char_idx = [] 44 | self.terminal_node_idx = [] 45 | self.terminal_node_char_idx = [] 46 | 47 | self.sentence_length = len(words) 48 | self.input_length = -1 49 | self.sentence_index = -1 50 | 51 | def pos(self): 52 | """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]""" 53 | return self.tree.pos() 54 | 55 | def traverse_tree(self, tree, 56 | non_terminal_nodes, terminal_nodes, 57 | non_terminal_nodes_idx, terminal_nodes_idx, 58 | indicator, 59 | heads, 60 | parent, 61 | non_terminal_dict, word_dict, pos, 62 | word_embeddings): 63 | # print(tree) 64 | # print("subtree", subtree) 65 | if tree.height() > 2: 66 | non_terminal = tree.label() 67 | 68 | non_terminal_nodes.append(non_terminal) 69 | non_terminal_nodes_idx.append(non_terminal_dict.add(non_terminal)) 70 | indicator.append(1) 71 | heads.append(parent - 1) 72 | else: 73 | # print("YY", subtree) 74 | terminal = tree[0] # word 75 | terminal_nodes.append(terminal) 76 | terminal_nodes_idx.append( 77 | constituent_tree.add_word(terminal, word_dict, word_embeddings) 78 | ) 79 | indicator.append(2) 80 | 81 | pos.add(tree.label()) 82 | heads.append(parent - 1) 83 | if tree.height() <= 2: # 2 == ["V", Tree("Chased")] 84 | return 85 | parent = len(non_terminal_nodes) + len(terminal_nodes) 86 | for i, subtree in enumerate(tree): 87 | self.traverse_tree(subtree, 88 | non_terminal_nodes, terminal_nodes, 89 | non_terminal_nodes_idx, terminal_nodes_idx, 90 | indicator, 91 | heads, parent, 92 | non_terminal_dict, word_dict, pos, 93 | word_embeddings) 94 | 95 | @staticmethod 96 | def add_unknown_labels(label, word_embeddings): 97 | if label not in word_embeddings: 98 | embedding_size = len(word_embeddings[PADDING_TOKEN]) 99 | word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)]) 100 | 101 | @staticmethod 102 | def add_word(word, word_dict, word_embeddings): 103 | if word not in word_embeddings: 104 | word = UNKNOWN_TOKEN 105 | idx = word_dict.add(word) 106 | return idx 107 | 108 | @staticmethod 109 | def get_node_char_idx(words, char_dict, lowercase=False): 110 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 111 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int) 112 | for i, word in enumerate(words): 113 | single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase) 114 | return single_sample_char_tokens 115 | 116 | def generate_adjacent(self, non_terminal_dict, word_dict, char_dict, pos, word_embeddings): 117 | root_label = self.tree.label() 118 | self.traverse_tree(self.tree, 119 | self.non_terminal_nodes, self.terminal_nodes, 120 | self.non_terminal_nodes_idx, self.terminal_node_idx, 121 | self.indicator, 122 | self.heads, len(self.heads), 123 | non_terminal_dict, word_dict, pos, 124 | word_embeddings 125 | ) 126 | self.input_length = len(self.non_terminal_nodes) + len(self.terminal_nodes) 127 | self.sentence_index = self.input_length - self.sentence_length - 1 128 | 129 | self.non_terminal_nodes_char_idx = constituent_tree.get_node_char_idx( 130 | self.non_terminal_nodes, char_dict 131 | ) 132 | self.terminal_node_char_idx = constituent_tree.get_node_char_idx( 133 | self.terminal_nodes, char_dict 134 | ) 135 | 136 | 137 | def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings): 138 | data = [] 139 | with open(file_path, 'r') as input_file: 140 | sentence = "" 141 | for line in input_file.readlines(): 142 | if line.strip() == "": 143 | data.append(sentence) 144 | sentence = "" 145 | continue 146 | line = line.strip() 147 | if ' ' not in line: # avoid the split of leave node of it's PoS 148 | line = ' ' + line 149 | sentence += line 150 | print("Read {} sentence from {}".format(len(data), file_path)) 151 | 152 | def reset_sentence(sentence): 153 | for i in range(len(sentence)): 154 | if sentence[i] in ["[", "]", "(", ")", "{", "}", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"]: 155 | sentence[i] = '-' 156 | 157 | cons_trees = OrderedDict() 158 | for sentence in data: 159 | tree = Tree.fromstring(sentence) 160 | words = tree.leaves() 161 | reset_sentence(words) 162 | sentence = ' '.join(words) 163 | cons_trees[sentence] = constituent_tree(sentence, words, tree) 164 | 165 | pos_dict = Dictionary(padding_token=PADDING_TOKEN) 166 | non_terminal_dict = Dictionary(padding_token=PADDING_TOKEN) 167 | for sen in cons_trees: 168 | tree = cons_trees[sen] 169 | tree.generate_adjacent(non_terminal_dict, word_dict, char_dict, pos_dict, word_embeddings) 170 | 171 | return cons_trees, non_terminal_dict, pos_dict, 172 | 173 | 174 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | import numpy as np 4 | 5 | 6 | from sortedcontainers import SortedSet 7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL 8 | from .dictionary import Dictionary 9 | 10 | 11 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 12 | ids = [] 13 | for s in list_of_words: 14 | # s = s.encode('utf-8') # unicode -> utf-8 15 | if s is None: 16 | ids.append(-1) 17 | continue 18 | if lowercase: 19 | s = s.lower() 20 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 21 | s = UNKNOWN_TOKEN 22 | ids.append(dictionary.add(s)) 23 | return ids 24 | 25 | 26 | class constituent_sentence(): 27 | def __init__(self, obj): 28 | self.sentence = obj["sentence"] 29 | self.constituent_spans = obj["constituents"] 30 | self.max_span_width = 30 31 | self.reset_sentence() 32 | 33 | def reset_sentence(self): 34 | for i in range(len(self.sentence)): 35 | if self.sentence[i] in ["[", "]", "(", ")", "{", "}", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"]: 36 | self.sentence[i] = '-' 37 | self.sentence[i] = self.sentence[i].replace("\\/", "/") 38 | 39 | def tokenize_cons_spans(self, dictionary, max_cons_width=60): 40 | cons_span = [] 41 | set_cons_span = set() 42 | for cons_s in self.constituent_spans: # remove self-loop V-V 43 | cons_start, cons_end, cons_label = cons_s 44 | if cons_label in ["TOP", "S"]: # todo: add some constrains here 45 | continue 46 | if cons_end - cons_start + 1 >= max_cons_width: 47 | continue 48 | if (cons_start, cons_end) not in set_cons_span: 49 | set_cons_span.add((cons_start, cons_end)) 50 | cons_span.append([int(cons_start), int(cons_end), int(dictionary.add(cons_label))]) 51 | else: 52 | # print("duplicate span of", (cons_start, cons_end, cons_label), '\n', self.sentence) 53 | pass 54 | if len(cons_span) == 0: # if the sentence has no arguments. 55 | return [[], [], []] 56 | tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels = \ 57 | zip(*cons_span) 58 | return tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels 59 | 60 | 61 | def read_constituent_file(file_path): 62 | sentences = [] 63 | with codecs.open(file_path, encoding="utf8") as f: 64 | for line in f.readlines(): 65 | sen = json.loads(line) 66 | cons_sen = constituent_sentence(sen) 67 | sentences.append(cons_sen) 68 | print("{} total constituent sentences number {}".format(file_path, len(sentences))) 69 | return sentences 70 | 71 | 72 | def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False): 73 | sample_word_tokens = [list_of_words_to_ids( 74 | sent.sentence, word_dict, lowercase, pretrained_word_embedding) for sent in samples] 75 | # for the character 76 | sample_char_tokens = [] 77 | for sent in samples: 78 | words = sent.sentence 79 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 80 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int) 81 | for i, word in enumerate(words): 82 | single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase) 83 | # Add the sample char tokens into the sample_char_tokens 84 | sample_char_tokens.append(single_sample_char_tokens) 85 | sample_texts = [sent.sentence for sent in samples] 86 | sample_lengths = [len(sent.sentence) for sent in samples] 87 | sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples] 88 | return list(zip(sample_lengths, sample_texts, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens)) 89 | 90 | 91 | def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None): 92 | raw_cons_sentences = read_constituent_file(file_path) 93 | cons_label_dict = Dictionary() 94 | cons_label_dict.set_unknown_token(NULL_LABEL) 95 | 96 | # tokenized the data 97 | if word_dict.accept_new is False: 98 | word_dict.accept_new = True 99 | if char_dict.accept_new is False: 100 | char_dict.accept_new = True 101 | cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict, 102 | False, word_embeddings) 103 | # word_dict.accept_new = False 104 | # char_dict.accept_new = False 105 | # cons_label_dict.accept_new = False 106 | 107 | print("="*10, "Constituent Info", "="*10) 108 | print("Extract {} tags".format(cons_label_dict.size())) 109 | # print("Extract {} words and {} tags".format(word_dict.size(), cons_label_dict.size())) 110 | print("Max sentence length: {}".format(max([s[0] for s in cons_samples]))) 111 | return cons_samples, word_dict, char_dict, cons_label_dict 112 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/dictionary.py: -------------------------------------------------------------------------------- 1 | ''' Bidirectional dictionary that maps between words and ids. 2 | ''' 3 | 4 | 5 | class Dictionary(object): 6 | def __init__(self, padding_token=None, unknown_token=None): 7 | self.str2idx = {} 8 | self.idx2str = [] 9 | 10 | self.accept_new = True 11 | self.padding_token = None 12 | self.padding_id = None 13 | self.unknown_token = None 14 | self.unknown_id = None 15 | if padding_token is not None: # add the padding info into the dictionary 16 | self.set_padding_token(padding_token) 17 | if unknown_token is not None: 18 | self.set_unknown_token(unknown_token) 19 | 20 | def set_padding_token(self, padding_token): 21 | self.padding_token = padding_token 22 | self.padding_id = self.add(self.padding_token) 23 | 24 | def set_unknown_token(self, unknown_token): 25 | self.unknown_token = unknown_token 26 | self.unknown_id = self.add(self.unknown_token) 27 | 28 | def add(self, new_str): 29 | if new_str not in self.str2idx: 30 | if self.accept_new: 31 | self.str2idx[new_str] = len(self.idx2str) 32 | self.idx2str.append(new_str) 33 | else: 34 | if new_str == "C-ADV": 35 | return self.str2idx["O"] 36 | if self.unknown_id is None: 37 | raise LookupError( 38 | 'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str) 39 | return self.unknown_id 40 | 41 | return self.str2idx[new_str] 42 | 43 | def add_all(self, str_list): 44 | return [self.add(s) for s in str_list] 45 | 46 | def get_index(self, input_str): 47 | if input_str in self.str2idx: 48 | return self.str2idx[input_str] 49 | return None 50 | 51 | def size(self): 52 | return len(self.idx2str) 53 | 54 | def save(self, filename): 55 | with open(filename, 'w') as f: 56 | for s in self.idx2str: 57 | f.write(s + '\n') 58 | f.close() 59 | 60 | def load(self, filename): 61 | with open(filename, 'r') as f: 62 | for line in f: 63 | line = line.strip() 64 | if line != '': 65 | self.add(line) 66 | f.close() 67 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/evaluation.py: -------------------------------------------------------------------------------- 1 | ''' Framework independent evaluator. Not in use yet. 2 | ''' 3 | import numpy 4 | import os 5 | from os.path import join 6 | # import subprocess 7 | from .constants import ROOT_DIR 8 | from .conll_utils import print_gold_to_conll 9 | # from .measurements import Timer 10 | 11 | 12 | class TaggerEvaluator(object): 13 | def __init__(self, data): 14 | self.data = data 15 | self.best_accuracy = 0.0 16 | self.has_best = False 17 | 18 | def compute_accuracy(self, predictions): 19 | for x, y in zip(predictions, 20 | [sent[2] for sent in self.data 21 | ]): # the predication's order should be the origin 22 | assert len(x) == y 23 | predictions = numpy.concatenate(predictions) 24 | tensors = self.data 25 | answer = numpy.concatenate( 26 | [sent[1].reshape(sent[1].shape[1]) for sent in tensors]) 27 | # predictions.resize(predictions.shape[0]) # resize the answer to the [length, 1] 28 | num_correct = numpy.equal(predictions, answer).sum() 29 | num_total = answer.shape[0] 30 | self.accuracy = (100.0 * num_correct) / num_total 31 | print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct, 32 | num_total)) 33 | 34 | def evaluate(self, predictions): 35 | self.compute_accuracy(predictions) 36 | self.has_best = self.accuracy > self.best_accuracy 37 | if self.has_best: 38 | print("Best accuracy so far: {:.3f}".format(self.accuracy)) 39 | self.best_accuracy = self.accuracy 40 | 41 | 42 | class PropIdEvaluator(object): 43 | def __init__(self, data, label_dict, target_label='V', 44 | use_se_marker=False): 45 | self.data = data 46 | self.label_dict = label_dict 47 | self.target_label_id = label_dict.str2idx[target_label] 48 | self.best_accuracy = 0.0 49 | self.has_best = False 50 | 51 | def compute_accuracy(self, predictions): 52 | _, y, _, weights = self.data 53 | # print predictions.shape, predictions 54 | identified = numpy.equal(predictions, self.target_label_id) 55 | print(y) 56 | # print self.target_label_id 57 | # print identified 58 | # exit() 59 | num_correct = numpy.sum( 60 | numpy.logical_and(numpy.equal(predictions, y), identified) * weights) 61 | num_identified = numpy.sum(identified * weights) 62 | num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights) 63 | self.precision = 100.0 * num_correct / num_identified 64 | self.recall = 100.0 * num_correct / num_gold 65 | self.accuracy = 2 * self.precision * self.recall / (self.precision + self.recall) 66 | print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format( 67 | self.accuracy, self.precision, self.recall)) 68 | 69 | def evaluate(self, predictions): 70 | self.compute_accuracy(predictions) 71 | self.has_best = self.accuracy > self.best_accuracy 72 | if self.has_best: 73 | print("Best accuracy so far: {:.3f}".format(self.accuracy)) 74 | self.best_accuracy = self.accuracy 75 | 76 | 77 | class SRLEvaluator(TaggerEvaluator): 78 | def __init__(self): 79 | self.best_accuracy = -1.0 80 | self.has_best = False 81 | 82 | def compute_accuracy(self, predictions): 83 | print("exit()") 84 | exit() 85 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/features.py: -------------------------------------------------------------------------------- 1 | def get_srl_features(sentences, config, feature_dicts=None): 2 | ''' TODO: Support adding more features. 3 | ''' 4 | feature_names = config.features 5 | feature_sizes = config.feature_sizes 6 | use_se_marker = config.use_se_marker 7 | 8 | features = [] 9 | feature_shapes = [] 10 | for fname, fsize in zip(feature_names, feature_sizes): 11 | if fname == "predicate": 12 | offset = int(use_se_marker) 13 | offset = 1 # pad is in the position 0 14 | features.append([[int((i == sent[2]) + offset) for i in range(len(sent[1]))] for sent in sentences]) 15 | feature_shapes.append([2, fsize]) 16 | return (zip(*features), feature_shapes) 17 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/inference.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | def get_transition_params(label_strs): 5 | """Construct transtion scoresd (0 for allowed, -inf for invalid). 6 | Args: 7 | label_strs: A [num_tags,] sequence of BIO-tags. 8 | Returns: 9 | A [num_tags, num_tags] matrix of transition scores. 10 | """ 11 | num_tags = len(label_strs) 12 | transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32) 13 | for i, prev_label in enumerate(label_strs): 14 | for j, label in enumerate(label_strs): 15 | if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]: 16 | transition_params[i, j] = numpy.NINF 17 | return transition_params 18 | 19 | 20 | def viterbi_decode(score, transition_params): 21 | """ Adapted from Tensorflow implementation. 22 | Decode the highest scoring sequence of tags outside of TensorFlow. 23 | This should only be used at test time. 24 | Args: 25 | score: A [seq_len, num_tags] matrix of unary potentials. 26 | transition_params: A [num_tags, num_tags] matrix of binary potentials. 27 | Returns: 28 | viterbi: A [seq_len] list of integers containing the highest scoring tag 29 | indicies. 30 | viterbi_score: A float containing the score for the Viterbi sequence. 31 | """ 32 | trellis = numpy.zeros_like(score) 33 | backpointers = numpy.zeros_like(score, dtype=numpy.int32) 34 | trellis[0] = score[0] 35 | for t in range(1, score.shape[0]): 36 | v = numpy.expand_dims(trellis[t - 1], 1) + transition_params 37 | trellis[t] = score[t] + numpy.max(v, 0) 38 | backpointers[t] = numpy.argmax(v, 0) 39 | viterbi = [numpy.argmax(trellis[-1])] 40 | for bp in reversed(backpointers[1:]): 41 | viterbi.append(bp[viterbi[-1]]) 42 | viterbi.reverse() 43 | viterbi_score = numpy.max(trellis[-1]) 44 | return viterbi, viterbi_score 45 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/io_utils.py: -------------------------------------------------------------------------------- 1 | from google.protobuf.internal import encoder 2 | 3 | _EncodeVarint = encoder._VarintEncoder() 4 | 5 | 6 | def write_delimited_to(out_file, message): 7 | msg_size = message.ByteSize() 8 | pieces = [] 9 | _EncodeVarint(pieces.append, msg_size) 10 | out_file.write(b"".join(pieces)) 11 | out_file.write(message.SerializeToString()) 12 | 13 | 14 | def read_gold_props(gold_props_file): 15 | """ Read gold predicates from CoNLL-formatted file. 16 | """ 17 | gold_props = [] 18 | props = [] 19 | with open(gold_props_file, 'r') as f: 20 | for line in f: 21 | line = line.strip() 22 | if line == '': 23 | gold_props.append(props) 24 | props = [] 25 | else: 26 | props.append(line.split()[0]) 27 | f.close() 28 | if len(props) > 0: 29 | gold_props.append(props) 30 | return gold_props 31 | 32 | 33 | def write_predprops_to(predictions, 34 | label_dict, 35 | input_file, 36 | output_file, 37 | gold_props_file=None, 38 | output_props_file=None): 39 | """ Write predicted predicate information to files. 40 | 41 | Arguments: 42 | predictions: Predictions from the predicate identification model. 43 | Is a numpy array of size [num_sentences, max_sentence_length]. 44 | label_dict: Label dictionary. 45 | input_file: Input sequential tagging file. 46 | output_file: Output SRL file with identified predicates. 47 | gold_props_file: Input file with gold predicates in CoNLL format. 48 | output_props_file: Output SRL file with identified predicates, in CoNLL format. 49 | """ 50 | 51 | fin = open(input_file, 'r') 52 | fout = open(output_file, 'w') 53 | 54 | if output_props_file is not None and output_props_file != '': 55 | fout_props = open(output_props_file, 'w') 56 | else: 57 | fout_props = None 58 | 59 | if gold_props_file is not None and gold_props_file != '': 60 | gold_props = read_gold_props(gold_props_file) 61 | print(len(gold_props), len(predictions)) 62 | assert len(gold_props) == len(predictions) 63 | else: 64 | gold_props = None 65 | 66 | sent_id = 0 67 | for line in fin: 68 | # Read original sentence from input file. 69 | raw_sent = line.split('|||')[0].strip() 70 | tokens = raw_sent.split(' ') 71 | slen = len(tokens) 72 | pred = predictions[sent_id, :slen] 73 | props = [] 74 | 75 | for (t, p) in enumerate(pred): 76 | if label_dict.idx2str[p] == 'V': 77 | out_tags = ['O' for _ in range(slen)] 78 | out_tags[t] = 'B-V' 79 | out_line = str(t) + '\t' + raw_sent + ' ||| ' + ' '.join( 80 | out_tags) + '\n' 81 | fout.write(out_line) 82 | props.append(t) 83 | 84 | if fout_props is not None: 85 | if sent_id > 0: 86 | fout_props.write('\n') 87 | for t in range(slen): 88 | lemma = 'P' + tokens[t].lower() 89 | # In order for CoNLL evaluation script to run, we need to output the same 90 | # lemma as the gold predicate in the CoNLL-formatted file. 91 | if gold_props is not None and gold_props[sent_id][t] != '-': 92 | lemma = gold_props[sent_id][t] 93 | if t in props: 94 | fout_props.write(lemma) 95 | else: 96 | fout_props.write('-') 97 | for p in props: 98 | if t == p: 99 | fout_props.write('\t(V*)') 100 | else: 101 | fout_props.write('\t*') 102 | fout_props.write('\n') 103 | sent_id += 1 104 | 105 | fout.close() 106 | print('Predicted predicates in sequential-tagging format written to: {}.'. 107 | format(output_file)) 108 | if fout_props is not None: 109 | fout_props.close() 110 | print('CoNLL-formatted predicate information written to: {}.'.format( 111 | output_props_file)) 112 | 113 | 114 | def bio_to_spans(predictions, label_dict): 115 | """ Convert BIO-based predictions to a set of arguments. 116 | Arguments: 117 | predictions: A single integer array, already truncated to the original sequence lengths. 118 | label_dict: Label dictionary. 119 | Returns: 120 | A sequence of labeled arguments: [ ("ARG_LABEL", span_start, span_end), ... ], ordered by their positions. 121 | """ 122 | args = [] 123 | tags = [label_dict.idx2str[p] for p in predictions] 124 | for (i, tag) in enumerate(tags): 125 | if tag == 'O': 126 | continue 127 | label = tag[2:] 128 | # Append new span. 129 | if tag[0] == 'B' or len(args) == 0 or label != tags[i - 1][2:]: 130 | args.append([label, i, -1]) 131 | # Close current span. 132 | if i == len(predictions) - 1 or tags[ 133 | i + 1][0] == 'B' or label != tags[i + 1][2:]: 134 | args[-1][2] = i 135 | return args 136 | 137 | 138 | def print_to_readable(predictions, num_tokens, label_dict, input_path, 139 | output_path): 140 | """ Print predictions to human-readable format. 141 | """ 142 | fout = open(output_path, 'w') 143 | sample_id = 0 144 | for line in open(input_path, 'r'): 145 | info = line.split('|||')[0].strip().split() 146 | pid = int(info[0]) 147 | sent = info[1:] 148 | fout.write(' '.join(sent) + '\n') 149 | fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid)) 150 | 151 | tags = predictions[sample_id] 152 | arg_spans = bio_to_spans(tags, label_dict) 153 | for arg in arg_spans: 154 | fout.write('\t\t{}: {}\n'.format(arg[0], " ".join( 155 | sent[arg[1]:arg[2] + 1]))) 156 | fout.write('\n') 157 | sample_id += 1 158 | 159 | fout.close() 160 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/measurements.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | 4 | 5 | class Timer: 6 | def __init__(self, name, active=True): 7 | self.name = name if active else None 8 | 9 | def __enter__(self): 10 | self.start = time.time() 11 | self.last_tick = self.start 12 | return self 13 | 14 | def __exit__(self, *args): 15 | if self.name is not None: 16 | print("{} duration was {}.".format( 17 | self.name, self.readable(time.time() - self.start))) 18 | 19 | def readable(self, seconds): 20 | return str(datetime.timedelta(seconds=int(seconds))) 21 | 22 | def tick(self, message): 23 | current = time.time() 24 | print("{} took {} ({} since last tick).".format( 25 | message, self.readable(current - self.start), 26 | self.readable(current - self.last_tick))) 27 | self.last_tick = current 28 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/numpy_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | def orth_normal_initializer(factor=1.0, seed=None): 5 | ''' Reference: Exact solutions to the nonlinear dynamics of learning in 6 | deep linear neural networks 7 | Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf 8 | Adapted from the original implementation by Mingxuan Wang. 9 | ''' 10 | def _initializer(shape, dtype): 11 | assert len(shape) == 2 12 | rng = numpy.random.RandomState(seed) 13 | if shape[0] == shape[1]: 14 | M = rng.randn(*shape).astype(dtype) 15 | Q, R = numpy.linalg.qr(M) 16 | Q = Q * numpy.sign(numpy.diag(R)) 17 | param = Q * factor 18 | return param 19 | else: 20 | M1 = rng.randn(shape[0], shape[0]).astype(dtype) 21 | M2 = rng.randn(shape[1], shape[1]).astype(dtype) 22 | Q1, R1 = numpy.linalg.qr(M1) 23 | Q2, R2 = numpy.linalg.qr(M2) 24 | Q1 = Q1 * numpy.sign(numpy.diag(R1)) 25 | Q2 = Q2 * numpy.sign(numpy.diag(R2)) 26 | n_min = min(shape[0], shape[1]) 27 | param = numpy.dot(Q1[:, :n_min], Q2[:n_min, :]) * factor 28 | return param 29 | 30 | return _initializer 31 | 32 | 33 | def block_orth_normal_initializer(input_shapes, 34 | output_shapes, 35 | factor=1.0, 36 | seed=None): 37 | ''' Initialize a gigantic weight matrix where each block is a normal orthogonal matrix. 38 | Input: 39 | - input_shapes: the sizes of each block alone dimension 0. 40 | - output_shapes: the sizes of each block along dimension 1. 41 | for example input_shapes = [100, 128] output_shapes=[100,100,100,100] 42 | indicates eight blocks with shapes [100,100], [128,100], etc. 43 | ''' 44 | def _initializer(shape, dtype): 45 | assert len(shape) == 2 46 | initializer = orth_normal_initializer(factor, seed) 47 | params = numpy.concatenate([ 48 | numpy.concatenate([ 49 | initializer([dim_in, dim_out], dtype) 50 | for dim_out in output_shapes 51 | ], 1) for dim_in in input_shapes 52 | ], 0) 53 | return params 54 | 55 | return _initializer 56 | 57 | 58 | def random_normal_initializer(mean=0.0, stddev=0.01, seed=None): 59 | def _initializer(shape, dtype): 60 | rng = numpy.random.RandomState(seed) 61 | return numpy.asarray(rng.normal(mean, stddev, shape), dtype) 62 | 63 | return _initializer 64 | 65 | 66 | def all_zero_initializer(): 67 | def _initializer(shape, dtype): 68 | return numpy.zeros(shape).astype(dtype) 69 | 70 | return _initializer 71 | 72 | 73 | def uniform_initializer(value=0.01): 74 | def _initializer(shape, dtype): 75 | return numpy.full(shape, value).astype(dtype) 76 | 77 | return _initializer 78 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/scores_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: scores.proto 3 | 4 | import sys 5 | import tensor_pb2 as tensor__pb2 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # from google.protobuf import descriptor_pb2 11 | 12 | 13 | # @@protoc_insertion_point(imports) 14 | _b = sys.version_info[0] < 3 and (lambda x: x) or ( 15 | lambda x: x.encode('latin1')) 16 | 17 | 18 | _sym_db = _symbol_database.Default() 19 | 20 | 21 | DESCRIPTOR = _descriptor.FileDescriptor( 22 | name='scores.proto', 23 | package='', 24 | syntax='proto2', 25 | serialized_pb=_b( 26 | '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\ 27 | sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto' 28 | ), 29 | dependencies=[ 30 | tensor__pb2.DESCRIPTOR, 31 | ]) 32 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 33 | 34 | _SENTENCESCORESPROTO = _descriptor.Descriptor( 35 | name='SentenceScoresProto', 36 | full_name='SentenceScoresProto', 37 | filename=None, 38 | file=DESCRIPTOR, 39 | containing_type=None, 40 | fields=[ 41 | _descriptor.FieldDescriptor( 42 | name='sentence_id', 43 | full_name='SentenceScoresProto.sentence_id', 44 | index=0, 45 | number=1, 46 | type=13, 47 | cpp_type=3, 48 | label=1, 49 | has_default_value=False, 50 | default_value=0, 51 | message_type=None, 52 | enum_type=None, 53 | containing_type=None, 54 | is_extension=False, 55 | extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor(name='scores', 58 | full_name='SentenceScoresProto.scores', 59 | index=1, 60 | number=2, 61 | type=11, 62 | cpp_type=10, 63 | label=1, 64 | has_default_value=False, 65 | default_value=None, 66 | message_type=None, 67 | enum_type=None, 68 | containing_type=None, 69 | is_extension=False, 70 | extension_scope=None, 71 | options=None), 72 | ], 73 | extensions=[], 74 | nested_types=[], 75 | enum_types=[], 76 | options=None, 77 | is_extendable=False, 78 | syntax='proto2', 79 | extension_ranges=[], 80 | oneofs=[], 81 | serialized_start=30, 82 | serialized_end=102, 83 | ) 84 | 85 | _SCORESPROTO = _descriptor.Descriptor( 86 | name='ScoresProto', 87 | full_name='ScoresProto', 88 | filename=None, 89 | file=DESCRIPTOR, 90 | containing_type=None, 91 | fields=[ 92 | _descriptor.FieldDescriptor(name='sentences', 93 | full_name='ScoresProto.sentences', 94 | index=0, 95 | number=1, 96 | type=11, 97 | cpp_type=10, 98 | label=3, 99 | has_default_value=False, 100 | default_value=[], 101 | message_type=None, 102 | enum_type=None, 103 | containing_type=None, 104 | is_extension=False, 105 | extension_scope=None, 106 | options=None), 107 | ], 108 | extensions=[], 109 | nested_types=[], 110 | enum_types=[], 111 | options=None, 112 | is_extendable=False, 113 | syntax='proto2', 114 | extension_ranges=[], 115 | oneofs=[], 116 | serialized_start=104, 117 | serialized_end=158, 118 | ) 119 | 120 | _SENTENCESCORESPROTO.fields_by_name[ 121 | 'scores'].message_type = tensor__pb2._TENSORPROTO 122 | _SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO 123 | DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO 124 | DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO 125 | 126 | SentenceScoresProto = _reflection.GeneratedProtocolMessageType( 127 | 'SentenceScoresProto', 128 | (_message.Message, ), 129 | dict(DESCRIPTOR=_SENTENCESCORESPROTO, 130 | __module__='scores_pb2' 131 | # @@protoc_insertion_point(class_scope:SentenceScoresProto) 132 | )) 133 | _sym_db.RegisterMessage(SentenceScoresProto) 134 | 135 | ScoresProto = _reflection.GeneratedProtocolMessageType( 136 | 'ScoresProto', 137 | (_message.Message, ), 138 | dict(DESCRIPTOR=_SCORESPROTO, 139 | __module__='scores_pb2' 140 | # @@protoc_insertion_point(class_scope:ScoresProto) 141 | )) 142 | _sym_db.RegisterMessage(ScoresProto) 143 | 144 | # @@protoc_insertion_point(module_scope) 145 | -------------------------------------------------------------------------------- /src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/syntactic_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | from .dictionary import Dictionary 5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN 6 | from collections import OrderedDict 7 | 8 | 9 | class SyntacticTree(object): 10 | def __init__(self, sentence_id): 11 | self.sentence_id = sentence_id 12 | self.word_forms = ["Root"] 13 | self.word_forms_ids = [] 14 | self.char_ids = [[]] # 2D 15 | self.pos_forms = ["Root"] 16 | self.heads = [0] 17 | self.labels = ["Root"] 18 | self.labels_id = [] 19 | 20 | 21 | class SyntacticCONLL(object): 22 | def __init__(self): 23 | self.file_name = "" 24 | self.trees = [] 25 | self.sample_dep_data = None 26 | 27 | def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8): 28 | self.file_name = filename 29 | 30 | print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio)) 31 | conll_file = codecs.open(self.file_name, 'r', encoding="utf8") 32 | if conll_file.closed: 33 | print("Cannot open the syntactic conll file! Please check {}".format(self.file_name)) 34 | 35 | sentence_id = 0 36 | a_tree = SyntacticTree(sentence_id) 37 | find_root = False 38 | for line in conll_file: 39 | if line == '\n' or line == '\r\n': # new sentence 40 | sentence_id += 1 41 | if len(a_tree.word_forms) <= max_sentence_length: 42 | assert find_root is True 43 | # keep the sentence with the length < max_sentence_l 44 | self.trees.append(a_tree) 45 | a_tree = SyntacticTree(sentence_id) 46 | find_root = False 47 | continue 48 | tokens = line.strip().split('\t') 49 | a_tree.word_forms.append(tokens[1]) 50 | a_tree.pos_forms.append(tokens[3]) 51 | # head = int(tokens[6]) if int(tokens[6]) > 0 else -1 52 | head = int(tokens[6]) - 1 # root's head is 0 53 | if head == -1: 54 | assert tokens[7] == "root" 55 | find_root = True 56 | a_tree.heads.append(head) 57 | a_tree.labels.append(tokens[7]) 58 | token_9 = tokens[9] # or tokens 9 will be 'unicode' type 59 | dep_prob = 1.0 if isinstance(token_9, str) else float(token_9) 60 | if dep_prob < prune_ratio: 61 | a_tree.heads[-1] = -1 62 | print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees))) 63 | 64 | @staticmethod 65 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 66 | ids = [] 67 | for s in list_of_words: 68 | s = s 69 | if s is None: 70 | ids.append(-1) 71 | continue 72 | if lowercase: 73 | s = s.lower() 74 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 75 | s = UNKNOWN_TOKEN 76 | ids.append(dictionary.add(s)) 77 | return ids 78 | 79 | def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None): 80 | for tree in self.trees: 81 | tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False, 82 | pretrained_word_embedding) 83 | words = tree.word_forms 84 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 85 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64) 86 | for i, word in enumerate(words): 87 | single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict) 88 | # Add the sample char tokens into the sample_char_tokens 89 | tree.char_ids = single_sample_char_tokens 90 | 91 | tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict) 92 | 93 | sample_word_texts = [tree.word_forms for tree in self.trees] 94 | sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees] 95 | sample_char_ids = [tree.char_ids for tree in self.trees] 96 | sample_heads = [np.asarray(tree.heads) for tree in self.trees] 97 | sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees] 98 | self.sample_dep_data = list(zip(sample_word_texts, 99 | sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids)) 100 | 101 | def get_syntactic_label_dict(self, syn_label_dict=None): 102 | if syn_label_dict is None: 103 | syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN) 104 | else: 105 | assert syn_label_dict.accept_new is False 106 | sentences_length = len(self.trees) 107 | for i in range(sentences_length): 108 | ith_sentence_length = len(self.trees[i].labels) 109 | for j in range(ith_sentence_length): 110 | self.trees[i].labels_id.append(syn_label_dict.add(self.trees[i].labels[j])) 111 | return syn_label_dict 112 | 113 | 114 | def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings): 115 | dep_trees = SyntacticCONLL() 116 | dep_trees.read_from_file(file_path, max_sentence_length=2000) 117 | dep_trees.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings) 118 | 119 | auto_dep_trees = OrderedDict() 120 | for tree in dep_trees.trees: 121 | sentence = ' '.join(tree.word_forms[1:]) # remove the "Root" 122 | auto_dep_trees[sentence] = tree 123 | return auto_dep_trees 124 | 125 | 126 | class SyntacticRepresentation(object): 127 | def __init__(self): 128 | self.file_name = "" 129 | self.representations = [] 130 | 131 | def read_from_file(self, filename): 132 | self.file_name = filename 133 | print("Reading lstm representations from {}".format(self.file_name)) 134 | representation_file = open(self.file_name, 'r') 135 | if representation_file.closed: 136 | print("Cannot open the representation file! Please check {}".format(self.file_name)) 137 | exit() 138 | each_sentence_representations = [] 139 | for line in representation_file: 140 | if line == '\n' or line == "\r\n": # new sentence 141 | self.representations.append(each_sentence_representations) 142 | each_sentence_representations = [] 143 | continue 144 | line = line.strip() 145 | line = line.split('\t') 146 | line = line[1].split(' ') 147 | rep = np.asarray(line, dtype=np.float32) 148 | each_sentence_representations.append(rep) 149 | representation_file.close() 150 | print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations))) 151 | 152 | def minus_by_the_predicate(self, corpus_tensors): 153 | has_processed_sentence_id = {} 154 | for i, data in enumerate(corpus_tensors): 155 | sentence_id = data[0][0][0] 156 | predicates = data[0][2] 157 | predicate_id = predicates.argmax() 158 | if sentence_id in has_processed_sentence_id: 159 | continue 160 | else: 161 | has_processed_sentence_id[sentence_id] = 1 162 | for j in range(1, len(self.representations[sentence_id])): # Root doesn't use. 163 | self.representations[sentence_id][j] = self.representations[sentence_id][predicate_id] - self.representations[sentence_id][j] 164 | 165 | def check_math_corpus(self, lengths): 166 | for i, length in enumerate(lengths): 167 | if len(self.representations[i]) != length + 1: # 1 means the first one, Root. Actually never use it. 168 | print(i, length, len(self.representations[i])) 169 | print("sentence {} doesn't match: lstm representation {} vs corpus {}" .format(i, len(self.representations[i])), length) 170 | exit() 171 | print("LSTM representation match the corpus!") 172 | -------------------------------------------------------------------------------- /src/orl-4.1/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["neural_srl"] -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/TreeLSTM/Encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from torch.nn.utils.rnn import pack_padded_sequence as pack 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack 6 | from .TreeGRU import DTTreeGRU, TDTreeGRU 7 | from .Tree import creatTree 8 | 9 | 10 | class EncoderRNN(nn.Module): 11 | """ The standard RNN encoder. 12 | """ 13 | def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1): 14 | super(EncoderRNN, self).__init__() 15 | self.hidden_size = hidden_size 16 | self.num_layers = num_layers 17 | self.dropout = nn.Dropout(dropout) 18 | 19 | self.rnn = nn.GRU(input_size=input_size, 20 | hidden_size=hidden_size, 21 | num_layers=num_layers, 22 | bidirectional=True) # batch_first = False 23 | self.transform = nn.Linear(in_features=2 * hidden_size, 24 | out_features=input_size, 25 | bias=True) 26 | self.dt_tree = DTTreeGRU(input_size, hidden_size) 27 | self.td_tree = TDTreeGRU(input_size, hidden_size) 28 | 29 | def forward(self, input, heads, lengths=None, hidden=None): 30 | """ See EncoderBase.forward() for description of args and returns. 31 | inputs: [L, B, H], including the -ROOT- 32 | heads: [heads] * B 33 | """ 34 | emb = self.dropout(input) 35 | 36 | packed_emb = emb 37 | if lengths is not None: 38 | # Lengths data is wrapped inside a Variable. 39 | packed_emb = pack(emb, lengths) 40 | 41 | outputs, hidden_t = self.rnn(packed_emb, hidden) 42 | 43 | if lengths is not None: 44 | outputs = unpack(outputs)[0] 45 | 46 | outputs = self.dropout(self.transform(outputs)) 47 | max_length, batch_size, input_dim = outputs.size() 48 | trees = [] 49 | indexes = np.full((max_length, batch_size), -1, 50 | dtype=np.int32) # a col is a sentence 51 | for b, head in enumerate(heads): 52 | root, tree = creatTree( 53 | head) # head: a sentence's heads; sentence base 54 | root.traverse() # traverse the tree 55 | for step, index in enumerate(root.order): 56 | indexes[step, b] = index 57 | trees.append(tree) 58 | 59 | dt_outputs, dt_hidden_ts = self.dt_tree.forward( 60 | outputs, indexes, trees) 61 | td_outputs, td_hidden_ts = self.td_tree.forward( 62 | outputs, indexes, trees) 63 | 64 | outputs = torch.cat([dt_outputs, td_outputs], dim=2).transpose(0, 1) 65 | output_t = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0) 66 | 67 | return outputs, output_t 68 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/TreeLSTM/Tree.py: -------------------------------------------------------------------------------- 1 | class Tree(object): 2 | def __init__(self, index): 3 | self.parent = None 4 | self.is_left = False 5 | self.index = index 6 | self.left_children = list() 7 | self.left_num = 0 8 | self.right_children = list() 9 | self.right_num = 0 10 | self._depth = -1 11 | self.order = [] 12 | 13 | def add_left(self, child): 14 | """ 15 | :param child: a Tree object represent the child 16 | :return: 17 | """ 18 | child.parent = self 19 | child.is_left = True 20 | self.left_children.append(child) 21 | self.left_num += 1 22 | 23 | def add_right(self, child): 24 | """ 25 | :param child: a Tree object represent the child 26 | :return: 27 | """ 28 | child.parent = self 29 | child.is_left = False 30 | self.right_children.append(child) 31 | self.right_num += 1 32 | 33 | def size(self): # compute the total size of the Tree 34 | if hasattr(self, '_size'): 35 | return self._size 36 | count = 1 37 | for i in range(self.left_num): 38 | count += self.left_children[i].size() 39 | for i in range(self.right_num): 40 | count += self.right_children[i].size() 41 | self._size = count 42 | return self._size 43 | 44 | def depth(self): # compute the depth of the Tree 45 | if self._depth > 0: 46 | return self._depth 47 | count = 0 48 | if self.left_num + self.right_num > 0: 49 | for i in range(self.left_num): 50 | child_depth = self.left_children[i].depth() 51 | if child_depth > count: 52 | count = child_depth 53 | for i in range(self.right_num): 54 | child_depth = self.right_children[i].depth() 55 | if child_depth > count: 56 | count = child_depth 57 | count += 1 58 | self._depth = count 59 | return self._depth 60 | 61 | def traverse(self): # traverse the Tree 62 | if len(self.order) > 0: 63 | return self.order 64 | 65 | for i in range(self.left_num): 66 | left_order = self.left_children[i].traverse() 67 | self.order.extend(left_order) 68 | for i in range(self.right_num): 69 | right_order = self.right_children[i].traverse() 70 | self.order.extend(right_order) 71 | self.order.append(self.index) # append the root 72 | return self.order 73 | 74 | 75 | def creatTree(heads): 76 | tree = [] 77 | # current sentence has already been numberized [form, head, rel] 78 | root = None 79 | for idx, head in enumerate(heads): 80 | tree.append(Tree(idx)) 81 | 82 | for idx, head in enumerate(heads): 83 | if head == -1: # -1 mszhang, 0 kiro 84 | root = tree[idx] 85 | continue 86 | if head < 0: 87 | print('error: multi roots') 88 | if head > idx: 89 | tree[head].add_left(tree[idx]) 90 | if head < idx: 91 | tree[head].add_right(tree[idx]) 92 | if head == idx: 93 | print('error: head is it self.') 94 | 95 | return root, tree 96 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/TreeLSTM/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["Encoder", "Tree", "TreeGRU"] 2 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/gcn.py: -------------------------------------------------------------------------------- 1 | """ 2 | GCN model for relation extraction. 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | from ..shared.constants import PAD_ID 10 | import numpy as np 11 | 12 | 13 | class GCN(nn.Module): 14 | def __init__(self, config, input_dim, mem_dim, num_layers): 15 | super(GCN, self).__init__() 16 | self.config = config 17 | self.input_dim = input_dim 18 | self.mem_dim = mem_dim 19 | self.layers = num_layers 20 | 21 | # rnn layer 22 | if self.config.gcn_rnn is True: 23 | input_size = self.input_dim 24 | self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True, 25 | dropout=self.config.gcn_rnn_dropout, bidirectional=True) 26 | self.in_dim = self.config.gcn_rnn_hidden * 2 27 | self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout) # use on last layer output 28 | 29 | self.in_drop = nn.Dropout(self.config.gcn_input_dropout) 30 | self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout) 31 | 32 | # gcn layer 33 | self.W = nn.ModuleList() 34 | self.layer_normalization = nn.ModuleList() 35 | 36 | for layer in range(self.layers): 37 | # input_dim = self.in_dim if layer == 0 else self.mem_dim 38 | self.W.append(nn.Linear(self.in_dim, self.in_dim)) 39 | self.layer_normalization.append(LayerNormalization(self.in_dim)) 40 | 41 | def conv_l2(self): 42 | conv_weights = [] 43 | for w in self.W: 44 | conv_weights += [w.weight, w.bias] 45 | return sum([x.pow(2).sum() for x in conv_weights]) 46 | 47 | def encode_with_rnn(self, rnn_inputs, masks, batch_size): 48 | seq_lens = masks.data.eq(1).long().sum(1).squeeze() 49 | h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers) 50 | 51 | # SORT YOUR TENSORS BY LENGTH! 52 | seq_lens, perm_idx = seq_lens.sort(0, descending=True) 53 | 54 | rnn_inputs = rnn_inputs[perm_idx] 55 | rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True) 56 | rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0)) 57 | rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True) 58 | 59 | _, unperm_idx = perm_idx.sort(0) 60 | rnn_outputs = rnn_outputs[unperm_idx] 61 | return rnn_outputs 62 | 63 | def forward(self, adj, embs, masks): 64 | batch_size = masks.size()[0] 65 | embs = self.in_drop(embs) 66 | # rnn layer 67 | if self.config.gcn_rnn is True: 68 | gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size)) 69 | else: 70 | gcn_inputs = embs 71 | 72 | # gcn layer 73 | denom = adj.sum(2).unsqueeze(2) + 1 74 | mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2) 75 | # # zero out adj for ablation 76 | # if self.opt.get('no_adj', False): 77 | # adj = torch.zeros_like(adj) 78 | 79 | for l in range(self.layers): 80 | # print(gcn_inputs.size(), adj.size()) 81 | x = gcn_inputs 82 | Ax = adj.bmm(gcn_inputs) 83 | AxW = self.W[l](Ax) 84 | AxW = AxW + self.W[l](gcn_inputs) # self loop 85 | AxW = AxW / denom 86 | 87 | gAxW = F.relu(AxW) 88 | gcn_inputs = self.gcn_drop(gAxW) 89 | self.layer_normalization[l].forward(gcn_inputs + x) 90 | 91 | return gcn_inputs, mask 92 | 93 | 94 | def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True): 95 | total_layers = num_layers * 2 if bidirectional else num_layers 96 | state_shape = (total_layers, batch_size, hidden_dim) 97 | h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False) 98 | if use_cuda: 99 | return h0.cuda(), c0.cuda() 100 | else: 101 | return h0, c0 102 | 103 | 104 | class LayerNormalization(nn.Module): 105 | ''' Layer normalization module ''' 106 | 107 | def __init__(self, d_hid, eps=1e-3): # 108 | super(LayerNormalization, self).__init__() 109 | self.eps = eps 110 | self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) 111 | self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) 112 | 113 | def forward(self, z): 114 | if z.size(1) == 1: 115 | return z 116 | mu = torch.mean(z, keepdim=True, dim=-1) 117 | sigma = torch.std(z, keepdim=True, dim=-1) 118 | ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) # 1e-3 is ok, because variance and std. 119 | ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) 120 | return ln_out 121 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/gcn_model/tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basic operations on trees. 3 | """ 4 | 5 | import numpy as np 6 | from collections import defaultdict 7 | 8 | 9 | class Tree(object): 10 | """ 11 | Reused tree object from stanfordnlp/treelstm. 12 | """ 13 | 14 | def __init__(self): 15 | self.parent = None 16 | # head probability 17 | self.phead = -1 18 | self.num_children = 0 19 | self.children = list() 20 | 21 | def add_child(self, child): 22 | child.parent = self 23 | self.num_children += 1 24 | self.children.append(child) 25 | 26 | def size(self): 27 | if getattr(self, '_size'): 28 | return self._size 29 | count = 1 30 | for i in xrange(self.num_children): 31 | count += self.children[i].size() 32 | self._size = count 33 | return self._size 34 | 35 | def depth(self): 36 | if getattr(self, '_depth'): 37 | return self._depth 38 | count = 0 39 | if self.num_children > 0: 40 | for i in xrange(self.num_children): 41 | child_depth = self.children[i].depth() 42 | if child_depth > count: 43 | count = child_depth 44 | count += 1 45 | self._depth = count 46 | return self._depth 47 | 48 | def __iter__(self): 49 | yield self 50 | for c in self.children: 51 | for x in c: 52 | yield x 53 | 54 | 55 | def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos): 56 | """ 57 | Convert a sequence of head indexes into a tree object. 58 | """ 59 | tokens = tokens[:len_].tolist() 60 | head = head[:len_].tolist() 61 | root = None 62 | 63 | if prune < 0: 64 | nodes = [Tree() for _ in head] 65 | 66 | for i in range(len(nodes)): 67 | h = head[i] 68 | nodes[i].idx = i 69 | nodes[i].dist = -1 # just a filler 70 | if h == 0: 71 | root = nodes[i] 72 | else: 73 | nodes[h - 1].add_child(nodes[i]) 74 | else: 75 | # find dependency path 76 | subj_pos = [i for i in range(len_) if subj_pos[i] == 0] 77 | obj_pos = [i for i in range(len_) if obj_pos[i] == 0] 78 | 79 | cas = None 80 | 81 | subj_ancestors = set(subj_pos) 82 | for s in subj_pos: 83 | h = head[s] 84 | tmp = [s] 85 | while h > 0: 86 | tmp += [h - 1] 87 | subj_ancestors.add(h - 1) 88 | h = head[h - 1] 89 | 90 | if cas is None: 91 | cas = set(tmp) 92 | else: 93 | cas.intersection_update(tmp) 94 | 95 | obj_ancestors = set(obj_pos) 96 | for o in obj_pos: 97 | h = head[o] 98 | tmp = [o] 99 | while h > 0: 100 | tmp += [h - 1] 101 | obj_ancestors.add(h - 1) 102 | h = head[h - 1] 103 | cas.intersection_update(tmp) 104 | 105 | # find lowest common ancestor 106 | if len(cas) == 1: 107 | lca = list(cas)[0] 108 | else: 109 | child_count = {k: 0 for k in cas} 110 | for ca in cas: 111 | if head[ca] > 0 and head[ca] - 1 in cas: 112 | child_count[head[ca] - 1] += 1 113 | 114 | # the LCA has no child in the CA set 115 | for ca in cas: 116 | if child_count[ca] == 0: 117 | lca = ca 118 | break 119 | 120 | path_nodes = subj_ancestors.union(obj_ancestors).difference(cas) 121 | path_nodes.add(lca) 122 | 123 | # compute distance to path_nodes 124 | dist = [-1 if i not in path_nodes else 0 for i in range(len_)] 125 | 126 | for i in range(len_): 127 | if dist[i] < 0: 128 | stack = [i] 129 | while stack[-1] >= 0 and stack[-1] not in path_nodes: 130 | stack.append(head[stack[-1]] - 1) 131 | 132 | if stack[-1] in path_nodes: 133 | for d, j in enumerate(reversed(stack)): 134 | dist[j] = d 135 | else: 136 | for j in stack: 137 | if j >= 0 and dist[j] < 0: 138 | dist[j] = int(1e4) # aka infinity 139 | 140 | highest_node = lca 141 | nodes = [Tree() if dist[i] <= prune else None for i in range(len_)] 142 | 143 | for i in range(len(nodes)): 144 | if nodes[i] is None: 145 | continue 146 | h = head[i] 147 | nodes[i].idx = i 148 | nodes[i].dist = dist[i] 149 | if h > 0 and i != highest_node: 150 | assert nodes[h - 1] is not None 151 | nodes[h - 1].add_child(nodes[i]) 152 | 153 | root = nodes[highest_node] 154 | 155 | assert root is not None 156 | return root 157 | 158 | 159 | def tree_to_adj(sent_len, tree, directed=True, self_loop=False): 160 | """ 161 | Convert a tree object to an (numpy) adjacency matrix. 162 | """ 163 | ret = np.zeros((sent_len, sent_len), dtype=np.float32) 164 | 165 | queue = [tree] 166 | idx = [] 167 | while len(queue) > 0: 168 | t, queue = queue[0], queue[1:] 169 | 170 | idx += [t.idx] 171 | 172 | for c in t.children: 173 | ret[t.idx, c.idx] = 1 174 | queue += t.children 175 | 176 | if not directed: 177 | ret = ret + ret.T 178 | 179 | if self_loop: 180 | for i in idx: 181 | ret[i, i] = 1 182 | 183 | return ret 184 | 185 | 186 | def tree_to_dist(sent_len, tree): 187 | ret = -1 * np.ones(sent_len, dtype=np.int64) 188 | 189 | for node in tree: 190 | ret[node.idx] = node.dist 191 | 192 | return ret 193 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/implicit_syntactic_representations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from torch.nn.utils.rnn import pad_sequence 6 | 7 | 8 | from .model import drop_sequence_sharedmask, _model_var 9 | from .HighWayLSTM import Highway_Concat_BiLSTM 10 | from .layer import NonLinear, Biaffine 11 | 12 | 13 | class ImplicitDependencyRepresentations(nn.Module): 14 | def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size): 15 | super(ImplicitDependencyRepresentations, self).__init__() 16 | self.config = config 17 | self.lstm_input_size = lstm_input_size 18 | self.lstm_hidden_size = lstm_hidden_size 19 | self.dep_label_space_size = dep_label_space_size 20 | # softmax weights 21 | self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0])) 22 | self.softmax_dep_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0])) 23 | for _ in range(self.config.dep_num_lstm_layers)]) 24 | self.cuda = True 25 | 26 | self.dep_bilstm = Highway_Concat_BiLSTM( 27 | input_size=self.lstm_input_size, 28 | hidden_size=self.lstm_hidden_size, # // 2 for MyLSTM 29 | num_layers=self.config.dep_num_lstm_layers, 30 | batch_first=True, 31 | bidirectional=True, 32 | dropout_in=config.input_dropout_prob, 33 | dropout_out=config.recurrent_dropout_prob 34 | ) 35 | 36 | # dependency parsing module 37 | self.mlp_arc_dep = NonLinear( 38 | input_size=2 * config.lstm_hidden_size, 39 | hidden_size=config.mlp_arc_size + config.mlp_rel_size, 40 | activation=nn.LeakyReLU(0.1)) 41 | self.mlp_arc_head = NonLinear( 42 | input_size=2 * config.lstm_hidden_size, 43 | hidden_size=config.mlp_arc_size + config.mlp_rel_size, 44 | activation=nn.LeakyReLU(0.1)) 45 | 46 | self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100) 47 | self.arc_num = int(config.mlp_arc_size / 100) 48 | self.rel_num = int(config.mlp_rel_size / 100) 49 | 50 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False)) 51 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size, 52 | bias=(True, True)) 53 | 54 | def init_masks(self, batch_size, lengths): 55 | max_sent_length = max(lengths) 56 | num_sentences = batch_size 57 | indices = torch.arange(0, max_sent_length).unsqueeze(0).expand(num_sentences, -1) 58 | masks = indices < lengths.unsqueeze(1) 59 | masks = masks.type(torch.FloatTensor) 60 | if self.cuda: 61 | masks = masks.cuda() 62 | return masks 63 | 64 | def forward(self, num_sentences, context_embeddings, sent_lengths, dep): 65 | masks = self.init_masks(num_sentences, torch.LongTensor(sent_lengths)) 66 | lstm_out, _ = self.dep_bilstm(context_embeddings, masks) 67 | 68 | if self.training: 69 | lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp) 70 | 71 | x_all_dep = self.mlp_arc_dep(lstm_out) 72 | x_all_head = self.mlp_arc_head(lstm_out) 73 | 74 | if self.training: 75 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 76 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 77 | 78 | x_all_dep_splits = torch.split(x_all_dep, 100, dim=2) 79 | x_all_head_splits = torch.split(x_all_head, 100, dim=2) 80 | 81 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 82 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 83 | 84 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 85 | arc_logit = torch.squeeze(arc_logit, dim=3) 86 | 87 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 88 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 89 | 90 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 91 | 92 | self.arc_logits, self.rel_logits = arc_logit, rel_logit_cond 93 | 94 | heads, rels = dep[0], dep[1] 95 | loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist()) # compute the dep loss 96 | return loss, self.arc_logits 97 | 98 | def compute_dep_loss(self, true_arcs, true_rels, lengths): 99 | b, l1, l2 = self.arc_logits.size() 100 | index_true_arcs = _model_var( 101 | self.parameters(), 102 | pad_sequence(true_arcs, padding_value=0, batch_first=True) 103 | ) 104 | true_arcs = _model_var( 105 | self.parameters(), 106 | pad_sequence(true_arcs, padding_value=-1, batch_first=True) 107 | ) 108 | 109 | masks = [] 110 | for length in lengths: 111 | mask = torch.FloatTensor([0] * length + [-1000] * (l2 - length)) 112 | mask = _model_var(self.parameters(), mask) 113 | mask = torch.unsqueeze(mask, dim=1).expand(-1, l1) 114 | masks.append(mask.transpose(0, 1)) 115 | length_mask = torch.stack(masks, 0) 116 | arc_logits = self.arc_logits + length_mask 117 | 118 | arc_loss = F.cross_entropy( 119 | arc_logits.view(b * l1, l2), true_arcs.view(b * l1), 120 | ignore_index=-1, reduction="sum") 121 | 122 | size = self.rel_logits.size() 123 | output_logits = _model_var(self.parameters(), torch.zeros(size[0], size[1], size[3])) 124 | for batch_index, (logits, arcs) in enumerate(list(zip(self.rel_logits, index_true_arcs))): 125 | rel_probs = [] 126 | for i in range(l1): 127 | rel_probs.append(logits[i][int(arcs[i])]) 128 | rel_probs = torch.stack(rel_probs, dim=0) 129 | output_logits[batch_index] = torch.squeeze(rel_probs, dim=1) 130 | 131 | b, l1, d = output_logits.size() 132 | true_rels = _model_var(self.parameters(), pad_sequence(true_rels, padding_value=-1, batch_first=True)) 133 | 134 | rel_loss = F.cross_entropy( 135 | output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1, reduction="sum") 136 | 137 | loss = arc_loss + rel_loss 138 | return loss 139 | 140 | def get_reps(self, context_embeddings, masks): 141 | dep_lstm_out, dep_lstm_outputs = self.dep_bilstm.forward(context_embeddings, masks) 142 | normed_weights = F.softmax(torch.cat([param for param in self.softmax_dep_weights]), dim=0) 143 | normed_weights = torch.split(normed_weights, 1) # split_size_or_sections=1, split_size=1) # 0.3.0 144 | dep_representations = self.dep_gamma * \ 145 | sum([normed_weights[i] * dep_lstm_outputs[i] for i in 146 | range(self.config.dep_num_lstm_layers)]) 147 | if self.training: 148 | lstm_out = drop_sequence_sharedmask(dep_lstm_out, self.config.dropout_mlp) 149 | 150 | x_all_dep = self.mlp_arc_dep(dep_lstm_out) 151 | x_all_head = self.mlp_arc_head(dep_lstm_out) 152 | 153 | if self.training: 154 | x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp) 155 | x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp) 156 | 157 | x_all_dep_splits = torch.split(x_all_dep, 100, dim=2) 158 | x_all_head_splits = torch.split(x_all_head, 100, dim=2) 159 | 160 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 161 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 162 | 163 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 164 | arc_logit = torch.squeeze(arc_logit, dim=3) 165 | return dep_representations, arc_logit 166 | 167 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from .layer import MyLSTM, NonLinear, Biaffine 5 | 6 | 7 | def _model_var(parameters, x): 8 | p = next(iter(filter(lambda p: p.requires_grad, parameters))) 9 | if p.is_cuda: 10 | x = x.cuda(p.get_device()) 11 | return torch.autograd.Variable(x) 12 | 13 | 14 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb): 15 | batch_size, seq_length, _ = word_embeddings.size() 16 | # tensor.new: build a tensor with the same data type 17 | word_masks = word_embeddings.data.new(batch_size, 18 | seq_length).fill_(1 - dropout_emb) 19 | word_masks = torch.Tensor(torch.bernoulli(word_masks)) 20 | word_masks.requires_grad = False 21 | tag_masks = tag_embeddings.data.new(batch_size, 22 | seq_length).fill_(1 - dropout_emb) 23 | tag_masks = torch.Tensor(torch.bernoulli(tag_masks)) 24 | tag_masks.requires_grad = False 25 | scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12) 26 | word_masks *= scale 27 | tag_masks *= scale 28 | # unsqueeze: Returns a new tensor with a dimension of size one inserted at the specified position. 29 | word_masks = word_masks.unsqueeze(dim=2) # ? 30 | tag_masks = tag_masks.unsqueeze(dim=2) 31 | word_embeddings = word_embeddings * word_masks 32 | tag_embeddings = tag_embeddings * tag_masks 33 | 34 | return word_embeddings, tag_embeddings 35 | 36 | 37 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True): 38 | if batch_first: 39 | inputs = inputs.transpose(0, 1) 40 | seq_length, batch_size, hidden_size = inputs.size() 41 | drop_masks = torch.Tensor(batch_size, hidden_size).fill_(1 - dropout) 42 | drop_masks = torch.Tensor(torch.bernoulli(drop_masks)).type(inputs.type()) 43 | drop_masks.requires_grad = False 44 | drop_masks = drop_masks / (1 - dropout) 45 | drop_masks = torch.unsqueeze(drop_masks, 46 | dim=2).expand(-1, -1, 47 | seq_length).permute(2, 0, 1) 48 | inputs = inputs * drop_masks 49 | 50 | return inputs.transpose(1, 0) 51 | 52 | 53 | class ParserModel(nn.Module): # build a biaffine parser model 54 | def __init__(self, vocab, config, pretrained_embedding): 55 | super(ParserModel, self).__init__() 56 | self.config = config 57 | self.word_embed = nn.Embedding(vocab.vocab_size, 58 | config.word_dims, 59 | padding_idx=0) 60 | self.extword_embed = nn.Embedding(vocab.extvocab_size, 61 | config.word_dims, 62 | padding_idx=0) 63 | self.tag_embed = nn.Embedding(vocab.tag_size, 64 | config.tag_dims, 65 | padding_idx=0) 66 | 67 | word_init = np.zeros((vocab.vocab_size, config.word_dims), 68 | dtype=np.float32) 69 | self.word_embed.weight.data.copy_(torch.from_numpy(word_init)) 70 | 71 | tag_init = np.random.randn(vocab.tag_size, 72 | config.tag_dims).astype(np.float32) 73 | self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init)) 74 | 75 | self.extword_embed.weight.data.copy_( 76 | torch.from_numpy(pretrained_embedding)) 77 | self.extword_embed.weight.requires_grad = False 78 | 79 | self.lstm = MyLSTM( 80 | input_size=config.word_dims + config.tag_dims, 81 | hidden_size=config.lstm_hiddens, 82 | num_layers=config.lstm_layers, 83 | batch_first=True, 84 | bidirectional=True, 85 | dropout_in=config.dropout_lstm_input, 86 | dropout_out=config.dropout_lstm_hidden, 87 | ) 88 | 89 | self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens, 90 | hidden_size=config.mlp_arc_size + 91 | config.mlp_rel_size, 92 | activation=nn.LeakyReLU(0.1)) 93 | self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens, 94 | hidden_size=config.mlp_arc_size + 95 | config.mlp_rel_size, 96 | activation=nn.LeakyReLU(0.1)) 97 | 98 | self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100) 99 | self.arc_num = int(config.mlp_arc_size / 100) # config: 500 100 | self.rel_num = int(config.mlp_rel_size / 100) # config: 100 101 | 102 | self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 103 | 1, bias=(True, False)) 104 | self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, 105 | vocab.rel_size, bias=(True, True)) 106 | 107 | def forward( 108 | self, words, extwords, tags, 109 | masks): # words [batch, max_sentence_length], padding with zeros 110 | # x = (batch size, sequence length, dimension of embedding) 111 | x_word_embed = self.word_embed(words) 112 | x_extword_embed = self.extword_embed(extwords) 113 | x_embed = x_word_embed + x_extword_embed 114 | x_tag_embed = self.tag_embed(tags) 115 | 116 | if self.training: 117 | x_embed, x_tag_embed = drop_input_independent( 118 | x_embed, x_tag_embed, self.config.dropout_emb) 119 | 120 | x_lexical = torch.cat((x_embed, x_tag_embed), dim=2) 121 | 122 | outputs, _ = self.lstm(x_lexical, masks, None) 123 | outputs = outputs.transpose(1, 0) 124 | 125 | if self.training: 126 | outputs = drop_sequence_sharedmask(outputs, 127 | self.config.dropout_mlp) 128 | 129 | x_all_dep = self.mlp_arc_dep(outputs) 130 | x_all_head = self.mlp_arc_head(outputs) 131 | 132 | if self.training: 133 | x_all_dep = drop_sequence_sharedmask(x_all_dep, 134 | self.config.dropout_mlp) 135 | x_all_head = drop_sequence_sharedmask(x_all_head, 136 | self.config.dropout_mlp) 137 | 138 | x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2) 139 | x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2) 140 | 141 | x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2) 142 | x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2) 143 | 144 | arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head) 145 | arc_logit = torch.squeeze(arc_logit, dim=3) 146 | 147 | x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2) 148 | x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2) 149 | 150 | rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head) 151 | return arc_logit, rel_logit_cond 152 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/pre_trained_language_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import Parameter 5 | from torch.nn.utils.rnn import pad_sequence 6 | from transformers import BertModel 7 | from transformers import BertTokenizer 8 | 9 | 10 | class ScalarMix(torch.nn.Module): 11 | def __init__(self, mixture_size=4): 12 | super(ScalarMix, self).__init__() 13 | self.mixture_size = mixture_size 14 | self.scalar_parameters = Parameter(torch.ones(mixture_size)) 15 | self.gamma = Parameter(torch.tensor(1.0)) 16 | 17 | def forward(self, layers): 18 | normed_weights = F.softmax(self.scalar_parameters, dim=0) 19 | return self.gamma * sum( 20 | weight * tensor for weight, tensor in zip(normed_weights, layers) 21 | ) 22 | 23 | 24 | class Bert_Embedding(nn.Module): 25 | def __init__(self, bert_path, bert_layer, bert_dim, freeze=True): 26 | super(Bert_Embedding, self).__init__() 27 | self.bert_layer = bert_layer 28 | self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True) 29 | print(self.bert.config) 30 | self.scalar_mix = ScalarMix(bert_layer) 31 | 32 | if freeze: 33 | self.freeze() 34 | 35 | def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask): 36 | self.eval() 37 | sen_lens = token_starts_masks.sum(dim=1) 38 | _, _, bert_outs = self.bert( 39 | subword_idxs, 40 | attention_mask=subword_masks 41 | ) # tuple([Batch_size, max_sentence_length, dim]) 42 | bert_outs = bert_outs[-self.bert_layer:] 43 | bert_outs = self.scalar_mix(bert_outs) 44 | # bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist()) 45 | # bert_outs = pad_sequence(bert_outs, batch_first=True) 46 | zeros = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1)) 47 | zeros.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks]) 48 | subwords_lens = subwords_mask.sum(-1) 49 | subwords_lens += (subwords_lens == 0).type(subwords_lens.type()) # 0.0 / 0 -> 0.0 / 1 50 | bert_outs = zeros.sum(2) / subwords_lens.unsqueeze(-1) 51 | return bert_outs 52 | 53 | def freeze(self): 54 | for para in self.bert.parameters(): 55 | para.requires_grad = False 56 | 57 | 58 | class Bert_Encoder(nn.Module): 59 | def __init__(self, bert_path, bert_layer, freeze=False, fix_layer_number=None): 60 | super(Bert_Encoder, self).__init__() 61 | self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True) 62 | self.bert_layer = bert_layer 63 | 64 | if freeze: 65 | self.freeze() 66 | if fix_layer_number is not None: 67 | self.fix_several_layers(fix_layer_number) 68 | 69 | def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask): 70 | sen_lens = token_starts_masks.sum(dim=1) 71 | _, _, bert_outs = self.bert( 72 | subword_idxs, 73 | token_type_ids=None, 74 | attention_mask=subword_masks, 75 | ) 76 | bert_outs = bert_outs[-1] # the last layer of BERT outputs 77 | # bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist()) 78 | zeros = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1)) 79 | zeros.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks]) 80 | bert_outs = pad_sequence(zeros, batch_first=True) 81 | subwords_lens = subwords_mask.sum(-1) 82 | subwords_lens += (subwords_lens == 0).type(subwords_lens.type()) # 0.0 / 0 -> 0.0 / 1 83 | bert_outs = bert_outs.sum(2) / subwords_lens.unsqueeze(-1) 84 | return bert_outs 85 | 86 | def freeze(self): 87 | for para in self.bert.parameters(): 88 | para.requires_grad = False 89 | 90 | def fix_several_layers(self, layer_numer): 91 | fixed_layer_names = ["embeddings"] if layer_numer >= 0 else [] 92 | for i in range(layer_numer): 93 | fixed_layer_names.append("encoder.layer." + str(i) + '.') 94 | print("{} will be fixed".format(fixed_layer_names)) 95 | for name, para in self.bert.named_parameters(): 96 | for layer_name in fixed_layer_names: 97 | if layer_name in name: 98 | para.requires_grad = False 99 | break 100 | 101 | 102 | class Vocab(object): 103 | def __init__(self, bert_vocab_path): 104 | self.tokenizer = BertTokenizer.from_pretrained( 105 | bert_vocab_path, do_lower_case=False 106 | ) 107 | 108 | def numericalize(self, seqs, training=True): 109 | subwords, masks, starts = [], [], [] 110 | text_masks, subwords_mask = [], [] 111 | 112 | for seq in seqs: 113 | seq = [self.tokenizer.tokenize(token) for token in seq] 114 | seq = [piece if piece else ["[PAD]"] for piece in seq] 115 | seq = [["[CLS]"]] + seq + [["[SEP]"]] 116 | lengths = [0] + [len(piece) for piece in seq] 117 | # flatten the word pieces 118 | tokens = sum(seq, []) 119 | # subwords indexes 120 | token_idx = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)) 121 | subwords.append(token_idx) 122 | 123 | # subword masks 124 | mask = torch.ones(len(tokens), dtype=torch.bool) 125 | masks.append(mask) 126 | # subword text mask 127 | text_mask = torch.BoolTensor([0] + [1] * (len(tokens) - 2) + [0]) 128 | text_masks.append(text_mask) 129 | 130 | # record the start position of all words 131 | start_idxs = torch.tensor(lengths).cumsum(0)[1:-2] # bos:0 eos:-2 132 | # subword start masks 133 | start_mask = torch.zeros(len(tokens), dtype=torch.bool) 134 | start_mask[start_idxs] = 1 135 | starts.append(start_mask) 136 | 137 | # record the start and last position of all words 138 | start_end_idxs = torch.tensor(lengths).cumsum(0)[1:-1] 139 | subword_mask = [torch.ones(start_end_idxs[i + 1] - start_end_idxs[i]) 140 | for i in range(len(start_end_idxs) - 1)] 141 | subword_mask = pad_sequence(subword_mask, batch_first=True) 142 | subwords_mask.append(subword_mask) 143 | max_subword_length = max(m.size(-1) for m in subwords_mask) 144 | max_sentence_length = max(m.size(0) for m in subwords_mask) 145 | subwords_mask = [F.pad(mask, (0, max_subword_length - mask.size(1), 0, max_sentence_length - mask.size(0))) 146 | for mask in subwords_mask] # [left, right, top, down] 147 | subwords_mask = torch.stack(subwords_mask) 148 | return subwords, masks, starts, text_masks, subwords_mask 149 | 150 | 151 | class BERT_input(nn.Module): 152 | def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim): 153 | super(BERT_input, self).__init__() 154 | self.vocab = Vocab(bert_vocab_path) 155 | self.bert_input = Bert_Embedding(bert_path, bert_layer, bert_dim) 156 | 157 | def forward(self, seqs): 158 | subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs) 159 | subwords = pad_sequence(subwords, batch_first=True).cuda() 160 | masks = pad_sequence(masks, batch_first=True).cuda() 161 | starts = pad_sequence(starts, batch_first=True).cuda() 162 | text_masks = pad_sequence(text_masks, batch_first=True).type(torch.BoolTensor).cuda() 163 | subwords_mask = subwords_mask.type(torch.BoolTensor).cuda() 164 | bert_outs = self.bert_input.forward(subwords, masks, starts, text_masks, subwords_mask) 165 | return bert_outs 166 | 167 | 168 | class BERT_model(nn.Module): 169 | def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim, fix_layer_number=None): 170 | super(BERT_model, self).__init__() 171 | self.vocab = Vocab(bert_vocab_path) 172 | self.bert_encoder = Bert_Encoder(bert_path, bert_layer, 173 | freeze=False, fix_layer_number=fix_layer_number) 174 | 175 | def forward(self, seqs): 176 | subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs) 177 | subwords = pad_sequence(subwords, batch_first=True).cuda() 178 | masks = pad_sequence(masks, batch_first=True).cuda() 179 | starts = pad_sequence(starts, batch_first=True).type(torch.BoolTensor).cuda() 180 | text_masks = pad_sequence(text_masks, batch_first=True).type(torch.BoolTensor).cuda() 181 | subwords_mask = subwords_mask.type(torch.BoolTensor).cuda() 182 | bert_outs = self.bert_encoder.forward(subwords, masks, starts, text_masks, subwords_mask) 183 | return bert_outs 184 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/pytorch/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | 5 | 6 | def block_orth_normal_initializer(input_size, output_size): 7 | weight = [] 8 | for o in output_size: 9 | for i in input_size: 10 | param = torch.FloatTensor(o, i) 11 | torch.nn.init.orthogonal_(param) 12 | weight.append(param) 13 | return torch.cat(weight) 14 | 15 | 16 | def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights): 17 | batch_size = len(batch_x) # batch size 18 | length = max(batch_lengths) 19 | 20 | words = Variable(torch.LongTensor(batch_size, length).zero_(), 21 | requires_grad=False) # padding with 0 22 | predicates = Variable(torch.LongTensor(batch_size, length).zero_(), 23 | requires_grad=False) 24 | masks = Variable(torch.Tensor(batch_size, length).zero_(), 25 | requires_grad=False) 26 | padding_answers = Variable(torch.LongTensor(batch_size, length).zero_(), 27 | requires_grad=False) 28 | labels, lengths = [], [] 29 | 30 | b = 0 31 | for s_words, s_answer, s_length, s_weights in zip(batch_x, batch_y, 32 | batch_lengths, 33 | batch_weights): 34 | lengths.append(s_length) 35 | rel = np.zeros((s_length), dtype=np.int32) 36 | for i in range(s_length): 37 | words[b, i] = s_words[1][i] # word 38 | predicates[b, i] = s_words[2][i] # predicate 39 | rel[i] = s_answer[0][i] 40 | padding_answers[b, i] = s_answer[0][i] 41 | masks[b, i] = 1 42 | 43 | # sentence_id = s_words[0][0] # get the dep_labels_ids of each sentence 44 | b += 1 45 | labels.append(rel) 46 | 47 | return words, predicates, labels, torch.LongTensor( 48 | lengths), masks, padding_answers 49 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__init__.py -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/configuration.py: -------------------------------------------------------------------------------- 1 | ''' Configuration for experiments. 2 | ''' 3 | import json 4 | from argparse import Namespace 5 | 6 | 7 | def get_config(config_filepath): 8 | with open(config_filepath, 'r') as config_file: 9 | conf = json.load(config_file, object_hook=lambda d: Namespace(**d)) 10 | return conf 11 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/conll_utils.py: -------------------------------------------------------------------------------- 1 | def bio_to_se(labels): 2 | slen = len(labels) 3 | new_labels = [] 4 | has_opening = False 5 | for i in range(slen): 6 | label = labels[i] 7 | if label == 'O': 8 | new_labels.append('*') 9 | continue 10 | new_label = '*' 11 | if label[0] == 'B' or i == 0 or label[1:] != labels[i - 1][1:]: 12 | new_label = '(' + label[2:] + new_label 13 | has_opening = True 14 | if i == slen - 1 or labels[i + 1][0] == 'B' or label[1:] != labels[i + 1][1:]: 15 | new_label = new_label + ')' 16 | has_opening = False 17 | new_labels.append(new_label) 18 | 19 | if has_opening: 20 | ''' logging ''' 21 | print("Has unclosed opening: {}".format(labels)) 22 | return new_labels 23 | 24 | 25 | def print_sentence_to_conll(fout, tokens, labels): 26 | for label_column in labels: 27 | assert len(label_column) == len(tokens) 28 | for i in range(len(tokens)): 29 | fout.write(tokens[i].ljust(15)) 30 | for label_column in labels: 31 | fout.write(label_column[i].rjust(15)) 32 | fout.write("\n") 33 | fout.write("\n") 34 | 35 | 36 | def print_to_conll(pred_labels, gold_props_file, output_filename): 37 | """ 38 | """ 39 | fout = open(output_filename, 'w') 40 | seq_ptr = 0 41 | num_props_for_sentence = 0 42 | tokens_buf = [] 43 | 44 | for line in open(gold_props_file, 'r'): 45 | line = line.strip() 46 | if line == "" and len(tokens_buf) > 0: 47 | print_sentence_to_conll(fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence]) 48 | seq_ptr += num_props_for_sentence 49 | tokens_buf = [] 50 | num_props_for_sentence = 0 51 | else: 52 | info = line.split() 53 | num_props_for_sentence = len(info) - 1 54 | tokens_buf.append(info[0]) 55 | 56 | # Output last sentence. 57 | if len(tokens_buf) > 0: 58 | print_sentence_to_conll(fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence]) 59 | 60 | fout.close() 61 | 62 | 63 | def print_gold_to_conll(data, word_dict, label_dict, output_filename): 64 | fout = open(output_filename, 'w') 65 | props_buf = [] 66 | labels_buf = [] 67 | tokens_buf = [] 68 | prev_words = '' 69 | 70 | x, y, num_tokens, _ = data 71 | for (sent, gold, slen) in zip(x, y, num_tokens): 72 | words = [word_dict.idx2str[w[0]] for w in sent[:slen]] 73 | labels = [label_dict.idx2str[l] for l in gold[:slen]] 74 | 75 | concat_words = ' '.join(words) 76 | if concat_words != prev_words and len(props_buf) > 0: 77 | tokens = [w if i in props_buf else '-' for i, w in enumerate(tokens_buf)] 78 | 79 | print_sentence_to_conll(fout, tokens, labels_buf) 80 | props_buf = [] 81 | tokens_buf = [] 82 | labels_buf = [] 83 | prev_words = '' 84 | 85 | if prev_words == '': 86 | prev_words = concat_words 87 | tokens_buf = [w for w in words] 88 | if 'B-V' in labels: 89 | prop_id = labels.index('B-V') 90 | props_buf.append(prop_id) 91 | labels_buf.append(bio_to_se(labels)) 92 | 93 | if len(props_buf) > 0: 94 | tokens = [w if i in props_buf else '-' for i, w in enumerate(tokens_buf)] 95 | print_sentence_to_conll(fout, tokens, labels_buf) 96 | 97 | fout.close() 98 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/constants.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | import os 3 | import random 4 | 5 | ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../') 6 | 7 | RANDOM_SEED = 12345 8 | random.seed(RANDOM_SEED) 9 | 10 | SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh') 11 | 12 | START_MARKER = '' 13 | END_MARKER = '' 14 | PADDING_TOKEN = '*PAD*' 15 | UNKNOWN_TOKEN = '*UNKNOWN*' 16 | NULL_LABEL = 'O' 17 | 18 | TEMP_DIR = join(ROOT_DIR, '../temp') 19 | 20 | # assert os.path.exists(SRL_CONLL_EVAL_SCRIPT) 21 | if not os.path.exists(TEMP_DIR): 22 | os.makedirs(TEMP_DIR) 23 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/constituent_extraction.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import sys 3 | import numpy as np 4 | import random 5 | 6 | from .dictionary import Dictionary 7 | from collections import OrderedDict 8 | from nltk.tree import Tree 9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN 10 | # from .reader import list_of_words_to_ids 11 | 12 | 13 | PREFIX = "--PTB-CONS-LABEL--" 14 | 15 | 16 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 17 | ids = [] 18 | for s in list_of_words: 19 | # s = s.encode('utf-8') # unicode -> utf-8 20 | if s is None: 21 | ids.append(-1) 22 | continue 23 | if lowercase: 24 | s = s.lower() 25 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 26 | s = UNKNOWN_TOKEN 27 | ids.append(dictionary.add(s)) 28 | return ids 29 | 30 | 31 | class constituent_tree(): 32 | def __init__(self, sentence, words, tree): 33 | self.sentence = sentence 34 | self.words = words 35 | self.tree = tree 36 | self.heads = None 37 | self.nodes = None 38 | self.indicator = [] # 0 no terminal, 1 terminal 39 | self.word_position = [] 40 | self.node_idx = [] 41 | self.node_char_idx = [] 42 | 43 | self.sentence_length = len(words) 44 | self.input_length = -1 45 | self.sentence_index = -1 46 | 47 | def pos(self): 48 | """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]""" 49 | return self.tree.pos() 50 | 51 | def traverse_tree(self, tree, nodes, indicator, heads, parent, pos, label, word_embeddings): 52 | # print(tree) 53 | # print("subtree", subtree) 54 | if tree.height() > 2: 55 | subtree_label = PREFIX + tree.label() 56 | label.add(subtree_label) 57 | constituent_tree.add_unknown_labels(subtree_label, word_embeddings) 58 | nodes.append(subtree_label) 59 | indicator.append(0) 60 | heads.append(parent - 1) 61 | else: 62 | # print("YY", subtree) 63 | pos.add(tree.label()) 64 | subtree_pos = tree[0] # word 65 | subtree_pos = constituent_tree.add_word(subtree_pos, label, word_embeddings) 66 | nodes.append(subtree_pos) 67 | indicator.append(1) 68 | idx = len(nodes) - 1 69 | self.word_position.append(idx) 70 | heads.append(parent - 1) 71 | if tree.height() <= 2: 72 | return 73 | parent = len(nodes) 74 | for i, subtree in enumerate(tree): 75 | self.traverse_tree(subtree, nodes, indicator, heads, parent, pos, label, word_embeddings) 76 | 77 | @staticmethod 78 | def add_unknown_labels(label, word_embeddings): 79 | if label not in word_embeddings: 80 | embedding_size = len(word_embeddings[PADDING_TOKEN]) 81 | word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)]) 82 | 83 | @staticmethod 84 | def add_word(word, word_dict, word_embeddings): 85 | if word not in word_embeddings: 86 | word = UNKNOWN_TOKEN 87 | idx = word_dict.add(word) 88 | return word 89 | 90 | @staticmethod 91 | def get_node_char_idx(words, char_dict, lowercase=False): 92 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 93 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int) 94 | for i, word in enumerate(words): 95 | single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase) 96 | return single_sample_char_tokens 97 | 98 | def generate_adjacent(self, pos, label_dict, char_dict, word_embeddings): 99 | assert self.heads is None 100 | root_label = PREFIX + self.tree.label() 101 | nodes, heads = [], [] # TODO notice 102 | self.traverse_tree(self.tree, nodes, self.indicator, heads, 103 | len(heads), pos, label_dict, word_embeddings) 104 | self.nodes = nodes 105 | self.heads = heads 106 | self.input_length = len(self.nodes) 107 | self.sentence_index = self.input_length - self.sentence_length - 1 108 | self.node_idx = [label_dict.get_index(node) for node in self.nodes] 109 | 110 | max_word_length = max([len(w) for w in self.nodes] + [3, 4, 5]) # compare with character cnn filter width 111 | self.node_char_idx = np.zeros([len(self.nodes), max_word_length], dtype=np.int64) 112 | for i, word in enumerate(self.nodes): 113 | self.node_char_idx[i, :len(word)] = list_of_words_to_ids(word, char_dict) 114 | 115 | self.node_char_idx = constituent_tree.get_node_char_idx(self.nodes, char_dict) 116 | 117 | 118 | def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings): 119 | data = [] 120 | with open(file_path, 'r') as input_file: 121 | sentence = "" 122 | for line in input_file.readlines(): 123 | if line.strip() == "": 124 | data.append(sentence) 125 | sentence = "" 126 | continue 127 | line = line.strip() 128 | if ' ' not in line: # avoid the split of leave node of it's PoS 129 | line = ' ' + line 130 | sentence += line 131 | print("Read {} sentence from {}".format(len(data), file_path)) 132 | 133 | cons_trees = OrderedDict() 134 | for sentence in data: 135 | tree = Tree.fromstring(sentence) 136 | words = tree.leaves() 137 | sentence = ' '.join(words) 138 | cons_trees[sentence] = constituent_tree(sentence, words, tree) 139 | 140 | pos_dict = Dictionary(padding_token=PADDING_TOKEN) 141 | for sen in cons_trees: 142 | tree = cons_trees[sen] 143 | tree.generate_adjacent(pos_dict, word_dict, char_dict, word_embeddings) 144 | 145 | return cons_trees, pos_dict 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/constituent_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import codecs 3 | import numpy as np 4 | 5 | 6 | from sortedcontainers import SortedSet 7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL 8 | from .dictionary import Dictionary 9 | 10 | 11 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 12 | ids = [] 13 | for s in list_of_words: 14 | # s = s.encode('utf-8') # unicode -> utf-8 15 | if s is None: 16 | ids.append(-1) 17 | continue 18 | if lowercase: 19 | s = s.lower() 20 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 21 | s = UNKNOWN_TOKEN 22 | ids.append(dictionary.add(s)) 23 | return ids 24 | 25 | 26 | class constituent_sentence(): 27 | def __init__(self, obj): 28 | self.sentence = obj["sentence"] 29 | self.constituent_spans = obj["constituents"] 30 | self.max_span_width = 30 31 | 32 | def tokenize_cons_spans(self, dictionary, max_cons_width=60): 33 | cons_span = [] 34 | set_cons_span = set() 35 | for cons_s in self.constituent_spans: # remove self-loop V-V 36 | cons_start, cons_end, cons_label = cons_s 37 | if cons_label in ["TOP", "S"]: # todo: add some constrains here 38 | continue 39 | if cons_end - cons_start + 1 >= max_cons_width: 40 | continue 41 | if (cons_start, cons_end) not in set_cons_span: 42 | set_cons_span.add((cons_start, cons_end)) 43 | cons_span.append([int(cons_start), int(cons_end), int(dictionary.add(cons_label))]) 44 | else: 45 | # print("duplicate span of", (cons_start, cons_end, cons_label), '\n', self.sentence) 46 | pass 47 | if len(cons_span) == 0: # if the sentence has no arguments. 48 | return [[], [], []] 49 | tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels = \ 50 | zip(*cons_span) 51 | return tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels 52 | 53 | 54 | def read_constituent_file(file_path): 55 | sentences = [] 56 | with codecs.open(file_path, encoding="utf8") as f: 57 | for line in f.readlines(): 58 | sen = json.loads(line) 59 | cons_sen = constituent_sentence(sen) 60 | sentences.append(cons_sen) 61 | print("{} total constituent sentences number {}".format(file_path, len(sentences))) 62 | return sentences 63 | 64 | 65 | def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False): 66 | sample_word_tokens = [list_of_words_to_ids( 67 | sent.sentence, word_dict, lowercase, pretrained_word_embedding) for sent in samples] 68 | # for the character 69 | sample_char_tokens = [] 70 | for sent in samples: 71 | words = sent.sentence 72 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 73 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int) 74 | for i, word in enumerate(words): 75 | single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase) 76 | # Add the sample char tokens into the sample_char_tokens 77 | sample_char_tokens.append(single_sample_char_tokens) 78 | sample_lengths = [len(sent.sentence)for sent in samples] 79 | sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples] 80 | return list(zip(sample_lengths, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens)) 81 | 82 | 83 | def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None): 84 | raw_cons_sentences = read_constituent_file(file_path) 85 | cons_label_dict = Dictionary() 86 | cons_label_dict.set_unknown_token(NULL_LABEL) 87 | 88 | # tokenized the data 89 | if word_dict.accept_new is False: 90 | word_dict.accept_new = True 91 | if char_dict.accept_new is False: 92 | char_dict.accept_new = True 93 | cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict, 94 | False, word_embeddings) 95 | # word_dict.accept_new = False 96 | # char_dict.accept_new = False 97 | # cons_label_dict.accept_new = False 98 | 99 | print("="*10, "Constituent Info", "="*10) 100 | print("Extract {} tags".format(cons_label_dict.size())) 101 | # print("Extract {} words and {} tags".format(word_dict.size(), cons_label_dict.size())) 102 | print("Max sentence length: {}".format(max([s[0] for s in cons_samples]))) 103 | return cons_samples, word_dict, char_dict, cons_label_dict 104 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/dictionary.py: -------------------------------------------------------------------------------- 1 | ''' Bidirectional dictionary that maps between words and ids. 2 | ''' 3 | 4 | 5 | class Dictionary(object): 6 | def __init__(self, padding_token=None, unknown_token=None): 7 | self.str2idx = {} 8 | self.idx2str = [] 9 | 10 | self.accept_new = True 11 | self.padding_token = None 12 | self.padding_id = None 13 | self.unknown_token = None 14 | self.unknown_id = None 15 | if padding_token is not None: # add the padding info into the dictionary 16 | self.set_padding_token(padding_token) 17 | if unknown_token is not None: 18 | self.set_unknown_token(unknown_token) 19 | 20 | def set_padding_token(self, padding_token): 21 | self.padding_token = padding_token 22 | self.padding_id = self.add(self.padding_token) 23 | 24 | def set_unknown_token(self, unknown_token): 25 | self.unknown_token = unknown_token 26 | self.unknown_id = self.add(self.unknown_token) 27 | 28 | def add(self, new_str): 29 | if new_str not in self.str2idx: 30 | if self.accept_new: 31 | self.str2idx[new_str] = len(self.idx2str) 32 | self.idx2str.append(new_str) 33 | else: 34 | if new_str == "C-ADV": 35 | return self.str2idx["O"] 36 | if self.unknown_id is None: 37 | raise LookupError( 38 | 'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str) 39 | return self.unknown_id 40 | 41 | return self.str2idx[new_str] 42 | 43 | def add_all(self, str_list): 44 | return [self.add(s) for s in str_list] 45 | 46 | def get_index(self, input_str): 47 | if input_str in self.str2idx: 48 | return self.str2idx[input_str] 49 | return None 50 | 51 | def size(self): 52 | return len(self.idx2str) 53 | 54 | def save(self, filename): 55 | with open(filename, 'w') as f: 56 | for s in self.idx2str: 57 | f.write(s + '\n') 58 | f.close() 59 | 60 | def load(self, filename): 61 | with open(filename, 'r') as f: 62 | for line in f: 63 | line = line.strip() 64 | if line != '': 65 | self.add(line) 66 | f.close() 67 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/evaluation.py: -------------------------------------------------------------------------------- 1 | ''' Framework independent evaluator. Not in use yet. 2 | ''' 3 | import numpy 4 | import os 5 | from os.path import join 6 | # import subprocess 7 | from .constants import ROOT_DIR 8 | from .conll_utils import print_gold_to_conll 9 | # from .measurements import Timer 10 | 11 | 12 | class TaggerEvaluator(object): 13 | def __init__(self, data): 14 | self.data = data 15 | self.best_accuracy = 0.0 16 | self.has_best = False 17 | 18 | def compute_accuracy(self, predictions): 19 | for x, y in zip(predictions, 20 | [sent[2] for sent in self.data 21 | ]): # the predication's order should be the origin 22 | assert len(x) == y 23 | predictions = numpy.concatenate(predictions) 24 | tensors = self.data 25 | answer = numpy.concatenate( 26 | [sent[1].reshape(sent[1].shape[1]) for sent in tensors]) 27 | # predictions.resize(predictions.shape[0]) # resize the answer to the [length, 1] 28 | num_correct = numpy.equal(predictions, answer).sum() 29 | num_total = answer.shape[0] 30 | self.accuracy = (100.0 * num_correct) / num_total 31 | print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct, 32 | num_total)) 33 | 34 | def evaluate(self, predictions): 35 | self.compute_accuracy(predictions) 36 | self.has_best = self.accuracy > self.best_accuracy 37 | if self.has_best: 38 | print("Best accuracy so far: {:.3f}".format(self.accuracy)) 39 | self.best_accuracy = self.accuracy 40 | 41 | 42 | class PropIdEvaluator(object): 43 | def __init__(self, data, label_dict, target_label='V', 44 | use_se_marker=False): 45 | self.data = data 46 | self.label_dict = label_dict 47 | self.target_label_id = label_dict.str2idx[target_label] 48 | self.best_accuracy = 0.0 49 | self.has_best = False 50 | 51 | def compute_accuracy(self, predictions): 52 | _, y, _, weights = self.data 53 | # print predictions.shape, predictions 54 | identified = numpy.equal(predictions, self.target_label_id) 55 | print(y) 56 | # print self.target_label_id 57 | # print identified 58 | # exit() 59 | num_correct = numpy.sum( 60 | numpy.logical_and(numpy.equal(predictions, y), identified) * weights) 61 | num_identified = numpy.sum(identified * weights) 62 | num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights) 63 | self.precision = 100.0 * num_correct / num_identified 64 | self.recall = 100.0 * num_correct / num_gold 65 | self.accuracy = 2 * self.precision * self.recall / (self.precision + self.recall) 66 | print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format( 67 | self.accuracy, self.precision, self.recall)) 68 | 69 | def evaluate(self, predictions): 70 | self.compute_accuracy(predictions) 71 | self.has_best = self.accuracy > self.best_accuracy 72 | if self.has_best: 73 | print("Best accuracy so far: {:.3f}".format(self.accuracy)) 74 | self.best_accuracy = self.accuracy 75 | 76 | 77 | class SRLEvaluator(TaggerEvaluator): 78 | def __init__(self): 79 | self.best_accuracy = -1.0 80 | self.has_best = False 81 | 82 | def compute_accuracy(self, predictions): 83 | print("exit()") 84 | exit() 85 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/features.py: -------------------------------------------------------------------------------- 1 | def get_srl_features(sentences, config, feature_dicts=None): 2 | ''' TODO: Support adding more features. 3 | ''' 4 | feature_names = config.features 5 | feature_sizes = config.feature_sizes 6 | use_se_marker = config.use_se_marker 7 | 8 | features = [] 9 | feature_shapes = [] 10 | for fname, fsize in zip(feature_names, feature_sizes): 11 | if fname == "predicate": 12 | offset = int(use_se_marker) 13 | offset = 1 # pad is in the position 0 14 | features.append([[int((i == sent[2]) + offset) for i in range(len(sent[1]))] for sent in sentences]) 15 | feature_shapes.append([2, fsize]) 16 | return (zip(*features), feature_shapes) 17 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/inference.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | def get_transition_params(label_strs): 5 | """Construct transtion scoresd (0 for allowed, -inf for invalid). 6 | Args: 7 | label_strs: A [num_tags,] sequence of BIO-tags. 8 | Returns: 9 | A [num_tags, num_tags] matrix of transition scores. 10 | """ 11 | num_tags = len(label_strs) 12 | transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32) 13 | for i, prev_label in enumerate(label_strs): 14 | for j, label in enumerate(label_strs): 15 | if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]: 16 | transition_params[i, j] = numpy.NINF 17 | return transition_params 18 | 19 | 20 | def viterbi_decode(score, transition_params): 21 | """ Adapted from Tensorflow implementation. 22 | Decode the highest scoring sequence of tags outside of TensorFlow. 23 | This should only be used at test time. 24 | Args: 25 | score: A [seq_len, num_tags] matrix of unary potentials. 26 | transition_params: A [num_tags, num_tags] matrix of binary potentials. 27 | Returns: 28 | viterbi: A [seq_len] list of integers containing the highest scoring tag 29 | indicies. 30 | viterbi_score: A float containing the score for the Viterbi sequence. 31 | """ 32 | trellis = numpy.zeros_like(score) 33 | backpointers = numpy.zeros_like(score, dtype=numpy.int32) 34 | trellis[0] = score[0] 35 | for t in range(1, score.shape[0]): 36 | v = numpy.expand_dims(trellis[t - 1], 1) + transition_params 37 | trellis[t] = score[t] + numpy.max(v, 0) 38 | backpointers[t] = numpy.argmax(v, 0) 39 | viterbi = [numpy.argmax(trellis[-1])] 40 | for bp in reversed(backpointers[1:]): 41 | viterbi.append(bp[viterbi[-1]]) 42 | viterbi.reverse() 43 | viterbi_score = numpy.max(trellis[-1]) 44 | return viterbi, viterbi_score 45 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/io_utils.py: -------------------------------------------------------------------------------- 1 | from google.protobuf.internal import encoder 2 | 3 | _EncodeVarint = encoder._VarintEncoder() 4 | 5 | 6 | def write_delimited_to(out_file, message): 7 | msg_size = message.ByteSize() 8 | pieces = [] 9 | _EncodeVarint(pieces.append, msg_size) 10 | out_file.write(b"".join(pieces)) 11 | out_file.write(message.SerializeToString()) 12 | 13 | 14 | def read_gold_props(gold_props_file): 15 | """ Read gold predicates from CoNLL-formatted file. 16 | """ 17 | gold_props = [] 18 | props = [] 19 | with open(gold_props_file, 'r') as f: 20 | for line in f: 21 | line = line.strip() 22 | if line == '': 23 | gold_props.append(props) 24 | props = [] 25 | else: 26 | props.append(line.split()[0]) 27 | f.close() 28 | if len(props) > 0: 29 | gold_props.append(props) 30 | return gold_props 31 | 32 | 33 | def write_predprops_to(predictions, 34 | label_dict, 35 | input_file, 36 | output_file, 37 | gold_props_file=None, 38 | output_props_file=None): 39 | """ Write predicted predicate information to files. 40 | 41 | Arguments: 42 | predictions: Predictions from the predicate identification model. 43 | Is a numpy array of size [num_sentences, max_sentence_length]. 44 | label_dict: Label dictionary. 45 | input_file: Input sequential tagging file. 46 | output_file: Output SRL file with identified predicates. 47 | gold_props_file: Input file with gold predicates in CoNLL format. 48 | output_props_file: Output SRL file with identified predicates, in CoNLL format. 49 | """ 50 | 51 | fin = open(input_file, 'r') 52 | fout = open(output_file, 'w') 53 | 54 | if output_props_file is not None and output_props_file != '': 55 | fout_props = open(output_props_file, 'w') 56 | else: 57 | fout_props = None 58 | 59 | if gold_props_file is not None and gold_props_file != '': 60 | gold_props = read_gold_props(gold_props_file) 61 | print(len(gold_props), len(predictions)) 62 | assert len(gold_props) == len(predictions) 63 | else: 64 | gold_props = None 65 | 66 | sent_id = 0 67 | for line in fin: 68 | # Read original sentence from input file. 69 | raw_sent = line.split('|||')[0].strip() 70 | tokens = raw_sent.split(' ') 71 | slen = len(tokens) 72 | pred = predictions[sent_id, :slen] 73 | props = [] 74 | 75 | for (t, p) in enumerate(pred): 76 | if label_dict.idx2str[p] == 'V': 77 | out_tags = ['O' for _ in range(slen)] 78 | out_tags[t] = 'B-V' 79 | out_line = str(t) + '\t' + raw_sent + ' ||| ' + ' '.join( 80 | out_tags) + '\n' 81 | fout.write(out_line) 82 | props.append(t) 83 | 84 | if fout_props is not None: 85 | if sent_id > 0: 86 | fout_props.write('\n') 87 | for t in range(slen): 88 | lemma = 'P' + tokens[t].lower() 89 | # In order for CoNLL evaluation script to run, we need to output the same 90 | # lemma as the gold predicate in the CoNLL-formatted file. 91 | if gold_props is not None and gold_props[sent_id][t] != '-': 92 | lemma = gold_props[sent_id][t] 93 | if t in props: 94 | fout_props.write(lemma) 95 | else: 96 | fout_props.write('-') 97 | for p in props: 98 | if t == p: 99 | fout_props.write('\t(V*)') 100 | else: 101 | fout_props.write('\t*') 102 | fout_props.write('\n') 103 | sent_id += 1 104 | 105 | fout.close() 106 | print('Predicted predicates in sequential-tagging format written to: {}.'. 107 | format(output_file)) 108 | if fout_props is not None: 109 | fout_props.close() 110 | print('CoNLL-formatted predicate information written to: {}.'.format( 111 | output_props_file)) 112 | 113 | 114 | def bio_to_spans(predictions, label_dict): 115 | """ Convert BIO-based predictions to a set of arguments. 116 | Arguments: 117 | predictions: A single integer array, already truncated to the original sequence lengths. 118 | label_dict: Label dictionary. 119 | Returns: 120 | A sequence of labeled arguments: [ ("ARG_LABEL", span_start, span_end), ... ], ordered by their positions. 121 | """ 122 | args = [] 123 | tags = [label_dict.idx2str[p] for p in predictions] 124 | for (i, tag) in enumerate(tags): 125 | if tag == 'O': 126 | continue 127 | label = tag[2:] 128 | # Append new span. 129 | if tag[0] == 'B' or len(args) == 0 or label != tags[i - 1][2:]: 130 | args.append([label, i, -1]) 131 | # Close current span. 132 | if i == len(predictions) - 1 or tags[ 133 | i + 1][0] == 'B' or label != tags[i + 1][2:]: 134 | args[-1][2] = i 135 | return args 136 | 137 | 138 | def print_to_readable(predictions, num_tokens, label_dict, input_path, 139 | output_path): 140 | """ Print predictions to human-readable format. 141 | """ 142 | fout = open(output_path, 'w') 143 | sample_id = 0 144 | for line in open(input_path, 'r'): 145 | info = line.split('|||')[0].strip().split() 146 | pid = int(info[0]) 147 | sent = info[1:] 148 | fout.write(' '.join(sent) + '\n') 149 | fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid)) 150 | 151 | tags = predictions[sample_id] 152 | arg_spans = bio_to_spans(tags, label_dict) 153 | for arg in arg_spans: 154 | fout.write('\t\t{}: {}\n'.format(arg[0], " ".join( 155 | sent[arg[1]:arg[2] + 1]))) 156 | fout.write('\n') 157 | sample_id += 1 158 | 159 | fout.close() 160 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/measurements.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import time 3 | 4 | 5 | class Timer: 6 | def __init__(self, name, active=True): 7 | self.name = name if active else None 8 | 9 | def __enter__(self): 10 | self.start = time.time() 11 | self.last_tick = self.start 12 | return self 13 | 14 | def __exit__(self, *args): 15 | if self.name is not None: 16 | print("{} duration was {}.".format( 17 | self.name, self.readable(time.time() - self.start))) 18 | 19 | def readable(self, seconds): 20 | return str(datetime.timedelta(seconds=int(seconds))) 21 | 22 | def tick(self, message): 23 | current = time.time() 24 | print("{} took {} ({} since last tick).".format( 25 | message, self.readable(current - self.start), 26 | self.readable(current - self.last_tick))) 27 | self.last_tick = current 28 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/numpy_utils.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | 4 | def orth_normal_initializer(factor=1.0, seed=None): 5 | ''' Reference: Exact solutions to the nonlinear dynamics of learning in 6 | deep linear neural networks 7 | Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf 8 | Adapted from the original implementation by Mingxuan Wang. 9 | ''' 10 | def _initializer(shape, dtype): 11 | assert len(shape) == 2 12 | rng = numpy.random.RandomState(seed) 13 | if shape[0] == shape[1]: 14 | M = rng.randn(*shape).astype(dtype) 15 | Q, R = numpy.linalg.qr(M) 16 | Q = Q * numpy.sign(numpy.diag(R)) 17 | param = Q * factor 18 | return param 19 | else: 20 | M1 = rng.randn(shape[0], shape[0]).astype(dtype) 21 | M2 = rng.randn(shape[1], shape[1]).astype(dtype) 22 | Q1, R1 = numpy.linalg.qr(M1) 23 | Q2, R2 = numpy.linalg.qr(M2) 24 | Q1 = Q1 * numpy.sign(numpy.diag(R1)) 25 | Q2 = Q2 * numpy.sign(numpy.diag(R2)) 26 | n_min = min(shape[0], shape[1]) 27 | param = numpy.dot(Q1[:, :n_min], Q2[:n_min, :]) * factor 28 | return param 29 | 30 | return _initializer 31 | 32 | 33 | def block_orth_normal_initializer(input_shapes, 34 | output_shapes, 35 | factor=1.0, 36 | seed=None): 37 | ''' Initialize a gigantic weight matrix where each block is a normal orthogonal matrix. 38 | Input: 39 | - input_shapes: the sizes of each block alone dimension 0. 40 | - output_shapes: the sizes of each block along dimension 1. 41 | for example input_shapes = [100, 128] output_shapes=[100,100,100,100] 42 | indicates eight blocks with shapes [100,100], [128,100], etc. 43 | ''' 44 | def _initializer(shape, dtype): 45 | assert len(shape) == 2 46 | initializer = orth_normal_initializer(factor, seed) 47 | params = numpy.concatenate([ 48 | numpy.concatenate([ 49 | initializer([dim_in, dim_out], dtype) 50 | for dim_out in output_shapes 51 | ], 1) for dim_in in input_shapes 52 | ], 0) 53 | return params 54 | 55 | return _initializer 56 | 57 | 58 | def random_normal_initializer(mean=0.0, stddev=0.01, seed=None): 59 | def _initializer(shape, dtype): 60 | rng = numpy.random.RandomState(seed) 61 | return numpy.asarray(rng.normal(mean, stddev, shape), dtype) 62 | 63 | return _initializer 64 | 65 | 66 | def all_zero_initializer(): 67 | def _initializer(shape, dtype): 68 | return numpy.zeros(shape).astype(dtype) 69 | 70 | return _initializer 71 | 72 | 73 | def uniform_initializer(value=0.01): 74 | def _initializer(shape, dtype): 75 | return numpy.full(shape, value).astype(dtype) 76 | 77 | return _initializer 78 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/scores_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: scores.proto 3 | 4 | import sys 5 | import tensor_pb2 as tensor__pb2 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | # from google.protobuf import descriptor_pb2 11 | 12 | 13 | # @@protoc_insertion_point(imports) 14 | _b = sys.version_info[0] < 3 and (lambda x: x) or ( 15 | lambda x: x.encode('latin1')) 16 | 17 | 18 | _sym_db = _symbol_database.Default() 19 | 20 | 21 | DESCRIPTOR = _descriptor.FileDescriptor( 22 | name='scores.proto', 23 | package='', 24 | syntax='proto2', 25 | serialized_pb=_b( 26 | '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\ 27 | sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto' 28 | ), 29 | dependencies=[ 30 | tensor__pb2.DESCRIPTOR, 31 | ]) 32 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 33 | 34 | _SENTENCESCORESPROTO = _descriptor.Descriptor( 35 | name='SentenceScoresProto', 36 | full_name='SentenceScoresProto', 37 | filename=None, 38 | file=DESCRIPTOR, 39 | containing_type=None, 40 | fields=[ 41 | _descriptor.FieldDescriptor( 42 | name='sentence_id', 43 | full_name='SentenceScoresProto.sentence_id', 44 | index=0, 45 | number=1, 46 | type=13, 47 | cpp_type=3, 48 | label=1, 49 | has_default_value=False, 50 | default_value=0, 51 | message_type=None, 52 | enum_type=None, 53 | containing_type=None, 54 | is_extension=False, 55 | extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor(name='scores', 58 | full_name='SentenceScoresProto.scores', 59 | index=1, 60 | number=2, 61 | type=11, 62 | cpp_type=10, 63 | label=1, 64 | has_default_value=False, 65 | default_value=None, 66 | message_type=None, 67 | enum_type=None, 68 | containing_type=None, 69 | is_extension=False, 70 | extension_scope=None, 71 | options=None), 72 | ], 73 | extensions=[], 74 | nested_types=[], 75 | enum_types=[], 76 | options=None, 77 | is_extendable=False, 78 | syntax='proto2', 79 | extension_ranges=[], 80 | oneofs=[], 81 | serialized_start=30, 82 | serialized_end=102, 83 | ) 84 | 85 | _SCORESPROTO = _descriptor.Descriptor( 86 | name='ScoresProto', 87 | full_name='ScoresProto', 88 | filename=None, 89 | file=DESCRIPTOR, 90 | containing_type=None, 91 | fields=[ 92 | _descriptor.FieldDescriptor(name='sentences', 93 | full_name='ScoresProto.sentences', 94 | index=0, 95 | number=1, 96 | type=11, 97 | cpp_type=10, 98 | label=3, 99 | has_default_value=False, 100 | default_value=[], 101 | message_type=None, 102 | enum_type=None, 103 | containing_type=None, 104 | is_extension=False, 105 | extension_scope=None, 106 | options=None), 107 | ], 108 | extensions=[], 109 | nested_types=[], 110 | enum_types=[], 111 | options=None, 112 | is_extendable=False, 113 | syntax='proto2', 114 | extension_ranges=[], 115 | oneofs=[], 116 | serialized_start=104, 117 | serialized_end=158, 118 | ) 119 | 120 | _SENTENCESCORESPROTO.fields_by_name[ 121 | 'scores'].message_type = tensor__pb2._TENSORPROTO 122 | _SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO 123 | DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO 124 | DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO 125 | 126 | SentenceScoresProto = _reflection.GeneratedProtocolMessageType( 127 | 'SentenceScoresProto', 128 | (_message.Message, ), 129 | dict(DESCRIPTOR=_SENTENCESCORESPROTO, 130 | __module__='scores_pb2' 131 | # @@protoc_insertion_point(class_scope:SentenceScoresProto) 132 | )) 133 | _sym_db.RegisterMessage(SentenceScoresProto) 134 | 135 | ScoresProto = _reflection.GeneratedProtocolMessageType( 136 | 'ScoresProto', 137 | (_message.Message, ), 138 | dict(DESCRIPTOR=_SCORESPROTO, 139 | __module__='scores_pb2' 140 | # @@protoc_insertion_point(class_scope:ScoresProto) 141 | )) 142 | _sym_db.RegisterMessage(ScoresProto) 143 | 144 | # @@protoc_insertion_point(module_scope) 145 | -------------------------------------------------------------------------------- /src/orl-4.1/neural_srl/shared/syntactic_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import codecs 3 | 4 | from .dictionary import Dictionary 5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN 6 | from collections import OrderedDict 7 | 8 | 9 | class SyntacticTree(object): 10 | def __init__(self, sentence_id): 11 | self.sentence_id = sentence_id 12 | self.word_forms = ["Root"] 13 | self.word_forms_ids = [] 14 | self.char_ids = [[]] # 2D 15 | self.pos_forms = ["Root"] 16 | self.heads = [0] 17 | self.labels = ["Root"] 18 | self.labels_id = [] 19 | 20 | 21 | class SyntacticCONLL(object): 22 | def __init__(self): 23 | self.file_name = "" 24 | self.trees = [] 25 | self.sample_dep_data = None 26 | 27 | def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8): 28 | self.file_name = filename 29 | 30 | print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio)) 31 | conll_file = codecs.open(self.file_name, 'r', encoding="utf8") 32 | if conll_file.closed: 33 | print("Cannot open the syntactic conll file! Please check {}".format(self.file_name)) 34 | 35 | sentence_id = 0 36 | a_tree = SyntacticTree(sentence_id) 37 | find_root = False 38 | for line in conll_file: 39 | if line == '\n' or line == '\r\n': # new sentence 40 | sentence_id += 1 41 | if len(a_tree.word_forms) <= max_sentence_length: 42 | assert find_root is True 43 | # keep the sentence with the length < max_sentence_l 44 | self.trees.append(a_tree) 45 | a_tree = SyntacticTree(sentence_id) 46 | find_root = False 47 | continue 48 | tokens = line.strip().split('\t') 49 | a_tree.word_forms.append(tokens[1]) 50 | a_tree.pos_forms.append(tokens[3]) 51 | # head = int(tokens[6]) if int(tokens[6]) > 0 else -1 52 | head = int(tokens[6]) - 1 # root's head is 0 53 | if head == -1: 54 | assert tokens[7] == "root" 55 | find_root = True 56 | a_tree.heads.append(head) 57 | a_tree.labels.append(tokens[7]) 58 | token_9 = tokens[9] # or tokens 9 will be 'unicode' type 59 | dep_prob = 1.0 if isinstance(token_9, str) else float(token_9) 60 | if dep_prob < prune_ratio: 61 | a_tree.heads[-1] = -1 62 | print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees))) 63 | 64 | @staticmethod 65 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None): 66 | ids = [] 67 | for s in list_of_words: 68 | s = s 69 | if s is None: 70 | ids.append(-1) 71 | continue 72 | if lowercase: 73 | s = s.lower() 74 | if (pretrained_embeddings is not None) and (s not in pretrained_embeddings): 75 | s = UNKNOWN_TOKEN 76 | ids.append(dictionary.add(s)) 77 | return ids 78 | 79 | def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None): 80 | for tree in self.trees: 81 | tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False, 82 | pretrained_word_embedding) 83 | words = tree.word_forms 84 | max_word_length = max([len(w) for w in words] + [3, 4, 5]) # compare with character cnn filter width 85 | single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64) 86 | for i, word in enumerate(words): 87 | single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict) 88 | # Add the sample char tokens into the sample_char_tokens 89 | tree.char_ids = single_sample_char_tokens 90 | 91 | tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict) 92 | 93 | sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees] 94 | sample_char_ids = [tree.char_ids for tree in self.trees] 95 | sample_heads = [np.asarray(tree.heads) for tree in self.trees] 96 | sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees] 97 | self.sample_dep_data = list(zip(sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids)) 98 | 99 | def get_syntactic_label_dict(self, syn_label_dict=None): 100 | if syn_label_dict is None: 101 | syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN) 102 | else: 103 | assert syn_label_dict.accept_new is False 104 | sentences_length = len(self.trees) 105 | for i in range(sentences_length): 106 | ith_sentence_length = len(self.trees[i].labels) 107 | for j in range(ith_sentence_length): 108 | self.trees[i].labels_id.append(syn_label_dict.add(self.trees[i].labels[j])) 109 | return syn_label_dict 110 | 111 | 112 | def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings): 113 | dep_trees = SyntacticCONLL() 114 | dep_trees.read_from_file(file_path, max_sentence_length=2000) 115 | dep_trees.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings) 116 | 117 | auto_dep_trees = OrderedDict() 118 | for tree in dep_trees.trees: 119 | sentence = ' '.join(tree.word_forms[1:]) # remove the "Root" 120 | auto_dep_trees[sentence] = tree 121 | return auto_dep_trees 122 | 123 | 124 | class SyntacticRepresentation(object): 125 | def __init__(self): 126 | self.file_name = "" 127 | self.representations = [] 128 | 129 | def read_from_file(self, filename): 130 | self.file_name = filename 131 | print("Reading lstm representations from {}".format(self.file_name)) 132 | representation_file = open(self.file_name, 'r') 133 | if representation_file.closed: 134 | print("Cannot open the representation file! Please check {}".format(self.file_name)) 135 | exit() 136 | each_sentence_representations = [] 137 | for line in representation_file: 138 | if line == '\n' or line == "\r\n": # new sentence 139 | self.representations.append(each_sentence_representations) 140 | each_sentence_representations = [] 141 | continue 142 | line = line.strip() 143 | line = line.split('\t') 144 | line = line[1].split(' ') 145 | rep = np.asarray(line, dtype=np.float32) 146 | each_sentence_representations.append(rep) 147 | representation_file.close() 148 | print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations))) 149 | 150 | def minus_by_the_predicate(self, corpus_tensors): 151 | has_processed_sentence_id = {} 152 | for i, data in enumerate(corpus_tensors): 153 | sentence_id = data[0][0][0] 154 | predicates = data[0][2] 155 | predicate_id = predicates.argmax() 156 | if sentence_id in has_processed_sentence_id: 157 | continue 158 | else: 159 | has_processed_sentence_id[sentence_id] = 1 160 | for j in range(1, len(self.representations[sentence_id])): # Root doesn't use. 161 | self.representations[sentence_id][j] = self.representations[sentence_id][predicate_id] - self.representations[sentence_id][j] 162 | 163 | def check_math_corpus(self, lengths): 164 | for i, length in enumerate(lengths): 165 | if len(self.representations[i]) != length + 1: # 1 means the first one, Root. Actually never use it. 166 | print(i, length, len(self.representations[i])) 167 | print("sentence {} doesn't match: lstm representation {} vs corpus {}" .format(i, len(self.representations[i])), length) 168 | exit() 169 | print("LSTM representation match the corpus!") 170 | --------------------------------------------------------------------------------