├── .DS_Store
├── README.md
├── exp-4.1-baseline
    ├── config.json
    ├── log.txt
    ├── model
    │   ├── char_dict
    │   ├── checkpoints.tsv
    │   ├── config
    │   ├── cons_label_dict
    │   ├── dep_label_dict
    │   ├── label_dict
    │   ├── pos_dict
    │   └── word_dict
    ├── predict.sh
    └── train.sh
├── figures
    ├── model.jpg
    └── model.pdf
├── scripts
    ├── convert_orl_conll_to_json.py
    ├── eval_averaged_metrics.py
    ├── eval_orl_conll_file.py
    ├── eval_orl_e2e_json_file.py
    ├── eval_orl_json_file.py
    └── generate_constituent_trees_from_benepar.py
└── src
    ├── orl-4.1-ultimate-hard-e2e
        ├── __init__.py
        ├── analyze.py
        ├── neural_srl
        │   ├── TreeLSTM
        │   │   ├── Encoder.py
        │   │   ├── Tree.py
        │   │   ├── TreeGRU.py
        │   │   └── __init__.py
        │   ├── __init__.py
        │   ├── gcn_model
        │   │   ├── __init__.py
        │   │   ├── gcn.py
        │   │   ├── tree.py
        │   │   └── various_gcn.py
        │   ├── pytorch
        │   │   ├── HBiLSTM.py
        │   │   ├── HighWayLSTM.py
        │   │   ├── __init__.py
        │   │   ├── implicit_syntactic_representations.py
        │   │   ├── layer.py
        │   │   ├── model.py
        │   │   ├── pre_trained_language_model.py
        │   │   ├── tagger.py
        │   │   └── util.py
        │   └── shared
        │   │   ├── __init__.py
        │   │   ├── configuration.py
        │   │   ├── conll_utils.py
        │   │   ├── constants.py
        │   │   ├── constituent_extraction.py
        │   │   ├── constituent_reader.py
        │   │   ├── dictionary.py
        │   │   ├── evaluation.py
        │   │   ├── features.py
        │   │   ├── inference.py
        │   │   ├── inference_utils.py
        │   │   ├── io_utils.py
        │   │   ├── measurements.py
        │   │   ├── numpy_utils.py
        │   │   ├── reader.py
        │   │   ├── scores_pb2.py
        │   │   ├── srl_eval_utils.py
        │   │   ├── syntactic_extraction.py
        │   │   ├── tagger_data.py
        │   │   └── tensor_pb2.py
        ├── predict.py
        └── train.py
    └── orl-4.1
        ├── __init__.py
        ├── analyze.py
        ├── neural_srl
            ├── TreeLSTM
            │   ├── Encoder.py
            │   ├── Tree.py
            │   ├── TreeGRU.py
            │   └── __init__.py
            ├── __init__.py
            ├── __pycache__
            │   └── __init__.cpython-37.pyc
            ├── gcn_model
            │   ├── __init__.py
            │   ├── __pycache__
            │   │   ├── __init__.cpython-37.pyc
            │   │   ├── tree.cpython-37.pyc
            │   │   └── various_gcn.cpython-37.pyc
            │   ├── gcn.py
            │   ├── tree.py
            │   └── various_gcn.py
            ├── pytorch
            │   ├── HBiLSTM.py
            │   ├── HighWayLSTM.py
            │   ├── __init__.py
            │   ├── __pycache__
            │   │   ├── HighWayLSTM.cpython-37.pyc
            │   │   ├── __init__.cpython-37.pyc
            │   │   ├── implicit_syntactic_representations.cpython-37.pyc
            │   │   ├── layer.cpython-37.pyc
            │   │   ├── model.cpython-37.pyc
            │   │   ├── pre_trained_language_model.cpython-37.pyc
            │   │   ├── tagger.cpython-37.pyc
            │   │   └── util.cpython-37.pyc
            │   ├── implicit_syntactic_representations.py
            │   ├── layer.py
            │   ├── model.py
            │   ├── pre_trained_language_model.py
            │   ├── tagger.py
            │   └── util.py
            └── shared
            │   ├── __init__.py
            │   ├── __pycache__
            │       ├── __init__.cpython-37.pyc
            │       ├── configuration.cpython-37.pyc
            │       ├── conll_utils.cpython-37.pyc
            │       ├── constants.cpython-37.pyc
            │       ├── constituent_extraction.cpython-37.pyc
            │       ├── constituent_reader.cpython-37.pyc
            │       ├── dictionary.cpython-37.pyc
            │       ├── evaluation.cpython-37.pyc
            │       ├── inference_utils.cpython-37.pyc
            │       ├── measurements.cpython-37.pyc
            │       ├── reader.cpython-37.pyc
            │       ├── srl_eval_utils.cpython-37.pyc
            │       ├── syntactic_extraction.cpython-37.pyc
            │       └── tagger_data.cpython-37.pyc
            │   ├── configuration.py
            │   ├── conll_utils.py
            │   ├── constants.py
            │   ├── constituent_extraction.py
            │   ├── constituent_reader.py
            │   ├── dictionary.py
            │   ├── evaluation.py
            │   ├── features.py
            │   ├── inference.py
            │   ├── inference_utils.py
            │   ├── io_utils.py
            │   ├── measurements.py
            │   ├── numpy_utils.py
            │   ├── reader.py
            │   ├── scores_pb2.py
            │   ├── srl_eval_utils.py
            │   ├── syntactic_extraction.py
            │   ├── tagger_data.py
            │   └── tensor_pb2.py
        ├── predict.py
        └── train.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # opinion_mining_with_syn_cons
 2 | This repository contains the code, configurations, and model for our paper "A Unified Span-Based Approach for Opinion Mining with Syntactic Constituents", published at NAACL 2021.
 3 | The `src` directory contains our code, and `exp-4.1-baseline` contains our experiment for "Baseline+BERT" (data0, the first fold of the five-fold cross-validation).
 4 | 
 5 | ![model](https://github.com/KiroSummer/opinion_mining_with_syn_cons/blob/main/figures/model.jpg)
 6 | 
 7 | ## Environment
 8 | Python3, Pytorch, Transformers 2.1.1 (for BERT)
 9 | 
10 | ### Data
11 | MPQA2.0 [url](http://mpqa.cs.pitt.edu/corpora/mpqa_corpus/mpqa_corpus_2_0/)
12 | PTB and OntoNotes can be downloaded from LDC.
13 | 
14 | ### Training
15 | Before running the code, check and update the file paths in train.sh and config.json.
16 | 
17 | ```
18 | sh train.sh GPU_ID
19 | ```
20 | 
21 | ### Test
22 | To test the performance of the trained model, you should run the following script.
23 | 
24 | ```
25 | sh predict.sh GPU_ID
26 | ```
27 | We release the sample model for "exp-4.1-baseline" on Google Drive: [url](https://drive.google.com/file/d/17u8ofyaBThb66qYPZe-60A2lyEnWCNil/view?usp=sharing).
28 | Important: use the offline evaluation script to evaluate the output file.
29 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/config.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"max_train_length": 100,
 3 | 	"batch_size"   : 32,
 4 |     "subbatch_size": 1,
 5 | 	"max_tokens_per_batch" : 700,
 6 | 	"features"     : ["predicate"],
 7 | 	"feature_sizes": [100],
 8 |  	"dev_batch_size": 40,
 9 | 
10 |     "use_bert": true,
11 |     "bert_vocab_path": "bert-base-cased",
12 |     "bert_path": "bert-base-cased",
13 |     "bert_dim": 768,
14 | 
15 |     "mtl_cons": false,
16 |     "use_cons_labels": false,
17 | 
18 |     "use_cons_gcn": false,
19 | 
20 |     "mtl_dep": false,
21 |     "dep_prune_ratio": 0.8,
22 |     "dep_num_lstm_layers": 3,
23 |     "mlp_arc_size": 500,
24 |     "mlp_rel_size": 100,
25 |     "dropout_mlp": 0.33,
26 | 
27 |     "use_dep_gcn": false,
28 |     "gcn_dep_num_layers": 2,
29 | 
30 | 	"joint": true,
31 | 	"mtl": false,
32 | 	"analyze": false,
33 | 
34 | 	"learning_rate":0.001,
35 |     "input_dropout_prob":0.0,
36 |     "feature_dropout": 0.5,
37 | 	"lexical_dropout" : 0.5,
38 | 	"dropout" : 0.3,
39 |     "recurrent_dropout_prob":0.4,
40 |     "mlp_dropout_prob": 0.2,
41 | 	"max_grad_norm": 5.0,
42 |     "weight_decay": 1e-7,
43 |     "decay_steps": 50,
44 |     "fl_alpha": 1.0,
45 |     "fl_gamma": 3.0,
46 | 	"pruning_by_arg_prob": false,
47 | 	"arg_boundary_prob_threshold": 0.0,
48 | 	"pruning_by_three_threshold": false,
49 | 	"arg_three_p_boundary_prob_threshold": 0.02,
50 | 	"neg_threshold": 80,
51 | 
52 |     "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt",
53 |     "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt",
54 | 	"char_emb_size" : 8,
55 |     "pos_emb_size" : 100,
56 |     "cons_label_dim": 100,
57 | 	"span_width_feature_size" : 20,
58 | 	"num_attention_heads" : 1,
59 | 	"kernel_sizes" : [3, 4, 5],
60 | 	"output_channel" : 50,
61 | 	"argument_ratio" : 0.8,
62 | 	"predicate_ratio" : 0.4,
63 | 	"linear_projection_size" : 400,
64 |     "cons_num_lstm_layers": 3,
65 | 	"num_lstm_layers" : 2,
66 | 	"lstm_hidden_size": 300,
67 | 	"max_arg_width" : 60,
68 | 	"lstm_cell":"highway",
69 |     "mlp_label_size":100,
70 |     "per_layer_dropout":true,
71 | 
72 |     "gcn_rnn": true,
73 |     "gcn_rnn_hidden": 200,
74 |     "gcn_rnn_layers": 1,
75 |     "gcn_rnn_dropout": 0.4,
76 |     "gcn_hidden_dim": 300,
77 |     "gcn_num_layers": 3,
78 |     "gcn_drop": 0.3,
79 | 
80 |     "pred_size": 300,
81 |     "arg_start_size": 200,
82 |     "arg_end_size": 200,
83 |     "argu_size": 300,
84 |     "argu_size_u": 400,
85 | 	"num_attention_heads" : 1,
86 | 	"ffnn_size" : 150,
87 | 	"ffnn_depth" : 1,
88 | 
89 | 	"trainer"   : "Adadelta",
90 | 	"max_epochs": 500,
91 | 	"checkpoint_every_x_epochs": 1,
92 | 
93 | 	"enforce_srl_constraint": false,
94 | 	"use_gold_predicates": true,
95 |     "use_gold_arguments": false
96 | }
97 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/char_dict:
--------------------------------------------------------------------------------
 1 | *PAD*
 2 | *UNKNOWN*
 3 | @
 4 | p
 5 | Y
 6 | s
 7 | :
 8 | P
 9 | a
10 | _
11 | 4
12 | Z
13 | }
14 | o
15 | +
16 | w
17 | r
18 | 8
19 | #
20 | 0
21 | h
22 | R
23 | E
24 | g
25 | 2
26 | x
27 | U
28 | $
29 | d
30 | [
31 | ?
32 | F
33 | X
34 | \
35 | 7
36 | Q
37 | 9
38 | '
39 | z
40 | e
41 | t
42 | 3
43 | c
44 | "
45 | v
46 | ˙
47 | k
48 | ò
49 | *
50 | m
51 | ,
52 | %
53 | S
54 | `
55 | K
56 | A
57 | -
58 | .
59 | q
60 | L
61 | B
62 | J
63 | Ì
64 | j
65 | <
66 | i
67 | 1
68 | ö
69 | ’
70 | N
71 | &
72 | ]
73 | 5
74 | H
75 | T
76 | Û
77 | b
78 | y
79 | ;
80 | G
81 | V
82 | f
83 | !
84 | >
85 | /
86 | O
87 | W
88 | D
89 | u
90 | {
91 | M
92 | =
93 | 6
94 | l
95 | n
96 | C
97 | I
98 | Ê
99 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/config:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"max_train_length": 100,
 3 | 	"batch_size"   : 32,
 4 |     "subbatch_size": 1,
 5 | 	"max_tokens_per_batch" : 700,
 6 | 	"features"     : ["predicate"],
 7 | 	"feature_sizes": [100],
 8 |  	"dev_batch_size": 40,
 9 | 
10 |     "use_bert": true,
11 |     "bert_vocab_path": "bert-base-cased",
12 |     "bert_path": "bert-base-cased",
13 |     "bert_dim": 768,
14 | 
15 |     "mtl_cons": false,
16 |     "use_cons_labels": false,
17 | 
18 |     "use_cons_gcn": false,
19 | 
20 |     "mtl_dep": false,
21 |     "dep_prune_ratio": 0.8,
22 |     "dep_num_lstm_layers": 3,
23 |     "mlp_arc_size": 500,
24 |     "mlp_rel_size": 100,
25 |     "dropout_mlp": 0.33,
26 | 
27 |     "use_dep_gcn": false,
28 |     "gcn_dep_num_layers": 2,
29 | 
30 | 	"joint": true,
31 | 	"mtl": false,
32 | 	"analyze": false,
33 | 
34 | 	"learning_rate":0.001,
35 |     "input_dropout_prob":0.0,
36 |     "feature_dropout": 0.5,
37 | 	"lexical_dropout" : 0.5,
38 | 	"dropout" : 0.3,
39 |     "recurrent_dropout_prob":0.4,
40 |     "mlp_dropout_prob": 0.2,
41 | 	"max_grad_norm": 5.0,
42 |     "weight_decay": 1e-7,
43 |     "decay_steps": 50,
44 |     "fl_alpha": 1.0,
45 |     "fl_gamma": 3.0,
46 | 	"pruning_by_arg_prob": false,
47 | 	"arg_boundary_prob_threshold": 0.0,
48 | 	"pruning_by_three_threshold": false,
49 | 	"arg_three_p_boundary_prob_threshold": 0.02,
50 | 	"neg_threshold": 80,
51 | 
52 |     "word_embedding" : "../data/embeddings/glove.840B.300d.txt.filtered.opinion0.conll12.train.txt",
53 |     "char_vocab_file" : "../data/opinion0.train.conll12.train.char.txt",
54 | 	"char_emb_size" : 8,
55 |     "pos_emb_size" : 100,
56 |     "cons_label_dim": 100,
57 | 	"span_width_feature_size" : 20,
58 | 	"num_attention_heads" : 1,
59 | 	"kernel_sizes" : [3, 4, 5],
60 | 	"output_channel" : 50,
61 | 	"argument_ratio" : 0.8,
62 | 	"predicate_ratio" : 0.4,
63 | 	"linear_projection_size" : 400,
64 |     "cons_num_lstm_layers": 3,
65 | 	"num_lstm_layers" : 2,
66 | 	"lstm_hidden_size": 300,
67 | 	"max_arg_width" : 60,
68 | 	"lstm_cell":"highway",
69 |     "mlp_label_size":100,
70 |     "per_layer_dropout":true,
71 | 
72 |     "gcn_rnn": true,
73 |     "gcn_rnn_hidden": 200,
74 |     "gcn_rnn_layers": 1,
75 |     "gcn_rnn_dropout": 0.4,
76 |     "gcn_hidden_dim": 300,
77 |     "gcn_num_layers": 3,
78 |     "gcn_drop": 0.3,
79 | 
80 |     "pred_size": 300,
81 |     "arg_start_size": 200,
82 |     "arg_end_size": 200,
83 |     "argu_size": 300,
84 |     "argu_size_u": 400,
85 | 	"num_attention_heads" : 1,
86 | 	"ffnn_size" : 150,
87 | 	"ffnn_depth" : 1,
88 | 
89 | 	"trainer"   : "Adadelta",
90 | 	"max_epochs": 500,
91 | 	"checkpoint_every_x_epochs": 1,
92 | 
93 | 	"enforce_srl_constraint": false,
94 | 	"use_gold_predicates": true,
95 |     "use_gold_arguments": false
96 | }
97 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/cons_label_dict:
--------------------------------------------------------------------------------
 1 | O
 2 | WHNP
 3 | NP
 4 | PP
 5 | SBARQ
 6 | ADVP
 7 | VP
 8 | ADJP
 9 | NML
10 | SINV
11 | PRT
12 | WHADVP
13 | SBAR
14 | INTJ
15 | SQ
16 | QP
17 | CONJP
18 | UCP
19 | X
20 | FRAG
21 | PRN
22 | WHPP
23 | WHADJP
24 | LST
25 | NAC
26 | RRC
27 | META
28 | NX
29 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/dep_label_dict:
--------------------------------------------------------------------------------
 1 | Root
 2 | prep
 3 | det
 4 | nn
 5 | num
 6 | pobj
 7 | punct
 8 | poss
 9 | possessive
10 | amod
11 | nsubj
12 | dep
13 | dobj
14 | cc
15 | conj
16 | nsubjpass
17 | partmod
18 | auxpass
19 | advmod
20 | root
21 | ccomp
22 | aux
23 | cop
24 | xcomp
25 | quantmod
26 | tmod
27 | appos
28 | npadvmod
29 | neg
30 | infmod
31 | rcmod
32 | pcomp
33 | mark
34 | advcl
35 | predet
36 | mwe
37 | parataxis
38 | number
39 | acomp
40 | prt
41 | iobj
42 | expl
43 | csubj
44 | preconj
45 | discourse
46 | csubjpass
47 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/label_dict:
--------------------------------------------------------------------------------
1 | O
2 | AGENT
3 | TARGET
4 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/model/pos_dict:
--------------------------------------------------------------------------------
 1 | *PAD*
 2 | DT
 3 | NNP
 4 | JJ
 5 | NN
 6 | VBD
 7 | PRP
 8 | MD
 9 | RB
10 | VB
11 | IN
12 | CD
13 | PRP$
14 | NNS
15 | .
16 | :
17 | VBG
18 | VBN
19 | TO
20 | ``
21 | VBZ
22 | ,
23 | VBP
24 | JJR
25 | ''
26 | -LRB-
27 | -RRB-
28 | POS
29 | FW
30 | CC
31 | WP
32 | $
33 | RP
34 | WDT
35 | EX
36 | RBS
37 | WRB
38 | NNPS
39 | UH
40 | RBR
41 | JJS
42 | PDT
43 | WP$
44 | LS
45 | SYM
46 | #
47 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/predict.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export PATH=/usr/local/cuda/bin:$PATH
 3 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib
 4 | 
 5 | MODEL_PATH="./model"
 6 | 
 7 | #INPUT_PATH="../data/aaai19srl.train0.conll.srl.json"
 8 | #OUTPUT_PATH="../temp/orl.train0.out"
 9 | 
10 | INPUT_PATH="../data/aaai19srl.dev0.conll.json"
11 | GOLD_PATH="../data/conll_format/aaai19srl.dev0.conll"
12 | OUTPUT_PATH="../temp/orl.devel0.out"
13 | 
14 | INPUT_PATH="../data/aaai19srl.test0.conll.json"
15 | GOLD_PATH="../data/conll_format/aaai19srl.test0.conll"
16 | OUTPUT_PATH="../temp/orl.test0.out"
17 | 
18 | ORL_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt"
19 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt"
20 | 
21 | CUDA_VISIBLE_DEVICES=$1 python3 ../src/orl-4.1/predict.py \
22 |   --span="span" \
23 |   --model="$MODEL_PATH" \
24 |   --input="$INPUT_PATH" \
25 |   --gold="$GOLD_PATH" \
26 |   --orl_cons=$ORL_CONS \
27 |   --auto_dep_trees=$SYS_DEP  \
28 |   --output="$OUTPUT_PATH" \
29 |   --gpu=$1
30 | 
31 | 


--------------------------------------------------------------------------------
/exp-4.1-baseline/train.sh:
--------------------------------------------------------------------------------
 1 | export PATH=/usr/local/cuda/bin:$PATH
 2 | export LD_LIBRARY_PATH=/usr/local/cuda:/usr/local/cuda/lib64:/opt/OpenBLAS/lib
 3 | 
 4 | CONFIG="config.json"
 5 | MODEL="model"
 6 | 
 7 | TRAIN_PATH="../data/aaai19srl.train0.conll.json"
 8 | #TRAIN_PATH="../data/aaai19srl.dev0.conll.json"
 9 | DEV_PATH="../data/aaai19srl.dev0.conll.json"
10 | GOLD_PATH="../data/english/srl/conll05/conll05.devel.props.gold.txt"
11 | 
12 | CONS_PATH="../data/constituent_conll12/ontonote5.0.train.constituents.json"
13 | DEP_TREES="/data2/qrxia/SRL-w-Heterogenous-Dep/data/english/dependency/ptb_from_baidu_from_n171/ptb.english.conll.train.txt.opentest.tag.projective"
14 | 
15 | SYS_CONS="../data/sentences/orl.2.0.all0.sentences.txt.constituent.txt"
16 | SYS_DEP="../data/dependency_trees/orl.2.0.auto.dep.txt"
17 | 
18 | gpu_id=$1
19 | CUDA_VISIBLE_DEVICES=$gpu_id python3 ../src/orl-4.1/train.py \
20 |    --info="orl baseline bert" \
21 |    --config=$CONFIG \
22 |    --span="span" \
23 |    --model=$MODEL \
24 |    --train=$TRAIN_PATH \
25 |    --dev=$DEV_PATH \
26 |    --gold=$GOLD_PATH \
27 |    --cons_trees=$CONS_PATH \
28 |    --dep_trees=$DEP_TREES \
29 |    --auto_cons_trees=$SYS_CONS \
30 |    --auto_dep_trees=$SYS_DEP  \
31 |    --gpu=$1
32 | 


--------------------------------------------------------------------------------
/figures/model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.jpg


--------------------------------------------------------------------------------
/figures/model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/figures/model.pdf


--------------------------------------------------------------------------------
/scripts/convert_orl_conll_to_json.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | from collections import OrderedDict
  4 | 
  5 | 
  6 | DSE="DSE"
  7 | TARGET="TARGET"
  8 | AGENT="AGENT"
  9 | 
 10 | max_dse_length, max_target_length, max_agent_length = 0, 0, 0
 11 | 
 12 | 
 13 | class orl_data():
 14 |     def __init__(self, tuples):
 15 |         self.idx = []
 16 |         self.words = []
 17 |         self.labels = []
 18 |         self.des = []
 19 |         self.des_head = []
 20 |         self.target = []
 21 |         self.agent = []
 22 |         self.orl = []
 23 |         self.init_by_typles(tuples)
 24 | 
 25 |     def output_to_srl_json(self):
 26 |         srl_span = []
 27 |         for span in self.orl:
 28 |             s, e, a_s, a_e, label = span
 29 |             if label == "DSE":
 30 |                 continue
 31 |             count = 0
 32 |             for des_head in self.des_head:
 33 |                 if s <= des_head <= e:
 34 |                     srl_span.append([des_head, a_s, a_e, label])
 35 |                     count += 1
 36 |             if count != 1:
 37 |                 print(self.words, self.des_head)
 38 |             # assert count == 1
 39 |         for des_head in self.des_head:
 40 |             srl_span.append([des_head, des_head, des_head, "V"])
 41 |         output = {
 42 |             "speakers": [["-"] * len(self.words)],
 43 |             "doc_key": "S0",
 44 |             "sentences": [self.words],
 45 |             "srl": [srl_span],
 46 |             "constituents": [[]],
 47 |             "clusters": [],
 48 |             "ner": [[]]
 49 | 
 50 |         }
 51 |         return output
 52 | 
 53 |     def output_to_json(self):
 54 |         output = {
 55 |             "sentences": self.words,
 56 |             "orl": self.orl
 57 |         }
 58 |         return output
 59 | 
 60 |     def a_complete_span(self, des, span, label):
 61 |         # print(des, span, label)
 62 |         t = (des + span)
 63 |         t.append(label)
 64 |         # print(t)
 65 |         assert len(t) == 5
 66 |         if t[-1] == DSE:
 67 |             global max_dse_length
 68 |             dse_length = t[1] - t[0]
 69 |             max_dse_length = max_dse_length if max_dse_length > dse_length else dse_length
 70 |             self.des.append(t)
 71 |         elif t[-1] == TARGET:
 72 |             global max_target_length
 73 |             target_length = t[3] - t[2]
 74 |             max_target_length = max_target_length if max_target_length > target_length else target_length
 75 |             self.target.append(t)
 76 |         else:
 77 |             assert t[-1] == AGENT
 78 |             global max_agent_length
 79 |             agent_length = t[3] - t[2]
 80 |             max_agent_length = max_agent_length if max_agent_length > agent_length else agent_length
 81 |             self.agent.append(t)
 82 |         self.orl.append(t)
 83 | 
 84 |     @staticmethod
 85 |     def compose_a_span(des, spans, labels, span, l):
 86 |         assert len(span) == 2 and l != ""
 87 |         if l == DSE:
 88 |             assert len(des) == 0
 89 |             des = span
 90 |             # print("DES!!!!")
 91 |             spans.append(des)
 92 |             labels.append(l)
 93 |         else:
 94 |             spans.append(span)
 95 |             labels.append(l)
 96 |         return des
 97 | 
 98 |     def init_by_typles(self, tuples):
 99 |         # print(tuples)
100 |         self.idx, self.words, self.labels = tuples[0], tuples[1], tuples[2:]
101 |         for expression_aware_label in self.labels:
102 |             des, spans, labels = [], [], []
103 |             span, l = [], ''
104 |             # print(self.label)
105 |             for i, label in enumerate(expression_aware_label):
106 |                 if label.endswith("-*"):  # we do n't need the * that marks the ``head word''
107 |                     label = label[:-2]
108 |                     self.des_head.append(i)
109 |                 if label == "S-DSE":
110 |                     self.des_head.append(i)
111 | 
112 |                 if label.startswith("B"):
113 |                     assert len(span) == 0
114 |                     span.append(i)
115 |                     l = label[2:]
116 |                 elif label.startswith("M"):
117 |                     assert l == label[2:]
118 |                 elif label.startswith("E"):
119 |                     assert l == label[2:]
120 |                     span.append(i)
121 |                     des = orl_data.compose_a_span(des, spans, labels, span, l)
122 |                     span, l = [], ''
123 |                 elif label.startswith("S"):
124 |                     span = [i, i]
125 |                     l = label[2:]
126 |                     # print("label", l)
127 |                     des = orl_data.compose_a_span(des, spans, labels, span, l)
128 |                     # print("XXX", des)
129 |                     span, l = [], ''
130 |                 else:
131 |                     assert label == 'O'
132 | 
133 |             assert len(spans) == len(labels)
134 |             for s, l in list(zip(spans, labels)):
135 |                 self.a_complete_span(des, s, l)
136 | 
137 |     def write_to_json(self):
138 |         pass
139 | 
140 | 
if __name__ == "__main__":
    # Usage: python convert_orl_conll_to_json.py INPUT_FILE
    # Reads blank-line separated sentences (one whitespace-split token row per
    # line), merges the label columns of repeated sentences, and writes one
    # JSON object per unique sentence to INPUT_FILE + ".json".
    input_filepath = sys.argv[1]

    # sentence text -> list of columns (column 1 holds the words; the last
    # column of every repeated occurrence is appended as an extra label column)
    input_data = OrderedDict()
    original_sentence_number, unique_sentence_number = 0, 0
    duplicate_sentence_number = 0
    duplicate_sentence_label_number = 0
    with open(input_filepath, 'r') as input_orl_file:
        lines = input_orl_file.readlines()
    # Sentinel blank line so the final sentence is flushed even when the file
    # does not end with an empty line.
    lines.append("")
    sentence = []
    for line in lines:
        if line.strip() != "":
            sentence.append(line.strip().split())
            continue
        if not sentence:
            # Consecutive blank lines: nothing buffered, nothing to flush.
            continue
        original_sentence_number += 1
        tuples = list(zip(*sentence))
        sentence = []
        sen = ' '.join(tuples[1])
        if sen not in input_data:
            input_data[sen] = tuples
            unique_sentence_number += 1
        elif tuples[-1] not in input_data[sen][2:]:
            # Same sentence with a new label column: keep the extra column.
            input_data[sen].append(tuples[-1])
            duplicate_sentence_number += 1
        else:
            print(tuples[-1], "already in previous sample", input_data[sen])
            duplicate_sentence_label_number += 1
    # check for sentences appear more than once
    assert original_sentence_number == unique_sentence_number + duplicate_sentence_number +\
           duplicate_sentence_label_number
    print("original sentence number:", original_sentence_number)
    print("unique_sentence_number:", unique_sentence_number)
    print("duplicate_sentence_number:", duplicate_sentence_number)
    print("duplicate_sentence_label_number", duplicate_sentence_label_number)
    # generate_chars
    # with open(input_filepath + ".char.txt", 'w') as char_file:
    #     char = set()
    #     for sen in input_data.keys():
    #         words = sen.strip().split()
    #         for word in words:
    #             for c in word:
    #                 char.add(c)
    #     for c in char:
    #         char_file.write(c + '\n')

    # generate orl data (building the objects also updates the max_* globals)
    orl_dataset = [orl_data(columns) for columns in input_data.values()]
    print("max_dse_length", max_dse_length, "max_target_length", max_target_length,
          "max_agent_length", max_agent_length)
    # output to json, one object per line
    json_filename = input_filepath + '.json'
    with open(json_filename, 'w') as output_json:
        for orl in orl_dataset:
            output_json.write(json.dumps(orl.output_to_json()) + '\n')
217 | 
218 | 
219 | 


--------------------------------------------------------------------------------
/scripts/eval_averaged_metrics.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | # from eval_orl_conll_file import load_eval_data, analyze_error_prediction_matrix
 3 | from eval_orl_e2e_json_file import load_eval_data, analyze_error_prediction_matrix
 4 | 
 5 | 
 6 | def average_fscore(all_metrics):
 7 |     zipped_metrics = list(zip(*all_metrics))
 8 | 
 9 |     def avg(metrics):
10 |         return sum([item.f for item in metrics]) / len(metrics)
11 | 
12 |     print('='*10, "Binary F1", '='*10)
13 |     print("Agent", avg(zipped_metrics[0]))
14 |     print("Target", avg(zipped_metrics[1]))
15 |     print("Agent", avg(zipped_metrics[2]))
16 | 
17 |     print('='*10, "Proportional F1", '='*10)
18 |     print("Agent", avg(zipped_metrics[3]))
19 |     print("Target", avg(zipped_metrics[4]))
20 |     print("Agent", avg(zipped_metrics[5]))
21 | 
22 |     print('='*10, "Exact F1", '='*10)
23 |     print("Agent", avg(zipped_metrics[6]))
24 |     print("Target", avg(zipped_metrics[7]))
25 |     print("Agent", avg(zipped_metrics[8]))
26 | 
27 |     print('=' * 10, "Expression F1", '=' * 10)
28 |     print("Binary", avg(zipped_metrics[9]))
29 |     print("Proportional", avg(zipped_metrics[10]))
30 |     print("Exact", avg(zipped_metrics[11]))
31 | 
32 | 
if __name__ == "__main__":
    # Exactly five result files are expected on the command line; their
    # metrics are collected row by row and averaged at the end.
    averaged_metric = []
    result_paths = sys.argv[1:]
    assert len(result_paths) == 5
    for file_path in result_paths:
        metric_groups = analyze_error_prediction_matrix(load_eval_data(file_path))
        # Four groups of three metrics each: binary, proportional, exact
        # (agent/target/all) and expression (binary/proportional/exact).
        binary, proportional, exact, expression = metric_groups
        row = []
        for group in (binary, proportional, exact, expression):
            first, second, third = group
            row.extend([first, second, third])
        averaged_metric.append(row)
    average_fscore(averaged_metric)
50 | 
51 | 


--------------------------------------------------------------------------------
/scripts/eval_orl_json_file.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import sys
  3 | 
  4 | from collections import OrderedDict, Counter
  5 | 
  6 | 
  7 | AGENT="AGENT"
  8 | TARGET="TARGET"
  9 | 
 10 | 
 11 | class Sample():
 12 |     def __init__(self, obj):
 13 |         self.sentence = obj["sentence"]
 14 |         self.gold_orl = obj['gold_orl']
 15 |         self.sys_orl = obj["sys_orl"]
 16 |         # self.sys_argus_constituents = obj['sys_argus_constituents']
 17 |         # self.constituent = obj["constituent"]
 18 | 
 19 | 
 20 | def load_eval_data(eval_path):
 21 |     eval_data = []
 22 |     with open(eval_path, 'r') as f:
 23 |         eval_data = [Sample(json.loads(jsonline)) for jsonline in f.readlines()]
 24 |     print("Loaded {} eval examples.".format(len(eval_data)))
 25 |     return eval_data
 26 | 
 27 | 
 28 | class EvalMetric():
 29 |     def __init__(self, name="None"):
 30 |         self.name = name
 31 |         self.matched, self.sys, self.gold = 0, 0, 0
 32 |         self.p = self.r = self.f = 0.0
 33 | 
 34 |     def compute_prf(self):
 35 |         try:
 36 |             self.p = 100.0 * self.matched / self.sys
 37 |         except:
 38 |             self.p = 0.0
 39 |         try:
 40 |             self.r = 100.0 * self.matched / self.gold
 41 |         except:
 42 |             self.r = 0.0
 43 |         try:
 44 |             self.f = 2.0 * self.p * self.r / (self.p + self.r)
 45 |         except:
 46 |             self.f = 0.0
 47 |         print("="*5, self.name, "="*5)
 48 |         print("Precision:", self.matched, '/', self.sys, '=', self.p)
 49 |         print("Recall:", self.matched, '/', self.gold, '=', self.r)
 50 |         print("F1 score:", self.f)
 51 | 
 52 | 
 53 | def analyze_error_prediction_matrix(samples):
 54 |     agent_binary, target_binary, all_binary = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
 55 |     agent_proportional, target_proportional, all_proportional = \
 56 |         EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
 57 |     agent_exact, target_exact, all_exact = EvalMetric("Agent"), EvalMetric("Target"), EvalMetric("All")
 58 |     for sample in samples:
 59 |         gold_orl, sys_orl = sample.gold_orl, sample.sys_orl
 60 |         # dict s-e: label
 61 |         dict_gold_orl, dict_sys_orl = OrderedDict(), OrderedDict()
 62 |         for g_orl in gold_orl:  # construct the expression-argument tuples
 63 |             dse_s, dse_e, s, e, label = g_orl
 64 |             expression = str(dse_s) + '-' + str(dse_e)
 65 |             argument = (s, e, label)
 66 |             if expression not in dict_gold_orl:
 67 |                 dict_gold_orl[expression] = []
 68 |                 dict_gold_orl[expression].append(argument)
 69 |             else:
 70 |                 dict_gold_orl[expression].append(argument)
 71 |         for s_orl in sys_orl:  # construct the expression-argument tuples
 72 |             dse_s, dse_e, s, e, label = s_orl
 73 |             expression = str(dse_s) + '-' + str(dse_e)
 74 |             argument = (s, e, label)
 75 |             if expression not in dict_sys_orl:
 76 |                 dict_sys_orl[expression] = []
 77 |                 dict_sys_orl[expression].append(argument)
 78 |             else:
 79 |                 dict_sys_orl[expression].append(argument)
 80 | 
 81 |         for expression in dict_gold_orl:  # compute the gold
 82 |             for argument in dict_gold_orl[expression]:
 83 |                 s, e, label = argument
 84 |                 all_binary.gold += 1
 85 |                 all_proportional.gold += 1
 86 |                 all_exact.gold += 1
 87 |                 if label == AGENT:
 88 |                     agent_binary.gold += 1
 89 |                     agent_proportional.gold += 1
 90 |                     agent_exact.gold += 1
 91 |                 else:
 92 |                     assert label == TARGET
 93 |                     target_binary.gold += 1
 94 |                     target_proportional.gold += 1
 95 |                     target_exact.gold += 1
 96 | 
 97 |         for expression in dict_sys_orl:  # compute the sys
 98 |             for argument in dict_sys_orl[expression]:
 99 |                 s, e, label = argument
100 |                 all_binary.sys += 1
101 |                 all_proportional.sys += 1
102 |                 all_exact.sys += 1
103 |                 if label == AGENT:
104 |                     agent_binary.sys += 1
105 |                     agent_proportional.sys += 1
106 |                     agent_exact.sys += 1
107 |                 else:
108 |                     assert label == TARGET
109 |                     target_binary.sys += 1
110 |                     target_proportional.sys += 1
111 |                     target_exact.sys += 1
112 | 
113 |         for expression in dict_sys_orl:  # compute the sys
114 |             if expression not in dict_gold_orl:  # debug: some gold orl has no argument, only expression
115 |                 # print(sample.sentence)
116 |                 continue
117 |             gold_arguments = dict_gold_orl[expression]
118 |             for argument in dict_sys_orl[expression]:
119 |                 s, e, label = argument
120 |                 if argument in gold_arguments:  # exact
121 |                     all_binary.matched += 1
122 |                     all_proportional.matched += 1
123 |                     all_exact.matched += 1
124 |                     if label == AGENT:
125 |                         agent_binary.matched += 1
126 |                         agent_proportional.matched += 1
127 |                         agent_exact.matched += 1
128 |                     else:
129 |                         assert label == TARGET
130 |                         target_binary.matched += 1
131 |                         target_proportional.matched += 1
132 |                         target_exact.matched += 1
133 |                 else:
134 |                     # binary
135 |                     find = False
136 |                     for index in range(s, e + 1):
137 |                         for gold_arg in gold_arguments:
138 |                             g_s, g_e, g_label = gold_arg
139 |                             if g_label == label:
140 |                                 if g_s <= index <= g_e:
141 |                                     all_binary.matched += 1
142 |                                     if label == AGENT:
143 |                                         agent_binary.matched += 1
144 |                                     else:
145 |                                         target_binary.matched += 1
146 |                                     find = True
147 |                                     break
148 |                         if find is True:
149 |                             break
150 |                     # proportional
151 |                     list_of_proportional = []
152 |                     for gold_argument in dict_gold_orl[expression]:
153 |                         g_s, g_e, g_label = gold_argument
154 |                         matched_positions = 0
155 |                         if label != g_label:
156 |                             pass
157 |                         else:
158 |                             for position in range(g_s, g_e + 1):
159 |                                 if s <= position <= e:
160 |                                     matched_positions += 1
161 |                             list_of_proportional.append(1.0 * matched_positions / (g_e - g_s + 1))
162 |                     if len(list_of_proportional) > 0:  # matched a gold argument
163 |                         all_proportional.matched += max(list_of_proportional)
164 |                         if label == AGENT:
165 |                             agent_proportional.matched += max(list_of_proportional)
166 |                         else:
167 |                             target_proportional.matched += max(list_of_proportional)
168 | 
169 |     print("="*15, 'Binary Metric', "="*15)
170 |     agent_binary.compute_prf()
171 |     target_binary.compute_prf()
172 |     all_binary.compute_prf()
173 | 
174 |     print("="*15, 'Proportional Metric', "="*15)
175 |     agent_proportional.compute_prf()
176 |     target_proportional.compute_prf()
177 |     all_proportional.compute_prf()
178 | 
179 |     print("="*15, 'Exact Metric', "="*15)
180 |     agent_exact.compute_prf()
181 |     target_exact.compute_prf()
182 |     all_exact.compute_prf()
183 | 
184 | 
if __name__ == "__main__":
    # Usage: python eval_orl_json_file.py <json-lines file with gold_orl / sys_orl>
    analyze_error_prediction_matrix(load_eval_data(sys.argv[1]))
189 | 


--------------------------------------------------------------------------------
/scripts/generate_constituent_trees_from_benepar.py:
--------------------------------------------------------------------------------
 1 | import nltk
 2 | import sys
 3 | import benepar
 4 | 
 5 | 
 6 | if __name__ == "__main__":
 7 |     filepath = sys.argv[1]
 8 |     sentences = []
 9 |     with open(filepath, 'r') as input_file:
10 |         for line in input_file.readlines():
11 |             sentence = line.strip()
12 |             words = sentence.split(' ')
13 |             sentences.append(words)
14 | 
15 |     parser = benepar.Parser("benepar_en2")
16 |     constituent_trees = []
17 |     for sentence in sentences:
18 |         tree = parser.parse(sentence)
19 |         constituent_trees.append(tree)
20 | 
21 |     with open(filepath + '.constituent.txt', 'w') as output_file:
22 |         for t in constituent_trees:
23 |             output_file.write(str(t) + '\n' + '\n')
24 | 
25 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/__init__.py:
--------------------------------------------------------------------------------
# Names pulled in by a star-import of this package.
__all__ = ["neural_srl"]


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Encoder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import numpy as np
 4 | from torch.nn.utils.rnn import pack_padded_sequence as pack
 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
 6 | from .TreeGRU import DTTreeGRU, TDTreeGRU
 7 | from .Tree import creatTree
 8 | 
 9 | 
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder whose outputs are re-encoded by two tree GRUs.

    The GRU output is projected back to ``input_size`` and then fed, together
    with the trees built from the head indexes, to ``DTTreeGRU`` and
    ``TDTreeGRU``; their results are concatenated.
    """
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)

        # Sequence-first (batch_first=False) bidirectional GRU.
        self.rnn = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=True)
        # Maps the 2 * hidden_size GRU output back to input_size so the tree
        # GRUs can be built with the same input size as the sequence GRU.
        self.transform = nn.Linear(in_features=2 * hidden_size,
                                   out_features=input_size, bias=True)
        self.dt_tree = DTTreeGRU(input_size, hidden_size)
        self.td_tree = TDTreeGRU(input_size, hidden_size)

    def forward(self, input, heads, lengths=None, hidden=None):
        """Encode a batch of sentences with the GRU and both tree GRUs.

        input: [L, B, H], including the -ROOT-
        heads: one list of head indexes per sentence (B lists in total)
        lengths: optional sequence lengths; when given, the input is packed
            before the GRU and padded again afterwards
        hidden: optional initial hidden state passed to the GRU

        Returns ``(outputs, output_t)``: the tree-GRU outputs concatenated on
        dim 2 with dims 0 and 1 swapped, and the tree-GRU hidden states
        concatenated on dim 1 with a new leading dimension.
        """
        use_packing = lengths is not None

        rnn_input = self.dropout(input)
        if use_packing:
            rnn_input = pack(rnn_input, lengths)

        outputs, _ = self.rnn(rnn_input, hidden)
        if use_packing:
            outputs, _ = unpack(outputs)

        outputs = self.dropout(self.transform(outputs))
        max_length, batch_size, _ = outputs.size()

        # Column b lists the node indexes of sentence b in traversal order;
        # unused trailing slots stay at -1.
        indexes = np.full((max_length, batch_size), -1, dtype=np.int32)
        trees = []
        for column, sentence_heads in enumerate(heads):
            root, nodes = creatTree(sentence_heads)
            root.traverse()  # fills root.order
            for step, token_index in enumerate(root.order):
                indexes[step, column] = token_index
            trees.append(nodes)

        dt_outputs, dt_hidden_ts = self.dt_tree.forward(outputs, indexes, trees)
        td_outputs, td_hidden_ts = self.td_tree.forward(outputs, indexes, trees)

        tree_outputs = torch.cat([dt_outputs, td_outputs], dim=2).transpose(0, 1)
        final_state = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0)
        return tree_outputs, final_state
68 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/Tree.py:
--------------------------------------------------------------------------------
 1 | class Tree(object):
 2 |     def __init__(self, index):
 3 |         self.parent = None
 4 |         self.is_left = False
 5 |         self.index = index
 6 |         self.left_children = list()
 7 |         self.left_num = 0
 8 |         self.right_children = list()
 9 |         self.right_num = 0
10 |         self._depth = -1
11 |         self.order = []
12 | 
13 |     def add_left(self, child):
14 |         """
15 |         :param child: a Tree object represent the child
16 |         :return:
17 |         """
18 |         child.parent = self
19 |         child.is_left = True
20 |         self.left_children.append(child)
21 |         self.left_num += 1
22 | 
23 |     def add_right(self, child):
24 |         """
25 |         :param child: a Tree object represent the child
26 |         :return:
27 |         """
28 |         child.parent = self
29 |         child.is_left = False
30 |         self.right_children.append(child)
31 |         self.right_num += 1
32 | 
33 |     def size(self):  # compute the total size of the Tree
34 |         if hasattr(self, '_size'):
35 |             return self._size
36 |         count = 1
37 |         for i in range(self.left_num):
38 |             count += self.left_children[i].size()
39 |         for i in range(self.right_num):
40 |             count += self.right_children[i].size()
41 |         self._size = count
42 |         return self._size
43 | 
44 |     def depth(self):  # compute the depth of the Tree
45 |         if self._depth > 0:
46 |             return self._depth
47 |         count = 0
48 |         if self.left_num + self.right_num > 0:
49 |             for i in range(self.left_num):
50 |                 child_depth = self.left_children[i].depth()
51 |                 if child_depth > count:
52 |                     count = child_depth
53 |             for i in range(self.right_num):
54 |                 child_depth = self.right_children[i].depth()
55 |                 if child_depth > count:
56 |                     count = child_depth
57 |             count += 1
58 |         self._depth = count
59 |         return self._depth
60 | 
61 |     def traverse(self):  # traverse the Tree
62 |         if len(self.order) > 0:
63 |             return self.order
64 | 
65 |         for i in range(self.left_num):
66 |             left_order = self.left_children[i].traverse()
67 |             self.order.extend(left_order)
68 |         for i in range(self.right_num):
69 |             right_order = self.right_children[i].traverse()
70 |             self.order.extend(right_order)
71 |         self.order.append(self.index)  # append the root
72 |         return self.order
73 | 
74 | 
def creatTree(heads):
    """Build one Tree node per token and link the nodes according to `heads`.

    :param heads: ``heads[i]`` is the index of the head of token ``i``; the
        value -1 marks the root.
    :return: ``(root, nodes)`` where ``nodes[i]`` is the node of token ``i``
        and ``root`` is the last node whose head is -1 (``None`` if none is).
    """
    nodes = [Tree(position) for position, _ in enumerate(heads)]

    root = None
    for position, parent in enumerate(heads):
        if parent == -1:  # root marker (original note: "-1 mszhang, 0 kiro")
            root = nodes[position]
            continue
        if parent < 0:
            # NOTE(review): after this message the node is still attached via
            # nodes[parent], i.e. through negative list indexing - confirm
            # whether such heads can occur and what should happen to them.
            print('error: multi roots')
        if parent == position:
            print('error: head is it self.')
        elif parent > position:
            # The head is to the right, so this token is a left child.
            nodes[parent].add_left(nodes[position])
        else:
            nodes[parent].add_right(nodes[position])

    return root, nodes
96 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/TreeLSTM/__init__.py:
--------------------------------------------------------------------------------
# Names pulled in by a star-import of this package.
__all__ = ["Encoder", "Tree", "TreeGRU"]
2 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/gcn.py:
--------------------------------------------------------------------------------
  1 | """
  2 | GCN model for relation extraction.
  3 | """
  4 | 
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch.autograd import Variable
  9 | from ..shared.constants import PAD_ID
 10 | import numpy as np
 11 | 
 12 | 
 13 | class GCN(nn.Module):
 14 |     def __init__(self, config, input_dim, mem_dim, num_layers):
 15 |         super(GCN, self).__init__()
 16 |         self.config = config
 17 |         self.input_dim = input_dim
 18 |         self.mem_dim = mem_dim
 19 |         self.layers = num_layers
 20 | 
 21 |         # rnn layer
 22 |         if self.config.gcn_rnn is True:
 23 |             input_size = self.input_dim
 24 |             self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True,
 25 |                                dropout=self.config.gcn_rnn_dropout, bidirectional=True)
 26 |             self.in_dim = self.config.gcn_rnn_hidden * 2
 27 |             self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout)  # use on last layer output
 28 | 
 29 |         self.in_drop = nn.Dropout(self.config.gcn_input_dropout)
 30 |         self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout)
 31 | 
 32 |         # gcn layer
 33 |         self.W = nn.ModuleList()
 34 |         self.layer_normalization = nn.ModuleList()
 35 | 
 36 |         for layer in range(self.layers):
 37 |             # input_dim = self.in_dim if layer == 0 else self.mem_dim
 38 |             self.W.append(nn.Linear(self.in_dim, self.in_dim))
 39 |             self.layer_normalization.append(LayerNormalization(self.in_dim))
 40 | 
 41 |     def conv_l2(self):
 42 |         conv_weights = []
 43 |         for w in self.W:
 44 |             conv_weights += [w.weight, w.bias]
 45 |         return sum([x.pow(2).sum() for x in conv_weights])
 46 | 
 47 |     def encode_with_rnn(self, rnn_inputs, masks, batch_size):
 48 |         seq_lens = masks.data.eq(1).long().sum(1).squeeze()
 49 |         h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers)
 50 | 
 51 |         # SORT YOUR TENSORS BY LENGTH!
 52 |         seq_lens, perm_idx = seq_lens.sort(0, descending=True)
 53 | 
 54 |         rnn_inputs = rnn_inputs[perm_idx]
 55 |         rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True)
 56 |         rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0))
 57 |         rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)
 58 | 
 59 |         _, unperm_idx = perm_idx.sort(0)
 60 |         rnn_outputs = rnn_outputs[unperm_idx]
 61 |         return rnn_outputs
 62 | 
 63 |     def forward(self, adj, embs, masks):
 64 |         batch_size = masks.size()[0]
 65 |         embs = self.in_drop(embs)
 66 |         # rnn layer
 67 |         if self.config.gcn_rnn is True:
 68 |             gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size))
 69 |         else:
 70 |             gcn_inputs = embs
 71 | 
 72 |         # gcn layer
 73 |         denom = adj.sum(2).unsqueeze(2) + 1
 74 |         mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)
 75 |         # # zero out adj for ablation
 76 |         # if self.opt.get('no_adj', False):
 77 |         #     adj = torch.zeros_like(adj)
 78 | 
 79 |         for l in range(self.layers):
 80 |             # print(gcn_inputs.size(), adj.size())
 81 |             x = gcn_inputs
 82 |             Ax = adj.bmm(gcn_inputs)
 83 |             AxW = self.W[l](Ax)
 84 |             AxW = AxW + self.W[l](gcn_inputs)  # self loop
 85 |             AxW = AxW / denom
 86 | 
 87 |             gAxW = F.relu(AxW)
 88 |             gcn_inputs = self.gcn_drop(gAxW)
 89 |             self.layer_normalization[l].forward(gcn_inputs + x)
 90 | 
 91 |         return gcn_inputs, mask
 92 | 
 93 | 
 94 | def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
 95 |     total_layers = num_layers * 2 if bidirectional else num_layers
 96 |     state_shape = (total_layers, batch_size, hidden_dim)
 97 |     h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False)
 98 |     if use_cuda:
 99 |         return h0.cuda(), c0.cuda()
100 |     else:
101 |         return h0, c0
102 | 
103 | 
class LayerNormalization(nn.Module):
    ''' Layer normalization over the last dimension with learned gain and bias '''

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)  # gain
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)  # bias

    def forward(self, z):
        # Inputs whose dimension 1 has size one are returned untouched.
        if z.size(1) == 1:
            return z
        centered = z - torch.mean(z, keepdim=True, dim=-1)
        # eps is added to the standard deviation, not to the variance.
        spread = torch.std(z, keepdim=True, dim=-1) + self.eps
        normalized = centered / spread
        return normalized * self.a_2.expand_as(normalized) + self.b_2.expand_as(normalized)
121 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/gcn_model/tree.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Basic operations on trees.
  3 | """
  4 | 
  5 | import numpy as np
  6 | from collections import defaultdict
  7 | 
  8 | 
  9 | class Tree(object):
 10 |     """
 11 |     Reused tree object from stanfordnlp/treelstm.
 12 |     """
 13 | 
 14 |     def __init__(self):
 15 |         self.parent = None
 16 |         # head probability
 17 |         self.phead = -1
 18 |         self.num_children = 0
 19 |         self.children = list()
 20 | 
 21 |     def add_child(self, child):
 22 |         child.parent = self
 23 |         self.num_children += 1
 24 |         self.children.append(child)
 25 | 
 26 |     def size(self):
 27 |         if getattr(self, '_size'):
 28 |             return self._size
 29 |         count = 1
 30 |         for i in xrange(self.num_children):
 31 |             count += self.children[i].size()
 32 |         self._size = count
 33 |         return self._size
 34 | 
 35 |     def depth(self):
 36 |         if getattr(self, '_depth'):
 37 |             return self._depth
 38 |         count = 0
 39 |         if self.num_children > 0:
 40 |             for i in xrange(self.num_children):
 41 |                 child_depth = self.children[i].depth()
 42 |                 if child_depth > count:
 43 |                     count = child_depth
 44 |             count += 1
 45 |         self._depth = count
 46 |         return self._depth
 47 | 
 48 |     def __iter__(self):
 49 |         yield self
 50 |         for c in self.children:
 51 |             for x in c:
 52 |                 yield x
 53 | 
 54 | 
def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos):
    """
    Convert a sequence of head indexes into a tree object.

    Head values are 1-based: ``head[i] == 0`` marks token ``i`` as the root,
    any other value ``h`` means that the parent of token ``i`` is token
    ``h - 1``. Only the first ``len_`` entries are used.

    If ``prune`` is negative the full tree is built and every node gets
    ``dist = -1``. Otherwise the tree is restricted to the nodes whose
    distance to the dependency path between the subject and object tokens is
    at most ``prune``, and the returned root is the lowest common ancestor of
    those tokens.

    :param head: sliceable object with ``.tolist()`` holding the head indexes.
    :param tokens: sliceable object with ``.tolist()``; it is truncated here
        but not used afterwards.
    :param len_: number of valid tokens.
    :param prune: maximum distance from the dependency path that is kept;
        a negative value disables pruning.
    :param subj_pos: per-token values; positions holding 0 are the subject tokens.
    :param obj_pos: per-token values; positions holding 0 are the object tokens.
    :return: the root ``Tree`` node; every created node carries ``idx`` and ``dist``.
    """
    tokens = tokens[:len_].tolist()
    head = head[:len_].tolist()
    root = None

    if prune < 0:
        # No pruning: one node per token, linked straight from the heads.
        nodes = [Tree() for _ in head]

        for i in range(len(nodes)):
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = -1  # just a filler
            if h == 0:
                root = nodes[i]
            else:
                nodes[h - 1].add_child(nodes[i])
    else:
        # find dependency path
        subj_pos = [i for i in range(len_) if subj_pos[i] == 0]
        obj_pos = [i for i in range(len_) if obj_pos[i] == 0]

        # Common ancestors: intersection of the root-ward chains (each chain
        # includes the token itself) of all subject and object tokens.
        # NOTE(review): cas stays None when there is no subject token, and the
        # object loop below would then fail - confirm callers rule this out.
        cas = None

        subj_ancestors = set(subj_pos)
        for s in subj_pos:
            h = head[s]
            tmp = [s]
            # Walk up to the root, collecting the chain of ancestors.
            while h > 0:
                tmp += [h - 1]
                subj_ancestors.add(h - 1)
                h = head[h - 1]

            if cas is None:
                cas = set(tmp)
            else:
                cas.intersection_update(tmp)

        obj_ancestors = set(obj_pos)
        for o in obj_pos:
            h = head[o]
            tmp = [o]
            while h > 0:
                tmp += [h - 1]
                obj_ancestors.add(h - 1)
                h = head[h - 1]
            cas.intersection_update(tmp)

        # find lowest common ancestor
        if len(cas) == 1:
            lca = list(cas)[0]
        else:
            # Count, for every common ancestor, how many of its children are
            # themselves common ancestors.
            child_count = {k: 0 for k in cas}
            for ca in cas:
                if head[ca] > 0 and head[ca] - 1 in cas:
                    child_count[head[ca] - 1] += 1

            # the LCA has no child in the CA set
            for ca in cas:
                if child_count[ca] == 0:
                    lca = ca
                    break

        # Nodes on the path: all ancestors of the subject/object tokens that
        # are not common ancestors, plus the lowest common ancestor itself.
        path_nodes = subj_ancestors.union(obj_ancestors).difference(cas)
        path_nodes.add(lca)

        # compute distance to path_nodes
        dist = [-1 if i not in path_nodes else 0 for i in range(len_)]

        for i in range(len_):
            if dist[i] < 0:
                # Climb towards the root until a path node is reached or the
                # climb passes the root (index -1).
                stack = [i]
                while stack[-1] >= 0 and stack[-1] not in path_nodes:
                    stack.append(head[stack[-1]] - 1)

                if stack[-1] in path_nodes:
                    # The distance grows by one per step away from the path.
                    for d, j in enumerate(reversed(stack)):
                        dist[j] = d
                else:
                    # The path is not reachable by climbing from these nodes.
                    for j in stack:
                        if j >= 0 and dist[j] < 0:
                            dist[j] = int(1e4)  # aka infinity

        highest_node = lca
        # Keep only the nodes that are close enough to the path.
        nodes = [Tree() if dist[i] <= prune else None for i in range(len_)]

        for i in range(len(nodes)):
            if nodes[i] is None:
                continue
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = dist[i]
            # The LCA becomes the root, so it is not attached to its parent.
            if h > 0 and i != highest_node:
                assert nodes[h - 1] is not None
                nodes[h - 1].add_child(nodes[i])

        root = nodes[highest_node]

    assert root is not None
    return root
157 | 
158 | 
def tree_to_adj(sent_len, tree, directed=True, self_loop=False):
    """
    Convert a tree object to an (numpy) adjacency matrix.

    :param sent_len: side length of the square result matrix.
    :param tree: root node; every node needs ``idx`` and ``children``.
    :param directed: if False the matrix is made symmetric by adding its transpose.
    :param self_loop: if True the diagonal entry of every visited node is set to 1.
    :return: float32 array of shape ``(sent_len, sent_len)`` with
        ``ret[parent, child] = 1`` for every edge of the tree.
    """
    adjacency = np.zeros((sent_len, sent_len), dtype=np.float32)

    # Breadth-first walk: `frontier` grows while `cursor` moves through it.
    frontier = [tree]
    cursor = 0
    while cursor < len(frontier):
        node = frontier[cursor]
        cursor += 1
        for child in node.children:
            adjacency[node.idx, child.idx] = 1
        frontier.extend(node.children)

    if not directed:
        adjacency = adjacency + adjacency.T

    if self_loop:
        for node in frontier:
            adjacency[node.idx, node.idx] = 1

    return adjacency
184 | 
185 | 
def tree_to_dist(sent_len, tree):
    """Collect the ``dist`` value of every node of `tree` into an array.

    :param sent_len: length of the result array.
    :param tree: iterable of nodes that carry ``idx`` and ``dist``.
    :return: int64 array holding ``node.dist`` at position ``node.idx`` and
        -1 at every position that no node refers to.
    """
    distances = np.full(sent_len, -1, dtype=np.int64)
    for node in tree:
        distances[node.idx] = node.dist
    return distances
193 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/implicit_syntactic_representations.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | from torch.nn.utils.rnn import pad_sequence
  6 | 
  7 | 
  8 | from .model import drop_sequence_sharedmask, _model_var
  9 | from .HighWayLSTM import Highway_Concat_BiLSTM
 10 | from .layer import NonLinear, Biaffine
 11 | 
 12 | 
 13 | class ImplicitDependencyRepresentations(nn.Module):
 14 |     def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size):
 15 |         super(ImplicitDependencyRepresentations, self).__init__()
 16 |         self.config = config
 17 |         self.lstm_input_size = lstm_input_size
 18 |         self.lstm_hidden_size = lstm_hidden_size
 19 |         self.dep_label_space_size = dep_label_space_size
 20 |         # softmax weights
 21 |         self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0]))
 22 |         self.softmax_dep_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0]))
 23 |                                                      for _ in range(self.config.dep_num_lstm_layers)])
 24 |         self.cuda = True
 25 | 
 26 |         self.dep_bilstm = Highway_Concat_BiLSTM(
 27 |             input_size=self.lstm_input_size,
 28 |             hidden_size=self.lstm_hidden_size,  # // 2 for MyLSTM
 29 |             num_layers=self.config.dep_num_lstm_layers,
 30 |             batch_first=True,
 31 |             bidirectional=True,
 32 |             dropout_in=config.input_dropout_prob,
 33 |             dropout_out=config.recurrent_dropout_prob
 34 |         )
 35 | 
 36 |         # dependency parsing module
 37 |         self.mlp_arc_dep = NonLinear(
 38 |             input_size=2 * config.lstm_hidden_size,
 39 |             hidden_size=config.mlp_arc_size + config.mlp_rel_size,
 40 |             activation=nn.LeakyReLU(0.1))
 41 |         self.mlp_arc_head = NonLinear(
 42 |             input_size=2 * config.lstm_hidden_size,
 43 |             hidden_size=config.mlp_arc_size + config.mlp_rel_size,
 44 |             activation=nn.LeakyReLU(0.1))
 45 | 
 46 |         self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
 47 |         self.arc_num = int(config.mlp_arc_size / 100)
 48 |         self.rel_num = int(config.mlp_rel_size / 100)
 49 | 
 50 |         self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False))
 51 |         self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size,
 52 |                                      bias=(True, True))
 53 | 
 54 |     def init_masks(self, batch_size, lengths):
 55 |         max_sent_length = max(lengths)
 56 |         num_sentences = batch_size
 57 |         indices = torch.arange(0, max_sent_length).unsqueeze(0).expand(num_sentences, -1)
 58 |         masks = indices < lengths.unsqueeze(1)
 59 |         masks = masks.type(torch.FloatTensor)
 60 |         if self.cuda:
 61 |             masks = masks.cuda()
 62 |         return masks
 63 | 
 64 |     def forward(self, num_sentences, context_embeddings, sent_lengths, dep):
 65 |         masks = self.init_masks(num_sentences, torch.LongTensor(sent_lengths))
 66 |         lstm_out, _ = self.dep_bilstm(context_embeddings, masks)
 67 | 
 68 |         if self.training:
 69 |             lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp)
 70 | 
 71 |         x_all_dep = self.mlp_arc_dep(lstm_out)
 72 |         x_all_head = self.mlp_arc_head(lstm_out)
 73 | 
 74 |         if self.training:
 75 |             x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
 76 |             x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
 77 | 
 78 |         x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
 79 |         x_all_head_splits = torch.split(x_all_head, 100, dim=2)
 80 | 
 81 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
 82 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
 83 | 
 84 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
 85 |         arc_logit = torch.squeeze(arc_logit, dim=3)
 86 | 
 87 |         x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
 88 |         x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
 89 | 
 90 |         rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
 91 | 
 92 |         self.arc_logits, self.rel_logits = arc_logit, rel_logit_cond
 93 | 
 94 |         heads, rels = dep[0], dep[1]
 95 |         loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist())  # compute the dep loss
 96 |         return loss, self.arc_logits
 97 | 
 98 |     def compute_dep_loss(self, true_arcs, true_rels, lengths):
 99 |         b, l1, l2 = self.arc_logits.size()
100 |         index_true_arcs = _model_var(
101 |             self.parameters(),
102 |             pad_sequence(true_arcs, padding_value=0, batch_first=True)
103 |         )
104 |         true_arcs = _model_var(
105 |             self.parameters(),
106 |             pad_sequence(true_arcs, padding_value=-1, batch_first=True)
107 |         )
108 | 
109 |         masks = []
110 |         for length in lengths:
111 |             mask = torch.FloatTensor([0] * length + [-1000] * (l2 - length))
112 |             mask = _model_var(self.parameters(), mask)
113 |             mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
114 |             masks.append(mask.transpose(0, 1))
115 |         length_mask = torch.stack(masks, 0)
116 |         arc_logits = self.arc_logits + length_mask
117 | 
118 |         arc_loss = F.cross_entropy(
119 |             arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
120 |             ignore_index=-1, reduction="sum")
121 | 
122 |         size = self.rel_logits.size()
123 |         output_logits = _model_var(self.parameters(), torch.zeros(size[0], size[1], size[3]))
124 |         for batch_index, (logits, arcs) in enumerate(list(zip(self.rel_logits, index_true_arcs))):
125 |             rel_probs = []
126 |             for i in range(l1):
127 |                 rel_probs.append(logits[i][int(arcs[i])])
128 |             rel_probs = torch.stack(rel_probs, dim=0)
129 |             output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
130 | 
131 |         b, l1, d = output_logits.size()
132 |         true_rels = _model_var(self.parameters(), pad_sequence(true_rels, padding_value=-1, batch_first=True))
133 | 
134 |         rel_loss = F.cross_entropy(
135 |             output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1, reduction="sum")
136 | 
137 |         loss = arc_loss + rel_loss
138 |         return loss
139 | 
140 |     def get_reps(self, context_embeddings, masks):
141 |         dep_lstm_out, dep_lstm_outputs = self.dep_bilstm.forward(context_embeddings, masks)
142 |         normed_weights = F.softmax(torch.cat([param for param in self.softmax_dep_weights]), dim=0)
143 |         normed_weights = torch.split(normed_weights, 1)  # split_size_or_sections=1, split_size=1)  # 0.3.0
144 |         dep_representations = self.dep_gamma * \
145 |                               sum([normed_weights[i] * dep_lstm_outputs[i] for i in
146 |                                    range(self.config.dep_num_lstm_layers)])
147 |         if self.training:
148 |             lstm_out = drop_sequence_sharedmask(dep_lstm_out, self.config.dropout_mlp)
149 | 
150 |         x_all_dep = self.mlp_arc_dep(dep_lstm_out)
151 |         x_all_head = self.mlp_arc_head(dep_lstm_out)
152 | 
153 |         if self.training:
154 |             x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
155 |             x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
156 | 
157 |         x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
158 |         x_all_head_splits = torch.split(x_all_head, 100, dim=2)
159 | 
160 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
161 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
162 | 
163 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
164 |         arc_logit = torch.squeeze(arc_logit, dim=3)
165 |         return dep_representations, arc_logit
166 | 
167 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | from torch import nn
  4 | from .layer import MyLSTM, NonLinear, Biaffine
  5 | 
  6 | 
  7 | def _model_var(parameters, x):
  8 |     p = next(iter(filter(lambda p: p.requires_grad, parameters)))
  9 |     if p.is_cuda:
 10 |         x = x.cuda(p.get_device())
 11 |     return torch.autograd.Variable(x)
 12 | 
 13 | 
 14 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb):
 15 |     batch_size, seq_length, _ = word_embeddings.size()
 16 |     # tensor.new: build a tensor with the same data type
 17 |     word_masks = word_embeddings.data.new(batch_size,
 18 |                                           seq_length).fill_(1 - dropout_emb)
 19 |     word_masks = torch.Tensor(torch.bernoulli(word_masks))
 20 |     word_masks.requires_grad = False
 21 |     tag_masks = tag_embeddings.data.new(batch_size,
 22 |                                         seq_length).fill_(1 - dropout_emb)
 23 |     tag_masks = torch.Tensor(torch.bernoulli(tag_masks))
 24 |     tag_masks.requires_grad = False
 25 |     scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12)
 26 |     word_masks *= scale
 27 |     tag_masks *= scale
 28 |     # unsqueeze: Returns a new tensor with a dimension of size one inserted at the specified position.
 29 |     word_masks = word_masks.unsqueeze(dim=2)  # ?
 30 |     tag_masks = tag_masks.unsqueeze(dim=2)
 31 |     word_embeddings = word_embeddings * word_masks
 32 |     tag_embeddings = tag_embeddings * tag_masks
 33 | 
 34 |     return word_embeddings, tag_embeddings
 35 | 
 36 | 
 37 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
 38 |     if batch_first:
 39 |         inputs = inputs.transpose(0, 1)
 40 |     seq_length, batch_size, hidden_size = inputs.size()
 41 |     drop_masks = torch.Tensor(batch_size, hidden_size).fill_(1 - dropout)
 42 |     drop_masks = torch.Tensor(torch.bernoulli(drop_masks)).type(inputs.type())
 43 |     drop_masks.requires_grad = False
 44 |     drop_masks = drop_masks / (1 - dropout)
 45 |     drop_masks = torch.unsqueeze(drop_masks,
 46 |                                  dim=2).expand(-1, -1,
 47 |                                                seq_length).permute(2, 0, 1)
 48 |     inputs = inputs * drop_masks
 49 | 
 50 |     return inputs.transpose(1, 0)
 51 | 
 52 | 
 53 | class ParserModel(nn.Module):  # build a biaffine parser model
 54 |     def __init__(self, vocab, config, pretrained_embedding):
 55 |         super(ParserModel, self).__init__()
 56 |         self.config = config
 57 |         self.word_embed = nn.Embedding(vocab.vocab_size,
 58 |                                        config.word_dims,
 59 |                                        padding_idx=0)
 60 |         self.extword_embed = nn.Embedding(vocab.extvocab_size,
 61 |                                           config.word_dims,
 62 |                                           padding_idx=0)
 63 |         self.tag_embed = nn.Embedding(vocab.tag_size,
 64 |                                       config.tag_dims,
 65 |                                       padding_idx=0)
 66 | 
 67 |         word_init = np.zeros((vocab.vocab_size, config.word_dims),
 68 |                              dtype=np.float32)
 69 |         self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
 70 | 
 71 |         tag_init = np.random.randn(vocab.tag_size,
 72 |                                    config.tag_dims).astype(np.float32)
 73 |         self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))
 74 | 
 75 |         self.extword_embed.weight.data.copy_(
 76 |             torch.from_numpy(pretrained_embedding))
 77 |         self.extword_embed.weight.requires_grad = False
 78 | 
 79 |         self.lstm = MyLSTM(
 80 |             input_size=config.word_dims + config.tag_dims,
 81 |             hidden_size=config.lstm_hiddens,
 82 |             num_layers=config.lstm_layers,
 83 |             batch_first=True,
 84 |             bidirectional=True,
 85 |             dropout_in=config.dropout_lstm_input,
 86 |             dropout_out=config.dropout_lstm_hidden,
 87 |         )
 88 | 
 89 |         self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens,
 90 |                                      hidden_size=config.mlp_arc_size +
 91 |                                      config.mlp_rel_size,
 92 |                                      activation=nn.LeakyReLU(0.1))
 93 |         self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens,
 94 |                                       hidden_size=config.mlp_arc_size +
 95 |                                       config.mlp_rel_size,
 96 |                                       activation=nn.LeakyReLU(0.1))
 97 | 
 98 |         self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
 99 |         self.arc_num = int(config.mlp_arc_size / 100)  # config: 500
100 |         self.rel_num = int(config.mlp_rel_size / 100)  # config: 100
101 | 
102 |         self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size,
103 |                                      1, bias=(True, False))
104 |         self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size,
105 |                                      vocab.rel_size, bias=(True, True))
106 | 
107 |     def forward(
108 |             self, words, extwords, tags,
109 |             masks):  # words [batch, max_sentence_length], padding with zeros
110 |         # x = (batch size, sequence length, dimension of embedding)
111 |         x_word_embed = self.word_embed(words)
112 |         x_extword_embed = self.extword_embed(extwords)
113 |         x_embed = x_word_embed + x_extword_embed
114 |         x_tag_embed = self.tag_embed(tags)
115 | 
116 |         if self.training:
117 |             x_embed, x_tag_embed = drop_input_independent(
118 |                 x_embed, x_tag_embed, self.config.dropout_emb)
119 | 
120 |         x_lexical = torch.cat((x_embed, x_tag_embed), dim=2)
121 | 
122 |         outputs, _ = self.lstm(x_lexical, masks, None)
123 |         outputs = outputs.transpose(1, 0)
124 | 
125 |         if self.training:
126 |             outputs = drop_sequence_sharedmask(outputs,
127 |                                                self.config.dropout_mlp)
128 | 
129 |         x_all_dep = self.mlp_arc_dep(outputs)
130 |         x_all_head = self.mlp_arc_head(outputs)
131 | 
132 |         if self.training:
133 |             x_all_dep = drop_sequence_sharedmask(x_all_dep,
134 |                                                  self.config.dropout_mlp)
135 |             x_all_head = drop_sequence_sharedmask(x_all_head,
136 |                                                   self.config.dropout_mlp)
137 | 
138 |         x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2)
139 |         x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2)
140 | 
141 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
142 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
143 | 
144 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
145 |         arc_logit = torch.squeeze(arc_logit, dim=3)
146 | 
147 |         x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
148 |         x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
149 | 
150 |         rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
151 |         return arc_logit, rel_logit_cond
152 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/pytorch/util.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from torch.autograd import Variable
 4 | 
 5 | 
 6 | def block_orth_normal_initializer(input_size, output_size):
 7 |     weight = []
 8 |     for o in output_size:
 9 |         for i in input_size:
10 |             param = torch.FloatTensor(o, i)
11 |             torch.nn.init.orthogonal_(param)
12 |             weight.append(param)
13 |     return torch.cat(weight)
14 | 
15 | 
def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights):
    """Pad a batch of samples into fixed-size tensors.

    Args:
        batch_x: per-sample features; item ``[1]`` holds the word ids and
            item ``[2]`` the predicate indicators.
        batch_y: per-sample answers; item ``[0]`` holds the label ids.
        batch_lengths: real length of every sample.
        batch_weights: per-sample weights (only used to align the iteration).

    Returns:
        ``(words, predicates, labels, lengths, masks, padding_answers)`` where
        the tensors are zero padded to the longest sample, ``labels`` is a
        list of unpadded int32 numpy arrays and ``lengths`` a LongTensor.
    """
    batch_size = len(batch_x)  # batch size
    max_len = max(batch_lengths)

    def _zeros(tensor_type):
        return Variable(tensor_type(batch_size, max_len).zero_(),
                        requires_grad=False)  # padding with 0

    words = _zeros(torch.LongTensor)
    predicates = _zeros(torch.LongTensor)
    masks = _zeros(torch.Tensor)
    padding_answers = _zeros(torch.LongTensor)
    labels, lengths = [], []

    samples = zip(batch_x, batch_y, batch_lengths, batch_weights)
    for row, (s_words, s_answer, s_length, _) in enumerate(samples):
        lengths.append(s_length)
        rel = np.zeros((s_length), dtype=np.int32)
        for col in range(s_length):
            words[row, col] = s_words[1][col]  # word
            predicates[row, col] = s_words[2][col]  # predicate
            gold = s_answer[0][col]
            rel[col] = gold
            padding_answers[row, col] = gold
            masks[row, col] = 1
        labels.append(rel)

    return words, predicates, labels, torch.LongTensor(
        lengths), masks, padding_answers
49 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/configuration.py:
--------------------------------------------------------------------------------
 1 | ''' Configuration for experiments.
 2 | '''
 3 | import json
 4 | from argparse import Namespace
 5 | 
 6 | 
 7 | def get_config(config_filepath):
 8 |     with open(config_filepath, 'r') as config_file:
 9 |         conf = json.load(config_file, object_hook=lambda d: Namespace(**d))
10 |     return conf
11 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/conll_utils.py:
--------------------------------------------------------------------------------
 1 | def bio_to_se(labels):
 2 |     slen = len(labels)
 3 |     new_labels = []
 4 |     has_opening = False
 5 |     for i in range(slen):
 6 |         label = labels[i]
 7 |         if label == 'O':
 8 |             new_labels.append('*')
 9 |             continue
10 |         new_label = '*'
11 |         if label[0] == 'B' or i == 0 or label[1:] != labels[i - 1][1:]:
12 |             new_label = '(' + label[2:] + new_label
13 |             has_opening = True
14 |         if i == slen - 1 or labels[i + 1][0] == 'B' or label[1:] != labels[i + 1][1:]:
15 |             new_label = new_label + ')'
16 |             has_opening = False
17 |         new_labels.append(new_label)
18 | 
19 |     if has_opening:
20 |         ''' logging '''
21 |         print("Has unclosed opening: {}".format(labels))
22 |     return new_labels
23 | 
24 | 
def print_sentence_to_conll(fout, tokens, labels):
    """Write one sentence as fixed-width CoNLL columns followed by a blank line.

    Args:
        fout: writable text stream.
        tokens: list of token strings (first column, left-justified to 15).
        labels: list of label columns, each as long as ``tokens``
            (right-justified to 15).
    """
    num_tokens = len(tokens)
    for column in labels:
        assert len(column) == num_tokens
    for row, token in enumerate(tokens):
        cells = [token.ljust(15)]
        cells.extend(column[row].rjust(15) for column in labels)
        cells.append("\n")
        fout.write("".join(cells))
    fout.write("\n")
34 | 
35 | 
def print_to_conll(pred_labels, gold_props_file, output_filename):
    """Write predicted label columns next to the tokens of a gold props file.

    The gold file holds one token per line (first field) followed by one
    field per proposition; sentences are separated by blank lines. For each
    sentence the next ``number of propositions`` entries of ``pred_labels``
    are written as label columns.

    Args:
        pred_labels: flat list of predicted label columns.
        gold_props_file: path of the gold props file to read.
        output_filename: path of the CoNLL file to write.
    """
    seq_ptr = 0
    num_props_for_sentence = 0
    tokens_buf = []

    # Both files are closed even when writing fails half way.
    with open(output_filename, 'w') as fout, open(gold_props_file, 'r') as gold:
        for line in gold:
            info = line.split()
            if not info:
                # Blank line: flush the finished sentence. Repeated or leading
                # blank lines are skipped instead of raising IndexError.
                if tokens_buf:
                    print_sentence_to_conll(
                        fout, tokens_buf,
                        pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
                    seq_ptr += num_props_for_sentence
                    tokens_buf = []
                    num_props_for_sentence = 0
                continue
            num_props_for_sentence = len(info) - 1
            tokens_buf.append(info[0])

        # Output last sentence.
        if tokens_buf:
            print_sentence_to_conll(
                fout, tokens_buf,
                pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
61 | 
62 | 
def print_gold_to_conll(data, word_dict, label_dict, output_filename):
    """Write the gold labels of a tokenised data set in CoNLL props format.

    Consecutive samples with identical words are treated as the same sentence
    and merged: every sample containing a ``'B-V'`` label contributes one
    label column, and the token column shows the word at each predicate
    position and ``'-'`` elsewhere. Sentences without any ``'B-V'`` sample
    are not written.

    Args:
        data: 4-tuple ``(x, y, num_tokens, _)`` of token features, gold label
            ids and sentence lengths.
        word_dict: dictionary exposing ``idx2str`` for words.
        label_dict: dictionary exposing ``idx2str`` for labels.
        output_filename: path of the file to write.
    """
    props_buf = []
    labels_buf = []
    tokens_buf = []
    prev_words = None  # None = no sentence buffered yet

    x, y, num_tokens, _ = data
    # The file is closed even if writing a sentence fails.
    with open(output_filename, 'w') as fout:

        def flush():
            # Emit the buffered sentence, if it has at least one predicate.
            if props_buf:
                predicate_ids = set(props_buf)
                tokens = [w if i in predicate_ids else '-' for i, w in enumerate(tokens_buf)]
                print_sentence_to_conll(fout, tokens, labels_buf)

        for (sent, gold, slen) in zip(x, y, num_tokens):
            words = [word_dict.idx2str[w[0]] for w in sent[:slen]]
            labels = [label_dict.idx2str[l] for l in gold[:slen]]

            concat_words = ' '.join(words)
            if concat_words != prev_words:
                flush()
                # Always start from clean buffers on a sentence change. The
                # buffers used to be reset only after a flush, so a sentence
                # without predicates left its tokens behind for the next one.
                props_buf = []
                labels_buf = []
                tokens_buf = list(words)
                prev_words = concat_words

            if 'B-V' in labels:
                props_buf.append(labels.index('B-V'))
                labels_buf.append(bio_to_se(labels))

        flush()
98 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constants.py:
--------------------------------------------------------------------------------
 1 | from os.path import join
 2 | import os
 3 | import random
 4 | 
# Directory three levels above the directory that contains this module.
ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../')

# Seed Python's global ``random`` generator at import time.
RANDOM_SEED = 12345
random.seed(RANDOM_SEED)

# Path of the evaluation shell script, one level above ROOT_DIR.
SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh')

# Special tokens and labels.
START_MARKER = '<S>'
END_MARKER = '</S>'
PADDING_TOKEN = '*PAD*'
UNKNOWN_TOKEN = '*UNKNOWN*'
NULL_LABEL = 'O'

# Scratch directory, one level above ROOT_DIR.
TEMP_DIR = join(ROOT_DIR, '../temp')

# assert os.path.exists(SRL_CONLL_EVAL_SCRIPT)
# Import-time side effect: make sure the scratch directory exists.
if not os.path.exists(TEMP_DIR):
    os.makedirs(TEMP_DIR)
23 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_extraction.py:
--------------------------------------------------------------------------------
  1 | import nltk
  2 | import sys
  3 | import numpy as np
  4 | import random
  5 | 
  6 | from .dictionary import Dictionary
  7 | from collections import OrderedDict
  8 | from nltk.tree import Tree
  9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN
 10 | # from .reader import list_of_words_to_ids
 11 | 
 12 | 
# NOTE(review): presumably a marker prepended to constituent labels; it is not
# referenced in the visible part of this module - confirm against callers.
PREFIX = "--PTB-CONS-LABEL--"
 14 | 
 15 | 
 16 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 17 |     ids = []
 18 |     for s in list_of_words:
 19 |         # s = s.encode('utf-8')  # unicode -> utf-8
 20 |         if s is None:
 21 |             ids.append(-1)
 22 |             continue
 23 |         if lowercase:
 24 |             s = s.lower()
 25 |         if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 26 |             s = UNKNOWN_TOKEN
 27 |         ids.append(dictionary.add(s))
 28 |     return ids
 29 | 
 30 | 
 31 | class constituent_tree():
 32 |     def __init__(self, sentence, words, tree):
 33 |         self.sentence = sentence
 34 |         self.words = words
 35 |         self.tree = tree
 36 | 
 37 |         self.heads = []
 38 |         self.non_terminal_nodes = []  # cons labels, e.g., NP, VP
 39 |         self.terminal_nodes = []  # words
 40 |         self.indicator = []  # 1 no terminal, 2 terminal
 41 | 
 42 |         self.non_terminal_nodes_idx = []
 43 |         self.non_terminal_nodes_char_idx = []
 44 |         self.terminal_node_idx = []
 45 |         self.terminal_node_char_idx = []
 46 | 
 47 |         self.sentence_length = len(words)
 48 |         self.input_length = -1
 49 |         self.sentence_index = -1
 50 | 
 51 |     def pos(self):
 52 |         """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]"""
 53 |         return self.tree.pos()
 54 | 
 55 |     def traverse_tree(self, tree,
 56 |                       non_terminal_nodes, terminal_nodes,
 57 |                       non_terminal_nodes_idx, terminal_nodes_idx,
 58 |                       indicator,
 59 |                       heads,
 60 |                       parent,
 61 |                       non_terminal_dict, word_dict, pos,
 62 |                       word_embeddings):
 63 |         # print(tree)
 64 |         # print("subtree", subtree)
 65 |         if tree.height() > 2:
 66 |             non_terminal = tree.label()
 67 | 
 68 |             non_terminal_nodes.append(non_terminal)
 69 |             non_terminal_nodes_idx.append(non_terminal_dict.add(non_terminal))
 70 |             indicator.append(1)
 71 |             heads.append(parent - 1)
 72 |         else:
 73 |             # print("YY", subtree)
 74 |             terminal = tree[0]  # word
 75 |             terminal_nodes.append(terminal)
 76 |             terminal_nodes_idx.append(
 77 |                 constituent_tree.add_word(terminal, word_dict, word_embeddings)
 78 |             )
 79 |             indicator.append(2)
 80 | 
 81 |             pos.add(tree.label())
 82 |             heads.append(parent - 1)
 83 |         if tree.height() <= 2:  # 2 == ["V", Tree("Chased")]
 84 |             return
 85 |         parent = len(non_terminal_nodes) + len(terminal_nodes)
 86 |         for i, subtree in enumerate(tree):
 87 |             self.traverse_tree(subtree,
 88 |                                non_terminal_nodes, terminal_nodes,
 89 |                                non_terminal_nodes_idx, terminal_nodes_idx,
 90 |                                indicator,
 91 |                                heads, parent,
 92 |                                non_terminal_dict, word_dict, pos,
 93 |                                word_embeddings)
 94 | 
 95 |     @staticmethod
 96 |     def add_unknown_labels(label, word_embeddings):
 97 |         if label not in word_embeddings:
 98 |             embedding_size = len(word_embeddings[PADDING_TOKEN])
 99 |             word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)])
100 | 
101 |     @staticmethod
102 |     def add_word(word, word_dict, word_embeddings):
103 |         if word not in word_embeddings:
104 |             word = UNKNOWN_TOKEN
105 |         idx = word_dict.add(word)
106 |         return idx
107 | 
108 |     @staticmethod
109 |     def get_node_char_idx(words, char_dict, lowercase=False):
110 |         max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
111 |         single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int)
112 |         for i, word in enumerate(words):
113 |             single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
114 |         return single_sample_char_tokens
115 | 
116 |     def generate_adjacent(self, non_terminal_dict, word_dict, char_dict, pos, word_embeddings):
117 |         root_label = self.tree.label()
118 |         self.traverse_tree(self.tree,
119 |                            self.non_terminal_nodes, self.terminal_nodes,
120 |                            self.non_terminal_nodes_idx, self.terminal_node_idx,
121 |                            self.indicator,
122 |                            self.heads, len(self.heads),
123 |                            non_terminal_dict, word_dict, pos,
124 |                            word_embeddings
125 |                            )
126 |         self.input_length = len(self.non_terminal_nodes) + len(self.terminal_nodes)
127 |         self.sentence_index = self.input_length - self.sentence_length - 1
128 | 
129 |         self.non_terminal_nodes_char_idx = constituent_tree.get_node_char_idx(
130 |             self.non_terminal_nodes, char_dict
131 |         )
132 |         self.terminal_node_char_idx = constituent_tree.get_node_char_idx(
133 |             self.terminal_nodes, char_dict
134 |         )
135 | 
136 | 
def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings):
    """Read bracketed constituency trees and flatten each of them.

    The file holds one tree per block of lines, blocks being separated by
    blank lines. Trees are keyed by their space-joined, bracket-normalised
    leaves, so a later tree replaces an earlier one with the same words.

    Args:
        file_path: path of the tree file.
        word_dict: dictionary that assigns word ids.
        char_dict: dictionary that assigns character ids.
        word_embeddings: container of words that have an embedding.

    Returns:
        ``(cons_trees, non_terminal_dict, pos_dict)``.
    """
    data = []
    with open(file_path, 'r') as input_file:
        sentence = ""
        for line in input_file:
            line = line.strip()
            if line == "":
                # Repeated blank lines must not produce empty tree strings.
                if sentence:
                    data.append(sentence)
                sentence = ""
                continue
            if ' ' not in line:  # avoid the split of leave node of it's PoS
                line = ' ' + line
            sentence += line
        # Keep the last tree when the file does not end with a blank line.
        if sentence:
            data.append(sentence)
        print("Read {} sentence from {}".format(len(data), file_path))

    def reset_sentence(sentence):
        # Replace bracket tokens (literal or PTB-escaped) by '-' in place.
        for i in range(len(sentence)):
            if sentence[i] in ["[", "]", "(", ")", "{", "}", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"]:
                sentence[i] = '-'

    cons_trees = OrderedDict()
    for sentence in data:
        tree = Tree.fromstring(sentence)
        words = tree.leaves()
        reset_sentence(words)
        sentence = ' '.join(words)
        cons_trees[sentence] = constituent_tree(sentence, words, tree)

    pos_dict = Dictionary(padding_token=PADDING_TOKEN)
    non_terminal_dict = Dictionary(padding_token=PADDING_TOKEN)
    for tree in cons_trees.values():
        tree.generate_adjacent(non_terminal_dict, word_dict, char_dict, pos_dict, word_embeddings)

    return cons_trees, non_terminal_dict, pos_dict
172 | 
173 | 
174 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/constituent_reader.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import codecs
  3 | import numpy as np
  4 | 
  5 | 
  6 | from sortedcontainers import SortedSet
  7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL
  8 | from .dictionary import Dictionary
  9 | 
 10 | 
 11 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 12 |     ids = []
 13 |     for s in list_of_words:
 14 |         # s = s.encode('utf-8')  # unicode -> utf-8
 15 |         if s is None:
 16 |             ids.append(-1)
 17 |             continue
 18 |         if lowercase:
 19 |             s = s.lower()
 20 |         if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 21 |             s = UNKNOWN_TOKEN
 22 |         ids.append(dictionary.add(s))
 23 |     return ids
 24 | 
 25 | 
 26 | class constituent_sentence():
 27 |     def __init__(self, obj):
 28 |         self.sentence = obj["sentence"]
 29 |         self.constituent_spans = obj["constituents"]
 30 |         self.max_span_width = 30
 31 |         self.reset_sentence()
 32 | 
 33 |     def reset_sentence(self):
 34 |         for i in range(len(self.sentence)):
 35 |             if self.sentence[i] in ["[", "]", "(", ")", "{", "}", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"]:
 36 |                 self.sentence[i] = '-'
 37 |             self.sentence[i] = self.sentence[i].replace("\\/", "/")
 38 | 
 39 |     def tokenize_cons_spans(self, dictionary, max_cons_width=60):
 40 |         cons_span = []
 41 |         set_cons_span = set()
 42 |         for cons_s in self.constituent_spans:  # remove self-loop V-V
 43 |             cons_start, cons_end, cons_label = cons_s
 44 |             if cons_label in ["TOP", "S"]:  # todo: add some constrains here
 45 |                 continue
 46 |             if cons_end - cons_start + 1 >= max_cons_width:
 47 |                 continue
 48 |             if (cons_start, cons_end) not in set_cons_span:
 49 |                 set_cons_span.add((cons_start, cons_end))
 50 |                 cons_span.append([int(cons_start), int(cons_end), int(dictionary.add(cons_label))])
 51 |             else:
 52 |                 # print("duplicate span of", (cons_start, cons_end, cons_label), '\n', self.sentence)
 53 |                 pass
 54 |         if len(cons_span) == 0:  # if the sentence has no arguments.
 55 |             return [[], [], []]
 56 |         tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels = \
 57 |             zip(*cons_span)
 58 |         return tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels
 59 | 
 60 | 
 61 | def read_constituent_file(file_path):
 62 |     sentences = []
 63 |     with codecs.open(file_path, encoding="utf8") as f:
 64 |         for line in f.readlines():
 65 |             sen = json.loads(line)
 66 |             cons_sen = constituent_sentence(sen)
 67 |             sentences.append(cons_sen)
 68 |     print("{} total constituent sentences number {}".format(file_path, len(sentences)))
 69 |     return sentences
 70 | 
 71 | 
 72 | def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False):
 73 |     sample_word_tokens = [list_of_words_to_ids(
 74 |         sent.sentence, word_dict, lowercase, pretrained_word_embedding) for sent in samples]
 75 |     # for the character
 76 |     sample_char_tokens = []
 77 |     for sent in samples:
 78 |         words = sent.sentence
 79 |         max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
 80 |         single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int)
 81 |         for i, word in enumerate(words):
 82 |             single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
 83 |         # Add the sample char tokens into the sample_char_tokens
 84 |         sample_char_tokens.append(single_sample_char_tokens)
 85 |     sample_texts = [sent.sentence for sent in samples]
 86 |     sample_lengths = [len(sent.sentence) for sent in samples]
 87 |     sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples]
 88 |     return list(zip(sample_lengths, sample_texts, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens))
 89 | 
 90 | 
 91 | def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None):
 92 |     raw_cons_sentences = read_constituent_file(file_path)
 93 |     cons_label_dict = Dictionary()
 94 |     cons_label_dict.set_unknown_token(NULL_LABEL)
 95 | 
 96 |     # tokenized the data
 97 |     if word_dict.accept_new is False:
 98 |         word_dict.accept_new = True
 99 |     if char_dict.accept_new is False:
100 |         char_dict.accept_new = True
101 |     cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict,
102 |                                       False, word_embeddings)
103 |     # word_dict.accept_new = False
104 |     # char_dict.accept_new = False
105 |     # cons_label_dict.accept_new = False
106 | 
107 |     print("="*10, "Constituent Info", "="*10)
108 |     print("Extract {} tags".format(cons_label_dict.size()))
109 |     # print("Extract {} words and {} tags".format(word_dict.size(), cons_label_dict.size()))
110 |     print("Max sentence length: {}".format(max([s[0] for s in cons_samples])))
111 |     return cons_samples, word_dict, char_dict, cons_label_dict
112 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/dictionary.py:
--------------------------------------------------------------------------------
 1 | ''' Bidirectional dictionary that maps between words and ids.
 2 | '''
 3 | 
 4 | 
 5 | class Dictionary(object):
 6 |     def __init__(self, padding_token=None, unknown_token=None):
 7 |         self.str2idx = {}
 8 |         self.idx2str = []
 9 | 
10 |         self.accept_new = True
11 |         self.padding_token = None
12 |         self.padding_id = None
13 |         self.unknown_token = None
14 |         self.unknown_id = None
15 |         if padding_token is not None:  # add the padding info into the dictionary
16 |             self.set_padding_token(padding_token)
17 |         if unknown_token is not None:
18 |             self.set_unknown_token(unknown_token)
19 | 
20 |     def set_padding_token(self, padding_token):
21 |         self.padding_token = padding_token
22 |         self.padding_id = self.add(self.padding_token)
23 | 
24 |     def set_unknown_token(self, unknown_token):
25 |         self.unknown_token = unknown_token
26 |         self.unknown_id = self.add(self.unknown_token)
27 | 
28 |     def add(self, new_str):
29 |         if new_str not in self.str2idx:
30 |             if self.accept_new:
31 |                 self.str2idx[new_str] = len(self.idx2str)
32 |                 self.idx2str.append(new_str)
33 |             else:
34 |                 if new_str == "C-ADV":
35 |                     return self.str2idx["O"]
36 |                 if self.unknown_id is None:
37 |                     raise LookupError(
38 |                         'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str)
39 |                 return self.unknown_id
40 | 
41 |         return self.str2idx[new_str]
42 | 
43 |     def add_all(self, str_list):
44 |         return [self.add(s) for s in str_list]
45 | 
46 |     def get_index(self, input_str):
47 |         if input_str in self.str2idx:
48 |             return self.str2idx[input_str]
49 |         return None
50 | 
51 |     def size(self):
52 |         return len(self.idx2str)
53 | 
54 |     def save(self, filename):
55 |         with open(filename, 'w') as f:
56 |             for s in self.idx2str:
57 |                 f.write(s + '\n')
58 |             f.close()
59 | 
60 |     def load(self, filename):
61 |         with open(filename, 'r') as f:
62 |             for line in f:
63 |                 line = line.strip()
64 |                 if line != '':
65 |                     self.add(line)
66 |             f.close()
67 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/evaluation.py:
--------------------------------------------------------------------------------
 1 | ''' Framework independent evaluator. Not in use yet.
 2 | '''
 3 | import numpy
 4 | import os
 5 | from os.path import join
 6 | # import subprocess
 7 | from .constants import ROOT_DIR
 8 | from .conll_utils import print_gold_to_conll
 9 | # from .measurements import Timer
10 | 
11 | 
class TaggerEvaluator(object):
    """Tracks token-level accuracy of predictions against stored gold data.

    Each item of ``data`` is indexed as ``item[1]`` (gold array, flattened via
    ``reshape(shape[1])``) and ``item[2]`` (expected prediction length).
    """

    def __init__(self, data):
        self.data = data
        self.has_best = False
        self.best_accuracy = 0.0

    def compute_accuracy(self, predictions):
        """Store the percentage of matching labels in ``self.accuracy`` and print it.

        ``predictions`` must be in the same order as ``self.data``; every
        prediction's length is asserted against the stored length.
        """
        expected_lengths = [sent[2] for sent in self.data]
        for predicted, expected_length in zip(predictions, expected_lengths):
            assert len(predicted) == expected_length
        flat_predictions = numpy.concatenate(predictions)
        gold_rows = [sent[1].reshape(sent[1].shape[1]) for sent in self.data]
        flat_gold = numpy.concatenate(gold_rows)
        num_correct = numpy.equal(flat_predictions, flat_gold).sum()
        num_total = flat_gold.shape[0]
        self.accuracy = (100.0 * num_correct) / num_total
        print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct,
                                                num_total))

    def evaluate(self, predictions):
        """Compute accuracy and update ``has_best`` / ``best_accuracy``."""
        self.compute_accuracy(predictions)
        improved = self.accuracy > self.best_accuracy
        self.has_best = improved
        if improved:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
41 | 
class PropIdEvaluator(object):
    """Scores identification of one target label with precision, recall and F1.

    ``data`` is unpacked as ``(_, y, _, weights)``: ``y`` holds the gold label
    ids and ``weights`` is multiplied element-wise into every count
    (presumably a padding mask; verify against the caller).
    ``use_se_marker`` is accepted but unused.
    """

    def __init__(self, data, label_dict, target_label='V',
                 use_se_marker=False):
        self.data = data
        self.label_dict = label_dict
        self.target_label_id = label_dict.str2idx[target_label]
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Set ``precision``, ``recall`` and ``accuracy`` (the F1 score), in percent.

        A metric whose denominator is zero is reported as 0.0 instead of NaN.
        """
        _, y, _, weights = self.data
        identified = numpy.equal(predictions, self.target_label_id)
        num_correct = numpy.sum(
            numpy.logical_and(numpy.equal(predictions, y), identified) * weights)
        num_identified = numpy.sum(identified * weights)
        num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights)
        self.precision = 100.0 * num_correct / num_identified if num_identified > 0 else 0.0
        self.recall = 100.0 * num_correct / num_gold if num_gold > 0 else 0.0
        denominator = self.precision + self.recall
        self.accuracy = 2 * self.precision * self.recall / denominator if denominator > 0 else 0.0
        print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format(
            self.accuracy, self.precision, self.recall))

    def evaluate(self, predictions):
        """Compute the metrics and update ``has_best`` / ``best_accuracy``."""
        self.compute_accuracy(predictions)
        self.has_best = self.accuracy > self.best_accuracy
        if self.has_best:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
75 | 
76 | 
class SRLEvaluator(TaggerEvaluator):
    """Placeholder SRL evaluator whose accuracy computation is not implemented.

    ``evaluate`` is inherited from ``TaggerEvaluator`` and calls
    ``compute_accuracy``, which terminates the interpreter here.
    """

    def __init__(self):
        # Does not call TaggerEvaluator.__init__, so ``self.data`` is never set.
        self.best_accuracy = -1.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Print the marker text ``exit()`` and then call ``exit()``."""
        print("exit()")
        exit()
85 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/features.py:
--------------------------------------------------------------------------------
 1 | def get_srl_features(sentences, config, feature_dicts=None):
 2 |     ''' TODO: Support adding more features.
 3 |     '''
 4 |     feature_names = config.features
 5 |     feature_sizes = config.feature_sizes
 6 |     use_se_marker = config.use_se_marker
 7 | 
 8 |     features = []
 9 |     feature_shapes = []
10 |     for fname, fsize in zip(feature_names, feature_sizes):
11 |         if fname == "predicate":
12 |             offset = int(use_se_marker)
13 |             offset = 1  # pad is in the position 0
14 |             features.append([[int((i == sent[2]) + offset) for i in range(len(sent[1]))] for sent in sentences])
15 |             feature_shapes.append([2, fsize])
16 |     return (zip(*features), feature_shapes)
17 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/inference.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | 
 3 | 
 4 | def get_transition_params(label_strs):
 5 |     """Construct transtion scoresd (0 for allowed, -inf for invalid).
 6 |       Args:
 7 |         label_strs: A [num_tags,] sequence of BIO-tags.
 8 |       Returns:
 9 |         A [num_tags, num_tags] matrix of transition scores.
10 |     """
11 |     num_tags = len(label_strs)
12 |     transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32)
13 |     for i, prev_label in enumerate(label_strs):
14 |         for j, label in enumerate(label_strs):
15 |             if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]:
16 |                 transition_params[i, j] = numpy.NINF
17 |     return transition_params
18 | 
19 | 
def viterbi_decode(score, transition_params):
    """ Adapted from Tensorflow implementation.
        Find the highest scoring tag sequence with plain NumPy.
        Args:
            score: A [seq_len, num_tags] matrix of unary potentials.
            transition_params: A [num_tags, num_tags] matrix of binary potentials.
        Returns:
            viterbi: A [seq_len] list of integers containing the highest scoring tag
              indices.
            viterbi_score: The score of that sequence.
    """
    num_steps = score.shape[0]
    trellis = numpy.zeros_like(score)
    backpointers = numpy.zeros_like(score, dtype=numpy.int32)
    trellis[0] = score[0]
    for step in range(1, num_steps):
        # candidates[prev, cur] = best score ending in prev + transition prev -> cur
        candidates = trellis[step - 1][:, numpy.newaxis] + transition_params
        backpointers[step] = numpy.argmax(candidates, 0)
        trellis[step] = score[step] + numpy.max(candidates, 0)
    final_scores = trellis[-1]
    # Walk the back-pointers from the last step down to step 1.
    path = [numpy.argmax(final_scores)]
    for step_pointers in backpointers[:0:-1]:
        path.append(step_pointers[path[-1]])
    path.reverse()
    return path, numpy.max(final_scores)
45 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/io_utils.py:
--------------------------------------------------------------------------------
  1 | from google.protobuf.internal import encoder
  2 | 
# Varint encoder obtained from protobuf's internal encoder module; it is called
# as _EncodeVarint(write_callback, value) by write_delimited_to below.
_EncodeVarint = encoder._VarintEncoder()
  4 | 
  5 | 
  6 | def write_delimited_to(out_file, message):
  7 |     msg_size = message.ByteSize()
  8 |     pieces = []
  9 |     _EncodeVarint(pieces.append, msg_size)
 10 |     out_file.write(b"".join(pieces))
 11 |     out_file.write(message.SerializeToString())
 12 | 
 13 | 
 14 | def read_gold_props(gold_props_file):
 15 |     """ Read gold predicates from CoNLL-formatted file.
 16 |   """
 17 |     gold_props = []
 18 |     props = []
 19 |     with open(gold_props_file, 'r') as f:
 20 |         for line in f:
 21 |             line = line.strip()
 22 |             if line == '':
 23 |                 gold_props.append(props)
 24 |                 props = []
 25 |             else:
 26 |                 props.append(line.split()[0])
 27 |         f.close()
 28 |     if len(props) > 0:
 29 |         gold_props.append(props)
 30 |     return gold_props
 31 | 
 32 | 
 33 | def write_predprops_to(predictions,
 34 |                        label_dict,
 35 |                        input_file,
 36 |                        output_file,
 37 |                        gold_props_file=None,
 38 |                        output_props_file=None):
 39 |     """ Write predicted predicate information to files.
 40 | 
 41 |       Arguments:
 42 |         predictions: Predictions from the predicate identification model.
 43 |                       Is a numpy array of size [num_sentences, max_sentence_length].
 44 |         label_dict: Label dictionary.
 45 |         input_file: Input sequential tagging file.
 46 |         output_file: Output SRL file with identified predicates.
 47 |         gold_props_file: Input file with gold predicates in CoNLL format.
 48 |         output_props_file: Output SRL file with identified predicates, in CoNLL format.
 49 |   """
 50 | 
 51 |     fin = open(input_file, 'r')
 52 |     fout = open(output_file, 'w')
 53 | 
 54 |     if output_props_file is not None and output_props_file != '':
 55 |         fout_props = open(output_props_file, 'w')
 56 |     else:
 57 |         fout_props = None
 58 | 
 59 |     if gold_props_file is not None and gold_props_file != '':
 60 |         gold_props = read_gold_props(gold_props_file)
 61 |         print(len(gold_props), len(predictions))
 62 |         assert len(gold_props) == len(predictions)
 63 |     else:
 64 |         gold_props = None
 65 | 
 66 |     sent_id = 0
 67 |     for line in fin:
 68 |         # Read original sentence from input file.
 69 |         raw_sent = line.split('|||')[0].strip()
 70 |         tokens = raw_sent.split(' ')
 71 |         slen = len(tokens)
 72 |         pred = predictions[sent_id, :slen]
 73 |         props = []
 74 | 
 75 |         for (t, p) in enumerate(pred):
 76 |             if label_dict.idx2str[p] == 'V':
 77 |                 out_tags = ['O' for _ in range(slen)]
 78 |                 out_tags[t] = 'B-V'
 79 |                 out_line = str(t) + '\t' + raw_sent + ' ||| ' + ' '.join(
 80 |                     out_tags) + '\n'
 81 |                 fout.write(out_line)
 82 |                 props.append(t)
 83 | 
 84 |         if fout_props is not None:
 85 |             if sent_id > 0:
 86 |                 fout_props.write('\n')
 87 |             for t in range(slen):
 88 |                 lemma = 'P' + tokens[t].lower()
 89 |                 # In order for CoNLL evaluation script to run, we need to output the same
 90 |                 # lemma as the gold predicate in the CoNLL-formatted file.
 91 |                 if gold_props is not None and gold_props[sent_id][t] != '-':
 92 |                     lemma = gold_props[sent_id][t]
 93 |                 if t in props:
 94 |                     fout_props.write(lemma)
 95 |                 else:
 96 |                     fout_props.write('-')
 97 |                 for p in props:
 98 |                     if t == p:
 99 |                         fout_props.write('\t(V*)')
100 |                     else:
101 |                         fout_props.write('\t*')
102 |                 fout_props.write('\n')
103 |             sent_id += 1
104 | 
105 |     fout.close()
106 |     print('Predicted predicates in sequential-tagging format written to: {}.'.
107 |           format(output_file))
108 |     if fout_props is not None:
109 |         fout_props.close()
110 |         print('CoNLL-formatted predicate information written to: {}.'.format(
111 |             output_props_file))
112 | 
113 | 
def bio_to_spans(predictions, label_dict):
    """ Group BIO-tagged predictions into labeled spans.
      Arguments:
        predictions: A single integer array, already truncated to the original sequence lengths.
        label_dict: Label dictionary; ``idx2str`` maps each id to its tag string.
      Returns:
        A list of ``[label, span_start, span_end]`` entries (inclusive ends), ordered by their positions.
  """
    tags = [label_dict.idx2str[p] for p in predictions]
    spans = []
    for pos, tag in enumerate(tags):
        if tag == 'O':
            continue
        label = tag[2:]
        # A span starts on a B tag, on the first non-O tag, or on a label change.
        opens_span = tag[0] == 'B' or not spans or label != tags[pos - 1][2:]
        if opens_span:
            spans.append([label, pos, -1])
        # A span ends at the last token, before a B tag, or before a label change.
        closes_span = (pos == len(predictions) - 1
                       or tags[pos + 1][0] == 'B'
                       or label != tags[pos + 1][2:])
        if closes_span:
            spans[-1][2] = pos
    return spans
136 | 
137 | 
def print_to_readable(predictions, num_tokens, label_dict, input_path,
                      output_path):
    """ Print predictions to human-readable format.

      Each input line is expected to start with the predicate index followed
      by the sentence tokens, optionally followed by ``|||`` and further text
      that is ignored. Line ``k`` of the input is paired with
      ``predictions[k]``. ``num_tokens`` is unused.

      Both files are closed on return, including when an error occurs.
  """
    # The output is opened first, as before, so it is created even if the
    # input cannot be opened.
    with open(output_path, 'w') as fout, open(input_path, 'r') as fin:
        for sample_id, line in enumerate(fin):
            info = line.split('|||')[0].strip().split()
            pid = int(info[0])
            sent = info[1:]
            fout.write(' '.join(sent) + '\n')
            fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid))

            tags = predictions[sample_id]
            arg_spans = bio_to_spans(tags, label_dict)
            for arg in arg_spans:
                fout.write('\t\t{}: {}\n'.format(arg[0], " ".join(
                    sent[arg[1]:arg[2] + 1])))
            fout.write('\n')
160 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/measurements.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import time
 3 | 
 4 | 
 5 | class Timer:
 6 |     def __init__(self, name, active=True):
 7 |         self.name = name if active else None
 8 | 
 9 |     def __enter__(self):
10 |         self.start = time.time()
11 |         self.last_tick = self.start
12 |         return self
13 | 
14 |     def __exit__(self, *args):
15 |         if self.name is not None:
16 |             print("{} duration was {}.".format(
17 |                 self.name, self.readable(time.time() - self.start)))
18 | 
19 |     def readable(self, seconds):
20 |         return str(datetime.timedelta(seconds=int(seconds)))
21 | 
22 |     def tick(self, message):
23 |         current = time.time()
24 |         print("{} took {} ({} since last tick).".format(
25 |             message, self.readable(current - self.start),
26 |             self.readable(current - self.last_tick)))
27 |         self.last_tick = current
28 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/numpy_utils.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | 
 3 | 
 4 | def orth_normal_initializer(factor=1.0, seed=None):
 5 |     ''' Reference: Exact solutions to the nonlinear dynamics of learning in
 6 |                  deep linear neural networks
 7 |         Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf
 8 |       Adapted from the original implementation by Mingxuan Wang.
 9 |   '''
10 |     def _initializer(shape, dtype):
11 |         assert len(shape) == 2
12 |         rng = numpy.random.RandomState(seed)
13 |         if shape[0] == shape[1]:
14 |             M = rng.randn(*shape).astype(dtype)
15 |             Q, R = numpy.linalg.qr(M)
16 |             Q = Q * numpy.sign(numpy.diag(R))
17 |             param = Q * factor
18 |             return param
19 |         else:
20 |             M1 = rng.randn(shape[0], shape[0]).astype(dtype)
21 |             M2 = rng.randn(shape[1], shape[1]).astype(dtype)
22 |             Q1, R1 = numpy.linalg.qr(M1)
23 |             Q2, R2 = numpy.linalg.qr(M2)
24 |             Q1 = Q1 * numpy.sign(numpy.diag(R1))
25 |             Q2 = Q2 * numpy.sign(numpy.diag(R2))
26 |             n_min = min(shape[0], shape[1])
27 |             param = numpy.dot(Q1[:, :n_min], Q2[:n_min, :]) * factor
28 |             return param
29 | 
30 |     return _initializer
31 | 
32 | 
def block_orth_normal_initializer(input_shapes,
                                  output_shapes,
                                  factor=1.0,
                                  seed=None):
    ''' Initialize a gigantic weight matrix where each block is a normal orthogonal matrix.
    Input:
      - input_shapes: the sizes of each block along dimension 0.
      - output_shapes: the sizes of each block along dimension 1.
      for example input_shapes = [100, 128] output_shapes=[100,100,100,100]
        indicates eight blocks with shapes [100,100], [128,100], etc.

    The ``shape`` passed to the returned initializer is only checked to be
    2-D; the result's size comes from the block sizes.
  '''
    def _initializer(shape, dtype):
        assert len(shape) == 2
        make_block = orth_normal_initializer(factor, seed)
        block_rows = []
        for dim_in in input_shapes:
            # Blocks sharing dim_in are placed side by side along dimension 1 ...
            row_blocks = [make_block([dim_in, dim_out], dtype) for dim_out in output_shapes]
            block_rows.append(numpy.concatenate(row_blocks, 1))
        # ... and the resulting rows are stacked along dimension 0.
        return numpy.concatenate(block_rows, 0)

    return _initializer
56 | 
57 | 
def random_normal_initializer(mean=0.0, stddev=0.01, seed=None):
    """Return an initializer drawing values from a normal distribution.

    The returned ``_initializer(shape, dtype)`` creates a new
    ``RandomState(seed)`` on every call and returns the samples as ``dtype``.
    """
    def _initializer(shape, dtype):
        samples = numpy.random.RandomState(seed).normal(mean, stddev, shape)
        return numpy.asarray(samples, dtype)

    return _initializer
64 | 
65 | 
def all_zero_initializer():
    """Return an initializer that yields an all-zero array cast to ``dtype``."""
    def _initializer(shape, dtype):
        zeros = numpy.zeros(shape)
        return zeros.astype(dtype)

    return _initializer
71 | 
72 | 
def uniform_initializer(value=0.01):
    """Return an initializer that fills the array with the constant ``value``.

    Despite the name, no random sampling is involved.
    """
    def _initializer(shape, dtype):
        filled = numpy.full(shape, value)
        return filled.astype(dtype)

    return _initializer
78 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/scores_pb2.py:
--------------------------------------------------------------------------------
  1 | # Generated by the protocol buffer compiler.  DO NOT EDIT!
  2 | # source: scores.proto
  3 | 
  4 | import sys
  5 | import tensor_pb2 as tensor__pb2
  6 | from google.protobuf import descriptor as _descriptor
  7 | from google.protobuf import message as _message
  8 | from google.protobuf import reflection as _reflection
  9 | from google.protobuf import symbol_database as _symbol_database
 10 | # from google.protobuf import descriptor_pb2
 11 | 
 12 | 
 13 | # @@protoc_insertion_point(imports)
# On Python 2 `_b` returns its argument unchanged; on Python 3 it encodes the
# str argument to bytes using latin1.
_b = sys.version_info[0] < 3 and (lambda x: x) or (
    lambda x: x.encode('latin1'))


# Default symbol database; the file descriptor and the message classes of this
# module are registered with it.
_sym_db = _symbol_database.Default()
 19 | 
 20 | 
 21 | DESCRIPTOR = _descriptor.FileDescriptor(
 22 |     name='scores.proto',
 23 |     package='',
 24 |     syntax='proto2',
 25 |     serialized_pb=_b(
 26 |         '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\
 27 |           sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto'
 28 |     ),
 29 |     dependencies=[
 30 |         tensor__pb2.DESCRIPTOR,
 31 |     ])
 32 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
 33 | 
# Descriptor of the SentenceScoresProto message: field 1 `sentence_id` and
# field 2 `scores`. The message_type of `scores` is None here and is assigned
# after both descriptors have been created.
# serialized_start/serialized_end are byte offsets into DESCRIPTOR's
# serialized_pb.
_SENTENCESCORESPROTO = _descriptor.Descriptor(
    name='SentenceScoresProto',
    full_name='SentenceScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        _descriptor.FieldDescriptor(
            name='sentence_id',
            full_name='SentenceScoresProto.sentence_id',
            index=0,
            number=1,
            type=13,
            cpp_type=3,
            label=1,
            has_default_value=False,
            default_value=0,
            message_type=None,
            enum_type=None,
            containing_type=None,
            is_extension=False,
            extension_scope=None,
            options=None),
        _descriptor.FieldDescriptor(name='scores',
                                    full_name='SentenceScoresProto.scores',
                                    index=1,
                                    number=2,
                                    type=11,
                                    cpp_type=10,
                                    label=1,
                                    has_default_value=False,
                                    default_value=None,
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    serialized_start=30,
    serialized_end=102,
)
 84 | 
# Descriptor of the ScoresProto message: a single field 1 `sentences` whose
# default value is an empty list. Its message_type is None here and is
# assigned after both descriptors have been created.
_SCORESPROTO = _descriptor.Descriptor(
    name='ScoresProto',
    full_name='ScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        _descriptor.FieldDescriptor(name='sentences',
                                    full_name='ScoresProto.sentences',
                                    index=0,
                                    number=1,
                                    type=11,
                                    cpp_type=10,
                                    label=3,
                                    has_default_value=False,
                                    default_value=[],
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    serialized_start=104,
    serialized_end=158,
)
119 | 
# Resolve the message-typed fields now that both descriptors exist:
# SentenceScoresProto.scores is a TensorProto, ScoresProto.sentences is a
# SentenceScoresProto.
_SENTENCESCORESPROTO.fields_by_name[
    'scores'].message_type = tensor__pb2._TENSORPROTO
_SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO
# Expose both message descriptors on the file descriptor by name.
DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO
DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO

# Build the concrete message class from its descriptor and register it.
SentenceScoresProto = _reflection.GeneratedProtocolMessageType(
    'SentenceScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SENTENCESCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:SentenceScoresProto)
         ))
_sym_db.RegisterMessage(SentenceScoresProto)

# Same for ScoresProto.
ScoresProto = _reflection.GeneratedProtocolMessageType(
    'ScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:ScoresProto)
         ))
_sym_db.RegisterMessage(ScoresProto)
143 | 
144 | # @@protoc_insertion_point(module_scope)
145 | 


--------------------------------------------------------------------------------
/src/orl-4.1-ultimate-hard-e2e/neural_srl/shared/syntactic_extraction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import codecs
  3 | 
  4 | from .dictionary import Dictionary
  5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN
  6 | from collections import OrderedDict
  7 | 
  8 | 
  9 | class SyntacticTree(object):
 10 |     def __init__(self, sentence_id):
 11 |         self.sentence_id = sentence_id
 12 |         self.word_forms = ["Root"]
 13 |         self.word_forms_ids = []
 14 |         self.char_ids = [[]]  # 2D
 15 |         self.pos_forms = ["Root"]
 16 |         self.heads = [0]
 17 |         self.labels = ["Root"]
 18 |         self.labels_id = []
 19 | 
 20 | 
 21 | class SyntacticCONLL(object):
 22 |     def __init__(self):
 23 |         self.file_name = ""
 24 |         self.trees = []
 25 |         self.sample_dep_data = None
 26 | 
 27 |     def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8):
 28 |         self.file_name = filename
 29 | 
 30 |         print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio))
 31 |         conll_file = codecs.open(self.file_name, 'r', encoding="utf8")
 32 |         if conll_file.closed:
 33 |             print("Cannot open the syntactic conll file! Please check {}".format(self.file_name))
 34 | 
 35 |         sentence_id = 0
 36 |         a_tree = SyntacticTree(sentence_id)
 37 |         find_root = False
 38 |         for line in conll_file:
 39 |             if line == '\n' or line == '\r\n':  # new sentence
 40 |                 sentence_id += 1
 41 |                 if len(a_tree.word_forms) <= max_sentence_length:
 42 |                     assert find_root is True
 43 |                     # keep the sentence with the length < max_sentence_l
 44 |                     self.trees.append(a_tree)
 45 |                 a_tree = SyntacticTree(sentence_id)
 46 |                 find_root = False
 47 |                 continue
 48 |             tokens = line.strip().split('\t')
 49 |             a_tree.word_forms.append(tokens[1])
 50 |             a_tree.pos_forms.append(tokens[3])
 51 |             # head = int(tokens[6]) if int(tokens[6]) > 0 else -1
 52 |             head = int(tokens[6]) - 1  # root's head is 0
 53 |             if head == -1:
 54 |                 assert tokens[7] == "root"
 55 |                 find_root = True
 56 |             a_tree.heads.append(head)
 57 |             a_tree.labels.append(tokens[7])
 58 |             token_9 = tokens[9]  # or tokens 9 will be 'unicode' type
 59 |             dep_prob = 1.0 if isinstance(token_9, str) else float(token_9)
 60 |             if dep_prob < prune_ratio:
 61 |                 a_tree.heads[-1] = -1
 62 |         print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees)))
 63 | 
 64 |     @staticmethod
 65 |     def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 66 |         ids = []
 67 |         for s in list_of_words:
 68 |             s = s
 69 |             if s is None:
 70 |                 ids.append(-1)
 71 |                 continue
 72 |             if lowercase:
 73 |                 s = s.lower()
 74 |             if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 75 |                 s = UNKNOWN_TOKEN
 76 |             ids.append(dictionary.add(s))
 77 |         return ids
 78 | 
 79 |     def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None):
 80 |         for tree in self.trees:
 81 |             tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False,
 82 |                                                                       pretrained_word_embedding)
 83 |             words = tree.word_forms
 84 |             max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
 85 |             single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
 86 |             for i, word in enumerate(words):
 87 |                 single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict)
 88 |             # Add the sample char tokens into the sample_char_tokens
 89 |             tree.char_ids = single_sample_char_tokens
 90 | 
 91 |             tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict)
 92 | 
 93 |         sample_word_texts = [tree.word_forms for tree in self.trees]
 94 |         sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees]
 95 |         sample_char_ids = [tree.char_ids for tree in self.trees]
 96 |         sample_heads = [np.asarray(tree.heads) for tree in self.trees]
 97 |         sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees]
 98 |         self.sample_dep_data = list(zip(sample_word_texts,
 99 |                                         sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids))
100 | 
101 |     def get_syntactic_label_dict(self, syn_label_dict=None):
102 |         if syn_label_dict is None:
103 |             syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
104 |         else:
105 |             assert syn_label_dict.accept_new is False
106 |         sentences_length = len(self.trees)
107 |         for i in range(sentences_length):
108 |             ith_sentence_length = len(self.trees[i].labels)
109 |             for j in range(ith_sentence_length):
110 |                 self.trees[i].labels_id.append(syn_label_dict.add(self.trees[i].labels[j]))
111 |         return syn_label_dict
112 | 
113 | 
def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings):
    """Read and tokenize the dependency trees of ``file_path``.

    The trees are returned in an ``OrderedDict`` keyed by the sentence text,
    i.e. the space-joined word forms without the leading root entry. When two
    trees share the same text, the later one replaces the earlier value.
    """
    reader = SyntacticCONLL()
    reader.read_from_file(file_path, max_sentence_length=2000)
    reader.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings)

    # word_forms[0] is the artificial "Root" entry, so it is left out of the key.
    return OrderedDict(
        (' '.join(parsed.word_forms[1:]), parsed) for parsed in reader.trees
    )
124 | 
125 | 
class SyntacticRepresentation(object):
    """Per-token vectors read from a text file, grouped by sentence.

    ``representations[s][t]`` is the float32 vector of token ``t`` in sentence
    ``s``; ``check_math_corpus`` expects one extra leading entry (the root)
    per sentence.
    """

    def __init__(self):
        self.file_name = ""
        self.representations = []

    def read_from_file(self, filename):
        """Load the representations stored in ``filename``.

        Every non-blank line holds two tab-separated fields; the second one is
        a space-separated list of floats. A blank line ends the current
        sentence, so a final sentence without a trailing blank line is not
        stored.
        """
        self.file_name = filename
        print("Reading lstm representations from {}".format(self.file_name))
        each_sentence_representations = []
        # open() raises when the file is missing; the context manager closes
        # the handle even if a line cannot be parsed.
        with open(self.file_name, 'r') as representation_file:
            for line in representation_file:
                if line == '\n' or line == "\r\n":  # new sentence
                    self.representations.append(each_sentence_representations)
                    each_sentence_representations = []
                    continue
                values = line.strip().split('\t')[1].split(' ')
                each_sentence_representations.append(np.asarray(values, dtype=np.float32))
        print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations)))

    def minus_by_the_predicate(self, corpus_tensors):
        """Replace every token vector by ``predicate_vector - token_vector``.

        For each item of ``corpus_tensors`` the sentence id is read from
        ``data[0][0][0]`` and the predicate position is the argmax of
        ``data[0][2]``. A sentence is processed only the first time its id is
        seen. Index 0 (the root) is left untouched.

        NOTE(review): the predicate position is used directly as an index
        into the representations, which also contain the root at index 0;
        confirm that ``data[0][2]`` uses the same indexing.
        """
        processed_sentence_ids = set()
        for data in corpus_tensors:
            sentence_id = data[0][0][0]
            predicate_id = data[0][2].argmax()
            if sentence_id in processed_sentence_ids:
                continue
            processed_sentence_ids.add(sentence_id)

            sentence = self.representations[sentence_id]
            # Hold on to the original predicate vector: its own slot is
            # overwritten with zeros inside the loop, which would otherwise
            # corrupt the result for every token after the predicate.
            predicate_vector = sentence[predicate_id]
            for j in range(1, len(sentence)):  # Root doesn't use.
                sentence[j] = predicate_vector - sentence[j]

    def check_math_corpus(self, lengths):
        """Verify that sentence ``i`` has ``lengths[i] + 1`` representations.

        The extra entry is the root. On the first mismatch the details are
        printed and ``SystemExit`` is raised.
        """
        for i, length in enumerate(lengths):
            found = len(self.representations[i])
            if found != length + 1:  # 1 means the first one, Root. Actually never use it.
                print(i, length, found)
                print("sentence {} doesn't match: lstm representation {} vs corpus {}".format(i, found, length))
                raise SystemExit(1)
        print("LSTM representation match the corpus!")
172 | 


--------------------------------------------------------------------------------
/src/orl-4.1/__init__.py:
--------------------------------------------------------------------------------
# Names exported by a star-import of this package.
__all__ = ["neural_srl"]


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/Encoder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import numpy as np
 4 | from torch.nn.utils.rnn import pack_padded_sequence as pack
 5 | from torch.nn.utils.rnn import pad_packed_sequence as unpack
 6 | from .TreeGRU import DTTreeGRU, TDTreeGRU
 7 | from .Tree import creatTree
 8 | 
 9 | 
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder whose output is re-encoded by two tree GRUs.

    The GRU output is projected back to ``input_size`` and then passed to a
    ``DTTreeGRU`` and a ``TDTreeGRU`` together with the dependency trees built
    from the given heads.
    """

    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)

        # Sequence-major GRU (batch_first is left at its default of False).
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )
        # Maps the concatenated forward/backward states back to input_size.
        self.transform = nn.Linear(
            in_features=2 * hidden_size,
            out_features=input_size,
            bias=True,
        )
        self.dt_tree = DTTreeGRU(input_size, hidden_size)
        self.td_tree = TDTreeGRU(input_size, hidden_size)

    def forward(self, input, heads, lengths=None, hidden=None):
        """Encode a batch of sentences.

        :param input: [L, B, H] tensor, including the -ROOT- position
        :param heads: one sequence of head indexes per sentence in the batch
        :param lengths: optional sequence lengths; when given, the input is
            packed before the GRU and unpacked afterwards
        :param hidden: optional initial GRU state
        :return: ``(outputs, output_t)``: the two tree-GRU outputs
            concatenated on dim 2 with dims 0 and 1 swapped, and the two
            tree-GRU hidden states concatenated on dim 1 with a new leading
            dimension
        """
        use_packing = lengths is not None

        embedded = self.dropout(input)
        rnn_input = pack(embedded, lengths) if use_packing else embedded
        rnn_output, hidden_t = self.rnn(rnn_input, hidden)
        if use_packing:
            rnn_output = unpack(rnn_output)[0]

        outputs = self.dropout(self.transform(rnn_output))
        max_length, batch_size, input_dim = outputs.size()

        # Column b lists the node indexes of sentence b in traversal order;
        # positions that are not used keep the filler value -1.
        indexes = np.full((max_length, batch_size), -1, dtype=np.int32)
        trees = []
        for column, sentence_heads in enumerate(heads):
            root, tree = creatTree(sentence_heads)
            root.traverse()  # fills root.order
            for step, node_index in enumerate(root.order):
                indexes[step, column] = node_index
            trees.append(tree)

        dt_outputs, dt_hidden_ts = self.dt_tree.forward(outputs, indexes, trees)
        td_outputs, td_hidden_ts = self.td_tree.forward(outputs, indexes, trees)

        combined = torch.cat([dt_outputs, td_outputs], dim=2)
        outputs = combined.transpose(0, 1)
        output_t = torch.cat([dt_hidden_ts, td_hidden_ts], dim=1).unsqueeze(0)

        return outputs, output_t
68 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/Tree.py:
--------------------------------------------------------------------------------
 1 | class Tree(object):
 2 |     def __init__(self, index):
 3 |         self.parent = None
 4 |         self.is_left = False
 5 |         self.index = index
 6 |         self.left_children = list()
 7 |         self.left_num = 0
 8 |         self.right_children = list()
 9 |         self.right_num = 0
10 |         self._depth = -1
11 |         self.order = []
12 | 
13 |     def add_left(self, child):
14 |         """
15 |         :param child: a Tree object represent the child
16 |         :return:
17 |         """
18 |         child.parent = self
19 |         child.is_left = True
20 |         self.left_children.append(child)
21 |         self.left_num += 1
22 | 
23 |     def add_right(self, child):
24 |         """
25 |         :param child: a Tree object represent the child
26 |         :return:
27 |         """
28 |         child.parent = self
29 |         child.is_left = False
30 |         self.right_children.append(child)
31 |         self.right_num += 1
32 | 
33 |     def size(self):  # compute the total size of the Tree
34 |         if hasattr(self, '_size'):
35 |             return self._size
36 |         count = 1
37 |         for i in range(self.left_num):
38 |             count += self.left_children[i].size()
39 |         for i in range(self.right_num):
40 |             count += self.right_children[i].size()
41 |         self._size = count
42 |         return self._size
43 | 
44 |     def depth(self):  # compute the depth of the Tree
45 |         if self._depth > 0:
46 |             return self._depth
47 |         count = 0
48 |         if self.left_num + self.right_num > 0:
49 |             for i in range(self.left_num):
50 |                 child_depth = self.left_children[i].depth()
51 |                 if child_depth > count:
52 |                     count = child_depth
53 |             for i in range(self.right_num):
54 |                 child_depth = self.right_children[i].depth()
55 |                 if child_depth > count:
56 |                     count = child_depth
57 |             count += 1
58 |         self._depth = count
59 |         return self._depth
60 | 
61 |     def traverse(self):  # traverse the Tree
62 |         if len(self.order) > 0:
63 |             return self.order
64 | 
65 |         for i in range(self.left_num):
66 |             left_order = self.left_children[i].traverse()
67 |             self.order.extend(left_order)
68 |         for i in range(self.right_num):
69 |             right_order = self.right_children[i].traverse()
70 |             self.order.extend(right_order)
71 |         self.order.append(self.index)  # append the root
72 |         return self.order
73 | 
74 | 
def creatTree(heads):
    """Build ``Tree`` nodes from a sequence of head indexes.

    ``heads[i]`` is the index of the parent of node ``i``; -1 marks the root.
    A child is attached on the left when its head comes later in the sentence
    and on the right when its head comes earlier.

    Invalid entries (a head below -1, or a node that is its own head) are
    reported and left unattached. Attaching them anyway would index the node
    list from the end and could make a node its own child.

    :param heads: head index of every token
    :return: ``(root, nodes)``; ``root`` is ``None`` when no head is -1, and
        it is the last such node when several are
    """
    nodes = [Tree(idx) for idx, _ in enumerate(heads)]
    root = None

    for idx, head in enumerate(heads):
        if head == -1:  # -1 mszhang, 0 kiro
            root = nodes[idx]
            continue
        if head < 0:
            print('error: multi roots')
            continue
        if head == idx:
            print('error: head is it self.')
            continue
        if head > idx:
            nodes[head].add_left(nodes[idx])
        else:
            nodes[head].add_right(nodes[idx])

    return root, nodes
96 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/TreeLSTM/__init__.py:
--------------------------------------------------------------------------------
# Names exported by a star-import of this package.
__all__ = ["Encoder", "Tree", "TreeGRU"]
2 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/tree.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/gcn_model/__pycache__/various_gcn.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/gcn.py:
--------------------------------------------------------------------------------
  1 | """
  2 | GCN model for relation extraction.
  3 | """
  4 | 
  5 | import torch
  6 | import torch.nn as nn
  7 | import torch.nn.functional as F
  8 | from torch.autograd import Variable
  9 | from ..shared.constants import PAD_ID
 10 | import numpy as np
 11 | 
 12 | 
 13 | class GCN(nn.Module):
 14 |     def __init__(self, config, input_dim, mem_dim, num_layers):
 15 |         super(GCN, self).__init__()
 16 |         self.config = config
 17 |         self.input_dim = input_dim
 18 |         self.mem_dim = mem_dim
 19 |         self.layers = num_layers
 20 | 
 21 |         # rnn layer
 22 |         if self.config.gcn_rnn is True:
 23 |             input_size = self.input_dim
 24 |             self.rnn = nn.LSTM(input_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers, batch_first=True,
 25 |                                dropout=self.config.gcn_rnn_dropout, bidirectional=True)
 26 |             self.in_dim = self.config.gcn_rnn_hidden * 2
 27 |             self.rnn_drop = nn.Dropout(self.config.gcn_rnn_dropout)  # use on last layer output
 28 | 
 29 |         self.in_drop = nn.Dropout(self.config.gcn_input_dropout)
 30 |         self.gcn_drop = nn.Dropout(self.config.gcn_gcn_dropout)
 31 | 
 32 |         # gcn layer
 33 |         self.W = nn.ModuleList()
 34 |         self.layer_normalization = nn.ModuleList()
 35 | 
 36 |         for layer in range(self.layers):
 37 |             # input_dim = self.in_dim if layer == 0 else self.mem_dim
 38 |             self.W.append(nn.Linear(self.in_dim, self.in_dim))
 39 |             self.layer_normalization.append(LayerNormalization(self.in_dim))
 40 | 
 41 |     def conv_l2(self):
 42 |         conv_weights = []
 43 |         for w in self.W:
 44 |             conv_weights += [w.weight, w.bias]
 45 |         return sum([x.pow(2).sum() for x in conv_weights])
 46 | 
 47 |     def encode_with_rnn(self, rnn_inputs, masks, batch_size):
 48 |         seq_lens = masks.data.eq(1).long().sum(1).squeeze()
 49 |         h0, c0 = rnn_zero_state(batch_size, self.config.gcn_rnn_hidden, self.config.gcn_rnn_layers)
 50 | 
 51 |         # SORT YOUR TENSORS BY LENGTH!
 52 |         seq_lens, perm_idx = seq_lens.sort(0, descending=True)
 53 | 
 54 |         rnn_inputs = rnn_inputs[perm_idx]
 55 |         rnn_inputs = nn.utils.rnn.pack_padded_sequence(rnn_inputs, seq_lens, batch_first=True)
 56 |         rnn_outputs, (ht, ct) = self.rnn(rnn_inputs, (h0, c0))
 57 |         rnn_outputs, _ = nn.utils.rnn.pad_packed_sequence(rnn_outputs, batch_first=True)
 58 | 
 59 |         _, unperm_idx = perm_idx.sort(0)
 60 |         rnn_outputs = rnn_outputs[unperm_idx]
 61 |         return rnn_outputs
 62 | 
 63 |     def forward(self, adj, embs, masks):
 64 |         batch_size = masks.size()[0]
 65 |         embs = self.in_drop(embs)
 66 |         # rnn layer
 67 |         if self.config.gcn_rnn is True:
 68 |             gcn_inputs = self.rnn_drop(self.encode_with_rnn(embs, masks, batch_size))
 69 |         else:
 70 |             gcn_inputs = embs
 71 | 
 72 |         # gcn layer
 73 |         denom = adj.sum(2).unsqueeze(2) + 1
 74 |         mask = (adj.sum(2) + adj.sum(1)).eq(0).unsqueeze(2)
 75 |         # # zero out adj for ablation
 76 |         # if self.opt.get('no_adj', False):
 77 |         #     adj = torch.zeros_like(adj)
 78 | 
 79 |         for l in range(self.layers):
 80 |             # print(gcn_inputs.size(), adj.size())
 81 |             x = gcn_inputs
 82 |             Ax = adj.bmm(gcn_inputs)
 83 |             AxW = self.W[l](Ax)
 84 |             AxW = AxW + self.W[l](gcn_inputs)  # self loop
 85 |             AxW = AxW / denom
 86 | 
 87 |             gAxW = F.relu(AxW)
 88 |             gcn_inputs = self.gcn_drop(gAxW)
 89 |             self.layer_normalization[l].forward(gcn_inputs + x)
 90 | 
 91 |         return gcn_inputs, mask
 92 | 
 93 | 
 94 | def rnn_zero_state(batch_size, hidden_dim, num_layers, bidirectional=True, use_cuda=True):
 95 |     total_layers = num_layers * 2 if bidirectional else num_layers
 96 |     state_shape = (total_layers, batch_size, hidden_dim)
 97 |     h0 = c0 = Variable(torch.zeros(*state_shape), requires_grad=False)
 98 |     if use_cuda:
 99 |         return h0.cuda(), c0.cuda()
100 |     else:
101 |         return h0, c0
102 | 
103 | 
class LayerNormalization(nn.Module):
    """Layer normalization over the last dimension with learned scale and shift."""

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        # Scale starts at one and shift at zero.
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

    def forward(self, z):
        """Normalize ``z`` along its last dimension.

        The input is returned unchanged when its dim 1 has size one. The
        epsilon is added to the standard deviation, not to the variance.
        """
        if z.size(1) == 1:
            return z
        centered = z - torch.mean(z, dim=-1, keepdim=True)
        spread = torch.std(z, dim=-1, keepdim=True) + self.eps
        normalized = centered / spread
        return normalized * self.a_2 + self.b_2
121 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/gcn_model/tree.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Basic operations on trees.
  3 | """
  4 | 
  5 | import numpy as np
  6 | from collections import defaultdict
  7 | 
  8 | 
  9 | class Tree(object):
 10 |     """
 11 |     Reused tree object from stanfordnlp/treelstm.
 12 |     """
 13 | 
 14 |     def __init__(self):
 15 |         self.parent = None
 16 |         # head probability
 17 |         self.phead = -1
 18 |         self.num_children = 0
 19 |         self.children = list()
 20 | 
 21 |     def add_child(self, child):
 22 |         child.parent = self
 23 |         self.num_children += 1
 24 |         self.children.append(child)
 25 | 
 26 |     def size(self):
 27 |         if getattr(self, '_size'):
 28 |             return self._size
 29 |         count = 1
 30 |         for i in xrange(self.num_children):
 31 |             count += self.children[i].size()
 32 |         self._size = count
 33 |         return self._size
 34 | 
 35 |     def depth(self):
 36 |         if getattr(self, '_depth'):
 37 |             return self._depth
 38 |         count = 0
 39 |         if self.num_children > 0:
 40 |             for i in xrange(self.num_children):
 41 |                 child_depth = self.children[i].depth()
 42 |                 if child_depth > count:
 43 |                     count = child_depth
 44 |             count += 1
 45 |         self._depth = count
 46 |         return self._depth
 47 | 
 48 |     def __iter__(self):
 49 |         yield self
 50 |         for c in self.children:
 51 |             for x in c:
 52 |                 yield x
 53 | 
 54 | 
def head_to_tree(head, tokens, len_, prune, subj_pos, obj_pos):
    """
    Convert a sequence of head indexes into a tree object.

    Head values are 1-based: ``head[i] == 0`` marks the root and any other
    value ``h`` means that node ``h - 1`` is the parent of node ``i``. Only
    the first ``len_`` entries are used.

    With ``prune < 0`` the full tree is built. Otherwise the tree is reduced
    to the nodes whose distance to the dependency path between the subject
    and object tokens is at most ``prune``, and it is rooted at their lowest
    common ancestor.

    :param head: head indexes; must support slicing and ``.tolist()``
    :param tokens: token sequence; it is truncated to ``len_`` but not used
        otherwise
    :param len_: number of valid positions
    :param prune: maximum kept distance to the path, negative for no pruning
    :param subj_pos: per-position values, an entry of 0 marks a subject token
    :param obj_pos: per-position values, an entry of 0 marks an object token
    :return: the root ``Tree`` node; every created node carries ``idx`` (its
        position) and ``dist`` (its distance to the path, -1 when unpruned)
    :raises AssertionError: if no root was found, or a kept node's parent was
        pruned away
    """
    tokens = tokens[:len_].tolist()
    head = head[:len_].tolist()
    root = None

    if prune < 0:
        # No pruning: one node per position, linked to its head.
        nodes = [Tree() for _ in head]

        for i in range(len(nodes)):
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = -1  # just a filler
            if h == 0:
                root = nodes[i]
            else:
                nodes[h - 1].add_child(nodes[i])
    else:
        # find dependency path
        # From here on subj_pos / obj_pos hold the indexes of the marked tokens.
        subj_pos = [i for i in range(len_) if subj_pos[i] == 0]
        obj_pos = [i for i in range(len_) if obj_pos[i] == 0]

        # Common ancestors of all subject and object tokens; every token
        # counts as an ancestor of itself.
        cas = None

        subj_ancestors = set(subj_pos)
        for s in subj_pos:
            h = head[s]
            tmp = [s]
            # Walk up to the root, collecting the chain of ancestors.
            while h > 0:
                tmp += [h - 1]
                subj_ancestors.add(h - 1)
                h = head[h - 1]

            if cas is None:
                cas = set(tmp)
            else:
                cas.intersection_update(tmp)

        obj_ancestors = set(obj_pos)
        for o in obj_pos:
            h = head[o]
            tmp = [o]
            while h > 0:
                tmp += [h - 1]
                obj_ancestors.add(h - 1)
                h = head[h - 1]
            # NOTE(review): cas is still None here when no subject position
            # was found, in which case this call fails.
            cas.intersection_update(tmp)

        # find lowest common ancestor
        if len(cas) == 1:
            lca = list(cas)[0]
        else:
            # Count, for every common ancestor, how many of its children are
            # common ancestors too.
            child_count = {k: 0 for k in cas}
            for ca in cas:
                if head[ca] > 0 and head[ca] - 1 in cas:
                    child_count[head[ca] - 1] += 1

            # the LCA has no child in the CA set
            for ca in cas:
                if child_count[ca] == 0:
                    lca = ca
                    break

        # The path consists of all ancestors below the common ones, plus the LCA.
        path_nodes = subj_ancestors.union(obj_ancestors).difference(cas)
        path_nodes.add(lca)

        # compute distance to path_nodes
        dist = [-1 if i not in path_nodes else 0 for i in range(len_)]

        for i in range(len_):
            if dist[i] < 0:
                # Climb towards the root until a path node is reached or the
                # root has been passed (index -1).
                stack = [i]
                while stack[-1] >= 0 and stack[-1] not in path_nodes:
                    stack.append(head[stack[-1]] - 1)

                if stack[-1] in path_nodes:
                    # Reversed, the stack starts at the path node (distance
                    # 0), so the position in it is the distance.
                    for d, j in enumerate(reversed(stack)):
                        dist[j] = d
                else:
                    for j in stack:
                        if j >= 0 and dist[j] < 0:
                            dist[j] = int(1e4)  # aka infinity

        highest_node = lca
        # Nodes farther than `prune` from the path are dropped.
        nodes = [Tree() if dist[i] <= prune else None for i in range(len_)]

        for i in range(len(nodes)):
            if nodes[i] is None:
                continue
            h = head[i]
            nodes[i].idx = i
            nodes[i].dist = dist[i]
            # The LCA becomes the new root, so it is not attached to its head.
            if h > 0 and i != highest_node:
                assert nodes[h - 1] is not None
                nodes[h - 1].add_child(nodes[i])

        root = nodes[highest_node]

    assert root is not None
    return root
157 | 
158 | 
def tree_to_adj(sent_len, tree, directed=True, self_loop=False):
    """
    Convert a tree object to an (numpy) adjacency matrix.

    Entry ``[p, c]`` is 1 when node ``c`` is a child of node ``p``. With
    ``directed=False`` the transpose is added, and with ``self_loop=True``
    the diagonal entry of every node in the tree is set to 1.

    :return: float32 array of shape ``(sent_len, sent_len)``
    """
    adjacency = np.zeros((sent_len, sent_len), dtype=np.float32)

    visited = []
    frontier = [tree]
    while frontier:
        node = frontier.pop(0)
        visited.append(node.idx)
        for child in node.children:
            adjacency[node.idx, child.idx] = 1
            frontier.append(child)

    if not directed:
        adjacency = adjacency + adjacency.T

    if self_loop:
        # Only nodes that are part of the tree get a self loop.
        for position in visited:
            adjacency[position, position] = 1

    return adjacency
184 | 
185 | 
def tree_to_dist(sent_len, tree):
    """Return an int64 array holding ``node.dist`` at index ``node.idx``.

    Positions that do not belong to any node yielded by iterating ``tree``
    keep the value -1.
    """
    distances = np.full(sent_len, -1, dtype=np.int64)
    for member in tree:
        distances[member.idx] = member.dist
    return distances
193 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/HighWayLSTM.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/implicit_syntactic_representations.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/layer.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/model.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/pre_trained_language_model.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/tagger.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/pytorch/__pycache__/util.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/implicit_syntactic_representations.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | from torch.nn.utils.rnn import pad_sequence
  6 | 
  7 | 
  8 | from .model import drop_sequence_sharedmask, _model_var
  9 | from .HighWayLSTM import Highway_Concat_BiLSTM
 10 | from .layer import NonLinear, Biaffine
 11 | 
 12 | 
 13 | class ImplicitDependencyRepresentations(nn.Module):
 14 |     def __init__(self, config, lstm_input_size, lstm_hidden_size, dep_label_space_size):
 15 |         super(ImplicitDependencyRepresentations, self).__init__()
 16 |         self.config = config
 17 |         self.lstm_input_size = lstm_input_size
 18 |         self.lstm_hidden_size = lstm_hidden_size
 19 |         self.dep_label_space_size = dep_label_space_size
 20 |         # softmax weights
 21 |         self.dep_gamma = nn.Parameter(torch.FloatTensor([1.0]))
 22 |         self.softmax_dep_weights = nn.ParameterList([nn.Parameter(torch.FloatTensor([0.0]))
 23 |                                                      for _ in range(self.config.dep_num_lstm_layers)])
 24 |         self.cuda = True
 25 | 
 26 |         self.dep_bilstm = Highway_Concat_BiLSTM(
 27 |             input_size=self.lstm_input_size,
 28 |             hidden_size=self.lstm_hidden_size,  # // 2 for MyLSTM
 29 |             num_layers=self.config.dep_num_lstm_layers,
 30 |             batch_first=True,
 31 |             bidirectional=True,
 32 |             dropout_in=config.input_dropout_prob,
 33 |             dropout_out=config.recurrent_dropout_prob
 34 |         )
 35 | 
 36 |         # dependency parsing module
 37 |         self.mlp_arc_dep = NonLinear(
 38 |             input_size=2 * config.lstm_hidden_size,
 39 |             hidden_size=config.mlp_arc_size + config.mlp_rel_size,
 40 |             activation=nn.LeakyReLU(0.1))
 41 |         self.mlp_arc_head = NonLinear(
 42 |             input_size=2 * config.lstm_hidden_size,
 43 |             hidden_size=config.mlp_arc_size + config.mlp_rel_size,
 44 |             activation=nn.LeakyReLU(0.1))
 45 | 
 46 |         self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
 47 |         self.arc_num = int(config.mlp_arc_size / 100)
 48 |         self.rel_num = int(config.mlp_rel_size / 100)
 49 | 
 50 |         self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size, 1, bias=(True, False))
 51 |         self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size, self.dep_label_space_size,
 52 |                                      bias=(True, True))
 53 | 
 54 |     def init_masks(self, batch_size, lengths):
 55 |         max_sent_length = max(lengths)
 56 |         num_sentences = batch_size
 57 |         indices = torch.arange(0, max_sent_length).unsqueeze(0).expand(num_sentences, -1)
 58 |         masks = indices < lengths.unsqueeze(1)
 59 |         masks = masks.type(torch.FloatTensor)
 60 |         if self.cuda:
 61 |             masks = masks.cuda()
 62 |         return masks
 63 | 
 64 |     def forward(self, num_sentences, context_embeddings, sent_lengths, dep):
 65 |         masks = self.init_masks(num_sentences, torch.LongTensor(sent_lengths))
 66 |         lstm_out, _ = self.dep_bilstm(context_embeddings, masks)
 67 | 
 68 |         if self.training:
 69 |             lstm_out = drop_sequence_sharedmask(lstm_out, self.config.dropout_mlp)
 70 | 
 71 |         x_all_dep = self.mlp_arc_dep(lstm_out)
 72 |         x_all_head = self.mlp_arc_head(lstm_out)
 73 | 
 74 |         if self.training:
 75 |             x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
 76 |             x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
 77 | 
 78 |         x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
 79 |         x_all_head_splits = torch.split(x_all_head, 100, dim=2)
 80 | 
 81 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
 82 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
 83 | 
 84 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
 85 |         arc_logit = torch.squeeze(arc_logit, dim=3)
 86 | 
 87 |         x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
 88 |         x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
 89 | 
 90 |         rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
 91 | 
 92 |         self.arc_logits, self.rel_logits = arc_logit, rel_logit_cond
 93 | 
 94 |         heads, rels = dep[0], dep[1]
 95 |         loss = self.compute_dep_loss(heads, rels, sent_lengths.tolist())  # compute the dep loss
 96 |         return loss, self.arc_logits
 97 | 
 98 |     def compute_dep_loss(self, true_arcs, true_rels, lengths):
 99 |         b, l1, l2 = self.arc_logits.size()
100 |         index_true_arcs = _model_var(
101 |             self.parameters(),
102 |             pad_sequence(true_arcs, padding_value=0, batch_first=True)
103 |         )
104 |         true_arcs = _model_var(
105 |             self.parameters(),
106 |             pad_sequence(true_arcs, padding_value=-1, batch_first=True)
107 |         )
108 | 
109 |         masks = []
110 |         for length in lengths:
111 |             mask = torch.FloatTensor([0] * length + [-1000] * (l2 - length))
112 |             mask = _model_var(self.parameters(), mask)
113 |             mask = torch.unsqueeze(mask, dim=1).expand(-1, l1)
114 |             masks.append(mask.transpose(0, 1))
115 |         length_mask = torch.stack(masks, 0)
116 |         arc_logits = self.arc_logits + length_mask
117 | 
118 |         arc_loss = F.cross_entropy(
119 |             arc_logits.view(b * l1, l2), true_arcs.view(b * l1),
120 |             ignore_index=-1, reduction="sum")
121 | 
122 |         size = self.rel_logits.size()
123 |         output_logits = _model_var(self.parameters(), torch.zeros(size[0], size[1], size[3]))
124 |         for batch_index, (logits, arcs) in enumerate(list(zip(self.rel_logits, index_true_arcs))):
125 |             rel_probs = []
126 |             for i in range(l1):
127 |                 rel_probs.append(logits[i][int(arcs[i])])
128 |             rel_probs = torch.stack(rel_probs, dim=0)
129 |             output_logits[batch_index] = torch.squeeze(rel_probs, dim=1)
130 | 
131 |         b, l1, d = output_logits.size()
132 |         true_rels = _model_var(self.parameters(), pad_sequence(true_rels, padding_value=-1, batch_first=True))
133 | 
134 |         rel_loss = F.cross_entropy(
135 |             output_logits.view(b * l1, d), true_rels.view(b * l1), ignore_index=-1, reduction="sum")
136 | 
137 |         loss = arc_loss + rel_loss
138 |         return loss
139 | 
140 |     def get_reps(self, context_embeddings, masks):
141 |         dep_lstm_out, dep_lstm_outputs = self.dep_bilstm.forward(context_embeddings, masks)
142 |         normed_weights = F.softmax(torch.cat([param for param in self.softmax_dep_weights]), dim=0)
143 |         normed_weights = torch.split(normed_weights, 1)  # split_size_or_sections=1, split_size=1)  # 0.3.0
144 |         dep_representations = self.dep_gamma * \
145 |                               sum([normed_weights[i] * dep_lstm_outputs[i] for i in
146 |                                    range(self.config.dep_num_lstm_layers)])
147 |         if self.training:
148 |             lstm_out = drop_sequence_sharedmask(dep_lstm_out, self.config.dropout_mlp)
149 | 
150 |         x_all_dep = self.mlp_arc_dep(dep_lstm_out)
151 |         x_all_head = self.mlp_arc_head(dep_lstm_out)
152 | 
153 |         if self.training:
154 |             x_all_dep = drop_sequence_sharedmask(x_all_dep, self.config.dropout_mlp)
155 |             x_all_head = drop_sequence_sharedmask(x_all_head, self.config.dropout_mlp)
156 | 
157 |         x_all_dep_splits = torch.split(x_all_dep, 100, dim=2)
158 |         x_all_head_splits = torch.split(x_all_head, 100, dim=2)
159 | 
160 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
161 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
162 | 
163 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
164 |         arc_logit = torch.squeeze(arc_logit, dim=3)
165 |         return dep_representations, arc_logit
166 | 
167 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | from torch import nn
  4 | from .layer import MyLSTM, NonLinear, Biaffine
  5 | 
  6 | 
  7 | def _model_var(parameters, x):
  8 |     p = next(iter(filter(lambda p: p.requires_grad, parameters)))
  9 |     if p.is_cuda:
 10 |         x = x.cuda(p.get_device())
 11 |     return torch.autograd.Variable(x)
 12 | 
 13 | 
 14 | def drop_input_independent(word_embeddings, tag_embeddings, dropout_emb):
 15 |     batch_size, seq_length, _ = word_embeddings.size()
 16 |     # tensor.new: build a tensor with the same data type
 17 |     word_masks = word_embeddings.data.new(batch_size,
 18 |                                           seq_length).fill_(1 - dropout_emb)
 19 |     word_masks = torch.Tensor(torch.bernoulli(word_masks))
 20 |     word_masks.requires_grad = False
 21 |     tag_masks = tag_embeddings.data.new(batch_size,
 22 |                                         seq_length).fill_(1 - dropout_emb)
 23 |     tag_masks = torch.Tensor(torch.bernoulli(tag_masks))
 24 |     tag_masks.requires_grad = False
 25 |     scale = 3.0 / (2.0 * word_masks + tag_masks + 1e-12)
 26 |     word_masks *= scale
 27 |     tag_masks *= scale
 28 |     # unsqueeze: Returns a new tensor with a dimension of size one inserted at the specified position.
 29 |     word_masks = word_masks.unsqueeze(dim=2)  # ?
 30 |     tag_masks = tag_masks.unsqueeze(dim=2)
 31 |     word_embeddings = word_embeddings * word_masks
 32 |     tag_embeddings = tag_embeddings * tag_masks
 33 | 
 34 |     return word_embeddings, tag_embeddings
 35 | 
 36 | 
 37 | def drop_sequence_sharedmask(inputs, dropout, batch_first=True):
 38 |     if batch_first:
 39 |         inputs = inputs.transpose(0, 1)
 40 |     seq_length, batch_size, hidden_size = inputs.size()
 41 |     drop_masks = torch.Tensor(batch_size, hidden_size).fill_(1 - dropout)
 42 |     drop_masks = torch.Tensor(torch.bernoulli(drop_masks)).type(inputs.type())
 43 |     drop_masks.requires_grad = False
 44 |     drop_masks = drop_masks / (1 - dropout)
 45 |     drop_masks = torch.unsqueeze(drop_masks,
 46 |                                  dim=2).expand(-1, -1,
 47 |                                                seq_length).permute(2, 0, 1)
 48 |     inputs = inputs * drop_masks
 49 | 
 50 |     return inputs.transpose(1, 0)
 51 | 
 52 | 
 53 | class ParserModel(nn.Module):  # build a biaffine parser model
 54 |     def __init__(self, vocab, config, pretrained_embedding):
 55 |         super(ParserModel, self).__init__()
 56 |         self.config = config
 57 |         self.word_embed = nn.Embedding(vocab.vocab_size,
 58 |                                        config.word_dims,
 59 |                                        padding_idx=0)
 60 |         self.extword_embed = nn.Embedding(vocab.extvocab_size,
 61 |                                           config.word_dims,
 62 |                                           padding_idx=0)
 63 |         self.tag_embed = nn.Embedding(vocab.tag_size,
 64 |                                       config.tag_dims,
 65 |                                       padding_idx=0)
 66 | 
 67 |         word_init = np.zeros((vocab.vocab_size, config.word_dims),
 68 |                              dtype=np.float32)
 69 |         self.word_embed.weight.data.copy_(torch.from_numpy(word_init))
 70 | 
 71 |         tag_init = np.random.randn(vocab.tag_size,
 72 |                                    config.tag_dims).astype(np.float32)
 73 |         self.tag_embed.weight.data.copy_(torch.from_numpy(tag_init))
 74 | 
 75 |         self.extword_embed.weight.data.copy_(
 76 |             torch.from_numpy(pretrained_embedding))
 77 |         self.extword_embed.weight.requires_grad = False
 78 | 
 79 |         self.lstm = MyLSTM(
 80 |             input_size=config.word_dims + config.tag_dims,
 81 |             hidden_size=config.lstm_hiddens,
 82 |             num_layers=config.lstm_layers,
 83 |             batch_first=True,
 84 |             bidirectional=True,
 85 |             dropout_in=config.dropout_lstm_input,
 86 |             dropout_out=config.dropout_lstm_hidden,
 87 |         )
 88 | 
 89 |         self.mlp_arc_dep = NonLinear(input_size=2 * config.lstm_hiddens,
 90 |                                      hidden_size=config.mlp_arc_size +
 91 |                                      config.mlp_rel_size,
 92 |                                      activation=nn.LeakyReLU(0.1))
 93 |         self.mlp_arc_head = NonLinear(input_size=2 * config.lstm_hiddens,
 94 |                                       hidden_size=config.mlp_arc_size +
 95 |                                       config.mlp_rel_size,
 96 |                                       activation=nn.LeakyReLU(0.1))
 97 | 
 98 |         self.total_num = int((config.mlp_arc_size + config.mlp_rel_size) / 100)
 99 |         self.arc_num = int(config.mlp_arc_size / 100)  # config: 500
100 |         self.rel_num = int(config.mlp_rel_size / 100)  # config: 100
101 | 
102 |         self.arc_biaffine = Biaffine(config.mlp_arc_size, config.mlp_arc_size,
103 |                                      1, bias=(True, False))
104 |         self.rel_biaffine = Biaffine(config.mlp_rel_size, config.mlp_rel_size,
105 |                                      vocab.rel_size, bias=(True, True))
106 | 
107 |     def forward(
108 |             self, words, extwords, tags,
109 |             masks):  # words [batch, max_sentence_length], padding with zeros
110 |         # x = (batch size, sequence length, dimension of embedding)
111 |         x_word_embed = self.word_embed(words)
112 |         x_extword_embed = self.extword_embed(extwords)
113 |         x_embed = x_word_embed + x_extword_embed
114 |         x_tag_embed = self.tag_embed(tags)
115 | 
116 |         if self.training:
117 |             x_embed, x_tag_embed = drop_input_independent(
118 |                 x_embed, x_tag_embed, self.config.dropout_emb)
119 | 
120 |         x_lexical = torch.cat((x_embed, x_tag_embed), dim=2)
121 | 
122 |         outputs, _ = self.lstm(x_lexical, masks, None)
123 |         outputs = outputs.transpose(1, 0)
124 | 
125 |         if self.training:
126 |             outputs = drop_sequence_sharedmask(outputs,
127 |                                                self.config.dropout_mlp)
128 | 
129 |         x_all_dep = self.mlp_arc_dep(outputs)
130 |         x_all_head = self.mlp_arc_head(outputs)
131 | 
132 |         if self.training:
133 |             x_all_dep = drop_sequence_sharedmask(x_all_dep,
134 |                                                  self.config.dropout_mlp)
135 |             x_all_head = drop_sequence_sharedmask(x_all_head,
136 |                                                   self.config.dropout_mlp)
137 | 
138 |         x_all_dep_splits = torch.split(x_all_dep, split_size=100, dim=2)
139 |         x_all_head_splits = torch.split(x_all_head, split_size=100, dim=2)
140 | 
141 |         x_arc_dep = torch.cat(x_all_dep_splits[:self.arc_num], dim=2)
142 |         x_arc_head = torch.cat(x_all_head_splits[:self.arc_num], dim=2)
143 | 
144 |         arc_logit = self.arc_biaffine(x_arc_dep, x_arc_head)
145 |         arc_logit = torch.squeeze(arc_logit, dim=3)
146 | 
147 |         x_rel_dep = torch.cat(x_all_dep_splits[self.arc_num:], dim=2)
148 |         x_rel_head = torch.cat(x_all_head_splits[self.arc_num:], dim=2)
149 | 
150 |         rel_logit_cond = self.rel_biaffine(x_rel_dep, x_rel_head)
151 |         return arc_logit, rel_logit_cond
152 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/pre_trained_language_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from torch.nn import Parameter
  5 | from torch.nn.utils.rnn import pad_sequence
  6 | from transformers import BertModel
  7 | from transformers import BertTokenizer
  8 | 
  9 | 
 10 | class ScalarMix(torch.nn.Module):
 11 |     def __init__(self, mixture_size=4):
 12 |         super(ScalarMix, self).__init__()
 13 |         self.mixture_size = mixture_size
 14 |         self.scalar_parameters = Parameter(torch.ones(mixture_size))
 15 |         self.gamma = Parameter(torch.tensor(1.0))
 16 | 
 17 |     def forward(self, layers):
 18 |         normed_weights = F.softmax(self.scalar_parameters, dim=0)
 19 |         return self.gamma * sum(
 20 |             weight * tensor for weight, tensor in zip(normed_weights, layers)
 21 |         )
 22 | 
 23 | 
 24 | class Bert_Embedding(nn.Module):
 25 |     def __init__(self, bert_path, bert_layer, bert_dim, freeze=True):
 26 |         super(Bert_Embedding, self).__init__()
 27 |         self.bert_layer = bert_layer
 28 |         self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True)
 29 |         print(self.bert.config)
 30 |         self.scalar_mix = ScalarMix(bert_layer)
 31 | 
 32 |         if freeze:
 33 |             self.freeze()
 34 | 
 35 |     def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask):
 36 |         self.eval()
 37 |         sen_lens = token_starts_masks.sum(dim=1)
 38 |         _, _, bert_outs = self.bert(
 39 |             subword_idxs,
 40 |             attention_mask=subword_masks
 41 |         )  # tuple([Batch_size, max_sentence_length, dim])
 42 |         bert_outs = bert_outs[-self.bert_layer:]
 43 |         bert_outs = self.scalar_mix(bert_outs)
 44 |         # bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist())
 45 |         # bert_outs = pad_sequence(bert_outs, batch_first=True)
 46 |         zeros = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1))
 47 |         zeros.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks])
 48 |         subwords_lens = subwords_mask.sum(-1)
 49 |         subwords_lens += (subwords_lens == 0).type(subwords_lens.type())  # 0.0 / 0 -> 0.0 / 1
 50 |         bert_outs = zeros.sum(2) / subwords_lens.unsqueeze(-1)
 51 |         return bert_outs
 52 | 
 53 |     def freeze(self):
 54 |         for para in self.bert.parameters():
 55 |             para.requires_grad = False
 56 | 
 57 | 
 58 | class Bert_Encoder(nn.Module):
 59 |     def __init__(self, bert_path, bert_layer, freeze=False, fix_layer_number=None):
 60 |         super(Bert_Encoder, self).__init__()
 61 |         self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True)
 62 |         self.bert_layer = bert_layer
 63 | 
 64 |         if freeze:
 65 |             self.freeze()
 66 |         if fix_layer_number is not None:
 67 |             self.fix_several_layers(fix_layer_number)
 68 | 
 69 |     def forward(self, subword_idxs, subword_masks, token_starts_masks, text_masks, subwords_mask):
 70 |         sen_lens = token_starts_masks.sum(dim=1)
 71 |         _, _, bert_outs = self.bert(
 72 |             subword_idxs,
 73 |             token_type_ids=None,
 74 |             attention_mask=subword_masks,
 75 |         )
 76 |         bert_outs = bert_outs[-1]  # the last layer of BERT outputs
 77 |         # bert_outs = torch.split(bert_outs[token_starts_masks], sen_lens.tolist())
 78 |         zeros = bert_outs.new_zeros(*subwords_mask.size(), bert_outs.size(-1))
 79 |         zeros.masked_scatter_(subwords_mask.unsqueeze(-1), bert_outs[text_masks])
 80 |         bert_outs = pad_sequence(zeros, batch_first=True)
 81 |         subwords_lens = subwords_mask.sum(-1)
 82 |         subwords_lens += (subwords_lens == 0).type(subwords_lens.type())  # 0.0 / 0 -> 0.0 / 1
 83 |         bert_outs = bert_outs.sum(2) / subwords_lens.unsqueeze(-1)
 84 |         return bert_outs
 85 | 
 86 |     def freeze(self):
 87 |         for para in self.bert.parameters():
 88 |             para.requires_grad = False
 89 | 
 90 |     def fix_several_layers(self, layer_numer):
 91 |         fixed_layer_names = ["embeddings"] if layer_numer >= 0 else []
 92 |         for i in range(layer_numer):
 93 |             fixed_layer_names.append("encoder.layer." + str(i) + '.')
 94 |         print("{} will be fixed".format(fixed_layer_names))
 95 |         for name, para in self.bert.named_parameters():
 96 |             for layer_name in fixed_layer_names:
 97 |                 if layer_name in name:
 98 |                     para.requires_grad = False
 99 |                     break
100 | 
101 | 
class Vocab(object):
    """Turns tokenised sentences into BERT sub-word ids and alignment masks."""

    def __init__(self, bert_vocab_path):
        """Load a cased ``BertTokenizer`` from ``bert_vocab_path``."""
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_vocab_path, do_lower_case=False
        )

    def numericalize(self, seqs, training=True):
        """Convert sentences (lists of word strings) to sub-word tensors and masks.

        ``training`` is accepted but not used.

        Returns:
            ``(subwords, masks, starts, text_masks, subwords_mask)`` where the
            first four are per-sentence lists of 1-D tensors over the
            sub-word sequence (including ``[CLS]`` / ``[SEP]``):
            ids, all-true mask, true at each word's first piece, and true
            everywhere except ``[CLS]`` / ``[SEP]``.  ``subwords_mask`` is a
            stacked float tensor (sentence, word, piece) holding 1 for each
            existing piece of each word and 0 for padding.
        """
        subwords, masks, starts = [], [], []
        text_masks, subwords_mask = [], []

        for seq in seqs:
            # A word the tokenizer maps to nothing is represented by "[PAD]".
            word_pieces = [self.tokenizer.tokenize(token) or ["[PAD]"] for token in seq]
            word_pieces = [["[CLS]"]] + word_pieces + [["[SEP]"]]
            lengths = [0] + [len(pieces) for pieces in word_pieces]
            # flatten the word pieces
            tokens = [piece for pieces in word_pieces for piece in pieces]
            num_tokens = len(tokens)
            # Offset at which each group of pieces starts, plus the total.
            boundaries = torch.tensor(lengths).cumsum(0)

            # subwords indexes
            subwords.append(torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)))
            # subword masks
            masks.append(torch.ones(num_tokens, dtype=torch.bool))
            # subword text mask: everything between [CLS] and [SEP]
            text_masks.append(torch.BoolTensor([0] + [1] * (num_tokens - 2) + [0]))

            # subword start masks: first piece of every real word
            start_mask = torch.zeros(num_tokens, dtype=torch.bool)
            start_mask[boundaries[1:-2]] = 1  # bos:0 eos:-2
            starts.append(start_mask)

            # One row of ones per word, as long as that word's piece count.
            word_edges = boundaries[1:-1]
            rows = [torch.ones(right - left)
                    for left, right in zip(word_edges[:-1], word_edges[1:])]
            subwords_mask.append(pad_sequence(rows, batch_first=True))

        max_subword_length = max(m.size(-1) for m in subwords_mask)
        max_sentence_length = max(m.size(0) for m in subwords_mask)
        padded = []
        for mask in subwords_mask:
            extra_pieces = max_subword_length - mask.size(1)
            extra_words = max_sentence_length - mask.size(0)
            padded.append(F.pad(mask, (0, extra_pieces, 0, extra_words)))  # [left, right, top, down]
        subwords_mask = torch.stack(padded)
        return subwords, masks, starts, text_masks, subwords_mask
149 | 
150 | 
class BERT_input(nn.Module):
    """Tokenises raw sentences and returns frozen-BERT word representations."""

    def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim):
        """Build the sub-word vocabulary and the ``Bert_Embedding`` extractor."""
        super(BERT_input, self).__init__()
        self.vocab = Vocab(bert_vocab_path)
        self.bert_input = Bert_Embedding(bert_path, bert_layer, bert_dim)

    def forward(self, seqs):
        """Return the ``Bert_Embedding`` output for ``seqs`` (lists of word strings).

        Inputs are padded per batch and moved to the device that holds this
        module's BERT parameters, so the module runs on whichever device it
        was placed on instead of requiring the current CUDA device.
        """
        subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs)
        device = next(self.bert_input.parameters()).device
        subwords = pad_sequence(subwords, batch_first=True).to(device)
        masks = pad_sequence(masks, batch_first=True).to(device)
        starts = pad_sequence(starts, batch_first=True).to(device)
        text_masks = pad_sequence(text_masks, batch_first=True).type(torch.BoolTensor).to(device)
        subwords_mask = subwords_mask.type(torch.BoolTensor).to(device)
        bert_outs = self.bert_input.forward(subwords, masks, starts, text_masks, subwords_mask)
        return bert_outs
166 | 
167 | 
class BERT_model(nn.Module):
    """Tokenises raw sentences and returns fine-tunable BERT word representations."""

    def __init__(self, bert_vocab_path, bert_path, bert_layer, bert_dim, fix_layer_number=None):
        """Build the sub-word vocabulary and the ``Bert_Encoder``.

        ``bert_dim`` is accepted but not used here.  ``fix_layer_number`` is
        forwarded to ``Bert_Encoder``; the encoder is created with
        ``freeze=False``.
        """
        super(BERT_model, self).__init__()
        self.vocab = Vocab(bert_vocab_path)
        self.bert_encoder = Bert_Encoder(bert_path, bert_layer,
                                         freeze=False, fix_layer_number=fix_layer_number)

    def forward(self, seqs):
        """Return the ``Bert_Encoder`` output for ``seqs`` (lists of word strings).

        Inputs are padded per batch and moved to the device that holds this
        module's BERT parameters, so the module runs on whichever device it
        was placed on instead of requiring the current CUDA device.
        """
        subwords, masks, starts, text_masks, subwords_mask = self.vocab.numericalize(seqs)
        device = next(self.bert_encoder.parameters()).device
        subwords = pad_sequence(subwords, batch_first=True).to(device)
        masks = pad_sequence(masks, batch_first=True).to(device)
        starts = pad_sequence(starts, batch_first=True).type(torch.BoolTensor).to(device)
        text_masks = pad_sequence(text_masks, batch_first=True).type(torch.BoolTensor).to(device)
        subwords_mask = subwords_mask.type(torch.BoolTensor).to(device)
        bert_outs = self.bert_encoder.forward(subwords, masks, starts, text_masks, subwords_mask)
        return bert_outs
184 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/pytorch/util.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from torch.autograd import Variable
 4 | 
 5 | 
 6 | def block_orth_normal_initializer(input_size, output_size):
 7 |     weight = []
 8 |     for o in output_size:
 9 |         for i in input_size:
10 |             param = torch.FloatTensor(o, i)
11 |             torch.nn.init.orthogonal_(param)
12 |             weight.append(param)
13 |     return torch.cat(weight)
14 | 
15 | 
def batch_data_variable(batch_x, batch_y, batch_lengths, batch_weights):
    """Pack a batch of variable-length sentences into zero-padded tensors.

    For every sentence, ``s_words[1]`` supplies word ids, ``s_words[2]``
    predicate ids and ``s_answer[0]`` gold labels; only the first
    ``s_length`` positions are read.  ``batch_weights`` is iterated in
    lockstep with the other inputs but its values are not used.

    Returns:
        ``(words, predicates, labels, lengths, masks, padding_answers)``:
        padded long tensors of word and predicate ids, a list of int32 numpy
        label arrays (one per sentence, unpadded), a LongTensor of lengths,
        a float mask with 1 at real positions, and the labels as a padded
        long tensor.
    """
    num_sentences = len(batch_x)  # batch size
    max_length = max(batch_lengths)

    def _padded_long():
        # Fresh (num_sentences, max_length) integer tensor filled with zeros.
        return Variable(torch.LongTensor(num_sentences, max_length).zero_(),
                        requires_grad=False)

    words = _padded_long()  # padding with 0
    predicates = _padded_long()
    masks = Variable(torch.Tensor(num_sentences, max_length).zero_(),
                     requires_grad=False)
    padding_answers = _padded_long()
    labels, lengths = [], []

    rows = zip(batch_x, batch_y, batch_lengths, batch_weights)
    for row, (s_words, s_answer, s_length, _) in enumerate(rows):
        lengths.append(s_length)
        rel = np.zeros((s_length), dtype=np.int32)
        for col in range(s_length):
            gold = s_answer[0][col]
            words[row, col] = s_words[1][col]  # word
            predicates[row, col] = s_words[2][col]  # predicate
            rel[col] = gold
            padding_answers[row, col] = gold
            masks[row, col] = 1
        labels.append(rel)

    return (words, predicates, labels, torch.LongTensor(lengths),
            masks, padding_answers)
49 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__init__.py


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/configuration.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/conll_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constants.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_extraction.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/constituent_reader.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/dictionary.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/evaluation.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/inference_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/measurements.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/reader.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/srl_eval_utils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/syntactic_extraction.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KiroSummer/opinion_mining_with_syn_cons/8ad22dbf2daeeb3963111231b1c9f52fb59e76bc/src/orl-4.1/neural_srl/shared/__pycache__/tagger_data.cpython-37.pyc


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/configuration.py:
--------------------------------------------------------------------------------
 1 | ''' Configuration for experiments.
 2 | '''
 3 | import json
 4 | from argparse import Namespace
 5 | 
 6 | 
 7 | def get_config(config_filepath):
 8 |     with open(config_filepath, 'r') as config_file:
 9 |         conf = json.load(config_file, object_hook=lambda d: Namespace(**d))
10 |     return conf
11 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/conll_utils.py:
--------------------------------------------------------------------------------
 1 | def bio_to_se(labels):
 2 |     slen = len(labels)
 3 |     new_labels = []
 4 |     has_opening = False
 5 |     for i in range(slen):
 6 |         label = labels[i]
 7 |         if label == 'O':
 8 |             new_labels.append('*')
 9 |             continue
10 |         new_label = '*'
11 |         if label[0] == 'B' or i == 0 or label[1:] != labels[i - 1][1:]:
12 |             new_label = '(' + label[2:] + new_label
13 |             has_opening = True
14 |         if i == slen - 1 or labels[i + 1][0] == 'B' or label[1:] != labels[i + 1][1:]:
15 |             new_label = new_label + ')'
16 |             has_opening = False
17 |         new_labels.append(new_label)
18 | 
19 |     if has_opening:
20 |         ''' logging '''
21 |         print("Has unclosed opening: {}".format(labels))
22 |     return new_labels
23 | 
24 | 
def print_sentence_to_conll(fout, tokens, labels):
    """Write one sentence as fixed-width CoNLL columns, then a blank line.

    Args:
        fout: Writable text stream.
        tokens: Strings for the first column, one per row (left-justified to
            15 characters).
        labels: Label columns; each must have one string per token
            (right-justified to 15 characters).
    """
    num_tokens = len(tokens)
    assert all(len(column) == num_tokens for column in labels)
    for row, token in enumerate(tokens):
        cells = [token.ljust(15)]
        cells.extend(column[row].rjust(15) for column in labels)
        cells.append("\n")
        fout.write(''.join(cells))
    # Sentence separator.
    fout.write("\n")
34 | 
35 | 
def print_to_conll(pred_labels, gold_props_file, output_filename):
    """Write predicted label columns next to the tokens of a gold props file.

    The gold file is read sentence by sentence (sentences are separated by
    blank lines). For each sentence, the number of label columns on its rows
    decides how many entries of ``pred_labels`` are consumed.

    Leading or repeated blank lines in the gold file are ignored; previously
    they raised ``IndexError``. Both files are closed even if writing fails.

    Args:
        pred_labels: Sequence of label columns, ordered as the predicates
            appear in the gold file.
        gold_props_file: Path of the CoNLL-formatted gold props file.
        output_filename: Path of the file to write.
    """
    seq_ptr = 0
    num_props_for_sentence = 0
    tokens_buf = []

    with open(output_filename, 'w') as fout, open(gold_props_file, 'r') as fin:
        for line in fin:
            info = line.split()
            if not info:
                # Blank line: flush the buffered sentence, if there is one.
                if tokens_buf:
                    print_sentence_to_conll(
                        fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
                    seq_ptr += num_props_for_sentence
                    tokens_buf = []
                    num_props_for_sentence = 0
                continue
            # First field is the token/lemma column, the rest are predicates.
            num_props_for_sentence = len(info) - 1
            tokens_buf.append(info[0])

        # Output last sentence (file without a trailing blank line).
        if tokens_buf:
            print_sentence_to_conll(
                fout, tokens_buf, pred_labels[seq_ptr:seq_ptr + num_props_for_sentence])
61 | 
62 | 
def print_gold_to_conll(data, word_dict, label_dict, output_filename):
    """Write gold BIO labels to a CoNLL props file, grouped by sentence.

    Consecutive samples with identical words are treated as one sentence
    with several predicates. A sample contributes a column only if its labels
    contain ``'B-V'``. Sentences without any such sample are not written.

    The buffers are now reset whenever the sentence changes. Previously a
    sample without ``'B-V'`` left the old tokens in place, so the next
    sentence's predicates were paired with the wrong tokens.

    Args:
        data: 4-tuple ``(x, y, num_tokens, _)``; ``x`` holds per-token feature
            rows whose first entry is a word id, ``y`` holds label ids and
            ``num_tokens`` the sentence lengths.
        word_dict: Object whose ``idx2str`` maps word ids to strings.
        label_dict: Object whose ``idx2str`` maps label ids to strings.
        output_filename: Path of the file to write.
    """
    def _flush(fout, tokens_buf, props_buf, labels_buf):
        # Only predicate positions keep their word; other rows show '-'.
        predicate_positions = set(props_buf)
        tokens = [w if i in predicate_positions else '-' for i, w in enumerate(tokens_buf)]
        print_sentence_to_conll(fout, tokens, labels_buf)

    props_buf = []
    labels_buf = []
    tokens_buf = []
    prev_words = None

    with open(output_filename, 'w') as fout:
        x, y, num_tokens, _ = data
        for sent, gold, slen in zip(x, y, num_tokens):
            words = [word_dict.idx2str[w[0]] for w in sent[:slen]]
            labels = [label_dict.idx2str[l] for l in gold[:slen]]

            concat_words = ' '.join(words)
            if concat_words != prev_words:
                # New sentence: emit the previous one and start clean buffers.
                if props_buf:
                    _flush(fout, tokens_buf, props_buf, labels_buf)
                props_buf = []
                labels_buf = []
                tokens_buf = words
                prev_words = concat_words

            if 'B-V' in labels:
                props_buf.append(labels.index('B-V'))
                labels_buf.append(bio_to_se(labels))

        if props_buf:
            _flush(fout, tokens_buf, props_buf, labels_buf)
98 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constants.py:
--------------------------------------------------------------------------------
 1 | from os.path import join
 2 | import os
 3 | import random
 4 | 
 5 | ROOT_DIR = join(os.path.dirname(os.path.abspath(__file__)), '../../../')
 6 | 
 7 | RANDOM_SEED = 12345
 8 | random.seed(RANDOM_SEED)
 9 | 
10 | SRL_CONLL_EVAL_SCRIPT = join(ROOT_DIR, '../run_eval.sh')
11 | 
12 | START_MARKER = '<S>'
13 | END_MARKER = '</S>'
14 | PADDING_TOKEN = '*PAD*'
15 | UNKNOWN_TOKEN = '*UNKNOWN*'
16 | NULL_LABEL = 'O'
17 | 
18 | TEMP_DIR = join(ROOT_DIR, '../temp')
19 | 
20 | # assert os.path.exists(SRL_CONLL_EVAL_SCRIPT)
21 | if not os.path.exists(TEMP_DIR):
22 |     os.makedirs(TEMP_DIR)
23 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constituent_extraction.py:
--------------------------------------------------------------------------------
  1 | import nltk
  2 | import sys
  3 | import numpy as np
  4 | import random
  5 | 
  6 | from .dictionary import Dictionary
  7 | from collections import OrderedDict
  8 | from nltk.tree import Tree
  9 | from .constants import PADDING_TOKEN, UNKNOWN_TOKEN
 10 | # from .reader import list_of_words_to_ids
 11 | 
 12 | 
 13 | PREFIX = "--PTB-CONS-LABEL--"
 14 | 
 15 | 
 16 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 17 |     ids = []
 18 |     for s in list_of_words:
 19 |         # s = s.encode('utf-8')  # unicode -> utf-8
 20 |         if s is None:
 21 |             ids.append(-1)
 22 |             continue
 23 |         if lowercase:
 24 |             s = s.lower()
 25 |         if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 26 |             s = UNKNOWN_TOKEN
 27 |         ids.append(dictionary.add(s))
 28 |     return ids
 29 | 
 30 | 
 31 | class constituent_tree():
 32 |     def __init__(self, sentence, words, tree):
 33 |         self.sentence = sentence
 34 |         self.words = words
 35 |         self.tree = tree
 36 |         self.heads = None
 37 |         self.nodes = None
 38 |         self.indicator = []  # 0 no terminal, 1 terminal
 39 |         self.word_position = []
 40 |         self.node_idx = []
 41 |         self.node_char_idx = []
 42 | 
 43 |         self.sentence_length = len(words)
 44 |         self.input_length = -1
 45 |         self.sentence_index = -1
 46 | 
 47 |     def pos(self):
 48 |         """[('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]"""
 49 |         return self.tree.pos()
 50 | 
 51 |     def traverse_tree(self, tree, nodes, indicator, heads, parent, pos, label, word_embeddings):
 52 |         # print(tree)
 53 |         # print("subtree", subtree)
 54 |         if tree.height() > 2:
 55 |             subtree_label = PREFIX + tree.label()
 56 |             label.add(subtree_label)
 57 |             constituent_tree.add_unknown_labels(subtree_label, word_embeddings)
 58 |             nodes.append(subtree_label)
 59 |             indicator.append(0)
 60 |             heads.append(parent - 1)
 61 |         else:
 62 |             # print("YY", subtree)
 63 |             pos.add(tree.label())
 64 |             subtree_pos = tree[0]  # word
 65 |             subtree_pos = constituent_tree.add_word(subtree_pos, label, word_embeddings)
 66 |             nodes.append(subtree_pos)
 67 |             indicator.append(1)
 68 |             idx = len(nodes) - 1
 69 |             self.word_position.append(idx)
 70 |             heads.append(parent - 1)
 71 |         if tree.height() <= 2:
 72 |             return
 73 |         parent = len(nodes)
 74 |         for i, subtree in enumerate(tree):
 75 |             self.traverse_tree(subtree, nodes, indicator, heads, parent, pos, label, word_embeddings)
 76 | 
 77 |     @staticmethod
 78 |     def add_unknown_labels(label, word_embeddings):
 79 |         if label not in word_embeddings:
 80 |             embedding_size = len(word_embeddings[PADDING_TOKEN])
 81 |             word_embeddings[label] = np.asarray([random.gauss(0, 0.01) for _ in range(embedding_size)])
 82 | 
 83 |     @staticmethod
 84 |     def add_word(word, word_dict, word_embeddings):
 85 |         if word not in word_embeddings:
 86 |             word = UNKNOWN_TOKEN
 87 |         idx = word_dict.add(word)
 88 |         return word
 89 | 
 90 |     @staticmethod
 91 |     def get_node_char_idx(words, char_dict, lowercase=False):
 92 |         max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
 93 |         single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int)
 94 |         for i, word in enumerate(words):
 95 |             single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
 96 |         return single_sample_char_tokens
 97 | 
 98 |     def generate_adjacent(self, pos, label_dict, char_dict, word_embeddings):
 99 |         assert self.heads is None
100 |         root_label = PREFIX + self.tree.label()
101 |         nodes, heads = [], []  # TODO notice
102 |         self.traverse_tree(self.tree, nodes, self.indicator, heads,
103 |                            len(heads), pos, label_dict, word_embeddings)
104 |         self.nodes = nodes
105 |         self.heads = heads
106 |         self.input_length = len(self.nodes)
107 |         self.sentence_index = self.input_length - self.sentence_length - 1
108 |         self.node_idx = [label_dict.get_index(node) for node in self.nodes]
109 | 
110 |         max_word_length = max([len(w) for w in self.nodes] + [3, 4, 5])  # compare with character cnn filter width
111 |         self.node_char_idx = np.zeros([len(self.nodes), max_word_length], dtype=np.int64)
112 |         for i, word in enumerate(self.nodes):
113 |             self.node_char_idx[i, :len(word)] = list_of_words_to_ids(word, char_dict)
114 | 
115 |         self.node_char_idx = constituent_tree.get_node_char_idx(self.nodes, char_dict)
116 | 
117 | 
def load_constituent_trees(file_path, word_dict, char_dict, word_embeddings):
    """Read bracketed constituency trees and flatten each of them.

    Trees may span several lines and are separated by blank lines. The last
    tree is kept even when the file has no trailing blank line, and repeated
    blank lines no longer produce empty trees.

    Args:
        file_path: Path of the tree file.
        word_dict: Dictionary extended with words and constituent labels.
        char_dict: Dictionary extended with characters of the node strings.
        word_embeddings: Mapping of known words to vectors; extended with
            constituent labels.

    Returns:
        ``(cons_trees, pos_dict)``: an ``OrderedDict`` from the space-joined
        sentence to its ``constituent_tree``, and the dictionary of tags
        collected while flattening.
    """
    data = []
    pieces = []
    with open(file_path, 'r') as input_file:
        for line in input_file:
            line = line.strip()
            if line == "":
                if pieces:
                    data.append(''.join(pieces))
                    pieces = []
                continue
            if ' ' not in line:  # avoid the split of leave node of it's PoS
                line = ' ' + line
            pieces.append(line)
    if pieces:
        # File ended without a blank line after the final tree.
        data.append(''.join(pieces))
    print("Read {} sentence from {}".format(len(data), file_path))

    cons_trees = OrderedDict()
    for bracketed in data:
        tree = Tree.fromstring(bracketed)
        words = tree.leaves()
        sentence = ' '.join(words)
        cons_trees[sentence] = constituent_tree(sentence, words, tree)

    pos_dict = Dictionary(padding_token=PADDING_TOKEN)
    for tree in cons_trees.values():
        tree.generate_adjacent(pos_dict, word_dict, char_dict, word_embeddings)

    return cons_trees, pos_dict
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/constituent_reader.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import codecs
  3 | import numpy as np
  4 | 
  5 | 
  6 | from sortedcontainers import SortedSet
  7 | from .constants import START_MARKER, END_MARKER, UNKNOWN_TOKEN, PADDING_TOKEN, NULL_LABEL
  8 | from .dictionary import Dictionary
  9 | 
 10 | 
 11 | def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 12 |     ids = []
 13 |     for s in list_of_words:
 14 |         # s = s.encode('utf-8')  # unicode -> utf-8
 15 |         if s is None:
 16 |             ids.append(-1)
 17 |             continue
 18 |         if lowercase:
 19 |             s = s.lower()
 20 |         if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 21 |             s = UNKNOWN_TOKEN
 22 |         ids.append(dictionary.add(s))
 23 |     return ids
 24 | 
 25 | 
 26 | class constituent_sentence():
 27 |     def __init__(self, obj):
 28 |         self.sentence = obj["sentence"]
 29 |         self.constituent_spans = obj["constituents"]
 30 |         self.max_span_width = 30
 31 | 
 32 |     def tokenize_cons_spans(self, dictionary, max_cons_width=60):
 33 |         cons_span = []
 34 |         set_cons_span = set()
 35 |         for cons_s in self.constituent_spans:  # remove self-loop V-V
 36 |             cons_start, cons_end, cons_label = cons_s
 37 |             if cons_label in ["TOP", "S"]:  # todo: add some constrains here
 38 |                 continue
 39 |             if cons_end - cons_start + 1 >= max_cons_width:
 40 |                 continue
 41 |             if (cons_start, cons_end) not in set_cons_span:
 42 |                 set_cons_span.add((cons_start, cons_end))
 43 |                 cons_span.append([int(cons_start), int(cons_end), int(dictionary.add(cons_label))])
 44 |             else:
 45 |                 # print("duplicate span of", (cons_start, cons_end, cons_label), '\n', self.sentence)
 46 |                 pass
 47 |         if len(cons_span) == 0:  # if the sentence has no arguments.
 48 |             return [[], [], []]
 49 |         tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels = \
 50 |             zip(*cons_span)
 51 |         return tokenized_arg_starts, tokenized_arg_ends, tokenized_arg_labels
 52 | 
 53 | 
 54 | def read_constituent_file(file_path):
 55 |     sentences = []
 56 |     with codecs.open(file_path, encoding="utf8") as f:
 57 |         for line in f.readlines():
 58 |             sen = json.loads(line)
 59 |             cons_sen = constituent_sentence(sen)
 60 |             sentences.append(cons_sen)
 61 |     print("{} total constituent sentences number {}".format(file_path, len(sentences)))
 62 |     return sentences
 63 | 
 64 | 
 65 | def tokenize_cons_data(samples, word_dict, char_dict, label_dict, lowercase=False, pretrained_word_embedding=False):
 66 |     sample_word_tokens = [list_of_words_to_ids(
 67 |         sent.sentence, word_dict, lowercase, pretrained_word_embedding) for sent in samples]
 68 |     # for the character
 69 |     sample_char_tokens = []
 70 |     for sent in samples:
 71 |         words = sent.sentence
 72 |         max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
 73 |         single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int)
 74 |         for i, word in enumerate(words):
 75 |             single_sample_char_tokens[i, :len(word)] = list_of_words_to_ids(word, char_dict, lowercase)
 76 |         # Add the sample char tokens into the sample_char_tokens
 77 |         sample_char_tokens.append(single_sample_char_tokens)
 78 |     sample_lengths = [len(sent.sentence)for sent in samples]
 79 |     sample_cons_span_tokens = [sent.tokenize_cons_spans(label_dict) for sent in samples]
 80 |     return list(zip(sample_lengths, sample_word_tokens, sample_char_tokens, sample_cons_span_tokens))
 81 | 
 82 | 
 83 | def get_constituent_data(config, file_path, word_dict=None, char_dict=None, word_embeddings=None):
 84 |     raw_cons_sentences = read_constituent_file(file_path)
 85 |     cons_label_dict = Dictionary()
 86 |     cons_label_dict.set_unknown_token(NULL_LABEL)
 87 | 
 88 |     # tokenized the data
 89 |     if word_dict.accept_new is False:
 90 |         word_dict.accept_new = True
 91 |     if char_dict.accept_new is False:
 92 |         char_dict.accept_new = True
 93 |     cons_samples = tokenize_cons_data(raw_cons_sentences, word_dict, char_dict, cons_label_dict,
 94 |                                       False, word_embeddings)
 95 |     # word_dict.accept_new = False
 96 |     # char_dict.accept_new = False
 97 |     # cons_label_dict.accept_new = False
 98 | 
 99 |     print("="*10, "Constituent Info", "="*10)
100 |     print("Extract {} tags".format(cons_label_dict.size()))
101 |     # print("Extract {} words and {} tags".format(word_dict.size(), cons_label_dict.size()))
102 |     print("Max sentence length: {}".format(max([s[0] for s in cons_samples])))
103 |     return cons_samples, word_dict, char_dict, cons_label_dict
104 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/dictionary.py:
--------------------------------------------------------------------------------
 1 | ''' Bidirectional dictionary that maps between words and ids.
 2 | '''
 3 | 
 4 | 
 5 | class Dictionary(object):
 6 |     def __init__(self, padding_token=None, unknown_token=None):
 7 |         self.str2idx = {}
 8 |         self.idx2str = []
 9 | 
10 |         self.accept_new = True
11 |         self.padding_token = None
12 |         self.padding_id = None
13 |         self.unknown_token = None
14 |         self.unknown_id = None
15 |         if padding_token is not None:  # add the padding info into the dictionary
16 |             self.set_padding_token(padding_token)
17 |         if unknown_token is not None:
18 |             self.set_unknown_token(unknown_token)
19 | 
20 |     def set_padding_token(self, padding_token):
21 |         self.padding_token = padding_token
22 |         self.padding_id = self.add(self.padding_token)
23 | 
24 |     def set_unknown_token(self, unknown_token):
25 |         self.unknown_token = unknown_token
26 |         self.unknown_id = self.add(self.unknown_token)
27 | 
28 |     def add(self, new_str):
29 |         if new_str not in self.str2idx:
30 |             if self.accept_new:
31 |                 self.str2idx[new_str] = len(self.idx2str)
32 |                 self.idx2str.append(new_str)
33 |             else:
34 |                 if new_str == "C-ADV":
35 |                     return self.str2idx["O"]
36 |                 if self.unknown_id is None:
37 |                     raise LookupError(
38 |                         'Trying to add new token to a freezed dictionary with no pre-defined unknown token: ' + new_str)
39 |                 return self.unknown_id
40 | 
41 |         return self.str2idx[new_str]
42 | 
43 |     def add_all(self, str_list):
44 |         return [self.add(s) for s in str_list]
45 | 
46 |     def get_index(self, input_str):
47 |         if input_str in self.str2idx:
48 |             return self.str2idx[input_str]
49 |         return None
50 | 
51 |     def size(self):
52 |         return len(self.idx2str)
53 | 
54 |     def save(self, filename):
55 |         with open(filename, 'w') as f:
56 |             for s in self.idx2str:
57 |                 f.write(s + '\n')
58 |             f.close()
59 | 
60 |     def load(self, filename):
61 |         with open(filename, 'r') as f:
62 |             for line in f:
63 |                 line = line.strip()
64 |                 if line != '':
65 |                     self.add(line)
66 |             f.close()
67 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/evaluation.py:
--------------------------------------------------------------------------------
 1 | ''' Framework independent evaluator. Not in use yet.
 2 | '''
 3 | import numpy
 4 | import os
 5 | from os.path import join
 6 | # import subprocess
 7 | from .constants import ROOT_DIR
 8 | from .conll_utils import print_gold_to_conll
 9 | # from .measurements import Timer
10 | 
11 | 
class TaggerEvaluator(object):
    """Token-level accuracy tracker for a tagger.

    ``data`` is a sequence of samples where ``sample[1]`` is a 2-D array with
    the gold tags along its second axis and ``sample[2]`` is the sentence
    length.
    """

    def __init__(self, data):
        self.data = data
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Store the percentage of correctly predicted tags in ``self.accuracy``.

        Args:
            predictions: One 1-D array of tag ids per sample, in the same
                order as ``self.data``.
        """
        samples = self.data
        expected_lengths = [sample[2] for sample in samples]
        # the predication's order should be the origin
        for predicted, length in zip(predictions, expected_lengths):
            assert len(predicted) == length

        flat_predictions = numpy.concatenate(predictions)
        gold_rows = [sample[1].reshape(sample[1].shape[1]) for sample in samples]
        flat_gold = numpy.concatenate(gold_rows)

        num_correct = numpy.equal(flat_predictions, flat_gold).sum()
        num_total = flat_gold.shape[0]
        self.accuracy = (100.0 * num_correct) / num_total
        print("Accuracy: {:.3f} ({}/{})".format(self.accuracy, num_correct,
                                                num_total))

    def evaluate(self, predictions):
        """Score ``predictions`` and update the best accuracy seen so far."""
        self.compute_accuracy(predictions)
        improved = self.accuracy > self.best_accuracy
        self.has_best = improved
        if improved:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
40 | 
41 | 
class PropIdEvaluator(object):
    """Precision / recall / F1 tracker for one target label.

    ``data`` is a 4-tuple whose second entry holds the gold label ids and
    whose fourth entry holds per-position weights. Despite its name,
    ``accuracy`` stores the F1 score in percent.
    """

    def __init__(self, data, label_dict, target_label='V',
                 use_se_marker=False):
        self.data = data
        self.label_dict = label_dict
        self.target_label_id = label_dict.str2idx[target_label]
        self.best_accuracy = 0.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Compute weighted precision, recall and F1 for the target label.

        A metric whose denominator is zero is reported as ``0.0`` instead of
        NaN, so later comparisons against ``best_accuracy`` stay meaningful.

        Args:
            predictions: Array of predicted label ids, broadcastable against
                the gold labels and weights.
        """
        _, y, _, weights = self.data
        identified = numpy.equal(predictions, self.target_label_id)
        num_correct = numpy.sum(
            numpy.logical_and(numpy.equal(predictions, y), identified) * weights)
        num_identified = numpy.sum(identified * weights)
        num_gold = numpy.sum(numpy.equal(y, self.target_label_id) * weights)

        self.precision = 100.0 * num_correct / num_identified if num_identified > 0 else 0.0
        self.recall = 100.0 * num_correct / num_gold if num_gold > 0 else 0.0
        denominator = self.precision + self.recall
        self.accuracy = 2 * self.precision * self.recall / denominator if denominator > 0 else 0.0
        print("Accuracy: {:.3f} ({:.3f}, {:.3f})".format(
            self.accuracy, self.precision, self.recall))

    def evaluate(self, predictions):
        """Score ``predictions`` and update the best score seen so far."""
        self.compute_accuracy(predictions)
        self.has_best = self.accuracy > self.best_accuracy
        if self.has_best:
            print("Best accuracy so far: {:.3f}".format(self.accuracy))
            self.best_accuracy = self.accuracy
75 | 
76 | 
class SRLEvaluator(TaggerEvaluator):
    """Placeholder SRL evaluator whose ``compute_accuracy`` ends the process."""

    def __init__(self):
        # TaggerEvaluator.__init__ is not invoked, so ``self.data`` is never set.
        self.best_accuracy = -1.0
        self.has_best = False

    def compute_accuracy(self, predictions):
        """Print ``"exit()"`` and call ``exit()``; ``predictions`` is ignored."""
        print("exit()")
        exit()
85 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/features.py:
--------------------------------------------------------------------------------
 1 | def get_srl_features(sentences, config, feature_dicts=None):
 2 |     ''' TODO: Support adding more features.
 3 |     '''
 4 |     feature_names = config.features
 5 |     feature_sizes = config.feature_sizes
 6 |     use_se_marker = config.use_se_marker
 7 | 
 8 |     features = []
 9 |     feature_shapes = []
10 |     for fname, fsize in zip(feature_names, feature_sizes):
11 |         if fname == "predicate":
12 |             offset = int(use_se_marker)
13 |             offset = 1  # pad is in the position 0
14 |             features.append([[int((i == sent[2]) + offset) for i in range(len(sent[1]))] for sent in sentences])
15 |             feature_shapes.append([2, fsize])
16 |     return (zip(*features), feature_shapes)
17 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/inference.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | 
 3 | 
 4 | def get_transition_params(label_strs):
 5 |     """Construct transtion scoresd (0 for allowed, -inf for invalid).
 6 |       Args:
 7 |         label_strs: A [num_tags,] sequence of BIO-tags.
 8 |       Returns:
 9 |         A [num_tags, num_tags] matrix of transition scores.
10 |     """
11 |     num_tags = len(label_strs)
12 |     transition_params = numpy.zeros([num_tags, num_tags], dtype=numpy.float32)
13 |     for i, prev_label in enumerate(label_strs):
14 |         for j, label in enumerate(label_strs):
15 |             if i != j and label[0] == 'I' and not prev_label == 'B' + label[1:]:
16 |                 transition_params[i, j] = numpy.NINF
17 |     return transition_params
18 | 
19 | 
def viterbi_decode(score, transition_params):
    """Find the highest scoring tag sequence with the Viterbi algorithm.

    Adapted from the TensorFlow implementation; intended for test time only.

        Args:
            score: A [seq_len, num_tags] matrix of unary potentials.
            transition_params: A [num_tags, num_tags] matrix of binary
              potentials, indexed as [previous_tag, current_tag].
        Returns:
            viterbi: A [seq_len] list of integers containing the highest
              scoring tag indices.
            viterbi_score: A float containing the score for the Viterbi sequence.
    """
    num_steps = score.shape[0]
    trellis = numpy.zeros_like(score)
    backpointers = numpy.zeros_like(score, dtype=numpy.int32)

    # Forward pass: best score of any path ending in each tag at each step.
    trellis[0] = score[0]
    for step in range(1, num_steps):
        candidates = trellis[step - 1][:, numpy.newaxis] + transition_params
        trellis[step] = score[step] + candidates.max(axis=0)
        backpointers[step] = candidates.argmax(axis=0)

    # Backward pass: follow the back-pointers from the best final tag.
    final_scores = trellis[-1]
    path = [numpy.argmax(final_scores)]
    for step in range(num_steps - 1, 0, -1):
        path.append(backpointers[step][path[-1]])
    path.reverse()
    return path, numpy.max(final_scores)
45 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/io_utils.py:
--------------------------------------------------------------------------------
  1 | from google.protobuf.internal import encoder
  2 | 
  3 | _EncodeVarint = encoder._VarintEncoder()
  4 | 
  5 | 
  6 | def write_delimited_to(out_file, message):
  7 |     msg_size = message.ByteSize()
  8 |     pieces = []
  9 |     _EncodeVarint(pieces.append, msg_size)
 10 |     out_file.write(b"".join(pieces))
 11 |     out_file.write(message.SerializeToString())
 12 | 
 13 | 
 14 | def read_gold_props(gold_props_file):
 15 |     """ Read gold predicates from CoNLL-formatted file.
 16 |   """
 17 |     gold_props = []
 18 |     props = []
 19 |     with open(gold_props_file, 'r') as f:
 20 |         for line in f:
 21 |             line = line.strip()
 22 |             if line == '':
 23 |                 gold_props.append(props)
 24 |                 props = []
 25 |             else:
 26 |                 props.append(line.split()[0])
 27 |         f.close()
 28 |     if len(props) > 0:
 29 |         gold_props.append(props)
 30 |     return gold_props
 31 | 
 32 | 
 33 | def write_predprops_to(predictions,
 34 |                        label_dict,
 35 |                        input_file,
 36 |                        output_file,
 37 |                        gold_props_file=None,
 38 |                        output_props_file=None):
 39 |     """ Write predicted predicate information to files.
 40 | 
 41 |       Arguments:
 42 |         predictions: Predictions from the predicate identification model.
 43 |                       Is a numpy array of size [num_sentences, max_sentence_length].
 44 |         label_dict: Label dictionary.
 45 |         input_file: Input sequential tagging file.
 46 |         output_file: Output SRL file with identified predicates.
 47 |         gold_props_file: Input file with gold predicates in CoNLL format.
 48 |         output_props_file: Output SRL file with identified predicates, in CoNLL format.
 49 |   """
 50 | 
 51 |     fin = open(input_file, 'r')
 52 |     fout = open(output_file, 'w')
 53 | 
 54 |     if output_props_file is not None and output_props_file != '':
 55 |         fout_props = open(output_props_file, 'w')
 56 |     else:
 57 |         fout_props = None
 58 | 
 59 |     if gold_props_file is not None and gold_props_file != '':
 60 |         gold_props = read_gold_props(gold_props_file)
 61 |         print(len(gold_props), len(predictions))
 62 |         assert len(gold_props) == len(predictions)
 63 |     else:
 64 |         gold_props = None
 65 | 
 66 |     sent_id = 0
 67 |     for line in fin:
 68 |         # Read original sentence from input file.
 69 |         raw_sent = line.split('|||')[0].strip()
 70 |         tokens = raw_sent.split(' ')
 71 |         slen = len(tokens)
 72 |         pred = predictions[sent_id, :slen]
 73 |         props = []
 74 | 
 75 |         for (t, p) in enumerate(pred):
 76 |             if label_dict.idx2str[p] == 'V':
 77 |                 out_tags = ['O' for _ in range(slen)]
 78 |                 out_tags[t] = 'B-V'
 79 |                 out_line = str(t) + '\t' + raw_sent + ' ||| ' + ' '.join(
 80 |                     out_tags) + '\n'
 81 |                 fout.write(out_line)
 82 |                 props.append(t)
 83 | 
 84 |         if fout_props is not None:
 85 |             if sent_id > 0:
 86 |                 fout_props.write('\n')
 87 |             for t in range(slen):
 88 |                 lemma = 'P' + tokens[t].lower()
 89 |                 # In order for CoNLL evaluation script to run, we need to output the same
 90 |                 # lemma as the gold predicate in the CoNLL-formatted file.
 91 |                 if gold_props is not None and gold_props[sent_id][t] != '-':
 92 |                     lemma = gold_props[sent_id][t]
 93 |                 if t in props:
 94 |                     fout_props.write(lemma)
 95 |                 else:
 96 |                     fout_props.write('-')
 97 |                 for p in props:
 98 |                     if t == p:
 99 |                         fout_props.write('\t(V*)')
100 |                     else:
101 |                         fout_props.write('\t*')
102 |                 fout_props.write('\n')
103 |             sent_id += 1
104 | 
105 |     fout.close()
106 |     print('Predicted predicates in sequential-tagging format written to: {}.'.
107 |           format(output_file))
108 |     if fout_props is not None:
109 |         fout_props.close()
110 |         print('CoNLL-formatted predicate information written to: {}.'.format(
111 |             output_props_file))
112 | 
113 | 
def bio_to_spans(predictions, label_dict):
    """ Group a BIO tag sequence into labeled spans.

      Arguments:
        predictions: Sequence of label ids for one sentence, already cut down
                     to the real sentence length.
        label_dict: Object whose idx2str maps a label id to its tag string
                    ('O', 'B-XXX' or 'I-XXX').
      Returns:
        A list of [label, start, end] lists (end inclusive), in the order the
        spans occur in the sentence.
    """
    tags = [label_dict.idx2str[p] for p in predictions]
    last = len(predictions) - 1
    spans = []
    for pos, tag in enumerate(tags):
        if tag == 'O':
            continue
        label = tag[2:]

        # A span opens on an explicit 'B', on the very first non-O tag, or
        # whenever the label differs from the previous token's label (an 'O'
        # neighbour has the empty label, so it always differs).
        opens = tag[0] == 'B' or not spans or tags[pos - 1][2:] != label
        if opens:
            spans.append([label, pos, -1])

        # A span closes at the end of the sentence, right before a 'B', or
        # right before a token carrying a different label.
        closes = (pos == last or tags[pos + 1][0] == 'B'
                  or tags[pos + 1][2:] != label)
        if closes:
            spans[-1][2] = pos
    return spans
136 | 
137 | 
def print_to_readable(predictions, num_tokens, label_dict, input_path,
                      output_path):
    """ Print predictions to human-readable format.

      Every input line is read as "<pid> <tok> <tok> ... ||| <anything>": the
      first whitespace-separated field before '|||' is the predicate index and
      the remaining fields are the sentence tokens. For each line the sentence,
      its predicate and one line per predicted argument span are written.

      Arguments:
        predictions: Indexable by sample; predictions[i] is the label-id
                     sequence for the i-th input line.
        num_tokens: Unused; kept so existing callers keep working.
        label_dict: Label dictionary, forwarded to bio_to_spans.
        input_path: Sequential tagging file to read.
        output_path: File to write the readable dump to (overwritten).
    """
    # NOTE(review): bio_to_spans documents its input as already truncated to
    # the sentence length; predictions[i] is passed through untouched here, so
    # callers presumably pre-truncate -- confirm.
    # Both handles are managed by `with` so they are closed even if a line
    # fails to parse. The output is opened first, as before.
    with open(output_path, 'w') as fout, open(input_path, 'r') as fin:
        for sample_id, line in enumerate(fin):
            info = line.split('|||')[0].strip().split()
            pid = int(info[0])
            sent = info[1:]
            fout.write(' '.join(sent) + '\n')
            fout.write('\tPredicate: {}({})\n'.format(sent[pid], pid))

            arg_spans = bio_to_spans(predictions[sample_id], label_dict)
            for label, start, end in arg_spans:
                fout.write('\t\t{}: {}\n'.format(
                    label, " ".join(sent[start:end + 1])))
            fout.write('\n')
160 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/measurements.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import time
 3 | 
 4 | 
 5 | class Timer:
 6 |     def __init__(self, name, active=True):
 7 |         self.name = name if active else None
 8 | 
 9 |     def __enter__(self):
10 |         self.start = time.time()
11 |         self.last_tick = self.start
12 |         return self
13 | 
14 |     def __exit__(self, *args):
15 |         if self.name is not None:
16 |             print("{} duration was {}.".format(
17 |                 self.name, self.readable(time.time() - self.start)))
18 | 
19 |     def readable(self, seconds):
20 |         return str(datetime.timedelta(seconds=int(seconds)))
21 | 
22 |     def tick(self, message):
23 |         current = time.time()
24 |         print("{} took {} ({} since last tick).".format(
25 |             message, self.readable(current - self.start),
26 |             self.readable(current - self.last_tick)))
27 |         self.last_tick = current
28 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/numpy_utils.py:
--------------------------------------------------------------------------------
 1 | import numpy
 2 | 
 3 | 
 4 | def orth_normal_initializer(factor=1.0, seed=None):
 5 |     ''' Reference: Exact solutions to the nonlinear dynamics of learning in
 6 |                  deep linear neural networks
 7 |         Saxe et al., 2014. https://arxiv.org/pdf/1312.6120.pdf
 8 |       Adapted from the original implementation by Mingxuan Wang.
 9 |   '''
10 |     def _initializer(shape, dtype):
11 |         assert len(shape) == 2
12 |         rng = numpy.random.RandomState(seed)
13 |         if shape[0] == shape[1]:
14 |             M = rng.randn(*shape).astype(dtype)
15 |             Q, R = numpy.linalg.qr(M)
16 |             Q = Q * numpy.sign(numpy.diag(R))
17 |             param = Q * factor
18 |             return param
19 |         else:
20 |             M1 = rng.randn(shape[0], shape[0]).astype(dtype)
21 |             M2 = rng.randn(shape[1], shape[1]).astype(dtype)
22 |             Q1, R1 = numpy.linalg.qr(M1)
23 |             Q2, R2 = numpy.linalg.qr(M2)
24 |             Q1 = Q1 * numpy.sign(numpy.diag(R1))
25 |             Q2 = Q2 * numpy.sign(numpy.diag(R2))
26 |             n_min = min(shape[0], shape[1])
27 |             param = numpy.dot(Q1[:, :n_min], Q2[:n_min, :]) * factor
28 |             return param
29 | 
30 |     return _initializer
31 | 
32 | 
def block_orth_normal_initializer(input_shapes,
                                  output_shapes,
                                  factor=1.0,
                                  seed=None):
    ''' Initialize one large weight matrix out of orthogonal blocks.

    Input:
      - input_shapes: the sizes of each block along dimension 0.
      - output_shapes: the sizes of each block along dimension 1.
      for example input_shapes = [100, 128] output_shapes=[100,100,100,100]
        indicates eight blocks with shapes [100,100], [128,100], etc.
      - factor, seed: forwarded to orth_normal_initializer for every block.
    Returns:
      A function (shape, dtype) -> array. `shape` is only checked to be 2-D;
      the result has sum(input_shapes) rows and sum(output_shapes) columns.
    '''
    def _initializer(shape, dtype):
        assert len(shape) == 2
        make_block = orth_normal_initializer(factor, seed)
        block_rows = []
        for dim_in in input_shapes:
            # One horizontal strip: all blocks sharing this input size.
            strip = [
                make_block([dim_in, dim_out], dtype)
                for dim_out in output_shapes
            ]
            block_rows.append(numpy.concatenate(strip, axis=1))
        return numpy.concatenate(block_rows, axis=0)

    return _initializer
56 | 
57 | 
def random_normal_initializer(mean=0.0, stddev=0.01, seed=None):
    ''' Build an initializer that samples from a normal distribution.

    A new RandomState(seed) is created on every call of the returned function,
    so a fixed seed reproduces the same values for the same shape.
    '''
    def _initializer(shape, dtype):
        generator = numpy.random.RandomState(seed)
        samples = generator.normal(mean, stddev, shape)
        return numpy.asarray(samples, dtype)

    return _initializer
64 | 
65 | 
def all_zero_initializer():
    ''' Build an initializer that returns an all-zero array cast to dtype. '''
    def _initializer(shape, dtype):
        zeros = numpy.zeros(shape)
        return zeros.astype(dtype)

    return _initializer
71 | 
72 | 
def uniform_initializer(value=0.01):
    ''' Build an initializer that fills the whole array with one constant.

    Despite the name, nothing is sampled: every entry equals `value`.
    '''
    def _initializer(shape, dtype):
        filled = numpy.full(shape, value)
        return filled.astype(dtype)

    return _initializer
78 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/scores_pb2.py:
--------------------------------------------------------------------------------
  1 | # Generated by the protocol buffer compiler.  DO NOT EDIT!
  2 | # source: scores.proto
  3 | 
  4 | import sys
  5 | import tensor_pb2 as tensor__pb2
  6 | from google.protobuf import descriptor as _descriptor
  7 | from google.protobuf import message as _message
  8 | from google.protobuf import reflection as _reflection
  9 | from google.protobuf import symbol_database as _symbol_database
 10 | # from google.protobuf import descriptor_pb2
 11 | 
 12 | 
 13 | # @@protoc_insertion_point(imports)
# Helper applied to the serialized descriptor string below: on Python 2 it
# returns the string unchanged, on Python 3 it encodes it to bytes as latin1.
_b = sys.version_info[0] < 3 and (lambda x: x) or (
    lambda x: x.encode('latin1'))


# Default symbol database; the file descriptor and both message classes in
# this module are registered with it.
_sym_db = _symbol_database.Default()
 19 | 
 20 | 
 21 | DESCRIPTOR = _descriptor.FileDescriptor(
 22 |     name='scores.proto',
 23 |     package='',
 24 |     syntax='proto2',
 25 |     serialized_pb=_b(
 26 |         '\n\x0cscores.proto\x1a\x0ctensor.proto\"H\n\x13SentenceScoresProto\x12\x13\n\x0b\
 27 |           sentence_id\x18\x01 \x01(\r\x12\x1c\n\x06scores\x18\x02 \x01(\x0b\x32\x0c.TensorProto\"6\n\x0bScoresProto\x12\'\n\tsentences\x18\x01 \x03(\x0b\x32\x14.SentenceScoresProto'
 28 |     ),
 29 |     dependencies=[
 30 |         tensor__pb2.DESCRIPTOR,
 31 |     ])
 32 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
 33 | 
# Descriptor of the top-level message SentenceScoresProto, which has two
# fields: sentence_id (field number 1) and scores (field number 2).
# Generated code: the numeric type/cpp_type/label values are protobuf
# FieldDescriptor constants (type 13 = uint32, type 11 = message,
# label 1 = optional).
_SENTENCESCORESPROTO = _descriptor.Descriptor(
    name='SentenceScoresProto',
    full_name='SentenceScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # sentence_id: scalar field with default 0.
        _descriptor.FieldDescriptor(
            name='sentence_id',
            full_name='SentenceScoresProto.sentence_id',
            index=0,
            number=1,
            type=13,
            cpp_type=3,
            label=1,
            has_default_value=False,
            default_value=0,
            message_type=None,
            enum_type=None,
            containing_type=None,
            is_extension=False,
            extension_scope=None,
            options=None),
        # scores: message-typed field; its message_type is filled in after
        # both descriptors exist (see the assignments further down).
        _descriptor.FieldDescriptor(name='scores',
                                    full_name='SentenceScoresProto.scores',
                                    index=1,
                                    number=2,
                                    type=11,
                                    cpp_type=10,
                                    label=1,
                                    has_default_value=False,
                                    default_value=None,
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Byte range of this message inside DESCRIPTOR's serialized_pb.
    serialized_start=30,
    serialized_end=102,
)
 84 | 
# Descriptor of the top-level message ScoresProto, which has a single field
# `sentences` (field number 1). Generated code: type 11 = message and
# label 3 = repeated in protobuf's FieldDescriptor constants, hence the list
# default.
_SCORESPROTO = _descriptor.Descriptor(
    name='ScoresProto',
    full_name='ScoresProto',
    filename=None,
    file=DESCRIPTOR,
    containing_type=None,
    fields=[
        # sentences: message-typed field; its message_type is assigned after
        # this descriptor is built.
        _descriptor.FieldDescriptor(name='sentences',
                                    full_name='ScoresProto.sentences',
                                    index=0,
                                    number=1,
                                    type=11,
                                    cpp_type=10,
                                    label=3,
                                    has_default_value=False,
                                    default_value=[],
                                    message_type=None,
                                    enum_type=None,
                                    containing_type=None,
                                    is_extension=False,
                                    extension_scope=None,
                                    options=None),
    ],
    extensions=[],
    nested_types=[],
    enum_types=[],
    options=None,
    is_extendable=False,
    syntax='proto2',
    extension_ranges=[],
    oneofs=[],
    # Byte range of this message inside DESCRIPTOR's serialized_pb.
    serialized_start=104,
    serialized_end=158,
)
119 | 
# Resolve the message-typed fields now that all descriptors exist:
# SentenceScoresProto.scores is a TensorProto (from tensor_pb2) and
# ScoresProto.sentences is a SentenceScoresProto.
_SENTENCESCORESPROTO.fields_by_name[
    'scores'].message_type = tensor__pb2._TENSORPROTO
_SCORESPROTO.fields_by_name['sentences'].message_type = _SENTENCESCORESPROTO
# Expose both message descriptors on the file descriptor by name.
DESCRIPTOR.message_types_by_name['SentenceScoresProto'] = _SENTENCESCORESPROTO
DESCRIPTOR.message_types_by_name['ScoresProto'] = _SCORESPROTO
125 | 
# Concrete message class for SentenceScoresProto, built from its descriptor
# via the reflection metaclass and registered with the symbol database.
SentenceScoresProto = _reflection.GeneratedProtocolMessageType(
    'SentenceScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SENTENCESCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:SentenceScoresProto)
         ))
_sym_db.RegisterMessage(SentenceScoresProto)

# Concrete message class for ScoresProto, created and registered the same way.
ScoresProto = _reflection.GeneratedProtocolMessageType(
    'ScoresProto',
    (_message.Message, ),
    dict(DESCRIPTOR=_SCORESPROTO,
         __module__='scores_pb2'
         # @@protoc_insertion_point(class_scope:ScoresProto)
         ))
_sym_db.RegisterMessage(ScoresProto)
143 | 
144 | # @@protoc_insertion_point(module_scope)
145 | 


--------------------------------------------------------------------------------
/src/orl-4.1/neural_srl/shared/syntactic_extraction.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import codecs
  3 | 
  4 | from .dictionary import Dictionary
  5 | from .constants import UNKNOWN_TOKEN, PADDING_TOKEN
  6 | from collections import OrderedDict
  7 | 
  8 | 
  9 | class SyntacticTree(object):
 10 |     def __init__(self, sentence_id):
 11 |         self.sentence_id = sentence_id
 12 |         self.word_forms = ["Root"]
 13 |         self.word_forms_ids = []
 14 |         self.char_ids = [[]]  # 2D
 15 |         self.pos_forms = ["Root"]
 16 |         self.heads = [0]
 17 |         self.labels = ["Root"]
 18 |         self.labels_id = []
 19 | 
 20 | 
 21 | class SyntacticCONLL(object):
 22 |     def __init__(self):
 23 |         self.file_name = ""
 24 |         self.trees = []
 25 |         self.sample_dep_data = None
 26 | 
 27 |     def read_from_file(self, filename, max_sentence_length=100, prune_ratio=0.8):
 28 |         self.file_name = filename
 29 | 
 30 |         print("Reading conll syntactic trees from {} and the prune ratio is {}".format(self.file_name, prune_ratio))
 31 |         conll_file = codecs.open(self.file_name, 'r', encoding="utf8")
 32 |         if conll_file.closed:
 33 |             print("Cannot open the syntactic conll file! Please check {}".format(self.file_name))
 34 | 
 35 |         sentence_id = 0
 36 |         a_tree = SyntacticTree(sentence_id)
 37 |         find_root = False
 38 |         for line in conll_file:
 39 |             if line == '\n' or line == '\r\n':  # new sentence
 40 |                 sentence_id += 1
 41 |                 if len(a_tree.word_forms) <= max_sentence_length:
 42 |                     assert find_root is True
 43 |                     # keep the sentence with the length < max_sentence_l
 44 |                     self.trees.append(a_tree)
 45 |                 a_tree = SyntacticTree(sentence_id)
 46 |                 find_root = False
 47 |                 continue
 48 |             tokens = line.strip().split('\t')
 49 |             a_tree.word_forms.append(tokens[1])
 50 |             a_tree.pos_forms.append(tokens[3])
 51 |             # head = int(tokens[6]) if int(tokens[6]) > 0 else -1
 52 |             head = int(tokens[6]) - 1  # root's head is 0
 53 |             if head == -1:
 54 |                 assert tokens[7] == "root"
 55 |                 find_root = True
 56 |             a_tree.heads.append(head)
 57 |             a_tree.labels.append(tokens[7])
 58 |             token_9 = tokens[9]  # or tokens 9 will be 'unicode' type
 59 |             dep_prob = 1.0 if isinstance(token_9, str) else float(token_9)
 60 |             if dep_prob < prune_ratio:
 61 |                 a_tree.heads[-1] = -1
 62 |         print("Total {} conll trees, load {} conll syntactic trees.".format(sentence_id, len(self.trees)))
 63 | 
 64 |     @staticmethod
 65 |     def list_of_words_to_ids(list_of_words, dictionary, lowercase=False, pretrained_embeddings=None):
 66 |         ids = []
 67 |         for s in list_of_words:
 68 |             s = s
 69 |             if s is None:
 70 |                 ids.append(-1)
 71 |                 continue
 72 |             if lowercase:
 73 |                 s = s.lower()
 74 |             if (pretrained_embeddings is not None) and (s not in pretrained_embeddings):
 75 |                 s = UNKNOWN_TOKEN
 76 |             ids.append(dictionary.add(s))
 77 |         return ids
 78 | 
 79 |     def tokenize_dep_trees(self, word_dict, char_dict, syn_label_dict, pretrained_word_embedding=None):
 80 |         for tree in self.trees:
 81 |             tree.word_forms_ids = SyntacticCONLL.list_of_words_to_ids(tree.word_forms, word_dict, False,
 82 |                                                                       pretrained_word_embedding)
 83 |             words = tree.word_forms
 84 |             max_word_length = max([len(w) for w in words] + [3, 4, 5])  # compare with character cnn filter width
 85 |             single_sample_char_tokens = np.zeros([len(words), max_word_length], dtype=np.int64)
 86 |             for i, word in enumerate(words):
 87 |                 single_sample_char_tokens[i, :len(word)] = SyntacticCONLL.list_of_words_to_ids(word, char_dict)
 88 |             # Add the sample char tokens into the sample_char_tokens
 89 |             tree.char_ids = single_sample_char_tokens
 90 | 
 91 |             tree.labels_id = SyntacticCONLL.list_of_words_to_ids(tree.labels, syn_label_dict)
 92 | 
 93 |         sample_word_forms_ids = [tree.word_forms_ids for tree in self.trees]
 94 |         sample_char_ids = [tree.char_ids for tree in self.trees]
 95 |         sample_heads = [np.asarray(tree.heads) for tree in self.trees]
 96 |         sample_labels_ids = [np.asarray(tree.labels_id) for tree in self.trees]
 97 |         self.sample_dep_data = list(zip(sample_word_forms_ids, sample_char_ids, sample_heads, sample_labels_ids))
 98 | 
 99 |     def get_syntactic_label_dict(self, syn_label_dict=None):
100 |         if syn_label_dict is None:
101 |             syn_label_dict = Dictionary(padding_token=PADDING_TOKEN, unknown_token=UNKNOWN_TOKEN)
102 |         else:
103 |             assert syn_label_dict.accept_new is False
104 |         sentences_length = len(self.trees)
105 |         for i in range(sentences_length):
106 |             ith_sentence_length = len(self.trees[i].labels)
107 |             for j in range(ith_sentence_length):
108 |                 self.trees[i].labels_id.append(syn_label_dict.add(self.trees[i].labels[j]))
109 |         return syn_label_dict
110 | 
111 | 
def load_dependency_trees(file_path, word_dict, char_dict, syn_label_dict, word_embeddings):
    """Read and id-encode the dependency trees in `file_path`.

    Returns an OrderedDict that maps each sentence (its word forms joined by
    single spaces, without the artificial "Root") to its tree. When the same
    sentence occurs more than once, the last tree wins.
    """
    conll = SyntacticCONLL()
    conll.read_from_file(file_path, max_sentence_length=2000)
    conll.tokenize_dep_trees(word_dict, char_dict, syn_label_dict, word_embeddings)

    # word_forms[0] is the "Root" placeholder, so it is left out of the key.
    return OrderedDict(
        (' '.join(tree.word_forms[1:]), tree) for tree in conll.trees)
122 | 
123 | 
class SyntacticRepresentation(object):
    """Per-token vectors ("LSTM representations") loaded from a text file.

    Attributes:
      file_name: Path given to the most recent read_from_file call.
      representations: One list per sentence, each holding one float32 vector
        per line read for that sentence.
    """

    def __init__(self):
        self.file_name = ""
        self.representations = []

    def read_from_file(self, filename):
        """Append the sentences found in `filename` to self.representations.

        A blank line ends a sentence. On every other line the text after the
        first tab is parsed as space-separated floats.
        """
        self.file_name = filename
        print("Reading lstm representations from {}".format(self.file_name))
        each_sentence_representations = []
        # `with` closes the handle even when a line fails to parse; a missing
        # file raises from open() itself.
        with open(self.file_name, 'r') as representation_file:
            for line in representation_file:
                if line == '\n' or line == "\r\n":  # new sentence
                    self.representations.append(each_sentence_representations)
                    each_sentence_representations = []
                    continue
                values = line.strip().split('\t')[1].split(' ')
                each_sentence_representations.append(np.asarray(values, dtype=np.float32))
        # Keep the last sentence when the file lacks a trailing blank line.
        if each_sentence_representations:
            self.representations.append(each_sentence_representations)
        print("Load LSTM representations done, total {} sentences' representations".format(len(self.representations)))

    def minus_by_the_predicate(self, corpus_tensors):
        """Replace each vector by (predicate vector - vector), per sentence.

        For every sample, data[0][0][0] is read as the sentence id and the
        argmax of data[0][2] as the predicate position. Each sentence is
        transformed once, using the first sample that mentions it; index 0
        (Root) is left untouched.
        """
        # NOTE(review): the predicate position is used directly as an index
        # into the representations, which carry an extra Root entry at 0 --
        # confirm that the predicate mask uses the same indexing.
        processed_sentence_ids = set()
        for data in corpus_tensors:
            sentence_id = data[0][0][0]
            if sentence_id in processed_sentence_ids:
                continue
            processed_sentence_ids.add(sentence_id)

            predicate_id = data[0][2].argmax()
            sentence_reps = self.representations[sentence_id]
            # Hold on to the original predicate vector: its own slot is
            # overwritten (with zeros) during the loop, and tokens after it
            # must still be subtracted from the original.
            predicate_rep = sentence_reps[predicate_id]
            for j in range(1, len(sentence_reps)):  # Root doesn't use.
                sentence_reps[j] = predicate_rep - sentence_reps[j]

    def check_math_corpus(self, lengths):
        """Verify that sentence i has lengths[i] + 1 vectors (tokens + Root).

        Prints a diagnostic and raises SystemExit on the first mismatch.
        """
        for i, length in enumerate(lengths):
            if len(self.representations[i]) != length + 1:  # 1 means the first one, Root. Actually never use it.
                print(i, length, len(self.representations[i]))
                print("sentence {} doesn't match: lstm representation {} vs corpus {}".format(
                    i, len(self.representations[i]), length))
                raise SystemExit(1)
        print("LSTM representation match the corpus!")
170 | 


--------------------------------------------------------------------------------