├── .gitignore ├── NeuralRST ├── __init__.py ├── in_out │ ├── __init__.py │ ├── instance.py │ ├── node.py │ ├── preprocess.py │ ├── reader.py │ ├── rst_feature.py │ ├── tree.py │ └── util.py ├── models │ ├── __init__.py │ ├── alphabet.py │ ├── architecture.py │ ├── config.py │ ├── explorer.py │ ├── metric.py │ └── vocab.py ├── modules │ ├── __init__.py │ ├── embedding.py │ ├── function_variational_rnn.py │ ├── layer.py │ └── variational_rnn.py ├── requirements.txt ├── run_rst_parser.py ├── train_rst_parser.py └── transition │ ├── __init__.py │ ├── action.py │ ├── atom_feature.py │ └── state.py ├── README.md ├── biaffine_model.py ├── corenlp.py ├── extract_latent_feature.py ├── extract_shallow_feature.py ├── extract_tree.py ├── models ├── biaffine │ ├── alphabets │ │ ├── character.json │ │ ├── pos.json │ │ ├── type.json │ │ └── word.json │ ├── network.pt │ └── network.pt.arg.json └── rst │ ├── alphabets │ ├── action_label_alpha.json │ ├── etype_alpha.json │ ├── gold_action_alpha.json │ ├── tag_alpha.json │ └── word_alpha.json │ ├── config.cfg │ └── network.pt ├── neuronlp2 ├── __init__.py ├── biaffine_model.py ├── io │ ├── __init__.py │ ├── alphabet.py │ ├── conll03_data.py │ ├── conllx_data.py │ ├── conllx_stacked_data.py │ ├── instance.py │ ├── logger.py │ ├── reader.py │ ├── utils.py │ └── writer.py ├── models │ ├── __init__.py │ ├── parsing.py │ └── sequence_labeling.py ├── nlinalg │ ├── __init__.py │ └── nlinalg.py ├── nn │ ├── __init__.py │ ├── _functions │ │ ├── __init__.py │ │ ├── masked_rnn.py │ │ ├── skipconnect_rnn.py │ │ └── variational_rnn.py │ ├── init.py │ ├── modules │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── crf.py │ │ ├── linear.py │ │ ├── masked_rnn.py │ │ ├── skipconnect_rnn.py │ │ ├── sparse.py │ │ └── variational_rnn.py │ └── utils.py ├── tasks │ ├── __init__.py │ └── parser.py └── utils.py ├── requirements.txt ├── rst_model.py └── sentence.py /.gitignore: -------------------------------------------------------------------------------- 1 
| # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /NeuralRST/__init__.py: -------------------------------------------------------------------------------- 
# https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/__init__.py
# --------------------------------------------------------------------------------
# /NeuralRST/in_out/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/in_out/__init__.py
# --------------------------------------------------------------------------------
# /NeuralRST/in_out/instance.py:
# --------------------------------------------------------------------------------
import numpy as np


# representing one document / one set
class Instance(object):
    """One document: its words, tags, EDUs, gold actions and gold result."""

    def __init__(self, total_words, total_tags, edus, gold_actions, result):
        self.total_words = total_words
        self.total_tags = total_tags
        self.edus = edus
        self.gold_actions = gold_actions
        self.result = result

    def evaluate(self, other_result, span, nuclear, relation, full):
        """Score ``other_result`` against this instance's gold ``result``.

        Each of the four metric objects is updated in place:
        ``overall_label_count`` grows by the number of gold subtrees,
        ``predicated_label_count`` by the number of predicted subtrees, and
        ``correct_label_count`` by the number of predicted subtrees that
        match at least one gold subtree under the corresponding criterion
        (``span_equal``, ``nuclear_equal``, ``relation_equal``,
        ``full_equal``).

        :param other_result: object exposing a ``subtrees`` list (prediction).
        :param span: metric updated with span-only matches.
        :param nuclear: metric updated with span + nuclearity matches.
        :param relation: metric updated with span + relation matches.
        :param full: metric updated with span + relation + nuclearity matches.
        :return: the tuple ``(span, nuclear, relation, full)``.
        """
        gold_subtrees = self.result.subtrees
        predicted_subtrees = other_result.subtrees
        criteria = (
            (span, 'span_equal'),
            (nuclear, 'nuclear_equal'),
            (relation, 'relation_equal'),
            (full, 'full_equal'),
        )
        for metric, comparator in criteria:
            metric.overall_label_count += len(gold_subtrees)
            metric.predicated_label_count += len(predicted_subtrees)
            # A predicted subtree counts at most once, however many gold
            # subtrees it matches.
            metric.correct_label_count += sum(
                1 for candidate in predicted_subtrees
                if any(getattr(candidate, comparator)(reference)
                       for reference in gold_subtrees)
            )
        return span, nuclear, relation, full


# representing 1 EDU
class EDU(object):
    """A single elementary discourse unit."""

    def __init__(self, start_index, end_index):
        self.start_index = start_index  # int
        self.end_index = end_index  # int
        self.etype = ''  # string
        self.words = []  # list of word (string)
        self.tags = []  # list of tag (string)
        self.syntax_features = []


# nuclear will be: NUCLEAR, SATELLITE, span
class SubTree(object):
    """A labelled span of EDUs: nuclearity, relation and EDU boundaries."""

    NUCLEAR = 'NUCLEAR'
    SATELLITE = 'SATELLITE'
    SPAN = 'span'

    def __init__(self):
        self.nuclear = ''
        self.relation = ''
        self.edu_start = -1
        self.edu_end = -1

    def clear(self):
        """Reset all fields to their initial (empty) values."""
        self.nuclear = ''
        self.relation = ''
        self.edu_start = -1
        self.edu_end = -1

    def span_equal(self, tree):
        """Return True when both subtrees cover the same EDU range."""
        return self.edu_start == tree.edu_start and self.edu_end == tree.edu_end

    def nuclear_equal(self, tree):
        """Return True when span and nuclearity both agree."""
        return self.span_equal(tree) and self.nuclear == tree.nuclear

    def relation_equal(self, tree):
        """Return True when span and relation both agree."""
        return self.span_equal(tree) and self.relation == tree.relation

    def full_equal(self, tree):
        """Return True when span, relation and nuclearity all agree.

        The nuclearity labels are compared with ``==``; testing them for
        truthiness would accept any two non-empty labels as a match.
        """
        return (self.span_equal(tree)
                and self.relation == tree.relation
                and self.nuclear == tree.nuclear)

    def get_str(self):
        """Return ``'<nuclear> <relation> edu(<start>-<end>)'``."""
        return self.nuclear + ' ' + self.relation + ' edu(' + str(self.edu_start) + '-' + str(self.edu_end) + ')'


class CResult(object):
    """A flat list of ``SubTree`` objects describing one parse."""

    def __init__(self):
        self.subtrees = []

    def clear(self):
        """Drop all stored subtrees."""
        self.subtrees = []

    def save(self, file_path):
        """Write the subtree list to ``file_path`` with ``numpy.save``."""
        np.save(file_path, np.array(self.subtrees))

    def obtain_tree(self):
        """Build a binary ``Node`` tree from consecutive subtree pairs.

        ``self.subtrees`` is read two entries at a time (left child, right
        child); each pair becomes one ``Node`` spanning from the left entry's
        ``edu_start`` to the right entry's ``edu_end``.

        :return: the node registered under EDU index 0 (presumably the root,
            given the order in which pairs are stored -- confirm against the
            producer of ``subtrees``), or ``None`` when there are no subtrees.
        :raises AssertionError: if the number of subtrees is odd.
        """
        subtrees = self.subtrees
        assert len(subtrees) % 2 == 0
        if not subtrees:
            return None

        # Imported here rather than at module level so the plain data classes
        # in this module stay importable on their own.
        from NeuralRST.in_out.node import Node

        # Maps a boundary EDU index (first or last EDU of a span) to the most
        # recently built node whose span starts or ends there.
        p_subtree = {}
        for idx in range(0, len(subtrees), 2):
            left_part = subtrees[idx]
            right_part = subtrees[idx + 1]
            edu_span = (left_part.edu_start, right_part.edu_end)
            nuclear = left_part.nuclear + " " + right_part.nuclear
            relation = left_part.relation
            if 'span' == relation:
                relation = right_part.relation
            tree = Node(edu_span, nuclear, relation)

            # set child:
            left_child = p_subtree.get(edu_span[0])
            if left_child is not None:
                tree.left = left_child
                left_child.parent = tree
            elif left_part.edu_start == left_part.edu_end:
                leaf = Node((left_part.edu_start, left_part.edu_end), '', '')
                tree.left = leaf
                leaf.parent = tree

            right_child = p_subtree.get(edu_span[1])
            if right_child is not None:
                tree.right = right_child
                right_child.parent = tree
            elif right_part.edu_start == right_part.edu_end:
                leaf = Node((right_part.edu_start, right_part.edu_end), '', '')
                tree.right = leaf
                leaf.parent = tree

            p_subtree[edu_span[0]] = tree
            p_subtree[edu_span[1]] = tree
        return p_subtree[0]


# representing ONE word
class SynFeat(object):
    """Syntax features attached to a single word."""

    def __init__(self, arc_dep, arc_head, rel_dep, rel_head):
        self.arc_dep = arc_dep
        self.arc_head = arc_head
        self.rel_dep = rel_dep
        self.rel_head = rel_head
        # self.lstm_out1 = lstm_out1
        # self.lstm_out2 = lstm_out2

    def concat(self):
        """Return ``arc_dep + rel_dep + arc_head + rel_head`` (in that order)."""
        return self.arc_dep + self.rel_dep + self.arc_head + self.rel_head

# --------------------------------------------------------------------------------
# /NeuralRST/in_out/node.py:
# --------------------------------------------------------------------------------
import numpy as np


class Node(object):
    """Binary discourse-tree node with links to its children and parent."""

    def __init__(self, edu_span, nuclear, relation):
        self.edu_span = edu_span
        self.nuclear = nuclear
        self.relation = relation
        self.left = None
        self.right = None
        self.parent = None

    def str(self):
        """Return ``'<nuclear> <relation>'``."""
        return self.nuclear + ' ' + self.relation


# --------------------------------------------------------------------------------
# /NeuralRST/in_out/preprocess.py:
# --------------------------------------------------------------------------------
import os
import numpy as np
import torch

from NeuralRST.models.metric import Metric
from NeuralRST.models.alphabet import Alphabet
from NeuralRST.in_out.util import lower_with_digit_transform
from NeuralRST.transition.state import CState
from torch.autograd import Variable


def construct_embedding_table(alpha, hidden_size, freeze, pretrained_embed=None):
    """Build the initial embedding matrix for an alphabet.

    :param alpha: alphabet exposing ``size()`` and an ``alpha2id`` dict, or
        ``None``.
    :param hidden_size: width of each embedding row.
    :param freeze: when true, entries missing from ``pretrained_embed`` are
        zero-initialised instead of randomly initialised.
    :param pretrained_embed: optional mapping from entry to vector; looked up
        by the exact entry first, then by its lower-cased form.
    :return: a float32 tensor of shape ``(alpha.size(), hidden_size)``, or
        ``None`` when ``alpha`` is ``None``.
    """
    if alpha is None:
        return None
    scale = np.sqrt(6.0 / (alpha.size() + hidden_size))
    table = np.empty([alpha.size(), hidden_size], dtype=np.float32)

    def random_row():
        return np.random.uniform(-scale, scale, [1, hidden_size]).astype(np.float32)

    for word, index in alpha.alpha2id.items():
        if pretrained_embed is None:
            embedding = random_row()
        elif word in pretrained_embed:
            embedding = pretrained_embed[word]
        elif word.lower() in pretrained_embed:
            embedding = pretrained_embed[word.lower()]
        elif freeze:
            embedding = np.zeros([1, hidden_size]).astype(np.float32)
        else:
            embedding = random_row()
        table[index, :] = embedding
    return torch.from_numpy(table)


def create_alphabet(instances, alphabet_directory, logger):
    """Create the five alphabets, or load them if the directory exists.

    When ``alphabet_directory`` does not exist, frequency tables are collected
    from ``instances`` and the resulting alphabets are saved there. Otherwise
    the alphabets are loaded from that directory and ``instances`` is not
    read.

    :return: ``(word_alpha, tag_alpha, gold_action_alpha, action_label_alpha,
        etype_alpha)``.
    """
    word_stat = {}
    tag_stat = {}
    gold_action_stat = {}
    action_label_stat = {}
    etype_stat = {}

    if not os.path.isdir(alphabet_directory):
        print("Creating Alphabets")
        for instance in instances:
            for i, raw_word in enumerate(instance.total_words):
                word = lower_with_digit_transform(raw_word.strip())
                tag = instance.total_tags[i]
                word_stat[word] = word_stat.get(word, 0) + 1
                tag_stat[tag] = tag_stat.get(tag, 0) + 1

            # NOTE(review): nesting reconstructed from a whitespace-collapsed
            # dump -- labels are counted for non-shift/non-finish actions only,
            # action strings for every gold action. Confirm against the repo.
            for action in instance.gold_actions:
                if not action.is_shift() and not action.is_finish():
                    action_label_stat[action.label] = action_label_stat.get(action.label, 0) + 1
                action_str = action.get_str()
                gold_action_stat[action_str] = gold_action_stat.get(action_str, 0) + 1

            for edu in instance.edus:
                etype_stat[edu.etype] = etype_stat.get(edu.etype, 0) + 1

        word_alpha = Alphabet(word_stat, 'word_alpha')
        tag_alpha = Alphabet(tag_stat, 'tag_alpha')
        gold_action_alpha = Alphabet(gold_action_stat, 'gold_action_alpha', for_label_index=True)
        action_label_alpha = Alphabet(action_label_stat, 'action_label_alpha', for_label_index=True)
        etype_alpha = Alphabet(etype_stat, 'etype_alpha')

        word_alpha.save(alphabet_directory)
        tag_alpha.save(alphabet_directory)
        gold_action_alpha.save(alphabet_directory)
        action_label_alpha.save(alphabet_directory)
        etype_alpha.save(alphabet_directory)
    else:
        print("Loading Alphabets")
        # The (empty) frequency tables only seed the objects; the real
        # content comes from load().
        word_alpha = Alphabet(word_stat, 'word_alpha')
        tag_alpha = Alphabet(tag_stat, 'tag_alpha')
        gold_action_alpha = Alphabet(gold_action_stat, 'gold_action_alpha')
        action_label_alpha = Alphabet(action_label_stat, 'action_label_alpha')
        etype_alpha = Alphabet(etype_stat, 'etype_alpha')

        word_alpha.load(alphabet_directory)
        tag_alpha.load(alphabet_directory)
        gold_action_alpha.load(alphabet_directory, for_label_index=True)
        action_label_alpha.load(alphabet_directory, for_label_index=True)
        etype_alpha.load(alphabet_directory)

    logger.info("Word alphabet size: " + str(word_alpha.size()))
    logger.info("Tag alphabet size: " + str(tag_alpha.size()))
    logger.info("Gold action alphabet size: " + str(gold_action_alpha.size()))
    logger.info("Action Label alphabet size: " + str(action_label_alpha.size()))
    logger.info("Etype alphabet size: " + str(etype_alpha.size()))
    return word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha


def validate_gold_actions(instances, maxStateSize):
    """Count shift/reduce gold actions and print the totals.

    :raises Exception: if a reduce action has a nuclearity other than
        ``'NN'``, ``'NS'`` or ``'SN'``.
    :raises AssertionError: if a reduce action has ``label_id == -1``.
    """
    shift_num = 0
    reduce_nn_num = 0
    reduce_ns_num = 0
    reduce_sn_num = 0
    # Only referenced by the disabled state-replay check at the end.
    span = Metric()
    nuclear = Metric()
    relation = Metric()
    full = Metric()

    for inst in instances:
        for ac in inst.gold_actions:
            if ac.is_shift():
                shift_num += 1
            if ac.is_reduce():
                if ac.nuclear == 'NN':
                    reduce_nn_num += 1
                elif ac.nuclear == 'NS':
                    reduce_ns_num += 1
                elif ac.nuclear == 'SN':
                    reduce_sn_num += 1
                else:
                    raise Exception('Reduce error, this must have nuclearity')
                # something is here
                # NOTE(review): nesting reconstructed from a whitespace-
                # collapsed dump; the check is applied to reduce actions only.
                assert(ac.label_id != -1)

    print("Reduce NN: " + str(reduce_nn_num))
    print("Reduce NS: " + str(reduce_ns_num))
    print("Reduce SN: " + str(reduce_sn_num))
    print("Shift: " + str(shift_num))

    print("Checking the gold Actions, it will be interrupted if there is error assertion")
    # all_states = [CState() for i in range(maxStateSize)]
    # for inst in instances:
    #     step = 0
    #     gold_actions = inst.gold_actions
    #     action_size = len(gold_actions)
    #     all_states[0].ready(inst)
    #     while(not all_states[step].is_end()):
    #         assert(step < action_size)
    #         all_states[step+1] = all_states[step].move(all_states[step+1], gold_actions[step])
    #         step += 1
    #     assert(step == action_size)
    #     result = all_states[step].get_result()
    #     span, nuclear, relation, full = inst.evaluate(result, span, nuclear, relation, full)
    #     if not span.bIdentical() or not nuclear.bIdentical() or not relation.bIdentical() or not full.bIdentical():
    #         raise Exception('Error state conversion!! ')


def get_max_parameter(instances):
    """Return ``(max_edu_size, max_sent_size, max_state_size)``.

    These are, over all instances, the largest number of EDUs per instance,
    the largest number of words in any EDU, and the largest number of gold
    actions per instance. All three are 0 for an empty input.
    """
    max_edu_size = 0
    max_sent_size = 0
    max_state_size = 0

    for instance in instances:
        max_state_size = max(max_state_size, len(instance.gold_actions))
        max_edu_size = max(max_edu_size, len(instance.edus))
        for edu in instance.edus:
            max_sent_size = max(max_sent_size, len(edu.words))
    return max_edu_size, max_sent_size, max_state_size


def batch_data_variable(data, indices, vocab, config, is_training=True):
    """Turn the selected instances into padded batch tensors.

    :param data: indexable collection of instances; ``data[indices]`` must
        accept a list of indices.
    :param indices: object with ``tolist()`` giving the batch members.
    :param vocab: exposes ``word_alpha``, ``tag_alpha``, ``etype_alpha`` and
        ``gold_action_alpha``.
    :param config: reads ``syntax_dim``, ``max_state_size`` and ``use_gpu``.
    :param is_training: when true, gold action ids are filled in; otherwise
        the gold action tensor holds only the padding id.
    :return: ``(edu_words, edu_tags, edu_types, edu_mask, word_mask,
        gold_action_var, len_edus, word_denominator, edu_syntax)``.
        ``len_edus`` is a numpy int64 array; the others are Variables, moved
        to the GPU when ``config.use_gpu`` is set.
    """
    batch_size = len(indices)
    indices = indices.tolist()
    batch = data[indices]
    max_edu_len = -1
    max_edu_num = -1
    for instance in batch:
        edu_num = len(instance.edus)
        if edu_num > max_edu_num:
            max_edu_num = edu_num
        for edu in instance.edus:
            edu_len = len(edu.words)
            if edu_len > max_edu_len:
                max_edu_len = edu_len

    edu_words = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_types = Variable(torch.LongTensor(batch_size, max_edu_num).zero_(), requires_grad=False)
    edu_syntax = np.zeros([batch_size, max_edu_num, max_edu_len, config.syntax_dim], dtype=np.float32)
    word_mask = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_tags = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_mask = Variable(torch.Tensor(batch_size, max_edu_num).zero_(), requires_grad=False)
    # Padded EDU slots keep the value -1.
    word_denominator = Variable(torch.ones(batch_size, max_edu_num).type(torch.FloatTensor) * -1, requires_grad=False)
    # Every slot starts as the padding id, i.e. gold_action_alpha.size().
    gold_action_var = np.ones([batch_size, config.max_state_size], dtype=np.int64) * (vocab.gold_action_alpha.size())
    len_edus = np.zeros([batch_size], dtype=np.int64)

    for idx in range(batch_size):
        instance = batch[idx]
        edus = instance.edus
        len_edus[idx] = len(edus)
        for idy, edu in enumerate(edus):
            edu_mask[idx, idy] = 1
            edu_types[idx, idy] = vocab.etype_alpha.word2id(edu.etype)
            edu_len = len(edu.words)
            word_denominator[idx, idy] = edu_len
            for idz in range(edu_len):
                edu_words[idx, idy, idz] = vocab.word_alpha.word2id(edu.words[idz])
                edu_tags[idx, idy, idz] = vocab.tag_alpha.word2id(edu.tags[idz])
                edu_syntax[idx, idy, idz] = edu.syntax_features[idz]
                word_mask[idx, idy, idz] = 1

        if is_training:
            gold_actions = instance.gold_actions
            # Actions beyond config.max_state_size are dropped.
            max_gold = min(len(gold_actions), config.max_state_size)
            for idy in range(max_gold):
                gold_action_var[idx][idy] = vocab.gold_action_alpha.word2id(gold_actions[idy].get_str())

    # `volatile=False` was the default and is a removed no-op in newer
    # PyTorch, so it is not passed.
    gold_action_var = Variable(torch.from_numpy(gold_action_var), requires_grad=False)
    edu_syntax = Variable(torch.from_numpy(edu_syntax), requires_grad=False)
    if config.use_gpu:
        edu_words = edu_words.cuda()
        edu_tags = edu_tags.cuda()
        edu_types = edu_types.cuda()
        edu_mask = edu_mask.cuda()
        word_mask = word_mask.cuda()
        gold_action_var = gold_action_var.cuda()
        word_denominator = word_denominator.cuda()
        edu_syntax = edu_syntax.cuda()

    return edu_words, edu_tags, edu_types, edu_mask, word_mask, gold_action_var, len_edus, word_denominator, edu_syntax


# --------------------------------------------------------------------------------
# /NeuralRST/in_out/rst_feature.py:
# --------------------------------------------------------------------------------
import numpy as np


class RSTFeature(object):
    """Per-EDU heuristic features derived from a binary discourse tree."""

    def __init__(self):
        # EDU index -> [nuclearity score]
        self.ns_score = {}
        # EDU index -> [own nuclearity flag, sibling nuclearity flag, relation scores...]
        self.rel_type_score = {}
        # self.relations = ['attribution', 'background', 'cause', 'comparison', 'condition',
        #         'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint',
        #         'mannermeans', 'summary', 'temporal', 'topic', 'sameunit', 'textualorganization', 'list']
        self.relations = ["purp", "cont", "attr", "evid", "comp", "list", "back", "same", "topic",
                          "mann", "summ", "cond", "temp", "eval", "text", "cause", "prob", "elab"]

    def depth(self, node):
        """Return the number of nodes on the longest root-to-leaf path."""
        left_depth = self.depth(node.left) if node.left else 0
        right_depth = self.depth(node.right) if node.right else 0
        return max(left_depth, right_depth) + 1

    def get_max_edu(self, node):
        """Return the last EDU index of the right-most node under ``node``."""
        while node.right is not None:
            node = node.right
        return node.edu_span[1]

    # author: Fajri Koto
    # 6 May 2019
    def generate_heuristic_feature(self, node):
        """Compute one feature row per EDU of the tree rooted at ``node``.

        Each row is the nuclearity score followed by the two nuclearity
        flags and one score per entry of ``self.relations``. The per-EDU
        values are also left in ``self.ns_score`` and
        ``self.rel_type_score``, which are reset on every call so that a
        reused instance never mixes two trees.

        :param node: root node, or ``None``.
        :return: a float32 array with one row per EDU; for ``None`` a single
            all-zero row of the same width.
        :raises AssertionError: if the number of scored leaves differs from
            the right-most EDU index + 1.
        """
        if node is None:
            print('WARNING: There is a None tree')
            return np.array([[0] * (3 + len(self.relations))])

        # Initialization: forget whatever a previous tree left behind.
        self.ns_score = {}
        self.rel_type_score = {}
        depth = self.depth(node)
        relation_score = {relation: 0 for relation in self.relations}
        # 1 + 2 + ... + depth
        max_score = depth * (depth + 1) // 2

        # Compute! Output is stored in self.ns_score and self.rel_type_score
        self.compute_ns_score(node, depth, depth)
        self.compute_relation_score(node, max_score, depth, relation_score)

        # Store output
        max_edu = self.get_max_edu(node)
        assert max_edu + 1 == len(self.ns_score)
        vectors = [self.ns_score[id_edu] + self.rel_type_score[id_edu]
                   for id_edu in range(max_edu + 1)]
        return np.array(vectors, np.float32)

    # author: Fajri Koto
    # 6 May 2019
    def compute_ns_score(self, node, height, n_score):
        """Store ``n_score / height`` for every leaf under ``node``.

        ``n_score`` is decremented by one each time the walk descends into a
        child labelled ``'SATELLITE'``.
        """
        if node.left is None and node.right is None:
            assert node.edu_span[0] == node.edu_span[1]
            self.ns_score[node.edu_span[0]] = [1.0 * n_score / height]
            return
        n1, n2 = node.nuclear.split(' ')
        left_score = n_score - 1 if n1 == 'SATELLITE' else n_score
        right_score = n_score - 1 if n2 == 'SATELLITE' else n_score
        self.compute_ns_score(node.left, height, left_score)
        self.compute_ns_score(node.right, height, right_score)

    # author Fajri Koto
    # 6 May 2019
    def compute_relation_score(self, node, max_score, depth, relation_score):
        """Store nuclearity flags and relation scores for every leaf.

        On the way down, ``depth`` is added to the score of each visited
        node's relation when that relation is listed in ``self.relations``;
        each child receives its own copy of the running scores. At a leaf the
        stored row is ``[own flag, sibling flag]`` (1 for ``'NUCLEAR'``, else
        0; ``[1, 1]`` for a parentless leaf) followed by every relation score
        divided by ``max_score``.
        """
        if node.relation != '':
            if node.relation in relation_score:
                relation_score[node.relation] += depth

        if node.left is None and node.right is None:
            assert node.edu_span[0] == node.edu_span[1]
            result = []

            # find if you are left or right
            if node.parent is not None:
                n1, n2 = node.parent.nuclear.split(' ')
                n1_v = 1 if n1 == 'NUCLEAR' else 0
                n2_v = 1 if n2 == 'NUCLEAR' else 0

                if node.parent.left == node:
                    result.append(n1_v)
                    result.append(n2_v)
                else:
                    assert node.parent.right == node
                    result.append(n2_v)
                    result.append(n1_v)
            else:
                result.append(1)
                result.append(1)

            # Score of relations
            for relation in self.relations:
                result.append(1.0 * relation_score[relation] / max_score)
            self.rel_type_score[node.edu_span[0]] = result
            return

        self.compute_relation_score(node.left, max_score, depth - 1, relation_score.copy())
        self.compute_relation_score(node.right, max_score, depth - 1, relation_score.copy())


# --------------------------------------------------------------------------------
# /NeuralRST/in_out/tree.py:
# --------------------------------------------------------------------------------
class Tree(object):
    """Binary discourse-tree node without a parent link."""

    def __init__(self, edu_span, nuclear, relation):
        self.edu_span = edu_span
        self.nuclear = nuclear
        self.relation = relation
        self.left = None
        self.right = None

    def str(self):
        """Return ``'<nuclear> <relation>'``."""
        return self.nuclear + ' ' + self.relation

    def get_id(self, vocab):
        """Return this node's id in ``vocab.relation_alpha``.

        The lookup key is ``'REDUCE_'`` + the first letter of each of the two
        nuclearity labels + ``'_'`` + the relation.
        """
        tmp = self.nuclear.split(' ')
        action_str = "REDUCE_" + tmp[0][0] + tmp[1][0] + '_' + self.relation
        return vocab.relation_alpha.word2id(action_str)

    def get_nodes(self, nodes, vocab):
        """Count ``(parent id, child id)`` pairs over this subtree.

        :param nodes: dict of pair -> count, updated in place.
        :return: the same ``nodes`` dict.
        """
        cur_id = self.get_id(vocab)
        for child in (self.left, self.right):
            if child is not None:
                key = (cur_id, child.get_id(vocab))
                nodes[key] = nodes.get(key, 0) + 1
                nodes = child.get_nodes(nodes, vocab)
        return nodes


# --------------------------------------------------------------------------------
# /NeuralRST/in_out/util.py:
# --------------------------------------------------------------------------------
import pickle
import numpy as np
import gzip
import re
import logging
import sys
from datetime import date
from gensim.models.word2vec import Word2Vec

MAX_CHAR_LENGTH = 45
NUM_CHAR_PAD = 2

# Regular expressions used to
normalize digits. 14 | DIGIT_RE = re.compile(br"\d") 15 | DIGIT_RE2 = re.compile(r"\d") 16 | 17 | 18 | def lower_with_digit_transform(string): 19 | return DIGIT_RE2.sub("0", string.lower()) 20 | 21 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True): 22 | """ 23 | load word embeddings from file 24 | :param embedding: 25 | :param embedding_path: 26 | :return: embedding dict, embedding dimention, caseless 27 | """ 28 | print("Loading embedding: %s from %s" % (embedding, embedding_path)) 29 | if embedding == 'word2vec': 30 | # loading word2vec 31 | word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True) 32 | embedd_dim = word2vec.vector_size 33 | return word2vec, embedd_dim 34 | elif embedding == 'glove': 35 | # loading GloVe 36 | embedd_dim = -1 37 | embedd_dict = dict() 38 | with gzip.open(embedding_path, 'r') as file: 39 | for line in file: 40 | line = line.strip() 41 | line = line.decode('utf-8') 42 | if len(line) == 0: 43 | continue 44 | 45 | tokens = line.split() 46 | if embedd_dim < 0: 47 | embedd_dim = len(tokens) - 1 48 | else: 49 | assert (embedd_dim + 1 == len(tokens)) 50 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 51 | embedd[:] = tokens[1:] 52 | word = DIGIT_RE2.sub("0", tokens[0]) if normalize_digits else tokens[0] 53 | embedd_dict[word] = embedd 54 | return embedd_dict, embedd_dim 55 | elif embedding == 'senna': 56 | # loading Senna 57 | embedd_dim = -1 58 | embedd_dict = dict() 59 | with gzip.open(embedding_path, 'r') as file: 60 | for line in file: 61 | line = line.strip() 62 | line = line.decode('utf-8') 63 | if len(line) == 0: 64 | continue 65 | 66 | tokens = line.split() 67 | if embedd_dim < 0: 68 | embedd_dim = len(tokens) - 1 69 | else: 70 | assert (embedd_dim + 1 == len(tokens)) 71 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 72 | embedd[:] = tokens[1:] 73 | word = DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 74 | embedd_dict[word] = embedd 75 | return embedd_dict, 
embedd_dim 76 | elif embedding == 'sskip': 77 | embedd_dim = -1 78 | embedd_dict = dict() 79 | with gzip.open(embedding_path, 'r') as file: 80 | # skip the first line 81 | file.readline() 82 | for line in file: 83 | line = line.strip() 84 | try: 85 | line = line.decode('utf-8') 86 | if len(line) == 0: 87 | continue 88 | 89 | tokens = line.split() 90 | if len(tokens) < embedd_dim: 91 | continue 92 | 93 | if embedd_dim < 0: 94 | embedd_dim = len(tokens) - 1 95 | 96 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 97 | start = len(tokens) - embedd_dim 98 | word = ' '.join(tokens[0:start]) 99 | embedd[:] = tokens[start:] 100 | word = DIGIT_RE.sub(b"0", word) if normalize_digits else word 101 | embedd_dict[word] = embedd 102 | except UnicodeDecodeError: 103 | continue 104 | return embedd_dict, embedd_dim 105 | elif embedding == 'polyglot': 106 | words, embeddings = pickle.load(open(embedding_path, 'rb')) 107 | _, embedd_dim = embeddings.shape 108 | embedd_dict = dict() 109 | for i, word in enumerate(words): 110 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 111 | embedd[:] = embeddings[i, :] 112 | word = DIGIT_RE.sub(b"0", word) if normalize_digits else word 113 | embedd_dict[word] = embedd 114 | return embedd_dict, embedd_dim 115 | 116 | else: 117 | raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]") 118 | 119 | 120 | def get_logger(name, is_dynamic, model_path, level=logging.INFO, handler=sys.stdout, 121 | formatter='%(asctime)s - %(name)s - %(message)s'): 122 | logger = logging.getLogger(name) 123 | logger.setLevel(logging.INFO) 124 | formatter = logging.Formatter(formatter) 125 | stream_handler = logging.StreamHandler(handler) 126 | stream_handler.setLevel(level) 127 | stream_handler.setFormatter(formatter) 128 | logger.addHandler(stream_handler) 129 | today = date.today().isoformat() 130 | if is_dynamic: 131 | hdlr = logging.FileHandler(model_path+'/log_dynamic_'+str(today)+'.txt') 132 | else: 133 | hdlr = 
logging.FileHandler(model_path+'/log_static_'+str(today)+'.txt') 134 | hdlr.setFormatter(formatter) 135 | logger.addHandler(hdlr) 136 | return logger 137 | 138 | -------------------------------------------------------------------------------- /NeuralRST/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/models/__init__.py -------------------------------------------------------------------------------- /NeuralRST/models/alphabet.py: -------------------------------------------------------------------------------- 1 | import json, os 2 | 3 | UNK_ID = 0 4 | 5 | class Alphabet(object): 6 | def __init__(self, dictionary, name, for_label_index = False): 7 | self.alpha2id = {} 8 | self.id2alpha = {} 9 | self.name = name 10 | self.for_label_index = for_label_index 11 | self.alphas = list(dictionary.keys()) 12 | 13 | ids = 0 14 | if not for_label_index: # for non label 15 | self.alpha2id ['UNK'] = 0 16 | self.id2alpha [0] = 'UNK' 17 | ids += 1 18 | 19 | for alpha in self.alphas: 20 | self.alpha2id[alpha] = ids 21 | self.id2alpha[ids] = alpha 22 | ids += 1 23 | 24 | # add PAD for PADDING, it is used for label / action 25 | if for_label_index: 26 | self.alpha2id ['PAD'] = ids 27 | self.id2alpha [ids] = 'PAD' 28 | self.alphas += ['PAD'] 29 | 30 | # add 'UNK' for non label index alphabet 31 | if not for_label_index: 32 | self.alphas += ['UNK'] 33 | 34 | def get_content(self): 35 | return {'alpha2id': self.alpha2id, 'id2alpha': self.id2alpha, 'alphas': self.alphas} 36 | 37 | def word2id(self, word): 38 | if not self.for_label_index: 39 | return self.alpha2id.get(word, UNK_ID) 40 | else: 41 | return self.alpha2id.get(word, self.alpha2id['PAD']) 42 | 43 | def id2word(self, int_id): 44 | if not self.for_label_index: 45 | return self.id2alpha.get(int_id, 'UNK') 46 | else: 47 | return self.id2alpha.get(int_id, 'PAD') 48 | 49 | def 
__from_json(self, data): 50 | self.alphas = data["alphas"] 51 | self.alpha2id = data['alpha2id'] 52 | for index, word in data['id2alpha'].items(): 53 | self.id2alpha[int(index)] = word 54 | 55 | def size(self): 56 | if self.for_label_index: 57 | return len(self.alphas) - 1 58 | return len(self.alphas) 59 | 60 | def save(self, output_directory): 61 | try: 62 | if not os.path.exists(output_directory): 63 | os.makedirs(output_directory) 64 | json.dump(self.get_content(), 65 | open(os.path.join(output_directory, self.name + ".json"), "w"), indent=4) 66 | 67 | except Exception as e: 68 | self.logger.warn("Alphabet is not saved: %s" % repr(e)) 69 | 70 | def load(self, input_directory, for_label_index=False): 71 | self.__from_json(json.load(open(os.path.join(input_directory, self.name + ".json")))) 72 | self.for_label_index = for_label_index 73 | -------------------------------------------------------------------------------- /NeuralRST/models/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | 4 | class Config(object): 5 | def __init__(self, args): 6 | if args is None: 7 | return 8 | 9 | self.use_gpu = torch.cuda.is_available() 10 | self.use_dynamic_oracle = args.use_dynamic_oracle == 1 11 | self.flag_oracle = False 12 | self.word_embedding = args.word_embedding 13 | self.word_embedding_file = args.word_embedding_file 14 | 15 | self.train_path = args.train 16 | self.test_path = args.test 17 | self.dev_path = args.dev 18 | self.train_syn_feat_path = args.train_syn_feat 19 | self.test_syn_feat_path = args.test_syn_feat 20 | self.dev_syn_feat_path = args.dev_syn_feat 21 | self.model_path = args.model_path +'/'+ args.experiment 22 | self.model_name = args.model_name 23 | self.alphabet_path = os.path.join(self.model_path, 'alphabets/') 24 | 25 | self.max_iter = args.max_iter 26 | self.word_dim = args.word_dim 27 | self.tag_dim = args.tag_dim 28 | self.etype_dim = args.etype_dim 29 | self.syntax_dim = 
args.syntax_dim 30 | self.max_sent_size = args.max_sent_size 31 | self.max_edu_size = args.max_edu_size 32 | self.max_state_size = args.max_state_size 33 | self.hidden_size = args.hidden_size 34 | 35 | self.freeze = args.freeze 36 | self.drop_prob = args.drop_prob 37 | self.num_layers = args.num_layers 38 | 39 | self.batch_size = args.batch_size 40 | self.opt = args.opt 41 | self.lr = args.lr 42 | self.ada_eps = args.ada_eps 43 | self.momentum = 0.9 44 | self.beta1 = args.beta1 45 | self.beta2 = args.beta2 46 | self.betas = (self.beta1, self.beta2) 47 | self.gamma = args.gamma 48 | self.start_decay = args.start_decay 49 | 50 | self.clip = args.clip 51 | 52 | self.decay = args.decay 53 | self.oracle_prob = args.oracle_prob 54 | self.start_dynamic_oracle = args.start_dynamic_oracle 55 | self.early_stopping = args.early_stopping 56 | 57 | def save(self): 58 | f = open(self.model_path + '/config.cfg', 'w') 59 | f.write("use_gpu = " + str(self.use_gpu) + '\n') 60 | f.write("use_dynamic_oracle = "+ str(self.use_dynamic_oracle) + '\n') 61 | f.write("flag_oracle = " + str(self.flag_oracle) + '\n') 62 | f.write("word_embedding = " + str(self.word_embedding) + '\n') 63 | f.write("word_embedding_file = " + str(self.word_embedding_file) + '\n') 64 | 65 | f.write("train_path = " + str(self.train_path) + '\n') 66 | f.write("test_path = " + str(self.test_path) + '\n') 67 | f.write("dev_path = " + str(self.dev_path) + '\n') 68 | f.write("train_syn_feat_path = " + str(self.train_syn_feat_path) + '\n') 69 | f.write("test_syn_feat_path = " + str(self.test_syn_feat_path) + '\n') 70 | f.write("dev_syn_feat_path = " + str(self.dev_syn_feat_path) + '\n') 71 | f.write("model_path = " + str(self.model_path) + '\n') 72 | f.write("model_name = " + str(self.model_name) + '\n') 73 | f.write("alphabet_path = " + str(self.alphabet_path) + '\n') 74 | 75 | f.write("max_iter = " + str(self.max_iter) + '\n') 76 | f.write("word_dim = " + str(self.word_dim) + '\n') 77 | f.write("tag_dim = " + 
def load_config(self, path):
    """Populate this config from a file previously written by ``save``.

    The file holds one ``name = value`` entry per line. Entries are read
    strictly by position, in the same order ``save`` writes them; the name
    on each line is not checked.

    Args:
        path: location of the config file.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a numeric entry cannot be parsed.
    """
    def as_text(text):
        return text

    def as_bool(text):
        return text == 'True'

    # (attribute, parser) in file order.
    layout = (
        ('use_gpu', as_bool),
        ('use_dynamic_oracle', as_bool),
        ('flag_oracle', as_bool),
        ('word_embedding', as_text),
        ('word_embedding_file', as_text),

        ('train_path', as_text),
        ('test_path', as_text),
        ('dev_path', as_text),
        ('train_syn_feat_path', as_text),
        ('test_syn_feat_path', as_text),
        ('dev_syn_feat_path', as_text),
        ('model_path', as_text),
        ('model_name', as_text),
        ('alphabet_path', as_text),

        ('max_iter', int),
        ('word_dim', int),
        ('tag_dim', int),
        ('etype_dim', int),
        ('syntax_dim', int),
        ('max_sent_size', int),
        ('max_edu_size', int),
        ('max_state_size', int),
        ('hidden_size', int),

        ('freeze', as_bool),
        ('drop_prob', float),
        ('num_layers', int),

        ('batch_size', int),
        ('opt', as_text),
        ('lr', float),
        ('ada_eps', float),
        ('momentum', float),
        ('beta1', float),
        ('beta2', float),
        ('gamma', float),
        ('start_decay', int),

        ('clip', float),

        ('decay', int),
        ('oracle_prob', float),
        ('start_dynamic_oracle', int),
        ('early_stopping', int),
    )

    # ``with`` closes the file even when a value fails to parse.
    with open(path, 'r') as f:
        for attribute, parse in layout:
            # Split on the first '=' only: an empty value yields '' (not the
            # "name =" text) and values containing " = " stay intact.
            raw = f.readline().partition('=')[2].strip()
            setattr(self, attribute, parse(raw))

    # Derived value; it is not stored in the file.
    self.betas = (self.beta1, self.beta2)
def shift_loss(self, error_cstate, gold_tree):
    """Count gold spans that pair a lower stack item's start with the top's end.

    For every stack entry below the top, each gold subtree whose
    ``edu_start`` equals that entry's ``edu_start`` and whose ``edu_end``
    equals the top entry's ``edu_end`` contributes one to the result.

    Args:
        error_cstate: state exposing ``stack`` and ``stack_size`` (>= 1).
        gold_tree: sequence of gold subtrees with ``edu_start``/``edu_end``.

    Returns:
        The number of matching (stack entry, gold subtree) pairs.
    """
    depth = error_cstate.stack_size
    assert(depth >= 1)
    top_end = error_cstate.stack[depth - 1].edu_end
    # Only the first ``depth - 1`` slots are live entries below the top.
    below_top = (error_cstate.stack[pos] for pos in range(depth - 1))
    return sum(
        1
        for node in below_top
        for gold in gold_tree
        if node.edu_start == gold.edu_start and top_end == gold.edu_end
    )
self.action_label_alpha.alphas: 99 | ac = CAction(CAction.REDUCE, nuclear, label) 100 | action_str = ac.get_str() 101 | pad_id = self.gold_action_alpha.alpha2id['PAD'] 102 | if self.gold_action_alpha.word2id(action_str) != pad_id: 103 | loss = self.nuclear_label_loss(ac, error_cstate, gold_tree) 104 | tmp_acts.append((ac, loss)) 105 | if loss == 0: 106 | candidate_actions.append(ac) 107 | return candidate_actions 108 | assert(len(tmp_acts) > 0) 109 | action_size = len(tmp_acts) 110 | min_loss = tmp_acts[0][1] 111 | min_index = 0 112 | for i in range(1, action_size): 113 | cur_iter = tmp_acts[i] 114 | cur_loss = cur_iter[1] 115 | if cur_loss < min_loss: 116 | min_index = i 117 | min_loss = cur_loss 118 | 119 | for i in range(action_size): 120 | cur_iter = tmp_acts[i] 121 | if cur_iter[1] == min_loss: 122 | candidate_actions.append(cur_iter[0]) 123 | return candidate_actions 124 | 125 | # parameter: 126 | # error_cstate (CState) 127 | # gold_tree (SubTree []) 128 | # return CState optimal_action 129 | def get_oracle(self, error_cstate, gold_tree): 130 | candidate_actions = [] 131 | ac = CAction('', '', '') 132 | if error_cstate.stack_size < 2: 133 | if error_cstate.next_index == error_cstate.edu_size: 134 | ac.set(CAction.POP_ROOT, '', '') 135 | else: 136 | ac.set(CAction.SHIFT, '', '') 137 | candidate_actions.append(ac) 138 | elif error_cstate.next_index == error_cstate.edu_size: 139 | ac.set(CAction.REDUCE, '', '') 140 | else: 141 | shift_loss = self.shift_loss(error_cstate, gold_tree) 142 | reduce_loss = self.reduce_loss(error_cstate, gold_tree) 143 | if shift_loss < reduce_loss: 144 | ac.set(CAction.SHIFT, '', '') 145 | candidate_actions.append(ac) 146 | elif shift_loss >= reduce_loss: 147 | ac.set(CAction.REDUCE, '', '') 148 | if shift_loss == reduce_loss: 149 | shift_action = CAction(CAction.SHIFT, '', '') 150 | candidate_actions.append(shift_action) 151 | if ac.is_reduce(): 152 | candidate_actions = self.get_reduce_candidate(error_cstate, gold_tree, 
class Metric(object):
    """Counters for label evaluation plus derived scores.

    Attributes:
        overall_label_count: number of gold labels.
        correct_label_count: number of correctly predicted labels.
        predicated_label_count: number of predicted labels (0 when only
            gold-vs-correct is being tracked).
    """

    def __init__(self):
        self.overall_label_count = 0
        self.correct_label_count = 0
        self.predicated_label_count = 0

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 when undefined."""
        if denominator == 0:
            return 0.0
        return numerator * 1.0 / denominator

    def set(self, metric):
        """Copy all three counters from another ``Metric``."""
        # ``self`` was missing from the signature, so this method could not
        # be called at all.
        self.overall_label_count = metric.overall_label_count
        self.correct_label_count = metric.correct_label_count
        self.predicated_label_count = metric.predicated_label_count

    def get_accuracy(self):
        """Return 1.0 with no gold labels, recall with no predictions, else F-measure."""
        if self.overall_label_count == 0:
            return 1.0
        if self.predicated_label_count == 0:
            return 1.0 * self.correct_label_count / self.overall_label_count
        return self.correct_label_count * 2.0 / (self.overall_label_count + self.predicated_label_count)

    def get_f_measure(self):
        """Return ``2 * correct / (overall + predicted)``; 0.0 when both counts are 0."""
        return self._ratio(self.correct_label_count * 2.0,
                           self.overall_label_count + self.predicated_label_count)

    def print_metric(self):
        """Return a one-line, human-readable summary of the counters.

        Ratios with a zero denominator are reported as 0.0 instead of
        raising ``ZeroDivisionError``.
        """
        correct = self.correct_label_count
        overall = self.overall_label_count
        predicted = self.predicated_label_count
        if predicted == 0:
            return ("Precision: P=" + str(correct) + "/" + str(overall) +
                    "=" + str(self._ratio(correct, overall)))
        return ("Recall: P=" + str(correct) + "/" + str(overall) + "=" + str(self._ratio(correct, overall)) +
                ", " + "Precision: P=" + str(correct) + "/" + str(predicted) + "=" + str(self._ratio(correct, predicted)) +
                ", " + "Fmeasure: " + str(self.get_f_measure()))

    def bIdentical(self):
        """Return True when every gold (and, if any, predicted) label is correct."""
        if self.overall_label_count != self.correct_label_count:
            return False
        if self.predicated_label_count == 0:
            return True
        return self.predicated_label_count == self.correct_label_count

    def reset(self):
        """Zero all counters."""
        self.overall_label_count = 0
        self.correct_label_count = 0
        self.predicated_label_count = 0
def assign_tensor(tensor, val):
    """Copy ``val`` into ``tensor`` in place and return ``tensor``.

    Args:
        tensor: an n-dimensional torch.Tensor or autograd.Variable
        val: an n-dimensional torch.Tensor to fill the tensor with

    Returns:
        The ``tensor`` argument itself, now holding the values of ``val``.
    """
    # Unwrap at most once. On PyTorch >= 0.4 every Tensor passes the
    # isinstance(..., Variable) check and ``.data`` is again a Tensor, so
    # recursing on ``tensor.data`` would never terminate.
    target = tensor.data if isinstance(tensor, Variable) else tensor
    target.copy_(val)
    return tensor
32 | freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process. 33 | padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index. 34 | max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this 35 | norm_type (float, optional): The p of the p-norm to compute for the max_norm option 36 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of 37 | the words in the mini-batch. 38 | sparse (boolean, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for 39 | more details regarding sparse gradients. 40 | Attributes: 41 | weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) 42 | Shape: 43 | - Input: LongTensor `(N1, N2, ...,Nm, W)`, N = mini-batch, W = number of indices to extract per mini-batch 44 | - Output: `(N1, N2, ..., Nm, W, embedding_dim)` 45 | Notes: 46 | Keep in mind that only a limited number of optimizers support 47 | sparse gradients: currently it's `optim.SGD` (`cuda` and `cpu`), 48 | and `optim.Adagrad` (`cpu`) 49 | """ 50 | 51 | def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False, padding_idx=None, 52 | max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False): 53 | super(Embedding, self).__init__() 54 | self.num_embeddings = num_embeddings 55 | self.embedding_dim = embedding_dim 56 | self.padding_idx = padding_idx 57 | self.max_norm = max_norm 58 | self.norm_type = norm_type 59 | self.scale_grad_by_freq = scale_grad_by_freq 60 | self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim)) 61 | self.frozen = freeze 62 | self.sparse = sparse 63 | 64 | self.reset_parameters(init_embedding) 65 | 66 | def reset_parameters(self, init_embedding): 67 | if init_embedding is None: 68 | scale = np.sqrt(3.0 / self.embedding_dim) 69 | self.weight.data.uniform_(-scale, scale) 70 
| else: 71 | assign_tensor(self.weight, init_embedding) 72 | if self.padding_idx is not None: 73 | self.weight.data[self.padding_idx].fill_(0) 74 | 75 | if self.frozen: 76 | if init_embedding is None: 77 | raise Warning('Freeze embeddings which are randomly initialized.') 78 | self.weight.requires_grad = False 79 | 80 | def freeze(self): 81 | self.weight.requires_grad = False 82 | self.frozen = True 83 | 84 | def forward(self, input): 85 | padding_idx = self.padding_idx 86 | if padding_idx is None: 87 | padding_idx = -1 88 | 89 | input_size = input.size() 90 | if input.dim() > 2: 91 | num_inputs = int(np.prod(input_size[:-1])) 92 | input = input.view(num_inputs, input_size[-1]) 93 | 94 | output_size = input_size + (self.embedding_dim,) 95 | return self._backend.Embedding.apply( 96 | input, self.weight, 97 | padding_idx, self.max_norm, self.norm_type, 98 | self.scale_grad_by_freq, self.sparse).view(output_size) 99 | 100 | def __repr__(self): 101 | s = '{name}({num_embeddings}, {embedding_dim}' 102 | if self.padding_idx is not None: 103 | s += ', padding_idx={padding_idx}' 104 | if self.max_norm is not None: 105 | s += ', max_norm={max_norm}' 106 | if self.norm_type != 2: 107 | s += ', norm_type={norm_type}' 108 | if self.scale_grad_by_freq is not False: 109 | s += ', scale_grad_by_freq={scale_grad_by_freq}' 110 | if self.sparse is not False: 111 | s += ', sparse=True' 112 | s += ')' 113 | return s.format(name=self.__class__.__name__, **self.__dict__) 114 | -------------------------------------------------------------------------------- /NeuralRST/modules/function_variational_rnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend 3 | from torch.nn import functional as F 4 | 5 | 6 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): 7 | if noise_in is not None: 8 | input = input * noise_in 9 | 
def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """Run one LSTM step with a separate weight slice (and optional noise) per gate.

    The input and the previous hidden state are replicated along a new
    leading axis of size 4 (input, forget, cell, output gate) and multiplied
    batch-wise with ``w_ih`` / ``w_hh``, so those weights must be 3-D with a
    leading dimension of 4.

    Args:
        input: step input.
        hidden: tuple ``(hx, cx)`` of previous hidden and cell state.
        w_ih, w_hh: per-gate input and recurrent weights.
        b_ih, b_hh: optional per-gate biases; ``None`` means no bias.
        noise_in, noise_hidden: optional multiplicative masks applied to the
            replicated input / hidden state (presumably per-gate dropout
            masks broadcastable to the replicated shape -- confirm with caller).

    Returns:
        Tuple ``(hy, cy)`` with the new hidden and cell state.
    """
    input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in

    hx, cx = hidden
    hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden

    # The biases default to None, so they must not be dereferenced blindly.
    if b_ih is None:
        input_part = torch.bmm(input, w_ih)
    else:
        input_part = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
    if b_hh is None:
        hidden_part = torch.bmm(hx, w_hh)
    else:
        hidden_part = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
    gates = input_part + hidden_part

    # Unpacking iterates over the leading gate axis.
    ingate, forgetgate, cellgate, outgate = gates

    ingate = F.sigmoid(ingate)
    forgetgate = F.sigmoid(forgetgate)
    cellgate = F.tanh(cellgate)
    outgate = F.sigmoid(outgate)

    cy = (forgetgate * cx) + (ingate * cellgate)
    hy = outgate * F.tanh(cy)

    return hy, cy
def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """Run one GRU step with a separate weight slice (and optional noise) per gate.

    The input and the previous hidden state are replicated along a new
    leading axis of size 3 (reset, input, new gate) and multiplied
    batch-wise with ``w_ih`` / ``w_hh``, so those weights must be 3-D with a
    leading dimension of 3.

    Args:
        input: step input.
        hidden: previous hidden state.
        w_ih, w_hh: per-gate input and recurrent weights.
        b_ih, b_hh: optional per-gate biases; ``None`` means no bias.
        noise_in, noise_hidden: optional multiplicative masks applied to the
            replicated input / hidden state (presumably per-gate dropout
            masks -- confirm with caller).

    Returns:
        The new hidden state.
    """
    input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
    hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden

    # The biases default to None, so they must not be dereferenced blindly.
    if b_ih is None:
        gi = torch.bmm(input, w_ih)
    else:
        gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
    if b_hh is None:
        gh = torch.bmm(hx, w_hh)
    else:
        gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)

    # Unpacking iterates over the leading gate axis.
    i_r, i_i, i_n = gi
    h_r, h_i, h_n = gh

    resetgate = F.sigmoid(i_r + h_r)
    inputgate = F.sigmoid(i_i + h_i)
    newgate = F.tanh(i_n + resetgate * h_n)
    # The interpolation uses the original (un-noised) hidden state.
    hy = newgate + inputgate * (hidden - newgate)

    return hy
def StackedRNN(inners, num_layers, lstm=False):
    """Build a forward function that stacks recurrent layers over all directions.

    Args:
        inners: one recurrence function per direction; each is called as
            ``inner(input, hidden_l, cell_l, mask)`` and returns
            ``(final_hidden, output)``.
        num_layers: number of stacked layers.
        lstm: when True, ``hidden`` is an ``(h, c)`` pair and is regrouped
            per layer.

    Returns:
        ``forward(input, hidden, cells, mask) -> (next_hidden, output)``.
    """
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def pack(states):
        # Concatenate the per-layer states and restore a leading layer axis.
        return torch.cat(states, 0).view(total_layers, *states[0].size())

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        final_states = []
        slot = 0  # flat index: layer * num_directions + direction
        for _ in range(num_layers):
            direction_outputs = []
            for inner in inners:
                state, produced = inner(input, hidden[slot], cells[slot], mask)
                final_states.append(state)
                direction_outputs.append(produced)
                slot += 1
            # The next layer consumes all directions joined on the last axis.
            input = torch.cat(direction_outputs, input.dim() - 1)

        if lstm:
            h_states, c_states = zip(*final_states)
            return (pack(h_states), pack(c_states)), input
        return pack(final_states), input

    return forward
def VarMaskedStep():
    """Build a single-step recurrence that honours an optional mask.

    The returned ``forward(input, hidden, cell, mask)`` behaves as follows:

    * ``mask`` is None or all entries exceed 0.5: the state is replaced by
      ``cell(input, hidden)``.
    * some entries exceed 0.5: the state is interpolated,
      ``old + (new - old) * mask`` (applied to both parts of an LSTM tuple).
    * no entry exceeds 0.5: the state is left untouched and the cell is not
      called.

    It returns ``(hidden, output)`` where ``output`` is the hidden state
    (the first element when the state is a tuple).
    """
    def forward(input, hidden, cell, mask):
        if mask is None or mask.data.min() > 0.5:
            state = cell(input, hidden)
        elif mask.data.max() > 0.5:
            proposal = cell(input, hidden)
            if isinstance(hidden, tuple):
                # LSTM state: blend hidden and cell parts separately.
                (h_old, c_old), (h_new, c_new) = hidden, proposal
                state = (h_old + (h_new - h_old) * mask, c_old + (c_new - c_old) * mask)
            else:
                state = hidden + (proposal - hidden) * mask
        else:
            state = hidden

        if isinstance(state, tuple):
            return state, state[0]
        return state, state

    return forward
259 | 260 | def forward(input, cells, hidden, mask): 261 | nexth, output = func(input, hidden, cells, mask) 262 | return output, nexth 263 | 264 | return forward 265 | -------------------------------------------------------------------------------- /NeuralRST/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.shutil-get-terminal-size==1.0.0 2 | boto==2.49.0 3 | boto3==1.9.93 4 | botocore==1.12.93 5 | bz2file==0.98 6 | certifi==2018.11.29 7 | chardet==3.0.4 8 | decorator==4.3.2 9 | docutils==0.14 10 | enum34==1.1.6 11 | futures==3.2.0 12 | gensim==3.7.1 13 | idna==2.8 14 | ipdb==0.11 15 | ipython==5.8.0 16 | ipython-genutils==0.2.0 17 | jmespath==0.9.3 18 | numpy==1.16.1 19 | pathlib2==2.3.3 20 | pexpect==4.6.0 21 | pickleshare==0.7.5 22 | prompt-toolkit==1.0.15 23 | ptyprocess==0.6.0 24 | Pygments==2.3.1 25 | python-dateutil==2.8.0 26 | PyYAML==3.13 27 | requests==2.21.0 28 | s3transfer==0.2.0 29 | scandir==1.9.0 30 | scipy==1.2.1 31 | simplegeneric==0.8.1 32 | six==1.12.0 33 | smart-open==1.8.0 34 | torch==0.3.1 35 | traitlets==4.3.2 36 | urllib3==1.24.1 37 | wcwidth==0.1.7 38 | -------------------------------------------------------------------------------- /NeuralRST/run_rst_parser.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | import numpy as np 3 | import random 4 | from datetime import datetime 5 | 6 | sys.path.append(".") 7 | 8 | import argparse 9 | import torch 10 | import json 11 | 12 | from in_out.reader import Reader 13 | from in_out.util import load_embedding_dict, get_logger 14 | from in_out.preprocess import create_alphabet 15 | from in_out.preprocess import batch_data_variable 16 | from models.vocab import Vocab 17 | from models.metric import Metric 18 | from models.config import Config 19 | from models.architecture import MainArchitecture 20 | 21 | 22 | main_path='/home/ffajri/' 23 | def main(): 24 | args_parser = 
argparse.ArgumentParser() 25 | args_parser.add_argument('--config_path', required=True) 26 | args = args_parser.parse_args() 27 | config = Config(None) 28 | config.load_config(args.config_path) 29 | 30 | logger = get_logger("RSTParser RUN", config.use_dynamic_oracle, config.model_path) 31 | word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(None, config.alphabet_path, logger) 32 | vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha) 33 | 34 | network = MainArchitecture(vocab, config) 35 | network.load_state_dict(torch.load(config.model_name)) 36 | 37 | if config.use_gpu: 38 | network = network.cuda() 39 | network.eval() 40 | 41 | logger.info('Reading test instance') 42 | reader = Reader(config.test_path, config.test_syn_feat_path) 43 | test_instances = reader.read_data() 44 | time_start = datetime.now() 45 | batch_size = config.batch_size 46 | span = Metric(); nuclear = Metric(); relation = Metric(); full = Metric() 47 | predictions = [] 48 | total_data_test = len(test_instances) 49 | for i in range(0, total_data_test, batch_size): 50 | end_index = i+batch_size 51 | if end_index > total_data_test: 52 | end_index = total_data_test 53 | indices = np.array(range(i, end_index)) 54 | subset_data_test = batch_data_variable(test_instances, indices, vocab, config) 55 | prediction_of_subtrees = network.loss(subset_data_test, None) 56 | predictions += prediction_of_subtrees 57 | for i in range(total_data_test): 58 | span, nuclear, relation, full = test_instances[i].evaluate(predictions[i], span, nuclear, relation, full) 59 | time_elapsed = datetime.now() - time_start 60 | m,s = divmod(time_elapsed.seconds, 60) 61 | logger.info('TEST is finished in {} mins {} secs'.format(m,s)) 62 | logger.info("S: " + span.print_metric()) 63 | logger.info("N: " + nuclear.print_metric()) 64 | logger.info("R: " + relation.print_metric()) 65 | logger.info("F: " + full.print_metric()) 66 | 67 | 68 | 69 | import ipdb; 
class CAction(object):
    """A single parser transition.

    Action codes:
        REDUCE    = 'RD'
        SHIFT     = 'SH'
        POP_ROOT  = 'PR'
        NO_ACTION = ''

    Nuclearity values: 'NN', 'NS', 'SN', or '' when not applicable.

    ``label`` is the relation name as a string (e.g. cause, elab, back,
    same, attr); ``label_id`` is its integer id and stays -1 until
    ``set_label_id`` resolves it.
    """
    POP_ROOT = 'PR'
    REDUCE = 'RD'
    SHIFT = 'SH'
    NO_ACTION = ''

    NN = 'NN'
    NS = 'NS'
    SN = 'SN'
    NO_NUCLEAR = ''

    def __init__(self, code, nuclear, label):
        # code, nuclear and label are strings; label_id is the only integer.
        self.code = code
        self.label = label
        self.nuclear = nuclear
        self.label_id = -1

    def is_none(self):
        """Return True when this is the empty (no-op) action."""
        return self.code == self.NO_ACTION

    def is_finish(self):
        """Return True for a POP_ROOT action."""
        return self.code == self.POP_ROOT

    def is_shift(self):
        """Return True for a SHIFT action."""
        return self.code == self.SHIFT

    def is_reduce(self):
        """Return True for a REDUCE action."""
        return self.code == self.REDUCE

    def set_label_id(self, label_alpha):
        """Look up the label in ``label_alpha``; unknown labels map to -1."""
        self.label_id = label_alpha.get(self.label, -1)

    def get_str(self):
        """Return the textual name of this action.

        A REDUCE whose nuclearity is not NN/NS/SN yields ``None``.
        """
        if self.is_shift():
            return "SHIFT__"
        if self.is_reduce():
            prefixes = (
                (self.NN, "REDUCE_NN_"),
                (self.NS, "REDUCE_NS_"),
                (self.SN, "REDUCE_SN_"),
            )
            for tag, prefix in prefixes:
                if self.nuclear == tag:
                    return prefix + self.label
            return None
        if self.is_finish():
            return "POPROOT__"
        return "NOACTION__"

    def set(self, code, nuclear, label):
        """Overwrite code, nuclearity and label (label_id is left as is)."""
        self.code = code
        self.nuclear = nuclear
        self.label = label

    def set_from_object(self, ac):
        """Copy every field, including label_id, from another action."""
        self.code = ac.code
        self.label = ac.label
        self.nuclear = ac.nuclear
        self.label_id = ac.label_id
def reduce(self, cstate, nuclear, label):
    """Apply a REDUCE transition from this state, writing the result to ``cstate``.

    The two topmost stack nodes are merged: the lower node's span is
    extended to the end of the upper node and tagged with ``nuclear`` and
    ``label``; the upper node is cleared.

    Args:
        cstate: destination state; receives a copy of this state's stack
            (via ``copy_state``) with the merge applied.
        nuclear: nuclearity string stored on the merged node.
        label: relation label stored on the merged node.

    Raises:
        AssertionError: if fewer than two nodes are on the stack, if the two
            top nodes are not both valid, or if their spans are not adjacent.
    """
    # Without this guard a short stack would wrap around through negative
    # indices and pick unrelated nodes from the end of the stack buffer.
    if self.stack_size < 2:
        raise AssertionError(
            'REDUCE needs at least two stack nodes, got %d' % self.stack_size)

    cstate.stack_size = self.stack_size - 1
    cstate.next_index = self.next_index
    self.copy_state(cstate)

    # After copy_state these nodes belong to cstate, so the source state is
    # left untouched by the edits below.
    top0 = cstate.stack[self.stack_size - 1]
    top1 = cstate.stack[self.stack_size - 2]

    # Raised explicitly (instead of dropping into a debugger) so that a bad
    # transition fails loudly in batch runs and is not stripped under -O.
    if not (top0.is_validate and top1.is_validate):
        raise AssertionError('REDUCE on an invalid stack node')
    if top0.edu_start != top1.edu_end + 1:
        raise AssertionError(
            'REDUCE on non-adjacent spans: left ends at %s, right starts at %s'
            % (top1.edu_end, top0.edu_start))

    top1.edu_end = top0.edu_end
    top1.nuclear = nuclear
    top1.label = label
    top0.clear()

    cstate.pre_action.set('RD', nuclear, label)
    cstate.done_mark()
def move(self, cstate, ac):
    """Apply action ``ac`` to this state and return the successor ``cstate``.

    Args:
        cstate: the CState that receives the result of the transition.
        ac: the CAction to apply (shift, reduce or pop-root).

    Raises:
        Exception: if ``ac`` is none of the three supported actions.
    """
    # Any state produced by a transition is no longer the start state.
    cstate.is_start = False

    if ac.is_shift():
        self.shift(cstate)
        return cstate

    if ac.is_reduce():
        self.reduce(cstate, ac.nuclear, ac.label)
        return cstate

    if not ac.is_finish():
        raise Exception('Error Action!')

    self.pop_root(cstate)
    return cstate
def prepare_index(self):
    """Refresh ``self.atom_feat`` from the current stack and queue, and return it.

    ``s0``, ``s1`` and ``s2`` become the top three stack nodes (``None``
    where the stack is too shallow). ``q0`` becomes the index of the next
    unread EDU, or ``None`` when that index is outside ``[0, edu_size)``.
    """
    feat = self.atom_feat
    depth = self.stack_size

    feat.s0 = self.stack[depth - 1] if depth > 0 else None
    feat.s1 = self.stack[depth - 2] if depth > 1 else None
    feat.s2 = self.stack[depth - 3] if depth > 2 else None

    # q0 holds an integer position, not a node.
    has_next = 0 <= self.next_index < self.edu_size
    feat.q0 = self.next_index if has_next else None
    return feat
Neural RST Parser (https://github.com/fajri91/NeuralRST) -- paper: Transition-based Neural RST Parsing with Implicit Syntax Features (https://www.aclweb.org/anthology/C18-1047/). 6 | 7 | This code is used to extract: 8 | 1. Latent feature of discourse units. 9 | 2. Shallow feature of discourse units. 10 | 11 | For more technical details, please refer to our paper: 12 | 13 | Fajri Koto, Jey Han Lau, Timothy Baldwin. [Improved Document Modelling with a Neural Discourse Parser.](https://www.aclweb.org/anthology/U19-1010.pdf) In Proceedings of the 2019 Australasian Language Technology Workshop, Sydney. 14 | 15 | ## Dependencies and Installation 16 | 1. Python 2.7 17 | 2. Run `pip install -r requirements.txt` 18 | 19 | ## Pre-Extraction 20 | There are three main steps: 21 | 1. Using Stanford CoreNLP. After downloading the appropriate Stanford CoreNLP, please run `python corenlp.py --source=PATH_TO_YOUR_DOCUMENTS/* --target=PATH_TO_YOUR_OUTPUT`. Please make sure you put all the necessary files of Stanford CoreNLP in this repo in a folder named `stanford-corenlp`. 22 | 2. For the next two steps, please follow https://github.com/fajri91/DPLP for: 23 | * Converting XML file to CoNLL format. 24 | * Segmenting CoNLL file to get EDUs. The output is *.merge file. 25 | 26 | ## Extraction 27 | Now you are ready to extract latent/shallow features as well as the RST tree. 28 | 1. For latent feature, please run `python extract_latent_feature.py` 29 | 2. For shallow feature, please first run `python extract_tree.py` and after that run `python extract_shallow_feature.py` 30 | 31 | Note1: Please manually adjust all PATHs in the code, as I haven't implemented command-line argument parsing in these scripts.
32 | Note2: Our RST parser performance is similar to _Transition-based Neural RST Parsing with Implicit Syntax Features_ (https://www.aclweb.org/anthology/C18-1047/). 33 | -------------------------------------------------------------------------------- /biaffine_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from neuronlp2.io import get_logger, conllx_data 7 | from neuronlp2.io import CoNLLXWriter, utils 8 | from neuronlp2.tasks import parser 9 | from neuronlp2.models import BiRecurrentConvBiAffine 10 | 11 | # Special vocabulary symbols - we always put them at the start. 12 | PAD = b"_PAD" 13 | PAD_POS = b"_PAD_POS" 14 | PAD_TYPE = b"_" 15 | PAD_CHAR = b"_PAD_CHAR" 16 | ROOT = b"_ROOT" 17 | ROOT_POS = b"_ROOT_POS" 18 | ROOT_TYPE = b"_" 19 | ROOT_CHAR = b"_ROOT_CHAR" 20 | END = b"_END" 21 | END_POS = b"_END_POS" 22 | END_TYPE = b"_" 23 | END_CHAR = b"_END_CHAR" 24 | _START_VOCAB = [PAD, ROOT, END] 25 | 26 | UNK_ID = 0 27 | PAD_ID_WORD = 1 28 | PAD_ID_CHAR = 1 29 | PAD_ID_TAG = 0 30 | 31 | NUM_SYMBOLIC_TAGS = 3 32 | 33 | _buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 140, 200, 300] 34 | 35 | 36 | class BiaffineModel(object): 37 | def __init__(self, model_path, model_name): 38 | print("................................................") 39 | print("LOADING Biaffine Model") 40 | alphabet_path = os.path.join(model_path, 'alphabets/') 41 | model_name = os.path.join(model_path, model_name) 42 | 43 | self.word_alpha, self.char_alpha, self.tag_alpha, self.type_alpha = conllx_data.create_alphabets(alphabet_path, None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None) 44 | self.id2word = {v: k for k, v in self.word_alpha.instance2index.iteritems()} 45 | 46 | num_words = self.word_alpha.size() 47 | num_chars = self.char_alpha.size() 48 | num_pos = self.tag_alpha.size() 49 | num_types = 
def prepare_data(self, sentences, use_gpu=True):
    """Convert sentences into single-sentence padded batches for the network.

    Each sentence is padded to the smallest entry of ``_buckets`` that is
    strictly larger than its length. Sentences longer than every bucket are
    truncated to the largest bucket.

    Args:
        sentences: iterable of sentence objects exposing ``length()``,
            ``word_ids``, ``seq_char_ids``, ``tag_ids``, ``seq_chars``,
            ``words`` and ``edu_ids``.
        use_gpu: when true, every tensor is moved to CUDA.

    Returns:
        A list with one tuple per sentence:
        ``(words, chars, pos, masks, lengths, sentence.words, sentence.edu_ids)``.
    """
    ret_value = []
    for sentence in sentences:
        inst_size = sentence.length()

        # Pick the first bucket that leaves at least one padding slot.
        bucket = 0
        data = None
        for bucket_size in _buckets:
            if inst_size < bucket_size:
                bucket = bucket_size
                data = [sentence.word_ids, sentence.seq_char_ids, sentence.tag_ids]
                break
        if data is None:
            # Too long for every bucket: truncate to the largest one.
            bucket = _buckets[-1]
            data = [sentence.word_ids[:bucket],
                    sentence.seq_char_ids[:bucket],
                    sentence.tag_ids[:bucket]]
        # Measured over the whole sentence in both branches, as before.
        max_len = max([len(seq_char) for seq_char in sentence.seq_chars])

        char_length = min(utils.MAX_CHAR_LENGTH, max_len + utils.NUM_CHAR_PAD)
        wid_inputs = np.empty([1, bucket], dtype=np.int64)
        cid_inputs = np.empty([1, bucket, char_length], dtype=np.int64)
        pid_inputs = np.empty([1, bucket], dtype=np.int64)

        masks = np.zeros([1, bucket], dtype=np.float32)
        single = np.zeros([1, bucket], dtype=np.int64)

        wids, cid_seqs, pids = data
        inst_size = len(wids)
        # Only the first slot of the old bucket-sized buffer was ever written
        # or returned, so a one-element array carries the same information
        # without exposing uninitialised memory.
        lengths = np.array([inst_size], dtype=np.int64)

        # word ids
        wid_inputs[0, :inst_size] = wids
        wid_inputs[0, inst_size:] = PAD_ID_WORD
        # char ids; an indexing error here now propagates instead of
        # opening an interactive debugger.
        for c, cids in enumerate(cid_seqs):
            limit = min(len(cids), char_length)
            cid_inputs[0, c, :limit] = cids[:limit]
            cid_inputs[0, c, limit:] = PAD_ID_CHAR
        cid_inputs[0, inst_size:, :] = PAD_ID_CHAR
        # pos ids
        pid_inputs[0, :inst_size] = pids
        pid_inputs[0, inst_size:] = PAD_ID_TAG
        # masks
        masks[0, :inst_size] = 1.0
        for j, wid in enumerate(wids):
            if self.word_alpha.is_singleton(wid):
                single[0, j] = 1

        words = Variable(torch.from_numpy(wid_inputs), volatile=False)
        chars = Variable(torch.from_numpy(cid_inputs), volatile=False)
        pos = Variable(torch.from_numpy(pid_inputs), volatile=False)
        masks = Variable(torch.from_numpy(masks), volatile=False)
        single = Variable(torch.from_numpy(single), volatile=False)
        lengths = torch.from_numpy(lengths)
        if use_gpu:
            words = words.cuda()
            chars = chars.cuda()
            pos = pos.cuda()
            masks = masks.cuda()
            single = single.cuda()
            lengths = lengths.cuda()

        index = slice(0, 1)
        ret_value.append((words[index], chars[index], pos[index], masks[index],
                          lengths[index], sentence.words, sentence.edu_ids))
    return ret_value
"""Run Stanford CoreNLP over a set of documents using several worker threads.

Usage: python corenlp.py --source='DOCS/*' --target=OUTPUT_DIR

The source files are split into THREADS contiguous batches. Each batch is
written to a ``tmp<i>.txt`` file list (skipping documents whose XML output
already exists in the target directory) and handed to one CoreNLP process.
"""
from subprocess import call
import os
import glob
import threading
import math
import argparse


scriptdir = '.'
THREADS = 27

parser = argparse.ArgumentParser()
# Both paths are mandatory: the code below cannot run without them.
parser.add_argument("-s", "--source", help="provide source path", type=str, required=True)
parser.add_argument("-t", "--target", help="provide target path", type=str, required=True)

args = parser.parse_args()
PATH = args.source
TARGET = args.target

files = glob.glob(PATH)
targets = glob.glob(TARGET + '/*')
_existing_targets = set(targets)  # O(1) membership tests in generate_listfile
TOTAL_FILES = len(files)
# Force float division: under Python 2 ``TOTAL_FILES/THREADS`` floors, which
# made the batches too small and left the last files unprocessed.
BATCH_SIZE = int(math.ceil(1.0 * TOTAL_FILES / THREADS))


def run_thread(ftmp):
    """Run one CoreNLP process over the documents listed in ``ftmp``."""
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; java expands the classpath wildcard itself.
    call(['/usr/bin/java', '-mx150g', '-cp', 'stanford-corenlp/*',
          'edu.stanford.nlp.pipeline.StanfordCoreNLP',
          '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse',
          '-ssplit.eolonly', '-tokenize.whitespace', 'true',
          '-filelist', ftmp,
          '-outputFormat', 'xml',
          '-outputDirectory', TARGET])


def generate_listfile(start, end, ftmp):
    """Write to ``ftmp`` the files in ``files[start:end]`` that still lack output."""
    sliced_files = files[start:end]
    with open(ftmp, 'w') as w:
        for f in sliced_files:
            fname = TARGET + '/' + f.split('/')[-1] + '.xml'
            if fname not in _existing_targets:
                w.write(f + '\n')


for i in range(THREADS):
    start = i * BATCH_SIZE
    if start >= TOTAL_FILES:
        # Every file is already assigned; do not start idle JVMs.
        break
    end = min(start + BATCH_SIZE, TOTAL_FILES)

    ftmp = 'tmp' + str(i) + '.txt'
    generate_listfile(start, end, ftmp)

    t = threading.Thread(target=run_thread, args=(ftmp,))
    t.start()
    print (start, end)
def form_sentence(lines, word_alpha, char_alpha, tag_alpha, symbolic_root=False, symbolic_end=False):
    """Build a ``Sentence`` from the tab-separated token lines of one sentence.

    Column 2 of each line is used as the word form (digits normalised to
    ``0``), column 4 as the POS tag (``#`` is mapped to ``$``) and column 9
    as the integer EDU id.

    Args:
        lines: token lines belonging to a single sentence.
        word_alpha, char_alpha, tag_alpha: alphabets providing ``get_index``.
        symbolic_root: prepend the artificial ROOT token.
        symbolic_end: append the artificial END token.

    Returns:
        ``Sentence(words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids)``.
        ROOT and END tokens get no entry in ``edu_ids``.
    """
    words = []
    word_ids = []
    seq_chars = []
    seq_char_ids = []
    tags = []
    tag_ids = []
    edu_ids = []

    if symbolic_root:
        words.append(ROOT)
        word_ids.append(word_alpha.get_index(ROOT))
        seq_chars.append([ROOT_CHAR, ])
        seq_char_ids.append([char_alpha.get_index(ROOT_CHAR), ])
        tags.append(ROOT_POS)
        tag_ids.append(tag_alpha.get_index(ROOT_POS))

    for line in lines:
        chars = []
        char_ids = []
        data = line.strip().split('\t')
        word = DIGIT_RE.sub(b"0", data[2])
        word_id = word_alpha.get_index(word)
        # Iterate over the characters of the current token. The previous
        # code looped over ``words`` (all earlier tokens), so whole words
        # were looked up in the character alphabet.
        for c in word:
            chars.append(c)
            char_ids.append(char_alpha.get_index(c))
        tag = '$' if data[4] == '#' else data[4]
        tag_id = tag_alpha.get_index(tag)
        edu_id = int(data[9])

        words.append(word)
        word_ids.append(word_id)
        seq_chars.append(chars)
        seq_char_ids.append(char_ids)
        tags.append(tag)
        tag_ids.append(tag_id)
        edu_ids.append(edu_id)

    if symbolic_end:
        words.append(END)
        word_ids.append(word_alpha.get_index(END))
        seq_chars.append([END_CHAR, ])
        seq_char_ids.append([char_alpha.get_index(END_CHAR), ])
        tags.append(END_POS)
        tag_ids.append(tag_alpha.get_index(END_POS))

    return Sentence(words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids)
def run_thread(files):
    """Turn each saved subtree file in ``files`` into a shallow-feature file.

    For every path the stored subtrees are loaded, assembled into a tree,
    passed through ``RSTFeature.generate_heuristic_feature`` and the result
    is saved under ``TARGET_PATH`` using the input's base name without
    ``.npy``.
    """
    for tree_path in files:
        stem = tree_path.split('/')[-1].replace('.npy', '')

        parsed = CResult()
        parsed.subtrees = list(np.load(tree_path))
        tree = parsed.obtain_tree()

        extractor = RSTFeature()
        features = extractor.generate_heuristic_feature(tree)
        np.save(TARGET_PATH + stem, features)
= allfiles[start:end] 33 | 34 | process = Process(target=run_thread, args=(p,)) 35 | process.start() 36 | processes.append(process) 37 | if end == len(allfiles): 38 | break 39 | for process in processes: 40 | process.join() 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /extract_tree.py: -------------------------------------------------------------------------------- 1 | import sys, time 2 | import numpy as np 3 | import random 4 | import re, os 5 | from datetime import datetime 6 | import argparse 7 | from sentence import Sentence, Instance, EDU 8 | from rst_model import RSTModel 9 | from biaffine_model import BiaffineModel 10 | import glob 11 | import threading 12 | import math 13 | from multiprocessing import Process 14 | 15 | sys.path.append(".") 16 | ROOT = b"_ROOT" 17 | ROOT_POS = b"_ROOT_POS" 18 | ROOT_CHAR = b"_ROOT_CHAR" 19 | END = b"_END" 20 | END_POS = b"_END_POS" 21 | END_CHAR = b"_END_CHAR" 22 | 23 | UNK_ID = 0 24 | PAD_ID_WORD = 1 25 | PAD_ID_CHAR = 1 26 | PAD_ID_TAG = 0 27 | 28 | NUM_SYMBOLIC_TAGS = 3 29 | 30 | # Regular expressions used to normalize digits. 
def data_reader(file_path, biaffine):
    """Read a ``.merge`` file and return an ``Instance`` with syntax features.

    Sentences are separated by blank lines. Each sentence is converted with
    ``form_sentence`` and fed to the biaffine parser to obtain its syntax
    features.

    Args:
        file_path: path of the file to read.
        biaffine: model exposing ``word_alpha``, ``char_alpha``,
            ``tag_alpha``, ``prepare_data`` and ``get_syntax_feature``.

    Returns:
        ``Instance(sentences, syntax_features)``.
    """
    sentences = []
    lines = []

    def flush():
        # Skip empty groups: repeated blank lines would otherwise create
        # zero-token sentences.
        if lines:
            sentences.append(form_sentence(lines, biaffine.word_alpha, biaffine.char_alpha, biaffine.tag_alpha))

    # ``with`` closes the handle even if parsing a line fails.
    with open(file_path, 'r') as f:
        for line in f:
            if line.strip() == '':
                flush()
                lines = []
            else:
                lines.append(line)
    # Keep the last sentence when the file does not end with a blank line.
    flush()

    data = biaffine.prepare_data(sentences)
    syntax_features = biaffine.get_syntax_feature(data, sentences)

    # Every sentence must have exactly one feature vector per token.
    for sentence, feature in zip(sentences, syntax_features):
        assert(len(sentence.words) == feature.shape[1])
    instance = Instance(sentences, syntax_features)
    return instance
37 | "f", 38 | "H", 39 | "h", 40 | "-", 41 | ":", 42 | "I", 43 | "w", 44 | "T", 45 | "y", 46 | "v", 47 | "E", 48 | "S", 49 | "L", 50 | "P", 51 | "W", 52 | "x", 53 | "R", 54 | "B", 55 | "3", 56 | "6", 57 | "%", 58 | "1", 59 | "2", 60 | "5", 61 | "9", 62 | "0", 63 | "$", 64 | "4", 65 | "8", 66 | "M", 67 | "Y", 68 | "D", 69 | "q", 70 | "Q", 71 | "X", 72 | "&", 73 | "z", 74 | "j", 75 | "/", 76 | "{", 77 | "V", 78 | "}", 79 | "?", 80 | ";", 81 | "!", 82 | "Z", 83 | "#", 84 | "*", 85 | "=", 86 | "@" 87 | ], 88 | "instance2index": { 89 | "m": 7, 90 | "M": 64, 91 | "_ROOT_CHAR": 2, 92 | "!": 79, 93 | "#": 81, 94 | "%": 55, 95 | "$": 61, 96 | "'": 31, 97 | "&": 70, 98 | "*": 82, 99 | "-": 38, 100 | ",": 25, 101 | "/": 73, 102 | ".": 10, 103 | "1": 56, 104 | "0": 60, 105 | "3": 53, 106 | "2": 57, 107 | "5": 58, 108 | "4": 62, 109 | "7": 34, 110 | "6": 54, 111 | "9": 59, 112 | "8": 63, 113 | ";": 78, 114 | ":": 39, 115 | "=": 83, 116 | "?": 77, 117 | "A": 12, 118 | "@": 84, 119 | "C": 26, 120 | "B": 52, 121 | "E": 45, 122 | "D": 66, 123 | "G": 29, 124 | "F": 4, 125 | "I": 40, 126 | "H": 36, 127 | "K": 19, 128 | "J": 17, 129 | "_END_CHAR": 3, 130 | "L": 47, 131 | "O": 33, 132 | "N": 11, 133 | "Q": 68, 134 | "P": 48, 135 | "S": 46, 136 | "R": 51, 137 | "U": 9, 138 | "T": 42, 139 | "W": 49, 140 | "V": 75, 141 | "Y": 65, 142 | "X": 69, 143 | "Z": 80, 144 | "a": 14, 145 | "`": 27, 146 | "c": 24, 147 | "b": 13, 148 | "e": 8, 149 | "d": 16, 150 | "g": 30, 151 | "f": 35, 152 | "i": 20, 153 | "h": 37, 154 | "k": 21, 155 | "j": 72, 156 | "_PAD_CHAR": 1, 157 | "l": 28, 158 | "o": 5, 159 | "n": 18, 160 | "q": 67, 161 | "p": 22, 162 | "s": 15, 163 | "r": 6, 164 | "u": 32, 165 | "t": 23, 166 | "w": 41, 167 | "v": 44, 168 | "y": 43, 169 | "x": 50, 170 | "{": 74, 171 | "z": 71, 172 | "}": 76 173 | } 174 | } -------------------------------------------------------------------------------- /models/biaffine/alphabets/pos.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "instances": [ 3 | "_PAD_POS", 4 | "_ROOT_POS", 5 | "_END_POS", 6 | "JJ", 7 | "NNP", 8 | ",", 9 | "IN", 10 | "DT", 11 | "``", 12 | "NN", 13 | "''", 14 | "CD", 15 | "HYPH", 16 | ":", 17 | "PRP", 18 | "VBP", 19 | "VBD", 20 | ".", 21 | "MD", 22 | "VB", 23 | "TO", 24 | "CC", 25 | "RB", 26 | "VBG", 27 | "NNPS", 28 | "PRP$", 29 | "NNS", 30 | "$", 31 | "VBN", 32 | "POS", 33 | "VBZ", 34 | "JJR", 35 | "RBR", 36 | "-LRB-", 37 | "-RRB-", 38 | "WDT", 39 | "WP", 40 | "WRB", 41 | "SYM", 42 | "RP", 43 | "EX", 44 | "JJS", 45 | "LS", 46 | "RBS", 47 | "PDT", 48 | "FW", 49 | "WP$", 50 | "UH", 51 | "NFP", 52 | "AFX" 53 | ], 54 | "instance2index": { 55 | "PRP$": 25, 56 | "VBG": 23, 57 | "VBD": 16, 58 | "NFP": 48, 59 | "``": 8, 60 | "_ROOT_POS": 1, 61 | "''": 10, 62 | "VBP": 15, 63 | "VBN": 28, 64 | "_END_POS": 2, 65 | "JJ": 3, 66 | "WP": 36, 67 | "VBZ": 30, 68 | "DT": 7, 69 | "RP": 39, 70 | "$": 27, 71 | "NN": 9, 72 | "FW": 45, 73 | ",": 5, 74 | ".": 17, 75 | "TO": 20, 76 | "UH": 47, 77 | "PRP": 14, 78 | "RB": 22, 79 | "-LRB-": 33, 80 | ":": 13, 81 | "NNS": 26, 82 | "HYPH": 12, 83 | "VB": 19, 84 | "WRB": 37, 85 | "CC": 21, 86 | "LS": 42, 87 | "PDT": 44, 88 | "RBS": 43, 89 | "RBR": 32, 90 | "CD": 11, 91 | "AFX": 49, 92 | "EX": 40, 93 | "IN": 6, 94 | "WP$": 46, 95 | "MD": 18, 96 | "NNPS": 24, 97 | "-RRB-": 34, 98 | "POS": 29, 99 | "JJS": 41, 100 | "JJR": 31, 101 | "SYM": 38, 102 | "_PAD_POS": 0, 103 | "WDT": 35, 104 | "NNP": 4 105 | } 106 | } -------------------------------------------------------------------------------- /models/biaffine/alphabets/type.json: -------------------------------------------------------------------------------- 1 | { 2 | "instances": [ 3 | "_", 4 | "_", 5 | "_", 6 | "amod", 7 | "nn", 8 | "root", 9 | "punct", 10 | "prep", 11 | "det", 12 | "pobj", 13 | "tmod", 14 | "num", 15 | "nsubj", 16 | "cop", 17 | "ccomp", 18 | "aux", 19 | "xcomp", 20 | "dobj", 21 | "cc", 22 | "conj", 
23 | "dep", 24 | "appos", 25 | "poss", 26 | "npadvmod", 27 | "partmod", 28 | "nsubjpass", 29 | "auxpass", 30 | "possessive", 31 | "advmod", 32 | "pcomp", 33 | "parataxis", 34 | "number", 35 | "mark", 36 | "advcl", 37 | "rcmod", 38 | "acomp", 39 | "prt", 40 | "infmod", 41 | "quantmod", 42 | "expl", 43 | "preconj", 44 | "csubj", 45 | "neg", 46 | "mwe", 47 | "iobj", 48 | "predet", 49 | "discourse", 50 | "csubjpass" 51 | ], 52 | "instance2index": { 53 | "cc": 18, 54 | "number": 31, 55 | "ccomp": 14, 56 | "possessive": 27, 57 | "prt": 36, 58 | "num": 11, 59 | "nsubjpass": 25, 60 | "csubj": 41, 61 | "conj": 19, 62 | "amod": 3, 63 | "_": 0, 64 | "nn": 4, 65 | "neg": 42, 66 | "discourse": 46, 67 | "mark": 32, 68 | "auxpass": 26, 69 | "infmod": 37, 70 | "_": 1, 71 | "advcl": 33, 72 | "aux": 15, 73 | "prep": 7, 74 | "parataxis": 30, 75 | "mwe": 43, 76 | "nsubj": 12, 77 | "rcmod": 34, 78 | "advmod": 28, 79 | "punct": 6, 80 | "quantmod": 38, 81 | "tmod": 10, 82 | "acomp": 35, 83 | "pcomp": 29, 84 | "csubjpass": 47, 85 | "poss": 22, 86 | "npadvmod": 23, 87 | "xcomp": 16, 88 | "cop": 13, 89 | "partmod": 24, 90 | "_": 2, 91 | "appos": 21, 92 | "det": 8, 93 | "dobj": 17, 94 | "dep": 20, 95 | "pobj": 9, 96 | "iobj": 44, 97 | "expl": 39, 98 | "predet": 45, 99 | "preconj": 40, 100 | "root": 5 101 | } 102 | } -------------------------------------------------------------------------------- /models/biaffine/network.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/models/biaffine/network.pt -------------------------------------------------------------------------------- /models/biaffine/network.pt.arg.json: -------------------------------------------------------------------------------- 1 | { 2 | "args": [ 3 | 100, 4 | 35765, 5 | 100, 6 | 85, 7 | 100, 8 | 50, 9 | 100, 10 | 3, 11 | "FastLSTM", 12 | 400, 13 | 3, 14 | 48, 15 | 500, 16 | 100 17 | ], 18 | 
"kwargs": { 19 | "p_in": 0.33, 20 | "p_out": 0.33, 21 | "biaffine": true, 22 | "pos": true, 23 | "char": true, 24 | "p_rnn": [ 25 | 0.33, 26 | 0.33 27 | ] 28 | } 29 | } -------------------------------------------------------------------------------- /models/rst/alphabets/action_label_alpha.json: -------------------------------------------------------------------------------- 1 | { 2 | "id2alpha": { 3 | "0": "purp", 4 | "1": "cont", 5 | "2": "attr", 6 | "3": "evid", 7 | "4": "comp", 8 | "5": "list", 9 | "6": "back", 10 | "7": "same", 11 | "8": "topic", 12 | "9": "mann", 13 | "10": "summ", 14 | "11": "cond", 15 | "12": "temp", 16 | "13": "eval", 17 | "14": "text", 18 | "15": "cause", 19 | "16": "prob", 20 | "17": "elab", 21 | "18": "PAD" 22 | }, 23 | "alpha2id": { 24 | "purp": 0, 25 | "cont": 1, 26 | "attr": 2, 27 | "evid": 3, 28 | "comp": 4, 29 | "elab": 17, 30 | "list": 5, 31 | "back": 6, 32 | "same": 7, 33 | "topic": 8, 34 | "summ": 10, 35 | "cond": 11, 36 | "temp": 12, 37 | "eval": 13, 38 | "text": 14, 39 | "PAD": 18, 40 | "cause": 15, 41 | "prob": 16, 42 | "mann": 9 43 | }, 44 | "alphas": [ 45 | "purp", 46 | "cont", 47 | "attr", 48 | "evid", 49 | "comp", 50 | "list", 51 | "back", 52 | "same", 53 | "topic", 54 | "mann", 55 | "summ", 56 | "cond", 57 | "temp", 58 | "eval", 59 | "text", 60 | "cause", 61 | "prob", 62 | "elab", 63 | "PAD" 64 | ] 65 | } -------------------------------------------------------------------------------- /models/rst/alphabets/etype_alpha.json: -------------------------------------------------------------------------------- 1 | { 2 | "id2alpha": { 3 | "0": "UNK", 4 | "1": "", 5 | "2": "

" 6 | }, 7 | "alpha2id": { 8 | "": 1, 9 | "

": 2, 10 | "UNK": 0 11 | }, 12 | "alphas": [ 13 | "", 14 | "

", 15 | "UNK" 16 | ] 17 | } -------------------------------------------------------------------------------- /models/rst/alphabets/gold_action_alpha.json: -------------------------------------------------------------------------------- 1 | { 2 | "id2alpha": { 3 | "0": "REDUCE_NN_temp", 4 | "1": "REDUCE_NS_purp", 5 | "2": "REDUCE_NN_text", 6 | "3": "REDUCE_SN_elab", 7 | "4": "REDUCE_NS_summ", 8 | "5": "REDUCE_NS_attr", 9 | "6": "REDUCE_NN_eval", 10 | "7": "REDUCE_NS_evid", 11 | "8": "REDUCE_SN_comp", 12 | "9": "REDUCE_NS_topic", 13 | "10": "REDUCE_SN_purp", 14 | "11": "REDUCE_NS_comp", 15 | "12": "REDUCE_NS_elab", 16 | "13": "REDUCE_SN_eval", 17 | "14": "REDUCE_NN_cause", 18 | "15": "REDUCE_NS_eval", 19 | "16": "REDUCE_NS_back", 20 | "17": "REDUCE_SN_temp", 21 | "18": "REDUCE_NS_temp", 22 | "19": "REDUCE_SN_attr", 23 | "20": "REDUCE_SN_summ", 24 | "21": "REDUCE_NS_mann", 25 | "22": "REDUCE_NS_prob", 26 | "23": "REDUCE_NN_evid", 27 | "24": "REDUCE_NN_cont", 28 | "25": "REDUCE_SN_cond", 29 | "26": "REDUCE_SN_prob", 30 | "27": "REDUCE_NS_cause", 31 | "28": "REDUCE_NN_cond", 32 | "29": "REDUCE_SN_cont", 33 | "30": "REDUCE_SN_evid", 34 | "31": "POPROOT__", 35 | "32": "REDUCE_NN_topic", 36 | "33": "REDUCE_NN_same", 37 | "34": "REDUCE_NN_list", 38 | "35": "SHIFT__", 39 | "36": "REDUCE_NS_cont", 40 | "37": "REDUCE_NS_cond", 41 | "38": "REDUCE_SN_mann", 42 | "39": "REDUCE_SN_back", 43 | "40": "REDUCE_NN_prob", 44 | "41": "REDUCE_SN_cause", 45 | "42": "REDUCE_NN_comp", 46 | "43": "PAD" 47 | }, 48 | "alpha2id": { 49 | "REDUCE_NN_temp": 0, 50 | "REDUCE_NS_purp": 1, 51 | "REDUCE_NN_text": 2, 52 | "REDUCE_SN_summ": 20, 53 | "REDUCE_SN_cont": 29, 54 | "REDUCE_NS_attr": 5, 55 | "REDUCE_NN_eval": 6, 56 | "PAD": 43, 57 | "REDUCE_NS_evid": 7, 58 | "REDUCE_SN_comp": 8, 59 | "REDUCE_SN_purp": 10, 60 | "REDUCE_NS_comp": 11, 61 | "REDUCE_NS_elab": 12, 62 | "REDUCE_SN_eval": 13, 63 | "REDUCE_NN_cause": 14, 64 | "REDUCE_NS_eval": 15, 65 | "REDUCE_NS_back": 16, 66 | "REDUCE_SN_temp": 17, 67 
| "REDUCE_NS_temp": 18, 68 | "REDUCE_SN_attr": 19, 69 | "REDUCE_SN_elab": 3, 70 | "REDUCE_NS_mann": 21, 71 | "REDUCE_NS_prob": 22, 72 | "REDUCE_NN_same": 33, 73 | "REDUCE_NN_cont": 24, 74 | "REDUCE_SN_cond": 25, 75 | "REDUCE_SN_prob": 26, 76 | "REDUCE_NS_cause": 27, 77 | "REDUCE_NN_cond": 28, 78 | "REDUCE_NS_summ": 4, 79 | "REDUCE_SN_evid": 30, 80 | "POPROOT__": 31, 81 | "REDUCE_NN_topic": 32, 82 | "REDUCE_NN_evid": 23, 83 | "REDUCE_NN_list": 34, 84 | "SHIFT__": 35, 85 | "REDUCE_NS_cont": 36, 86 | "REDUCE_NS_topic": 9, 87 | "REDUCE_SN_mann": 38, 88 | "REDUCE_SN_cause": 41, 89 | "REDUCE_SN_back": 39, 90 | "REDUCE_NN_prob": 40, 91 | "REDUCE_NS_cond": 37, 92 | "REDUCE_NN_comp": 42 93 | }, 94 | "alphas": [ 95 | "REDUCE_NN_temp", 96 | "REDUCE_NS_purp", 97 | "REDUCE_NN_text", 98 | "REDUCE_SN_elab", 99 | "REDUCE_NS_summ", 100 | "REDUCE_NS_attr", 101 | "REDUCE_NN_eval", 102 | "REDUCE_NS_evid", 103 | "REDUCE_SN_comp", 104 | "REDUCE_NS_topic", 105 | "REDUCE_SN_purp", 106 | "REDUCE_NS_comp", 107 | "REDUCE_NS_elab", 108 | "REDUCE_SN_eval", 109 | "REDUCE_NN_cause", 110 | "REDUCE_NS_eval", 111 | "REDUCE_NS_back", 112 | "REDUCE_SN_temp", 113 | "REDUCE_NS_temp", 114 | "REDUCE_SN_attr", 115 | "REDUCE_SN_summ", 116 | "REDUCE_NS_mann", 117 | "REDUCE_NS_prob", 118 | "REDUCE_NN_evid", 119 | "REDUCE_NN_cont", 120 | "REDUCE_SN_cond", 121 | "REDUCE_SN_prob", 122 | "REDUCE_NS_cause", 123 | "REDUCE_NN_cond", 124 | "REDUCE_SN_cont", 125 | "REDUCE_SN_evid", 126 | "POPROOT__", 127 | "REDUCE_NN_topic", 128 | "REDUCE_NN_same", 129 | "REDUCE_NN_list", 130 | "SHIFT__", 131 | "REDUCE_NS_cont", 132 | "REDUCE_NS_cond", 133 | "REDUCE_SN_mann", 134 | "REDUCE_SN_back", 135 | "REDUCE_NN_prob", 136 | "REDUCE_SN_cause", 137 | "REDUCE_NN_comp", 138 | "PAD" 139 | ] 140 | } -------------------------------------------------------------------------------- /models/rst/alphabets/tag_alpha.json: -------------------------------------------------------------------------------- 1 | { 2 | "id2alpha": { 3 | "0": "UNK", 
4 | "1": "", 5 | "2": "PRP$", 6 | "3": "VBG", 7 | "4": "VBD", 8 | "5": "``", 9 | "6": "VBN", 10 | "7": "POS", 11 | "8": "''", 12 | "9": "VBP", 13 | "10": "WDT", 14 | "11": "JJ", 15 | "12": "WP", 16 | "13": "VBZ", 17 | "14": "DT", 18 | "15": "#", 19 | "16": "RP", 20 | "17": "$", 21 | "18": "NN", 22 | "19": "FW", 23 | "20": ",", 24 | "21": ".", 25 | "22": "TO", 26 | "23": "PRP", 27 | "24": "RB", 28 | "25": "-LRB-", 29 | "26": ":", 30 | "27": "NNS", 31 | "28": "NNP", 32 | "29": "VB", 33 | "30": "WRB", 34 | "31": "CC", 35 | "32": "LS", 36 | "33": "PDT", 37 | "34": "RBS", 38 | "35": "RBR", 39 | "36": "CD", 40 | "37": "EX", 41 | "38": "IN", 42 | "39": "WP$", 43 | "40": "MD", 44 | "41": "NNPS", 45 | "42": "-RRB-", 46 | "43": "JJS", 47 | "44": "JJR", 48 | "45": "SYM", 49 | "46": "UH" 50 | }, 51 | "alpha2id": { 52 | "": 1, 53 | "PRP$": 2, 54 | "VBG": 3, 55 | "VBD": 4, 56 | "VBN": 6, 57 | ",": 20, 58 | "''": 8, 59 | "VBP": 9, 60 | "WDT": 10, 61 | "JJ": 11, 62 | "WP": 12, 63 | "VBZ": 13, 64 | "DT": 14, 65 | "#": 15, 66 | "RP": 16, 67 | "$": 17, 68 | "NN": 18, 69 | "FW": 19, 70 | "POS": 7, 71 | ".": 21, 72 | "TO": 22, 73 | "PRP": 23, 74 | "RB": 24, 75 | "-LRB-": 25, 76 | ":": 26, 77 | "NNS": 27, 78 | "NNP": 28, 79 | "``": 5, 80 | "WRB": 30, 81 | "CC": 31, 82 | "LS": 32, 83 | "PDT": 33, 84 | "RBS": 34, 85 | "RBR": 35, 86 | "CD": 36, 87 | "EX": 37, 88 | "IN": 38, 89 | "WP$": 39, 90 | "UNK": 0, 91 | "MD": 40, 92 | "NNPS": 41, 93 | "-RRB-": 42, 94 | "JJS": 43, 95 | "JJR": 44, 96 | "SYM": 45, 97 | "VB": 29, 98 | "UH": 46 99 | }, 100 | "alphas": [ 101 | "", 102 | "PRP$", 103 | "VBG", 104 | "VBD", 105 | "``", 106 | "VBN", 107 | "POS", 108 | "''", 109 | "VBP", 110 | "WDT", 111 | "JJ", 112 | "WP", 113 | "VBZ", 114 | "DT", 115 | "#", 116 | "RP", 117 | "$", 118 | "NN", 119 | "FW", 120 | ",", 121 | ".", 122 | "TO", 123 | "PRP", 124 | "RB", 125 | "-LRB-", 126 | ":", 127 | "NNS", 128 | "NNP", 129 | "VB", 130 | "WRB", 131 | "CC", 132 | "LS", 133 | "PDT", 134 | "RBS", 135 | "RBR", 136 | "CD", 
137 | "EX", 138 | "IN", 139 | "WP$", 140 | "MD", 141 | "NNPS", 142 | "-RRB-", 143 | "JJS", 144 | "JJR", 145 | "SYM", 146 | "UH", 147 | "UNK" 148 | ] 149 | } -------------------------------------------------------------------------------- /models/rst/config.cfg: -------------------------------------------------------------------------------- 1 | use_gpu = True 2 | use_dynamic_oracle = True 3 | flag_oracle = True 4 | word_embedding = glove 5 | word_embedding_file = /home/ffajri/Data/NeuralRST/glove.6B.200d.txt.gz 6 | train_path = /home/ffajri/Data/NeuralRST/rst.train312 7 | test_path = /home/ffajri/Data/NeuralRST/rst.test38 8 | dev_path = /home/ffajri/Data/NeuralRST/rst.dev35 9 | train_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/train.conll.dump.results 10 | test_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/test.conll.dump.results 11 | dev_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/dev.conll.dump.results 12 | model_path = /home/ffajri/Workspace/RSTExtractor/models/rst/ 13 | model_name = /home/ffajri/Workspace/RSTExtractor/models/rst/network.pt 14 | alphabet_path = /home/ffajri/Workspace/RSTExtractor/models/rst/alphabets/ 15 | max_iter = 1000 16 | word_dim = 200 17 | tag_dim = 200 18 | etype_dim = 100 19 | syntax_dim = 1200 20 | max_sent_size = 40 21 | max_edu_size = 400 22 | max_state_size = 1024 23 | hidden_size = 200 24 | freeze = True 25 | drop_prob = 0.5 26 | num_layers = 1 27 | batch_size = 4 28 | opt = adam 29 | lr = 0.001 30 | ada_eps = 1e-08 31 | momentum = 0.9 32 | beta1 = 0.9 33 | beta2 = 0.999 34 | gamma = 2e-06 35 | start_decay = 0 36 | clip = 10.0 37 | decay = 0 38 | oracle_prob = 0.66666 39 | start_dynamic_oracle = 15 40 | early_stopping = 50 41 | -------------------------------------------------------------------------------- /models/rst/network.pt: -------------------------------------------------------------------------------- 
"""Split the segmenter input files into per-worker ``<idx>.list`` files.

The files matched by ``DATA_PATH`` are cut into at most ``THREADS``
contiguous chunks of ``size`` paths each, and chunk ``idx`` is written to
``<idx>.list`` in the current directory, one path per line.
"""
import glob
import math

THREADS = 7
# The pattern has to be bound before it is handed to glob; the original
# evaluated glob.glob(DATA_PATH) one line earlier and died with NameError.
DATA_PATH = '/home/ffajri/Data/segmenter/*'
files = glob.glob(DATA_PATH)
size = int(math.ceil(1.0 * len(files) / THREADS))

allfiles = []
for i in range(THREADS):
    start = i * size
    # Clamp the last chunk to the end of the list.
    end = min(start + size, len(files))
    allfiles.append(files[start:end])
    if end == len(files):
        # Everything has been handed out; do not emit further empty chunks.
        break

for idx, chunk in enumerate(allfiles):
    # The context manager closes the list file even if a write fails.
    with open(str(idx) + '.list', 'w') as f:
        for path in chunk:
            f.write(path + '\n')
class Alphabet(object):
    """Two-way mapping between instances (words, tags, ...) and integer ids.

    ``instances`` stores the objects in insertion order and
    ``instance2index`` maps each object to its id. When ``defualt_value`` is
    true, id 0 is reserved for unknown instances and real entries start at 1
    (``offset``).

    :param name: Alphabet name, also used as the default file name on save/load.
    :param defualt_value: Reserve index 0 as the fallback id for unknown
        instances (the spelling is part of the public signature).
    :param keep_growing: When true, ``get_index`` adds unseen instances.
    :param singleton: When true, keep a set of singleton ids.
    """

    def __init__(self, name, defualt_value=False, keep_growing=True, singleton=False):
        self.__name = name

        self.instance2index = {}
        self.instances = []
        self.default_value = defualt_value
        self.offset = 1 if self.default_value else 0
        self.keep_growing = keep_growing
        self.singletons = set() if singleton else None

        # Index 0 is occupied by default, all else following.
        self.default_index = 0 if self.default_value else None

        self.next_index = self.offset

        self.logger = get_logger('Alphabet')

    def add(self, instance):
        """Register ``instance`` under the next free index if it is new."""
        if instance not in self.instance2index:
            self.instances.append(instance)
            self.instance2index[instance] = self.next_index
            self.next_index += 1

    def add_singleton(self, id):
        """Mark ``id`` as a singleton; raises RuntimeError if singletons are disabled."""
        if self.singletons is None:
            raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
        else:
            self.singletons.add(id)

    def add_singletons(self, ids):
        """Mark every id in ``ids`` as a singleton; raises RuntimeError if disabled."""
        if self.singletons is None:
            raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
        else:
            self.singletons.update(ids)

    def is_singleton(self, id):
        """Return whether ``id`` is a singleton; raises RuntimeError if disabled."""
        if self.singletons is None:
            raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
        else:
            return id in self.singletons

    def get_index(self, instance):
        """Return the id of ``instance``.

        An unknown instance is added when the alphabet is still growing;
        otherwise the default index is returned when one is reserved, and
        KeyError is raised when it is not.
        """
        try:
            return self.instance2index[instance]
        except KeyError:
            if self.keep_growing:
                index = self.next_index
                self.add(instance)
                return index
            else:
                if self.default_value:
                    return self.default_index
                else:
                    raise KeyError("instance not found: %s" % instance)

    def get_instance(self, index):
        """Return the instance stored at ``index``; raises IndexError if out of range."""
        if self.default_value and index == self.default_index:
            # First index is occupied by the wildcard element.
            return '<_UNK>'
        else:
            try:
                return self.instances[index - self.offset]
            except IndexError:
                raise IndexError('unknown index: %d' % index)

    def size(self):
        """Return the number of ids in use, including the reserved default slot."""
        return len(self.instances) + self.offset

    def singleton_size(self):
        """Return the number of singleton ids."""
        return len(self.singletons)

    def items(self):
        """Return the ``(instance, index)`` pairs of the alphabet."""
        return self.instance2index.items()

    def enumerate_items(self, start):
        """Return ``(index, instance)`` pairs from ``start`` to the end.

        Raises IndexError when ``start`` lies outside ``[offset, size())``.
        """
        if start < self.offset or start >= self.size():
            raise IndexError("Enumerate is allowed between [%d : size of the alphabet)" % self.offset)
        return zip(range(start, len(self.instances) + self.offset), self.instances[start - self.offset:])

    def close(self):
        """Stop adding unseen instances in ``get_index``."""
        self.keep_growing = False

    def open(self):
        """Allow ``get_index`` to add unseen instances again."""
        self.keep_growing = True

    def get_content(self):
        """Return the JSON-serialisable state written by ``save``."""
        if self.singletons is None:
            return {'instance2index': self.instance2index, 'instances': self.instances}
        else:
            # NOTE: the misspelt 'singletions' key is the on-disk format read
            # back by __from_json, so it has to stay as it is.
            return {'instance2index': self.instance2index, 'instances': self.instances,
                    'singletions': list(self.singletons)}

    def __from_json(self, data):
        """Restore the state produced by ``get_content``."""
        self.instances = data["instances"]
        self.instance2index = data["instance2index"]
        if 'singletions' in data:
            self.singletons = set(data['singletions'])
        else:
            self.singletons = None

    def save(self, output_directory, name=None):
        """
        Save the alphabet as ``<name>.json`` in the given directory.

        Saving is best effort: any failure is logged as a warning and not
        raised to the caller.
        :param output_directory: Directory to write to; created if missing.
        :param name: The alphabet saving name, optional.
        :return:
        """
        saving_name = name if name else self.__name
        try:
            if not os.path.exists(output_directory):
                os.makedirs(output_directory)

            # Use a context manager so the file is flushed and closed
            # deterministically instead of whenever the handle is collected.
            with open(os.path.join(output_directory, saving_name + ".json"), 'w') as out_file:
                json.dump(self.get_content(), out_file, indent=4)
        except Exception as e:
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning("Alphabet is not saved: %s" % repr(e))

    def load(self, input_directory, name=None):
        """
        Load the alphabet from ``<name>.json`` in the given directory.

        After loading, the alphabet is closed (``keep_growing`` is False).
        :param input_directory: Directory the alphabet was saved to.
        :param name: The alphabet loading name, optional.
        :return:
        """
        loading_name = name if name else self.__name
        with open(os.path.join(input_directory, loading_name + ".json")) as in_file:
            self.__from_json(json.load(in_file))
        self.next_index = len(self.instances) + self.offset
        self.keep_growing = False
def get_logger(name, level=logging.INFO, handler=sys.stdout,
               formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'):
    """Return the logger called ``name`` with a stream handler attached.

    The function is safe to call repeatedly for the same name and stream:
    the handler already bound to that stream is reused, so messages are not
    emitted once per call.

    :param name: Logger name passed to ``logging.getLogger``.
    :param level: Threshold applied to the stream handler.
    :param handler: Stream the handler writes to (defaults to stdout).
    :param formatter: ``logging.Formatter`` format string.
    :return: The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    # The logger itself used to be pinned to INFO, which silently dropped
    # DEBUG records even when level=logging.DEBUG was requested. Keep INFO
    # as the default threshold and only lower it for a more verbose level.
    if isinstance(level, int) and level < logging.INFO:
        logger.setLevel(level)
    else:
        logger.setLevel(logging.INFO)
    log_format = logging.Formatter(formatter)

    # Reuse a handler that already writes to this stream instead of stacking
    # a new one on every call.
    stream_handler = None
    for existing in logger.handlers:
        if isinstance(existing, logging.StreamHandler) and getattr(existing, 'stream', None) is handler:
            stream_handler = existing
            break
    if stream_handler is None:
        stream_handler = logging.StreamHandler(handler)
        logger.addHandler(stream_handler)

    stream_handler.setLevel(level)
    stream_handler.setFormatter(log_format)

    return logger
23 | while len(line) > 0 and len(line.strip()) == 0: 24 | line = self.__source_file.readline() 25 | if len(line) == 0: 26 | return None 27 | 28 | lines = [] 29 | while len(line.strip()) > 0: 30 | line = line.strip() 31 | line = line.decode('utf-8') 32 | lines.append(line.split('\t')) 33 | line = self.__source_file.readline() 34 | 35 | length = len(lines) 36 | if length == 0: 37 | return None 38 | 39 | words = [] 40 | word_ids = [] 41 | char_seqs = [] 42 | char_id_seqs = [] 43 | postags = [] 44 | pos_ids = [] 45 | types = [] 46 | type_ids = [] 47 | heads = [] 48 | 49 | if symbolic_root: 50 | words.append(ROOT) 51 | word_ids.append(self.__word_alphabet.get_index(ROOT)) 52 | char_seqs.append([ROOT_CHAR, ]) 53 | char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ]) 54 | postags.append(ROOT_POS) 55 | pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS)) 56 | types.append(ROOT_TYPE) 57 | type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE)) 58 | heads.append(0) 59 | 60 | for tokens in lines: 61 | chars = [] 62 | char_ids = [] 63 | for char in tokens[1]: 64 | chars.append(char) 65 | char_ids.append(self.__char_alphabet.get_index(char)) 66 | if len(chars) > utils.MAX_CHAR_LENGTH: 67 | chars = chars[:utils.MAX_CHAR_LENGTH] 68 | char_ids = char_ids[:utils.MAX_CHAR_LENGTH] 69 | char_seqs.append(chars) 70 | char_id_seqs.append(char_ids) 71 | 72 | word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1] 73 | pos = tokens[4] 74 | head = int(tokens[6]) 75 | type = tokens[7] 76 | 77 | words.append(word) 78 | word_ids.append(self.__word_alphabet.get_index(word)) 79 | 80 | postags.append(pos) 81 | pos_ids.append(self.__pos_alphabet.get_index(pos)) 82 | 83 | types.append(type) 84 | type_ids.append(self.__type_alphabet.get_index(type)) 85 | 86 | heads.append(head) 87 | 88 | if symbolic_end: 89 | words.append(END) 90 | word_ids.append(self.__word_alphabet.get_index(END)) 91 | char_seqs.append([END_CHAR, ]) 92 | 
char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ]) 93 | postags.append(END_POS) 94 | pos_ids.append(self.__pos_alphabet.get_index(END_POS)) 95 | types.append(END_TYPE) 96 | type_ids.append(self.__type_alphabet.get_index(END_TYPE)) 97 | heads.append(0) 98 | 99 | return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, heads, types, type_ids) 100 | 101 | def getNextForTest(self, normalize_digits=True, symbolic_root=False, symbolic_end=False): 102 | line = self.__source_file.readline() 103 | # skip multiple blank lines. 104 | while len(line) > 0 and len(line.strip()) == 0: 105 | line = self.__source_file.readline() 106 | if len(line) == 0: 107 | return None 108 | 109 | lines = [] 110 | while len(line.strip()) > 0: 111 | line = line.strip() 112 | line = line.decode('utf-8') 113 | lines.append(line.split('\t')) 114 | line = self.__source_file.readline() 115 | 116 | length = len(lines) 117 | if length == 0: 118 | return None 119 | 120 | words = [] 121 | word_ids = [] 122 | char_seqs = [] 123 | char_id_seqs = [] 124 | postags = [] 125 | pos_ids = [] 126 | 127 | if symbolic_root: 128 | words.append(ROOT) 129 | word_ids.append(self.__word_alphabet.get_index(ROOT)) 130 | char_seqs.append([ROOT_CHAR, ]) 131 | char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ]) 132 | postags.append(ROOT_POS) 133 | pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS)) 134 | 135 | for tokens in lines: 136 | chars = [] 137 | char_ids = [] 138 | for char in tokens[1]: 139 | chars.append(char) 140 | char_ids.append(self.__char_alphabet.get_index(char)) 141 | if len(chars) > utils.MAX_CHAR_LENGTH: 142 | chars = chars[:utils.MAX_CHAR_LENGTH] 143 | char_ids = char_ids[:utils.MAX_CHAR_LENGTH] 144 | char_seqs.append(chars) 145 | char_id_seqs.append(char_ids) 146 | 147 | word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1] 148 | pos = tokens[4] 149 | if pos == '_': 150 | pos = tokens[3] 151 | if pos == '#': 
class CoNLL03Reader(object):
    """Sentence-by-sentence reader for space-separated CoNLL-03 style files.

    Each non-blank line is one token; column 1 is the word, column 2 the POS
    tag, column 3 the chunk tag and column 4 the NER tag. Sentences are
    separated by blank lines.
    """

    def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
        self.__word_alphabet = word_alphabet
        self.__char_alphabet = char_alphabet
        self.__pos_alphabet = pos_alphabet
        self.__chunk_alphabet = chunk_alphabet
        self.__ner_alphabet = ner_alphabet
        self.__source_file = open(file_path, 'r')

    def close(self):
        """Close the underlying file."""
        self.__source_file.close()

    def getNext(self, normalize_digits=True):
        """Read the next sentence and return it as an ``NERInstance``.

        Returns ``None`` once the end of the file is reached. With
        ``normalize_digits`` the word form is passed through
        ``utils.DIGIT_RE`` with "0" as the replacement; the character
        sequence is always taken from the raw token.
        """
        source = self.__source_file
        raw = source.readline()
        # Step over any run of blank lines in front of the sentence.
        while raw and not raw.strip():
            raw = source.readline()
        if not raw:
            return None

        # Collect the token rows up to the next blank line (or end of file).
        rows = []
        while raw.strip():
            rows.append(raw.strip().decode('utf-8').split(' '))
            raw = source.readline()
        if not rows:
            return None

        words, word_ids = [], []
        char_seqs, char_id_seqs = [], []
        postags, pos_ids = [], []
        chunk_tags, chunk_ids = [], []
        ner_tags, ner_ids = [], []

        limit = utils.MAX_CHAR_LENGTH
        for fields in rows:
            token = fields[1]
            # Every character is looked up before truncation, so the
            # character alphabet sees the full token.
            symbols = list(token)
            symbol_ids = [self.__char_alphabet.get_index(symbol) for symbol in token]
            char_seqs.append(symbols[:limit])
            char_id_seqs.append(symbol_ids[:limit])

            if normalize_digits:
                word = utils.DIGIT_RE.sub(b"0", token)
            else:
                word = token
            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            pos_tag = fields[2]
            postags.append(pos_tag)
            pos_ids.append(self.__pos_alphabet.get_index(pos_tag))

            chunk_tag = fields[3]
            chunk_tags.append(chunk_tag)
            chunk_ids.append(self.__chunk_alphabet.get_index(chunk_tag))

            ner_tag = fields[4]
            ner_tags.append(ner_tag)
            ner_ids.append(self.__ner_alphabet.get_index(ner_tag))

        sentence = Sentence(words, word_ids, char_seqs, char_id_seqs)
        return NERInstance(sentence, postags, pos_ids, chunk_tags, chunk_ids,
                           ner_tags, ner_ids)
class CoNLL03Writer(object):
    """Writes batches of tagged sentences as space-separated CoNLL-03 rows.

    Each token row holds the 1-based position, word, POS tag, chunk tag,
    target NER tag and predicted NER tag; a blank line follows every
    sentence. ``start`` must be called before ``write``.
    """

    def __init__(self, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
        self.__word_alphabet = word_alphabet
        self.__char_alphabet = char_alphabet
        self.__pos_alphabet = pos_alphabet
        self.__chunk_alphabet = chunk_alphabet
        self.__ner_alphabet = ner_alphabet
        # No output file until start() is called.
        self.__source_file = None

    def start(self, file_path):
        """Open ``file_path`` for writing."""
        self.__source_file = open(file_path, 'w')

    def close(self):
        """Close the output file."""
        self.__source_file.close()

    def write(self, word, pos, chunk, predictions, targets, lengths):
        """Write one batch of index arrays, decoded through the alphabets.

        ``word`` must expose a two-element ``shape`` and, like the other
        arrays, support ``[row, col]`` indexing; ``lengths[row]`` gives the
        number of tokens written for that row.
        """
        sentence_count, _ = word.shape
        for row in range(sentence_count):
            for col in range(lengths[row]):
                # Lookup order: word, POS, chunk, gold NER, predicted NER.
                columns = (
                    self.__word_alphabet.get_instance(word[row, col]).encode('utf-8'),
                    self.__pos_alphabet.get_instance(pos[row, col]).encode('utf-8'),
                    self.__chunk_alphabet.get_instance(chunk[row, col]).encode('utf-8'),
                    self.__ner_alphabet.get_instance(targets[row, col]).encode('utf-8'),
                    self.__ner_alphabet.get_instance(predictions[row, col]).encode('utf-8'),
                )
                self.__source_file.write('%d %s %s %s %s %s\n' % ((col + 1,) + columns))
            # Blank line terminates the sentence.
            self.__source_file.write('\n')
def logdet(x):
    """
    Compute the log determinant of a matrix through its Cholesky factor.

    Args:
        x: 2D symmetric positive definite matrix. (A Cholesky factorization
            is used, so a singular matrix has no finite result.)

    Returns: log determinant of x

    """
    # Use the supported Cholesky entry point when this PyTorch build has it
    # and fall back to the legacy Tensor.potrf otherwise. Both factors share
    # the same diagonal, which is all that is needed here.
    if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'cholesky'):
        chol = torch.linalg.cholesky(x)
    else:
        chol = x.potrf()
    # det(x) = prod(diag(chol)) ** 2  =>  logdet(x) = 2 * sum(log(diag(chol)))
    return torch.sum(torch.log(chol.diag())) * 2
def MaskedRecurrent(reverse=False):
    """
    Build a function that unrolls a recurrent cell over a masked sequence.

    Args:
        reverse: bool
            if True the time steps are visited from the last to the first.

    Returns: a function forward(input, hidden, cell, mask) -> (hidden, output)
        input is sliced along dimension 0 (one slice per time step) and cell
        is called as cell(input[t], hidden). mask, when not None, is sliced
        the same way. output holds the per-step hidden states in the original
        time order; hidden is the state after the last visited step.

    """

    def _visible(state):
        # hack to handle LSTM: only the h part of an (h, c) state is emitted
        return state[0] if isinstance(state, tuple) else state

    def forward(input, hidden, cell, mask):
        length = input.size(0)
        order = range(length - 1, -1, -1) if reverse else range(length)
        # Slots are filled by time index, so the result is already in the
        # original order for both directions.
        per_step = [None] * length
        for t in order:
            step_mask = None if mask is None else mask[t]
            if step_mask is None or step_mask.data.min() > 0.5:
                # no padded entry at this step: plain update
                hidden = cell(input[t], hidden)
            elif step_mask.data.max() > 0.5:
                # mixed step: move towards the new state only where mask is on
                candidate = cell(input[t], hidden)
                if isinstance(hidden, tuple):
                    (h_old, c_old), (h_new, c_new) = hidden, candidate
                    hidden = (h_old + (h_new - h_old) * step_mask,
                              c_old + (c_new - c_old) * step_mask)
                else:
                    hidden = hidden + (candidate - hidden) * step_mask
            # fully masked step: the state is carried over unchanged
            per_step[t] = _visible(hidden)

        return hidden, torch.stack(per_step, 0)

    return forward
def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
    """
    Build a function that pushes a single time step through a stack of layers.

    Args:
        layer: callable
            called as layer(input, hidden, cell, mask) and expected to return
            (next_hidden, output) for one layer.
        num_layers: int
            number of stacked layers.
        lstm: bool
            if True hidden is a pair (h, c) which is regrouped per layer.
        dropout: float
            dropout probability applied between layers (not after the last).
        train: bool
            passed to F.dropout as the training flag.

    Returns: a function forward(input, hidden, cells, mask) -> (next_hidden, output)

    """
    last_layer = num_layers - 1
    use_dropout = dropout != 0

    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)
        # For LSTM turn (h_all, c_all) into one (h, c) pair per layer.
        states = list(zip(*hidden)) if lstm else hidden

        collected = []
        signal = input
        for depth in range(num_layers):
            state, signal = layer(signal, states[depth], cells[depth], mask)
            collected.append(state)
            if use_dropout and depth < last_layer:
                signal = F.dropout(signal, p=dropout, training=train, inplace=False)

        if not lstm:
            return torch.stack(collected, 0), signal

        # Regroup the per-layer (h, c) pairs into two stacked tensors.
        all_h, all_c = zip(*collected)
        return (torch.stack(all_h, 0), torch.stack(all_c, 0)), signal

    return forward
def SkipConnectRNNReLUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None, noise_skip=None):
    """
    One step of a ReLU RNN cell whose recurrent input is the previous hidden
    state concatenated with a skip-connected hidden state.

    Args:
        input: input of the current step
        hidden: previous hidden state
        hidden_skip: hidden state reached through the skip connection; it is
            concatenated to hidden along dimension 1
        w_ih: input-to-hidden weight (applied with F.linear)
        w_hh: weight applied to the concatenated hidden state
        b_ih: optional input-to-hidden bias
        b_hh: optional hidden-to-hidden bias
        noise_in: optional multiplicative noise for input
        noise_hidden: optional multiplicative noise for the concatenated
            hidden state
        noise_skip: accepted but not used by this cell

    Returns: the new hidden state

    """
    step_input = input if noise_in is None else input * noise_in

    joined = torch.cat([hidden, hidden_skip], dim=1)
    if noise_hidden is not None:
        joined = joined * noise_hidden

    input_term = F.linear(step_input, w_ih, b_ih)
    recurrent_term = F.linear(joined, w_hh, b_hh)
    return F.relu(input_term + recurrent_term)
def SkipConnectGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """
    One step of a GRU cell whose recurrent input is the previous hidden state
    concatenated with a skip-connected hidden state.

    The three gates (reset, input, new) are computed with one batched matrix
    product each for the input and the hidden side, so every gate can use its
    own noise mask.

    Args:
        input: input of the current step
        hidden: previous hidden state
        hidden_skip: hidden state reached through the skip connection; it is
            concatenated to hidden along dimension 1
        w_ih: three stacked input-to-hidden weights, one per gate
        w_hh: three stacked weights for the concatenated hidden state
        b_ih: optional stacked input-to-hidden biases, one row per gate
        b_hh: optional stacked hidden-to-hidden biases, one row per gate
        noise_in: optional per-gate multiplicative noise for input
        noise_hidden: optional per-gate multiplicative noise for the
            concatenated hidden state

    Returns: the new hidden state

    """

    def _gates(bias, batch, weight):
        # The biases default to None, so they must not be dereferenced
        # unconditionally; without a bias the plain batched product is used.
        if bias is None:
            return torch.bmm(batch, weight)
        return torch.baddbmm(bias.unsqueeze(1), batch, weight)

    # One copy of the input / hidden state per gate.
    input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
    hx = torch.cat([hidden, hidden_skip], dim=1)
    hx = hx.expand(3, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden

    i_r, i_i, i_n = _gates(b_ih, input, w_ih)
    h_r, h_i, h_n = _gates(b_hh, hx, w_hh)

    # torch.sigmoid / torch.tanh compute the same values as the F.* aliases,
    # which emit deprecation warnings on several PyTorch releases.
    resetgate = torch.sigmoid(i_r + h_r)
    inputgate = torch.sigmoid(i_i + h_i)
    newgate = torch.tanh(i_n + resetgate * h_n)
    # Interpolate between the candidate and the (non-concatenated) old state.
    hy = newgate + inputgate * (hidden - newgate)

    return hy
noise_in 106 | 107 | hx = torch.cat([hidden, hidden_skip], dim=1) 108 | if noise_hidden is not None: 109 | hx = hx * noise_hidden 110 | 111 | if input.is_cuda: 112 | gi = F.linear(input, w_ih) 113 | gh = F.linear(hx, w_hh) 114 | state = fusedBackend.GRUFused.apply 115 | return state(gi, gh, hidden) if b_ih is None else state(gi, gh, hidden, b_ih, b_hh) 116 | 117 | gi = F.linear(input, w_ih, b_ih) 118 | gh = F.linear(hx, w_hh, b_hh) 119 | i_r, i_i, i_n = gi.chunk(3, 1) 120 | h_r, h_i, h_n = gh.chunk(3, 1) 121 | 122 | resetgate = F.sigmoid(i_r + h_r) 123 | inputgate = F.sigmoid(i_i + h_i) 124 | newgate = F.tanh(i_n + resetgate * h_n) 125 | hy = newgate + inputgate * (hidden - newgate) 126 | 127 | return hy 128 | 129 | 130 | def SkipConnectRecurrent(reverse=False): 131 | def forward(input, skip_connect, hidden, cell, mask): 132 | # hack to handle LSTM 133 | h0 = hidden[0] if isinstance(hidden, tuple) else hidden 134 | # [length + 1, batch, hidden_size] 135 | output = Variable(input.data.new(input.size(0) + 1, *h0.size()).zero_()) + h0 136 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) 137 | # create batch index 138 | batch_index = torch.arange(0, h0.size(0)).type_as(skip_connect) 139 | for i in steps: 140 | if mask is None or mask[i].data.min() > 0.5: 141 | hidden_skip = output[skip_connect[i], batch_index] 142 | hidden = cell(input[i], hidden, hidden_skip) 143 | elif mask[i].data.max() > 0.5: 144 | hidden_skip = output[skip_connect[i], batch_index] 145 | hidden_next = cell(input[i], hidden, hidden_skip) 146 | # hack to handle LSTM 147 | if isinstance(hidden, tuple): 148 | hx, cx = hidden 149 | hp1, cp1 = hidden_next 150 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) 151 | else: 152 | hidden = hidden + (hidden_next - hidden) * mask[i] 153 | # hack to handle LSTM 154 | if reverse: 155 | output[i] = hidden[0] if isinstance(hidden, tuple) else hidden 156 | else: 157 | output[i + 1] = hidden[0] if isinstance(hidden, 
def StackedRNN(inners, num_layers, lstm=False):
    """
    Build a function that runs a stack of (possibly bidirectional)
    skip-connect recurrent layers over a whole sequence.

    Args:
        inners: sequence of callables, one per direction, each called as
            inner(input, skip_connect, hidden, cell, mask) and expected to
            return (final_hidden, output).
        num_layers: int
            number of stacked layers.
        lstm: bool
            if True hidden is a pair (h, c) which is regrouped per layer.

    Returns: a function
        forward(input, skip_connect, hidden, cells, mask) -> (next_hidden, output)

    """
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def forward(input, skip_connect, hidden, cells, mask):
        assert (len(cells) == total_layers)

        # TODO reverse skip connection for bidirectional rnn.
        # For now the second direction receives the skip connections as-is.
        backward_skip = skip_connect if num_directions == 2 else None

        # For LSTM turn (h_all, c_all) into one (h, c) pair per layer.
        states = list(zip(*hidden)) if lstm else hidden

        finals = []
        features = input
        for layer_idx in range(num_layers):
            offset = layer_idx * num_directions
            direction_outputs = []
            for direction, inner in enumerate(inners):
                slot = offset + direction
                links = skip_connect if direction == 0 else backward_skip
                final, out = inner(features, links, states[slot], cells[slot], mask)
                finals.append(final)
                direction_outputs.append(out)

            # The outputs of all directions are joined on the feature axis
            # and fed to the next layer.
            features = torch.cat(direction_outputs, features.dim() - 1)

        if lstm:
            final_h, final_c = zip(*finals)
            return (torch.stack(final_h, 0), torch.stack(final_c, 0)), features
        return torch.stack(finals, 0), features

    return forward
def SkipConnectStep():
    """
    Build a function that applies a skip-connect cell to one masked time step.

    Returns: a function
        forward(input, hidden, hidden_skip, cell, mask) -> (hidden, output)
        cell is called as cell(input, hidden, hidden_skip). output is the new
        hidden state, or its first element when the state is a tuple (LSTM).

    """

    def _blend(previous, proposed, gate):
        # keep `previous` where gate is 0, take `proposed` where gate is 1
        return previous + (proposed - previous) * gate

    def _emit(state):
        # hack to handle LSTM: only the h part of an (h, c) state is emitted
        return state, (state[0] if isinstance(state, tuple) else state)

    def forward(input, hidden, hidden_skip, cell, mask):
        # No mask, or no padded entry: plain update.
        if mask is None or mask.data.min() > 0.5:
            return _emit(cell(input, hidden, hidden_skip))
        # Everything is padded: the state is carried over unchanged.
        if not (mask.data.max() > 0.5):
            return _emit(hidden)

        # Mixed step: update only the entries whose mask is on.
        proposal = cell(input, hidden, hidden_skip)
        if not isinstance(hidden, tuple):
            return _emit(_blend(hidden, proposal, mask))
        (h_prev, c_prev), (h_new, c_new) = hidden, proposal
        return _emit((_blend(h_prev, h_new, mask), _blend(c_prev, c_new, mask)))

    return forward
def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """
    One step of a tanh RNN cell with optional multiplicative noise on the
    input and on the hidden state.

    Args:
        input: input of the current step
        hidden: previous hidden state
        w_ih: input-to-hidden weight (applied with F.linear)
        w_hh: hidden-to-hidden weight (applied with F.linear)
        b_ih: optional input-to-hidden bias
        b_hh: optional hidden-to-hidden bias
        noise_in: optional multiplicative noise for input
        noise_hidden: optional multiplicative noise for hidden

    Returns: the new hidden state

    """
    step_input = input if noise_in is None else input * noise_in
    previous = hidden if noise_hidden is None else hidden * noise_hidden

    input_term = F.linear(step_input, w_ih, b_ih)
    recurrent_term = F.linear(previous, w_hh, b_hh)
    return F.tanh(input_term + recurrent_term)
def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """
    One step of a GRU cell with optional per-gate multiplicative noise.

    The three gates (reset, input, new) are computed with one batched matrix
    product each for the input and the hidden side, so every gate can use its
    own noise mask.

    Args:
        input: input of the current step
        hidden: previous hidden state
        w_ih: three stacked input-to-hidden weights, one per gate
        w_hh: three stacked hidden-to-hidden weights, one per gate
        b_ih: optional stacked input-to-hidden biases, one row per gate
        b_hh: optional stacked hidden-to-hidden biases, one row per gate
        noise_in: optional per-gate multiplicative noise for input
        noise_hidden: optional per-gate multiplicative noise for hidden

    Returns: the new hidden state

    """
    # One copy of the input / hidden state per gate.
    input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
    hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden

    # The biases default to None, so they are only added when provided;
    # without a bias the plain batched product is used.
    if b_ih is None:
        gi = torch.bmm(input, w_ih)
    else:
        gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
    if b_hh is None:
        gh = torch.bmm(hx, w_hh)
    else:
        gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
    i_r, i_i, i_n = gi
    h_r, h_i, h_n = gh

    # torch.sigmoid / torch.tanh compute the same values as the F.* aliases,
    # which emit deprecation warnings on several PyTorch releases.
    resetgate = torch.sigmoid(i_r + h_r)
    inputgate = torch.sigmoid(i_i + h_i)
    newgate = torch.tanh(i_n + resetgate * h_n)
    # Interpolate between the candidate and the (noise free) old state.
    hy = newgate + inputgate * (hidden - newgate)

    return hy
def VarMaskedRecurrent(reverse=False):
    """
    Build a function that unrolls a recurrent cell over a masked sequence.

    Args:
        reverse: bool
            if True the time steps are visited from the last to the first.

    Returns: a function forward(input, hidden, cell, mask) -> (hidden, output)
        input is sliced along dimension 0 and cell is called as
        cell(input[t], hidden). output holds the hidden state of every step
        in the original time order; hidden is the state after the last
        visited step.

    """

    def forward(input, hidden, cell, mask):
        num_steps = input.size(0)
        emitted = []
        for step in (range(num_steps - 1, -1, -1) if reverse else range(num_steps)):
            if mask is not None and not (mask[step].data.min() > 0.5):
                # At least one entry of this step is padded.
                if mask[step].data.max() > 0.5:
                    # Some entries are real: update only those.
                    keep = mask[step]
                    proposal = cell(input[step], hidden)
                    # hack to handle LSTM
                    if isinstance(hidden, tuple):
                        h_prev, c_prev = hidden
                        h_new, c_new = proposal
                        hidden = (h_prev + (h_new - h_prev) * keep, c_prev + (c_new - c_prev) * keep)
                    else:
                        hidden = hidden + (proposal - hidden) * keep
                # otherwise the whole step is padding and the state is kept
            else:
                hidden = cell(input[step], hidden)
            # hack to handle LSTM
            emitted.append(hidden[0] if isinstance(hidden, tuple) else hidden)

        # Restore the original time order after a backward pass.
        ordered = emitted[::-1] if reverse else emitted
        return hidden, torch.stack(ordered, 0)

    return forward
def VarMaskedStep():
    """
    Build a function that applies a recurrent cell to one masked time step.

    Returns: a function forward(input, hidden, cell, mask) -> (hidden, output)
        cell is called as cell(input, hidden). output is the new hidden
        state, or its first element when the state is a tuple (LSTM).

    """

    def forward(input, hidden, cell, mask):
        all_valid = mask is None or mask.data.min() > 0.5
        if all_valid:
            # No mask, or no padded entry: plain update.
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            # Mixed step: move towards the new state only where mask is on.
            updated = cell(input, hidden)
            # hack to handle LSTM
            if isinstance(hidden, tuple):
                (h_old, c_old), (h_upd, c_upd) = hidden, updated
                hidden = (h_old + (h_upd - h_old) * mask, c_old + (c_upd - c_old) * mask)
            else:
                hidden = hidden + (updated - hidden) * mask
        # A fully padded step leaves the state untouched.

        # hack to handle LSTM
        if isinstance(hidden, tuple):
            return hidden, hidden[0]
        return hidden, hidden

    return forward
def assign_tensor(tensor, val):
    """
    copy val to tensor
    Args:
        tensor: an n-dimensional torch.Tensor or autograd.Variable
        val: an n-dimensional torch.Tensor to fill the tensor with

    Returns: the tensor that was passed in, now holding the values of val

    """
    # Write through .data for Variables so the copy is not tracked by
    # autograd. This is done without recursion on purpose: on PyTorch
    # releases where Tensor and Variable are merged, tensor.data is itself
    # reported as a Variable, so a recursive call would never terminate.
    target = tensor.data if isinstance(tensor, Variable) else tensor
    target.copy_(val)
    return tensor
class BiAAttention(nn.Module):
    '''
    Bi-Affine attention layer.

    For every label, scores each (decoder position, encoder position) pair as the sum of a
    decoder term ``W_d * input_d``, an encoder term ``W_e * input_e``, a bias ``b`` and,
    when ``biaffine`` is True, the bilinear term ``input_d^T * U * input_e``.
    '''

    def __init__(self, input_size_encoder, input_size_decoder, num_labels, biaffine=True, **kwargs):
        '''

        Args:
            input_size_encoder: int
                the dimension of the encoder input.
            input_size_decoder: int
                the dimension of the decoder input.
            num_labels: int
                the number of labels of the crf layer
            biaffine: bool
                if apply bi-affine parameter.
            **kwargs:
                accepted for signature compatibility; not used by this layer.
        '''
        super(BiAAttention, self).__init__()
        self.input_size_encoder = input_size_encoder
        self.input_size_decoder = input_size_decoder
        self.num_labels = num_labels
        self.biaffine = biaffine

        self.W_d = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder))
        self.W_e = Parameter(torch.Tensor(self.num_labels, self.input_size_encoder))
        self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
        if self.biaffine:
            self.U = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder, self.input_size_encoder))
        else:
            self.register_parameter('U', None)

        self.reset_parameters()

    def reset_parameters(self):
        '''Xavier-initialise the weight matrices and zero the bias.'''
        nn.init.xavier_uniform(self.W_d)
        nn.init.xavier_uniform(self.W_e)
        nn.init.constant(self.b, 0.)
        if self.biaffine:
            nn.init.xavier_uniform(self.U)

    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        '''

        Args:
            input_d: Tensor
                the decoder input tensor with shape = [batch, length_decoder, input_size]
            input_e: Tensor
                the child input tensor with shape = [batch, length_encoder, input_size]
            mask_d: Tensor or None
                the mask tensor for decoder with shape = [batch, length_decoder]
            mask_e: Tensor or None
                the mask tensor for encoder with shape = [batch, length_encoder]

        Returns: Tensor
            the energy tensor with shape = [batch, num_label, length_decoder, length_encoder]

        '''
        assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.'
        # The unpacking also enforces that both inputs are 3-dimensional.
        batch, length_decoder, _ = input_d.size()
        _, length_encoder, _ = input_e.size()

        # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
        # the output shape is [batch, num_label, length_decoder], then --> [batch, num_label, length_decoder, 1]
        out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
        # compute encoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
        # the output shape is [batch, num_label, length_encoder], then --> [batch, num_label, 1, length_encoder]
        out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)

        # output shape [batch, num_label, length_decoder, length_encoder]
        if self.biaffine:
            # compute bi-affine part
            # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
            # output shape [batch, num_label, length_decoder, input_size_encoder]
            output = torch.matmul(input_d.unsqueeze(1), self.U)
            # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
            # output shape [batch, num_label, length_decoder, length_encoder]
            output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))

            output = output + out_d + out_e + self.b
        else:
            # Fixed: this branch used to add out_d twice, dropping the encoder term and
            # producing a [batch, num_label, length_decoder, 1] tensor.
            output = out_d + out_e + self.b

        # Apply each mask independently so that passing only one of them neither crashes
        # (mask_e is None) nor is silently ignored (mask_d is None).
        if mask_d is not None:
            output = output * mask_d.unsqueeze(1).unsqueeze(3)
        if mask_e is not None:
            output = output * mask_e.unsqueeze(1).unsqueeze(2)

        return output
139 | 140 | def forward(self, input_d, input_e, mask_d=None, mask_e=None): 141 | ''' 142 | 143 | Args: 144 | input_d: Tensor 145 | the decoder input tensor with shape = [batch, length_decoder, input_size] 146 | input_e: Tensor 147 | the child input tensor with shape = [batch, length_encoder, input_size] 148 | mask_d: Tensor or None 149 | the mask tensor for decoder with shape = [batch, length_decoder] 150 | mask_e: Tensor or None 151 | the mask tensor for encoder with shape = [batch, length_encoder] 152 | 153 | Returns: Tensor 154 | the energy tensor with shape = [batch, num_label, length, length] 155 | 156 | ''' 157 | assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.' 158 | batch, length_decoder, _ = input_d.size() 159 | _, length_encoder, _ = input_e.size() 160 | 161 | # compute decoder part: [batch, length_decoder, input_size_decoder] * [input_size_decoder, hidden_size] 162 | # the output shape is [batch, length_decoder, hidden_size] 163 | # then --> [batch, 1, length_decoder, hidden_size] 164 | out_d = torch.matmul(input_d, self.W_d).unsqueeze(1) 165 | # compute decoder part: [batch, length_encoder, input_size_encoder] * [input_size_encoder, hidden_size] 166 | # the output shape is [batch, length_encoder, hidden_size] 167 | # then --> [batch, length_encoder, 1, hidden_size] 168 | out_e = torch.matmul(input_e, self.W_e).unsqueeze(2) 169 | 170 | # add them together [batch, length_encoder, length_decoder, hidden_size] 171 | out = F.tanh(out_d + out_e + self.b) 172 | 173 | # product with v 174 | # [batch, length_encoder, length_decoder, hidden_size] * [hidden, num_label] 175 | # [batch, length_encoder, length_decoder, num_labels] 176 | # then --> [batch, num_labels, length_decoder, length_encoder] 177 | return torch.matmul(out, self.v).transpose(1, 3) 178 | -------------------------------------------------------------------------------- /neuronlp2/nn/modules/linear.py: 
class BiLinear(nn.Module):
    '''
    Bi-linear layer

    Computes ``left^T U right + W_l left + W_r right + bias`` for every pair of
    corresponding left/right vectors.
    '''
    def __init__(self, left_features, right_features, out_features, bias=True):
        '''

        Args:
            left_features: size of left input
            right_features: size of right input
            out_features: size of output
            bias: If set to False, the layer will not learn an additive bias.
                Default: True
        '''
        super(BiLinear, self).__init__()
        self.left_features = left_features
        self.right_features = right_features
        self.out_features = out_features

        self.U = Parameter(torch.Tensor(self.out_features, self.left_features, self.right_features))
        self.W_l = Parameter(torch.Tensor(self.out_features, self.left_features))
        # Fixed: W_r multiplies the right input, so its second dimension must be
        # right_features (it was left_features, which broke forward() whenever the two
        # feature sizes differed).
        self.W_r = Parameter(torch.Tensor(self.out_features, self.right_features))

        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        '''Xavier-initialise the weights and zero the bias (when there is one).'''
        nn.init.xavier_uniform(self.W_l)
        nn.init.xavier_uniform(self.W_r)
        # bias is None when the layer was built with bias=False.
        if self.bias is not None:
            nn.init.constant(self.bias, 0.)
        nn.init.xavier_uniform(self.U)

    def forward(self, input_left, input_right):
        '''

        Args:
            input_left: Tensor
                the left input tensor with shape = [batch1, batch2, ..., left_features]
            input_right: Tensor
                the right input tensor with shape = [batch1, batch2, ..., right_features]

        Returns: Tensor
            the output tensor with shape = [batch1, batch2, ..., out_features]

        '''

        left_size = input_left.size()
        right_size = input_right.size()
        assert left_size[:-1] == right_size[:-1], \
            "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
        batch = int(np.prod(left_size[:-1]))

        # convert left and right input to matrices [batch, left_features], [batch, right_features]
        input_left = input_left.view(batch, self.left_features)
        input_right = input_right.view(batch, self.right_features)

        # output [batch, out_features]
        output = F.bilinear(input_left, input_right, self.U, self.bias)
        output = output + F.linear(input_left, self.W_l, None) + F.linear(input_right, self.W_r, None)
        # convert back to [batch1, batch2, ..., out_features]
        return output.view(left_size[:-1] + (self.out_features, ))

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + 'in1_features=' + str(self.left_features) \
            + ', in2_features=' + str(self.right_features) \
            + ', out_features=' + str(self.out_features) + ')'
14 | This module is often used to store word embeddings and retrieve them using indices. 15 | The input to the module is a list of indices, and the output is the corresponding 16 | word embeddings. 17 | Args: 18 | num_embeddings (int): size of the dictionary of embeddings 19 | embedding_dim (int): the size of each embedding vector 20 | init_embedding (Tensor or Variable): If given, the embedding will be initialized with the given tensor. 21 | freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process. 22 | padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index. 23 | max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this 24 | norm_type (float, optional): The p of the p-norm to compute for the max_norm option 25 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of 26 | the words in the mini-batch. 27 | sparse (boolean, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for 28 | more details regarding sparse gradients. 
29 | Attributes: 30 | weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim) 31 | Shape: 32 | - Input: LongTensor `(N1, N2, ...,Nm, W)`, N = mini-batch, W = number of indices to extract per mini-batch 33 | - Output: `(N1, N2, ..., Nm, W, embedding_dim)` 34 | Notes: 35 | Keep in mind that only a limited number of optimizers support 36 | sparse gradients: currently it's `optim.SGD` (`cuda` and `cpu`), 37 | and `optim.Adagrad` (`cpu`) 38 | """ 39 | 40 | def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False, padding_idx=None, 41 | max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False): 42 | super(Embedding, self).__init__() 43 | self.num_embeddings = num_embeddings 44 | self.embedding_dim = embedding_dim 45 | self.padding_idx = padding_idx 46 | self.max_norm = max_norm 47 | self.norm_type = norm_type 48 | self.scale_grad_by_freq = scale_grad_by_freq 49 | self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim)) 50 | self.frozen = freeze 51 | self.sparse = sparse 52 | 53 | self.reset_parameters(init_embedding) 54 | 55 | def reset_parameters(self, init_embedding): 56 | if init_embedding is None: 57 | scale = np.sqrt(3.0 / self.embedding_dim) 58 | self.weight.data.uniform_(-scale, scale) 59 | else: 60 | assign_tensor(self.weight, init_embedding) 61 | if self.padding_idx is not None: 62 | self.weight.data[self.padding_idx].fill_(0) 63 | 64 | if self.frozen: 65 | if init_embedding is None: 66 | raise Warning('Freeze embeddings which are randomly initialized.') 67 | self.weight.requires_grad = False 68 | 69 | def freeze(self): 70 | self.weight.requires_grad = False 71 | self.frozen = True 72 | 73 | def forward(self, input): 74 | padding_idx = self.padding_idx 75 | if padding_idx is None: 76 | padding_idx = -1 77 | 78 | input_size = input.size() 79 | if input.dim() > 2: 80 | num_inputs = int(np.prod(input_size[:-1])) 81 | input = input.view(num_inputs, input_size[-1]) 82 | 83 | 
def _ntuple(n):
    """
    Build a normaliser that turns a scalar into an ``n``-tuple.

    Args:
        n: number of repetitions used when the parsed value is not iterable.

    Returns:
        a ``parse(x)`` function: iterables are returned unchanged, any other value is
        repeated ``n`` times and returned as a tuple.
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``. The fallback keeps Python 2 working (requirements.txt still
    # pins Python 2 backports such as enum34 and futures).
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    def parse(x):
        if isinstance(x, Iterable):
            return x
        return tuple(repeat(x, n))
    return parse

_single = _ntuple(1)
_pair = _ntuple(2)
_triple = _ntuple(3)
_quadruple = _ntuple(4)
def recover_rnn_seq(seq, rev_order, hx=None, batch_first=False):
    '''
    Unpack a packed RNN output and undo a batch re-ordering.

    Args:
        seq: the packed sequence to pad back into a dense tensor.
        rev_order: index tensor that restores the original batch order, or None when the
            batch was never re-ordered (output and hx are then returned as they are).
        hx: optional hidden state, either a tensor or an (h, c) tuple for LSTM; it is
            re-ordered along dimension 1.
        batch_first: If True, the output tensor is laid out as [batch, seq_len, feature].

    Returns:
        (output, hx): the padded output and the (possibly re-ordered) hidden state.

    '''
    output, _ = rnn_utils.pad_packed_sequence(seq, batch_first=batch_first)

    # Nothing was permuted, so there is nothing to restore.
    if rev_order is None:
        return output, hx

    output = output.index_select(0 if batch_first else 1, rev_order)
    if hx is None:
        return output, hx

    if isinstance(hx, tuple):
        # LSTM state: restore both the hidden and the cell part.
        h_state, c_state = hx
        hx = (h_state.index_select(1, rev_order), c_state.index_select(1, rev_order))
    else:
        hx = hx.index_select(1, rev_order)
    return output, hx
def is_uni_punctuation(word):
    """
    Tell whether ``word`` consists solely of punctuation/symbol characters.

    Args:
        word: the token to test.

    Returns:
        True when ``word`` is non-empty and contains no word character (letter, digit,
        underscore) and no whitespace; False otherwise.
    """
    # Fixed: the pattern used to end in "$]", i.e. it required a literal "]" after the end
    # of the string, so it could never match and every token was reported as non-punctuation.
    match = re.match(r"^[^\w\s]+$", word, flags=re.UNICODE)
    return match is not None
def decode_MST(energies, lengths, leading_symbolic=0, labeled=True):
    """
    decode best parsing tree with MST algorithm (Chu-Liu/Edmonds).
    :param energies: numpy 4D tensor (3D when ``labeled`` is False)
        energies of each edge. the shape is [batch_size, num_labels, n_steps, n_steps]
        (or [batch_size, n_steps, n_steps] when unlabeled), indexed as [..., head, child],
        where the dummy root is at index 0. The array is not modified.
    :param lengths: sequence of int
        the real length of each instance in the batch; only the leading
        [length, length] sub-matrix of each instance is decoded.
    :param leading_symbolic: int
        number of symbolic dependency types leading in type alphabets; those labels are
        never predicted.
    :param labeled: bool
        whether ``energies`` carries a label dimension.
    :return: (pars, types)
        pars: int32 array [batch_size, n_steps] with the head index of every position
        (0 for the root and for padding positions).
        types: int32 array [batch_size, n_steps] with the best label of each selected edge,
        or None when ``labeled`` is False.
    """

    # NOTE: the two helpers below read and update the per-instance state (length,
    # curr_nodes, score_matrix, oldI, oldO, reps, final_edges) that the main loop binds in
    # the enclosing scope before calling chuLiuEdmonds().

    def find_cycle(par):
        # np.bool_ instead of the np.bool alias, which was removed in NumPy 1.24.
        added = np.zeros([length], np.bool_)
        added[0] = True
        cycle = set()
        findcycle = False
        for i in range(1, length):
            if findcycle:
                break

            if added[i] or not curr_nodes[i]:
                continue

            # init cycle
            tmp_cycle = set()
            tmp_cycle.add(i)
            added[i] = True
            findcycle = True
            l = i

            while par[l] not in tmp_cycle:
                l = par[l]
                if added[l]:
                    findcycle = False
                    break
                added[l] = True
                tmp_cycle.add(l)

            if findcycle:
                lorg = l
                cycle.add(lorg)
                l = par[lorg]
                while l != lorg:
                    cycle.add(l)
                    l = par[l]
                break

        return findcycle, cycle

    def chuLiuEdmonds():
        par = np.zeros([length], dtype=np.int32)
        # create best graph
        par[0] = -1
        for i in range(1, length):
            # only interested at current nodes
            if curr_nodes[i]:
                max_score = score_matrix[0, i]
                par[i] = 0
                for j in range(1, length):
                    if j == i or not curr_nodes[j]:
                        continue

                    new_score = score_matrix[j, i]
                    if new_score > max_score:
                        max_score = new_score
                        par[i] = j

        # find a cycle
        findcycle, cycle = find_cycle(par)
        # no cycles, get all edges and return them.
        if not findcycle:
            final_edges[0] = -1
            for i in range(1, length):
                if not curr_nodes[i]:
                    continue

                pr = oldI[par[i], i]
                ch = oldO[par[i], i]
                final_edges[ch] = pr
            return

        cyc_len = len(cycle)
        cyc_weight = 0.0
        cyc_nodes = np.zeros([cyc_len], dtype=np.int32)
        idx = 0
        for cyc_node in cycle:
            cyc_nodes[idx] = cyc_node
            idx += 1
            cyc_weight += score_matrix[par[cyc_node], cyc_node]

        # contract the cycle into its first node
        rep = cyc_nodes[0]
        for i in range(length):
            if not curr_nodes[i] or i in cycle:
                continue

            max1 = float("-inf")
            wh1 = -1
            max2 = float("-inf")
            wh2 = -1

            for j in range(cyc_len):
                j1 = cyc_nodes[j]
                if score_matrix[j1, i] > max1:
                    max1 = score_matrix[j1, i]
                    wh1 = j1

                scr = cyc_weight + score_matrix[i, j1] - score_matrix[par[j1], j1]

                if scr > max2:
                    max2 = scr
                    wh2 = j1

            score_matrix[rep, i] = max1
            oldI[rep, i] = oldI[wh1, i]
            oldO[rep, i] = oldO[wh1, i]
            score_matrix[i, rep] = max2
            oldO[i, rep] = oldO[i, wh2]
            oldI[i, rep] = oldI[i, wh2]

        rep_cons = []
        for i in range(cyc_len):
            rep_cons.append(set())
            cyc_node = cyc_nodes[i]
            for cc in reps[cyc_node]:
                rep_cons[i].add(cc)

        for i in range(1, cyc_len):
            cyc_node = cyc_nodes[i]
            curr_nodes[cyc_node] = False
            for cc in reps[cyc_node]:
                reps[rep].add(cc)

        chuLiuEdmonds()

        # check each node in cycle, if one of its representatives is a key in the final_edges, it is the one.
        found = False
        wh = -1
        for i in range(cyc_len):
            for repc in rep_cons[i]:
                if repc in final_edges:
                    wh = cyc_nodes[i]
                    found = True
                    break
            if found:
                break

        # re-attach the remaining cycle edges, walking the cycle from the entry node
        l = par[wh]
        while l != wh:
            ch = oldO[par[l], l]
            pr = oldI[par[l], l]
            final_edges[ch] = pr
            l = par[l]

    if labeled:
        assert energies.ndim == 4, 'dimension of energies is not equal to 4'
    else:
        assert energies.ndim == 3, 'dimension of energies is not equal to 3'
    input_shape = energies.shape
    batch_size = input_shape[0]
    max_length = input_shape[2]

    pars = np.zeros([batch_size, max_length], dtype=np.int32)
    types = np.zeros([batch_size, max_length], dtype=np.int32) if labeled else None
    for i in range(batch_size):
        energy = energies[i]

        # calc the real length of this instance
        length = lengths[i]

        # calc real energy matrix shape = [length, length, num_labels - #symbolic] (remove the label for symbolic types).
        if labeled:
            energy = energy[leading_symbolic:, :length, :length]
            # get best label for each edge.
            label_id_matrix = energy.argmax(axis=0) + leading_symbolic
            energy = energy.max(axis=0)
        else:
            energy = energy[:length, :length]
            label_id_matrix = None
        # Work on a private copy. In the unlabeled case ``energy`` is a view of the caller's
        # array, and the previous code zeroed its diagonal in place.
        score_matrix = np.array(energy, copy=True)

        oldI = np.zeros([length, length], dtype=np.int32)
        oldO = np.zeros([length, length], dtype=np.int32)
        curr_nodes = np.zeros([length], dtype=np.bool_)
        reps = []

        for s in range(length):
            score_matrix[s, s] = 0.0
            curr_nodes[s] = True
            reps.append(set())
            reps[s].add(s)
            for t in range(s + 1, length):
                oldI[s, t] = s
                oldO[s, t] = t

                oldI[t, s] = t
                oldO[t, s] = s

        final_edges = dict()
        chuLiuEdmonds()
        par = np.zeros([max_length], np.int32)
        if labeled:
            type_ids = np.ones([max_length], np.int32)
            type_ids[0] = 0
        else:
            type_ids = None

        for ch, pr in final_edges.items():
            par[ch] = pr
            if labeled and ch != 0:
                type_ids[ch] = label_id_matrix[pr, ch]

        par[0] = 0
        pars[i] = par
        if labeled:
            types[i] = type_ids

    return pars, types
if embedding == 'word2vec': 20 | # loading word2vec 21 | word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True) 22 | embedd_dim = word2vec.vector_size 23 | return word2vec, embedd_dim 24 | elif embedding == 'glove': 25 | # loading GloVe 26 | embedd_dim = -1 27 | embedd_dict = dict() 28 | with gzip.open(embedding_path, 'r') as file: 29 | for line in file: 30 | line = line.strip() 31 | line = line.decode('utf-8') 32 | if len(line) == 0: 33 | continue 34 | 35 | tokens = line.split() 36 | if embedd_dim < 0: 37 | embedd_dim = len(tokens) - 1 38 | else: 39 | assert (embedd_dim + 1 == len(tokens)) 40 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 41 | embedd[:] = tokens[1:] 42 | word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 43 | embedd_dict[word] = embedd 44 | return embedd_dict, embedd_dim 45 | elif embedding == 'senna': 46 | # loading Senna 47 | embedd_dim = -1 48 | embedd_dict = dict() 49 | with gzip.open(embedding_path, 'r') as file: 50 | for line in file: 51 | line = line.strip() 52 | line = line.decode('utf-8') 53 | if len(line) == 0: 54 | continue 55 | 56 | tokens = line.split() 57 | if embedd_dim < 0: 58 | embedd_dim = len(tokens) - 1 59 | else: 60 | assert (embedd_dim + 1 == len(tokens)) 61 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 62 | embedd[:] = tokens[1:] 63 | word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0] 64 | embedd_dict[word] = embedd 65 | return embedd_dict, embedd_dim 66 | elif embedding == 'sskip': 67 | embedd_dim = -1 68 | embedd_dict = dict() 69 | with gzip.open(embedding_path, 'r') as file: 70 | # skip the first line 71 | file.readline() 72 | for line in file: 73 | line = line.strip() 74 | try: 75 | line = line.decode('utf-8') 76 | if len(line) == 0: 77 | continue 78 | 79 | tokens = line.split() 80 | if len(tokens) < embedd_dim: 81 | continue 82 | 83 | if embedd_dim < 0: 84 | embedd_dim = len(tokens) - 1 85 | 86 | embedd = np.empty([1, embedd_dim], 
dtype=np.float32) 87 | start = len(tokens) - embedd_dim 88 | word = ' '.join(tokens[0:start]) 89 | embedd[:] = tokens[start:] 90 | word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word 91 | embedd_dict[word] = embedd 92 | except UnicodeDecodeError: 93 | continue 94 | return embedd_dict, embedd_dim 95 | elif embedding == 'polyglot': 96 | words, embeddings = pickle.load(open(embedding_path, 'rb')) 97 | _, embedd_dim = embeddings.shape 98 | embedd_dict = dict() 99 | for i, word in enumerate(words): 100 | embedd = np.empty([1, embedd_dim], dtype=np.float32) 101 | embedd[:] = embeddings[i, :] 102 | word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word 103 | embedd_dict[word] = embedd 104 | return embedd_dict, embedd_dim 105 | 106 | else: 107 | raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]") 108 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backports.shutil-get-terminal-size==1.0.0 2 | boto==2.49.0 3 | boto3==1.9.93 4 | botocore==1.12.93 5 | bz2file==0.98 6 | certifi==2019.6.16 7 | chardet==3.0.4 8 | decorator==4.3.2 9 | docutils==0.14 10 | enum34==1.1.6 11 | futures==3.2.0 12 | gensim==3.7.1 13 | idna==2.8 14 | ipdb==0.11 15 | ipython==5.8.0 16 | ipython-genutils==0.2.0 17 | jmespath==0.9.3 18 | mkl-fft==1.0.6 19 | mkl-random==1.0.1 20 | nltk==3.4.1 21 | numpy==1.16.1 22 | pathlib2==2.3.3 23 | pexpect==4.6.0 24 | pickleshare==0.7.5 25 | prompt-toolkit==1.0.15 26 | ptyprocess==0.6.0 27 | Pygments==2.3.1 28 | python-dateutil==2.8.0 29 | PyYAML==3.13 30 | requests==2.21.0 31 | s3transfer==0.2.0 32 | scandir==1.9.0 33 | scikit-learn==0.20.3 34 | scipy==1.2.1 35 | simplegeneric==0.8.1 36 | singledispatch==3.4.0.3 37 | six==1.12.0 38 | sklearn==0.0 39 | smart-open==1.8.0 40 | torch==0.3.1 41 | traitlets==4.3.2 42 | urllib3==1.24.1 43 | wcwidth==0.1.7 44 | 
class RSTModel(object):
    """Loads a trained RST parser network and runs it in evaluation mode."""

    def __init__(self, rst_config_path):
        """
        Load the configuration, the alphabets and the network weights.

        Args:
            rst_config_path: path passed to ``Config.load_config``.
        """
        print("................................................")
        print("LOADING RST Model")
        self.config = Config(None)
        self.config.load_config(rst_config_path)
        self.logger = get_logger("RSTParser RUN", self.config.use_dynamic_oracle, self.config.model_path)
        word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(None, self.config.alphabet_path, self.logger)
        self.vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha)
        self.network = MainArchitecture(self.vocab, self.config)
        # Deserialize onto the CPU first so that a checkpoint saved from a GPU can still be
        # loaded on a machine without CUDA; the network is moved to the GPU below on request.
        state_dict = torch.load(self.config.model_name, map_location=lambda storage, loc: storage)
        self.network.load_state_dict(state_dict)
        if self.config.use_gpu:
            self.network = self.network.cuda()
        self.network.eval()

    def prepare_data(self, batch, batch_size):
        """
        Pack the first ``batch_size`` instances of ``batch`` into padded tensors.

        Args:
            batch: indexable collection of instances; each exposes ``edus``, and every EDU
                exposes ``words``, ``tags``, ``etype`` and ``syntax_features``.
            batch_size: number of instances to pack (sizes the first tensor dimension).

        Returns:
            (edu_words, edu_tags, edu_types, edu_mask, word_mask, len_edus,
            word_denominator, edu_syntax) where, with E = largest EDU count and
            W = largest EDU length found in ``batch``:
              edu_words, edu_tags: LongTensor [batch_size, E, W] of vocabulary ids, 0-padded
              edu_types: LongTensor [batch_size, E]
              edu_mask: FloatTensor [batch_size, E], 1 where an EDU exists
              word_mask: FloatTensor [batch_size, E, W], 1 where a word exists
              len_edus: int64 numpy array [batch_size] with the EDU count per instance
              word_denominator: FloatTensor [batch_size, E] holding each EDU's length,
                  -1 at padded EDU slots
              edu_syntax: FloatTensor [batch_size, E, W, config.syntax_dim]
        """
        config = self.config
        vocab = self.vocab
        max_edu_len = -1
        max_edu_num = -1
        for data in batch:
            edu_num = len(data.edus)
            if edu_num > max_edu_num: max_edu_num = edu_num
            for edu in data.edus:
                edu_len = len(edu.words)
                if edu_len > max_edu_len: max_edu_len = edu_len

        edu_words = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_types = Variable(torch.LongTensor(batch_size, max_edu_num).zero_(), requires_grad=False)
        edu_syntax = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len, config.syntax_dim).zero_(), requires_grad=False)
        word_mask = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_tags = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_mask = Variable(torch.Tensor(batch_size, max_edu_num).zero_(), requires_grad=False)
        word_denominator = Variable(torch.ones(batch_size, max_edu_num).type(torch.FloatTensor) * -1, requires_grad=False)
        len_edus = np.zeros([batch_size], dtype=np.int64)

        for idx in range(batch_size):
            edus = batch[idx].edus
            # Set once per instance (it used to be reassigned on every EDU iteration).
            len_edus[idx] = len(edus)
            for idy in range(len(edus)):
                edu = edus[idy]
                edu_mask[idx, idy] = 1
                edu_types[idx, idy] = vocab.etype_alpha.word2id(edu.etype)
                edu_len = len(edu.words)
                word_denominator[idx, idy] = edu_len
                for idz in range(edu_len):
                    word = edu.words[idz]
                    tag = edu.tags[idz]
                    edu_words[idx, idy, idz] = vocab.word_alpha.word2id(word)
                    edu_tags[idx, idy, idz] = vocab.tag_alpha.word2id(tag)
                    edu_syntax[idx, idy, idz] = edu.syntax_features[idz].view(config.syntax_dim)
                    word_mask[idx, idy, idz] = 1

        if config.use_gpu:
            edu_words = edu_words.cuda()
            edu_tags = edu_tags.cuda()
            edu_types = edu_types.cuda()
            edu_mask = edu_mask.cuda()
            word_mask = word_mask.cuda()
            word_denominator = word_denominator.cuda()
            edu_syntax = edu_syntax.cuda()

        return edu_words, edu_tags, edu_types, edu_mask, word_mask, len_edus, word_denominator, edu_syntax

    def get_edu_representation(self, data_test):
        """
        Run the encoder on a batch produced by ``prepare_data``.

        Args:
            data_test: the 8-tuple returned by ``prepare_data``.

        Returns:
            whatever ``network.forward_all`` returns for the batch.
        """
        words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
        encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
        return encoder_output

    def get_subtree(self, data_test):
        """
        Encode a batch produced by ``prepare_data`` and decode it.

        Args:
            data_test: the 8-tuple returned by ``prepare_data``.

        Returns:
            the result of ``network.decode`` on the encoder output.
        """
        words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
        # eval() clears the training flag on the network and all of its sub-modules; assigning
        # ``network.training = False`` only changed the top-level module.
        self.network.eval()
        encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
        results = self.network.decode(encoder_output, [], [], len_edus)
        return results

', cur_syntax)) 36 | 37 | class EDU(object): 38 | def __init__(self, words, tags, etype, syntax_features): 39 | self.words = words 40 | self.tags = tags 41 | self.etype = etype 42 | self.syntax_features = syntax_features 43 | 44 | --------------------------------------------------------------------------------