├── .gitignore
├── NeuralRST
    ├── __init__.py
    ├── in_out
    │   ├── __init__.py
    │   ├── instance.py
    │   ├── node.py
    │   ├── preprocess.py
    │   ├── reader.py
    │   ├── rst_feature.py
    │   ├── tree.py
    │   └── util.py
    ├── models
    │   ├── __init__.py
    │   ├── alphabet.py
    │   ├── architecture.py
    │   ├── config.py
    │   ├── explorer.py
    │   ├── metric.py
    │   └── vocab.py
    ├── modules
    │   ├── __init__.py
    │   ├── embedding.py
    │   ├── function_variational_rnn.py
    │   ├── layer.py
    │   └── variational_rnn.py
    ├── requirements.txt
    ├── run_rst_parser.py
    ├── train_rst_parser.py
    └── transition
    │   ├── __init__.py
    │   ├── action.py
    │   ├── atom_feature.py
    │   └── state.py
├── README.md
├── biaffine_model.py
├── corenlp.py
├── extract_latent_feature.py
├── extract_shallow_feature.py
├── extract_tree.py
├── models
    ├── biaffine
    │   ├── alphabets
    │   │   ├── character.json
    │   │   ├── pos.json
    │   │   ├── type.json
    │   │   └── word.json
    │   ├── network.pt
    │   └── network.pt.arg.json
    └── rst
    │   ├── alphabets
    │       ├── action_label_alpha.json
    │       ├── etype_alpha.json
    │       ├── gold_action_alpha.json
    │       ├── tag_alpha.json
    │       └── word_alpha.json
    │   ├── config.cfg
    │   └── network.pt
├── neuronlp2
    ├── __init__.py
    ├── biaffine_model.py
    ├── io
    │   ├── __init__.py
    │   ├── alphabet.py
    │   ├── conll03_data.py
    │   ├── conllx_data.py
    │   ├── conllx_stacked_data.py
    │   ├── instance.py
    │   ├── logger.py
    │   ├── reader.py
    │   ├── utils.py
    │   └── writer.py
    ├── models
    │   ├── __init__.py
    │   ├── parsing.py
    │   └── sequence_labeling.py
    ├── nlinalg
    │   ├── __init__.py
    │   └── nlinalg.py
    ├── nn
    │   ├── __init__.py
    │   ├── _functions
    │   │   ├── __init__.py
    │   │   ├── masked_rnn.py
    │   │   ├── skipconnect_rnn.py
    │   │   └── variational_rnn.py
    │   ├── init.py
    │   ├── modules
    │   │   ├── __init__.py
    │   │   ├── attention.py
    │   │   ├── crf.py
    │   │   ├── linear.py
    │   │   ├── masked_rnn.py
    │   │   ├── skipconnect_rnn.py
    │   │   ├── sparse.py
    │   │   └── variational_rnn.py
    │   └── utils.py
    ├── tasks
    │   ├── __init__.py
    │   └── parser.py
    └── utils.py
├── requirements.txt
├── rst_model.py
└── sentence.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/NeuralRST/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/__init__.py


--------------------------------------------------------------------------------
/NeuralRST/in_out/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/in_out/__init__.py


--------------------------------------------------------------------------------
/NeuralRST/in_out/instance.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from NeuralRST.in_out.node import Node
  3 | 
  4 | # representing one document / one set
  5 | class Instance(object):
  6 |     def __init__(self, total_words, total_tags, edus, gold_actions, result):
  7 |         self.total_words = total_words
  8 |         self.total_tags = total_tags
  9 |         self.edus = edus
 10 |         self.gold_actions = gold_actions
 11 |         self.result = result
 12 | 
 13 |     def evaluate(self, other_result, span, nuclear, relation, full): # is_trained=False, max_edu_size=0):
 14 |         main_subtrees = self.result.subtrees
 15 |         span.overall_label_count += len(main_subtrees)
 16 |         span.predicated_label_count += len(other_result.subtrees)
 17 |         for i in range (len(other_result.subtrees)):
 18 |             for j in range (len(main_subtrees)):
 19 |                 if other_result.subtrees[i].span_equal(main_subtrees[j]):
 20 |                     span.correct_label_count += 1
 21 |                     break
 22 |         
 23 |         nuclear.overall_label_count += len(main_subtrees)
 24 |         nuclear.predicated_label_count += len(other_result.subtrees)
 25 |         for i in range (len(other_result.subtrees)):
 26 |             for j in range (len(main_subtrees)):
 27 |                 if other_result.subtrees[i].nuclear_equal(main_subtrees[j]):
 28 |                     nuclear.correct_label_count += 1
 29 |                     break
 30 | 
 31 |         relation.overall_label_count += len(main_subtrees)
 32 |         relation.predicated_label_count += len(other_result.subtrees)
 33 |         for i in range (len(other_result.subtrees)):
 34 |             for j in range (len(main_subtrees)):
 35 |                 if other_result.subtrees[i].relation_equal(main_subtrees[j]):
 36 |                     relation.correct_label_count += 1
 37 |                     break
 38 | 
 39 |         full.overall_label_count += len(main_subtrees)
 40 |         full.predicated_label_count += len(other_result.subtrees)
 41 |         for i in range (len(other_result.subtrees)):
 42 |             for j in range (len(main_subtrees)):
 43 |                 if other_result.subtrees[i].full_equal(main_subtrees[j]):
 44 |                     full.correct_label_count += 1
 45 |                     break
 46 |         return span, nuclear, relation, full 
 47 | 
 48 | # representing 1 EDU
 49 | class EDU(object):
 50 |     def __init__(self, start_index, end_index):
 51 |         self.start_index = start_index # int
 52 |         self.end_index = end_index # int
 53 |         self.etype = '' # string
 54 |         self.words = [] # list of word (string)
 55 |         self.tags = [] # list of tag (string)
 56 |         self.syntax_features = []
 57 | 
 58 | # nuclear will be: NUCLEAR, SATELLITE, span
 59 | class SubTree(object):
 60 |     NUCLEAR='NUCLEAR'
 61 |     SATELLITE='SATELLITE'
 62 |     SPAN='span'
 63 | 
 64 |     def __init__(self):
 65 |         self.nuclear = ''
 66 |         self.relation = ''
 67 |         self.edu_start = -1
 68 |         self.edu_end = -1
 69 | 
 70 |     def clear(self):
 71 |         self.nuclear = ''
 72 |         self.relation = ''
 73 |         self.edu_start = -1
 74 |         self.edu_end = -1
 75 | 
 76 |     def span_equal(self, tree):
 77 |         return self.edu_start == tree.edu_start and self.edu_end == tree.edu_end
 78 |     
 79 |     def nuclear_equal(self, tree):
 80 |         return self.edu_start == tree.edu_start and self.edu_end == tree.edu_end and self.nuclear == tree.nuclear
 81 | 
 82 |     def relation_equal(self, tree):
 83 |         return self.edu_start == tree.edu_start and self.edu_end == tree.edu_end and self.relation == tree.relation
 84 | 
 85 |     def full_equal(self, tree):
 86 |         return self.edu_start == tree.edu_start and self.edu_end == tree.edu_end and self.relation == tree.relation and self.nuclear and tree.nuclear
 87 | 
 88 |     def get_str(self):
 89 |         return self.nuclear +' '+self.relation+' edu('+str(self.edu_start)+'-'+str(self.edu_end) +')'
 90 | 
 91 | class CResult(object):
 92 |     def __init__(self):
 93 |         self.subtrees = []
 94 |     
 95 |     def clear(self):
 96 |         self.subtrees = []
 97 |     
 98 |     def save(self, file_path):
 99 |         np.save(file_path, np.array(self.subtrees))
100 | 
101 |     def obtain_tree(self):
102 |         p_subtree = {}
103 |         subtrees = self.subtrees
104 |         assert len(subtrees) % 2 == 0
105 |         for idx in range(0, len(subtrees), 2):
106 |             edu_span = (subtrees[idx].edu_start, subtrees[idx+1].edu_end)
107 |             nuclear = subtrees[idx].nuclear + " " + subtrees[idx+1].nuclear
108 |             relation = subtrees[idx].relation
109 |             if 'span' == relation:
110 |                 relation = subtrees[idx+1].relation
111 |             tree = Node(edu_span, nuclear, relation)
112 |             
113 |             #set child:
114 |             if p_subtree.get(edu_span[0], None) is not None:
115 |                 tree.left = p_subtree[edu_span[0]]
116 |                 p_subtree[edu_span[0]].parent = tree
117 |             elif subtrees[idx].edu_start == subtrees[idx].edu_end:
118 |                 leaf = Node((subtrees[idx].edu_start, subtrees[idx].edu_end), '', '')
119 |                 tree.left = leaf
120 |                 leaf.parent = tree
121 |             if p_subtree.get(edu_span[1], None) is not None:
122 |                 tree.right = p_subtree[edu_span[1]]
123 |                 p_subtree[edu_span[1]].parent = tree
124 |             elif subtrees[idx+1].edu_start == subtrees[idx+1].edu_end:
125 |                 leaf =  Node((subtrees[idx+1].edu_start, subtrees[idx+1].edu_end), '', '')
126 |                 tree.right = leaf
127 |                 leaf.parent = tree
128 |             p_subtree[edu_span[0]] = tree
129 |             p_subtree[edu_span[1]] = tree
130 |         if len(subtrees) != 0:
131 |             return p_subtree[0]
132 |         else:
133 |             return None
134 | 
135 | # representing ONE word
class SynFeat(object):
    """Syntax features attached to ONE word.

    Holds four components (arc/relation features for the dependent and head
    roles). They only need to support ``+`` with each other.
    """

    def __init__(self, arc_dep, arc_head, rel_dep, rel_head):
        self.arc_dep, self.arc_head = arc_dep, arc_head
        self.rel_dep, self.rel_head = rel_dep, rel_head
        # LSTM outputs (lstm_out1 / lstm_out2) were once stored here too but
        # are currently disabled.

    def concat(self):
        """Join the components in the order arc_dep, rel_dep, arc_head, rel_head."""
        dependent_side = self.arc_dep + self.rel_dep
        with_head_arc = dependent_side + self.arc_head
        return with_head_arc + self.rel_head
147 | 
148 | 


--------------------------------------------------------------------------------
/NeuralRST/in_out/node.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class Node(object):
 4 |     def __init__(self, edu_span, nuclear, relation):
 5 |         self.edu_span = edu_span
 6 |         self.nuclear = nuclear
 7 |         self.relation = relation
 8 |         self.left = None
 9 |         self.right = None
10 |         self.parent = None
11 | 
12 |     def str(self):
13 |         return self.nuclear + ' ' + self.relation
14 | 
15 | 


--------------------------------------------------------------------------------
/NeuralRST/in_out/preprocess.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import torch
  4 | import numpy as np
  5 | 
  6 | from NeuralRST.models.metric import Metric
  7 | from NeuralRST.models.alphabet import Alphabet
  8 | from NeuralRST.in_out.util import lower_with_digit_transform
  9 | from NeuralRST.transition.state import CState
 10 | from torch.autograd import Variable
 11 | 
 12 | def construct_embedding_table(alpha, hidden_size, freeze, pretrained_embed = None):
 13 |     if alpha is None:
 14 |         return None
 15 |     scale = np.sqrt(6.0 / (alpha.size()+hidden_size))
 16 |     table = np.empty([alpha.size(), hidden_size], dtype=np.float32)
 17 |     for word, index, in alpha.alpha2id.items():
 18 |         if pretrained_embed is not None:
 19 |             if word in pretrained_embed:
 20 |                 embedding = pretrained_embed[word]
 21 |             elif word.lower() in pretrained_embed:
 22 |                 embedding = pretrained_embed[word.lower()]
 23 |             else:
 24 |                 embedding = np.zeros([1, hidden_size]).astype(np.float32) if freeze else np.random.uniform(-scale, scale, [1, hidden_size]).astype(np.float32)
 25 |         else:
 26 |             embedding = np.random.uniform(-scale, scale, [1, hidden_size]).astype(np.float32)
 27 |         table[index, :] = embedding
 28 |     return torch.from_numpy(table)
 29 | 
 30 | 
 31 | def create_alphabet(instances, alphabet_directory, logger):
 32 |     word_size = 0
 33 |     gold_size = 0
 34 |         
 35 |     word_stat = {}
 36 |     tag_stat = {}
 37 |     gold_action_stat = {}
 38 |     action_label_stat = {}
 39 |     etype_stat = {}
 40 | 
 41 |     if not os.path.isdir(alphabet_directory):
 42 |         print("Creating Alphabets")
 43 |         for instance in instances:
 44 |             for i in range(len(instance.total_words)):
 45 |                 word = lower_with_digit_transform(instance.total_words[i].strip())
 46 |                 tag = instance.total_tags[i]
 47 |                 word_stat[word] = word_stat.get(word, 0) + 1
 48 |                 tag_stat[tag] = tag_stat.get(tag, 0) + 1
 49 | 
 50 |             for action in instance.gold_actions:
 51 |                 if (not action.is_shift() and not action.is_finish()):
 52 |                     action_label_stat[action.label] = action_label_stat.get(action.label, 0) + 1
 53 |                 gold_action_stat[action.get_str()] = gold_action_stat.get(action.get_str(), 0) + 1
 54 |             
 55 |             for k in range(len(instance.edus)):
 56 |                 etype_stat[instance.edus[k].etype] = etype_stat.get(instance.edus[k].etype, 0) + 1
 57 |         
 58 |         word_alpha = Alphabet(word_stat, 'word_alpha')
 59 |         tag_alpha = Alphabet(tag_stat, 'tag_alpha')
 60 |         gold_action_alpha = Alphabet(gold_action_stat, 'gold_action_alpha', for_label_index=True)
 61 |         action_label_alpha = Alphabet(action_label_stat, 'action_label_alpha', for_label_index=True)
 62 |         etype_alpha = Alphabet(etype_stat, 'etype_alpha')
 63 | 
 64 |         word_alpha.save(alphabet_directory)
 65 |         tag_alpha.save(alphabet_directory)
 66 |         gold_action_alpha.save(alphabet_directory)
 67 |         action_label_alpha.save(alphabet_directory)
 68 |         etype_alpha.save(alphabet_directory)
 69 |     else:
 70 |         print("Loading Alphabets")
 71 |         word_alpha = Alphabet(word_stat, 'word_alpha')
 72 |         tag_alpha = Alphabet(tag_stat, 'tag_alpha')
 73 |         gold_action_alpha = Alphabet(gold_action_stat, 'gold_action_alpha')
 74 |         action_label_alpha = Alphabet(action_label_stat, 'action_label_alpha')
 75 |         etype_alpha = Alphabet(etype_stat, 'etype_alpha')
 76 |         
 77 |         word_alpha.load(alphabet_directory)
 78 |         tag_alpha.load(alphabet_directory)
 79 |         gold_action_alpha.load(alphabet_directory, for_label_index=True)
 80 |         action_label_alpha.load(alphabet_directory, for_label_index=True)
 81 |         etype_alpha.load(alphabet_directory)
 82 | 
 83 |     logger.info("Word alphabet size: " + str(word_alpha.size()))
 84 |     logger.info("Tag alphabet size: " + str(tag_alpha.size()))
 85 |     logger.info("Gold action alphabet size: " + str(gold_action_alpha.size()))
 86 |     logger.info("Action Label alphabet size: " + str(action_label_alpha.size()))
 87 |     logger.info("Etype alphabet size: " + str(etype_alpha.size()))
 88 |     return word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha
 89 | 
 90 | 
 91 | def validate_gold_actions(instances, maxStateSize):
 92 |     shift_num = 0; reduce_nn_num = 0; reduce_ns_num = 0; reduce_sn_num = 0
 93 |     span = Metric(); nuclear = Metric(); relation = Metric(); full = Metric()
 94 | 
 95 |     for inst in instances:
 96 |         for ac in inst.gold_actions:
 97 |             if ac.is_shift():
 98 |                 shift_num+=1
 99 |             if ac.is_reduce():
100 |                 if ac.nuclear == 'NN':
101 |                     reduce_nn_num += 1
102 |                 elif ac.nuclear == 'NS':
103 |                     reduce_ns_num += 1
104 |                 elif ac.nuclear == 'SN':
105 |                     reduce_sn_num += 1
106 |                 else:
107 |                     raise Exception('Reduce error, this must have nuclearity')
108 |                 # something is here
109 |                 assert(ac.label_id != -1)
110 | 
111 |     print("Reduce NN: " + str(reduce_nn_num))
112 |     print("Reduce NS: " + str(reduce_ns_num))
113 |     print("Reduce SN: " + str(reduce_sn_num))
114 |     print("Shift: " + str(shift_num))
115 | 
116 |     print("Checking the gold Actions, it will be interrupted if there is error assertion")
117 |     # all_states = [CState() for i in range(maxStateSize)]
118 |     # for inst in instances:
119 |         # step = 0
120 |         # gold_actions = inst.gold_actions
121 |         # action_size = len(gold_actions)
122 |         # all_states[0].ready(inst)
123 |         # while(not all_states[step].is_end()):
124 |             # assert(step < action_size)
125 |             # all_states[step+1] = all_states[step].move(all_states[step+1], gold_actions[step])
126 |             # step += 1
127 |         # assert(step == action_size)
128 |         # result = all_states[step].get_result()
129 |         # span, nuclear, relation, full = inst.evaluate(result, span, nuclear, relation, full)
130 |         # if not span.bIdentical() or not nuclear.bIdentical() or not relation.bIdentical() or not full.bIdentical():
131 |             # raise Exception('Error state conversion!! ')
132 | 
def get_max_parameter(instances):
    """Scan the instances once and return the three size maxima.

    :return: ``(max_edu_size, max_sent_size, max_state_size)`` — the largest
        number of EDUs in an instance, the largest number of words in any
        EDU, and the largest number of gold actions in an instance. All
        three are 0 when ``instances`` is empty.
    """
    max_edu_size = max_sent_size = max_state_size = 0

    for instance in instances:
        max_state_size = max(max_state_size, len(instance.gold_actions))
        max_edu_size = max(max_edu_size, len(instance.edus))
        for edu in instance.edus:
            max_sent_size = max(max_sent_size, len(edu.words))
    return max_edu_size, max_sent_size, max_state_size
150 | 
def batch_data_variable(data, indices, vocab, config, is_training=True):
    """Pack the selected instances into padded tensors for one batch.

    :param data: array of instances that supports indexing with a list of
        positions (``data[indices.tolist()]``).
    :param indices: array of positions selecting the batch; must provide
        ``tolist()``.
    :param vocab: object exposing ``word_alpha``, ``tag_alpha``,
        ``etype_alpha`` and ``gold_action_alpha`` with ``word2id`` (and
        ``size()`` on the gold-action alphabet).
    :param config: object exposing ``syntax_dim``, ``max_state_size`` and
        ``use_gpu``.
    :param is_training: when true, gold action ids are filled in; otherwise
        the gold action tensor keeps its padding value everywhere.
    :return: ``(edu_words, edu_tags, edu_types, edu_mask, word_mask,
        gold_action_var, len_edus, word_denominator, edu_syntax)``. With
        B = batch size, E = max EDUs per instance and W = max words per EDU:
        word/tag/word-mask tensors are (B, E, W), type/mask/denominator
        tensors are (B, E), ``edu_syntax`` is (B, E, W, syntax_dim),
        ``gold_action_var`` is (B, max_state_size) and ``len_edus`` is a
        numpy int64 array of length B.
    """
    batch_size = len(indices)
    batch = data[indices.tolist()]

    # Padding extents: most EDUs in any instance, most words in any EDU.
    max_edu_num = -1
    max_edu_len = -1
    for inst in batch:
        max_edu_num = max(max_edu_num, len(inst.edus))
        for unit in inst.edus:
            max_edu_len = max(max_edu_len, len(unit.words))

    edu_words = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_types = Variable(torch.LongTensor(batch_size, max_edu_num).zero_(), requires_grad=False)
    edu_syntax = np.zeros([batch_size, max_edu_num, max_edu_len, config.syntax_dim], dtype=np.float32)
    word_mask = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_tags = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
    edu_mask = Variable(torch.Tensor(batch_size, max_edu_num).zero_(), requires_grad=False)
    # -1 marks padded EDU slots; real slots get their word count below.
    word_denominator = Variable(torch.ones(batch_size, max_edu_num).type(torch.FloatTensor) * -1, requires_grad=False)
    # Padding id for gold actions is one past the last valid alphabet id.
    gold_action_var = np.ones([batch_size, config.max_state_size], dtype=np.int64)  * (vocab.gold_action_alpha.size())
    len_edus = np.zeros([batch_size], dtype=np.int64)

    for row, inst in enumerate(batch):
        len_edus[row] = len(inst.edus)
        for col, unit in enumerate(inst.edus):
            edu_mask[row, col] = 1
            edu_types[row, col] = vocab.etype_alpha.word2id(unit.etype)
            word_total = len(unit.words)
            word_denominator[row, col] = word_total
            for pos in range(word_total):
                edu_words[row, col, pos] = vocab.word_alpha.word2id(unit.words[pos])
                edu_tags[row, col, pos] = vocab.tag_alpha.word2id(unit.tags[pos])
                edu_syntax[row, col, pos] = unit.syntax_features[pos]
                word_mask[row, col, pos] = 1

        if is_training:
            # Gold sequences longer than max_state_size are truncated.
            usable_steps = min(len(inst.gold_actions), config.max_state_size)
            for step in range(usable_steps):
                action_name = inst.gold_actions[step].get_str()
                gold_action_var[row][step] = vocab.gold_action_alpha.word2id(action_name)

    gold_action_var = Variable(torch.from_numpy(gold_action_var), volatile=False, requires_grad=False)
    edu_syntax = Variable(torch.from_numpy(edu_syntax), volatile=False, requires_grad=False)
    if config.use_gpu:
        edu_words = edu_words.cuda()
        edu_tags = edu_tags.cuda()
        edu_types = edu_types.cuda()
        edu_mask = edu_mask.cuda()
        word_mask = word_mask.cuda()
        gold_action_var = gold_action_var.cuda()
        word_denominator = word_denominator.cuda()
        edu_syntax = edu_syntax.cuda()

    return edu_words, edu_tags, edu_types, edu_mask, word_mask, gold_action_var, len_edus, word_denominator, edu_syntax
209 | 
210 | 
211 | 
212 |     
213 |     
214 | 


--------------------------------------------------------------------------------
/NeuralRST/in_out/rst_feature.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | class RSTFeature (object):
  4 |     def __init__(self):
  5 |         self.ns_score = {}
  6 |         self.rel_type_score = {}
  7 |         # self.relations = ['attribution', 'background', 'cause', 'comparison', 'condition', 
  8 |                 # 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint',
  9 |                 # 'mannermeans', 'summary', 'temporal', 'topic', 'sameunit', 'textualorganization', 'list']
 10 |         self.relations= ["purp", "cont", "attr", "evid", "comp", "list", "back", "same", "topic",
 11 |                 "mann", "summ", "cond", "temp", "eval", "text", "cause", "prob", "elab"]
 12 | 
 13 |     def depth(self, node):
 14 |         left_depth = self.depth(node.left) if node.left else 0
 15 |         right_depth = self.depth(node.right) if node.right else 0
 16 |         return max(left_depth, right_depth) + 1
 17 | 
 18 |     def get_max_edu(self, node):
 19 |         if node.right is None:
 20 |             return node.edu_span[1]
 21 |         return self.get_max_edu(node.right)
 22 | 
 23 |     # author: Fajri Koto
 24 |     # 6 May 2019
 25 |     def generate_heuristic_feature(self, node):
 26 |         if node is None:
 27 |             print('WARNING: There is a None tree')
 28 |             return np.array([[0] * 21])
 29 | 
 30 |         # Initialization
 31 |         depth = self.depth(node)
 32 |         relation_score = {}
 33 |         for relation in self.relations:
 34 |             relation_score[relation] = 0
 35 |         max_score = 0
 36 |         for i in range(1,depth+1,1): max_score += i
 37 |         # Compute! Output is stored in self.ns_score and self.rel_type_score
 38 |         
 39 |         self.compute_ns_score(node, depth, depth)
 40 |         self.compute_relation_score(node, max_score, depth, relation_score)
 41 | 
 42 |         # Store output
 43 |         output = []
 44 |         vectors = []
 45 |         max_edu = self.get_max_edu(node)
 46 |         assert max_edu+1 == len(self.ns_score)
 47 |         for id_edu in range(max_edu+1):
 48 |             vector1 = self.ns_score[id_edu]
 49 |             vector2 = self.rel_type_score[id_edu]
 50 |             vectors.append(vector1+vector2)
 51 |         vectors = np.array(vectors, np.float32)
 52 |         return vectors
 53 | 
 54 |     # author: Fajri Koto
 55 |     # 6 May 2019
 56 |     def compute_ns_score(self, node, height, n_score):
 57 |         if node.left == None and node.right == None:
 58 |             assert node.edu_span[0] == node.edu_span[1]
 59 |             self.ns_score[node.edu_span[0]] = [1.0*n_score/height]
 60 |             return
 61 |         n1, n2 = node.nuclear.split(' ')
 62 |         if n1 == 'SATELLITE':
 63 |             self.compute_ns_score(node.left, height, n_score-1)
 64 |         else:
 65 |             self.compute_ns_score(node.left, height, n_score)
 66 |         if n2 == 'SATELLITE':
 67 |             self.compute_ns_score(node.right, height, n_score-1)
 68 |         else:
 69 |             self.compute_ns_score(node.right, height, n_score)
 70 | 
 71 |     #author Fajri Koto
 72 |     # 6 May 2019
 73 |     def compute_relation_score(self, node, max_score, depth, relation_score):
 74 |         if node.relation != '':
 75 |             if (relation_score.get(node.relation, -1) != -1):
 76 |                 relation_score[node.relation]+=depth
 77 | 
 78 |         if node.left is None and node.right is None:
 79 |             assert node.edu_span[0] == node.edu_span[1]
 80 |             result = []
 81 |             
 82 |             # find if you are left or right
 83 |             if node.parent is not None:
 84 |                 n1, n2 = node.parent.nuclear.split(' ')
 85 |                 n1_v = 0; n2_v = 0
 86 |                 if n1 == 'NUCLEAR':
 87 |                     n1_v = 1
 88 |                 if n2 == 'NUCLEAR':
 89 |                     n2_v = 1
 90 | 
 91 |                 if node.parent.left == node:
 92 |                     result.append(n1_v)
 93 |                     result.append(n2_v)
 94 |                 else:
 95 |                     assert node.parent.right == node
 96 |                     result.append(n2_v)
 97 |                     result.append(n1_v)
 98 |             else:
 99 |                 result.append(1)
100 |                 result.append(1)
101 | 
102 |             # Score of relations
103 |             for relation in self.relations:
104 |                 result.append(1.0*relation_score[relation]/max_score)
105 |             self.rel_type_score[node.edu_span[0]] = result
106 |             return
107 |         
108 |         self.compute_relation_score(node.left, max_score, depth-1, relation_score.copy())
109 |         self.compute_relation_score(node.right, max_score, depth-1, relation_score.copy())
110 | 


--------------------------------------------------------------------------------
/NeuralRST/in_out/tree.py:
--------------------------------------------------------------------------------
 1 | class Tree(object):
 2 |     def __init__(self, edu_span, nuclear, relation):
 3 |         self.edu_span = edu_span
 4 |         self.nuclear = nuclear
 5 |         self.relation = relation
 6 |         self.left = None
 7 |         self.right = None
 8 | 
 9 |     def str(self):
10 |         return self.nuclear + ' ' + self.relation
11 | 
12 |     def get_id(self, vocab):
13 |         tmp = self.nuclear.split(' ')
14 |         action_str = "REDUCE_" + tmp[0][0] + tmp[1][0] + '_' + self.relation
15 |         return vocab.relation_alpha.word2id(action_str)
16 | 
17 |     def get_nodes(self, nodes, vocab):
18 |         cur_id = self.get_id (vocab)
19 |         if self.left is not None:
20 |             left_id = self.left.get_id(vocab)
21 |             key = (cur_id, left_id)
22 |             nodes[key] = nodes.get(key, 0) + 1
23 |             nodes = self.left.get_nodes(nodes, vocab)
24 | 
25 |         if self.right is not None:
26 |             right_id = self.right.get_id(vocab)
27 |             key = (cur_id, right_id)
28 |             nodes[key] = nodes.get(key, 0) + 1
29 |             nodes = self.right.get_nodes(nodes, vocab)
30 |         return nodes
31 | 


--------------------------------------------------------------------------------
/NeuralRST/in_out/util.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | import numpy as np
  3 | import gzip
  4 | import re
  5 | import logging
  6 | import sys
  7 | from datetime import date    
  8 | from gensim.models.word2vec import Word2Vec
  9 | 
# Character-level limits; not referenced in the visible part of this module
# (presumably consumed by character-level readers elsewhere - TODO confirm).
MAX_CHAR_LENGTH = 45
NUM_CHAR_PAD = 2

# Regular expressions used to normalize digits.
# DIGIT_RE is a bytes pattern and can only be applied to bytes input;
# DIGIT_RE2 is the str pattern for ordinary (already decoded) text.
DIGIT_RE = re.compile(br"\d")
DIGIT_RE2 = re.compile(r"\d")
 16 | 
 17 | 
 18 | def lower_with_digit_transform(string):
 19 |     return DIGIT_RE2.sub("0", string.lower())
 20 | 
 21 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
 22 |     """
 23 |     load word embeddings from file
 24 |     :param embedding:
 25 |     :param embedding_path:
 26 |     :return: embedding dict, embedding dimention, caseless
 27 |     """
 28 |     print("Loading embedding: %s from %s" % (embedding, embedding_path))
 29 |     if embedding == 'word2vec':
 30 |         # loading word2vec
 31 |         word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
 32 |         embedd_dim = word2vec.vector_size
 33 |         return word2vec, embedd_dim
 34 |     elif embedding == 'glove':
 35 |         # loading GloVe
 36 |         embedd_dim = -1
 37 |         embedd_dict = dict()
 38 |         with gzip.open(embedding_path, 'r') as file:
 39 |             for line in file:
 40 |                 line = line.strip()
 41 |                 line = line.decode('utf-8')
 42 |                 if len(line) == 0:
 43 |                     continue
 44 | 
 45 |                 tokens = line.split()
 46 |                 if embedd_dim < 0:
 47 |                     embedd_dim = len(tokens) - 1
 48 |                 else:
 49 |                     assert (embedd_dim + 1 == len(tokens))
 50 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 51 |                 embedd[:] = tokens[1:]
 52 |                 word = DIGIT_RE2.sub("0", tokens[0]) if normalize_digits else tokens[0]
 53 |                 embedd_dict[word] = embedd
 54 |         return embedd_dict, embedd_dim
 55 |     elif embedding == 'senna':
 56 |         # loading Senna
 57 |         embedd_dim = -1
 58 |         embedd_dict = dict()
 59 |         with gzip.open(embedding_path, 'r') as file:
 60 |             for line in file:
 61 |                 line = line.strip()
 62 |                 line = line.decode('utf-8')
 63 |                 if len(line) == 0:
 64 |                     continue
 65 | 
 66 |                 tokens = line.split()
 67 |                 if embedd_dim < 0:
 68 |                     embedd_dim = len(tokens) - 1
 69 |                 else:
 70 |                     assert (embedd_dim + 1 == len(tokens))
 71 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 72 |                 embedd[:] = tokens[1:]
 73 |                 word = DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
 74 |                 embedd_dict[word] = embedd
 75 |         return embedd_dict, embedd_dim
 76 |     elif embedding == 'sskip':
 77 |         embedd_dim = -1
 78 |         embedd_dict = dict()
 79 |         with gzip.open(embedding_path, 'r') as file:
 80 |             # skip the first line
 81 |             file.readline()
 82 |             for line in file:
 83 |                 line = line.strip()
 84 |                 try:
 85 |                     line = line.decode('utf-8')
 86 |                     if len(line) == 0:
 87 |                         continue
 88 | 
 89 |                     tokens = line.split()
 90 |                     if len(tokens) < embedd_dim:
 91 |                         continue
 92 | 
 93 |                     if embedd_dim < 0:
 94 |                         embedd_dim = len(tokens) - 1
 95 | 
 96 |                     embedd = np.empty([1, embedd_dim], dtype=np.float32)
 97 |                     start = len(tokens) - embedd_dim
 98 |                     word = ' '.join(tokens[0:start])
 99 |                     embedd[:] = tokens[start:]
100 |                     word = DIGIT_RE.sub(b"0", word) if normalize_digits else word
101 |                     embedd_dict[word] = embedd
102 |                 except UnicodeDecodeError:
103 |                     continue
104 |         return embedd_dict, embedd_dim
105 |     elif embedding == 'polyglot':
106 |         words, embeddings = pickle.load(open(embedding_path, 'rb'))
107 |         _, embedd_dim = embeddings.shape
108 |         embedd_dict = dict()
109 |         for i, word in enumerate(words):
110 |             embedd = np.empty([1, embedd_dim], dtype=np.float32)
111 |             embedd[:] = embeddings[i, :]
112 |             word = DIGIT_RE.sub(b"0", word) if normalize_digits else word
113 |             embedd_dict[word] = embedd
114 |         return embedd_dict, embedd_dim
115 | 
116 |     else:
117 |         raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")
118 | 
119 | 
def get_logger(name, is_dynamic, model_path, level=logging.INFO, handler=sys.stdout,
               formatter='%(asctime)s - %(name)s - %(message)s'):
    """
    build a logger that writes to a stream and to a dated file in model_path
    :param name: name passed to logging.getLogger
    :param is_dynamic: selects the 'log_dynamic_' (True) or 'log_static_'
        (False) file name prefix
    :param model_path: directory in which the log file is created
    :param level: level of the stream handler (the logger itself is set to INFO)
    :param handler: stream the stream handler writes to
    :param formatter: format string shared by both handlers
    :return: the configured logger
    """
    log_format = logging.Formatter(formatter)

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    console = logging.StreamHandler(handler)
    console.setLevel(level)
    console.setFormatter(log_format)
    logger.addHandler(console)

    # one log file per day, named after the training mode
    prefix = '/log_dynamic_' if is_dynamic else '/log_static_'
    file_handler = logging.FileHandler(model_path + prefix + date.today().isoformat() + '.txt')
    file_handler.setFormatter(log_format)
    logger.addHandler(file_handler)
    return logger
137 | 
138 | 


--------------------------------------------------------------------------------
/NeuralRST/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/models/__init__.py


--------------------------------------------------------------------------------
/NeuralRST/models/alphabet.py:
--------------------------------------------------------------------------------
 1 | import json, os
 2 | 
 3 | UNK_ID = 0
 4 | 
 5 | class Alphabet(object):
 6 |     def __init__(self, dictionary, name, for_label_index = False):
 7 |         self.alpha2id = {}
 8 |         self.id2alpha = {}
 9 |         self.name = name
10 |         self.for_label_index = for_label_index
11 |         self.alphas = list(dictionary.keys()) 
12 |         
13 |         ids = 0 
14 |         if not for_label_index: # for non label
15 |             self.alpha2id ['UNK'] = 0
16 |             self.id2alpha [0] = 'UNK'
17 |             ids += 1
18 |         
19 |         for alpha in self.alphas:
20 |            self.alpha2id[alpha] = ids
21 |            self.id2alpha[ids] = alpha
22 |            ids += 1
23 |         
24 |         # add PAD for PADDING, it is used for label / action
25 |         if for_label_index:
26 |             self.alpha2id ['PAD'] = ids
27 |             self.id2alpha [ids] = 'PAD'
28 |             self.alphas += ['PAD']
29 | 
30 |         # add 'UNK' for non label index alphabet
31 |         if not for_label_index:
32 |             self.alphas += ['UNK']
33 | 
34 |     def get_content(self):
35 |         return {'alpha2id': self.alpha2id, 'id2alpha': self.id2alpha, 'alphas': self.alphas}
36 | 
37 |     def word2id(self, word):
38 |         if not self.for_label_index:
39 |             return self.alpha2id.get(word, UNK_ID)
40 |         else:
41 |             return self.alpha2id.get(word, self.alpha2id['PAD'])
42 | 
43 |     def id2word(self, int_id):
44 |         if not self.for_label_index:
45 |             return self.id2alpha.get(int_id, 'UNK')
46 |         else:
47 |             return self.id2alpha.get(int_id, 'PAD')
48 |     
49 |     def __from_json(self, data):
50 |         self.alphas = data["alphas"]
51 |         self.alpha2id = data['alpha2id']
52 |         for index, word in data['id2alpha'].items():
53 |             self.id2alpha[int(index)] = word
54 |         
55 |     def size(self):
56 |         if self.for_label_index:
57 |             return len(self.alphas) - 1
58 |         return len(self.alphas)
59 | 
60 |     def save(self, output_directory):
61 |         try:
62 |             if not os.path.exists(output_directory):
63 |                 os.makedirs(output_directory)
64 |             json.dump(self.get_content(),
65 |                     open(os.path.join(output_directory, self.name + ".json"), "w"), indent=4)
66 | 
67 |         except Exception as e:
68 |             self.logger.warn("Alphabet is not saved: %s" % repr(e))
69 | 
70 |     def load(self, input_directory, for_label_index=False):
71 |         self.__from_json(json.load(open(os.path.join(input_directory, self.name + ".json"))))
72 |         self.for_label_index = for_label_index
73 | 


--------------------------------------------------------------------------------
/NeuralRST/models/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | 
  4 | class Config(object):
  5 |     def __init__(self, args):
  6 |         if args is None:
  7 |             return
  8 | 
  9 |         self.use_gpu = torch.cuda.is_available()
 10 |         self.use_dynamic_oracle = args.use_dynamic_oracle == 1
 11 |         self.flag_oracle = False
 12 |         self.word_embedding = args.word_embedding
 13 |         self.word_embedding_file = args.word_embedding_file
 14 |     
 15 |         self.train_path = args.train
 16 |         self.test_path = args.test
 17 |         self.dev_path = args.dev
 18 |         self.train_syn_feat_path = args.train_syn_feat
 19 |         self.test_syn_feat_path = args.test_syn_feat
 20 |         self.dev_syn_feat_path = args.dev_syn_feat
 21 |         self.model_path = args.model_path +'/'+ args.experiment
 22 |         self.model_name = args.model_name
 23 |         self.alphabet_path = os.path.join(self.model_path, 'alphabets/')
 24 | 
 25 |         self.max_iter = args.max_iter
 26 |         self.word_dim = args.word_dim
 27 |         self.tag_dim = args.tag_dim
 28 |         self.etype_dim = args.etype_dim
 29 |         self.syntax_dim = args.syntax_dim
 30 |         self.max_sent_size = args.max_sent_size
 31 |         self.max_edu_size = args.max_edu_size
 32 |         self.max_state_size = args.max_state_size
 33 |         self.hidden_size = args.hidden_size
 34 |         
 35 |         self.freeze = args.freeze
 36 |         self.drop_prob = args.drop_prob
 37 |         self.num_layers = args.num_layers
 38 | 
 39 |         self.batch_size = args.batch_size
 40 |         self.opt = args.opt
 41 |         self.lr = args.lr
 42 |         self.ada_eps = args.ada_eps
 43 |         self.momentum = 0.9
 44 |         self.beta1 = args.beta1
 45 |         self.beta2 = args.beta2 
 46 |         self.betas = (self.beta1, self.beta2)
 47 |         self.gamma = args.gamma
 48 |         self.start_decay = args.start_decay
 49 | 
 50 |         self.clip = args.clip
 51 | 
 52 |         self.decay = args.decay
 53 |         self.oracle_prob = args.oracle_prob
 54 |         self.start_dynamic_oracle = args.start_dynamic_oracle
 55 |         self.early_stopping = args.early_stopping
 56 | 
 57 |     def save(self):
 58 |         f = open(self.model_path + '/config.cfg', 'w')
 59 |         f.write("use_gpu =  " + str(self.use_gpu) + '\n')
 60 |         f.write("use_dynamic_oracle = "+ str(self.use_dynamic_oracle) + '\n')
 61 |         f.write("flag_oracle = " + str(self.flag_oracle) + '\n')
 62 |         f.write("word_embedding = " + str(self.word_embedding) + '\n')
 63 |         f.write("word_embedding_file = " + str(self.word_embedding_file) + '\n')
 64 |     
 65 |         f.write("train_path = " + str(self.train_path) + '\n')
 66 |         f.write("test_path = " + str(self.test_path) + '\n')
 67 |         f.write("dev_path = " + str(self.dev_path) + '\n')
 68 |         f.write("train_syn_feat_path = " + str(self.train_syn_feat_path) + '\n')
 69 |         f.write("test_syn_feat_path = " + str(self.test_syn_feat_path) + '\n')
 70 |         f.write("dev_syn_feat_path = " + str(self.dev_syn_feat_path) + '\n')
 71 |         f.write("model_path = " + str(self.model_path) + '\n')
 72 |         f.write("model_name = " + str(self.model_name) + '\n')
 73 |         f.write("alphabet_path = " + str(self.alphabet_path) + '\n')
 74 | 
 75 |         f.write("max_iter = " + str(self.max_iter) + '\n')
 76 |         f.write("word_dim = " + str(self.word_dim) + '\n')
 77 |         f.write("tag_dim = " + str(self.tag_dim) + '\n')
 78 |         f.write("etype_dim = " + str(self.etype_dim) + '\n')
 79 |         f.write("syntax_dim = " + str(self.syntax_dim) + '\n')
 80 |         f.write("max_sent_size = " + str(self.max_sent_size) + '\n')
 81 |         f.write("max_edu_size = " + str(self.max_edu_size) + '\n')
 82 |         f.write("max_state_size = " + str(self.max_state_size) + '\n')
 83 |         f.write("hidden_size = " + str(self.hidden_size) + '\n')
 84 |         
 85 |         f.write("freeze = " + str(self.freeze) + '\n')
 86 |         f.write("drop_prob = " + str(self.drop_prob) + '\n')
 87 |         f.write("num_layers = " + str(self.num_layers) + '\n')
 88 | 
 89 |         f.write("batch_size = " + str(self.batch_size) + '\n')
 90 |         f.write("opt = " + str(self.opt) + '\n')
 91 |         f.write("lr = " + str(self.lr) + '\n')
 92 |         f.write("ada_eps = " + str(self.ada_eps) + '\n')
 93 |         f.write("momentum = " + str(self.momentum) + '\n')
 94 |         f.write("beta1 = " + str(self.beta1) + '\n')
 95 |         f.write("beta2 = " + str(self.beta2) + '\n')
 96 |         f.write("gamma = " + str(self.gamma) + '\n')
 97 |         f.write("start_decay = " + str(self.start_decay) + '\n')
 98 | 
 99 |         f.write("clip = " + str(self.clip) + '\n')
100 | 
101 |         f.write("decay = " + str(self.decay) + '\n')
102 |         f.write("oracle_prob = " + str(self.oracle_prob) + '\n')
103 |         f.write("start_dynamic_oracle = " + str(self.start_dynamic_oracle) + '\n')
104 |         f.write("early_stopping = " + str(self.early_stopping) + '\n')
105 |         f.close()
106 | 
107 |     def load_config(self, path):
108 |         f = open(path, 'r')
109 |         self.use_gpu = f.readline().strip().split(' = ')[-1] == 'True'
110 |         self.use_dynamic_oracle = f.readline().strip().split(' = ')[-1] == 'True'
111 |         self.flag_oracle = f.readline().strip().split(' = ')[-1] == 'True'
112 |         self.word_embedding = f.readline().strip().split(' = ')[-1] 
113 |         self.word_embedding_file = f.readline().strip().split(' = ')[-1] 
114 |     
115 |         self.train_path = f.readline().strip().split(' = ')[-1] 
116 |         self.test_path = f.readline().strip().split(' = ')[-1] 
117 |         self.dev_path = f.readline().strip().split(' = ')[-1] 
118 |         self.train_syn_feat_path = f.readline().strip().split(' = ')[-1] 
119 |         self.test_syn_feat_path = f.readline().strip().split(' = ')[-1] 
120 |         self.dev_syn_feat_path = f.readline().strip().split(' = ')[-1] 
121 |         self.model_path = f.readline().strip().split(' = ')[-1] 
122 |         self.model_name = f.readline().strip().split(' = ')[-1] 
123 |         self.alphabet_path = f.readline().strip().split(' = ')[-1] 
124 | 
125 |         self.max_iter = int(f.readline().strip().split(' = ')[-1])
126 |         self.word_dim = int(f.readline().strip().split(' = ')[-1])
127 |         self.tag_dim = int(f.readline().strip().split(' = ')[-1])
128 |         self.etype_dim = int(f.readline().strip().split(' = ')[-1])
129 |         self.syntax_dim = int(f.readline().strip().split(' = ')[-1])
130 |         self.max_sent_size = int(f.readline().strip().split(' = ')[-1])
131 |         self.max_edu_size = int(f.readline().strip().split(' = ')[-1])
132 |         self.max_state_size = int(f.readline().strip().split(' = ')[-1])
133 |         self.hidden_size = int(f.readline().strip().split(' = ')[-1])
134 |         
135 |         self.freeze = f.readline().strip().split(' = ')[-1] == 'True'
136 |         self.drop_prob = float(f.readline().strip().split(' = ')[-1])
137 |         self.num_layers = int(f.readline().strip().split(' = ')[-1])
138 | 
139 |         self.batch_size = int(f.readline().strip().split(' = ')[-1])
140 |         self.opt = f.readline().strip().split(' = ')[-1] 
141 |         self.lr = float(f.readline().strip().split(' = ')[-1])
142 |         self.ada_eps = float(f.readline().strip().split(' = ')[-1])
143 |         self.momentum = float(f.readline().strip().split(' = ')[-1])
144 |         self.beta1 = float(f.readline().strip().split(' = ')[-1])
145 |         self.beta2 = float(f.readline().strip().split(' = ')[-1])
146 |         self.betas = (self.beta1, self.beta2)
147 |         self.gamma = float(f.readline().strip().split(' = ')[-1])
148 |         self.start_decay = int(f.readline().strip().split(' = ')[-1])
149 | 
150 |         self.clip = float(f.readline().strip().split(' = ')[-1])
151 | 
152 |         self.decay = int(f.readline().strip().split(' = ')[-1])
153 |         self.oracle_prob = float(f.readline().strip().split(' = ')[-1])
154 |         self.start_dynamic_oracle = int(f.readline().strip().split(' = ')[-1])
155 |         self.early_stopping = int(f.readline().strip().split(' = ')[-1])
156 |         f.close()
157 | 


--------------------------------------------------------------------------------
/NeuralRST/models/explorer.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | from NeuralRST.transition.action import CAction
  3 | from NeuralRST.in_out.instance import SubTree
  4 | 
  5 | class Explorer(object):
  6 |     def __init__(self, vocab):
  7 |         self.gold_action_alpha = vocab.gold_action_alpha
  8 |         self.action_label_alpha = vocab.action_label_alpha
  9 | 
 10 |     def subtree_loss(self, subtree, gold_tree):
 11 |         subtree_size = len(gold_tree)
 12 |         loss = 3
 13 |         for i in range(subtree_size):
 14 |             gold_subtree = gold_tree[i]
 15 |             if subtree.span_equal(gold_subtree):
 16 |                 loss -= 1
 17 |                 if subtree.nuclear == gold_subtree.nuclear:
 18 |                     loss -= 1
 19 |                     if subtree.relation == gold_subtree.relation:
 20 |                         loss -= 1
 21 |                 break
 22 |         return loss
 23 | 
 24 |     # CAction ac
 25 |     # Cstate error_cstate
 26 |     # SubTree[] gold_tree
 27 |     def nuclear_label_loss(self, ac, error_cstate, gold_tree):
 28 |         assert(error_cstate.stack_size >= 2)
 29 |         top0 = error_cstate.stack[error_cstate.stack_size - 1]
 30 |         top1 = error_cstate.stack[error_cstate.stack_size - 2]
 31 |         subtree0 = SubTree()
 32 |         subtree1 = SubTree()
 33 |         if ac.nuclear == CAction.NN:
 34 |             subtree0.edu_start = top0.edu_start
 35 |             subtree0.edu_end = top0.edu_end
 36 |             subtree0.nuclear = SubTree.NUCLEAR
 37 |             subtree0.relation = ac.label
 38 |             subtree1.edu_start = top1.edu_start
 39 |             subtree1.edu_end = top1.edu_end
 40 |             subtree1.nuclear = SubTree.NUCLEAR
 41 |             subtree1.relation = ac.label
 42 |         elif ac.nuclear == CAction.NS:
 43 |             subtree0.edu_start = top0.edu_start
 44 |             subtree0.edu_end = top0.edu_end
 45 |             subtree0.nuclear = SubTree.SATELLITE
 46 |             subtree0.relation = ac.label
 47 |             subtree1.edu_start = top1.edu_start
 48 |             subtree1.edu_end = top1.edu_end
 49 |             subtree1.nuclear = SubTree.NUCLEAR
 50 |             subtree1.relation = SubTree.SPAN
 51 |         elif ac.nuclear == CAction.SN:
 52 |             subtree0.edu_start = top0.edu_start
 53 |             subtree0.edu_end = top0.edu_end
 54 |             subtree0.nuclear = SubTree.NUCLEAR
 55 |             subtree0.relation = SubTree.SPAN
 56 |             subtree1.edu_start = top1.edu_start
 57 |             subtree1.edu_end = top1.edu_end
 58 |             subtree1.nuclear = SubTree.SATELLITE
 59 |             subtree1.relation = ac.label
 60 |         loss0 = self.subtree_loss(subtree0, gold_tree)
 61 |         loss1 = self.subtree_loss(subtree1, gold_tree)
 62 | 
 63 |         return loss0 + loss1
 64 | 
 65 | 
 66 |     def shift_loss(self, error_cstate, gold_tree):
 67 |         assert(error_cstate.stack_size >= 1)
 68 |         end = error_cstate.stack[error_cstate.stack_size - 1].edu_end
 69 |         gold_action_size = len(gold_tree)
 70 |         count = 0
 71 |         max_size = error_cstate.stack_size - 1
 72 |         for i in range(0, max_size):
 73 |             start = error_cstate.stack[i].edu_start
 74 |             for j in range(0, gold_action_size):
 75 |                 gold_subtree = gold_tree[j]
 76 |                 if start == gold_subtree.edu_start and end == gold_subtree.edu_end:
 77 |                     count += 1
 78 |         return count
 79 | 
 80 |     def reduce_loss(self,error_cstate, gold_tree):
 81 |         assert(error_cstate.stack_size >= 1)
 82 |         start = error_cstate.stack[error_cstate.stack_size - 1].edu_start
 83 |         gold_action_size = len(gold_tree)
 84 |         count = 0
 85 |         for i in range(error_cstate.next_index, error_cstate.edu_size):
 86 |             end = i
 87 |             for j in range(0, gold_action_size):
 88 |                 gold_subtree = gold_tree[j]
 89 |                 if start == gold_subtree.edu_start and end == gold_subtree.edu_end:
 90 |                     count += 1
 91 |         return count
 92 | 
 93 |     def get_reduce_candidate(self, error_cstate, gold_tree, candidate_actions):
 94 |         assert(error_cstate.stack_size >= 2)
 95 |         label_size = self.gold_action_alpha.size()
 96 |         tmp_acts = [] # 1 element is tuple (CAction, int)
 97 |         for nuclear in ['NN', 'NS', 'SN']:
 98 |             for label in self.action_label_alpha.alphas:
 99 |                 ac = CAction(CAction.REDUCE, nuclear, label)
100 |                 action_str = ac.get_str()
101 |                 pad_id = self.gold_action_alpha.alpha2id['PAD']
102 |                 if self.gold_action_alpha.word2id(action_str) != pad_id:
103 |                     loss = self.nuclear_label_loss(ac, error_cstate, gold_tree)
104 |                     tmp_acts.append((ac, loss))
105 |                     if loss == 0:
106 |                         candidate_actions.append(ac)
107 |                         return candidate_actions
108 |         assert(len(tmp_acts) > 0)
109 |         action_size = len(tmp_acts)
110 |         min_loss = tmp_acts[0][1]
111 |         min_index = 0
112 |         for i in range(1, action_size):
113 |             cur_iter = tmp_acts[i]
114 |             cur_loss = cur_iter[1]
115 |             if cur_loss < min_loss:
116 |                 min_index = i
117 |                 min_loss = cur_loss
118 | 
119 |         for i in range(action_size):
120 |             cur_iter = tmp_acts[i]
121 |             if cur_iter[1] == min_loss:
122 |                 candidate_actions.append(cur_iter[0])
123 |         return candidate_actions
124 | 
125 |     # parameter:
126 |     #  error_cstate (CState)
127 |     #  gold_tree (SubTree [])
128 |     # return CState optimal_action
129 |     def get_oracle(self, error_cstate, gold_tree):
130 |         candidate_actions = []
131 |         ac = CAction('', '', '')
132 |         if error_cstate.stack_size < 2:
133 |             if error_cstate.next_index == error_cstate.edu_size:
134 |                 ac.set(CAction.POP_ROOT, '', '')
135 |             else:
136 |                 ac.set(CAction.SHIFT, '', '')
137 |             candidate_actions.append(ac)
138 |         elif error_cstate.next_index == error_cstate.edu_size:
139 |             ac.set(CAction.REDUCE, '', '')
140 |         else:
141 |             shift_loss = self.shift_loss(error_cstate, gold_tree)
142 |             reduce_loss = self.reduce_loss(error_cstate, gold_tree)
143 |             if shift_loss < reduce_loss:
144 |                 ac.set(CAction.SHIFT, '', '')
145 |                 candidate_actions.append(ac)
146 |             elif shift_loss >= reduce_loss:
147 |                 ac.set(CAction.REDUCE, '', '')
148 |                 if shift_loss == reduce_loss:
149 |                     shift_action = CAction(CAction.SHIFT, '', '')
150 |                     candidate_actions.append(shift_action)
151 |         if ac.is_reduce():
152 |             candidate_actions = self.get_reduce_candidate(error_cstate, gold_tree, candidate_actions)
153 |         minimum = 0
154 |         maximum = len(candidate_actions)
155 |         rand_index = int(random.random() * (maximum-minimum))
156 |         # import ipdb; ipdb.set_trace()
157 |         return candidate_actions[rand_index]
158 |         
159 | 


--------------------------------------------------------------------------------
/NeuralRST/models/metric.py:
--------------------------------------------------------------------------------
 1 | class Metric(object):
 2 |     def __init__(self):
 3 |         self.overall_label_count = 0
 4 |         self.correct_label_count = 0
 5 |         self.predicated_label_count = 0
 6 | 
 7 |     def set(metric):
 8 |         self.overall_label_count = metric.overall_label_count
 9 |         self.correct_label_count = metric.correct_label_count
10 |         self.predicated_label_count = metric.predicated_label_count
11 | 
12 |     def get_accuracy(self):
13 |         if self.overall_label_count == 0:
14 |             return 1.0
15 |         if self.predicated_label_count == 0:
16 |             return 1.0 * self.correct_label_count / self.overall_label_count
17 |         else:
18 |             return self.correct_label_count * 2.0 / (self.overall_label_count + self.predicated_label_count)
19 | 
20 |     def get_f_measure(self):
21 |         return self.correct_label_count*2.0 / (self.overall_label_count + self.predicated_label_count)
22 | 
23 |     def print_metric(self):
24 |         if self.predicated_label_count == 0:
25 |             return ("Precision: P=" + str (self.correct_label_count) + "/" + str(self.overall_label_count) + \
26 |                        "="+ str(self.correct_label_count*1.0 / self.overall_label_count))
27 |         else:
28 |             return ("Recall: P=" + str(self.correct_label_count) + "/" + str(self.overall_label_count) + "=" + str(self.correct_label_count*1.0 / self.overall_label_count) + \
29 |                     ", " + "Precision: P=" + str(self.correct_label_count) + "/" + str(self.predicated_label_count) + "=" + str(self.correct_label_count*1.0 / self.predicated_label_count) + \
30 |                      ", " + "Fmeasure: " + str(self.correct_label_count*2.0 / (self.overall_label_count + self.predicated_label_count)))
31 | 
32 |     def bIdentical(self):
33 |         if self.predicated_label_count == 0:
34 |             if self.overall_label_count == self.correct_label_count:
35 |                 return True
36 |             return False
37 |         else:
38 |             if self.overall_label_count == self.correct_label_count and self.predicated_label_count == self.correct_label_count:
39 |                 return True
40 |             return False
41 | 
42 |     def reset(self):
43 |         self.overall_label_count = 0
44 |         self.correct_label_count = 0
45 |         self.predicated_label_count = 0
46 | 
47 | 


--------------------------------------------------------------------------------
/NeuralRST/models/vocab.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from NeuralRST.transition.action import CAction
 3 | 
 4 | class Vocab(object):
 5 |     def __init__(self, word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha):
 6 |         self.word_alpha = word_alpha
 7 |         self.tag_alpha = tag_alpha
 8 |         self.etype_alpha = etype_alpha
 9 |         self.gold_action_alpha = gold_action_alpha
10 |         self.action_label_alpha = action_label_alpha
11 | 
12 |         self.id2action = {}
13 |         for key in self.gold_action_alpha.id2alpha.keys():
14 | 
15 |             if key != self.gold_action_alpha.size():
16 |                 self.id2action[key] = self.get_action(key)
17 |         
18 |         self.mask_reduce = np.array([False] * self.gold_action_alpha.size())
19 |         self.mask_no_action = np.array([False] * self.gold_action_alpha.size())
20 |         self.mask_shift = np.array([False] * self.gold_action_alpha.size())
21 |         self.mask_pop_root = np.array([False] * self.gold_action_alpha.size())
22 |         for key in self.gold_action_alpha.id2alpha.keys():
23 |             if 'SHIFT' in self.gold_action_alpha.id2alpha[key]:
24 |                 self.mask_shift[key] = True
25 |             if 'REDUCE' in self.gold_action_alpha.id2alpha[key]:
26 |                 self.mask_reduce[key] = True
27 |             if 'POPROOT' in self.gold_action_alpha.id2alpha[key]:
28 |                 self.mask_pop_root[key] = True
29 |             if 'NOACTION' in self.gold_action_alpha.id2alpha[key]:
30 |                 self.mask_no_action[key] = True
31 | 
32 |     def get_action(self, id_selected_action):
33 |         mapper = {'SHIFT': 'SH', 'REDUCE': 'RD', 'POPROOT': 'PR', 'NOACTION': ''}
34 |         str_selected_action = self.gold_action_alpha.id2word(id_selected_action).split('_')
35 |         selected_action = CAction(mapper[str_selected_action[0]],
36 |                                   str_selected_action[1],
37 |                                   str_selected_action[2])
38 |         return selected_action
39 | 


--------------------------------------------------------------------------------
/NeuralRST/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/modules/__init__.py


--------------------------------------------------------------------------------
/NeuralRST/modules/embedding.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import torch.nn as nn
  4 | from torch.autograd import Variable
  5 | from torch.nn.parameter import Parameter
  6 | from torch.autograd import Variable
  7 | 
  8 | def assign_tensor(tensor, val):
  9 |     """
 10 |     copy val to tensor
 11 |     Args:
 12 |         tensor: an n-dimensional torch.Tensor or autograd.Variable
 13 |         val: an n-dimensional torch.Tensor to fill the tensor with
 14 | 
 15 |     Returns:
 16 |     """
 17 |     if isinstance(tensor, Variable):
 18 |         assign_tensor(tensor.data, val)
 19 |         return tensor
 20 |     return tensor.copy_(val)
 21 | 
 22 | 
 23 | class Embedding(nn.Module):
 24 |     r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
 25 |     This module is often used to store word embeddings and retrieve them using indices.
 26 |     The input to the module is a list of indices, and the output is the corresponding
 27 |     word embeddings.
 28 |     Args:
 29 |         num_embeddings (int): size of the dictionary of embeddings
 30 |         embedding_dim (int): the size of each embedding vector
 31 |         init_embedding (Tensor or Variable): If given, the embedding will be initialized with the given tensor.
 32 |         freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
 33 |         padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index.
 34 |         max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
 35 |         norm_type (float, optional): The p of the p-norm to compute for the max_norm option
 36 |         scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
 37 |                                                 the words in the mini-batch.
 38 |         sparse (boolean, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
 39 |                                     more details regarding sparse gradients.
 40 |     Attributes:
 41 |         weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
 42 |     Shape:
 43 |         - Input: LongTensor `(N1, N2, ...,Nm, W)`, N = mini-batch, W = number of indices to extract per mini-batch
 44 |         - Output: `(N1, N2, ..., Nm, W, embedding_dim)`
 45 |     Notes:
 46 |         Keep in mind that only a limited number of optimizers support
 47 |         sparse gradients: currently it's `optim.SGD` (`cuda` and `cpu`),
 48 |         and `optim.Adagrad` (`cpu`)
 49 |     """
 50 | 
 51 |     def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False, padding_idx=None,
 52 |                  max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False):
 53 |         super(Embedding, self).__init__()
 54 |         self.num_embeddings = num_embeddings
 55 |         self.embedding_dim = embedding_dim
 56 |         self.padding_idx = padding_idx
 57 |         self.max_norm = max_norm
 58 |         self.norm_type = norm_type
 59 |         self.scale_grad_by_freq = scale_grad_by_freq
 60 |         self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
 61 |         self.frozen = freeze
 62 |         self.sparse = sparse
 63 | 
 64 |         self.reset_parameters(init_embedding)
 65 | 
 66 |     def reset_parameters(self, init_embedding):
 67 |         if init_embedding is None:
 68 |             scale = np.sqrt(3.0 / self.embedding_dim)
 69 |             self.weight.data.uniform_(-scale, scale)
 70 |         else:
 71 |             assign_tensor(self.weight, init_embedding)
 72 |         if self.padding_idx is not None:
 73 |             self.weight.data[self.padding_idx].fill_(0)
 74 | 
 75 |         if self.frozen:
 76 |             if init_embedding is None:
 77 |                 raise Warning('Freeze embeddings which are randomly initialized.')
 78 |             self.weight.requires_grad = False
 79 | 
 80 |     def freeze(self):
 81 |         self.weight.requires_grad = False
 82 |         self.frozen = True
 83 | 
 84 |     def forward(self, input):
 85 |         padding_idx = self.padding_idx
 86 |         if padding_idx is None:
 87 |             padding_idx = -1
 88 | 
 89 |         input_size = input.size()
 90 |         if input.dim() > 2:
 91 |             num_inputs = int(np.prod(input_size[:-1]))
 92 |             input = input.view(num_inputs, input_size[-1])
 93 | 
 94 |         output_size = input_size + (self.embedding_dim,)
 95 |         return self._backend.Embedding.apply(
 96 |             input, self.weight,
 97 |             padding_idx, self.max_norm, self.norm_type,
 98 |             self.scale_grad_by_freq, self.sparse).view(output_size)
 99 | 
100 |     def __repr__(self):
101 |         s = '{name}({num_embeddings}, {embedding_dim}'
102 |         if self.padding_idx is not None:
103 |             s += ', padding_idx={padding_idx}'
104 |         if self.max_norm is not None:
105 |             s += ', max_norm={max_norm}'
106 |         if self.norm_type != 2:
107 |             s += ', norm_type={norm_type}'
108 |         if self.scale_grad_by_freq is not False:
109 |             s += ', scale_grad_by_freq={scale_grad_by_freq}'
110 |         if self.sparse is not False:
111 |             s += ', sparse=True'
112 |         s += ')'
113 |         return s.format(name=self.__class__.__name__, **self.__dict__)
114 | 


--------------------------------------------------------------------------------
/NeuralRST/modules/function_variational_rnn.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
  3 | from torch.nn import functional as F
  4 | 
  5 | 
  6 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
  7 |     if noise_in is not None:
  8 |         input = input * noise_in
  9 |     if noise_hidden is not None:
 10 |         hidden = hidden * noise_hidden
 11 |     hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 12 |     return hy
 13 | 
 14 | 
 15 | def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 16 |     if noise_in is not None:
 17 |         input = input * noise_in
 18 |     if noise_hidden is not None:
 19 |         hidden = hidden * noise_hidden
 20 |     hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 21 |     return hy
 22 | 
 23 | 
 24 | def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 25 |     input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 26 | 
 27 |     hx, cx = hidden
 28 |     hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 29 | 
 30 |     gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 31 | 
 32 |     ingate, forgetgate, cellgate, outgate = gates
 33 | 
 34 |     ingate = F.sigmoid(ingate)
 35 |     forgetgate = F.sigmoid(forgetgate)
 36 |     cellgate = F.tanh(cellgate)
 37 |     outgate = F.sigmoid(outgate)
 38 | 
 39 |     cy = (forgetgate * cx) + (ingate * cellgate)
 40 |     hy = outgate * F.tanh(cy)
 41 | 
 42 |     return hy, cy
 43 | 
 44 | 
 45 | def VarFastLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 46 |     if noise_in is not None:
 47 |         input = input * noise_in
 48 | 
 49 |     if input.is_cuda:
 50 |         igates = F.linear(input, w_ih)
 51 |         hgates = F.linear(hidden[0], w_hh) if noise_hidden is None else F.linear(hidden[0] * noise_hidden, w_hh)
 52 |         state = fusedBackend.LSTMFused.apply
 53 |         return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)
 54 | 
 55 |     hx, cx = hidden
 56 |     if noise_hidden is not None:
 57 |         hx = hx * noise_hidden
 58 |     gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
 59 | 
 60 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
 61 | 
 62 |     ingate = F.sigmoid(ingate)
 63 |     forgetgate = F.sigmoid(forgetgate)
 64 |     cellgate = F.tanh(cellgate)
 65 |     outgate = F.sigmoid(outgate)
 66 | 
 67 |     cy = (forgetgate * cx) + (ingate * cellgate)
 68 |     hy = outgate * F.tanh(cy)
 69 | 
 70 |     return hy, cy
 71 | 
 72 | 
 73 | def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 74 |     input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 75 |     hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden
 76 | 
 77 |     gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
 78 |     gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 79 |     i_r, i_i, i_n = gi
 80 |     h_r, h_i, h_n = gh
 81 | 
 82 |     resetgate = F.sigmoid(i_r + h_r)
 83 |     inputgate = F.sigmoid(i_i + h_i)
 84 |     newgate = F.tanh(i_n + resetgate * h_n)
 85 |     hy = newgate + inputgate * (hidden - newgate)
 86 | 
 87 |     return hy
 88 | 
 89 | 
 90 | def VarFastGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 91 |     if noise_in is not None:
 92 |         input = input * noise_in
 93 | 
 94 |     hx = hidden if noise_hidden is None else hidden * noise_hidden
 95 |     if input.is_cuda:
 96 |         gi = F.linear(input, w_ih)
 97 |         gh = F.linear(hx, w_hh)
 98 |         state = fusedBackend.GRUFused.apply
 99 |         return state(gi, gh, hidden) if b_ih is None else state(gi, gh, hidden, b_ih, b_hh)
100 | 
101 |     gi = F.linear(input, w_ih, b_ih)
102 |     gh = F.linear(hx, w_hh, b_hh)
103 |     i_r, i_i, i_n = gi.chunk(3, 1)
104 |     h_r, h_i, h_n = gh.chunk(3, 1)
105 | 
106 |     resetgate = F.sigmoid(i_r + h_r)
107 |     inputgate = F.sigmoid(i_i + h_i)
108 |     newgate = F.tanh(i_n + resetgate * h_n)
109 |     hy = newgate + inputgate * (hidden - newgate)
110 | 
111 |     return hy
112 | 
113 | 
def VarMaskedRecurrent(reverse=False):
    """Build a function that unrolls a cell over dimension 0 of its input.

    The returned ``forward(input, hidden, cell, mask)`` visits the steps in
    increasing order, or decreasing order when ``reverse`` is true. At each
    step the state is fully updated when the mask is absent or all entries
    exceed 0.5, blended by the mask when only some entries do, and left
    unchanged otherwise. It returns ``(final_hidden, outputs)`` where
    ``outputs`` holds one state per step in the original step order.
    """
    def forward(input, hidden, cell, mask):
        seq_len = input.size(0)
        order = reversed(range(seq_len)) if reverse else range(seq_len)

        collected = []
        for t in order:
            step_mask = None if mask is None else mask[t]
            if step_mask is None or step_mask.data.min() > 0.5:
                hidden = cell(input[t], hidden)
            elif step_mask.data.max() > 0.5:
                candidate = cell(input[t], hidden)
                if isinstance(hidden, tuple):
                    # LSTM state: blend hidden and cell parts separately.
                    h_old, c_old = hidden
                    h_new, c_new = candidate
                    hidden = (h_old + (h_new - h_old) * step_mask,
                              c_old + (c_new - c_old) * step_mask)
                else:
                    hidden = hidden + (candidate - hidden) * step_mask
            # For LSTM states only the hidden part is emitted as output.
            collected.append(hidden[0] if isinstance(hidden, tuple) else hidden)

        if reverse:
            collected.reverse()
        return hidden, torch.stack(collected, 0)

    return forward
140 | 
141 | 
def StackedRNN(inners, num_layers, lstm=False):
    """Build a function that runs ``num_layers`` layers of directional recurrences.

    ``inners`` holds one recurrence function per direction. The returned
    ``forward(input, hidden, cells, mask)`` feeds each layer the previous
    layer's per-direction outputs concatenated on the last dimension, and
    returns ``(next_hidden, output)``. With ``lstm=True`` the hidden state is
    a ``(h, c)`` pair on input and on output.
    """
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)

        if lstm:
            # Turn (all_h, all_c) into one (h, c) pair per layer/direction.
            hidden = list(zip(*hidden))

        final_states = []
        layer_input = input
        for layer in range(num_layers):
            base = layer * num_directions
            direction_outputs = []
            for offset, run_direction in enumerate(inners):
                idx = base + offset
                state, sequence = run_direction(layer_input, hidden[idx], cells[idx], mask)
                final_states.append(state)
                direction_outputs.append(sequence)
            layer_input = torch.cat(direction_outputs, layer_input.dim() - 1)

        if lstm:
            all_h, all_c = zip(*final_states)
            return (torch.stack(all_h, 0), torch.stack(all_c, 0)), layer_input
        return torch.stack(final_states, 0), layer_input

    return forward
175 | 
176 | 
def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
    """Build the full-sequence masked RNN function.

    The returned ``forward(input, cells, hidden, mask)`` runs a stacked
    recurrence (one forward direction, plus a reversed one when
    ``bidirectional``) and returns ``(output, next_hidden)``. With
    ``batch_first`` the first two dimensions of ``input``, ``mask`` and
    ``output`` are swapped around the recurrence.
    """
    directions = [VarMaskedRecurrent()]
    if bidirectional:
        directions.append(VarMaskedRecurrent(reverse=True))

    stacked = StackedRNN(tuple(directions), num_layers, lstm=lstm)

    def forward(input, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            mask = None if mask is None else mask.transpose(0, 1)

        next_hidden, output = stacked(input, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)
        return output, next_hidden

    return forward
203 | 
204 | 
def VarMaskedStep():
    """Build a function that applies a cell for exactly one masked step.

    The returned ``forward(input, hidden, cell, mask)`` fully updates the
    state when the mask is absent or all entries exceed 0.5, blends old and
    new state by the mask when only some entries do, and leaves the state
    unchanged otherwise. It returns ``(hidden, output)``.
    """
    def forward(input, hidden, cell, mask):
        fully_active = mask is None or mask.data.min() > 0.5
        if fully_active:
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            candidate = cell(input, hidden)
            if isinstance(hidden, tuple):
                # LSTM state: blend hidden and cell parts separately.
                h_old, c_old = hidden
                h_new, c_new = candidate
                hidden = (h_old + (h_new - h_old) * mask,
                          c_old + (c_new - c_old) * mask)
            else:
                hidden = hidden + (candidate - hidden) * mask

        # For LSTM states only the hidden part is emitted as output.
        if isinstance(hidden, tuple):
            return hidden, hidden[0]
        return hidden, hidden

    return forward
224 | 
225 | 
def StackedStep(layer, num_layers, lstm=False):
    """Build a function that runs one time step through ``num_layers`` layers.

    The returned ``forward(input, hidden, cells, mask)`` feeds each layer's
    output to the next layer and returns ``(next_hidden, output)``, where
    ``next_hidden`` stacks the per-layer states on a new leading dimension.
    With ``lstm=True`` the hidden state is a ``(h, c)`` pair on input and on
    output.
    """
    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)

        if lstm:
            # Turn (all_h, all_c) into one (h, c) pair per layer.
            hidden = list(zip(*hidden))

        states = []
        current = input
        for depth in range(num_layers):
            state, current = layer(current, hidden[depth], cells[depth], mask)
            states.append(state)

        if lstm:
            all_h, all_c = zip(*states)
            return (torch.stack(all_h, 0), torch.stack(all_c, 0)), current
        return torch.stack(states, 0), current

    return forward
251 | 
252 | 
def AutogradVarMaskedStep(num_layers=1, lstm=False):
    """Build the single-time-step masked RNN function.

    The returned ``forward(input, cells, hidden, mask)`` runs one masked step
    through ``num_layers`` stacked layers and returns ``(output, next_hidden)``.
    """
    stacked = StackedStep(VarMaskedStep(), num_layers, lstm=lstm)

    def forward(input, cells, hidden, mask):
        # The stacked step takes (hidden, cells) and returns hidden first.
        next_hidden, output = stacked(input, hidden, cells, mask)
        return output, next_hidden

    return forward
265 | 


--------------------------------------------------------------------------------
/NeuralRST/requirements.txt:
--------------------------------------------------------------------------------
 1 | backports.shutil-get-terminal-size==1.0.0
 2 | boto==2.49.0
 3 | boto3==1.9.93
 4 | botocore==1.12.93
 5 | bz2file==0.98
 6 | certifi==2018.11.29
 7 | chardet==3.0.4
 8 | decorator==4.3.2
 9 | docutils==0.14
10 | enum34==1.1.6
11 | futures==3.2.0
12 | gensim==3.7.1
13 | idna==2.8
14 | ipdb==0.11
15 | ipython==5.8.0
16 | ipython-genutils==0.2.0
17 | jmespath==0.9.3
18 | numpy==1.16.1
19 | pathlib2==2.3.3
20 | pexpect==4.6.0
21 | pickleshare==0.7.5
22 | prompt-toolkit==1.0.15
23 | ptyprocess==0.6.0
24 | Pygments==2.3.1
25 | python-dateutil==2.8.0
26 | PyYAML==3.13
27 | requests==2.21.0
28 | s3transfer==0.2.0
29 | scandir==1.9.0
30 | scipy==1.2.1
31 | simplegeneric==0.8.1
32 | six==1.12.0
33 | smart-open==1.8.0
34 | torch==0.3.1
35 | traitlets==4.3.2
36 | urllib3==1.24.1
37 | wcwidth==0.1.7
38 | 


--------------------------------------------------------------------------------
/NeuralRST/run_rst_parser.py:
--------------------------------------------------------------------------------
 1 | import sys, time
 2 | import numpy as np
 3 | import random
 4 | from datetime import datetime
 5 | 
 6 | sys.path.append(".")
 7 | 
 8 | import argparse
 9 | import torch
10 | import json
11 | 
12 | from in_out.reader import Reader
13 | from in_out.util import load_embedding_dict, get_logger
14 | from in_out.preprocess import create_alphabet
15 | from in_out.preprocess import batch_data_variable
16 | from models.vocab import Vocab
17 | from models.metric import Metric
18 | from models.config import Config
19 | from models.architecture import MainArchitecture
20 | 
21 | 
22 | main_path='/home/ffajri/'
def main():
    """Evaluate a trained RST parser on the test set named in a config file.

    Reads ``--config_path``, loads the alphabets and the saved network,
    predicts subtrees for the test instances batch by batch, and logs the
    span (S), nuclearity (N), relation (R) and full (F) metrics together with
    the elapsed time.
    """
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('--config_path', required=True)
    args = args_parser.parse_args()
    config = Config(None)
    config.load_config(args.config_path)

    logger = get_logger("RSTParser RUN", config.use_dynamic_oracle, config.model_path)
    word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(None, config.alphabet_path, logger)
    vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha)

    network = MainArchitecture(vocab, config)
    network.load_state_dict(torch.load(config.model_name))

    if config.use_gpu:
        network = network.cuda()
    network.eval()

    logger.info('Reading test instance')
    reader = Reader(config.test_path, config.test_syn_feat_path)
    test_instances = reader.read_data()
    time_start = datetime.now()
    batch_size = config.batch_size
    span = Metric(); nuclear = Metric(); relation = Metric(); full = Metric()
    predictions = []
    total_data_test = len(test_instances)
    for start in range(0, total_data_test, batch_size):
        # The last batch may be shorter than batch_size.
        end_index = min(start + batch_size, total_data_test)
        indices = np.array(range(start, end_index))
        subset_data_test = batch_data_variable(test_instances, indices, vocab, config)
        # With no gold actions supplied, loss() yields the predicted subtrees.
        prediction_of_subtrees = network.loss(subset_data_test, None)
        predictions += prediction_of_subtrees
    for i, instance in enumerate(test_instances):
        span, nuclear, relation, full = instance.evaluate(predictions[i], span, nuclear, relation, full)
    time_elapsed = datetime.now() - time_start
    m, s = divmod(time_elapsed.seconds, 60)
    logger.info('TEST is finished in {} mins {} secs'.format(m, s))
    logger.info("S: " + span.print_metric())
    logger.info("N: " + nuclear.print_metric())
    logger.info("R: " + relation.print_metric())
    logger.info("F: " + full.print_metric())
    # The leftover interactive debugger breakpoint was removed so the script
    # terminates on its own once the metrics are logged.
70 | if __name__ == '__main__':
71 |     main()
72 | 


--------------------------------------------------------------------------------
/NeuralRST/transition/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/NeuralRST/transition/__init__.py


--------------------------------------------------------------------------------
/NeuralRST/transition/action.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class CAction(object):
 3 |     """
 4 |         Four types of code:
 5 |          1. REDUCE = RD
 6 |          2. SHIFT = SH
 7 |          3. POP_ROOT = PR
 8 |          4. NO_ACTION = ''
 9 | 
10 |         Label is the relation, eg: cause, elab, back, same, attr etc.
11 |         There are 19 relations in our dataset
12 |         - label is String
13 |         - label_id is its id in integer
14 | 
15 |         Three types of Nuclear:
16 |          1. NN
17 |          2. NS
18 |          3. SN
19 |          4. ''
20 |     """
21 |     POP_ROOT='PR'
22 |     REDUCE='RD'
23 |     SHIFT='SH'
24 |     NO_ACTION=''
25 |     
26 |     NN='NN'
27 |     NS='NS'
28 |     SN='SN'
29 |     NO_NUCLEAR=''
30 |     
31 |     # All string except label_id
32 |     def __init__(self, code, nuclear, label):
33 |         self.code = code
34 |         self.label = label
35 |         self.nuclear = nuclear
36 |         self.label_id = -1
37 | 
38 |     def is_none(self):
39 |         return self.code == ''
40 |     def is_finish(self):
41 |         return self.code == 'PR'
42 |     def is_shift(self):
43 |         return self.code == 'SH'
44 |     def is_reduce(self):
45 |         return self.code == 'RD'
46 | 
47 |     def set_label_id(self, label_alpha):
48 |         # for leaf the id is set into -1)
49 |         self.label_id = label_alpha.get(self.label, -1)
50 | 
51 |     def get_str(self):
52 |         if self.is_shift():
53 |             return "SHIFT__"
54 |         elif self.is_reduce():
55 |             if self.nuclear == 'NN':
56 |                 return "REDUCE_NN_" + self.label
57 |             if self.nuclear == 'NS':
58 |                 return "REDUCE_NS_" + self.label
59 |             if self.nuclear == 'SN':
60 |                 return "REDUCE_SN_" + self.label
61 |         elif self.is_finish():
62 |             return "POPROOT__"
63 |         else:
64 |             return "NOACTION__"
65 |     
66 |     def set(self, code, nuclear, label):
67 |         self.code = code
68 |         self.nuclear = nuclear
69 |         self.label = label
70 | 
71 |     def set_from_object(self, ac):
72 |         self.code = ac.code
73 |         self.label = ac.label
74 |         self.nuclear = ac.nuclear
75 |         self.label_id = ac.label_id
76 | 
77 | 


--------------------------------------------------------------------------------
/NeuralRST/transition/atom_feature.py:
--------------------------------------------------------------------------------
 1 | class CNode(object):
 2 |     def __init__(self):
 3 |         self.nuclear = ''
 4 |         self.label = ''
 5 |         self.edu_start = -1
 6 |         self.edu_end = -1
 7 |         self.is_validate = False
 8 | 
 9 |     def clear(self):
10 |         self.nuclear = ''
11 |         self.label = ''
12 |         self.edu_start = -1
13 |         self.edu_end = -1
14 |         self.is_validate = False
15 | 
class AtomFeat:
    """Holder for the four feature slots s0, s1, s2 and q0."""

    def __init__(self):
        # Each slot starts as its own fresh, cleared node.
        self.s0, self.s1, self.s2, self.q0 = CNode(), CNode(), CNode(), CNode()

    def getFeat(self):
        """Return the slots as the tuple (s0, s1, s2, q0)."""
        features = (self.s0, self.s1, self.s2, self.q0)
        return features
25 | 


--------------------------------------------------------------------------------
/NeuralRST/transition/state.py:
--------------------------------------------------------------------------------
  1 | from NeuralRST.transition.atom_feature import CNode, AtomFeat
  2 | from NeuralRST.transition.action import CAction
  3 | from NeuralRST.in_out.instance import Instance
  4 | from NeuralRST.in_out.instance import SubTree
  5 | from NeuralRST.in_out.instance import CResult
  6 | 
  7 | import copy
  8 | import numpy as np
  9 | 
# Nuclearity values written to SubTree.nuclear by CState.get_result.
NUCLEAR = 'NUCLEAR'
SATELLITE = 'SATELLITE'
# Relation given to the nucleus side of an NS / SN reduce in CState.get_result.
SPAN = 'span'
# Number of CNode slots preallocated for the stack of every CState.
MAX_LENGTH= 512
 14 | class CState(object):
 15 |     def __init__(self):
 16 |         self.stack = [CNode() for i in range(MAX_LENGTH)] #list of CNode
 17 |         self.stack_size = 0 #int
 18 |         self.edu_size = 0 #int
 19 |         self.next_index = 0 #int
 20 |         self.pre_state = None #CState
 21 |         self.pre_action = CAction('', '', '') #CAction
 22 |         self.is_start = True
 23 |         self.atom_feat = AtomFeat() #AtomFeat
 24 | 
 25 |     def clear(self):
 26 |         self.stack_size = 0 #int
 27 |         self.edu_size = 0 #int
 28 |         self.next_index = 0 #int
 29 |         self.pre_state = None #CState
 30 |         self.pre_action = CAction('', '', '') #CAction
 31 |         self.is_start = True
 32 |         self.atom_feat = AtomFeat() #AtomFeat
 33 | 
 34 |     def ready(self, edu_size):
 35 |         self.edu_size = edu_size
 36 | 
 37 |     def is_end(self):
 38 |         if (self.pre_action.is_finish()):
 39 |             return True
 40 |         else:
 41 |             return False
 42 | 
 43 |     def copy_state(self, cstate):
 44 |         cstate.stack = copy.deepcopy(self.stack)
 45 |         cstate.edu_size = self.edu_size
 46 |         cstate.pre_state = self
 47 |     
 48 |     def done_mark(self):
 49 |         self.stack[self.stack_size].clear()
 50 | 
 51 |     def shift(self, cstate):
 52 |         cstate.stack_size = self.stack_size + 1
 53 |         cstate.next_index = self.next_index + 1
 54 |         self.copy_state(cstate)
 55 |         top = cstate.stack[cstate.stack_size - 1]
 56 |         top.clear()
 57 |         top.is_validate = True
 58 |         top.edu_start  = self.next_index
 59 |         top.edu_end = self.next_index
 60 |         
 61 |         cstate.pre_action.set('SH', '', '')
 62 |         cstate.done_mark()
 63 | 
 64 |     def reduce(self, cstate, nuclear, label):
 65 |         cstate.stack_size = self.stack_size - 1
 66 |         cstate.next_index = self.next_index
 67 |         self.copy_state(cstate)
 68 |         top0 = cstate.stack[self.stack_size - 1]
 69 |         top1 = cstate.stack[self.stack_size - 2]
 70 |         try:
 71 |             assert(top0.edu_start == top1.edu_end + 1)
 72 |             assert(top0.is_validate and top1.is_validate)
 73 |         except:
 74 |             import ipdb; ipdb.set_trace()
 75 |         top1.edu_end = top0.edu_end
 76 |         top1.nuclear = nuclear
 77 |         top1.label = label
 78 |         top0.clear()
 79 |         
 80 |         cstate.stack[self.stack_size - 1] = top0
 81 |         cstate.stack[self.stack_size - 2] = top1
 82 |         
 83 |         cstate.pre_action.set('RD', nuclear, label)
 84 |         cstate.done_mark()
 85 | 
 86 |     def pop_root(self, cstate):
 87 |         assert  self.stack_size == 1 and self.next_index == self.edu_size
 88 |         cstate.stack_size = 0
 89 |         cstate.next_index = self.edu_size
 90 |         self.copy_state(cstate)
 91 |         top0 = cstate.stack[self.stack_size - 1]
 92 |         # assert(top0.edu_start == 0 and top0.edu_end + 1 == self.edu_size)
 93 |         assert(top0.edu_start == 0)
 94 |         assert(top0.is_validate)
 95 |         top0.clear()
 96 |         
 97 |         cstate.stack[self.stack_size - 1] = top0
 98 |         cstate.pre_action.set('PR', '', '')
 99 |         cstate.done_mark()
100 | 
101 |     #cstate = CState
102 |     #ac = CAction
103 |     def move(self, cstate, ac):
104 |         cstate.is_start = False
105 |         if ac.is_shift():
106 |             self.shift(cstate)
107 |         elif ac.is_reduce():
108 |             self.reduce(cstate, ac.nuclear, ac.label)
109 |         elif ac.is_finish():
110 |             self.pop_root(cstate)
111 |         else:
112 |             raise Exception('Error Action!')
113 |         return cstate
114 | 
115 |     def get_result(self):
116 |         result = CResult()
117 |         state = self
118 |         while(not state.pre_state.is_start):
119 |             ac = state.pre_action
120 |             st = state.pre_state
121 |             if (ac.is_reduce()):
122 |                 assert(st.stack_size >= 2)
123 |                 right_node = st.stack[st.stack_size-1]
124 |                 left_node = st.stack[st.stack_size-2]
125 |                 left_subtree = SubTree()
126 |                 right_subtree = SubTree()
127 | 
128 |                 left_subtree.edu_start = left_node.edu_start
129 |                 left_subtree.edu_end = left_node.edu_end
130 | 
131 |                 right_subtree.edu_start = right_node.edu_start
132 |                 right_subtree.edu_end = right_node.edu_end
133 | 
134 |                 if ac.nuclear == 'NN':
135 |                     left_subtree.nuclear = NUCLEAR
136 |                     right_subtree.nuclear = NUCLEAR
137 |                     left_subtree.relation = ac.label
138 |                     right_subtree.relation = ac.label
139 |                 elif ac.nuclear == 'SN':
140 |                     left_subtree.nuclear = SATELLITE
141 |                     right_subtree.nuclear = NUCLEAR
142 |                     left_subtree.relation = ac.label
143 |                     right_subtree.relation = SPAN
144 |                 elif ac.nuclear == 'NS':
145 |                     left_subtree.nuclear = NUCLEAR
146 |                     right_subtree.nuclear =SATELLITE
147 |                     left_subtree.relation = SPAN
148 |                     right_subtree.relation = ac.label
149 |                 
150 |                 result.subtrees.insert(0, right_subtree)
151 |                 result.subtrees.insert(0, left_subtree)
152 |             state = state.pre_state
153 |         return result
154 |     
155 |     def allow_shift(self):
156 |         if self.next_index == self.edu_size:
157 |             return False
158 |         return True
159 |     
160 |     def allow_reduce(self):
161 |         if self.stack_size >= 2:
162 |             return True
163 |         return False
164 | 
165 |     def allow_pop_root(self):
166 |         if self.next_index == self.edu_size and self.stack_size == 1:
167 |             return True
168 |         return False
169 | 
170 |     def get_candidate_actions(self, vocab):
171 |         mask = np.array([False] * vocab.gold_action_alpha.size())
172 |         if self.allow_reduce():
173 |             mask = mask | vocab.mask_reduce
174 |         if self.is_end():
175 |             mask = mask | vocab.mask_no_action
176 |         if self.allow_shift():
177 |             mask = mask | vocab.mask_shift
178 |         if self.allow_pop_root():
179 |             mask = mask | vocab.mask_pop_root
180 |         return ~mask
181 | 
182 |     def prepare_index(self):
183 |         if self.stack_size > 0:
184 |             self.atom_feat.s0 = self.stack[self.stack_size - 1]
185 |         else:
186 |             self.atom_feat.s0 = None
187 |         if self.stack_size > 1:
188 |             self.atom_feat.s1 = self.stack[self.stack_size - 2]
189 |         else:
190 |             self.atom_feat.s1 = None
191 |         if self.stack_size > 2:
192 |             self.atom_feat.s2 = self.stack[self.stack_size - 3]
193 |         else:
194 |             self.atom_feat.s2 = None
195 |         if self.next_index >= 0 and self.next_index < self.edu_size:
196 |             self.atom_feat.q0 = self.next_index
197 |         else:
198 |             self.atom_feat.q0 = None
199 |         return self.atom_feat
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # RSTExtractor
 2 | 
 3 | This code is the combination of:
 4 | 1. NeuroNLP2 (https://github.com/XuezheMax/NeuroNLP2) -- paper: Deep Biaffine Attention for Neural Dependency Parsing (https://arxiv.org/abs/1611.01734).
 5 | 2. Neural RST Parser (https://github.com/fajri91/NeuralRST) -- paper: Transition-based Neural RST Parsing with Implicit Syntax Features (https://www.aclweb.org/anthology/C18-1047/).
 6 | 
 7 | This code is used to extract:
 8 | 1. Latent feature of discourse units.
 9 | 2. Shallow feature of discourse units.
10 | 
11 | For more technical details, please refer to our paper: 
12 | 
13 | Fajri Koto, Jey Han Lau, Timothy Baldwin. [Improved Document Modelling with a Neural Discourse Parser.](https://www.aclweb.org/anthology/U19-1010.pdf) In Proceedings of the 2019 Australasian Language Technology Workshop, Sydney.
14 | 
15 | ## Dependencies and Installation
16 | 1. Python 2.7
17 | 2. Run `pip install -r requirements.txt`
18 | 
19 | ## Pre-Extraction
20 | There are three main steps:
1. Run Stanford CoreNLP. After downloading the appropriate Stanford CoreNLP release, run `python corenlp.py --source=PATH_TO_YOUR_DOCUMENTS/* --target=PATH_TO_YOUR_OUTPUT`. Make sure all the necessary Stanford CoreNLP files are placed in this repo, in a folder named `stanford-corenlp`.
22 | 2. For the next two steps, please follow https://github.com/fajri91/DPLP for:
23 |   * Converting XML file to CoNLL format.
24 |   * Segmenting CoNLL file to get EDUs. The output is *.merge file.
25 | 
26 | ## Extraction
Now you are ready to extract latent/shallow features as well as the RST tree.
28 | 1. For latent feature, please run `python extract_latent_feature.py`
29 | 2. For shallow feature, please first run `python extract_tree.py` and after that run `python extract_shallow_feature.py`
30 | 
Note 1: Please manually adjust all PATHs in the code, as I haven't implemented command-line argument parsing (argparse) yet. <br />
32 | Note2: Our RST parser performance is similar to _Transition-based Neural RST Parsing with Implicit Syntax Features_ (https://www.aclweb.org/anthology/C18-1047/).
33 | 


--------------------------------------------------------------------------------
/biaffine_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import json
  4 | import numpy as np
  5 | from torch.autograd import Variable
  6 | from neuronlp2.io import get_logger, conllx_data
  7 | from neuronlp2.io import CoNLLXWriter, utils
  8 | from neuronlp2.tasks import parser
  9 | from neuronlp2.models import BiRecurrentConvBiAffine
 10 | 
# Special vocabulary symbols - we always put them at the start.
# Each symbol has a word, POS, type and character variant.
PAD = b"_PAD"
PAD_POS = b"_PAD_POS"
PAD_TYPE = b"_<PAD>"
PAD_CHAR = b"_PAD_CHAR"
ROOT = b"_ROOT"
ROOT_POS = b"_ROOT_POS"
ROOT_TYPE = b"_<ROOT>"
ROOT_CHAR = b"_ROOT_CHAR"
END = b"_END"
END_POS = b"_END_POS"
END_TYPE = b"_<END>"
END_CHAR = b"_END_CHAR"
_START_VOCAB = [PAD, ROOT, END]

# Reserved integer ids.
# NOTE(review): presumably these match the ids stored in the saved alphabets - confirm.
UNK_ID = 0
PAD_ID_WORD = 1
PAD_ID_CHAR = 1
PAD_ID_TAG = 0

NUM_SYMBOLIC_TAGS = 3

# Sentence-length thresholds; BiaffineModel.prepare_data compares each
# sentence length against these in order.
_buckets = [10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90, 100, 140, 200, 300]
 34 | 
 35 | 
 36 | class BiaffineModel(object):
 37 |     def __init__(self, model_path, model_name):
 38 |         print("................................................")
 39 |         print("LOADING Biaffine Model")
 40 |         alphabet_path = os.path.join(model_path, 'alphabets/')
 41 |         model_name = os.path.join(model_path, model_name)
 42 | 
 43 |         self.word_alpha, self.char_alpha, self.tag_alpha, self.type_alpha = conllx_data.create_alphabets(alphabet_path, None, data_paths=[None, None], max_vocabulary_size=50000, embedd_dict=None)
 44 |         self.id2word = {v: k for k, v in self.word_alpha.instance2index.iteritems()}
 45 |         
 46 |         num_words = self.word_alpha.size()
 47 |         num_chars = self.char_alpha.size()
 48 |         num_pos = self.tag_alpha.size()
 49 |         num_types = self.type_alpha.size()
 50 | 
 51 |         print("Word Alphabet Size: %d" % num_words)
 52 |         print("Character Alphabet Size: %d" % num_chars)
 53 |         print("POS Alphabet Size: %d" % num_pos)
 54 |         print("Type Alphabet Size: %d" % num_types)
 55 | 
 56 | 
 57 |         def load_model_arguments_from_json():
 58 |             arguments = json.load(open(arg_path, 'r'))
 59 |             return arguments['args'], arguments['kwargs']
 60 | 
 61 |         arg_path = model_name + '.arg.json'
 62 |         args, kwargs = load_model_arguments_from_json()
 63 |         self.network = BiRecurrentConvBiAffine(*args, **kwargs)
 64 |         self.network.load_state_dict(torch.load(model_name))
 65 |         
 66 |         self.network.id2word = self.id2word
 67 |         self.network.cuda()
 68 |         self.network.eval()
 69 | 
 70 |     def prepare_data(self, sentences, use_gpu=True):
 71 |         ret_value = []
 72 |         for sentence in sentences:
 73 |             inst_size = sentence.length()
 74 |             data = None
 75 |             max_len = 0
 76 |             bucket = 0
 77 |             for bucket_size in _buckets:
 78 |                 if inst_size < bucket_size:
 79 |                     bucket = bucket_size
 80 |                     data = [sentence.word_ids, sentence.seq_char_ids, sentence.tag_ids]
 81 |                     max_len = max([len(seq_char) for seq_char in sentence.seq_chars])
 82 |                     break
 83 |             if data is None: # meaning the sentence is too long, we cut it into 300 length
 84 |                 bucket = _buckets[-1]
 85 |                 data = [sentence.word_ids[:bucket], sentence.seq_char_ids[:bucket], sentence.tag_ids[:bucket]]
 86 |                 max_len = max([len(seq_char) for seq_char in sentence.seq_chars])
 87 |                 
 88 | 
 89 |             char_length = min(utils.MAX_CHAR_LENGTH, max_len + utils.NUM_CHAR_PAD)
 90 |             wid_inputs = np.empty([1, bucket], dtype=np.int64)
 91 |             cid_inputs = np.empty([1, bucket, char_length], dtype=np.int64)
 92 |             pid_inputs = np.empty([1, bucket], dtype=np.int64)
 93 | 
 94 |             masks = np.zeros([1, bucket], dtype=np.float32)
 95 |             single = np.zeros([1, bucket], dtype=np.int64)
 96 | 
 97 |             lengths = np.empty(bucket, dtype=np.int64)
 98 | 
 99 |             wids = data[0]
100 |             cid_seqs = data[1]
101 |             pids = data[2]
102 |             inst_size = len(wids)
103 |             lengths[0] = inst_size
104 |             # word ids
105 |             wid_inputs[0, :inst_size] = wids
106 |             wid_inputs[0, inst_size:] = PAD_ID_WORD
107 |             for c, cids in enumerate(cid_seqs):
108 |                 limit = len(cids)
109 |                 if limit > char_length: limit = char_length
110 |                 try:
111 |                     cid_inputs[0, c, :limit] = cids[:limit]
112 |                     cid_inputs[0, c, limit:] = PAD_ID_CHAR
113 |                 except:
114 |                     import ipdb; ipdb.set_trace()
115 |             cid_inputs[0, inst_size:, :] = PAD_ID_CHAR
116 |             # pos ids
117 |             pid_inputs[0, :inst_size] = pids
118 |             pid_inputs[0, inst_size:] = PAD_ID_TAG
119 |             # masks
120 |             masks[0, :inst_size] = 1.0
121 |             for j, wid in enumerate(wids):
122 |                 if self.word_alpha.is_singleton(wid):
123 |                     single[0, j] = 1
124 | 
125 |             words = Variable(torch.from_numpy(wid_inputs), volatile=False)
126 |             chars = Variable(torch.from_numpy(cid_inputs), volatile=False)
127 |             pos = Variable(torch.from_numpy(pid_inputs), volatile=False)
128 |             masks = Variable(torch.from_numpy(masks), volatile=False)
129 |             single = Variable(torch.from_numpy(single), volatile=False)
130 |             lengths = torch.from_numpy(lengths)
131 |             if use_gpu:
132 |                 words = words.cuda()
133 |                 chars = chars.cuda()
134 |                 pos = pos.cuda()
135 |                 masks = masks.cuda()
136 |                 single = single.cuda()
137 |                 lengths = lengths.cuda()
138 |             index = slice(0,1)
139 |             ret_value.append((words[index], chars[index], pos[index], masks[index], lengths[index], sentence.words, sentence.edu_ids))
140 |         return ret_value
141 | 
142 |     def get_syntax_feature(self, data_test, sentences):
143 |         sent = 0
144 |         syntax_features = []
145 |         for data in data_test:
146 |             cur_length = len(sentences[sent].words)
147 |             word, char, pos, masks, lengths, original_words, edu_ids = data
148 |             sent += 1
149 |             syntax_feature = self.network.get_syntax_feature(original_words, word, char, pos, mask=masks, length=lengths)
150 |             _ , sent_len, dim = syntax_feature.shape
151 |             if sent_len != cur_length:
152 |                 assert sent_len < cur_length
153 |                 diff = cur_length - sent_len
154 |                 zeros = Variable(torch.zeros(1, diff, dim)).type(torch.FloatTensor).cuda()
155 |                 syntax_feature = torch.cat([syntax_feature, zeros], dim=1)
156 |             syntax_features.append(syntax_feature)
157 |         return syntax_features
158 | 


--------------------------------------------------------------------------------
/corenlp.py:
--------------------------------------------------------------------------------
 1 | from subprocess import call
 2 | import os
 3 | import glob
 4 | import threading
 5 | import math
 6 | import argparse
 7 | 
 8 | 
 9 | scriptdir = '.'
10 | THREADS = 27
11 | 
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("-s", "--source", help="provide source path", type=str)
14 | parser.add_argument("-t", "--target", help="provide target path", type=str)
15 | 
16 | args = parser.parse_args()
17 | if args.source:
18 |     PATH=args.source
19 | if args.target:
20 |     TARGET=args.target
21 | 
22 | files = glob.glob(PATH)
23 | targets = glob.glob(TARGET+'/*')
24 | TOTAL_FILES = len(files)
25 | BATCH_SIZE = int(math.ceil(TOTAL_FILES/THREADS))
26 | 
def run_thread(ftmp):
    """Run Stanford CoreNLP on the files listed in ``ftmp``.

    XML output goes to the module-level ``TARGET`` directory.

    Args:
        ftmp: Path of a text file with one input path per line
            (passed to CoreNLP as ``-filelist``).
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact. The classpath wildcard is expanded by java
    # itself, which is why it was quoted in the former shell command.
    call([
        '/usr/bin/java', '-mx150g',
        '-cp', 'stanford-corenlp/*',
        'edu.stanford.nlp.pipeline.StanfordCoreNLP',
        '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse',
        '-ssplit.eolonly',
        '-tokenize.whitespace', 'true',
        '-filelist', ftmp,
        '-outputFormat', 'xml',
        '-outputDirectory', TARGET,
    ])
29 | 
def generate_listfile(start, end, ftmp):
    """Write the still-unprocessed files of one batch to ``ftmp``.

    A file counts as processed when ``TARGET/<basename>.xml`` is present in
    the module-level ``targets`` collection.

    Args:
        start: Index of the first file of the batch in ``files``.
        end: Index one past the last file of the batch.
        ftmp: Path of the list file to (over)write, one input path per line.
    """
    # The with-block closes the list file even if a write fails.
    with open(ftmp, 'w') as listfile:
        for path in files[start:end]:
            expected_output = TARGET + '/' + path.split('/')[-1] + '.xml'
            if expected_output not in targets:
                listfile.write(path + '\n')
38 | 
# Split the input files into consecutive batches and annotate each batch in
# its own thread.
for i in range(THREADS):
    start = i * BATCH_SIZE
    end = min(start + BATCH_SIZE, TOTAL_FILES)
    if start >= end:
        # No files left for this and all later batches; do not start a
        # (very memory-hungry) JVM just to read an empty file list.
        break

    ftmp = 'tmp'+str(i)+'.txt'
    generate_listfile(start,end,ftmp)

    t = threading.Thread(target=run_thread, args=(ftmp,))
    t.start()
    print (start, end)
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/extract_latent_feature.py:
--------------------------------------------------------------------------------
  1 | import sys, time
  2 | import numpy as np
  3 | import random
  4 | import re, os
  5 | import argparse
  6 | from sentence import Sentence, Instance, EDU
  7 | from rst_model import RSTModel
  8 | from biaffine_model import BiaffineModel
  9 | 
sys.path.append(".")
# Symbolic tokens that form_sentence adds when symbolic_root / symbolic_end
# are requested.
ROOT = b"_ROOT"
ROOT_POS = b"_ROOT_POS"
ROOT_CHAR = b"_ROOT_CHAR"
END = b"_END"
END_POS = b"_END_POS"
END_CHAR = b"_END_CHAR"

# NOTE(review): the id constants and NUM_SYMBOLIC_TAGS are not referenced in
# this script; presumably copied from biaffine_model.py - confirm.
UNK_ID = 0
PAD_ID_WORD = 1
PAD_ID_CHAR = 1
PAD_ID_TAG = 0

NUM_SYMBOLIC_TAGS = 3

# Regular expressions used to normalize digits.
# form_sentence replaces every digit of a word with "0" before the lookup.
DIGIT_RE = re.compile(br"\d")
# Hard-coded locations; adjust them to the local setup before running.
BIAFFINE_PATH = "/home/ffajri/Workspace/RSTExtractor/models/biaffine"
BIAFFINE_MODEL = "network.pt"
RST_CONFIG_PATH = "/home/ffajri/Workspace/RSTExtractor/models/rst/config.cfg"
# Glob pattern for the input files (run_thread strips a '.merge' suffix).
DATA_PATH = '/home/ffajri/Data/Petition/UK/processed/merge/*'
# Upper bound on the number of worker processes started at the end of the file.
THREADS = 10

# write_to_file stores the token text under 'output/' and the EDU feature
# arrays under 'output_enc/'.
if not os.path.exists('output'):
    os.makedirs('output')
if not os.path.exists('output_enc'):
    os.makedirs('output_enc')
 37 | 
 38 | def form_sentence(lines, word_alpha, char_alpha, tag_alpha, symbolic_root=False, symbolic_end=False):
 39 |     words = []
 40 |     word_ids = []
 41 |     seq_chars = []
 42 |     seq_char_ids = []
 43 |     tags = []
 44 |     tag_ids = []
 45 |     edu_ids = []
 46 |         
 47 |     if symbolic_root:
 48 |         words.append(ROOT)
 49 |         word_ids.append(word_alpha.get_index(ROOT))
 50 |         seq_chars.append([ROOT_CHAR, ])
 51 |         seq_char_ids.append([char_alpha.get_index(ROOT_CHAR), ])
 52 |         tags.append(ROOT_POS)
 53 |         tag_ids.append(tag_alpha.get_index(ROOT_POS))
 54 | 
 55 |     for line in lines:
 56 |         chars = []
 57 |         char_ids = []
 58 |         data = line.strip().split('\t')
 59 |         word = DIGIT_RE.sub(b"0", data[2])
 60 |         word_id = word_alpha.get_index(word)
 61 |         for c in words:
 62 |             chars.append(c)
 63 |             char_ids.append(char_alpha.get_index(c))
 64 |         tag = '$' if data[4] == '#' else data[4]
 65 |         tag_id = tag_alpha.get_index(tag)
 66 |         edu_id = int(data[9])
 67 | 
 68 |         words.append(word)
 69 |         word_ids.append(word_id)
 70 |         seq_chars.append(chars)
 71 |         seq_char_ids.append(char_ids)
 72 |         tags.append(tag)
 73 |         tag_ids.append(tag_id)
 74 |         edu_ids.append(edu_id)
 75 |     
 76 |     if symbolic_end:
 77 |         words.append(END)
 78 |         word_ids.append(word_alpha.get_index(END))
 79 |         seq_chars.append([END_CHAR, ])
 80 |         seq_char_ids.append([char_alpha.get_index(END_CHAR), ])
 81 |         tags.append(END_POS)
 82 |         tag_ids.append(tag_alpha.get_index(END_POS))
 83 |     
 84 |     return Sentence(words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids)
 85 | 
 86 | def data_reader(file_path, biaffine):
 87 |     f = open(file_path, 'r')
 88 |     sentences = []
 89 |     lines = []
 90 |     for line in f.readlines():
 91 |         if line.strip() == '':
 92 |             sentences.append(form_sentence(lines, biaffine.word_alpha, biaffine.char_alpha, biaffine.tag_alpha))
 93 |             lines = []
 94 |         else:
 95 |             lines.append(line)
 96 |     data = biaffine.prepare_data(sentences)
 97 |     syntax_features = biaffine.get_syntax_feature(data, sentences)
 98 |     
 99 |     for i in range(len(sentences)):
100 |         assert(len(sentences[i].words) == syntax_features[i].shape[1])
101 |     instance = Instance(sentences, syntax_features)
102 |     return instance
103 | 
def write_to_file(filename, instance, edu_features):
    """Store the EDU-tagged tokens and the EDU feature matrix of a document.

    Args:
        filename: Base name used under both ``output/`` and ``output_enc/``.
        instance: Document object whose ``edus`` each expose ``words``.
        edu_features: Tensor/Variable with the EDU representations.

    Writes ``output/<filename>`` with every word as ``word|edu_index``
    followed by a space, and saves the features with ``np.save`` to
    ``output_enc/<filename>``.
    """
    # The with-block closes the text file even if a write fails.
    with open('output/'+filename, 'w') as f1:
        for idx, edu in enumerate(instance.edus):
            for word in edu.words:
                f1.write(word+'|'+str(idx)+' ')
    # .cpu() is a no-op for CPU tensors and is required before .numpy()
    # when the features live on the GPU.
    np.save('output_enc/'+filename, edu_features.data.cpu().numpy())
113 | 
114 | import glob
115 | import threading
116 | import math
117 | from multiprocessing import Process
# All input files matched by DATA_PATH; split across worker processes below.
files = glob.glob(DATA_PATH)
119 | 
def run_thread(files):
    """Worker entry point: extract EDU representations for ``files``.

    Both models are loaded once per worker. For every path the document is
    read, passed through the RST model, and the resulting EDU features
    (reshaped to one row per EDU) are written with ``write_to_file`` under
    the file's base name without ``.merge``.
    """
    rst = RSTModel(RST_CONFIG_PATH)
    biaffine = BiaffineModel(BIAFFINE_PATH, BIAFFINE_MODEL)
    for merge_path in files:
        base_name = merge_path.split('/')[-1]
        out_name = base_name.replace('.merge', '')
        document = data_reader(merge_path, biaffine)
        batch = rst.prepare_data([document], 1)
        edu_repr = rst.get_edu_representation(batch)
        per_edu = edu_repr.view(len(document.edus), -1)
        write_to_file(out_name, document, per_edu)
129 | 
# Guard the fan-out so that worker processes which re-import this module
# (multiprocessing "spawn" start method) do not start workers themselves.
if __name__ == '__main__':
    # Consecutive, equally sized slices of `files`, one per worker process.
    size = int(math.ceil(1.0*len(files)/THREADS))
    processes = []
    for i in range(THREADS):
        start = i * size
        end = min(start + size, len(files))

        process = Process(target=run_thread, args=(files[start:end],))
        process.start()
        processes.append(process)
        # Stop early once the last file has been handed out.
        if end == len(files):
            break
    for process in processes:
        process.join()
147 | 


--------------------------------------------------------------------------------
/extract_shallow_feature.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import glob
 3 | from NeuralRST.in_out.instance import CResult
 4 | from NeuralRST.in_out.rst_feature import RSTFeature
 5 | import threading
 6 | import math
 7 | from multiprocessing import Process
 8 | 
# Upper bound on the number of worker processes started below.
THREADS = 40
# Glob pattern for the input files; run_thread loads each one with np.load.
SOURCE_PATH = '/home/ffajri/Data/Petition/US/processed/output_tree/*'
# Output prefix; run_thread saves to TARGET_PATH + <input name without '.npy'>.
TARGET_PATH = '/home/ffajri/Data/Petition/US/processed/output_shallow/'
allfiles = glob.glob(SOURCE_PATH)
13 | 
def run_thread(files):
    """Worker entry point: turn saved subtrees into heuristic RST features.

    For every path in ``files`` the stored subtrees are loaded, a tree is
    rebuilt through ``CResult.obtain_tree`` and the result of
    ``RSTFeature.generate_heuristic_feature`` is saved with ``np.save`` under
    ``TARGET_PATH`` using the input's base name without ``.npy``.
    """
    for npy_path in files:
        stem = npy_path.split('/')[-1].replace('.npy', '')
        parsed = CResult()
        parsed.subtrees = list(np.load(npy_path))
        tree = parsed.obtain_tree()
        extractor = RSTFeature()
        np.save(TARGET_PATH+stem, extractor.generate_heuristic_feature(tree))
23 | 
# Guard the fan-out so that worker processes which re-import this module
# (multiprocessing "spawn" start method) do not start workers themselves.
if __name__ == '__main__':
    # Consecutive, equally sized slices of `allfiles`, one per worker process.
    size = int(math.ceil(1.0*len(allfiles)/THREADS))
    processes = []
    for i in range(THREADS):
        start = i * size
        end = min(start + size, len(allfiles))

        process = Process(target=run_thread, args=(allfiles[start:end],))
        process.start()
        processes.append(process)
        # Stop early once the last file has been handed out.
        if end == len(allfiles):
            break
    for process in processes:
        process.join()
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/extract_tree.py:
--------------------------------------------------------------------------------
  1 | import sys, time
  2 | import numpy as np
  3 | import random
  4 | import re, os
  5 | from datetime import datetime
  6 | import argparse
  7 | from sentence import Sentence, Instance, EDU
  8 | from rst_model import RSTModel
  9 | from biaffine_model import BiaffineModel
 10 | import glob
 11 | import threading
 12 | import math
 13 | from multiprocessing import Process
 14 | 
sys.path.append(".")
# Symbolic tokens that form_sentence adds when symbolic_root / symbolic_end
# are requested.
ROOT = b"_ROOT"
ROOT_POS = b"_ROOT_POS"
ROOT_CHAR = b"_ROOT_CHAR"
END = b"_END"
END_POS = b"_END_POS"
END_CHAR = b"_END_CHAR"

# NOTE(review): the id constants and NUM_SYMBOLIC_TAGS are not referenced in
# this script; presumably copied from biaffine_model.py - confirm.
UNK_ID = 0
PAD_ID_WORD = 1
PAD_ID_CHAR = 1
PAD_ID_TAG = 0

NUM_SYMBOLIC_TAGS = 3

# Regular expressions used to normalize digits.
# form_sentence replaces every digit of a word with "0" before the lookup.
DIGIT_RE = re.compile(br"\d")
# Hard-coded locations; adjust them to the local setup before running.
BIAFFINE_PATH = "/home/ffajri/Workspace/RSTExtractor/models/biaffine"
BIAFFINE_MODEL = "network.pt"
RST_CONFIG_PATH = "/home/ffajri/Workspace/RSTExtractor/models/rst/config.cfg"
# Glob pattern for the input files (run_thread strips a '.merge' suffix).
DATA_PATH = '/home/ffajri/Data/Petition/US/processed/merge/*'
# Upper bound on the number of worker processes started at the end of the file.
THREADS = 10

# run_thread saves one tree per input file under 'output_tree/'.
if not os.path.exists('output_tree'):
    os.makedirs('output_tree')
 40 | 
 41 | def form_sentence(lines, word_alpha, char_alpha, tag_alpha, symbolic_root=False, symbolic_end=False):
 42 |     words = []
 43 |     word_ids = []
 44 |     seq_chars = []
 45 |     seq_char_ids = []
 46 |     tags = []
 47 |     tag_ids = []
 48 |     edu_ids = []
 49 |         
 50 |     if symbolic_root:
 51 |         words.append(ROOT)
 52 |         word_ids.append(word_alpha.get_index(ROOT))
 53 |         seq_chars.append([ROOT_CHAR, ])
 54 |         seq_char_ids.append([char_alpha.get_index(ROOT_CHAR), ])
 55 |         tags.append(ROOT_POS)
 56 |         tag_ids.append(tag_alpha.get_index(ROOT_POS))
 57 | 
 58 |     for line in lines:
 59 |         chars = []
 60 |         char_ids = []
 61 |         data = line.strip().split('\t')
 62 |         word = DIGIT_RE.sub(b"0", data[2])
 63 |         word_id = word_alpha.get_index(word)
 64 |         for c in words:
 65 |             chars.append(c)
 66 |             char_ids.append(char_alpha.get_index(c))
 67 |         tag = '$' if data[4] == '#' else data[4]
 68 |         tag_id = tag_alpha.get_index(tag)
 69 |         edu_id = int(data[9])
 70 | 
 71 |         words.append(word)
 72 |         word_ids.append(word_id)
 73 |         seq_chars.append(chars)
 74 |         seq_char_ids.append(char_ids)
 75 |         tags.append(tag)
 76 |         tag_ids.append(tag_id)
 77 |         edu_ids.append(edu_id)
 78 |     
 79 |     if symbolic_end:
 80 |         words.append(END)
 81 |         word_ids.append(word_alpha.get_index(END))
 82 |         seq_chars.append([END_CHAR, ])
 83 |         seq_char_ids.append([char_alpha.get_index(END_CHAR), ])
 84 |         tags.append(END_POS)
 85 |         tag_ids.append(tag_alpha.get_index(END_POS))
 86 |     return Sentence(words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids)
 87 | 
 88 | def data_reader(file_path, biaffine):
 89 |     f = open(file_path, 'r')
 90 |     sentences = []
 91 |     lines = []
 92 |     for line in f.readlines():
 93 |         if line.strip() == '':
 94 |             sentences.append(form_sentence(lines, biaffine.word_alpha, biaffine.char_alpha, biaffine.tag_alpha))
 95 |             lines = []
 96 |         else:
 97 |             lines.append(line)
 98 |     data = biaffine.prepare_data(sentences)
 99 |     syntax_features = biaffine.get_syntax_feature(data, sentences)
100 |     
101 |     for i in range(len(sentences)):
102 |         assert(len(sentences[i].words) == syntax_features[i].shape[1])
103 |     instance = Instance(sentences, syntax_features)
104 |     return instance
105 | 
# All input files matched by DATA_PATH; split across worker processes below.
files=glob.glob(DATA_PATH)
107 | 
def run_thread(files):
    """Worker entry point: parse every file in ``files`` into an RST tree.

    Both models are loaded once per worker. For every path the document is
    read, passed through the RST model, and the first subtree result is
    saved under ``output_tree/`` using the file's base name without
    ``.merge``.
    """
    rst = RSTModel(RST_CONFIG_PATH)
    biaffine = BiaffineModel(BIAFFINE_PATH, BIAFFINE_MODEL)
    for merge_path in files:
        base_name = merge_path.split('/')[-1]
        out_name = base_name.replace('.merge', '')
        document = data_reader(merge_path, biaffine)
        batch = rst.prepare_data([document], 1)
        subtrees = rst.get_subtree(batch)
        subtrees[0].save('output_tree/' + out_name)
117 | 
# Guard the fan-out so that worker processes which re-import this module
# (multiprocessing "spawn" start method) do not start workers themselves.
if __name__ == '__main__':
    # Consecutive, equally sized slices of `files`, one per worker process.
    size = int(math.ceil(1.0*len(files)/THREADS))
    processes = []
    for i in range(THREADS):
        start = i * size
        end = min(start + size, len(files))

        process = Process(target=run_thread, args=(files[start:end],))
        process.start()
        processes.append(process)
        # Stop early once the last file has been handed out.
        if end == len(files):
            break
    for process in processes:
        process.join()
135 | 


--------------------------------------------------------------------------------
/models/biaffine/alphabets/character.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "instances": [
  3 |         "_PAD_CHAR", 
  4 |         "_ROOT_CHAR", 
  5 |         "_END_CHAR", 
  6 |         "F", 
  7 |         "o", 
  8 |         "r", 
  9 |         "m", 
 10 |         "e", 
 11 |         "U", 
 12 |         ".", 
 13 |         "N", 
 14 |         "A", 
 15 |         "b", 
 16 |         "a", 
 17 |         "s", 
 18 |         "d", 
 19 |         "J", 
 20 |         "n", 
 21 |         "K", 
 22 |         "i", 
 23 |         "k", 
 24 |         "p", 
 25 |         "t", 
 26 |         "c", 
 27 |         ",", 
 28 |         "C", 
 29 |         "`", 
 30 |         "l", 
 31 |         "G", 
 32 |         "g", 
 33 |         "'", 
 34 |         "u", 
 35 |         "O", 
 36 |         "7", 
 37 |         "f", 
 38 |         "H", 
 39 |         "h", 
 40 |         "-", 
 41 |         ":", 
 42 |         "I", 
 43 |         "w", 
 44 |         "T", 
 45 |         "y", 
 46 |         "v", 
 47 |         "E", 
 48 |         "S", 
 49 |         "L", 
 50 |         "P", 
 51 |         "W", 
 52 |         "x", 
 53 |         "R", 
 54 |         "B", 
 55 |         "3", 
 56 |         "6", 
 57 |         "%", 
 58 |         "1", 
 59 |         "2", 
 60 |         "5", 
 61 |         "9", 
 62 |         "0", 
 63 |         "$", 
 64 |         "4", 
 65 |         "8", 
 66 |         "M", 
 67 |         "Y", 
 68 |         "D", 
 69 |         "q", 
 70 |         "Q", 
 71 |         "X", 
 72 |         "&", 
 73 |         "z", 
 74 |         "j", 
 75 |         "/", 
 76 |         "{", 
 77 |         "V", 
 78 |         "}", 
 79 |         "?", 
 80 |         ";", 
 81 |         "!", 
 82 |         "Z", 
 83 |         "#", 
 84 |         "*", 
 85 |         "=", 
 86 |         "@"
 87 |     ], 
 88 |     "instance2index": {
 89 |         "m": 7, 
 90 |         "M": 64, 
 91 |         "_ROOT_CHAR": 2, 
 92 |         "!": 79, 
 93 |         "#": 81, 
 94 |         "%": 55, 
 95 |         "$": 61, 
 96 |         "'": 31, 
 97 |         "&": 70, 
 98 |         "*": 82, 
 99 |         "-": 38, 
100 |         ",": 25, 
101 |         "/": 73, 
102 |         ".": 10, 
103 |         "1": 56, 
104 |         "0": 60, 
105 |         "3": 53, 
106 |         "2": 57, 
107 |         "5": 58, 
108 |         "4": 62, 
109 |         "7": 34, 
110 |         "6": 54, 
111 |         "9": 59, 
112 |         "8": 63, 
113 |         ";": 78, 
114 |         ":": 39, 
115 |         "=": 83, 
116 |         "?": 77, 
117 |         "A": 12, 
118 |         "@": 84, 
119 |         "C": 26, 
120 |         "B": 52, 
121 |         "E": 45, 
122 |         "D": 66, 
123 |         "G": 29, 
124 |         "F": 4, 
125 |         "I": 40, 
126 |         "H": 36, 
127 |         "K": 19, 
128 |         "J": 17, 
129 |         "_END_CHAR": 3, 
130 |         "L": 47, 
131 |         "O": 33, 
132 |         "N": 11, 
133 |         "Q": 68, 
134 |         "P": 48, 
135 |         "S": 46, 
136 |         "R": 51, 
137 |         "U": 9, 
138 |         "T": 42, 
139 |         "W": 49, 
140 |         "V": 75, 
141 |         "Y": 65, 
142 |         "X": 69, 
143 |         "Z": 80, 
144 |         "a": 14, 
145 |         "`": 27, 
146 |         "c": 24, 
147 |         "b": 13, 
148 |         "e": 8, 
149 |         "d": 16, 
150 |         "g": 30, 
151 |         "f": 35, 
152 |         "i": 20, 
153 |         "h": 37, 
154 |         "k": 21, 
155 |         "j": 72, 
156 |         "_PAD_CHAR": 1, 
157 |         "l": 28, 
158 |         "o": 5, 
159 |         "n": 18, 
160 |         "q": 67, 
161 |         "p": 22, 
162 |         "s": 15, 
163 |         "r": 6, 
164 |         "u": 32, 
165 |         "t": 23, 
166 |         "w": 41, 
167 |         "v": 44, 
168 |         "y": 43, 
169 |         "x": 50, 
170 |         "{": 74, 
171 |         "z": 71, 
172 |         "}": 76
173 |     }
174 | }


--------------------------------------------------------------------------------
/models/biaffine/alphabets/pos.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "instances": [
  3 |         "_PAD_POS", 
  4 |         "_ROOT_POS", 
  5 |         "_END_POS", 
  6 |         "JJ", 
  7 |         "NNP", 
  8 |         ",", 
  9 |         "IN", 
 10 |         "DT", 
 11 |         "``", 
 12 |         "NN", 
 13 |         "''", 
 14 |         "CD", 
 15 |         "HYPH", 
 16 |         ":", 
 17 |         "PRP", 
 18 |         "VBP", 
 19 |         "VBD", 
 20 |         ".", 
 21 |         "MD", 
 22 |         "VB", 
 23 |         "TO", 
 24 |         "CC", 
 25 |         "RB", 
 26 |         "VBG", 
 27 |         "NNPS", 
 28 |         "PRP$", 
 29 |         "NNS", 
 30 |         "$", 
 31 |         "VBN", 
 32 |         "POS", 
 33 |         "VBZ", 
 34 |         "JJR", 
 35 |         "RBR", 
 36 |         "-LRB-", 
 37 |         "-RRB-", 
 38 |         "WDT", 
 39 |         "WP", 
 40 |         "WRB", 
 41 |         "SYM", 
 42 |         "RP", 
 43 |         "EX", 
 44 |         "JJS", 
 45 |         "LS", 
 46 |         "RBS", 
 47 |         "PDT", 
 48 |         "FW", 
 49 |         "WP$", 
 50 |         "UH", 
 51 |         "NFP", 
 52 |         "AFX"
 53 |     ], 
 54 |     "instance2index": {
 55 |         "PRP$": 25, 
 56 |         "VBG": 23, 
 57 |         "VBD": 16, 
 58 |         "NFP": 48, 
 59 |         "``": 8, 
 60 |         "_ROOT_POS": 1, 
 61 |         "''": 10, 
 62 |         "VBP": 15, 
 63 |         "VBN": 28, 
 64 |         "_END_POS": 2, 
 65 |         "JJ": 3, 
 66 |         "WP": 36, 
 67 |         "VBZ": 30, 
 68 |         "DT": 7, 
 69 |         "RP": 39, 
 70 |         "$": 27, 
 71 |         "NN": 9, 
 72 |         "FW": 45, 
 73 |         ",": 5, 
 74 |         ".": 17, 
 75 |         "TO": 20, 
 76 |         "UH": 47, 
 77 |         "PRP": 14, 
 78 |         "RB": 22, 
 79 |         "-LRB-": 33, 
 80 |         ":": 13, 
 81 |         "NNS": 26, 
 82 |         "HYPH": 12, 
 83 |         "VB": 19, 
 84 |         "WRB": 37, 
 85 |         "CC": 21, 
 86 |         "LS": 42, 
 87 |         "PDT": 44, 
 88 |         "RBS": 43, 
 89 |         "RBR": 32, 
 90 |         "CD": 11, 
 91 |         "AFX": 49, 
 92 |         "EX": 40, 
 93 |         "IN": 6, 
 94 |         "WP$": 46, 
 95 |         "MD": 18, 
 96 |         "NNPS": 24, 
 97 |         "-RRB-": 34, 
 98 |         "POS": 29, 
 99 |         "JJS": 41, 
100 |         "JJR": 31, 
101 |         "SYM": 38, 
102 |         "_PAD_POS": 0, 
103 |         "WDT": 35, 
104 |         "NNP": 4
105 |     }
106 | }


--------------------------------------------------------------------------------
/models/biaffine/alphabets/type.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "instances": [
  3 |         "_<PAD>", 
  4 |         "_<ROOT>", 
  5 |         "_<END>", 
  6 |         "amod", 
  7 |         "nn", 
  8 |         "root", 
  9 |         "punct", 
 10 |         "prep", 
 11 |         "det", 
 12 |         "pobj", 
 13 |         "tmod", 
 14 |         "num", 
 15 |         "nsubj", 
 16 |         "cop", 
 17 |         "ccomp", 
 18 |         "aux", 
 19 |         "xcomp", 
 20 |         "dobj", 
 21 |         "cc", 
 22 |         "conj", 
 23 |         "dep", 
 24 |         "appos", 
 25 |         "poss", 
 26 |         "npadvmod", 
 27 |         "partmod", 
 28 |         "nsubjpass", 
 29 |         "auxpass", 
 30 |         "possessive", 
 31 |         "advmod", 
 32 |         "pcomp", 
 33 |         "parataxis", 
 34 |         "number", 
 35 |         "mark", 
 36 |         "advcl", 
 37 |         "rcmod", 
 38 |         "acomp", 
 39 |         "prt", 
 40 |         "infmod", 
 41 |         "quantmod", 
 42 |         "expl", 
 43 |         "preconj", 
 44 |         "csubj", 
 45 |         "neg", 
 46 |         "mwe", 
 47 |         "iobj", 
 48 |         "predet", 
 49 |         "discourse", 
 50 |         "csubjpass"
 51 |     ], 
 52 |     "instance2index": {
 53 |         "cc": 18, 
 54 |         "number": 31, 
 55 |         "ccomp": 14, 
 56 |         "possessive": 27, 
 57 |         "prt": 36, 
 58 |         "num": 11, 
 59 |         "nsubjpass": 25, 
 60 |         "csubj": 41, 
 61 |         "conj": 19, 
 62 |         "amod": 3, 
 63 |         "_<PAD>": 0, 
 64 |         "nn": 4, 
 65 |         "neg": 42, 
 66 |         "discourse": 46, 
 67 |         "mark": 32, 
 68 |         "auxpass": 26, 
 69 |         "infmod": 37, 
 70 |         "_<ROOT>": 1, 
 71 |         "advcl": 33, 
 72 |         "aux": 15, 
 73 |         "prep": 7, 
 74 |         "parataxis": 30, 
 75 |         "mwe": 43, 
 76 |         "nsubj": 12, 
 77 |         "rcmod": 34, 
 78 |         "advmod": 28, 
 79 |         "punct": 6, 
 80 |         "quantmod": 38, 
 81 |         "tmod": 10, 
 82 |         "acomp": 35, 
 83 |         "pcomp": 29, 
 84 |         "csubjpass": 47, 
 85 |         "poss": 22, 
 86 |         "npadvmod": 23, 
 87 |         "xcomp": 16, 
 88 |         "cop": 13, 
 89 |         "partmod": 24, 
 90 |         "_<END>": 2, 
 91 |         "appos": 21, 
 92 |         "det": 8, 
 93 |         "dobj": 17, 
 94 |         "dep": 20, 
 95 |         "pobj": 9, 
 96 |         "iobj": 44, 
 97 |         "expl": 39, 
 98 |         "predet": 45, 
 99 |         "preconj": 40, 
100 |         "root": 5
101 |     }
102 | }


--------------------------------------------------------------------------------
/models/biaffine/network.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/models/biaffine/network.pt


--------------------------------------------------------------------------------
/models/biaffine/network.pt.arg.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "args": [
 3 |         100, 
 4 |         35765, 
 5 |         100, 
 6 |         85, 
 7 |         100, 
 8 |         50, 
 9 |         100, 
10 |         3, 
11 |         "FastLSTM", 
12 |         400, 
13 |         3, 
14 |         48, 
15 |         500, 
16 |         100
17 |     ], 
18 |     "kwargs": {
19 |         "p_in": 0.33, 
20 |         "p_out": 0.33, 
21 |         "biaffine": true, 
22 |         "pos": true, 
23 |         "char": true, 
24 |         "p_rnn": [
25 |             0.33, 
26 |             0.33
27 |         ]
28 |     }
29 | }


--------------------------------------------------------------------------------
/models/rst/alphabets/action_label_alpha.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "id2alpha": {
 3 |         "0": "purp", 
 4 |         "1": "cont", 
 5 |         "2": "attr", 
 6 |         "3": "evid", 
 7 |         "4": "comp", 
 8 |         "5": "list", 
 9 |         "6": "back", 
10 |         "7": "same", 
11 |         "8": "topic", 
12 |         "9": "mann", 
13 |         "10": "summ", 
14 |         "11": "cond", 
15 |         "12": "temp", 
16 |         "13": "eval", 
17 |         "14": "text", 
18 |         "15": "cause", 
19 |         "16": "prob", 
20 |         "17": "elab", 
21 |         "18": "PAD"
22 |     }, 
23 |     "alpha2id": {
24 |         "purp": 0, 
25 |         "cont": 1, 
26 |         "attr": 2, 
27 |         "evid": 3, 
28 |         "comp": 4, 
29 |         "elab": 17, 
30 |         "list": 5, 
31 |         "back": 6, 
32 |         "same": 7, 
33 |         "topic": 8, 
34 |         "summ": 10, 
35 |         "cond": 11, 
36 |         "temp": 12, 
37 |         "eval": 13, 
38 |         "text": 14, 
39 |         "PAD": 18, 
40 |         "cause": 15, 
41 |         "prob": 16, 
42 |         "mann": 9
43 |     }, 
44 |     "alphas": [
45 |         "purp", 
46 |         "cont", 
47 |         "attr", 
48 |         "evid", 
49 |         "comp", 
50 |         "list", 
51 |         "back", 
52 |         "same", 
53 |         "topic", 
54 |         "mann", 
55 |         "summ", 
56 |         "cond", 
57 |         "temp", 
58 |         "eval", 
59 |         "text", 
60 |         "cause", 
61 |         "prob", 
62 |         "elab", 
63 |         "PAD"
64 |     ]
65 | }


--------------------------------------------------------------------------------
/models/rst/alphabets/etype_alpha.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "id2alpha": {
 3 |         "0": "UNK", 
 4 |         "1": "<S>", 
 5 |         "2": "<P>"
 6 |     }, 
 7 |     "alpha2id": {
 8 |         "<S>": 1, 
 9 |         "<P>": 2, 
10 |         "UNK": 0
11 |     }, 
12 |     "alphas": [
13 |         "<S>", 
14 |         "<P>", 
15 |         "UNK"
16 |     ]
17 | }


--------------------------------------------------------------------------------
/models/rst/alphabets/gold_action_alpha.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "id2alpha": {
  3 |         "0": "REDUCE_NN_temp", 
  4 |         "1": "REDUCE_NS_purp", 
  5 |         "2": "REDUCE_NN_text", 
  6 |         "3": "REDUCE_SN_elab", 
  7 |         "4": "REDUCE_NS_summ", 
  8 |         "5": "REDUCE_NS_attr", 
  9 |         "6": "REDUCE_NN_eval", 
 10 |         "7": "REDUCE_NS_evid", 
 11 |         "8": "REDUCE_SN_comp", 
 12 |         "9": "REDUCE_NS_topic", 
 13 |         "10": "REDUCE_SN_purp", 
 14 |         "11": "REDUCE_NS_comp", 
 15 |         "12": "REDUCE_NS_elab", 
 16 |         "13": "REDUCE_SN_eval", 
 17 |         "14": "REDUCE_NN_cause", 
 18 |         "15": "REDUCE_NS_eval", 
 19 |         "16": "REDUCE_NS_back", 
 20 |         "17": "REDUCE_SN_temp", 
 21 |         "18": "REDUCE_NS_temp", 
 22 |         "19": "REDUCE_SN_attr", 
 23 |         "20": "REDUCE_SN_summ", 
 24 |         "21": "REDUCE_NS_mann", 
 25 |         "22": "REDUCE_NS_prob", 
 26 |         "23": "REDUCE_NN_evid", 
 27 |         "24": "REDUCE_NN_cont", 
 28 |         "25": "REDUCE_SN_cond", 
 29 |         "26": "REDUCE_SN_prob", 
 30 |         "27": "REDUCE_NS_cause", 
 31 |         "28": "REDUCE_NN_cond", 
 32 |         "29": "REDUCE_SN_cont", 
 33 |         "30": "REDUCE_SN_evid", 
 34 |         "31": "POPROOT__", 
 35 |         "32": "REDUCE_NN_topic", 
 36 |         "33": "REDUCE_NN_same", 
 37 |         "34": "REDUCE_NN_list", 
 38 |         "35": "SHIFT__", 
 39 |         "36": "REDUCE_NS_cont", 
 40 |         "37": "REDUCE_NS_cond", 
 41 |         "38": "REDUCE_SN_mann", 
 42 |         "39": "REDUCE_SN_back", 
 43 |         "40": "REDUCE_NN_prob", 
 44 |         "41": "REDUCE_SN_cause", 
 45 |         "42": "REDUCE_NN_comp", 
 46 |         "43": "PAD"
 47 |     }, 
 48 |     "alpha2id": {
 49 |         "REDUCE_NN_temp": 0, 
 50 |         "REDUCE_NS_purp": 1, 
 51 |         "REDUCE_NN_text": 2, 
 52 |         "REDUCE_SN_summ": 20, 
 53 |         "REDUCE_SN_cont": 29, 
 54 |         "REDUCE_NS_attr": 5, 
 55 |         "REDUCE_NN_eval": 6, 
 56 |         "PAD": 43, 
 57 |         "REDUCE_NS_evid": 7, 
 58 |         "REDUCE_SN_comp": 8, 
 59 |         "REDUCE_SN_purp": 10, 
 60 |         "REDUCE_NS_comp": 11, 
 61 |         "REDUCE_NS_elab": 12, 
 62 |         "REDUCE_SN_eval": 13, 
 63 |         "REDUCE_NN_cause": 14, 
 64 |         "REDUCE_NS_eval": 15, 
 65 |         "REDUCE_NS_back": 16, 
 66 |         "REDUCE_SN_temp": 17, 
 67 |         "REDUCE_NS_temp": 18, 
 68 |         "REDUCE_SN_attr": 19, 
 69 |         "REDUCE_SN_elab": 3, 
 70 |         "REDUCE_NS_mann": 21, 
 71 |         "REDUCE_NS_prob": 22, 
 72 |         "REDUCE_NN_same": 33, 
 73 |         "REDUCE_NN_cont": 24, 
 74 |         "REDUCE_SN_cond": 25, 
 75 |         "REDUCE_SN_prob": 26, 
 76 |         "REDUCE_NS_cause": 27, 
 77 |         "REDUCE_NN_cond": 28, 
 78 |         "REDUCE_NS_summ": 4, 
 79 |         "REDUCE_SN_evid": 30, 
 80 |         "POPROOT__": 31, 
 81 |         "REDUCE_NN_topic": 32, 
 82 |         "REDUCE_NN_evid": 23, 
 83 |         "REDUCE_NN_list": 34, 
 84 |         "SHIFT__": 35, 
 85 |         "REDUCE_NS_cont": 36, 
 86 |         "REDUCE_NS_topic": 9, 
 87 |         "REDUCE_SN_mann": 38, 
 88 |         "REDUCE_SN_cause": 41, 
 89 |         "REDUCE_SN_back": 39, 
 90 |         "REDUCE_NN_prob": 40, 
 91 |         "REDUCE_NS_cond": 37, 
 92 |         "REDUCE_NN_comp": 42
 93 |     }, 
 94 |     "alphas": [
 95 |         "REDUCE_NN_temp", 
 96 |         "REDUCE_NS_purp", 
 97 |         "REDUCE_NN_text", 
 98 |         "REDUCE_SN_elab", 
 99 |         "REDUCE_NS_summ", 
100 |         "REDUCE_NS_attr", 
101 |         "REDUCE_NN_eval", 
102 |         "REDUCE_NS_evid", 
103 |         "REDUCE_SN_comp", 
104 |         "REDUCE_NS_topic", 
105 |         "REDUCE_SN_purp", 
106 |         "REDUCE_NS_comp", 
107 |         "REDUCE_NS_elab", 
108 |         "REDUCE_SN_eval", 
109 |         "REDUCE_NN_cause", 
110 |         "REDUCE_NS_eval", 
111 |         "REDUCE_NS_back", 
112 |         "REDUCE_SN_temp", 
113 |         "REDUCE_NS_temp", 
114 |         "REDUCE_SN_attr", 
115 |         "REDUCE_SN_summ", 
116 |         "REDUCE_NS_mann", 
117 |         "REDUCE_NS_prob", 
118 |         "REDUCE_NN_evid", 
119 |         "REDUCE_NN_cont", 
120 |         "REDUCE_SN_cond", 
121 |         "REDUCE_SN_prob", 
122 |         "REDUCE_NS_cause", 
123 |         "REDUCE_NN_cond", 
124 |         "REDUCE_SN_cont", 
125 |         "REDUCE_SN_evid", 
126 |         "POPROOT__", 
127 |         "REDUCE_NN_topic", 
128 |         "REDUCE_NN_same", 
129 |         "REDUCE_NN_list", 
130 |         "SHIFT__", 
131 |         "REDUCE_NS_cont", 
132 |         "REDUCE_NS_cond", 
133 |         "REDUCE_SN_mann", 
134 |         "REDUCE_SN_back", 
135 |         "REDUCE_NN_prob", 
136 |         "REDUCE_SN_cause", 
137 |         "REDUCE_NN_comp", 
138 |         "PAD"
139 |     ]
140 | }


--------------------------------------------------------------------------------
/models/rst/alphabets/tag_alpha.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "id2alpha": {
  3 |         "0": "UNK", 
  4 |         "1": "", 
  5 |         "2": "PRP$", 
  6 |         "3": "VBG", 
  7 |         "4": "VBD", 
  8 |         "5": "``", 
  9 |         "6": "VBN", 
 10 |         "7": "POS", 
 11 |         "8": "''", 
 12 |         "9": "VBP", 
 13 |         "10": "WDT", 
 14 |         "11": "JJ", 
 15 |         "12": "WP", 
 16 |         "13": "VBZ", 
 17 |         "14": "DT", 
 18 |         "15": "#", 
 19 |         "16": "RP", 
 20 |         "17": "$", 
 21 |         "18": "NN", 
 22 |         "19": "FW", 
 23 |         "20": ",", 
 24 |         "21": ".", 
 25 |         "22": "TO", 
 26 |         "23": "PRP", 
 27 |         "24": "RB", 
 28 |         "25": "-LRB-", 
 29 |         "26": ":", 
 30 |         "27": "NNS", 
 31 |         "28": "NNP", 
 32 |         "29": "VB", 
 33 |         "30": "WRB", 
 34 |         "31": "CC", 
 35 |         "32": "LS", 
 36 |         "33": "PDT", 
 37 |         "34": "RBS", 
 38 |         "35": "RBR", 
 39 |         "36": "CD", 
 40 |         "37": "EX", 
 41 |         "38": "IN", 
 42 |         "39": "WP$", 
 43 |         "40": "MD", 
 44 |         "41": "NNPS", 
 45 |         "42": "-RRB-", 
 46 |         "43": "JJS", 
 47 |         "44": "JJR", 
 48 |         "45": "SYM", 
 49 |         "46": "UH"
 50 |     }, 
 51 |     "alpha2id": {
 52 |         "": 1, 
 53 |         "PRP$": 2, 
 54 |         "VBG": 3, 
 55 |         "VBD": 4, 
 56 |         "VBN": 6, 
 57 |         ",": 20, 
 58 |         "''": 8, 
 59 |         "VBP": 9, 
 60 |         "WDT": 10, 
 61 |         "JJ": 11, 
 62 |         "WP": 12, 
 63 |         "VBZ": 13, 
 64 |         "DT": 14, 
 65 |         "#": 15, 
 66 |         "RP": 16, 
 67 |         "$": 17, 
 68 |         "NN": 18, 
 69 |         "FW": 19, 
 70 |         "POS": 7, 
 71 |         ".": 21, 
 72 |         "TO": 22, 
 73 |         "PRP": 23, 
 74 |         "RB": 24, 
 75 |         "-LRB-": 25, 
 76 |         ":": 26, 
 77 |         "NNS": 27, 
 78 |         "NNP": 28, 
 79 |         "``": 5, 
 80 |         "WRB": 30, 
 81 |         "CC": 31, 
 82 |         "LS": 32, 
 83 |         "PDT": 33, 
 84 |         "RBS": 34, 
 85 |         "RBR": 35, 
 86 |         "CD": 36, 
 87 |         "EX": 37, 
 88 |         "IN": 38, 
 89 |         "WP$": 39, 
 90 |         "UNK": 0, 
 91 |         "MD": 40, 
 92 |         "NNPS": 41, 
 93 |         "-RRB-": 42, 
 94 |         "JJS": 43, 
 95 |         "JJR": 44, 
 96 |         "SYM": 45, 
 97 |         "VB": 29, 
 98 |         "UH": 46
 99 |     }, 
100 |     "alphas": [
101 |         "", 
102 |         "PRP$", 
103 |         "VBG", 
104 |         "VBD", 
105 |         "``", 
106 |         "VBN", 
107 |         "POS", 
108 |         "''", 
109 |         "VBP", 
110 |         "WDT", 
111 |         "JJ", 
112 |         "WP", 
113 |         "VBZ", 
114 |         "DT", 
115 |         "#", 
116 |         "RP", 
117 |         "$", 
118 |         "NN", 
119 |         "FW", 
120 |         ",", 
121 |         ".", 
122 |         "TO", 
123 |         "PRP", 
124 |         "RB", 
125 |         "-LRB-", 
126 |         ":", 
127 |         "NNS", 
128 |         "NNP", 
129 |         "VB", 
130 |         "WRB", 
131 |         "CC", 
132 |         "LS", 
133 |         "PDT", 
134 |         "RBS", 
135 |         "RBR", 
136 |         "CD", 
137 |         "EX", 
138 |         "IN", 
139 |         "WP$", 
140 |         "MD", 
141 |         "NNPS", 
142 |         "-RRB-", 
143 |         "JJS", 
144 |         "JJR", 
145 |         "SYM", 
146 |         "UH", 
147 |         "UNK"
148 |     ]
149 | }


--------------------------------------------------------------------------------
/models/rst/config.cfg:
--------------------------------------------------------------------------------
 1 | use_gpu =  True
 2 | use_dynamic_oracle = True
 3 | flag_oracle = True
 4 | word_embedding = glove
 5 | word_embedding_file = /home/ffajri/Data/NeuralRST/glove.6B.200d.txt.gz
 6 | train_path = /home/ffajri/Data/NeuralRST/rst.train312
 7 | test_path = /home/ffajri/Data/NeuralRST/rst.test38
 8 | dev_path = /home/ffajri/Data/NeuralRST/rst.dev35
 9 | train_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/train.conll.dump.results
10 | test_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/test.conll.dump.results
11 | dev_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/dev.conll.dump.results
12 | model_path = /home/ffajri/Workspace/RSTExtractor/models/rst/
13 | model_name = /home/ffajri/Workspace/RSTExtractor/models/rst/network.pt
14 | alphabet_path = /home/ffajri/Workspace/RSTExtractor/models/rst/alphabets/
15 | max_iter = 1000
16 | word_dim = 200
17 | tag_dim = 200
18 | etype_dim = 100
19 | syntax_dim = 1200
20 | max_sent_size = 40
21 | max_edu_size = 400
22 | max_state_size = 1024
23 | hidden_size = 200
24 | freeze = True
25 | drop_prob = 0.5
26 | num_layers = 1
27 | batch_size = 4
28 | opt = adam
29 | lr = 0.001
30 | ada_eps = 1e-08
31 | momentum = 0.9
32 | beta1 = 0.9
33 | beta2 = 0.999
34 | gamma = 2e-06
35 | start_decay = 0
36 | clip = 10.0
37 | decay = 0
38 | oracle_prob = 0.66666
39 | start_dynamic_oracle = 15
40 | early_stopping = 50
41 | 


--------------------------------------------------------------------------------
/models/rst/network.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/models/rst/network.pt


--------------------------------------------------------------------------------
/neuronlp2/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from . import io
4 | from . import nn
5 | from . import utils
6 | from . import nlinalg
7 | from . import models
8 | 
9 | __version__ = "0.1.dev1"


--------------------------------------------------------------------------------
/neuronlp2/biaffine_model.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import math
 3 | THREADS=7
 4 | files = glob.glob(DATA_PATH)
 5 | DATA_PATH = '/home/ffajri/Data/segmenter/*'
 6 | size = int(math.ceil(1.0*len(files)/THREADS))
 7 | 
 8 | allfiles = []
 9 | for i in range(THREADS):
10 |     start = i * size
11 |     end = start + size
12 |     if end > len(files):
13 |         end = len(files)
14 |     p = files[start:end]
15 |     allfiles.append(p)
16 |     if end == len(files):
17 |         break
18 | 
19 | for idx in range(len(allfiles)):
20 |     f = open(str(idx)+'.list', 'w')
21 |     for l in allfiles[idx]:
22 |         f.write(l+'\n')
23 |     f.close()
24 | 


--------------------------------------------------------------------------------
/neuronlp2/io/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from .alphabet import *
4 | from .instance import *
5 | from .logger import *
6 | from .writer import *
7 | from . import conllx_data
8 | from . import conllx_stacked_data
9 | from . import conll03_data


--------------------------------------------------------------------------------
/neuronlp2/io/alphabet.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | """
  4 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects.
  5 | """
  6 | import json
  7 | import os
  8 | from .logger import get_logger
  9 | 
 10 | class Alphabet(object):
 11 |     def __init__(self, name, defualt_value=False, keep_growing=True, singleton=False):
 12 |         self.__name = name
 13 | 
 14 |         self.instance2index = {}
 15 |         self.instances = []
 16 |         self.default_value = defualt_value
 17 |         self.offset = 1 if self.default_value else 0
 18 |         self.keep_growing = keep_growing
 19 |         self.singletons = set() if singleton else None
 20 | 
 21 |         # Index 0 is occupied by default, all else following.
 22 |         self.default_index = 0 if self.default_value else None
 23 | 
 24 |         self.next_index = self.offset
 25 | 
 26 |         self.logger = get_logger('Alphabet')
 27 | 
 28 |     def add(self, instance):
 29 |         if instance not in self.instance2index:
 30 |             self.instances.append(instance)
 31 |             self.instance2index[instance] = self.next_index
 32 |             self.next_index += 1
 33 | 
 34 |     def add_singleton(self, id):
 35 |         if self.singletons is None:
 36 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 37 |         else:
 38 |             self.singletons.add(id)
 39 | 
 40 |     def add_singletons(self, ids):
 41 |         if self.singletons is None:
 42 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 43 |         else:
 44 |             self.singletons.update(ids)
 45 | 
 46 |     def is_singleton(self, id):
 47 |         if self.singletons is None:
 48 |             raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
 49 |         else:
 50 |             return id in self.singletons
 51 | 
 52 |     def get_index(self, instance):
 53 |         try:
 54 |             return self.instance2index[instance]
 55 |         except KeyError:
 56 |             if self.keep_growing:
 57 |                 index = self.next_index
 58 |                 self.add(instance)
 59 |                 return index
 60 |             else:
 61 |                 if self.default_value:
 62 |                     return self.default_index
 63 |                 else:
 64 |                     raise KeyError("instance not found: %s" % instance)
 65 | 
 66 |     def get_instance(self, index):
 67 |         if self.default_value and index == self.default_index:
 68 |             # First index is occupied by the wildcard element.
 69 |             return '<_UNK>'
 70 |         else:
 71 |             try:
 72 |                 return self.instances[index - self.offset]
 73 |             except IndexError:
 74 |                 raise IndexError('unknown index: %d' % index)
 75 | 
 76 |     def size(self):
 77 |         return len(self.instances) + self.offset
 78 | 
 79 |     def singleton_size(self):
 80 |         return len(self.singletons)
 81 | 
 82 |     def items(self):
 83 |         return self.instance2index.items()
 84 | 
 85 |     def enumerate_items(self, start):
 86 |         if start < self.offset or start >= self.size():
 87 |             raise IndexError("Enumerate is allowed between [%d : size of the alphabet)" % self.offset)
 88 |         return zip(range(start, len(self.instances) + self.offset), self.instances[start - self.offset:])
 89 | 
 90 |     def close(self):
 91 |         self.keep_growing = False
 92 | 
 93 |     def open(self):
 94 |         self.keep_growing = True
 95 | 
 96 |     def get_content(self):
 97 |         if self.singletons is None:
 98 |             return {'instance2index': self.instance2index, 'instances': self.instances}
 99 |         else:
100 |             return {'instance2index': self.instance2index, 'instances': self.instances,
101 |                     'singletions': list(self.singletons)}
102 | 
103 |     def __from_json(self, data):
104 |         self.instances = data["instances"]
105 |         self.instance2index = data["instance2index"]
106 |         if 'singletions' in data:
107 |             self.singletons = set(data['singletions'])
108 |         else:
109 |             self.singletons = None
110 | 
111 |     def save(self, output_directory, name=None):
112 |         """
113 |         Save both alhpabet records to the given directory.
114 |         :param output_directory: Directory to save model and weights.
115 |         :param name: The alphabet saving name, optional.
116 |         :return:
117 |         """
118 |         saving_name = name if name else self.__name
119 |         try:
120 |             if not os.path.exists(output_directory):
121 |                 os.makedirs(output_directory)
122 | 
123 |             json.dump(self.get_content(),
124 |                       open(os.path.join(output_directory, saving_name + ".json"), 'w'), indent=4)
125 |         except Exception as e:
126 |             self.logger.warn("Alphabet is not saved: %s" % repr(e))
127 | 
128 |     def load(self, input_directory, name=None):
129 |         """
130 |         Load model architecture and weights from the give directory. This allow we use old models even the structure
131 |         changes.
132 |         :param input_directory: Directory to save model and weights
133 |         :return:
134 |         """
135 |         loading_name = name if name else self.__name
136 |         self.__from_json(json.load(open(os.path.join(input_directory, loading_name + ".json"))))
137 |         self.next_index = len(self.instances) + self.offset
138 |         self.keep_growing = False
139 | 


--------------------------------------------------------------------------------
/neuronlp2/io/instance.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | 
 4 | class Sentence(object):
 5 |     def __init__(self, words, word_ids, char_seqs, char_id_seqs):
 6 |         self.words = words
 7 |         self.word_ids = word_ids
 8 |         self.char_seqs = char_seqs
 9 |         self.char_id_seqs = char_id_seqs
10 | 
11 |     def length(self):
12 |         return len(self.words)
13 | 
14 | 
class DependencyInstance(object):
    """One sentence with POS tags and, optionally, dependency heads and types."""

    def __init__(self, sentence, postags, pos_ids, heads=None, types=None, type_ids=None):
        self.sentence = sentence
        # POS tags and their ids.
        self.postags, self.pos_ids = postags, pos_ids
        # Dependency annotation; each stays None when the caller omits it.
        self.heads, self.types, self.type_ids = heads, types, type_ids

    def length(self):
        """Return the length reported by the wrapped sentence."""
        wrapped = self.sentence
        return wrapped.length()
26 | 
27 | 
class NERInstance(object):
    """One sentence with its POS, chunk and NER tags plus the matching ids."""

    def __init__(self, sentence, postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids):
        self.sentence = sentence
        # Each annotation layer is stored as a (tags, ids) pair.
        self.postags, self.pos_ids = postags, pos_ids
        self.chunk_tags, self.chunk_ids = chunk_tags, chunk_ids
        self.ner_tags, self.ner_ids = ner_tags, ner_ids

    def length(self):
        """Return the length reported by the wrapped sentence."""
        wrapped = self.sentence
        return wrapped.length()
40 | 


--------------------------------------------------------------------------------
/neuronlp2/io/logger.py:
--------------------------------------------------------------------------------
 1 | _author__ = 'max'
 2 | 
 3 | import logging
 4 | import sys
 5 | 
 6 | 
 7 | def get_logger(name, level=logging.INFO, handler=sys.stdout,
 8 |                formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'):
 9 |     logger = logging.getLogger(name)
10 |     logger.setLevel(logging.INFO)
11 |     formatter = logging.Formatter(formatter)
12 |     stream_handler = logging.StreamHandler(handler)
13 |     stream_handler.setLevel(level)
14 |     stream_handler.setFormatter(formatter)
15 |     logger.addHandler(stream_handler)
16 | 
17 |     return logger
18 | 


--------------------------------------------------------------------------------
/neuronlp2/io/reader.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | from .instance import DependencyInstance, NERInstance
  4 | from .instance import Sentence
  5 | from .conllx_data import ROOT, ROOT_POS, ROOT_CHAR, ROOT_TYPE, END, END_POS, END_CHAR, END_TYPE
  6 | from . import utils
  7 | 
  8 | 
  9 | class CoNLLXReader(object):
 10 |     def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
 11 |         self.__source_file = open(file_path, 'r')
 12 |         self.__word_alphabet = word_alphabet
 13 |         self.__char_alphabet = char_alphabet
 14 |         self.__pos_alphabet = pos_alphabet
 15 |         self.__type_alphabet = type_alphabet
 16 | 
 17 |     def close(self):
 18 |         self.__source_file.close()
 19 | 
 20 |     def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
 21 |         line = self.__source_file.readline()
 22 |         # skip multiple blank lines.
 23 |         while len(line) > 0 and len(line.strip()) == 0:
 24 |             line = self.__source_file.readline()
 25 |         if len(line) == 0:
 26 |             return None
 27 | 
 28 |         lines = []
 29 |         while len(line.strip()) > 0:
 30 |             line = line.strip()
 31 |             line = line.decode('utf-8')
 32 |             lines.append(line.split('\t'))
 33 |             line = self.__source_file.readline()
 34 | 
 35 |         length = len(lines)
 36 |         if length == 0:
 37 |             return None
 38 | 
 39 |         words = []
 40 |         word_ids = []
 41 |         char_seqs = []
 42 |         char_id_seqs = []
 43 |         postags = []
 44 |         pos_ids = []
 45 |         types = []
 46 |         type_ids = []
 47 |         heads = []
 48 | 
 49 |         if symbolic_root:
 50 |             words.append(ROOT)
 51 |             word_ids.append(self.__word_alphabet.get_index(ROOT))
 52 |             char_seqs.append([ROOT_CHAR, ])
 53 |             char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
 54 |             postags.append(ROOT_POS)
 55 |             pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
 56 |             types.append(ROOT_TYPE)
 57 |             type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
 58 |             heads.append(0)
 59 | 
 60 |         for tokens in lines:
 61 |             chars = []
 62 |             char_ids = []
 63 |             for char in tokens[1]:
 64 |                 chars.append(char)
 65 |                 char_ids.append(self.__char_alphabet.get_index(char))
 66 |             if len(chars) > utils.MAX_CHAR_LENGTH:
 67 |                 chars = chars[:utils.MAX_CHAR_LENGTH]
 68 |                 char_ids = char_ids[:utils.MAX_CHAR_LENGTH]
 69 |             char_seqs.append(chars)
 70 |             char_id_seqs.append(char_ids)
 71 | 
 72 |             word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
 73 |             pos = tokens[4]
 74 |             head = int(tokens[6])
 75 |             type = tokens[7]
 76 | 
 77 |             words.append(word)
 78 |             word_ids.append(self.__word_alphabet.get_index(word))
 79 | 
 80 |             postags.append(pos)
 81 |             pos_ids.append(self.__pos_alphabet.get_index(pos))
 82 | 
 83 |             types.append(type)
 84 |             type_ids.append(self.__type_alphabet.get_index(type))
 85 | 
 86 |             heads.append(head)
 87 | 
 88 |         if symbolic_end:
 89 |             words.append(END)
 90 |             word_ids.append(self.__word_alphabet.get_index(END))
 91 |             char_seqs.append([END_CHAR, ])
 92 |             char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
 93 |             postags.append(END_POS)
 94 |             pos_ids.append(self.__pos_alphabet.get_index(END_POS))
 95 |             types.append(END_TYPE)
 96 |             type_ids.append(self.__type_alphabet.get_index(END_TYPE))
 97 |             heads.append(0)
 98 | 
 99 |         return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, heads, types, type_ids)
100 | 
101 |     def getNextForTest(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
102 |         line = self.__source_file.readline()
103 |         # skip multiple blank lines.
104 |         while len(line) > 0 and len(line.strip()) == 0:
105 |             line = self.__source_file.readline()
106 |         if len(line) == 0:
107 |             return None
108 | 
109 |         lines = []
110 |         while len(line.strip()) > 0:
111 |             line = line.strip()
112 |             line = line.decode('utf-8')
113 |             lines.append(line.split('\t'))
114 |             line = self.__source_file.readline()
115 | 
116 |         length = len(lines)
117 |         if length == 0:
118 |             return None
119 | 
120 |         words = []
121 |         word_ids = []
122 |         char_seqs = []
123 |         char_id_seqs = []
124 |         postags = []
125 |         pos_ids = []
126 | 
127 |         if symbolic_root:
128 |             words.append(ROOT)
129 |             word_ids.append(self.__word_alphabet.get_index(ROOT))
130 |             char_seqs.append([ROOT_CHAR, ])
131 |             char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
132 |             postags.append(ROOT_POS)
133 |             pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
134 | 
135 |         for tokens in lines:
136 |             chars = []
137 |             char_ids = []
138 |             for char in tokens[1]:
139 |                 chars.append(char)
140 |                 char_ids.append(self.__char_alphabet.get_index(char))
141 |             if len(chars) > utils.MAX_CHAR_LENGTH:
142 |                 chars = chars[:utils.MAX_CHAR_LENGTH]
143 |                 char_ids = char_ids[:utils.MAX_CHAR_LENGTH]
144 |             char_seqs.append(chars)
145 |             char_id_seqs.append(char_ids)
146 | 
147 |             word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
148 |             pos = tokens[4]
149 |             if pos == '_':
150 |                 pos = tokens[3]
151 |             if pos == '#':
152 |                 pos = '$'
153 |             words.append(word)
154 |             word_ids.append(self.__word_alphabet.get_index(word))
155 | 
156 |             postags.append(pos)
157 |             pos_ids.append(self.__pos_alphabet.get_index(pos))
158 | 
159 | 
160 |         if symbolic_end:
161 |             words.append(END)
162 |             word_ids.append(self.__word_alphabet.get_index(END))
163 |             char_seqs.append([END_CHAR, ])
164 |             char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
165 |             postags.append(END_POS)
166 |             pos_ids.append(self.__pos_alphabet.get_index(END_POS))
167 | 
168 |         return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids)
169 | 
class CoNLL03Reader(object):
    """Read blank-line separated, space-delimited sentence blocks one at a time.

    Columns 1-4 of every row are read as word form, POS tag, chunk tag and NER
    tag. Ids are looked up in the alphabets given to the constructor.

    NOTE(review): rows are passed through ``.decode('utf-8')``, which only
    exists on byte strings -- presumably this targets Python 2 ``str`` lines;
    confirm before running on Python 3.
    """

    def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
        # The file stays open until close() is called.
        self.__source_file = open(file_path, 'r')
        self.__word_alphabet = word_alphabet
        self.__char_alphabet = char_alphabet
        self.__pos_alphabet = pos_alphabet
        self.__chunk_alphabet = chunk_alphabet
        self.__ner_alphabet = ner_alphabet

    def close(self):
        """Close the underlying file."""
        self.__source_file.close()

    def getNext(self, normalize_digits=True):
        """Read the next sentence with its POS, chunk and NER annotation.

        :param normalize_digits: Replace matches of utils.DIGIT_RE in the word with "0".
        :return: An NERInstance, or None when no further block exists.
        """
        source = self.__source_file
        raw = source.readline()
        # skip multiple blank lines.
        while raw and not raw.strip():
            raw = source.readline()
        if not raw:
            return None

        rows = []
        while raw.strip():
            rows.append(raw.strip().decode('utf-8').split(' '))
            raw = source.readline()

        if not rows:
            return None

        words, word_ids = [], []
        char_seqs, char_id_seqs = [], []
        postags, pos_ids = [], []
        chunk_tags, chunk_ids = [], []
        ner_tags, ner_ids = [], []

        limit_owner = utils
        for fields in rows:
            # Every character is looked up before truncation, so characters
            # beyond the limit still reach the character alphabet.
            chars = list(fields[1])
            char_ids = [self.__char_alphabet.get_index(symbol) for symbol in chars]
            limit = limit_owner.MAX_CHAR_LENGTH
            char_seqs.append(chars[:limit])
            char_id_seqs.append(char_ids[:limit])

            if normalize_digits:
                word = utils.DIGIT_RE.sub(b"0", fields[1])
            else:
                word = fields[1]
            pos = fields[2]
            chunk = fields[3]
            ner = fields[4]

            words.append(word)
            word_ids.append(self.__word_alphabet.get_index(word))

            postags.append(pos)
            pos_ids.append(self.__pos_alphabet.get_index(pos))

            chunk_tags.append(chunk)
            chunk_ids.append(self.__chunk_alphabet.get_index(chunk))

            ner_tags.append(ner)
            ner_ids.append(self.__ner_alphabet.get_index(ner))

        sentence = Sentence(words, word_ids, char_seqs, char_id_seqs)
        return NERInstance(sentence, postags, pos_ids, chunk_tags, chunk_ids,
                           ner_tags, ner_ids)
243 | 


--------------------------------------------------------------------------------
/neuronlp2/io/utils.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | import re
# Maximum number of characters kept per token.
MAX_CHAR_LENGTH = 45
# NOTE(review): presumably the amount of padding added around character
# sequences; its use is not visible in this module -- confirm against callers.
NUM_CHAR_PAD = 2

# Bytes pattern matching a single digit; used to normalize digits.
DIGIT_RE = re.compile(b"\\d")
9 | 


--------------------------------------------------------------------------------
/neuronlp2/io/writer.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | 
 4 | class CoNLL03Writer(object):
 5 |     def __init__(self, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
 6 |         self.__source_file = None
 7 |         self.__word_alphabet = word_alphabet
 8 |         self.__char_alphabet = char_alphabet
 9 |         self.__pos_alphabet = pos_alphabet
10 |         self.__chunk_alphabet = chunk_alphabet
11 |         self.__ner_alphabet = ner_alphabet
12 | 
13 |     def start(self, file_path):
14 |         self.__source_file = open(file_path, 'w')
15 | 
16 |     def close(self):
17 |         self.__source_file.close()
18 | 
19 |     def write(self, word, pos, chunk, predictions, targets, lengths):
20 |         batch_size, _ = word.shape
21 |         for i in range(batch_size):
22 |             for j in range(lengths[i]):
23 |                 w = self.__word_alphabet.get_instance(word[i, j]).encode('utf-8')
24 |                 p = self.__pos_alphabet.get_instance(pos[i, j]).encode('utf-8')
25 |                 ch = self.__chunk_alphabet.get_instance(chunk[i, j]).encode('utf-8')
26 |                 tgt = self.__ner_alphabet.get_instance(targets[i, j]).encode('utf-8')
27 |                 pred = self.__ner_alphabet.get_instance(predictions[i, j]).encode('utf-8')
28 |                 self.__source_file.write('%d %s %s %s %s %s\n' % (j + 1, w, p, ch, tgt, pred))
29 |             self.__source_file.write('\n')
30 | 
31 | 
class CoNLLXWriter(object):
    """Write dependency parses in a tab separated CoNLL-X like format.

    Every token becomes one line
    ``index<TAB>word<TAB>_<TAB>_<TAB>pos<TAB>_<TAB>head<TAB>type`` and every
    sentence is followed by an empty line. The alphabets are used to map ids
    back to their string instances via ``get_instance``.
    """

    def __init__(self, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
        self.__source_file = None
        self.__word_alphabet = word_alphabet
        self.__char_alphabet = char_alphabet
        self.__pos_alphabet = pos_alphabet
        self.__type_alphabet = type_alphabet

    @staticmethod
    def _to_text(symbol):
        """Return `symbol` as the native ``str`` type of the running interpreter.

        Formatting bytes with ``%s`` on Python 3 would write their repr
        (``b'...'``) into the output, so bytes are decoded there; on Python 2
        unicode objects are encoded to UTF-8 byte strings as before.
        """
        if isinstance(symbol, str):
            return symbol
        if isinstance(symbol, bytes):
            return symbol.decode('utf-8')
        return symbol.encode('utf-8')

    def start(self, file_path):
        """Open `file_path` for writing, closing any file opened earlier."""
        # Calling start() twice must not leak the first handle.
        self.close()
        self.__source_file = open(file_path, 'w')

    def close(self):
        """Close the output file; a no-op when no file is open."""
        if self.__source_file is not None:
            self.__source_file.close()
            self.__source_file = None

    def write(self, word, pos, head, type, lengths, symbolic_root=False, symbolic_end=False):
        """Write one batch of parsed sentences.

        Args:
            word, pos, head, type: 2D arrays indexed as ``[sentence, position]``;
                `word` must expose ``.shape``. `head` holds integers that are
                written as-is, the others are ids resolved through the alphabets.
            lengths: length of each sentence (including symbolic tokens).
            symbolic_root: skip the first position of every sentence.
            symbolic_end: skip the last position of every sentence.
        """
        to_text = self._to_text
        batch_size, _ = word.shape
        start = 1 if symbolic_root else 0
        end = 1 if symbolic_end else 0
        for i in range(batch_size):
            for j in range(start, lengths[i] - end):
                w = to_text(self.__word_alphabet.get_instance(word[i, j]))
                p = to_text(self.__pos_alphabet.get_instance(pos[i, j]))
                t = to_text(self.__type_alphabet.get_instance(type[i, j]))
                h = head[i, j]
                self.__source_file.write('%d\t%s\t_\t_\t%s\t_\t%d\t%s\n' % (j, w, p, h, t))
            # Blank line separates sentences.
            self.__source_file.write('\n')
58 | 


--------------------------------------------------------------------------------
/neuronlp2/models/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from .sequence_labeling import *
4 | from .parsing import *
5 | 
6 | 


--------------------------------------------------------------------------------
/neuronlp2/nlinalg/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'max'
2 | 
3 | from .nlinalg import *
4 | 


--------------------------------------------------------------------------------
/neuronlp2/nlinalg/nlinalg.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | import numpy
 4 | import torch
 5 | from torch.autograd.function import Function
 6 | 
 7 | 
 8 | def logdet(x):
 9 |     """
10 | 
11 |     Args:
12 |         x: 2D positive semidefinite matrix.
13 | 
14 |     Returns: log determinant of x
15 | 
16 |     """
17 |     # TODO for pytorch 2.0.4, use inside potrf for variable.
18 |     print(torch.log(torch.eig(x.data)[0]))
19 |     print(x)
20 |     u_chol = x.potrf()
21 |     return torch.sum(torch.log(u_chol.diag())) * 2
22 | 
23 | 
def logsumexp(x, dim=None):
    """
    Numerically stabilised log(sum(exp(x))): the maximum is subtracted before
    exponentiating and added back afterwards.

    Args:
        x: A pytorch tensor (any dimension will do)
        dim: int or None, the axis to reduce. `None`, the default, reduces
             over all elements.

    Returns: The result of the log(sum(exp(...))) operation; the reduced
             axis is dropped when `dim` is given.

    """
    if dim is not None:
        # Keep the reduced axis for the subtraction so it broadcasts against x.
        peak_keepdim, _ = x.max(dim, keepdim=True)
        shifted_total = torch.exp(x - peak_keepdim).sum(dim)
        peak, _ = x.max(dim)
        return peak + torch.log(shifted_total)

    peak = x.max()
    return peak + torch.log(torch.exp(x - peak).sum())
43 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from .modules import *
4 | from . import init
5 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from . import masked_rnn
4 | from . import variational_rnn
5 | from . import skipconnect_rnn
6 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/masked_rnn.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | from torch.nn import functional as F
  5 | 
  6 | 
  7 | def MaskedRecurrent(reverse=False):
  8 |     def forward(input, hidden, cell, mask):
  9 |         output = []
 10 |         steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
 11 |         for i in steps:
 12 |             if mask is None or mask[i].data.min() > 0.5:
 13 |                 hidden = cell(input[i], hidden)
 14 |             elif mask[i].data.max() > 0.5:
 15 |                 hidden_next = cell(input[i], hidden)
 16 |                 # hack to handle LSTM
 17 |                 if isinstance(hidden, tuple):
 18 |                     hx, cx = hidden
 19 |                     hp1, cp1 = hidden_next
 20 |                     hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
 21 |                 else:
 22 |                     hidden = hidden + (hidden_next - hidden) * mask[i]
 23 |             # hack to handle LSTM
 24 |             output.append(hidden[0] if isinstance(hidden, tuple) else hidden)
 25 | 
 26 |         if reverse:
 27 |             output.reverse()
 28 |         output = torch.cat(output, 0).view(input.size(0), *output[0].size())
 29 | 
 30 |         return hidden, output
 31 | 
 32 |     return forward
 33 | 
 34 | 
 35 | def StackedRNN(inners, num_layers, lstm=False, dropout=0, train=True):
 36 |     num_directions = len(inners)
 37 |     total_layers = num_layers * num_directions
 38 | 
 39 |     def forward(input, hidden, cells, mask):
 40 |         assert (len(cells) == total_layers)
 41 |         next_hidden = []
 42 | 
 43 |         if lstm:
 44 |             hidden = list(zip(*hidden))
 45 | 
 46 |         for i in range(num_layers):
 47 |             all_output = []
 48 |             for j, inner in enumerate(inners):
 49 |                 l = i * num_directions + j
 50 |                 hy, output = inner(input, hidden[l], cells[l], mask)
 51 |                 next_hidden.append(hy)
 52 |                 all_output.append(output)
 53 | 
 54 |             input = torch.cat(all_output, input.dim() - 1)
 55 | 
 56 |             if dropout != 0 and i < num_layers - 1:
 57 |                 input = F.dropout(input, p=dropout, training=train, inplace=False)
 58 | 
 59 |         if lstm:
 60 |             next_h, next_c = zip(*next_hidden)
 61 |             next_hidden = (
 62 |                 torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
 63 |                 torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
 64 |             )
 65 |         else:
 66 |             next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
 67 | 
 68 |         return next_hidden, input
 69 | 
 70 |     return forward
 71 | 
 72 | 
 73 | def AutogradMaskedRNN(num_layers=1, batch_first=False, dropout=0, train=True, bidirectional=False, lstm=False):
 74 |     rec_factory = MaskedRecurrent
 75 | 
 76 |     if bidirectional:
 77 |         layer = (rec_factory(), rec_factory(reverse=True))
 78 |     else:
 79 |         layer = (rec_factory(),)
 80 | 
 81 |     func = StackedRNN(layer,
 82 |                       num_layers,
 83 |                       lstm=lstm,
 84 |                       dropout=dropout,
 85 |                       train=train)
 86 | 
 87 |     def forward(input, cells, hidden, mask):
 88 |         if batch_first:
 89 |             input = input.transpose(0, 1)
 90 |             if mask is not None:
 91 |                 mask = mask.transpose(0, 1)
 92 | 
 93 |         nexth, output = func(input, hidden, cells, mask)
 94 | 
 95 |         if batch_first:
 96 |             output = output.transpose(0, 1)
 97 | 
 98 |         return output, nexth
 99 | 
100 |     return forward
101 | 
102 | 
def MaskedStep():
    """Build a function that advances a cell by a single, optionally masked, step.

    The returned ``forward(input, hidden, cell, mask)`` yields
    ``(hidden, output)``. With no mask, or with every mask value above 0.5,
    the cell result replaces the state; with only some values above 0.5 the
    state becomes ``old + (new - old) * mask``; otherwise it is unchanged.
    """
    def forward(input, hidden, cell, mask):
        fully_active = mask is None or mask.data.min() > 0.5
        if fully_active:
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            proposal = cell(input, hidden)
            if isinstance(hidden, tuple):
                # LSTM: interpolate h and c separately.
                old_h, old_c = hidden
                new_h, new_c = proposal
                hidden = (old_h + (new_h - old_h) * mask, old_c + (new_c - old_c) * mask)
            else:
                hidden = hidden + (proposal - hidden) * mask

        # For LSTM the output is h, the first member of the state tuple.
        if isinstance(hidden, tuple):
            return hidden, hidden[0]
        return hidden, hidden

    return forward
122 | 
123 | 
def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
    """Chain `layer` through `num_layers` cells for one time step.

    The returned ``forward(input, hidden, cells, mask)`` feeds the output of
    layer ``l`` into layer ``l + 1`` (with dropout in between, never after
    the last layer) and returns ``(next_hidden, output_of_last_layer)``.
    With `lstm`, `hidden` and the returned state are ``(h, c)`` pairs.
    """
    def pack(states):
        # Join the per-layer states and restore a leading layer axis.
        return torch.cat(states, 0).view(num_layers, *states[0].size())

    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        collected = []
        top = num_layers - 1
        for depth in range(num_layers):
            state, input = layer(input, hidden[depth], cells[depth], mask)
            collected.append(state)

            if dropout != 0 and depth < top:
                input = F.dropout(input, p=dropout, training=train, inplace=False)

        if lstm:
            all_h, all_c = zip(*collected)
            return (pack(all_h), pack(all_c)), input

        return pack(collected), input

    return forward
152 | 
153 | 
def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False):
    """Assemble a multi-layer, single time step masked RNN function.

    Returns ``forward(input, cells, hidden, mask)`` which yields
    ``(output, next_hidden)``.
    """
    stacked = StackedStep(MaskedStep(), num_layers, lstm=lstm, dropout=dropout, train=train)

    def forward(input, cells, hidden, mask):
        # The stacked function uses a (hidden, cells) argument order and
        # returns the state first; adapt both to this function's interface.
        state, result = stacked(input, hidden, cells, mask)
        return result, state

    return forward
168 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/skipconnect_rnn.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | from torch.autograd import Variable
  5 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
  6 | from torch.nn import functional as F
  7 | 
  8 | 
  9 | def SkipConnectRNNReLUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None, noise_skip=None):
 10 |     if noise_in is not None:
 11 |         input = input * noise_in
 12 | 
 13 |     hidden = torch.cat([hidden, hidden_skip], dim=1)
 14 |     if noise_hidden is not None:
 15 |         hidden = hidden * noise_hidden
 16 | 
 17 |     hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 18 |     return hy
 19 | 
 20 | 
 21 | def SkipConnectRNNTanhCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 22 |     if noise_in is not None:
 23 |         input = input * noise_in
 24 | 
 25 |     hidden = torch.cat([hidden, hidden_skip], dim=1)
 26 |     if noise_hidden is not None:
 27 |         hidden = hidden * noise_hidden
 28 | 
 29 |     hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 30 |     return hy
 31 | 
 32 | 
 33 | def SkipConnectLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 34 |     input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 35 | 
 36 |     hx, cx = hidden
 37 |     hx = torch.cat([hx, hidden_skip], dim=1)
 38 |     hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 39 | 
 40 |     gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 41 | 
 42 |     ingate, forgetgate, cellgate, outgate = gates
 43 | 
 44 |     ingate = F.sigmoid(ingate)
 45 |     forgetgate = F.sigmoid(forgetgate)
 46 |     cellgate = F.tanh(cellgate)
 47 |     outgate = F.sigmoid(outgate)
 48 | 
 49 |     cy = (forgetgate * cx) + (ingate * cellgate)
 50 |     hy = outgate * F.tanh(cy)
 51 | 
 52 |     return hy, cy
 53 | 
 54 | 
 55 | def SkipConnectFastLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 56 |     if noise_in is not None:
 57 |         input = input * noise_in
 58 | 
 59 |     hx, cx = hidden
 60 |     hx = torch.cat([hx, hidden_skip], dim=1)
 61 |     if noise_hidden is not None:
 62 |         hx = hx * noise_hidden
 63 | 
 64 |     if input.is_cuda:
 65 |         igates = F.linear(input, w_ih)
 66 |         hgates = F.linear(hx, w_hh)
 67 |         state = fusedBackend.LSTMFused.apply
 68 |         return state(igates, hgates, cx) if b_ih is None else state(igates, hgates, cx, b_ih, b_hh)
 69 | 
 70 |     gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
 71 | 
 72 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
 73 | 
 74 |     ingate = F.sigmoid(ingate)
 75 |     forgetgate = F.sigmoid(forgetgate)
 76 |     cellgate = F.tanh(cellgate)
 77 |     outgate = F.sigmoid(outgate)
 78 | 
 79 |     cy = (forgetgate * cx) + (ingate * cellgate)
 80 |     hy = outgate * F.tanh(cy)
 81 | 
 82 |     return hy, cy
 83 | 
 84 | 
 85 | def SkipConnectGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 86 |     input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 87 |     hx = torch.cat([hidden, hidden_skip], dim=1)
 88 |     hx = hx.expand(3, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 89 | 
 90 |     gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
 91 |     gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 92 |     i_r, i_i, i_n = gi
 93 |     h_r, h_i, h_n = gh
 94 | 
 95 |     resetgate = F.sigmoid(i_r + h_r)
 96 |     inputgate = F.sigmoid(i_i + h_i)
 97 |     newgate = F.tanh(i_n + resetgate * h_n)
 98 |     hy = newgate + inputgate * (hidden - newgate)
 99 | 
100 |     return hy
101 | 
102 | 
def SkipConnectFastGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
    """GRU cell with fused gate weights and a skip connection.

    `hidden_skip` is concatenated to `hidden` along dim 1 for the recurrent
    projection; the optional noise tensors are multiplied into the input and
    the joined hidden vector. The projections are split into three chunks
    along dim 1 (reset, input, new). Returns the new hidden state.
    """
    recurrent = torch.cat([hidden, hidden_skip], dim=1)
    if noise_hidden is not None:
        recurrent = recurrent * noise_hidden

    if noise_in is not None:
        input = input * noise_in

    if input.is_cuda:
        # CUDA inputs go through the fused point-wise backend.
        fused = fusedBackend.GRUFused.apply
        in_proj = F.linear(input, w_ih)
        hid_proj = F.linear(recurrent, w_hh)
        if b_ih is None:
            return fused(in_proj, hid_proj, hidden)
        return fused(in_proj, hid_proj, hidden, b_ih, b_hh)

    in_r, in_z, in_n = F.linear(input, w_ih, b_ih).chunk(3, 1)
    hid_r, hid_z, hid_n = F.linear(recurrent, w_hh, b_hh).chunk(3, 1)

    reset = F.sigmoid(in_r + hid_r)
    update = F.sigmoid(in_z + hid_z)
    candidate = F.tanh(in_n + reset * hid_n)
    # The interpolation uses the plain previous state, not the joined vector.
    return candidate + update * (hidden - candidate)
128 | 
129 | 
def SkipConnectRecurrent(reverse=False):
    """Build a function that unrolls a skip-connected cell over axis 0 of the input.

    The returned ``forward(input, skip_connect, hidden, cell, mask)`` keeps a
    buffer of ``length + 1`` hidden states, initialised everywhere with the
    initial hidden state. At step ``i`` the skip state is read from buffer
    row ``skip_connect[i]`` (one row index per batch element) and the new
    state is stored in row ``i + 1`` (row ``i`` when `reverse`). Masking
    follows the same rule as the plain masked recurrence: all values above
    0.5 -> take the cell result, some above 0.5 -> ``old + (new - old) * mask``,
    otherwise the state is carried over. Returns ``(final_hidden, outputs)``
    where the outputs are the buffer without its extra row.
    """
    def forward(input, skip_connect, hidden, cell, mask):
        length = input.size(0)
        # For LSTM the buffer stores h only.
        first_h = hidden[0] if isinstance(hidden, tuple) else hidden
        # [length + 1, batch, hidden_size]
        history = Variable(input.data.new(length + 1, *first_h.size()).zero_()) + first_h
        order = range(length - 1, -1, -1) if reverse else range(length)
        # Pairs every batch element with its own skip row when indexing.
        rows = torch.arange(0, first_h.size(0)).type_as(skip_connect)
        # Forward runs write one row ahead; row 0 holds the initial state.
        slot_shift = 0 if reverse else 1

        for t in order:
            if mask is None or mask[t].data.min() > 0.5:
                skipped = history[skip_connect[t], rows]
                hidden = cell(input[t], hidden, skipped)
            elif mask[t].data.max() > 0.5:
                skipped = history[skip_connect[t], rows]
                proposal = cell(input[t], hidden, skipped)
                gate = mask[t]
                if isinstance(hidden, tuple):
                    old_h, old_c = hidden
                    new_h, new_c = proposal
                    hidden = (old_h + (new_h - old_h) * gate, old_c + (new_c - old_c) * gate)
                else:
                    hidden = hidden + (proposal - hidden) * gate

            history[t + slot_shift] = hidden[0] if isinstance(hidden, tuple) else hidden

        # Drop the extra row: the last one for reverse runs, the first otherwise.
        output = history[:-1] if reverse else history[1:]

        return hidden, output

    return forward
169 | 
170 | 
def StackedRNN(inners, num_layers, lstm=False):
    """Stack skip-connected recurrences into a multi-layer network function.

    `inners` holds one recurrence per direction. In the returned
    ``forward(input, skip_connect, hidden, cells, mask)`` layer ``i`` /
    direction ``j`` uses ``hidden[i * len(inners) + j]`` and the cell at the
    same index; the direction outputs of a layer are concatenated on the last
    axis and fed to the next layer. Returns ``(next_hidden, output)``; with
    `lstm` the hidden values are ``(h, c)`` pairs.
    """
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def reverse_skip_connection(skip_connect):
        # TODO reverse skip connection for bidirectional rnn.
        return skip_connect

    def pack(states):
        # Join the per-layer states and restore a leading layer axis.
        return torch.cat(states, 0).view(total_layers, *states[0].size())

    def forward(input, skip_connect, hidden, cells, mask):
        assert (len(cells) == total_layers)

        # The first direction gets the skip indices as given; every other
        # direction gets the (currently identical) reversed version, which
        # is only computed for the two-direction case.
        forward_skip = skip_connect
        backward_skip = reverse_skip_connection(skip_connect) if num_directions == 2 else None

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        final_states = []
        for depth in range(num_layers):
            base = depth * num_directions
            direction_outputs = []
            for offset, run_direction in enumerate(inners):
                slot = base + offset
                chosen_skip = forward_skip if offset == 0 else backward_skip
                state, sequence = run_direction(input, chosen_skip, hidden[slot], cells[slot], mask)
                final_states.append(state)
                direction_outputs.append(sequence)

            input = torch.cat(direction_outputs, input.dim() - 1)

        if lstm:
            all_h, all_c = zip(*final_states)
            return (pack(all_h), pack(all_c)), input

        return pack(final_states), input

    return forward
212 | 
213 | 
def AutogradSkipConnectRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
    """Assemble a (bi)directional, multi-layer skip-connected RNN function.

    Returns ``forward(input, skip_connect, cells, hidden, mask)`` which
    yields ``(output, next_hidden)``. With `batch_first`, the first two axes
    of `input`, `skip_connect`, `mask` (when given) and of the returned
    output are swapped so the recurrence itself always runs over axis 0.
    """
    directions = [SkipConnectRecurrent()]
    if bidirectional:
        directions.append(SkipConnectRecurrent(reverse=True))

    stacked = StackedRNN(tuple(directions), num_layers, lstm=lstm)

    def forward(input, skip_connect, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            skip_connect = skip_connect.transpose(0, 1)
            mask = mask if mask is None else mask.transpose(0, 1)

        nexth, output = stacked(input, skip_connect, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)
        return output, nexth

    return forward
241 | 
242 | 
def SkipConnectStep():
    """Build a function that advances a skip-connected cell by one masked step.

    The returned ``forward(input, hidden, hidden_skip, cell, mask)`` yields
    ``(hidden, output)``. With no mask, or with every mask value above 0.5,
    the cell result replaces the state; with only some values above 0.5 the
    state becomes ``old + (new - old) * mask``; otherwise it is unchanged.
    """
    def forward(input, hidden, hidden_skip, cell, mask):
        fully_active = mask is None or mask.data.min() > 0.5
        if fully_active:
            hidden = cell(input, hidden, hidden_skip)
        elif mask.data.max() > 0.5:
            proposal = cell(input, hidden, hidden_skip)
            if isinstance(hidden, tuple):
                # LSTM: interpolate h and c separately.
                old_h, old_c = hidden
                new_h, new_c = proposal
                hidden = (old_h + (new_h - old_h) * mask, old_c + (new_c - old_c) * mask)
            else:
                hidden = hidden + (proposal - hidden) * mask

        # For LSTM the output is h, the first member of the state tuple.
        if isinstance(hidden, tuple):
            return hidden, hidden[0]
        return hidden, hidden

    return forward
262 | 
263 | 
def StackedStep(layer, num_layers, lstm=False):
    """Chain `layer` through `num_layers` skip-connected cells for one time step.

    The returned ``forward(input, hidden, hidden_skip, cells, mask)`` gives
    layer ``l`` the state ``hidden[l]`` and the skip state ``hidden_skip[l]``,
    feeds its output into layer ``l + 1`` and returns
    ``(next_hidden, output_of_last_layer)``. With `lstm`, `hidden` and the
    returned state are ``(h, c)`` pairs.
    """
    def pack(states):
        # Join the per-layer states and restore a leading layer axis.
        return torch.cat(states, 0).view(num_layers, *states[0].size())

    def forward(input, hidden, hidden_skip, cells, mask):
        assert (len(cells) == num_layers)

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        collected = []
        for depth in range(num_layers):
            state, input = layer(input, hidden[depth], hidden_skip[depth], cells[depth], mask)
            collected.append(state)

        if lstm:
            all_h, all_c = zip(*collected)
            return (pack(all_h), pack(all_c)), input

        return pack(collected), input

    return forward
289 | 
290 | 
def AutogradSkipConnectStep(num_layers=1, lstm=False):
    """Assemble a multi-layer, single time step skip-connected RNN function.

    Returns ``forward(input, cells, hidden, hidden_skip, mask)`` which yields
    ``(output, next_hidden)``.
    """
    stacked = StackedStep(SkipConnectStep(), num_layers, lstm=lstm)

    def forward(input, cells, hidden, hidden_skip, mask):
        # The stacked function uses a different argument order and returns
        # the state first; adapt both to this function's interface.
        state, result = stacked(input, hidden, hidden_skip, cells, mask)
        return result, state

    return forward
303 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/variational_rnn.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
  5 | from torch.nn import functional as F
  6 | 
  7 | 
  8 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
  9 |     if noise_in is not None:
 10 |         input = input * noise_in
 11 |     if noise_hidden is not None:
 12 |         hidden = hidden * noise_hidden
 13 |     hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 14 |     return hy
 15 | 
 16 | 
 17 | def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 18 |     if noise_in is not None:
 19 |         input = input * noise_in
 20 |     if noise_hidden is not None:
 21 |         hidden = hidden * noise_hidden
 22 |     hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
 23 |     return hy
 24 | 
 25 | 
 26 | def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 27 |     input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 28 | 
 29 |     hx, cx = hidden
 30 |     hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
 31 | 
 32 |     gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 33 | 
 34 |     ingate, forgetgate, cellgate, outgate = gates
 35 | 
 36 |     ingate = F.sigmoid(ingate)
 37 |     forgetgate = F.sigmoid(forgetgate)
 38 |     cellgate = F.tanh(cellgate)
 39 |     outgate = F.sigmoid(outgate)
 40 | 
 41 |     cy = (forgetgate * cx) + (ingate * cellgate)
 42 |     hy = outgate * F.tanh(cy)
 43 | 
 44 |     return hy, cy
 45 | 
 46 | 
 47 | def VarFastLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 48 |     if noise_in is not None:
 49 |         input = input * noise_in
 50 | 
 51 |     if input.is_cuda:
 52 |         igates = F.linear(input, w_ih)
 53 |         hgates = F.linear(hidden[0], w_hh) if noise_hidden is None else F.linear(hidden[0] * noise_hidden, w_hh)
 54 |         state = fusedBackend.LSTMFused.apply
 55 |         return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)
 56 | 
 57 |     hx, cx = hidden
 58 |     if noise_hidden is not None:
 59 |         hx = hx * noise_hidden
 60 |     gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
 61 | 
 62 |     ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
 63 | 
 64 |     ingate = F.sigmoid(ingate)
 65 |     forgetgate = F.sigmoid(forgetgate)
 66 |     cellgate = F.tanh(cellgate)
 67 |     outgate = F.sigmoid(outgate)
 68 | 
 69 |     cy = (forgetgate * cx) + (ingate * cellgate)
 70 |     hy = outgate * F.tanh(cy)
 71 | 
 72 |     return hy, cy
 73 | 
 74 | 
 75 | def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 76 |     input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
 77 |     hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden
 78 | 
 79 |     gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
 80 |     gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
 81 |     i_r, i_i, i_n = gi
 82 |     h_r, h_i, h_n = gh
 83 | 
 84 |     resetgate = F.sigmoid(i_r + h_r)
 85 |     inputgate = F.sigmoid(i_i + h_i)
 86 |     newgate = F.tanh(i_n + resetgate * h_n)
 87 |     hy = newgate + inputgate * (hidden - newgate)
 88 | 
 89 |     return hy
 90 | 
 91 | 
 92 | def VarFastGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
 93 |     if noise_in is not None:
 94 |         input = input * noise_in
 95 | 
 96 |     hx = hidden if noise_hidden is None else hidden * noise_hidden
 97 |     if input.is_cuda:
 98 |         gi = F.linear(input, w_ih)
 99 |         gh = F.linear(hx, w_hh)
100 |         state = fusedBackend.GRUFused.apply
101 |         return state(gi, gh, hidden) if b_ih is None else state(gi, gh, hidden, b_ih, b_hh)
102 | 
103 |     gi = F.linear(input, w_ih, b_ih)
104 |     gh = F.linear(hx, w_hh, b_hh)
105 |     i_r, i_i, i_n = gi.chunk(3, 1)
106 |     h_r, h_i, h_n = gh.chunk(3, 1)
107 | 
108 |     resetgate = F.sigmoid(i_r + h_r)
109 |     inputgate = F.sigmoid(i_i + h_i)
110 |     newgate = F.tanh(i_n + resetgate * h_n)
111 |     hy = newgate + inputgate * (hidden - newgate)
112 | 
113 |     return hy
114 | 
115 | 
def VarMaskedRecurrent(reverse=False):
    """Build a function that unrolls `cell` over the first axis of the input.

    The returned ``forward(input, hidden, cell, mask)`` visits the time steps
    in increasing order (decreasing when `reverse` is set) and returns
    ``(final_hidden, outputs)`` with one output per step, in time order.
    A step whose mask values are all above 0.5 (or with no mask) takes the
    cell result as is; a step with only some values above 0.5 interpolates
    ``old + (new - old) * mask``; otherwise the hidden state is carried over.
    """
    def emitted(state):
        # For LSTM only h, the first member of the state tuple, is emitted.
        return state[0] if isinstance(state, tuple) else state

    def blend(old, new, keep):
        return old + (new - old) * keep

    def forward(input, hidden, cell, mask):
        n_steps = input.size(0)
        visit_order = list(range(n_steps))
        if reverse:
            visit_order.reverse()

        # Filled by time index, so the result is in time order whatever the
        # visiting direction was.
        by_time = [None] * n_steps
        for t in visit_order:
            if mask is None or mask[t].data.min() > 0.5:
                hidden = cell(input[t], hidden)
            elif mask[t].data.max() > 0.5:
                proposal = cell(input[t], hidden)
                if isinstance(hidden, tuple):
                    hidden = (blend(hidden[0], proposal[0], mask[t]), blend(hidden[1], proposal[1], mask[t]))
                else:
                    hidden = blend(hidden, proposal, mask[t])
            by_time[t] = emitted(hidden)

        output = torch.cat(by_time, 0).view(n_steps, *by_time[0].size())

        return hidden, output

    return forward
142 | 
143 | 
def StackedRNN(inners, num_layers, lstm=False):
    """Stack per-direction recurrences into a multi-layer network function.

    `inners` holds one recurrence per direction. In the returned
    ``forward(input, hidden, cells, mask)`` layer ``i`` / direction ``j``
    uses ``hidden[i * len(inners) + j]`` and the cell at the same index; the
    direction outputs of a layer are concatenated on the last axis and fed
    to the next layer. Returns ``(next_hidden, output)``; with `lstm` the
    hidden values are ``(h, c)`` pairs.
    """
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def pack(states):
        # Join the per-layer states and restore a leading layer axis.
        return torch.cat(states, 0).view(total_layers, *states[0].size())

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        final_states = []
        for depth in range(num_layers):
            base = depth * num_directions
            direction_outputs = []
            for offset, run_direction in enumerate(inners):
                slot = base + offset
                state, sequence = run_direction(input, hidden[slot], cells[slot], mask)
                final_states.append(state)
                direction_outputs.append(sequence)

            input = torch.cat(direction_outputs, input.dim() - 1)

        if lstm:
            all_h, all_c = zip(*final_states)
            return (pack(all_h), pack(all_c)), input

        return pack(final_states), input

    return forward
177 | 
178 | 
def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
    """Assemble a (bi)directional, multi-layer masked RNN function.

    Returns ``forward(input, cells, hidden, mask)`` which yields
    ``(output, next_hidden)``. With `batch_first`, the first two axes of
    `input`, `mask` (when given) and of the returned output are swapped so
    the recurrence itself always runs over axis 0.
    """
    directions = [VarMaskedRecurrent()]
    if bidirectional:
        directions.append(VarMaskedRecurrent(reverse=True))

    stacked = StackedRNN(tuple(directions), num_layers, lstm=lstm)

    def forward(input, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            mask = mask if mask is None else mask.transpose(0, 1)

        nexth, output = stacked(input, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)
        return output, nexth

    return forward
205 | 
206 | 
def VarMaskedStep():
    """Build a single-time-step function that honours a per-example mask.

    Returns:
        A function ``forward(input, hidden, cell, mask)`` returning
        ``(new_hidden, output)``. ``cell`` is called as ``cell(input, hidden)``.
        If ``mask`` is None or all entries exceed 0.5 the cell result is used
        directly; if only some entries exceed 0.5 the old and new states are
        blended as ``old + (new - old) * mask``; otherwise the state is left
        untouched and ``cell`` is not called. ``output`` is the state itself,
        or its first element when the state is a tuple (LSTM).
    """
    def forward(input, hidden, cell, mask):
        if mask is None or mask.data.min() > 0.5:
            # every example is active: plain update
            state = cell(input, hidden)
        elif mask.data.max() > 0.5:
            # mixed batch: masked-out examples keep their previous state
            candidate = cell(input, hidden)
            if isinstance(hidden, tuple):
                # LSTM state is an (h, c) pair; blend both parts
                h_prev, c_prev = hidden
                h_cand, c_cand = candidate
                state = (h_prev + (h_cand - h_prev) * mask, c_prev + (c_cand - c_prev) * mask)
            else:
                state = hidden + (candidate - hidden) * mask
        else:
            # nothing is active: carry the state over unchanged
            state = hidden

        # for an LSTM the visible output is h, the first element of the state
        if isinstance(state, tuple):
            output = state[0]
        else:
            output = state

        return state, output

    return forward
226 | 
227 | 
def StackedStep(layer, num_layers, lstm=False):
    """Build a function that pushes one time step through a stack of layers.

    Args:
        layer: callable used for every layer, invoked as
            ``layer(input, hidden, cell, mask)`` and returning ``(state, output)``.
        num_layers: number of stacked layers.
        lstm: if True, ``hidden`` is a ``(h, c)`` pair and every per-layer state
            is itself an ``(h, c)`` tuple.

    Returns:
        A function ``forward(input, hidden, cells, mask)`` returning
        ``(next_hidden, output)``, where ``output`` is the last layer's output.
    """
    def _pack(states):
        # Concatenate the per-layer states on dim 0, then restore a leading
        # axis of size ``num_layers``.
        return torch.cat(states, 0).view(num_layers, *states[0].size())

    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)

        if lstm:
            # (h_all, c_all) -> [(h_0, c_0), (h_1, c_1), ...]
            hidden = list(zip(*hidden))

        states = []
        for depth in range(num_layers):
            # each layer's output becomes the next layer's input
            state, input = layer(input, hidden[depth], cells[depth], mask)
            states.append(state)

        if not lstm:
            return _pack(states), input

        all_h, all_c = zip(*states)
        return (_pack(all_h), _pack(all_c)), input

    return forward
253 | 
254 | 
def AutogradVarMaskedStep(num_layers=1, lstm=False):
    """Build the single-step forward function of a stacked masked RNN.

    A ``VarMaskedStep()`` is wrapped by ``StackedStep`` so that one time step is
    pushed through ``num_layers`` layers.

    Args:
        num_layers: number of stacked layers.
        lstm: forwarded to ``StackedStep``.

    Returns:
        A function ``forward(input, cells, hidden, mask)`` returning
        ``(output, next_hidden)``. Note that ``cells`` comes before ``hidden``
        in this signature, the opposite of the stacked function it wraps.
    """
    stacked = StackedStep(VarMaskedStep(), num_layers, lstm=lstm)

    def forward(input, cells, hidden, mask):
        next_hidden, step_output = stacked(input, hidden, cells, mask)
        return step_output, next_hidden

    return forward
267 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/init.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | from torch.autograd import Variable
 4 | 
 5 | 
 6 | def assign_tensor(tensor, val):
 7 |     """
 8 |     copy val to tensor
 9 |     Args:
10 |         tensor: an n-dimensional torch.Tensor or autograd.Variable
11 |         val: an n-dimensional torch.Tensor to fill the tensor with
12 | 
13 |     Returns:
14 | 
15 |     """
16 |     if isinstance(tensor, Variable):
17 |         assign_tensor(tensor.data, val)
18 |         return tensor
19 |     return tensor.copy_(val)
20 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/modules/__init__.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | from .masked_rnn import *
 4 | from .variational_rnn import *
 5 | from .skipconnect_rnn import *
 6 | from .crf import *
 7 | from .sparse import *
 8 | from .attention import *
 9 | from .linear import *
10 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/modules/attention.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | from torch.nn.parameter import Parameter
  7 | 
  8 | 
  9 | class BiAAttention(nn.Module):
 10 |     '''
 11 |     Bi-Affine attention layer.
 12 |     '''
 13 | 
 14 |     def __init__(self, input_size_encoder, input_size_decoder, num_labels, biaffine=True, **kwargs):
 15 |         '''
 16 | 
 17 |         Args:
 18 |             input_size_encoder: int
 19 |                 the dimension of the encoder input.
 20 |             input_size_decoder: int
 21 |                 the dimension of the decoder input.
 22 |             num_labels: int
 23 |                 the number of labels of the crf layer
 24 |             biaffine: bool
 25 |                 if apply bi-affine parameter.
 26 |             **kwargs:
 27 |         '''
 28 |         super(BiAAttention, self).__init__()
 29 |         self.input_size_encoder = input_size_encoder
 30 |         self.input_size_decoder = input_size_decoder
 31 |         self.num_labels = num_labels
 32 |         self.biaffine = biaffine
 33 | 
 34 |         self.W_d = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder))
 35 |         self.W_e = Parameter(torch.Tensor(self.num_labels, self.input_size_encoder))
 36 |         self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
 37 |         if self.biaffine:
 38 |             self.U = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder, self.input_size_encoder))
 39 |         else:
 40 |             self.register_parameter('U', None)
 41 | 
 42 |         self.reset_parameters()
 43 | 
 44 |     def reset_parameters(self):
 45 |         nn.init.xavier_uniform(self.W_d)
 46 |         nn.init.xavier_uniform(self.W_e)
 47 |         nn.init.constant(self.b, 0.)
 48 |         if self.biaffine:
 49 |             nn.init.xavier_uniform(self.U)
 50 | 
 51 |     def forward(self, input_d, input_e, mask_d=None, mask_e=None):
 52 |         '''
 53 | 
 54 |         Args:
 55 |             input_d: Tensor
 56 |                 the decoder input tensor with shape = [batch, length_decoder, input_size]
 57 |             input_e: Tensor
 58 |                 the child input tensor with shape = [batch, length_encoder, input_size]
 59 |             mask_d: Tensor or None
 60 |                 the mask tensor for decoder with shape = [batch, length_decoder]
 61 |             mask_e: Tensor or None
 62 |                 the mask tensor for encoder with shape = [batch, length_encoder]
 63 | 
 64 |         Returns: Tensor
 65 |             the energy tensor with shape = [batch, num_label, length, length]
 66 | 
 67 |         '''
 68 |         assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.'
 69 |         batch, length_decoder, _ = input_d.size()
 70 |         _, length_encoder, _ = input_e.size()
 71 | 
 72 |         # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
 73 |         # the output shape is [batch, num_label, length_decoder]
 74 |         out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
 75 |         # compute decoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
 76 |         # the output shape is [batch, num_label, length_encoder]
 77 |         out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)
 78 | 
 79 |         # output shape [batch, num_label, length_decoder, length_encoder]
 80 |         if self.biaffine:
 81 |             # compute bi-affine part
 82 |             # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
 83 |             # output shape [batch, num_label, length_decoder, input_size_encoder]
 84 |             output = torch.matmul(input_d.unsqueeze(1), self.U)
 85 |             # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
 86 |             # output shape [batch, num_label, length_decoder, length_encoder]
 87 |             output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))
 88 | 
 89 |             output = output + out_d + out_e + self.b
 90 |         else:
 91 |             output = out_d + out_d + self.b
 92 | 
 93 |         if mask_d is not None:
 94 |             output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2)
 95 | 
 96 |         return output
 97 | 
 98 | 
 99 | class ConcatAttention(nn.Module):
100 |     '''
101 |     Concatenate attention layer.
102 |     '''
103 |     # TODO test it!
104 | 
105 |     def __init__(self, input_size_encoder, input_size_decoder, hidden_size, num_labels, **kwargs):
106 |         '''
107 | 
108 |         Args:
109 |             input_size_encoder: int
110 |                 the dimension of the encoder input.
111 |             input_size_decoder: int
112 |                 the dimension of the decoder input.
113 |             hidden_size: int
114 |                 the dimension of the hidden.
115 |             num_labels: int
116 |                 the number of labels of the crf layer
117 |             biaffine: bool
118 |                 if apply bi-affine parameter.
119 |             **kwargs:
120 |         '''
121 |         super(ConcatAttention, self).__init__()
122 |         self.input_size_encoder = input_size_encoder
123 |         self.input_size_decoder = input_size_decoder
124 |         self.hidden_size = hidden_size
125 |         self.num_labels = num_labels
126 | 
127 |         self.W_d = Parameter(torch.Tensor(self.input_size_decoder, self.hidden_size))
128 |         self.W_e = Parameter(torch.Tensor(self.input_size_encoder, self.hidden_size))
129 |         self.b = Parameter(torch.Tensor(self.hidden_size))
130 |         self.v = Parameter(torch.Tensor(self.hidden_size, self.num_labels))
131 | 
132 |         self.reset_parameters()
133 | 
134 |     def reset_parameters(self):
135 |         nn.init.xavier_uniform(self.W_d)
136 |         nn.init.xavier_uniform(self.W_e)
137 |         nn.init.xavier_uniform(self.v)
138 |         nn.init.constant(self.b, 0.)
139 | 
140 |     def forward(self, input_d, input_e, mask_d=None, mask_e=None):
141 |         '''
142 | 
143 |         Args:
144 |             input_d: Tensor
145 |                 the decoder input tensor with shape = [batch, length_decoder, input_size]
146 |             input_e: Tensor
147 |                 the child input tensor with shape = [batch, length_encoder, input_size]
148 |             mask_d: Tensor or None
149 |                 the mask tensor for decoder with shape = [batch, length_decoder]
150 |             mask_e: Tensor or None
151 |                 the mask tensor for encoder with shape = [batch, length_encoder]
152 | 
153 |         Returns: Tensor
154 |             the energy tensor with shape = [batch, num_label, length, length]
155 | 
156 |         '''
157 |         assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.'
158 |         batch, length_decoder, _ = input_d.size()
159 |         _, length_encoder, _ = input_e.size()
160 | 
161 |         # compute decoder part: [batch, length_decoder, input_size_decoder] * [input_size_decoder, hidden_size]
162 |         # the output shape is [batch, length_decoder, hidden_size]
163 |         # then --> [batch, 1, length_decoder, hidden_size]
164 |         out_d = torch.matmul(input_d, self.W_d).unsqueeze(1)
165 |         # compute decoder part: [batch, length_encoder, input_size_encoder] * [input_size_encoder, hidden_size]
166 |         # the output shape is [batch, length_encoder, hidden_size]
167 |         # then --> [batch, length_encoder, 1, hidden_size]
168 |         out_e = torch.matmul(input_e, self.W_e).unsqueeze(2)
169 | 
170 |         # add them together [batch, length_encoder, length_decoder, hidden_size]
171 |         out = F.tanh(out_d + out_e + self.b)
172 | 
173 |         # product with v
174 |         # [batch, length_encoder, length_decoder, hidden_size] * [hidden, num_label]
175 |         # [batch, length_encoder, length_decoder, num_labels]
176 |         # then --> [batch, num_labels, length_decoder, length_encoder]
177 |         return torch.matmul(out, self.v).transpose(1, 3)
178 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/modules/linear.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'max'
 2 | 
 3 | import math
 4 | import numpy as np
 5 | import torch
 6 | import torch.nn as nn
 7 | import torch.nn.functional as F
 8 | from torch.nn.parameter import Parameter
 9 | 
10 | 
class BiLinear(nn.Module):
    '''
    Bi-linear layer

    Computes ``left^T U right + W_l left + W_r right (+ bias)`` for every pair
    of left/right vectors sharing the same leading (batch) dimensions.
    '''
    def __init__(self, left_features, right_features, out_features, bias=True):
        '''

        Args:
            left_features: size of left input
            right_features: size of right input
            out_features: size of output
            bias: If set to False, the layer will not learn an additive bias.
                Default: True
        '''
        super(BiLinear, self).__init__()
        self.left_features = left_features
        self.right_features = right_features
        self.out_features = out_features

        self.U = Parameter(torch.Tensor(self.out_features, self.left_features, self.right_features))
        self.W_l = Parameter(torch.Tensor(self.out_features, self.left_features))
        # W_r multiplies the RIGHT input, so its second dimension must be right_features
        self.W_r = Parameter(torch.Tensor(self.out_features, self.right_features))

        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        '''
        Xavier-initialise the weights and zero the bias (when one exists).
        '''
        nn.init.xavier_uniform(self.W_l)
        nn.init.xavier_uniform(self.W_r)
        # the bias is registered as None when the layer was built with bias=False
        if self.bias is not None:
            nn.init.constant(self.bias, 0.)
        nn.init.xavier_uniform(self.U)

    def forward(self, input_left, input_right):
        '''

        Args:
            input_left: Tensor
                the left input tensor with shape = [batch1, batch2, ..., left_features]
            input_right: Tensor
                the right input tensor with shape = [batch1, batch2, ..., right_features]

        Returns: Tensor
            the output tensor with shape = [batch1, batch2, ..., out_features]

        '''

        left_size = input_left.size()
        right_size = input_right.size()
        assert left_size[:-1] == right_size[:-1], \
            "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
        batch = int(np.prod(left_size[:-1]))

        # convert left and right input to matrices [batch, left_features], [batch, right_features]
        # (contiguous() lets view() succeed on transposed/sliced inputs)
        input_left = input_left.contiguous().view(batch, self.left_features)
        input_right = input_right.contiguous().view(batch, self.right_features)

        # output [batch, out_features]
        output = F.bilinear(input_left, input_right, self.U, self.bias)
        output = output + F.linear(input_left, self.W_l, None) + F.linear(input_right, self.W_r, None)
        # convert back to [batch1, batch2, ..., out_features]
        return output.view(left_size[:-1] + (self.out_features, ))

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + 'in1_features=' + str(self.left_features) \
               + ', in2_features=' + str(self.right_features) \
               + ', out_features=' + str(self.out_features) + ')'
81 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/modules/sparse.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import numpy as np
  4 | import torch
  5 | import torch.nn as nn
  6 | from torch.autograd import Variable
  7 | from torch.nn.parameter import Parameter
  8 | 
  9 | from ..init import assign_tensor
 10 | 
 11 | 
 12 | class Embedding(nn.Module):
 13 |     r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
 14 |     This module is often used to store word embeddings and retrieve them using indices.
 15 |     The input to the module is a list of indices, and the output is the corresponding
 16 |     word embeddings.
 17 |     Args:
 18 |         num_embeddings (int): size of the dictionary of embeddings
 19 |         embedding_dim (int): the size of each embedding vector
 20 |         init_embedding (Tensor or Variable): If given, the embedding will be initialized with the given tensor.
 21 |         freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
 22 |         padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index.
 23 |         max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
 24 |         norm_type (float, optional): The p of the p-norm to compute for the max_norm option
 25 |         scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
 26 |                                                 the words in the mini-batch.
 27 |         sparse (boolean, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
 28 |                                     more details regarding sparse gradients.
 29 |     Attributes:
 30 |         weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
 31 |     Shape:
 32 |         - Input: LongTensor `(N1, N2, ...,Nm, W)`, N = mini-batch, W = number of indices to extract per mini-batch
 33 |         - Output: `(N1, N2, ..., Nm, W, embedding_dim)`
 34 |     Notes:
 35 |         Keep in mind that only a limited number of optimizers support
 36 |         sparse gradients: currently it's `optim.SGD` (`cuda` and `cpu`),
 37 |         and `optim.Adagrad` (`cpu`)
 38 |     """
 39 | 
 40 |     def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False, padding_idx=None,
 41 |                  max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False):
 42 |         super(Embedding, self).__init__()
 43 |         self.num_embeddings = num_embeddings
 44 |         self.embedding_dim = embedding_dim
 45 |         self.padding_idx = padding_idx
 46 |         self.max_norm = max_norm
 47 |         self.norm_type = norm_type
 48 |         self.scale_grad_by_freq = scale_grad_by_freq
 49 |         self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
 50 |         self.frozen = freeze
 51 |         self.sparse = sparse
 52 | 
 53 |         self.reset_parameters(init_embedding)
 54 | 
 55 |     def reset_parameters(self, init_embedding):
 56 |         if init_embedding is None:
 57 |             scale = np.sqrt(3.0 / self.embedding_dim)
 58 |             self.weight.data.uniform_(-scale, scale)
 59 |         else:
 60 |             assign_tensor(self.weight, init_embedding)
 61 |         if self.padding_idx is not None:
 62 |             self.weight.data[self.padding_idx].fill_(0)
 63 | 
 64 |         if self.frozen:
 65 |             if init_embedding is None:
 66 |                 raise Warning('Freeze embeddings which are randomly initialized.')
 67 |             self.weight.requires_grad = False
 68 | 
 69 |     def freeze(self):
 70 |         self.weight.requires_grad = False
 71 |         self.frozen = True
 72 | 
 73 |     def forward(self, input):
 74 |         padding_idx = self.padding_idx
 75 |         if padding_idx is None:
 76 |             padding_idx = -1
 77 | 
 78 |         input_size = input.size()
 79 |         if input.dim() > 2:
 80 |             num_inputs = int(np.prod(input_size[:-1]))
 81 |             input = input.view(num_inputs, input_size[-1])
 82 | 
 83 |         output_size = input_size + (self.embedding_dim,)
 84 |         return self._backend.Embedding.apply(
 85 |             input, self.weight,
 86 |             padding_idx, self.max_norm, self.norm_type,
 87 |             self.scale_grad_by_freq, self.sparse).view(output_size)
 88 | 
 89 |     def __repr__(self):
 90 |         s = '{name}({num_embeddings}, {embedding_dim}'
 91 |         if self.padding_idx is not None:
 92 |             s += ', padding_idx={padding_idx}'
 93 |         if self.max_norm is not None:
 94 |             s += ', max_norm={max_norm}'
 95 |         if self.norm_type != 2:
 96 |             s += ', norm_type={norm_type}'
 97 |         if self.scale_grad_by_freq is not False:
 98 |             s += ', scale_grad_by_freq={scale_grad_by_freq}'
 99 |         if self.sparse is not False:
100 |             s += ', sparse=True'
101 |         s += ')'
102 |         return s.format(name=self.__class__.__name__, **self.__dict__)
103 | 


--------------------------------------------------------------------------------
/neuronlp2/nn/utils.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | from itertools import repeat
 3 | import torch
 4 | import torch.nn.utils.rnn as rnn_utils
 5 | from torch.autograd import Variable
 6 | 
 7 | 
 8 | def _ntuple(n):
 9 |     def parse(x):
10 |         if isinstance(x, collections.Iterable):
11 |             return x
12 |         return tuple(repeat(x, n))
13 |     return parse
14 | 
15 | _single = _ntuple(1)
16 | _pair = _ntuple(2)
17 | _triple = _ntuple(3)
18 | _quadruple = _ntuple(4)
19 | 
20 | 
def prepare_rnn_seq(rnn_input, lengths, hx=None, masks=None, batch_first=False):
    '''
    Sort a batch by decreasing length (if it is not already) and pack it.

    Args:
        rnn_input: [seq_len, batch, input_size]: tensor containing the features of the input sequence.
        lengths: [batch]: tensor containing the lengthes of the input sequence
        hx: [num_layers * num_directions, batch, hidden_size]: tensor containing the initial hidden state for each element in the batch.
            May also be an (h, c) pair for LSTMs.
        masks: [seq_len, batch]: tensor containing the mask for each element in the batch.
        batch_first: If True, then the input and output tensors are provided as [batch, seq_len, feature].

    Returns:
        (seq, hx, rev_order, masks): the packed sequence, the initial state re-ordered
        like the batch, the index that undoes the re-ordering (None when the batch was
        already sorted), and the masks truncated to the longest sequence. The masks are
        truncated only; they are not re-ordered.

    '''
    sorted_lens, perm = torch.sort(lengths, dim=0, descending=True)
    already_sorted = torch.ne(sorted_lens, lengths).sum() == 0

    if already_sorted:
        lens = lengths
        rev_order = None
    else:
        lens = sorted_lens
        # sorting the permutation yields the indices of its inverse
        _, inverse = torch.sort(perm)
        order = Variable(perm)
        rev_order = Variable(inverse)

        rnn_input = rnn_input.index_select(0 if batch_first else 1, order)
        # the batch axis of the hidden state is always dimension 1
        if isinstance(hx, tuple):
            # hack lstm
            h_state, c_state = hx
            hx = (h_state.index_select(1, order), c_state.index_select(1, order))
        elif hx is not None:
            hx = hx.index_select(1, order)

    lens = lens.tolist()
    seq = rnn_utils.pack_padded_sequence(rnn_input, lens, batch_first=batch_first)

    if masks is not None:
        # lens[0] is the longest length of the (sorted) batch
        longest = lens[0]
        masks = masks[:, :longest] if batch_first else masks[:longest]

    return seq, hx, rev_order, masks
69 | 
70 | 
71 | def recover_rnn_seq(seq, rev_order, hx=None, batch_first=False):
72 |     output, _ = rnn_utils.pad_packed_sequence(seq, batch_first=batch_first)
73 |     if rev_order is not None:
74 |         batch_dim = 0 if batch_first else 1
75 |         output = output.index_select(batch_dim, rev_order)
76 |         if hx is not None:
77 |             # hack lstm
78 |             if isinstance(hx, tuple):
79 |                 hx, cx = hx
80 |                 hx = hx.index_select(1, rev_order)
81 |                 cx = cx.index_select(1, rev_order)
82 |                 hx = (hx, cx)
83 |             else:
84 |                 hx = hx.index_select(1, rev_order)
85 |     return output, hx
86 | 


--------------------------------------------------------------------------------
/neuronlp2/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 | 
3 | from .parser import *
4 | 


--------------------------------------------------------------------------------
/neuronlp2/tasks/parser.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import re
  4 | import numpy as np
  5 | 
  6 | def is_uni_punctuation(word):
  7 |     match = re.match("^[^\w\s]+$]", word, flags=re.UNICODE)
  8 |     return match is not None
  9 | 
 10 | def is_punctuation(word, pos, punct_set=None):
 11 |     if punct_set is None:
 12 |         return is_uni_punctuation(word)
 13 |     else:
 14 |         return pos in punct_set
 15 | 
 16 | 
 17 | def eval(words, postags, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths,
 18 |          punct_set=None, symbolic_root=False, symbolic_end=False):
 19 |     batch_size, _ = words.shape
 20 |     ucorr = 0.
 21 |     lcorr = 0.
 22 |     total = 0.
 23 |     ucomplete_match = 0.
 24 |     lcomplete_match = 0.
 25 | 
 26 |     ucorr_nopunc = 0.
 27 |     lcorr_nopunc = 0.
 28 |     total_nopunc = 0.
 29 |     ucomplete_match_nopunc = 0.
 30 |     lcomplete_match_nopunc = 0.
 31 | 
 32 |     corr_root = 0.
 33 |     total_root = 0.
 34 |     start = 1 if symbolic_root else 0
 35 |     end = 1 if symbolic_end else 0
 36 |     for i in range(batch_size):
 37 |         ucm = 1.
 38 |         lcm = 1.
 39 |         ucm_nopunc = 1.
 40 |         lcm_nopunc = 1.
 41 |         for j in range(start, lengths[i] - end):
 42 |             word = word_alphabet.get_instance(words[i, j])
 43 |             word = word.encode('utf8')
 44 | 
 45 |             pos = pos_alphabet.get_instance(postags[i, j])
 46 |             pos = pos.encode('utf8')
 47 | 
 48 |             total += 1
 49 |             if heads[i, j] == heads_pred[i, j]:
 50 |                 ucorr += 1
 51 |                 if types[i, j] == types_pred[i, j]:
 52 |                     lcorr += 1
 53 |                 else:
 54 |                     lcm = 0
 55 |             else:
 56 |                 ucm = 0
 57 |                 lcm = 0
 58 | 
 59 |             if not is_punctuation(word, pos, punct_set):
 60 |                 total_nopunc += 1
 61 |                 if heads[i, j] == heads_pred[i, j]:
 62 |                     ucorr_nopunc += 1
 63 |                     if types[i, j] == types_pred[i, j]:
 64 |                         lcorr_nopunc += 1
 65 |                     else:
 66 |                         lcm_nopunc = 0
 67 |                 else:
 68 |                     ucm_nopunc = 0
 69 |                     lcm_nopunc = 0
 70 | 
 71 |             if heads[i, j] == 0:
 72 |                 total_root += 1
 73 |                 corr_root += 1 if heads_pred[i, j] == 0 else 0
 74 | 
 75 |         ucomplete_match += ucm
 76 |         lcomplete_match += lcm
 77 |         ucomplete_match_nopunc += ucm_nopunc
 78 |         lcomplete_match_nopunc += lcm_nopunc
 79 | 
 80 |     return (ucorr, lcorr, total, ucomplete_match, lcomplete_match), \
 81 |            (ucorr_nopunc, lcorr_nopunc, total_nopunc, ucomplete_match_nopunc, lcomplete_match_nopunc), \
 82 |            (corr_root, total_root), batch_size
 83 | 
 84 | 
 85 | def decode_MST(energies, lengths, leading_symbolic=0, labeled=True):
 86 |     """
 87 |     decode best parsing tree with MST algorithm.
 88 |     :param energies: energies: numpy 4D tensor
 89 |         energies of each edge. the shape is [batch_size, num_labels, n_steps, n_steps],
 90 |         where the summy root is at index 0.
 91 |     :param masks: numpy 2D tensor
 92 |         masks in the shape [batch_size, n_steps].
 93 |     :param leading_symbolic: int
 94 |         number of symbolic dependency types leading in type alphabets)
 95 |     :return:
 96 |     """
 97 | 
 98 |     def find_cycle(par):
 99 |         added = np.zeros([length], np.bool)
100 |         added[0] = True
101 |         cycle = set()
102 |         findcycle = False
103 |         for i in range(1, length):
104 |             if findcycle:
105 |                 break
106 | 
107 |             if added[i] or not curr_nodes[i]:
108 |                 continue
109 | 
110 |             # init cycle
111 |             tmp_cycle = set()
112 |             tmp_cycle.add(i)
113 |             added[i] = True
114 |             findcycle = True
115 |             l = i
116 | 
117 |             while par[l] not in tmp_cycle:
118 |                 l = par[l]
119 |                 if added[l]:
120 |                     findcycle = False
121 |                     break
122 |                 added[l] = True
123 |                 tmp_cycle.add(l)
124 | 
125 |             if findcycle:
126 |                 lorg = l
127 |                 cycle.add(lorg)
128 |                 l = par[lorg]
129 |                 while l != lorg:
130 |                     cycle.add(l)
131 |                     l = par[l]
132 |                 break
133 | 
134 |         return findcycle, cycle
135 | 
136 |     def chuLiuEdmonds():
137 |         par = np.zeros([length], dtype=np.int32)
138 |         # create best graph
139 |         par[0] = -1
140 |         for i in range(1, length):
141 |             # only interested at current nodes
142 |             if curr_nodes[i]:
143 |                 max_score = score_matrix[0, i]
144 |                 par[i] = 0
145 |                 for j in range(1, length):
146 |                     if j == i or not curr_nodes[j]:
147 |                         continue
148 | 
149 |                     new_score = score_matrix[j, i]
150 |                     if new_score > max_score:
151 |                         max_score = new_score
152 |                         par[i] = j
153 | 
154 |         # find a cycle
155 |         findcycle, cycle = find_cycle(par)
156 |         # no cycles, get all edges and return them.
157 |         if not findcycle:
158 |             final_edges[0] = -1
159 |             for i in range(1, length):
160 |                 if not curr_nodes[i]:
161 |                     continue
162 | 
163 |                 pr = oldI[par[i], i]
164 |                 ch = oldO[par[i], i]
165 |                 final_edges[ch] = pr
166 |             return
167 | 
168 |         cyc_len = len(cycle)
169 |         cyc_weight = 0.0
170 |         cyc_nodes = np.zeros([cyc_len], dtype=np.int32)
171 |         id = 0
172 |         for cyc_node in cycle:
173 |             cyc_nodes[id] = cyc_node
174 |             id += 1
175 |             cyc_weight += score_matrix[par[cyc_node], cyc_node]
176 | 
177 |         rep = cyc_nodes[0]
178 |         for i in range(length):
179 |             if not curr_nodes[i] or i in cycle:
180 |                 continue
181 | 
182 |             max1 = float("-inf")
183 |             wh1 = -1
184 |             max2 = float("-inf")
185 |             wh2 = -1
186 | 
187 |             for j in range(cyc_len):
188 |                 j1 = cyc_nodes[j]
189 |                 if score_matrix[j1, i] > max1:
190 |                     max1 = score_matrix[j1, i]
191 |                     wh1 = j1
192 | 
193 |                 scr = cyc_weight + score_matrix[i, j1] - score_matrix[par[j1], j1]
194 | 
195 |                 if scr > max2:
196 |                     max2 = scr
197 |                     wh2 = j1
198 | 
199 |             score_matrix[rep, i] = max1
200 |             oldI[rep, i] = oldI[wh1, i]
201 |             oldO[rep, i] = oldO[wh1, i]
202 |             score_matrix[i, rep] = max2
203 |             oldO[i, rep] = oldO[i, wh2]
204 |             oldI[i, rep] = oldI[i, wh2]
205 | 
206 |         rep_cons = []
207 |         for i in range(cyc_len):
208 |             rep_cons.append(set())
209 |             cyc_node = cyc_nodes[i]
210 |             for cc in reps[cyc_node]:
211 |                 rep_cons[i].add(cc)
212 | 
213 |         for i in range(1, cyc_len):
214 |             cyc_node = cyc_nodes[i]
215 |             curr_nodes[cyc_node] = False
216 |             for cc in reps[cyc_node]:
217 |                 reps[rep].add(cc)
218 | 
219 |         chuLiuEdmonds()
220 | 
221 |         # check each node in cycle, if one of its representatives is a key in the final_edges, it is the one.
222 |         found = False
223 |         wh = -1
224 |         for i in range(cyc_len):
225 |             for repc in rep_cons[i]:
226 |                 if repc in final_edges:
227 |                     wh = cyc_nodes[i]
228 |                     found = True
229 |                     break
230 |             if found:
231 |                 break
232 | 
233 |         l = par[wh]
234 |         while l != wh:
235 |             ch = oldO[par[l], l]
236 |             pr = oldI[par[l], l]
237 |             final_edges[ch] = pr
238 |             l = par[l]
239 | 
240 |     if labeled:
241 |         assert energies.ndim == 4, 'dimension of energies is not equal to 4'
242 |     else:
243 |         assert energies.ndim == 3, 'dimension of energies is not equal to 3'
244 |     input_shape = energies.shape
245 |     batch_size = input_shape[0]
246 |     max_length = input_shape[2]
247 | 
248 |     pars = np.zeros([batch_size, max_length], dtype=np.int32)
249 |     types = np.zeros([batch_size, max_length], dtype=np.int32) if labeled else None
250 |     for i in range(batch_size):
251 |         energy = energies[i]
252 | 
253 |         # calc the realy length of this instance
254 |         length = lengths[i]
255 | 
256 |         # calc real energy matrix shape = [length, length, num_labels - #symbolic] (remove the label for symbolic types).
257 |         if labeled:
258 |             energy = energy[leading_symbolic:, :length, :length]
259 |             # get best label for each edge.
260 |             label_id_matrix = energy.argmax(axis=0) + leading_symbolic
261 |             energy = energy.max(axis=0)
262 |         else:
263 |             energy = energy[:length, :length]
264 |             label_id_matrix = None
265 |         # get original score matrix
266 |         orig_score_matrix = energy
267 |         # initialize score matrix to original score matrix
268 |         score_matrix = np.array(orig_score_matrix, copy=True)
269 | 
270 |         oldI = np.zeros([length, length], dtype=np.int32)
271 |         oldO = np.zeros([length, length], dtype=np.int32)
272 |         curr_nodes = np.zeros([length], dtype=np.bool)
273 |         reps = []
274 | 
275 |         for s in range(length):
276 |             orig_score_matrix[s, s] = 0.0
277 |             score_matrix[s, s] = 0.0
278 |             curr_nodes[s] = True
279 |             reps.append(set())
280 |             reps[s].add(s)
281 |             for t in range(s + 1, length):
282 |                 oldI[s, t] = s
283 |                 oldO[s, t] = t
284 | 
285 |                 oldI[t, s] = t
286 |                 oldO[t, s] = s
287 | 
288 |         final_edges = dict()
289 |         chuLiuEdmonds()
290 |         par = np.zeros([max_length], np.int32)
291 |         if labeled:
292 |             type = np.ones([max_length], np.int32)
293 |             type[0] = 0
294 |         else:
295 |             type = None
296 | 
297 |         for ch, pr in final_edges.items():
298 |             par[ch] = pr
299 |             if labeled and ch != 0:
300 |                 type[ch] = label_id_matrix[pr, ch]
301 | 
302 |         par[0] = 0
303 |         pars[i] = par
304 |         if labeled:
305 |             types[i] = type
306 | 
307 |     return pars, types
308 | 


--------------------------------------------------------------------------------
/neuronlp2/utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'max'
  2 | 
  3 | import pickle
  4 | import numpy as np
  5 | from gensim.models.word2vec import Word2Vec
  6 | import gzip
  7 | 
  8 | from .io import utils
  9 | 
 10 | 
 11 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
 12 |     """
 13 |     load word embeddings from file
 14 |     :param embedding:
 15 |     :param embedding_path:
 16 |     :return: embedding dict, embedding dimention, caseless
 17 |     """
 18 |     print("loading embedding: %s from %s" % (embedding, embedding_path))
 19 |     if embedding == 'word2vec':
 20 |         # loading word2vec
 21 |         word2vec = Word2Vec.load_word2vec_format(embedding_path, binary=True)
 22 |         embedd_dim = word2vec.vector_size
 23 |         return word2vec, embedd_dim
 24 |     elif embedding == 'glove':
 25 |         # loading GloVe
 26 |         embedd_dim = -1
 27 |         embedd_dict = dict()
 28 |         with gzip.open(embedding_path, 'r') as file:
 29 |             for line in file:
 30 |                 line = line.strip()
 31 |                 line = line.decode('utf-8')
 32 |                 if len(line) == 0:
 33 |                     continue
 34 | 
 35 |                 tokens = line.split()
 36 |                 if embedd_dim < 0:
 37 |                     embedd_dim = len(tokens) - 1
 38 |                 else:
 39 |                     assert (embedd_dim + 1 == len(tokens))
 40 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 41 |                 embedd[:] = tokens[1:]
 42 |                 word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
 43 |                 embedd_dict[word] = embedd
 44 |         return embedd_dict, embedd_dim
 45 |     elif embedding == 'senna':
 46 |         # loading Senna
 47 |         embedd_dim = -1
 48 |         embedd_dict = dict()
 49 |         with gzip.open(embedding_path, 'r') as file:
 50 |             for line in file:
 51 |                 line = line.strip()
 52 |                 line = line.decode('utf-8')
 53 |                 if len(line) == 0:
 54 |                     continue
 55 | 
 56 |                 tokens = line.split()
 57 |                 if embedd_dim < 0:
 58 |                     embedd_dim = len(tokens) - 1
 59 |                 else:
 60 |                     assert (embedd_dim + 1 == len(tokens))
 61 |                 embedd = np.empty([1, embedd_dim], dtype=np.float32)
 62 |                 embedd[:] = tokens[1:]
 63 |                 word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
 64 |                 embedd_dict[word] = embedd
 65 |         return embedd_dict, embedd_dim
 66 |     elif embedding == 'sskip':
 67 |         embedd_dim = -1
 68 |         embedd_dict = dict()
 69 |         with gzip.open(embedding_path, 'r') as file:
 70 |             # skip the first line
 71 |             file.readline()
 72 |             for line in file:
 73 |                 line = line.strip()
 74 |                 try:
 75 |                     line = line.decode('utf-8')
 76 |                     if len(line) == 0:
 77 |                         continue
 78 | 
 79 |                     tokens = line.split()
 80 |                     if len(tokens) < embedd_dim:
 81 |                         continue
 82 | 
 83 |                     if embedd_dim < 0:
 84 |                         embedd_dim = len(tokens) - 1
 85 | 
 86 |                     embedd = np.empty([1, embedd_dim], dtype=np.float32)
 87 |                     start = len(tokens) - embedd_dim
 88 |                     word = ' '.join(tokens[0:start])
 89 |                     embedd[:] = tokens[start:]
 90 |                     word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word
 91 |                     embedd_dict[word] = embedd
 92 |                 except UnicodeDecodeError:
 93 |                     continue
 94 |         return embedd_dict, embedd_dim
 95 |     elif embedding == 'polyglot':
 96 |         words, embeddings = pickle.load(open(embedding_path, 'rb'))
 97 |         _, embedd_dim = embeddings.shape
 98 |         embedd_dict = dict()
 99 |         for i, word in enumerate(words):
100 |             embedd = np.empty([1, embedd_dim], dtype=np.float32)
101 |             embedd[:] = embeddings[i, :]
102 |             word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word
103 |             embedd_dict[word] = embedd
104 |         return embedd_dict, embedd_dim
105 | 
106 |     else:
107 |         raise ValueError("embedding should choose from [word2vec, senna, glove, sskip, polyglot]")
108 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | backports.shutil-get-terminal-size==1.0.0
 2 | boto==2.49.0
 3 | boto3==1.9.93
 4 | botocore==1.12.93
 5 | bz2file==0.98
 6 | certifi==2019.6.16
 7 | chardet==3.0.4
 8 | decorator==4.3.2
 9 | docutils==0.14
10 | enum34==1.1.6
11 | futures==3.2.0
12 | gensim==3.7.1
13 | idna==2.8
14 | ipdb==0.11
15 | ipython==5.8.0
16 | ipython-genutils==0.2.0
17 | jmespath==0.9.3
18 | mkl-fft==1.0.6
19 | mkl-random==1.0.1
20 | nltk==3.4.1
21 | numpy==1.16.1
22 | pathlib2==2.3.3
23 | pexpect==4.6.0
24 | pickleshare==0.7.5
25 | prompt-toolkit==1.0.15
26 | ptyprocess==0.6.0
27 | Pygments==2.3.1
28 | python-dateutil==2.8.0
29 | PyYAML==3.13
30 | requests==2.21.0
31 | s3transfer==0.2.0
32 | scandir==1.9.0
33 | scikit-learn==0.20.3
34 | scipy==1.2.1
35 | simplegeneric==0.8.1
36 | singledispatch==3.4.0.3
37 | six==1.12.0
38 | sklearn==0.0
39 | smart-open==1.8.0
40 | torch==0.3.1
41 | traitlets==4.3.2
42 | urllib3==1.24.1
43 | wcwidth==0.1.7
44 | 


--------------------------------------------------------------------------------
/rst_model.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from torch.autograd import Variable
 4 | 
 5 | from NeuralRST.in_out.util import load_embedding_dict, get_logger
 6 | from NeuralRST.in_out.preprocess import create_alphabet
 7 | from NeuralRST.in_out.preprocess import batch_data_variable
 8 | from NeuralRST.models.vocab import Vocab
 9 | from NeuralRST.models.metric import Metric
10 | from NeuralRST.models.config import Config
11 | from NeuralRST.models.architecture import MainArchitecture
12 | 
13 | 
class RSTModel(object):
    """Loads a trained RST parser network and exposes batching and inference helpers."""

    def __init__(self, rst_config_path):
        """
        Build the network described by the configuration and load its weights.

        :param rst_config_path: path passed to Config.load_config
        """
        print("................................................")
        print("LOADING RST Model")
        self.config = Config(None)
        self.config.load_config(rst_config_path)
        self.logger = get_logger("RSTParser RUN", self.config.use_dynamic_oracle, self.config.model_path)
        word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(
            None, self.config.alphabet_path, self.logger)
        self.vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha)
        self.network = MainArchitecture(self.vocab, self.config)
        # Deserialize onto the CPU first: without map_location a checkpoint
        # that was saved from CUDA tensors cannot be loaded on a machine
        # without a GPU. The network is moved to the GPU below when requested.
        state_dict = torch.load(self.config.model_name, map_location=lambda storage, loc: storage)
        self.network.load_state_dict(state_dict)
        if self.config.use_gpu:
            self.network = self.network.cuda()
        self.network.eval()

    def prepare_data(self, batch, batch_size):
        """
        Pack the first batch_size instances of batch into zero-padded tensors.

        :param batch: indexable collection of objects with an ``edus`` list; every
            EDU provides ``words``, ``tags``, ``etype`` and ``syntax_features``
        :param batch_size: number of instances to pack (first dimension of every tensor)
        :return: tuple (edu_words, edu_tags, edu_types, edu_mask, word_mask,
            len_edus, word_denominator, edu_syntax). len_edus is a numpy int64
            array with the number of EDUs per instance; word_denominator holds
            the word count of every EDU and -1 in padded EDU slots
        """
        config = self.config
        vocab = self.vocab

        # padding sizes: the largest EDU count and the largest EDU length in the batch
        max_edu_num = -1
        max_edu_len = -1
        for data in batch:
            max_edu_num = max(max_edu_num, len(data.edus))
            for edu in data.edus:
                max_edu_len = max(max_edu_len, len(edu.words))

        edu_words = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_types = Variable(torch.LongTensor(batch_size, max_edu_num).zero_(), requires_grad=False)
        edu_syntax = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len, config.syntax_dim).zero_(), requires_grad=False)
        word_mask = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_tags = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
        edu_mask = Variable(torch.Tensor(batch_size, max_edu_num).zero_(), requires_grad=False)
        word_denominator = Variable(torch.ones(batch_size, max_edu_num).type(torch.FloatTensor) * -1, requires_grad=False)
        len_edus = np.zeros([batch_size], dtype=np.int64)

        for idx in range(batch_size):
            edus = batch[idx].edus
            # set once per instance instead of once per EDU
            len_edus[idx] = len(edus)
            for idy, edu in enumerate(edus):
                edu_mask[idx, idy] = 1
                edu_types[idx, idy] = vocab.etype_alpha.word2id(edu.etype)
                edu_len = len(edu.words)
                word_denominator[idx, idy] = edu_len
                for idz in range(edu_len):
                    edu_words[idx, idy, idz] = vocab.word_alpha.word2id(edu.words[idz])
                    edu_tags[idx, idy, idz] = vocab.tag_alpha.word2id(edu.tags[idz])
                    edu_syntax[idx, idy, idz] = edu.syntax_features[idz].view(config.syntax_dim)
                    word_mask[idx, idy, idz] = 1

        if config.use_gpu:
            edu_words = edu_words.cuda()
            edu_tags = edu_tags.cuda()
            edu_types = edu_types.cuda()
            edu_mask = edu_mask.cuda()
            word_mask = word_mask.cuda()
            word_denominator = word_denominator.cuda()
            edu_syntax = edu_syntax.cuda()

        return edu_words, edu_tags, edu_types, edu_mask, word_mask, len_edus, word_denominator, edu_syntax

    def get_edu_representation(self, data_test):
        """
        Run the network's forward_all on a prepared batch.

        :param data_test: tuple in the layout returned by prepare_data
        :return: the output of self.network.forward_all
        """
        words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
        encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
        return encoder_output

    def get_subtree(self, data_test):
        """
        Encode a prepared batch and decode it with the network.

        :param data_test: tuple in the layout returned by prepare_data
        :return: the output of self.network.decode
        """
        words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
        self.network.training = False
        encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
        # NOTE(review): the two empty lists are presumably the gold inputs that
        # are not available at inference time - confirm against MainArchitecture.decode
        results = self.network.decode(encoder_output, [], [], len_edus)
        return results
88 | 
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------
/sentence.py:
--------------------------------------------------------------------------------
 1 | class Sentence(object):
 2 |     def __init__ (self, words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids):
 3 |         self.words = words
 4 |         self.seq_chars = seq_chars
 5 |         self.tags = tags
 6 |         self.word_ids = word_ids
 7 |         self.seq_char_ids = seq_char_ids
 8 |         self.tag_ids = tag_ids
 9 |         self.edu_ids = edu_ids
10 | 
11 |     def length(self):
12 |         return len(self.words)
13 | 
class Instance(object):
    """Regroups the tokens of a list of sentences into a flat list of EDU objects."""

    def __init__(self, sentences, syntax_features):
        """
        Walk over all tokens in order and cut a new EDU whenever a token's
        EDU id differs from the id that is currently expected.

        :param sentences: sequence of objects providing ``words``, ``tags`` and ``edu_ids``
        :param syntax_features: indexable by sentence position; each entry is
            sliced as ``entry[:, token_index, :]``
        """
        self.edus = []

        # ids are expected to start at 1 and the expectation advances by one per cut
        expected_edu_id = 1
        words, tags, feats = [], [], []
        for sent_idx, sentence in enumerate(sentences):
            sent_syntax = syntax_features[sent_idx]
            for tok_idx, word in enumerate(sentence.words):
                if sentence.edu_ids[tok_idx] != expected_edu_id:
                    # EDU boundary: flush what was collected so far, typed '<S>'
                    expected_edu_id += 1
                    self.edus.append(EDU(words, tags, '<S>', feats))
                    words, tags, feats = [], [], []
                words.append(word)
                tags.append(sentence.tags[tok_idx])
                feats.append(sent_syntax[:, tok_idx, :])

        # whatever is left after the last token becomes the closing EDU, typed '<P>'
        self.edus.append(EDU(words, tags, '<P>', feats))
36 | 
class EDU(object):
    """Record holding the words, tags, type label and syntax features of one EDU."""

    def __init__(self, words, tags, etype, syntax_features):
        """Store every argument unchanged under the attribute of the same name."""
        self.words, self.tags = words, tags
        self.etype = etype
        self.syntax_features = syntax_features
43 |             
44 | 


--------------------------------------------------------------------------------