"
6 | },
7 | "alpha2id": {
8 | "": 1,
9 | "
": 2,
10 | "UNK": 0
11 | },
12 | "alphas": [
13 | "",
14 | "
",
15 | "UNK"
16 | ]
17 | }
--------------------------------------------------------------------------------
/models/rst/alphabets/gold_action_alpha.json:
--------------------------------------------------------------------------------
1 | {
2 | "id2alpha": {
3 | "0": "REDUCE_NN_temp",
4 | "1": "REDUCE_NS_purp",
5 | "2": "REDUCE_NN_text",
6 | "3": "REDUCE_SN_elab",
7 | "4": "REDUCE_NS_summ",
8 | "5": "REDUCE_NS_attr",
9 | "6": "REDUCE_NN_eval",
10 | "7": "REDUCE_NS_evid",
11 | "8": "REDUCE_SN_comp",
12 | "9": "REDUCE_NS_topic",
13 | "10": "REDUCE_SN_purp",
14 | "11": "REDUCE_NS_comp",
15 | "12": "REDUCE_NS_elab",
16 | "13": "REDUCE_SN_eval",
17 | "14": "REDUCE_NN_cause",
18 | "15": "REDUCE_NS_eval",
19 | "16": "REDUCE_NS_back",
20 | "17": "REDUCE_SN_temp",
21 | "18": "REDUCE_NS_temp",
22 | "19": "REDUCE_SN_attr",
23 | "20": "REDUCE_SN_summ",
24 | "21": "REDUCE_NS_mann",
25 | "22": "REDUCE_NS_prob",
26 | "23": "REDUCE_NN_evid",
27 | "24": "REDUCE_NN_cont",
28 | "25": "REDUCE_SN_cond",
29 | "26": "REDUCE_SN_prob",
30 | "27": "REDUCE_NS_cause",
31 | "28": "REDUCE_NN_cond",
32 | "29": "REDUCE_SN_cont",
33 | "30": "REDUCE_SN_evid",
34 | "31": "POPROOT__",
35 | "32": "REDUCE_NN_topic",
36 | "33": "REDUCE_NN_same",
37 | "34": "REDUCE_NN_list",
38 | "35": "SHIFT__",
39 | "36": "REDUCE_NS_cont",
40 | "37": "REDUCE_NS_cond",
41 | "38": "REDUCE_SN_mann",
42 | "39": "REDUCE_SN_back",
43 | "40": "REDUCE_NN_prob",
44 | "41": "REDUCE_SN_cause",
45 | "42": "REDUCE_NN_comp",
46 | "43": "PAD"
47 | },
48 | "alpha2id": {
49 | "REDUCE_NN_temp": 0,
50 | "REDUCE_NS_purp": 1,
51 | "REDUCE_NN_text": 2,
52 | "REDUCE_SN_summ": 20,
53 | "REDUCE_SN_cont": 29,
54 | "REDUCE_NS_attr": 5,
55 | "REDUCE_NN_eval": 6,
56 | "PAD": 43,
57 | "REDUCE_NS_evid": 7,
58 | "REDUCE_SN_comp": 8,
59 | "REDUCE_SN_purp": 10,
60 | "REDUCE_NS_comp": 11,
61 | "REDUCE_NS_elab": 12,
62 | "REDUCE_SN_eval": 13,
63 | "REDUCE_NN_cause": 14,
64 | "REDUCE_NS_eval": 15,
65 | "REDUCE_NS_back": 16,
66 | "REDUCE_SN_temp": 17,
67 | "REDUCE_NS_temp": 18,
68 | "REDUCE_SN_attr": 19,
69 | "REDUCE_SN_elab": 3,
70 | "REDUCE_NS_mann": 21,
71 | "REDUCE_NS_prob": 22,
72 | "REDUCE_NN_same": 33,
73 | "REDUCE_NN_cont": 24,
74 | "REDUCE_SN_cond": 25,
75 | "REDUCE_SN_prob": 26,
76 | "REDUCE_NS_cause": 27,
77 | "REDUCE_NN_cond": 28,
78 | "REDUCE_NS_summ": 4,
79 | "REDUCE_SN_evid": 30,
80 | "POPROOT__": 31,
81 | "REDUCE_NN_topic": 32,
82 | "REDUCE_NN_evid": 23,
83 | "REDUCE_NN_list": 34,
84 | "SHIFT__": 35,
85 | "REDUCE_NS_cont": 36,
86 | "REDUCE_NS_topic": 9,
87 | "REDUCE_SN_mann": 38,
88 | "REDUCE_SN_cause": 41,
89 | "REDUCE_SN_back": 39,
90 | "REDUCE_NN_prob": 40,
91 | "REDUCE_NS_cond": 37,
92 | "REDUCE_NN_comp": 42
93 | },
94 | "alphas": [
95 | "REDUCE_NN_temp",
96 | "REDUCE_NS_purp",
97 | "REDUCE_NN_text",
98 | "REDUCE_SN_elab",
99 | "REDUCE_NS_summ",
100 | "REDUCE_NS_attr",
101 | "REDUCE_NN_eval",
102 | "REDUCE_NS_evid",
103 | "REDUCE_SN_comp",
104 | "REDUCE_NS_topic",
105 | "REDUCE_SN_purp",
106 | "REDUCE_NS_comp",
107 | "REDUCE_NS_elab",
108 | "REDUCE_SN_eval",
109 | "REDUCE_NN_cause",
110 | "REDUCE_NS_eval",
111 | "REDUCE_NS_back",
112 | "REDUCE_SN_temp",
113 | "REDUCE_NS_temp",
114 | "REDUCE_SN_attr",
115 | "REDUCE_SN_summ",
116 | "REDUCE_NS_mann",
117 | "REDUCE_NS_prob",
118 | "REDUCE_NN_evid",
119 | "REDUCE_NN_cont",
120 | "REDUCE_SN_cond",
121 | "REDUCE_SN_prob",
122 | "REDUCE_NS_cause",
123 | "REDUCE_NN_cond",
124 | "REDUCE_SN_cont",
125 | "REDUCE_SN_evid",
126 | "POPROOT__",
127 | "REDUCE_NN_topic",
128 | "REDUCE_NN_same",
129 | "REDUCE_NN_list",
130 | "SHIFT__",
131 | "REDUCE_NS_cont",
132 | "REDUCE_NS_cond",
133 | "REDUCE_SN_mann",
134 | "REDUCE_SN_back",
135 | "REDUCE_NN_prob",
136 | "REDUCE_SN_cause",
137 | "REDUCE_NN_comp",
138 | "PAD"
139 | ]
140 | }
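The action labels above follow a fixed pattern: SHIFT__ and POPROOT__ are parameterless transitions of the shift-reduce parser, while each REDUCE action packs a nuclearity code (NN, NS, or SN) and an abbreviated RST relation name into the label. A minimal sketch of splitting a label back into its parts (parse_action is a hypothetical helper, not part of this repo):

    def parse_action(label):
        # "REDUCE_NS_elab" -> ("REDUCE", "NS", "elab"); "SHIFT__" -> ("SHIFT", "", "")
        if '_' not in label:          # e.g. the padding symbol "PAD"
            return label, '', ''
        head, nuclearity, relation = label.split('_', 2)
        return head, nuclearity, relation

    assert parse_action("REDUCE_NS_elab") == ("REDUCE", "NS", "elab")
    assert parse_action("POPROOT__") == ("POPROOT", "", "")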
--------------------------------------------------------------------------------
/models/rst/alphabets/tag_alpha.json:
--------------------------------------------------------------------------------
1 | {
2 | "id2alpha": {
3 | "0": "UNK",
4 | "1": "",
5 | "2": "PRP$",
6 | "3": "VBG",
7 | "4": "VBD",
8 | "5": "``",
9 | "6": "VBN",
10 | "7": "POS",
11 | "8": "''",
12 | "9": "VBP",
13 | "10": "WDT",
14 | "11": "JJ",
15 | "12": "WP",
16 | "13": "VBZ",
17 | "14": "DT",
18 | "15": "#",
19 | "16": "RP",
20 | "17": "$",
21 | "18": "NN",
22 | "19": "FW",
23 | "20": ",",
24 | "21": ".",
25 | "22": "TO",
26 | "23": "PRP",
27 | "24": "RB",
28 | "25": "-LRB-",
29 | "26": ":",
30 | "27": "NNS",
31 | "28": "NNP",
32 | "29": "VB",
33 | "30": "WRB",
34 | "31": "CC",
35 | "32": "LS",
36 | "33": "PDT",
37 | "34": "RBS",
38 | "35": "RBR",
39 | "36": "CD",
40 | "37": "EX",
41 | "38": "IN",
42 | "39": "WP$",
43 | "40": "MD",
44 | "41": "NNPS",
45 | "42": "-RRB-",
46 | "43": "JJS",
47 | "44": "JJR",
48 | "45": "SYM",
49 | "46": "UH"
50 | },
51 | "alpha2id": {
52 | "": 1,
53 | "PRP$": 2,
54 | "VBG": 3,
55 | "VBD": 4,
56 | "VBN": 6,
57 | ",": 20,
58 | "''": 8,
59 | "VBP": 9,
60 | "WDT": 10,
61 | "JJ": 11,
62 | "WP": 12,
63 | "VBZ": 13,
64 | "DT": 14,
65 | "#": 15,
66 | "RP": 16,
67 | "$": 17,
68 | "NN": 18,
69 | "FW": 19,
70 | "POS": 7,
71 | ".": 21,
72 | "TO": 22,
73 | "PRP": 23,
74 | "RB": 24,
75 | "-LRB-": 25,
76 | ":": 26,
77 | "NNS": 27,
78 | "NNP": 28,
79 | "``": 5,
80 | "WRB": 30,
81 | "CC": 31,
82 | "LS": 32,
83 | "PDT": 33,
84 | "RBS": 34,
85 | "RBR": 35,
86 | "CD": 36,
87 | "EX": 37,
88 | "IN": 38,
89 | "WP$": 39,
90 | "UNK": 0,
91 | "MD": 40,
92 | "NNPS": 41,
93 | "-RRB-": 42,
94 | "JJS": 43,
95 | "JJR": 44,
96 | "SYM": 45,
97 | "VB": 29,
98 | "UH": 46
99 | },
100 | "alphas": [
101 | "",
102 | "PRP$",
103 | "VBG",
104 | "VBD",
105 | "``",
106 | "VBN",
107 | "POS",
108 | "''",
109 | "VBP",
110 | "WDT",
111 | "JJ",
112 | "WP",
113 | "VBZ",
114 | "DT",
115 | "#",
116 | "RP",
117 | "$",
118 | "NN",
119 | "FW",
120 | ",",
121 | ".",
122 | "TO",
123 | "PRP",
124 | "RB",
125 | "-LRB-",
126 | ":",
127 | "NNS",
128 | "NNP",
129 | "VB",
130 | "WRB",
131 | "CC",
132 | "LS",
133 | "PDT",
134 | "RBS",
135 | "RBR",
136 | "CD",
137 | "EX",
138 | "IN",
139 | "WP$",
140 | "MD",
141 | "NNPS",
142 | "-RRB-",
143 | "JJS",
144 | "JJR",
145 | "SYM",
146 | "UH",
147 | "UNK"
148 | ]
149 | }
--------------------------------------------------------------------------------
/models/rst/config.cfg:
--------------------------------------------------------------------------------
1 | use_gpu = True
2 | use_dynamic_oracle = True
3 | flag_oracle = True
4 | word_embedding = glove
5 | word_embedding_file = /home/ffajri/Data/NeuralRST/glove.6B.200d.txt.gz
6 | train_path = /home/ffajri/Data/NeuralRST/rst.train312
7 | test_path = /home/ffajri/Data/NeuralRST/rst.test38
8 | dev_path = /home/ffajri/Data/NeuralRST/rst.dev35
9 | train_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/train.conll.dump.results
10 | test_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/test.conll.dump.results
11 | dev_syn_feat_path = /home/ffajri/Data/NeuralRST/SyntaxBiaffine/dev.conll.dump.results
12 | model_path = /home/ffajri/Workspace/RSTExtractor/models/rst/
13 | model_name = /home/ffajri/Workspace/RSTExtractor/models/rst/network.pt
14 | alphabet_path = /home/ffajri/Workspace/RSTExtractor/models/rst/alphabets/
15 | max_iter = 1000
16 | word_dim = 200
17 | tag_dim = 200
18 | etype_dim = 100
19 | syntax_dim = 1200
20 | max_sent_size = 40
21 | max_edu_size = 400
22 | max_state_size = 1024
23 | hidden_size = 200
24 | freeze = True
25 | drop_prob = 0.5
26 | num_layers = 1
27 | batch_size = 4
28 | opt = adam
29 | lr = 0.001
30 | ada_eps = 1e-08
31 | momentum = 0.9
32 | beta1 = 0.9
33 | beta2 = 0.999
34 | gamma = 2e-06
35 | start_decay = 0
36 | clip = 10.0
37 | decay = 0
38 | oracle_prob = 0.66666
39 | start_dynamic_oracle = 15
40 | early_stopping = 50
41 |
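The file above is flat key = value pairs with no [section] header, so Python's stock configparser needs a dummy section prepended before it can read it. A minimal loading sketch (the repo's own config reader is not shown in this dump; the paths and values follow the file above):

    import configparser

    with open('models/rst/config.cfg') as f:
        parser = configparser.ConfigParser()
        parser.read_string('[run]\n' + f.read())   # inject a dummy section header

    cfg = parser['run']
    batch_size = cfg.getint('batch_size')          # 4
    lr = cfg.getfloat('lr')                        # 0.001
    use_gpu = cfg.getboolean('use_gpu')            # True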
--------------------------------------------------------------------------------
/models/rst/network.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fajri91/RSTExtractor/ebffc560095718150b22d9134784c49fd6629bb4/models/rst/network.pt
--------------------------------------------------------------------------------
/neuronlp2/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from . import io
4 | from . import nn
5 | from . import utils
6 | from . import nlinalg
7 | from . import models
8 |
9 | __version__ = "0.1.dev1"
--------------------------------------------------------------------------------
/neuronlp2/biaffine_model.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import math
3 | THREADS=7
4 | DATA_PATH = '/home/ffajri/Data/segmenter/*'
5 | files = glob.glob(DATA_PATH)
6 | size = int(math.ceil(1.0*len(files)/THREADS))
7 |
8 | allfiles = []
9 | for i in range(THREADS):
10 | start = i * size
11 | end = start + size
12 | if end > len(files):
13 | end = len(files)
14 | p = files[start:end]
15 | allfiles.append(p)
16 | if end == len(files):
17 | break
18 |
19 | for idx in range(len(allfiles)):
20 | f = open(str(idx)+'.list', 'w')
21 | for l in allfiles[idx]:
22 | f.write(l+'\n')
23 | f.close()
24 |
--------------------------------------------------------------------------------
/neuronlp2/io/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .alphabet import *
4 | from .instance import *
5 | from .logger import *
6 | from .writer import *
7 | from . import conllx_data
8 | from . import conllx_stacked_data
9 | from . import conll03_data
--------------------------------------------------------------------------------
/neuronlp2/io/alphabet.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | """
4 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects.
5 | """
6 | import json
7 | import os
8 | from .logger import get_logger
9 |
10 | class Alphabet(object):
11 | def __init__(self, name, default_value=False, keep_growing=True, singleton=False):
12 | self.__name = name
13 |
14 | self.instance2index = {}
15 | self.instances = []
16 | self.default_value = default_value
17 | self.offset = 1 if self.default_value else 0
18 | self.keep_growing = keep_growing
19 | self.singletons = set() if singleton else None
20 |
21 | # Index 0 is occupied by default, all else following.
22 | self.default_index = 0 if self.default_value else None
23 |
24 | self.next_index = self.offset
25 |
26 | self.logger = get_logger('Alphabet')
27 |
28 | def add(self, instance):
29 | if instance not in self.instance2index:
30 | self.instances.append(instance)
31 | self.instance2index[instance] = self.next_index
32 | self.next_index += 1
33 |
34 | def add_singleton(self, id):
35 | if self.singletons is None:
36 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
37 | else:
38 | self.singletons.add(id)
39 |
40 | def add_singletons(self, ids):
41 | if self.singletons is None:
42 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
43 | else:
44 | self.singletons.update(ids)
45 |
46 | def is_singleton(self, id):
47 | if self.singletons is None:
48 | raise RuntimeError('Alphabet %s does not have singleton.' % self.__name)
49 | else:
50 | return id in self.singletons
51 |
52 | def get_index(self, instance):
53 | try:
54 | return self.instance2index[instance]
55 | except KeyError:
56 | if self.keep_growing:
57 | index = self.next_index
58 | self.add(instance)
59 | return index
60 | else:
61 | if self.default_value:
62 | return self.default_index
63 | else:
64 | raise KeyError("instance not found: %s" % instance)
65 |
66 | def get_instance(self, index):
67 | if self.default_value and index == self.default_index:
68 | # First index is occupied by the wildcard element.
69 | return '<_UNK>'
70 | else:
71 | try:
72 | return self.instances[index - self.offset]
73 | except IndexError:
74 | raise IndexError('unknown index: %d' % index)
75 |
76 | def size(self):
77 | return len(self.instances) + self.offset
78 |
79 | def singleton_size(self):
80 | return len(self.singletons)
81 |
82 | def items(self):
83 | return self.instance2index.items()
84 |
85 | def enumerate_items(self, start):
86 | if start < self.offset or start >= self.size():
87 | raise IndexError("Enumerate is allowed between [%d : size of the alphabet)" % self.offset)
88 | return zip(range(start, len(self.instances) + self.offset), self.instances[start - self.offset:])
89 |
90 | def close(self):
91 | self.keep_growing = False
92 |
93 | def open(self):
94 | self.keep_growing = True
95 |
96 | def get_content(self):
97 | if self.singletons is None:
98 | return {'instance2index': self.instance2index, 'instances': self.instances}
99 | else:
100 | return {'instance2index': self.instance2index, 'instances': self.instances,
101 | 'singletions': list(self.singletons)}
102 |
103 | def __from_json(self, data):
104 | self.instances = data["instances"]
105 | self.instance2index = data["instance2index"]
106 | if 'singletions' in data:
107 | self.singletons = set(data['singletions'])
108 | else:
109 | self.singletons = None
110 |
111 | def save(self, output_directory, name=None):
112 | """
113 | Save the alphabet records to the given directory.
114 | :param output_directory: Directory to save the alphabet files.
115 | :param name: The alphabet saving name, optional.
116 | :return:
117 | """
118 | saving_name = name if name else self.__name
119 | try:
120 | if not os.path.exists(output_directory):
121 | os.makedirs(output_directory)
122 |
123 | json.dump(self.get_content(),
124 | open(os.path.join(output_directory, saving_name + ".json"), 'w'), indent=4)
125 | except Exception as e:
126 | self.logger.warning("Alphabet is not saved: %s" % repr(e))
127 |
128 | def load(self, input_directory, name=None):
129 | """
130 | Load the alphabet records from the given directory. This allows us to keep using previously saved
131 | alphabets even when the code structure changes.
132 | :param input_directory: Directory from which to load the alphabet files.
133 | :return:
134 | """
135 | loading_name = name if name else self.__name
136 | self.__from_json(json.load(open(os.path.join(input_directory, loading_name + ".json"))))
137 | self.next_index = len(self.instances) + self.offset
138 | self.keep_growing = False
139 |
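A minimal usage sketch of the Alphabet class above (tag values and the output path are illustrative only):

    tags = Alphabet('pos', default_value=True)   # index 0 is reserved for the unknown symbol
    nn_id = tags.get_index('NN')                 # 1: unseen items are added while keep_growing is True
    vb_id = tags.get_index('VB')                 # 2
    tags.close()                                 # freeze: unseen items now map to the default index
    assert tags.get_index('JJ') == 0
    assert tags.get_instance(nn_id) == 'NN'
    tags.save('alphabets/')                      # writes alphabets/pos.json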
--------------------------------------------------------------------------------
/neuronlp2/io/instance.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 |
4 | class Sentence(object):
5 | def __init__(self, words, word_ids, char_seqs, char_id_seqs):
6 | self.words = words
7 | self.word_ids = word_ids
8 | self.char_seqs = char_seqs
9 | self.char_id_seqs = char_id_seqs
10 |
11 | def length(self):
12 | return len(self.words)
13 |
14 |
15 | class DependencyInstance(object):
16 | def __init__(self, sentence, postags, pos_ids, heads=None, types=None, type_ids=None):
17 | self.sentence = sentence
18 | self.postags = postags
19 | self.pos_ids = pos_ids
20 | self.heads = heads
21 | self.types = types
22 | self.type_ids = type_ids
23 |
24 | def length(self):
25 | return self.sentence.length()
26 |
27 |
28 | class NERInstance(object):
29 | def __init__(self, sentence, postags, pos_ids, chunk_tags, chunk_ids, ner_tags, ner_ids):
30 | self.sentence = sentence
31 | self.postags = postags
32 | self.pos_ids = pos_ids
33 | self.chunk_tags = chunk_tags
34 | self.chunk_ids = chunk_ids
35 | self.ner_tags = ner_tags
36 | self.ner_ids = ner_ids
37 |
38 | def length(self):
39 | return self.sentence.length()
40 |
--------------------------------------------------------------------------------
/neuronlp2/io/logger.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import logging
4 | import sys
5 |
6 |
7 | def get_logger(name, level=logging.INFO, handler=sys.stdout,
8 | formatter='%(asctime)s - %(name)s - %(levelname)s - %(message)s'):
9 | logger = logging.getLogger(name)
10 | logger.setLevel(level)
11 | formatter = logging.Formatter(formatter)
12 | stream_handler = logging.StreamHandler(handler)
13 | stream_handler.setLevel(level)
14 | stream_handler.setFormatter(formatter)
15 | logger.addHandler(stream_handler)
16 |
17 | return logger
18 |
--------------------------------------------------------------------------------
/neuronlp2/io/reader.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .instance import DependencyInstance, NERInstance
4 | from .instance import Sentence
5 | from .conllx_data import ROOT, ROOT_POS, ROOT_CHAR, ROOT_TYPE, END, END_POS, END_CHAR, END_TYPE
6 | from . import utils
7 |
8 |
9 | class CoNLLXReader(object):
10 | def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
11 | self.__source_file = open(file_path, 'r')
12 | self.__word_alphabet = word_alphabet
13 | self.__char_alphabet = char_alphabet
14 | self.__pos_alphabet = pos_alphabet
15 | self.__type_alphabet = type_alphabet
16 |
17 | def close(self):
18 | self.__source_file.close()
19 |
20 | def getNext(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
21 | line = self.__source_file.readline()
22 | # skip multiple blank lines.
23 | while len(line) > 0 and len(line.strip()) == 0:
24 | line = self.__source_file.readline()
25 | if len(line) == 0:
26 | return None
27 |
28 | lines = []
29 | while len(line.strip()) > 0:
30 | line = line.strip()
31 | line = line.decode('utf-8')
32 | lines.append(line.split('\t'))
33 | line = self.__source_file.readline()
34 |
35 | length = len(lines)
36 | if length == 0:
37 | return None
38 |
39 | words = []
40 | word_ids = []
41 | char_seqs = []
42 | char_id_seqs = []
43 | postags = []
44 | pos_ids = []
45 | types = []
46 | type_ids = []
47 | heads = []
48 |
49 | if symbolic_root:
50 | words.append(ROOT)
51 | word_ids.append(self.__word_alphabet.get_index(ROOT))
52 | char_seqs.append([ROOT_CHAR, ])
53 | char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
54 | postags.append(ROOT_POS)
55 | pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
56 | types.append(ROOT_TYPE)
57 | type_ids.append(self.__type_alphabet.get_index(ROOT_TYPE))
58 | heads.append(0)
59 |
60 | for tokens in lines:
61 | chars = []
62 | char_ids = []
63 | for char in tokens[1]:
64 | chars.append(char)
65 | char_ids.append(self.__char_alphabet.get_index(char))
66 | if len(chars) > utils.MAX_CHAR_LENGTH:
67 | chars = chars[:utils.MAX_CHAR_LENGTH]
68 | char_ids = char_ids[:utils.MAX_CHAR_LENGTH]
69 | char_seqs.append(chars)
70 | char_id_seqs.append(char_ids)
71 |
72 | word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
73 | pos = tokens[4]
74 | head = int(tokens[6])
75 | type = tokens[7]
76 |
77 | words.append(word)
78 | word_ids.append(self.__word_alphabet.get_index(word))
79 |
80 | postags.append(pos)
81 | pos_ids.append(self.__pos_alphabet.get_index(pos))
82 |
83 | types.append(type)
84 | type_ids.append(self.__type_alphabet.get_index(type))
85 |
86 | heads.append(head)
87 |
88 | if symbolic_end:
89 | words.append(END)
90 | word_ids.append(self.__word_alphabet.get_index(END))
91 | char_seqs.append([END_CHAR, ])
92 | char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
93 | postags.append(END_POS)
94 | pos_ids.append(self.__pos_alphabet.get_index(END_POS))
95 | types.append(END_TYPE)
96 | type_ids.append(self.__type_alphabet.get_index(END_TYPE))
97 | heads.append(0)
98 |
99 | return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, heads, types, type_ids)
100 |
101 | def getNextForTest(self, normalize_digits=True, symbolic_root=False, symbolic_end=False):
102 | line = self.__source_file.readline()
103 | # skip multiple blank lines.
104 | while len(line) > 0 and len(line.strip()) == 0:
105 | line = self.__source_file.readline()
106 | if len(line) == 0:
107 | return None
108 |
109 | lines = []
110 | while len(line.strip()) > 0:
111 | line = line.strip()
112 | line = line.decode('utf-8')
113 | lines.append(line.split('\t'))
114 | line = self.__source_file.readline()
115 |
116 | length = len(lines)
117 | if length == 0:
118 | return None
119 |
120 | words = []
121 | word_ids = []
122 | char_seqs = []
123 | char_id_seqs = []
124 | postags = []
125 | pos_ids = []
126 |
127 | if symbolic_root:
128 | words.append(ROOT)
129 | word_ids.append(self.__word_alphabet.get_index(ROOT))
130 | char_seqs.append([ROOT_CHAR, ])
131 | char_id_seqs.append([self.__char_alphabet.get_index(ROOT_CHAR), ])
132 | postags.append(ROOT_POS)
133 | pos_ids.append(self.__pos_alphabet.get_index(ROOT_POS))
134 |
135 | for tokens in lines:
136 | chars = []
137 | char_ids = []
138 | for char in tokens[1]:
139 | chars.append(char)
140 | char_ids.append(self.__char_alphabet.get_index(char))
141 | if len(chars) > utils.MAX_CHAR_LENGTH:
142 | chars = chars[:utils.MAX_CHAR_LENGTH]
143 | char_ids = char_ids[:utils.MAX_CHAR_LENGTH]
144 | char_seqs.append(chars)
145 | char_id_seqs.append(char_ids)
146 |
147 | word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
148 | pos = tokens[4]
149 | if pos == '_':
150 | pos = tokens[3]
151 | if pos == '#':
152 | pos = '$'
153 | words.append(word)
154 | word_ids.append(self.__word_alphabet.get_index(word))
155 |
156 | postags.append(pos)
157 | pos_ids.append(self.__pos_alphabet.get_index(pos))
158 |
159 |
160 | if symbolic_end:
161 | words.append(END)
162 | word_ids.append(self.__word_alphabet.get_index(END))
163 | char_seqs.append([END_CHAR, ])
164 | char_id_seqs.append([self.__char_alphabet.get_index(END_CHAR), ])
165 | postags.append(END_POS)
166 | pos_ids.append(self.__pos_alphabet.get_index(END_POS))
167 |
168 | return DependencyInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids)
169 |
170 | class CoNLL03Reader(object):
171 | def __init__(self, file_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
172 | self.__source_file = open(file_path, 'r')
173 | self.__word_alphabet = word_alphabet
174 | self.__char_alphabet = char_alphabet
175 | self.__pos_alphabet = pos_alphabet
176 | self.__chunk_alphabet = chunk_alphabet
177 | self.__ner_alphabet = ner_alphabet
178 |
179 | def close(self):
180 | self.__source_file.close()
181 |
182 | def getNext(self, normalize_digits=True):
183 | line = self.__source_file.readline()
184 | # skip multiple blank lines.
185 | while len(line) > 0 and len(line.strip()) == 0:
186 | line = self.__source_file.readline()
187 | if len(line) == 0:
188 | return None
189 |
190 | lines = []
191 | while len(line.strip()) > 0:
192 | line = line.strip()
193 | line = line.decode('utf-8')
194 | lines.append(line.split(' '))
195 | line = self.__source_file.readline()
196 |
197 | length = len(lines)
198 | if length == 0:
199 | return None
200 |
201 | words = []
202 | word_ids = []
203 | char_seqs = []
204 | char_id_seqs = []
205 | postags = []
206 | pos_ids = []
207 | chunk_tags = []
208 | chunk_ids = []
209 | ner_tags = []
210 | ner_ids = []
211 |
212 | for tokens in lines:
213 | chars = []
214 | char_ids = []
215 | for char in tokens[1]:
216 | chars.append(char)
217 | char_ids.append(self.__char_alphabet.get_index(char))
218 | if len(chars) > utils.MAX_CHAR_LENGTH:
219 | chars = chars[:utils.MAX_CHAR_LENGTH]
220 | char_ids = char_ids[:utils.MAX_CHAR_LENGTH]
221 | char_seqs.append(chars)
222 | char_id_seqs.append(char_ids)
223 |
224 | word = utils.DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
225 | pos = tokens[2]
226 | chunk = tokens[3]
227 | ner = tokens[4]
228 |
229 | words.append(word)
230 | word_ids.append(self.__word_alphabet.get_index(word))
231 |
232 | postags.append(pos)
233 | pos_ids.append(self.__pos_alphabet.get_index(pos))
234 |
235 | chunk_tags.append(chunk)
236 | chunk_ids.append(self.__chunk_alphabet.get_index(chunk))
237 |
238 | ner_tags.append(ner)
239 | ner_ids.append(self.__ner_alphabet.get_index(ner))
240 |
241 | return NERInstance(Sentence(words, word_ids, char_seqs, char_id_seqs), postags, pos_ids, chunk_tags, chunk_ids,
242 | ner_tags, ner_ids)
243 |
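CoNLLXReader.getNext above assumes tab-separated CoNLL-X rows and reads only columns 1 (form), 4 (POS), 6 (head), and 7 (dependency type). A one-line sketch of the expected layout (the token values are made up):

    # ID  FORM      LEMMA     CPOS  POS  FEATS  HEAD  DEPREL
    line = "1\tEconomic\teconomic\tADJ\tJJ\t_\t2\tamod"
    tokens = line.split('\t')
    form, pos, head, deprel = tokens[1], tokens[4], int(tokens[6]), tokens[7]
    assert (form, pos, head, deprel) == ("Economic", "JJ", 2, "amod")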
--------------------------------------------------------------------------------
/neuronlp2/io/utils.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import re
4 | MAX_CHAR_LENGTH = 45
5 | NUM_CHAR_PAD = 2
6 |
7 | # Regular expressions used to normalize digits.
8 | DIGIT_RE = re.compile(br"\d")
9 |
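DIGIT_RE is a bytes pattern (note the br prefix), matching the Python 2 era readers in this package. Its effect, sketched:

    import re
    DIGIT_RE = re.compile(br"\d")
    assert DIGIT_RE.sub(b"0", b"born in 1984") == b"born in 0000"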
--------------------------------------------------------------------------------
/neuronlp2/io/writer.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 |
4 | class CoNLL03Writer(object):
5 | def __init__(self, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet):
6 | self.__source_file = None
7 | self.__word_alphabet = word_alphabet
8 | self.__char_alphabet = char_alphabet
9 | self.__pos_alphabet = pos_alphabet
10 | self.__chunk_alphabet = chunk_alphabet
11 | self.__ner_alphabet = ner_alphabet
12 |
13 | def start(self, file_path):
14 | self.__source_file = open(file_path, 'w')
15 |
16 | def close(self):
17 | self.__source_file.close()
18 |
19 | def write(self, word, pos, chunk, predictions, targets, lengths):
20 | batch_size, _ = word.shape
21 | for i in range(batch_size):
22 | for j in range(lengths[i]):
23 | w = self.__word_alphabet.get_instance(word[i, j]).encode('utf-8')
24 | p = self.__pos_alphabet.get_instance(pos[i, j]).encode('utf-8')
25 | ch = self.__chunk_alphabet.get_instance(chunk[i, j]).encode('utf-8')
26 | tgt = self.__ner_alphabet.get_instance(targets[i, j]).encode('utf-8')
27 | pred = self.__ner_alphabet.get_instance(predictions[i, j]).encode('utf-8')
28 | self.__source_file.write('%d %s %s %s %s %s\n' % (j + 1, w, p, ch, tgt, pred))
29 | self.__source_file.write('\n')
30 |
31 |
32 | class CoNLLXWriter(object):
33 | def __init__(self, word_alphabet, char_alphabet, pos_alphabet, type_alphabet):
34 | self.__source_file = None
35 | self.__word_alphabet = word_alphabet
36 | self.__char_alphabet = char_alphabet
37 | self.__pos_alphabet = pos_alphabet
38 | self.__type_alphabet = type_alphabet
39 |
40 | def start(self, file_path):
41 | self.__source_file = open(file_path, 'w')
42 |
43 | def close(self):
44 | self.__source_file.close()
45 |
46 | def write(self, word, pos, head, type, lengths, symbolic_root=False, symbolic_end=False):
47 | batch_size, _ = word.shape
48 | start = 1 if symbolic_root else 0
49 | end = 1 if symbolic_end else 0
50 | for i in range(batch_size):
51 | for j in range(start, lengths[i] - end):
52 | w = self.__word_alphabet.get_instance(word[i, j]).encode('utf-8')
53 | p = self.__pos_alphabet.get_instance(pos[i, j]).encode('utf-8')
54 | t = self.__type_alphabet.get_instance(type[i, j]).encode('utf-8')
55 | h = head[i, j]
56 | self.__source_file.write('%d\t%s\t_\t_\t%s\t_\t%d\t%s\n' % (j, w, p, h, t))
57 | self.__source_file.write('\n')
58 |
--------------------------------------------------------------------------------
/neuronlp2/models/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .sequence_labeling import *
4 | from .parsing import *
5 |
6 |
--------------------------------------------------------------------------------
/neuronlp2/nlinalg/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .nlinalg import *
4 |
--------------------------------------------------------------------------------
/neuronlp2/nlinalg/nlinalg.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import numpy
4 | import torch
5 | from torch.autograd.function import Function
6 |
7 |
8 | def logdet(x):
9 | """
10 |
11 | Args:
12 | x: 2D positive semidefinite matrix.
13 |
14 | Returns: log determinant of x
15 |
16 | """
17 | # TODO for pytorch 2.0.4, use inside potrf for variable.
18 | print(torch.log(torch.eig(x.data)[0]))
19 | print(x)
20 | u_chol = x.potrf()
21 | return torch.sum(torch.log(u_chol.diag())) * 2
22 |
23 |
24 | def logsumexp(x, dim=None):
25 | """
26 |
27 | Args:
28 | x: A pytorch tensor (any dimension will do)
29 | dim: int or None, over which to perform the summation. `None`, the
30 | default, performs over all axes.
31 |
32 | Returns: The result of the log(sum(exp(...))) operation.
33 |
34 | """
35 | if dim is None:
36 | xmax = x.max()
37 | xmax_ = x.max()
38 | return xmax_ + torch.log(torch.exp(x - xmax).sum())
39 | else:
40 | xmax, _ = x.max(dim, keepdim=True)
41 | xmax_, _ = x.max(dim)
42 | return xmax_ + torch.log(torch.exp(x - xmax).sum(dim))
43 |
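logsumexp above relies on the identity log(sum(exp(x))) = m + log(sum(exp(x - m))) with m = max(x), which keeps exp from overflowing; the max is computed twice because the keepdim=True copy is needed to broadcast the subtraction while the plain copy has the right output shape. A small numeric sketch (written against a recent torch API; the repo itself targets an older Variable-era PyTorch):

    import torch

    x = torch.tensor([1000.0, 1000.1])             # naive torch.exp(x) overflows to inf
    stable = x.max() + torch.log(torch.exp(x - x.max()).sum())
    # stable is about 1000.744, i.e. log(e**1000 + e**1000.1)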
--------------------------------------------------------------------------------
/neuronlp2/nn/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .modules import *
4 | from . import init
5 |
--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from . import masked_rnn
4 | from . import variational_rnn
5 | from . import skipconnect_rnn
6 |
--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/masked_rnn.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import torch
4 | from torch.nn import functional as F
5 |
6 |
7 | def MaskedRecurrent(reverse=False):
8 | def forward(input, hidden, cell, mask):
9 | output = []
10 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
11 | for i in steps:
12 | if mask is None or mask[i].data.min() > 0.5:
13 | hidden = cell(input[i], hidden)
14 | elif mask[i].data.max() > 0.5:
15 | hidden_next = cell(input[i], hidden)
16 | # hack to handle LSTM
17 | if isinstance(hidden, tuple):
18 | hx, cx = hidden
19 | hp1, cp1 = hidden_next
20 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
21 | else:
22 | hidden = hidden + (hidden_next - hidden) * mask[i]
23 | # hack to handle LSTM
24 | output.append(hidden[0] if isinstance(hidden, tuple) else hidden)
25 |
26 | if reverse:
27 | output.reverse()
28 | output = torch.cat(output, 0).view(input.size(0), *output[0].size())
29 |
30 | return hidden, output
31 |
32 | return forward
33 |
34 |
35 | def StackedRNN(inners, num_layers, lstm=False, dropout=0, train=True):
36 | num_directions = len(inners)
37 | total_layers = num_layers * num_directions
38 |
39 | def forward(input, hidden, cells, mask):
40 | assert (len(cells) == total_layers)
41 | next_hidden = []
42 |
43 | if lstm:
44 | hidden = list(zip(*hidden))
45 |
46 | for i in range(num_layers):
47 | all_output = []
48 | for j, inner in enumerate(inners):
49 | l = i * num_directions + j
50 | hy, output = inner(input, hidden[l], cells[l], mask)
51 | next_hidden.append(hy)
52 | all_output.append(output)
53 |
54 | input = torch.cat(all_output, input.dim() - 1)
55 |
56 | if dropout != 0 and i < num_layers - 1:
57 | input = F.dropout(input, p=dropout, training=train, inplace=False)
58 |
59 | if lstm:
60 | next_h, next_c = zip(*next_hidden)
61 | next_hidden = (
62 | torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
63 | torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
64 | )
65 | else:
66 | next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
67 |
68 | return next_hidden, input
69 |
70 | return forward
71 |
72 |
73 | def AutogradMaskedRNN(num_layers=1, batch_first=False, dropout=0, train=True, bidirectional=False, lstm=False):
74 | rec_factory = MaskedRecurrent
75 |
76 | if bidirectional:
77 | layer = (rec_factory(), rec_factory(reverse=True))
78 | else:
79 | layer = (rec_factory(),)
80 |
81 | func = StackedRNN(layer,
82 | num_layers,
83 | lstm=lstm,
84 | dropout=dropout,
85 | train=train)
86 |
87 | def forward(input, cells, hidden, mask):
88 | if batch_first:
89 | input = input.transpose(0, 1)
90 | if mask is not None:
91 | mask = mask.transpose(0, 1)
92 |
93 | nexth, output = func(input, hidden, cells, mask)
94 |
95 | if batch_first:
96 | output = output.transpose(0, 1)
97 |
98 | return output, nexth
99 |
100 | return forward
101 |
102 |
103 | def MaskedStep():
104 | def forward(input, hidden, cell, mask):
105 | if mask is None or mask.data.min() > 0.5:
106 | hidden = cell(input, hidden)
107 | elif mask.data.max() > 0.5:
108 | hidden_next = cell(input, hidden)
109 | # hack to handle LSTM
110 | if isinstance(hidden, tuple):
111 | hx, cx = hidden
112 | hp1, cp1 = hidden_next
113 | hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
114 | else:
115 | hidden = hidden + (hidden_next - hidden) * mask
116 | # hack to handle LSTM
117 | output = hidden[0] if isinstance(hidden, tuple) else hidden
118 |
119 | return hidden, output
120 |
121 | return forward
122 |
123 |
124 | def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
125 | def forward(input, hidden, cells, mask):
126 | assert (len(cells) == num_layers)
127 | next_hidden = []
128 |
129 | if lstm:
130 | hidden = list(zip(*hidden))
131 |
132 | for l in range(num_layers):
133 | hy, output = layer(input, hidden[l], cells[l], mask)
134 | next_hidden.append(hy)
135 | input = output
136 |
137 | if dropout != 0 and l < num_layers - 1:
138 | input = F.dropout(input, p=dropout, training=train, inplace=False)
139 |
140 | if lstm:
141 | next_h, next_c = zip(*next_hidden)
142 | next_hidden = (
143 | torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
144 | torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
145 | )
146 | else:
147 | next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())
148 |
149 | return next_hidden, input
150 |
151 | return forward
152 |
153 |
154 | def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False):
155 | layer = MaskedStep()
156 |
157 | func = StackedStep(layer,
158 | num_layers,
159 | lstm=lstm,
160 | dropout=dropout,
161 | train=train)
162 |
163 | def forward(input, cells, hidden, mask):
164 | nexth, output = func(input, hidden, cells, mask)
165 | return output, nexth
166 |
167 | return forward
168 |
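The update hidden + (hidden_next - hidden) * mask used throughout this file is a per-example select: where mask is 1 the new state is taken, where mask is 0 the previous state is carried through padding untouched. A small numeric sketch (plain tensors, recent torch API, for illustration only):

    import torch

    h_old = torch.tensor([[1.0, 1.0], [2.0, 2.0]])   # batch of 2, hidden size 2
    h_new = torch.tensor([[9.0, 9.0], [8.0, 8.0]])
    mask = torch.tensor([[1.0], [0.0]])              # example 0 is real, example 1 is padding
    h = h_old + (h_new - h_old) * mask
    # h == [[9., 9.], [2., 2.]]: updated for example 0, unchanged for the padded example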
--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/skipconnect_rnn.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import torch
4 | from torch.autograd import Variable
5 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
6 | from torch.nn import functional as F
7 |
8 |
9 | def SkipConnectRNNReLUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None, noise_skip=None):
10 | if noise_in is not None:
11 | input = input * noise_in
12 |
13 | hidden = torch.cat([hidden, hidden_skip], dim=1)
14 | if noise_hidden is not None:
15 | hidden = hidden * noise_hidden
16 |
17 | hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
18 | return hy
19 |
20 |
21 | def SkipConnectRNNTanhCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
22 | if noise_in is not None:
23 | input = input * noise_in
24 |
25 | hidden = torch.cat([hidden, hidden_skip], dim=1)
26 | if noise_hidden is not None:
27 | hidden = hidden * noise_hidden
28 |
29 | hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
30 | return hy
31 |
32 |
33 | def SkipConnectLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
34 | input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
35 |
36 | hx, cx = hidden
37 | hx = torch.cat([hx, hidden_skip], dim=1)
38 | hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
39 |
40 | gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
41 |
42 | ingate, forgetgate, cellgate, outgate = gates
43 |
44 | ingate = F.sigmoid(ingate)
45 | forgetgate = F.sigmoid(forgetgate)
46 | cellgate = F.tanh(cellgate)
47 | outgate = F.sigmoid(outgate)
48 |
49 | cy = (forgetgate * cx) + (ingate * cellgate)
50 | hy = outgate * F.tanh(cy)
51 |
52 | return hy, cy
53 |
54 |
55 | def SkipConnectFastLSTMCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
56 | if noise_in is not None:
57 | input = input * noise_in
58 |
59 | hx, cx = hidden
60 | hx = torch.cat([hx, hidden_skip], dim=1)
61 | if noise_hidden is not None:
62 | hx = hx * noise_hidden
63 |
64 | if input.is_cuda:
65 | igates = F.linear(input, w_ih)
66 | hgates = F.linear(hx, w_hh)
67 | state = fusedBackend.LSTMFused.apply
68 | return state(igates, hgates, cx) if b_ih is None else state(igates, hgates, cx, b_ih, b_hh)
69 |
70 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
71 |
72 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
73 |
74 | ingate = F.sigmoid(ingate)
75 | forgetgate = F.sigmoid(forgetgate)
76 | cellgate = F.tanh(cellgate)
77 | outgate = F.sigmoid(outgate)
78 |
79 | cy = (forgetgate * cx) + (ingate * cellgate)
80 | hy = outgate * F.tanh(cy)
81 |
82 | return hy, cy
83 |
84 |
85 | def SkipConnectGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
86 | input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
87 | hx = torch.cat([hidden, hidden_skip], dim=1)
88 | hx = hx.expand(3, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
89 |
90 | gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
91 | gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
92 | i_r, i_i, i_n = gi
93 | h_r, h_i, h_n = gh
94 |
95 | resetgate = F.sigmoid(i_r + h_r)
96 | inputgate = F.sigmoid(i_i + h_i)
97 | newgate = F.tanh(i_n + resetgate * h_n)
98 | hy = newgate + inputgate * (hidden - newgate)
99 |
100 | return hy
101 |
102 |
103 | def SkipConnectFastGRUCell(input, hidden, hidden_skip, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
104 | if noise_in is not None:
105 | input = input * noise_in
106 |
107 | hx = torch.cat([hidden, hidden_skip], dim=1)
108 | if noise_hidden is not None:
109 | hx = hx * noise_hidden
110 |
111 | if input.is_cuda:
112 | gi = F.linear(input, w_ih)
113 | gh = F.linear(hx, w_hh)
114 | state = fusedBackend.GRUFused.apply
115 | return state(gi, gh, hidden) if b_ih is None else state(gi, gh, hidden, b_ih, b_hh)
116 |
117 | gi = F.linear(input, w_ih, b_ih)
118 | gh = F.linear(hx, w_hh, b_hh)
119 | i_r, i_i, i_n = gi.chunk(3, 1)
120 | h_r, h_i, h_n = gh.chunk(3, 1)
121 |
122 | resetgate = F.sigmoid(i_r + h_r)
123 | inputgate = F.sigmoid(i_i + h_i)
124 | newgate = F.tanh(i_n + resetgate * h_n)
125 | hy = newgate + inputgate * (hidden - newgate)
126 |
127 | return hy
128 |
129 |
130 | def SkipConnectRecurrent(reverse=False):
131 | def forward(input, skip_connect, hidden, cell, mask):
132 | # hack to handle LSTM
133 | h0 = hidden[0] if isinstance(hidden, tuple) else hidden
134 | # [length + 1, batch, hidden_size]
135 | output = Variable(input.data.new(input.size(0) + 1, *h0.size()).zero_()) + h0
136 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
137 | # create batch index
138 | batch_index = torch.arange(0, h0.size(0)).type_as(skip_connect)
139 | for i in steps:
140 | if mask is None or mask[i].data.min() > 0.5:
141 | hidden_skip = output[skip_connect[i], batch_index]
142 | hidden = cell(input[i], hidden, hidden_skip)
143 | elif mask[i].data.max() > 0.5:
144 | hidden_skip = output[skip_connect[i], batch_index]
145 | hidden_next = cell(input[i], hidden, hidden_skip)
146 | # hack to handle LSTM
147 | if isinstance(hidden, tuple):
148 | hx, cx = hidden
149 | hp1, cp1 = hidden_next
150 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
151 | else:
152 | hidden = hidden + (hidden_next - hidden) * mask[i]
153 | # hack to handle LSTM
154 | if reverse:
155 | output[i] = hidden[0] if isinstance(hidden, tuple) else hidden
156 | else:
157 | output[i + 1] = hidden[0] if isinstance(hidden, tuple) else hidden
158 |
159 | if reverse:
160 | # remove last position
161 | output = output[:-1]
162 | else:
163 | # remove position 0
164 | output = output[1:]
165 |
166 | return hidden, output
167 |
168 | return forward
169 |
170 |
171 | def StackedRNN(inners, num_layers, lstm=False):
172 | num_directions = len(inners)
173 | total_layers = num_layers * num_directions
174 |
175 | def reverse_skip_connection(skip_connect):
176 | # TODO reverse skip connection for bidirectional rnn.
177 | return skip_connect
178 |
179 | def forward(input, skip_connect, hidden, cells, mask):
180 | assert (len(cells) == total_layers)
181 | next_hidden = []
182 |
183 | skip_connect_forward = skip_connect
184 | skip_connect_backward = reverse_skip_connection(skip_connect) if num_directions == 2 else None
185 |
186 | if lstm:
187 | hidden = list(zip(*hidden))
188 |
189 | for i in range(num_layers):
190 | all_output = []
191 | for j, inner in enumerate(inners):
192 | l = i * num_directions + j
193 | skip_connect = skip_connect_forward if j == 0 else skip_connect_backward
194 | hy, output = inner(input, skip_connect, hidden[l], cells[l], mask)
195 | next_hidden.append(hy)
196 | all_output.append(output)
197 |
198 | input = torch.cat(all_output, input.dim() - 1)
199 |
200 | if lstm:
201 | next_h, next_c = zip(*next_hidden)
202 | next_hidden = (
203 | torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
204 | torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
205 | )
206 | else:
207 | next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
208 |
209 | return next_hidden, input
210 |
211 | return forward
212 |
213 |
214 | def AutogradSkipConnectRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
215 | rec_factory = SkipConnectRecurrent
216 |
217 | if bidirectional:
218 | layer = (rec_factory(), rec_factory(reverse=True))
219 | else:
220 | layer = (rec_factory(),)
221 |
222 | func = StackedRNN(layer,
223 | num_layers,
224 | lstm=lstm)
225 |
226 | def forward(input, skip_connect, cells, hidden, mask):
227 | if batch_first:
228 | input = input.transpose(0, 1)
229 | skip_connect = skip_connect.transpose(0, 1)
230 | if mask is not None:
231 | mask = mask.transpose(0, 1)
232 |
233 | nexth, output = func(input, skip_connect, hidden, cells, mask)
234 |
235 | if batch_first:
236 | output = output.transpose(0, 1)
237 |
238 | return output, nexth
239 |
240 | return forward
241 |
242 |
243 | def SkipConnectStep():
244 | def forward(input, hidden, hidden_skip, cell, mask):
245 | if mask is None or mask.data.min() > 0.5:
246 | hidden = cell(input, hidden, hidden_skip)
247 | elif mask.data.max() > 0.5:
248 | hidden_next = cell(input, hidden, hidden_skip)
249 | # hack to handle LSTM
250 | if isinstance(hidden, tuple):
251 | hx, cx = hidden
252 | hp1, cp1 = hidden_next
253 | hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
254 | else:
255 | hidden = hidden + (hidden_next - hidden) * mask
256 | # hack to handle LSTM
257 | output = hidden[0] if isinstance(hidden, tuple) else hidden
258 |
259 | return hidden, output
260 |
261 | return forward
262 |
263 |
264 | def StackedStep(layer, num_layers, lstm=False):
265 | def forward(input, hidden, hidden_skip, cells, mask):
266 | assert (len(cells) == num_layers)
267 | next_hidden = []
268 |
269 | if lstm:
270 | hidden = list(zip(*hidden))
271 |
272 | for l in range(num_layers):
273 | hy, output = layer(input, hidden[l], hidden_skip[l], cells[l], mask)
274 | next_hidden.append(hy)
275 | input = output
276 |
277 | if lstm:
278 | next_h, next_c = zip(*next_hidden)
279 | next_hidden = (
280 | torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
281 | torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
282 | )
283 | else:
284 | next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())
285 |
286 | return next_hidden, input
287 |
288 | return forward
289 |
290 |
291 | def AutogradSkipConnectStep(num_layers=1, lstm=False):
292 | layer = SkipConnectStep()
293 |
294 | func = StackedStep(layer,
295 | num_layers,
296 | lstm=lstm)
297 |
298 | def forward(input, cells, hidden, hidden_skip, mask):
299 | nexth, output = func(input, hidden, hidden_skip, cells, mask)
300 | return output, nexth
301 |
302 | return forward
303 |
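The core of SkipConnectRecurrent above is the fancy-indexing gather output[skip_connect[i], batch_index], which pulls, for each batch element, a hidden state from an earlier (not necessarily adjacent) time step; in the forward direction row 0 of output holds the initial state because the buffer is shifted by one. A small indexing sketch (illustrative values only):

    import torch

    output = torch.arange(12.0).view(3, 2, 2)   # [length + 1 = 3, batch = 2, hidden = 2]
    skip_t = torch.tensor([2, 0])               # per-example source row (0 = initial state)
    batch_index = torch.arange(2)
    hidden_skip = output[skip_t, batch_index]   # shape [2, 2]: output[2, 0] and output[0, 1]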
--------------------------------------------------------------------------------
/neuronlp2/nn/_functions/variational_rnn.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import torch
4 | from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
5 | from torch.nn import functional as F
6 |
7 |
8 | def VarRNNReLUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
9 | if noise_in is not None:
10 | input = input * noise_in
11 | if noise_hidden is not None:
12 | hidden = hidden * noise_hidden
13 | hy = F.relu(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
14 | return hy
15 |
16 |
17 | def VarRNNTanhCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
18 | if noise_in is not None:
19 | input = input * noise_in
20 | if noise_hidden is not None:
21 | hidden = hidden * noise_hidden
22 | hy = F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))
23 | return hy
24 |
25 |
26 | def VarLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
27 | input = input.expand(4, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
28 |
29 | hx, cx = hidden
30 | hx = hx.expand(4, *hx.size()) if noise_hidden is None else hx.unsqueeze(0) * noise_hidden
31 |
32 | gates = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih) + torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
33 |
34 | ingate, forgetgate, cellgate, outgate = gates
35 |
36 | ingate = F.sigmoid(ingate)
37 | forgetgate = F.sigmoid(forgetgate)
38 | cellgate = F.tanh(cellgate)
39 | outgate = F.sigmoid(outgate)
40 |
41 | cy = (forgetgate * cx) + (ingate * cellgate)
42 | hy = outgate * F.tanh(cy)
43 |
44 | return hy, cy
45 |
46 |
47 | def VarFastLSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
48 | if noise_in is not None:
49 | input = input * noise_in
50 |
51 | if input.is_cuda:
52 | igates = F.linear(input, w_ih)
53 | hgates = F.linear(hidden[0], w_hh) if noise_hidden is None else F.linear(hidden[0] * noise_hidden, w_hh)
54 | state = fusedBackend.LSTMFused.apply
55 | return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)
56 |
57 | hx, cx = hidden
58 | if noise_hidden is not None:
59 | hx = hx * noise_hidden
60 | gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)
61 |
62 | ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
63 |
64 | ingate = F.sigmoid(ingate)
65 | forgetgate = F.sigmoid(forgetgate)
66 | cellgate = F.tanh(cellgate)
67 | outgate = F.sigmoid(outgate)
68 |
69 | cy = (forgetgate * cx) + (ingate * cellgate)
70 | hy = outgate * F.tanh(cy)
71 |
72 | return hy, cy
73 |
74 |
75 | def VarGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
76 | input = input.expand(3, *input.size()) if noise_in is None else input.unsqueeze(0) * noise_in
77 | hx = hidden.expand(3, *hidden.size()) if noise_hidden is None else hidden.unsqueeze(0) * noise_hidden
78 |
79 | gi = torch.baddbmm(b_ih.unsqueeze(1), input, w_ih)
80 | gh = torch.baddbmm(b_hh.unsqueeze(1), hx, w_hh)
81 | i_r, i_i, i_n = gi
82 | h_r, h_i, h_n = gh
83 |
84 | resetgate = F.sigmoid(i_r + h_r)
85 | inputgate = F.sigmoid(i_i + h_i)
86 | newgate = F.tanh(i_n + resetgate * h_n)
87 | hy = newgate + inputgate * (hidden - newgate)
88 |
89 | return hy
90 |
91 |
92 | def VarFastGRUCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
93 | if noise_in is not None:
94 | input = input * noise_in
95 |
96 | hx = hidden if noise_hidden is None else hidden * noise_hidden
97 | if input.is_cuda:
98 | gi = F.linear(input, w_ih)
99 | gh = F.linear(hx, w_hh)
100 | state = fusedBackend.GRUFused.apply
101 | return state(gi, gh, hidden) if b_ih is None else state(gi, gh, hidden, b_ih, b_hh)
102 |
103 | gi = F.linear(input, w_ih, b_ih)
104 | gh = F.linear(hx, w_hh, b_hh)
105 | i_r, i_i, i_n = gi.chunk(3, 1)
106 | h_r, h_i, h_n = gh.chunk(3, 1)
107 |
108 | resetgate = F.sigmoid(i_r + h_r)
109 | inputgate = F.sigmoid(i_i + h_i)
110 | newgate = F.tanh(i_n + resetgate * h_n)
111 | hy = newgate + inputgate * (hidden - newgate)
112 |
113 | return hy
114 |
115 |
116 | def VarMaskedRecurrent(reverse=False):
117 | def forward(input, hidden, cell, mask):
118 | output = []
119 | steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
120 | for i in steps:
121 | if mask is None or mask[i].data.min() > 0.5:
122 | hidden = cell(input[i], hidden)
123 | elif mask[i].data.max() > 0.5:
124 | hidden_next = cell(input[i], hidden)
125 | # hack to handle LSTM
126 | if isinstance(hidden, tuple):
127 | hx, cx = hidden
128 | hp1, cp1 = hidden_next
129 | hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
130 | else:
131 | hidden = hidden + (hidden_next - hidden) * mask[i]
132 | # hack to handle LSTM
133 | output.append(hidden[0] if isinstance(hidden, tuple) else hidden)
134 |
135 | if reverse:
136 | output.reverse()
137 | output = torch.cat(output, 0).view(input.size(0), *output[0].size())
138 |
139 | return hidden, output
140 |
141 | return forward
142 |
143 |
144 | def StackedRNN(inners, num_layers, lstm=False):
145 | num_directions = len(inners)
146 | total_layers = num_layers * num_directions
147 |
148 | def forward(input, hidden, cells, mask):
149 | assert (len(cells) == total_layers)
150 | next_hidden = []
151 |
152 | if lstm:
153 | hidden = list(zip(*hidden))
154 |
155 | for i in range(num_layers):
156 | all_output = []
157 | for j, inner in enumerate(inners):
158 | l = i * num_directions + j
159 | hy, output = inner(input, hidden[l], cells[l], mask)
160 | next_hidden.append(hy)
161 | all_output.append(output)
162 |
163 | input = torch.cat(all_output, input.dim() - 1)
164 |
165 | if lstm:
166 | next_h, next_c = zip(*next_hidden)
167 | next_hidden = (
168 | torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
169 | torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
170 | )
171 | else:
172 | next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())
173 |
174 | return next_hidden, input
175 |
176 | return forward
177 |
178 |
179 | def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
180 | rec_factory = VarMaskedRecurrent
181 |
182 | if bidirectional:
183 | layer = (rec_factory(), rec_factory(reverse=True))
184 | else:
185 | layer = (rec_factory(),)
186 |
187 | func = StackedRNN(layer,
188 | num_layers,
189 | lstm=lstm)
190 |
191 | def forward(input, cells, hidden, mask):
192 | if batch_first:
193 | input = input.transpose(0, 1)
194 | if mask is not None:
195 | mask = mask.transpose(0, 1)
196 |
197 | nexth, output = func(input, hidden, cells, mask)
198 |
199 | if batch_first:
200 | output = output.transpose(0, 1)
201 |
202 | return output, nexth
203 |
204 | return forward
205 |
206 |
207 | def VarMaskedStep():
208 | def forward(input, hidden, cell, mask):
209 | if mask is None or mask.data.min() > 0.5:
210 | hidden = cell(input, hidden)
211 | elif mask.data.max() > 0.5:
212 | hidden_next = cell(input, hidden)
213 | # hack to handle LSTM
214 | if isinstance(hidden, tuple):
215 | hx, cx = hidden
216 | hp1, cp1 = hidden_next
217 | hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
218 | else:
219 | hidden = hidden + (hidden_next - hidden) * mask
220 | # hack to handle LSTM
221 | output = hidden[0] if isinstance(hidden, tuple) else hidden
222 |
223 | return hidden, output
224 |
225 | return forward
226 |
227 |
228 | def StackedStep(layer, num_layers, lstm=False):
229 | def forward(input, hidden, cells, mask):
230 | assert (len(cells) == num_layers)
231 | next_hidden = []
232 |
233 | if lstm:
234 | hidden = list(zip(*hidden))
235 |
236 | for l in range(num_layers):
237 | hy, output = layer(input, hidden[l], cells[l], mask)
238 | next_hidden.append(hy)
239 | input = output
240 |
241 | if lstm:
242 | next_h, next_c = zip(*next_hidden)
243 | next_hidden = (
244 | torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
245 | torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
246 | )
247 | else:
248 | next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())
249 |
250 | return next_hidden, input
251 |
252 | return forward
253 |
254 |
255 | def AutogradVarMaskedStep(num_layers=1, lstm=False):
256 | layer = VarMaskedStep()
257 |
258 | func = StackedStep(layer,
259 | num_layers,
260 | lstm=lstm)
261 |
262 | def forward(input, cells, hidden, mask):
263 | nexth, output = func(input, hidden, cells, mask)
264 | return output, nexth
265 |
266 | return forward
267 |
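The noise_in / noise_hidden arguments threaded through the cells above are dropout masks sampled once per sequence and reused at every time step, in the style of Gal and Ghahramani's variational dropout, rather than resampled each step as in standard dropout. A sketch of building such a mask (recent torch API, illustrative shapes):

    import torch

    p = 0.5                                          # dropout probability
    batch, hidden = 4, 8
    keep = torch.full((batch, hidden), 1 - p)
    noise_hidden = torch.bernoulli(keep) / (1 - p)   # inverted-dropout scaling
    # The same noise_hidden is then passed to the cell at every time step, e.g.
    # VarRNNTanhCell(x_t, h, w_ih, w_hh, noise_hidden=noise_hidden)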
--------------------------------------------------------------------------------
/neuronlp2/nn/init.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from torch.autograd import Variable
4 |
5 |
6 | def assign_tensor(tensor, val):
7 | """
8 | copy val to tensor
9 | Args:
10 | tensor: an n-dimensional torch.Tensor or autograd.Variable
11 | val: an n-dimensional torch.Tensor to fill the tensor with
12 |
13 | Returns:
14 |
15 | """
16 | if isinstance(tensor, Variable):
17 | assign_tensor(tensor.data, val)
18 | return tensor
19 | return tensor.copy_(val)
20 |
--------------------------------------------------------------------------------
/neuronlp2/nn/modules/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .masked_rnn import *
4 | from .variational_rnn import *
5 | from .skipconnect_rnn import *
6 | from .crf import *
7 | from .sparse import *
8 | from .attention import *
9 | from .linear import *
10 |
--------------------------------------------------------------------------------
/neuronlp2/nn/modules/attention.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.nn.parameter import Parameter
7 |
8 |
9 | class BiAAttention(nn.Module):
10 | '''
11 | Bi-Affine attention layer.
12 | '''
13 |
14 | def __init__(self, input_size_encoder, input_size_decoder, num_labels, biaffine=True, **kwargs):
15 | '''
16 |
17 | Args:
18 | input_size_encoder: int
19 | the dimension of the encoder input.
20 | input_size_decoder: int
21 | the dimension of the decoder input.
22 | num_labels: int
23 | the number of labels of the crf layer
24 | biaffine: bool
25 | whether to apply the bi-affine parameter.
26 | **kwargs:
27 | '''
28 | super(BiAAttention, self).__init__()
29 | self.input_size_encoder = input_size_encoder
30 | self.input_size_decoder = input_size_decoder
31 | self.num_labels = num_labels
32 | self.biaffine = biaffine
33 |
34 | self.W_d = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder))
35 | self.W_e = Parameter(torch.Tensor(self.num_labels, self.input_size_encoder))
36 | self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
37 | if self.biaffine:
38 | self.U = Parameter(torch.Tensor(self.num_labels, self.input_size_decoder, self.input_size_encoder))
39 | else:
40 | self.register_parameter('U', None)
41 |
42 | self.reset_parameters()
43 |
44 | def reset_parameters(self):
45 | nn.init.xavier_uniform(self.W_d)
46 | nn.init.xavier_uniform(self.W_e)
47 | nn.init.constant(self.b, 0.)
48 | if self.biaffine:
49 | nn.init.xavier_uniform(self.U)
50 |
51 | def forward(self, input_d, input_e, mask_d=None, mask_e=None):
52 | '''
53 |
54 | Args:
55 | input_d: Tensor
56 | the decoder input tensor with shape = [batch, length_decoder, input_size]
57 | input_e: Tensor
58 | the child input tensor with shape = [batch, length_encoder, input_size]
59 | mask_d: Tensor or None
60 | the mask tensor for decoder with shape = [batch, length_decoder]
61 | mask_e: Tensor or None
62 | the mask tensor for encoder with shape = [batch, length_encoder]
63 |
64 | Returns: Tensor
65 | the energy tensor with shape = [batch, num_label, length, length]
66 |
67 | '''
68 | assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
69 | batch, length_decoder, _ = input_d.size()
70 | _, length_encoder, _ = input_e.size()
71 |
72 | # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
73 | # the output shape is [batch, num_label, length_decoder]
74 | out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
75 | # compute encoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
76 | # the output shape is [batch, num_label, length_encoder]
77 | out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)
78 |
79 | # output shape [batch, num_label, length_decoder, length_encoder]
80 | if self.biaffine:
81 | # compute bi-affine part
82 | # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
83 | # output shape [batch, num_label, length_decoder, input_size_encoder]
84 | output = torch.matmul(input_d.unsqueeze(1), self.U)
85 | # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
86 | # output shape [batch, num_label, length_decoder, length_encoder]
87 | output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))
88 |
89 | output = output + out_d + out_e + self.b
90 | else:
91 | output = out_d + out_e + self.b
92 |
93 | if mask_d is not None and mask_e is not None:
94 | output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2)
95 |
96 | return output
97 |
98 |
99 | class ConcatAttention(nn.Module):
100 | '''
101 | Concatenate attention layer.
102 | '''
103 | # TODO test it!
104 |
105 | def __init__(self, input_size_encoder, input_size_decoder, hidden_size, num_labels, **kwargs):
106 | '''
107 |
108 | Args:
109 | input_size_encoder: int
110 | the dimension of the encoder input.
111 | input_size_decoder: int
112 | the dimension of the decoder input.
113 | hidden_size: int
114 | the dimension of the hidden.
115 | num_labels: int
116 | the number of labels of the crf layer
117 | biaffine: bool
118 | unused by this layer; only accepted via **kwargs.
119 | **kwargs:
120 | '''
121 | super(ConcatAttention, self).__init__()
122 | self.input_size_encoder = input_size_encoder
123 | self.input_size_decoder = input_size_decoder
124 | self.hidden_size = hidden_size
125 | self.num_labels = num_labels
126 |
127 | self.W_d = Parameter(torch.Tensor(self.input_size_decoder, self.hidden_size))
128 | self.W_e = Parameter(torch.Tensor(self.input_size_encoder, self.hidden_size))
129 | self.b = Parameter(torch.Tensor(self.hidden_size))
130 | self.v = Parameter(torch.Tensor(self.hidden_size, self.num_labels))
131 |
132 | self.reset_parameters()
133 |
134 | def reset_parameters(self):
135 | nn.init.xavier_uniform(self.W_d)
136 | nn.init.xavier_uniform(self.W_e)
137 | nn.init.xavier_uniform(self.v)
138 | nn.init.constant(self.b, 0.)
139 |
140 | def forward(self, input_d, input_e, mask_d=None, mask_e=None):
141 | '''
142 |
143 | Args:
144 | input_d: Tensor
145 | the decoder input tensor with shape = [batch, length_decoder, input_size]
146 | input_e: Tensor
147 | the encoder input tensor with shape = [batch, length_encoder, input_size]
148 | mask_d: Tensor or None
149 | the mask tensor for decoder with shape = [batch, length_decoder]
150 | mask_e: Tensor or None
151 | the mask tensor for encoder with shape = [batch, length_encoder]
152 |
153 | Returns: Tensor
154 | the energy tensor with shape = [batch, num_label, length_decoder, length_encoder]
155 |
156 | '''
157 | assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
158 | batch, length_decoder, _ = input_d.size()
159 | _, length_encoder, _ = input_e.size()
160 |
161 | # compute decoder part: [batch, length_decoder, input_size_decoder] * [input_size_decoder, hidden_size]
162 | # the output shape is [batch, length_decoder, hidden_size]
163 | # then --> [batch, 1, length_decoder, hidden_size]
164 | out_d = torch.matmul(input_d, self.W_d).unsqueeze(1)
165 | # compute encoder part: [batch, length_encoder, input_size_encoder] * [input_size_encoder, hidden_size]
166 | # the output shape is [batch, length_encoder, hidden_size]
167 | # then --> [batch, length_encoder, 1, hidden_size]
168 | out_e = torch.matmul(input_e, self.W_e).unsqueeze(2)
169 |
170 | # add them together [batch, length_encoder, length_decoder, hidden_size]
171 | out = F.tanh(out_d + out_e + self.b)
172 |
173 | # product with v
174 | # [batch, length_encoder, length_decoder, hidden_size] * [hidden, num_label]
175 | # [batch, length_encoder, length_decoder, num_labels]
176 | # then --> [batch, num_labels, length_decoder, length_encoder]
177 | return torch.matmul(out, self.v).transpose(1, 3)
178 |
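
A small shape check for BiAAttention with made-up sizes; the energy axes follow the forward docstring:

import torch
from torch.autograd import Variable
from neuronlp2.nn.modules.attention import BiAAttention

attention = BiAAttention(input_size_encoder=256, input_size_decoder=256, num_labels=5)
input_d = Variable(torch.randn(2, 7, 256))  # [batch, length_decoder, dim]
input_e = Variable(torch.randn(2, 9, 256))  # [batch, length_encoder, dim]
energy = attention(input_d, input_e)
print(energy.size())  # (2, 5, 7, 9): [batch, num_labels, length_decoder, length_encoder]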
--------------------------------------------------------------------------------
/neuronlp2/nn/modules/linear.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import math
4 | import numpy as np
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.nn.parameter import Parameter
9 |
10 |
11 | class BiLinear(nn.Module):
12 | '''
13 | Bi-linear layer
14 | '''
15 | def __init__(self, left_features, right_features, out_features, bias=True):
16 | '''
17 |
18 | Args:
19 | left_features: size of left input
20 | right_features: size of right input
21 | out_features: size of output
22 | bias: If set to False, the layer will not learn an additive bias.
23 | Default: True
24 | '''
25 | super(BiLinear, self).__init__()
26 | self.left_features = left_features
27 | self.right_features = right_features
28 | self.out_features = out_features
29 |
30 | self.U = Parameter(torch.Tensor(self.out_features, self.left_features, self.right_features))
31 | self.W_l = Parameter(torch.Tensor(self.out_features, self.left_features))
32 | self.W_r = Parameter(torch.Tensor(self.out_features, self.right_features))
33 |
34 | if bias:
35 | self.bias = Parameter(torch.Tensor(out_features))
36 | else:
37 | self.register_parameter('bias', None)
38 |
39 | self.reset_parameters()
40 |
41 | def reset_parameters(self):
42 | nn.init.xavier_uniform(self.W_l)
43 | nn.init.xavier_uniform(self.W_r)
44 | if self.bias is not None: nn.init.constant(self.bias, 0.)
45 | nn.init.xavier_uniform(self.U)
46 |
47 | def forward(self, input_left, input_right):
48 | '''
49 |
50 | Args:
51 | input_left: Tensor
52 | the left input tensor with shape = [batch1, batch2, ..., left_features]
53 | input_right: Tensor
54 | the right input tensor with shape = [batch1, batch2, ..., right_features]
55 |
56 | Returns:
57 |
58 | '''
59 |
60 | left_size = input_left.size()
61 | right_size = input_right.size()
62 | assert left_size[:-1] == right_size[:-1], \
63 | "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
64 | batch = int(np.prod(left_size[:-1]))
65 |
66 | # convert left and right input to matrices [batch, left_features], [batch, right_features]
67 | input_left = input_left.view(batch, self.left_features)
68 | input_right = input_right.view(batch, self.right_features)
69 |
70 | # output [batch, out_features]
71 | output = F.bilinear(input_left, input_right, self.U, self.bias)
72 | output = output + F.linear(input_left, self.W_l, None) + F.linear(input_right, self.W_r, None)
73 | # convert back to [batch1, batch2, ..., out_features]
74 | return output.view(left_size[:-1] + (self.out_features, ))
75 |
76 | def __repr__(self):
77 | return self.__class__.__name__ + ' (' \
78 | + 'in1_features=' + str(self.left_features) \
79 | + ', in2_features=' + str(self.right_features) \
80 | + ', out_features=' + str(self.out_features) + ')'
81 |
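
A usage sketch for BiLinear over batched inputs (sizes are illustrative):

import torch
from torch.autograd import Variable
from neuronlp2.nn.modules.linear import BiLinear

bilinear = BiLinear(left_features=128, right_features=128, out_features=10)
left = Variable(torch.randn(4, 6, 128))
right = Variable(torch.randn(4, 6, 128))
output = bilinear(left, right)
print(output.size())  # (4, 6, 10): leading batch dims are preserved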
--------------------------------------------------------------------------------
/neuronlp2/nn/modules/sparse.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 | from torch.autograd import Variable
7 | from torch.nn.parameter import Parameter
8 |
9 | from ..init import assign_tensor
10 |
11 |
12 | class Embedding(nn.Module):
13 | r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
14 | This module is often used to store word embeddings and retrieve them using indices.
15 | The input to the module is a list of indices, and the output is the corresponding
16 | word embeddings.
17 | Args:
18 | num_embeddings (int): size of the dictionary of embeddings
19 | embedding_dim (int): the size of each embedding vector
20 | init_embedding (Tensor or Variable): If given, the embedding will be initialized with the given tensor.
21 | freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
22 | padding_idx (int, optional): If given, pads the output with zeros whenever it encounters the index.
23 | max_norm (float, optional): If given, will renormalize the embeddings to always have a norm lesser than this
24 | norm_type (float, optional): The p of the p-norm to compute for the max_norm option
25 | scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the frequency of
26 | the words in the mini-batch.
27 | sparse (boolean, optional): if True, gradient w.r.t. weight matrix will be a sparse tensor. See Notes for
28 | more details regarding sparse gradients.
29 | Attributes:
30 | weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
31 | Shape:
32 | - Input: LongTensor `(N1, N2, ...,Nm, W)`, N = mini-batch, W = number of indices to extract per mini-batch
33 | - Output: `(N1, N2, ..., Nm, W, embedding_dim)`
34 | Notes:
35 | Keep in mind that only a limited number of optimizers support
36 | sparse gradients: currently it's `optim.SGD` (`cuda` and `cpu`),
37 | and `optim.Adagrad` (`cpu`)
38 | """
39 |
40 | def __init__(self, num_embeddings, embedding_dim, init_embedding=None, freeze=False, padding_idx=None,
41 | max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False):
42 | super(Embedding, self).__init__()
43 | self.num_embeddings = num_embeddings
44 | self.embedding_dim = embedding_dim
45 | self.padding_idx = padding_idx
46 | self.max_norm = max_norm
47 | self.norm_type = norm_type
48 | self.scale_grad_by_freq = scale_grad_by_freq
49 | self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
50 | self.frozen = freeze
51 | self.sparse = sparse
52 |
53 | self.reset_parameters(init_embedding)
54 |
55 | def reset_parameters(self, init_embedding):
56 | if init_embedding is None:
57 | scale = np.sqrt(3.0 / self.embedding_dim)
58 | self.weight.data.uniform_(-scale, scale)
59 | else:
60 | assign_tensor(self.weight, init_embedding)
61 | if self.padding_idx is not None:
62 | self.weight.data[self.padding_idx].fill_(0)
63 |
64 | if self.frozen:
65 | if init_embedding is None:
66 | raise Warning('cannot freeze embeddings that are randomly initialized; pass init_embedding.')
67 | self.weight.requires_grad = False
68 |
69 | def freeze(self):
70 | self.weight.requires_grad = False
71 | self.frozen = True
72 |
73 | def forward(self, input):
74 | padding_idx = self.padding_idx
75 | if padding_idx is None:
76 | padding_idx = -1
77 |
78 | input_size = input.size()
79 | if input.dim() > 2:
80 | num_inputs = int(np.prod(input_size[:-1]))
81 | input = input.view(num_inputs, input_size[-1])
82 |
83 | output_size = input_size + (self.embedding_dim,)
84 | return self._backend.Embedding.apply(
85 | input, self.weight,
86 | padding_idx, self.max_norm, self.norm_type,
87 | self.scale_grad_by_freq, self.sparse).view(output_size)
88 |
89 | def __repr__(self):
90 | s = '{name}({num_embeddings}, {embedding_dim}'
91 | if self.padding_idx is not None:
92 | s += ', padding_idx={padding_idx}'
93 | if self.max_norm is not None:
94 | s += ', max_norm={max_norm}'
95 | if self.norm_type != 2:
96 | s += ', norm_type={norm_type}'
97 | if self.scale_grad_by_freq is not False:
98 | s += ', scale_grad_by_freq={scale_grad_by_freq}'
99 | if self.sparse is not False:
100 | s += ', sparse=True'
101 | s += ')'
102 | return s.format(name=self.__class__.__name__, **self.__dict__)
103 |
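
A sketch of loading a frozen pretrained table into this Embedding (toy numbers; index 0 reserved for padding):

import torch
from torch.autograd import Variable
from neuronlp2.nn.modules.sparse import Embedding

pretrained = torch.randn(10, 4)  # 10 words, 4 dimensions
embedding = Embedding(10, 4, init_embedding=pretrained, freeze=True, padding_idx=0)
ids = Variable(torch.LongTensor([[1, 2, 0]]))
vectors = embedding(ids)
print(vectors.size())  # (1, 3, 4); the row looked up for index 0 is all zeros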
--------------------------------------------------------------------------------
/neuronlp2/nn/utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | from itertools import repeat
3 | import torch
4 | import torch.nn.utils.rnn as rnn_utils
5 | from torch.autograd import Variable
6 |
7 |
8 | def _ntuple(n):
9 | def parse(x):
10 | if isinstance(x, collections.Iterable):
11 | return x
12 | return tuple(repeat(x, n))
13 | return parse
14 |
15 | _single = _ntuple(1)
16 | _pair = _ntuple(2)
17 | _triple = _ntuple(3)
18 | _quadruple = _ntuple(4)
19 |
20 |
21 | def prepare_rnn_seq(rnn_input, lengths, hx=None, masks=None, batch_first=False):
22 | '''
23 |
24 | Args:
25 | rnn_input: [seq_len, batch, input_size]: tensor containing the features of the input sequence.
26 | lengths: [batch]: tensor containing the lengths of the input sequence
27 | hx: [num_layers * num_directions, batch, hidden_size]: tensor containing the initial hidden state for each element in the batch.
28 | masks: [seq_len, batch]: tensor containing the mask for each element in the batch.
29 | batch_first: If True, then the input and output tensors are provided as [batch, seq_len, feature].
30 |
31 | Returns:
32 |
33 | '''
34 | def check_decreasing(lengths):
35 | lens, order = torch.sort(lengths, dim=0, descending=True)
36 | if torch.ne(lens, lengths).sum() == 0:
37 | return None
38 | else:
39 | _, rev_order = torch.sort(order)
40 | return lens, Variable(order), Variable(rev_order)
41 |
42 | check_res = check_decreasing(lengths)
43 |
44 | if check_res is None:
45 | lens = lengths
46 | rev_order = None
47 | else:
48 | lens, order, rev_order = check_res
49 | batch_dim = 0 if batch_first else 1
50 | rnn_input = rnn_input.index_select(batch_dim, order)
51 | if hx is not None:
52 | # hack lstm
53 | if isinstance(hx, tuple):
54 | hx, cx = hx
55 | hx = hx.index_select(1, order)
56 | cx = cx.index_select(1, order)
57 | hx = (hx, cx)
58 | else:
59 | hx = hx.index_select(1, order)
60 |
61 | lens = lens.tolist()
62 | seq = rnn_utils.pack_padded_sequence(rnn_input, lens, batch_first=batch_first)
63 | if masks is not None:
64 | if batch_first:
65 | masks = masks[:, :lens[0]]
66 | else:
67 | masks = masks[:lens[0]]
68 | return seq, hx, rev_order, masks
69 |
70 |
71 | def recover_rnn_seq(seq, rev_order, hx=None, batch_first=False):
72 | output, _ = rnn_utils.pad_packed_sequence(seq, batch_first=batch_first)
73 | if rev_order is not None:
74 | batch_dim = 0 if batch_first else 1
75 | output = output.index_select(batch_dim, rev_order)
76 | if hx is not None:
77 | # hack lstm
78 | if isinstance(hx, tuple):
79 | hx, cx = hx
80 | hx = hx.index_select(1, rev_order)
81 | cx = cx.index_select(1, rev_order)
82 | hx = (hx, cx)
83 | else:
84 | hx = hx.index_select(1, rev_order)
85 | return output, hx
86 |
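
A round-trip sketch for prepare_rnn_seq / recover_rnn_seq with unsorted lengths (toy sizes):

import torch
import torch.nn as nn
from torch.autograd import Variable
from neuronlp2.nn.utils import prepare_rnn_seq, recover_rnn_seq

rnn = nn.LSTM(input_size=8, hidden_size=16)
rnn_input = Variable(torch.randn(5, 3, 8))  # [seq_len, batch, input_size]
lengths = torch.LongTensor([3, 5, 2])       # unsorted, so the reorder path triggers
seq, hx, rev_order, masks = prepare_rnn_seq(rnn_input, lengths)
output, hn = rnn(seq, hx)
output, hn = recover_rnn_seq(output, rev_order, hx=hn)
print(output.size())  # (5, 3, 16), batch restored to the original order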
--------------------------------------------------------------------------------
/neuronlp2/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | from .parser import *
4 |
--------------------------------------------------------------------------------
/neuronlp2/tasks/parser.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import re
4 | import numpy as np
5 |
6 | def is_uni_punctuation(word):
7 | match = re.match(r"^[^\w\s]+$", word, flags=re.UNICODE)
8 | return match is not None
9 |
10 | def is_punctuation(word, pos, punct_set=None):
11 | if punct_set is None:
12 | return is_uni_punctuation(word)
13 | else:
14 | return pos in punct_set
15 |
16 |
17 | def eval(words, postags, heads_pred, types_pred, heads, types, word_alphabet, pos_alphabet, lengths,
18 | punct_set=None, symbolic_root=False, symbolic_end=False):
19 | batch_size, _ = words.shape
20 | ucorr = 0.
21 | lcorr = 0.
22 | total = 0.
23 | ucomplete_match = 0.
24 | lcomplete_match = 0.
25 |
26 | ucorr_nopunc = 0.
27 | lcorr_nopunc = 0.
28 | total_nopunc = 0.
29 | ucomplete_match_nopunc = 0.
30 | lcomplete_match_nopunc = 0.
31 |
32 | corr_root = 0.
33 | total_root = 0.
34 | start = 1 if symbolic_root else 0
35 | end = 1 if symbolic_end else 0
36 | for i in range(batch_size):
37 | ucm = 1.
38 | lcm = 1.
39 | ucm_nopunc = 1.
40 | lcm_nopunc = 1.
41 | for j in range(start, lengths[i] - end):
42 | word = word_alphabet.get_instance(words[i, j])
43 | word = word.encode('utf8')
44 |
45 | pos = pos_alphabet.get_instance(postags[i, j])
46 | pos = pos.encode('utf8')
47 |
48 | total += 1
49 | if heads[i, j] == heads_pred[i, j]:
50 | ucorr += 1
51 | if types[i, j] == types_pred[i, j]:
52 | lcorr += 1
53 | else:
54 | lcm = 0
55 | else:
56 | ucm = 0
57 | lcm = 0
58 |
59 | if not is_punctuation(word, pos, punct_set):
60 | total_nopunc += 1
61 | if heads[i, j] == heads_pred[i, j]:
62 | ucorr_nopunc += 1
63 | if types[i, j] == types_pred[i, j]:
64 | lcorr_nopunc += 1
65 | else:
66 | lcm_nopunc = 0
67 | else:
68 | ucm_nopunc = 0
69 | lcm_nopunc = 0
70 |
71 | if heads[i, j] == 0:
72 | total_root += 1
73 | corr_root += 1 if heads_pred[i, j] == 0 else 0
74 |
75 | ucomplete_match += ucm
76 | lcomplete_match += lcm
77 | ucomplete_match_nopunc += ucm_nopunc
78 | lcomplete_match_nopunc += lcm_nopunc
79 |
80 | return (ucorr, lcorr, total, ucomplete_match, lcomplete_match), \
81 | (ucorr_nopunc, lcorr_nopunc, total_nopunc, ucomplete_match_nopunc, lcomplete_match_nopunc), \
82 | (corr_root, total_root), batch_size
83 |
84 |
85 | def decode_MST(energies, lengths, leading_symbolic=0, labeled=True):
86 | """
87 | decode best parsing tree with MST algorithm.
88 | :param energies: numpy 4D tensor
89 | energies of each edge. the shape is [batch_size, num_labels, n_steps, n_steps],
90 | where the dummy root is at index 0.
91 | :param lengths: numpy 1D tensor
92 | lengths of each instance in the shape [batch_size].
93 | :param leading_symbolic: int
94 | number of symbolic dependency types leading in type alphabets
95 | :return: pars (and types when labeled) holding the predicted head and label ids.
96 | """
97 |
98 | def find_cycle(par):
99 | added = np.zeros([length], np.bool)
100 | added[0] = True
101 | cycle = set()
102 | findcycle = False
103 | for i in range(1, length):
104 | if findcycle:
105 | break
106 |
107 | if added[i] or not curr_nodes[i]:
108 | continue
109 |
110 | # init cycle
111 | tmp_cycle = set()
112 | tmp_cycle.add(i)
113 | added[i] = True
114 | findcycle = True
115 | l = i
116 |
117 | while par[l] not in tmp_cycle:
118 | l = par[l]
119 | if added[l]:
120 | findcycle = False
121 | break
122 | added[l] = True
123 | tmp_cycle.add(l)
124 |
125 | if findcycle:
126 | lorg = l
127 | cycle.add(lorg)
128 | l = par[lorg]
129 | while l != lorg:
130 | cycle.add(l)
131 | l = par[l]
132 | break
133 |
134 | return findcycle, cycle
135 |
136 | def chuLiuEdmonds():
137 | par = np.zeros([length], dtype=np.int32)
138 | # create best graph
139 | par[0] = -1
140 | for i in range(1, length):
141 | # only interested at current nodes
142 | if curr_nodes[i]:
143 | max_score = score_matrix[0, i]
144 | par[i] = 0
145 | for j in range(1, length):
146 | if j == i or not curr_nodes[j]:
147 | continue
148 |
149 | new_score = score_matrix[j, i]
150 | if new_score > max_score:
151 | max_score = new_score
152 | par[i] = j
153 |
154 | # find a cycle
155 | findcycle, cycle = find_cycle(par)
156 | # no cycles, get all edges and return them.
157 | if not findcycle:
158 | final_edges[0] = -1
159 | for i in range(1, length):
160 | if not curr_nodes[i]:
161 | continue
162 |
163 | pr = oldI[par[i], i]
164 | ch = oldO[par[i], i]
165 | final_edges[ch] = pr
166 | return
167 |
168 | cyc_len = len(cycle)
169 | cyc_weight = 0.0
170 | cyc_nodes = np.zeros([cyc_len], dtype=np.int32)
171 | id = 0
172 | for cyc_node in cycle:
173 | cyc_nodes[id] = cyc_node
174 | id += 1
175 | cyc_weight += score_matrix[par[cyc_node], cyc_node]
176 |
177 | rep = cyc_nodes[0]
178 | for i in range(length):
179 | if not curr_nodes[i] or i in cycle:
180 | continue
181 |
182 | max1 = float("-inf")
183 | wh1 = -1
184 | max2 = float("-inf")
185 | wh2 = -1
186 |
187 | for j in range(cyc_len):
188 | j1 = cyc_nodes[j]
189 | if score_matrix[j1, i] > max1:
190 | max1 = score_matrix[j1, i]
191 | wh1 = j1
192 |
193 | scr = cyc_weight + score_matrix[i, j1] - score_matrix[par[j1], j1]
194 |
195 | if scr > max2:
196 | max2 = scr
197 | wh2 = j1
198 |
199 | score_matrix[rep, i] = max1
200 | oldI[rep, i] = oldI[wh1, i]
201 | oldO[rep, i] = oldO[wh1, i]
202 | score_matrix[i, rep] = max2
203 | oldO[i, rep] = oldO[i, wh2]
204 | oldI[i, rep] = oldI[i, wh2]
205 |
206 | rep_cons = []
207 | for i in range(cyc_len):
208 | rep_cons.append(set())
209 | cyc_node = cyc_nodes[i]
210 | for cc in reps[cyc_node]:
211 | rep_cons[i].add(cc)
212 |
213 | for i in range(1, cyc_len):
214 | cyc_node = cyc_nodes[i]
215 | curr_nodes[cyc_node] = False
216 | for cc in reps[cyc_node]:
217 | reps[rep].add(cc)
218 |
219 | chuLiuEdmonds()
220 |
221 | # check each node in cycle, if one of its representatives is a key in the final_edges, it is the one.
222 | found = False
223 | wh = -1
224 | for i in range(cyc_len):
225 | for repc in rep_cons[i]:
226 | if repc in final_edges:
227 | wh = cyc_nodes[i]
228 | found = True
229 | break
230 | if found:
231 | break
232 |
233 | l = par[wh]
234 | while l != wh:
235 | ch = oldO[par[l], l]
236 | pr = oldI[par[l], l]
237 | final_edges[ch] = pr
238 | l = par[l]
239 |
240 | if labeled:
241 | assert energies.ndim == 4, 'dimension of energies is not equal to 4'
242 | else:
243 | assert energies.ndim == 3, 'dimension of energies is not equal to 3'
244 | input_shape = energies.shape
245 | batch_size = input_shape[0]
246 | max_length = input_shape[2]
247 |
248 | pars = np.zeros([batch_size, max_length], dtype=np.int32)
249 | types = np.zeros([batch_size, max_length], dtype=np.int32) if labeled else None
250 | for i in range(batch_size):
251 | energy = energies[i]
252 |
253 | # compute the real length of this instance
254 | length = lengths[i]
255 |
256 | # compute the real energy matrix, shape = [num_labels - #symbolic, length, length] (drop the leading symbolic labels).
257 | if labeled:
258 | energy = energy[leading_symbolic:, :length, :length]
259 | # get best label for each edge.
260 | label_id_matrix = energy.argmax(axis=0) + leading_symbolic
261 | energy = energy.max(axis=0)
262 | else:
263 | energy = energy[:length, :length]
264 | label_id_matrix = None
265 | # get original score matrix
266 | orig_score_matrix = energy
267 | # initialize score matrix to original score matrix
268 | score_matrix = np.array(orig_score_matrix, copy=True)
269 |
270 | oldI = np.zeros([length, length], dtype=np.int32)
271 | oldO = np.zeros([length, length], dtype=np.int32)
272 | curr_nodes = np.zeros([length], dtype=np.bool)
273 | reps = []
274 |
275 | for s in range(length):
276 | orig_score_matrix[s, s] = 0.0
277 | score_matrix[s, s] = 0.0
278 | curr_nodes[s] = True
279 | reps.append(set())
280 | reps[s].add(s)
281 | for t in range(s + 1, length):
282 | oldI[s, t] = s
283 | oldO[s, t] = t
284 |
285 | oldI[t, s] = t
286 | oldO[t, s] = s
287 |
288 | final_edges = dict()
289 | chuLiuEdmonds()
290 | par = np.zeros([max_length], np.int32)
291 | if labeled:
292 | type = np.ones([max_length], np.int32)
293 | type[0] = 0
294 | else:
295 | type = None
296 |
297 | for ch, pr in final_edges.items():
298 | par[ch] = pr
299 | if labeled and ch != 0:
300 | type[ch] = label_id_matrix[pr, ch]
301 |
302 | par[0] = 0
303 | pars[i] = par
304 | if labeled:
305 | types[i] = type
306 |
307 | return pars, types
308 |
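
A toy unlabeled call to decode_MST; the energies are random and index 0 plays the dummy root:

import numpy as np
from neuronlp2.tasks.parser import decode_MST

np.random.seed(0)
energies = np.random.rand(1, 4, 4).astype(np.float32)  # [batch, n_steps, n_steps]
lengths = np.array([4], dtype=np.int64)
pars, types = decode_MST(energies, lengths, labeled=False)
print(pars[0])  # head index for every token; the root's head is 0
print(types)    # None in the unlabeled setting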
--------------------------------------------------------------------------------
/neuronlp2/utils.py:
--------------------------------------------------------------------------------
1 | __author__ = 'max'
2 |
3 | import pickle
4 | import numpy as np
5 | from gensim.models import KeyedVectors
6 | import gzip
7 |
8 | from .io import utils
9 |
10 |
11 | def load_embedding_dict(embedding, embedding_path, normalize_digits=True):
12 | """
13 | load word embeddings from file
14 | :param embedding: embedding type, one of [word2vec, glove, senna, sskip, polyglot]
15 | :param embedding_path: path to the embedding file
16 | :return: embedding dict, embedding dimension
17 | """
18 | print("loading embedding: %s from %s" % (embedding, embedding_path))
19 | if embedding == 'word2vec':
20 | # loading word2vec
21 | word2vec = KeyedVectors.load_word2vec_format(embedding_path, binary=True)
22 | embedd_dim = word2vec.vector_size
23 | return word2vec, embedd_dim
24 | elif embedding == 'glove':
25 | # loading GloVe
26 | embedd_dim = -1
27 | embedd_dict = dict()
28 | with gzip.open(embedding_path, 'r') as file:
29 | for line in file:
30 | line = line.strip()
31 | line = line.decode('utf-8')
32 | if len(line) == 0:
33 | continue
34 |
35 | tokens = line.split()
36 | if embedd_dim < 0:
37 | embedd_dim = len(tokens) - 1
38 | else:
39 | assert (embedd_dim + 1 == len(tokens))
40 | embedd = np.empty([1, embedd_dim], dtype=np.float32)
41 | embedd[:] = tokens[1:]
42 | word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
43 | embedd_dict[word] = embedd
44 | return embedd_dict, embedd_dim
45 | elif embedding == 'senna':
46 | # loading Senna
47 | embedd_dim = -1
48 | embedd_dict = dict()
49 | with gzip.open(embedding_path, 'r') as file:
50 | for line in file:
51 | line = line.strip()
52 | line = line.decode('utf-8')
53 | if len(line) == 0:
54 | continue
55 |
56 | tokens = line.split()
57 | if embedd_dim < 0:
58 | embedd_dim = len(tokens) - 1
59 | else:
60 | assert (embedd_dim + 1 == len(tokens))
61 | embedd = np.empty([1, embedd_dim], dtype=np.float32)
62 | embedd[:] = tokens[1:]
63 | word = utils.DIGIT_RE.sub(b"0", tokens[0]) if normalize_digits else tokens[0]
64 | embedd_dict[word] = embedd
65 | return embedd_dict, embedd_dim
66 | elif embedding == 'sskip':
67 | embedd_dim = -1
68 | embedd_dict = dict()
69 | with gzip.open(embedding_path, 'r') as file:
70 | # skip the first line
71 | file.readline()
72 | for line in file:
73 | line = line.strip()
74 | try:
75 | line = line.decode('utf-8')
76 | if len(line) == 0:
77 | continue
78 |
79 | tokens = line.split()
80 | if len(tokens) < embedd_dim:
81 | continue
82 |
83 | if embedd_dim < 0:
84 | embedd_dim = len(tokens) - 1
85 |
86 | embedd = np.empty([1, embedd_dim], dtype=np.float32)
87 | start = len(tokens) - embedd_dim
88 | word = ' '.join(tokens[0:start])
89 | embedd[:] = tokens[start:]
90 | word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word
91 | embedd_dict[word] = embedd
92 | except UnicodeDecodeError:
93 | continue
94 | return embedd_dict, embedd_dim
95 | elif embedding == 'polyglot':
96 | words, embeddings = pickle.load(open(embedding_path, 'rb'))
97 | _, embedd_dim = embeddings.shape
98 | embedd_dict = dict()
99 | for i, word in enumerate(words):
100 | embedd = np.empty([1, embedd_dim], dtype=np.float32)
101 | embedd[:] = embeddings[i, :]
102 | word = utils.DIGIT_RE.sub(b"0", word) if normalize_digits else word
103 | embedd_dict[word] = embedd
104 | return embedd_dict, embedd_dim
105 |
106 | else:
107 | raise ValueError("embedding must be one of [word2vec, senna, glove, sskip, polyglot]")
108 |
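
Typical use of load_embedding_dict for GloVe; the gzipped path below is illustrative, not shipped with the repo:

from neuronlp2.utils import load_embedding_dict

embedd_dict, embedd_dim = load_embedding_dict('glove', 'data/glove.6B.100d.txt.gz')
print(embedd_dim)                # 100 for 100-dimensional vectors
print(embedd_dict['the'].shape)  # (1, 100): each entry is stored as a row vector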
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backports.shutil-get-terminal-size==1.0.0
2 | boto==2.49.0
3 | boto3==1.9.93
4 | botocore==1.12.93
5 | bz2file==0.98
6 | certifi==2019.6.16
7 | chardet==3.0.4
8 | decorator==4.3.2
9 | docutils==0.14
10 | enum34==1.1.6
11 | futures==3.2.0
12 | gensim==3.7.1
13 | idna==2.8
14 | ipdb==0.11
15 | ipython==5.8.0
16 | ipython-genutils==0.2.0
17 | jmespath==0.9.3
18 | mkl-fft==1.0.6
19 | mkl-random==1.0.1
20 | nltk==3.4.1
21 | numpy==1.16.1
22 | pathlib2==2.3.3
23 | pexpect==4.6.0
24 | pickleshare==0.7.5
25 | prompt-toolkit==1.0.15
26 | ptyprocess==0.6.0
27 | Pygments==2.3.1
28 | python-dateutil==2.8.0
29 | PyYAML==3.13
30 | requests==2.21.0
31 | s3transfer==0.2.0
32 | scandir==1.9.0
33 | scikit-learn==0.20.3
34 | scipy==1.2.1
35 | simplegeneric==0.8.1
36 | singledispatch==3.4.0.3
37 | six==1.12.0
38 | sklearn==0.0
39 | smart-open==1.8.0
40 | torch==0.3.1
41 | traitlets==4.3.2
42 | urllib3==1.24.1
43 | wcwidth==0.1.7
44 |
--------------------------------------------------------------------------------
/rst_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch.autograd import Variable
4 |
5 | from NeuralRST.in_out.util import load_embedding_dict, get_logger
6 | from NeuralRST.in_out.preprocess import create_alphabet
7 | from NeuralRST.in_out.preprocess import batch_data_variable
8 | from NeuralRST.models.vocab import Vocab
9 | from NeuralRST.models.metric import Metric
10 | from NeuralRST.models.config import Config
11 | from NeuralRST.models.architecture import MainArchitecture
12 |
13 |
14 | class RSTModel(object):
15 | def __init__(self, rst_config_path):
16 | print("................................................")
17 | print("LOADING RST Model")
18 | self.config = Config(None)
19 | self.config.load_config(rst_config_path)
20 | self.logger = get_logger("RSTParser RUN", self.config.use_dynamic_oracle, self.config.model_path)
21 | word_alpha, tag_alpha, gold_action_alpha, action_label_alpha, etype_alpha = create_alphabet(None, self.config.alphabet_path, self.logger)
22 | self.vocab = Vocab(word_alpha, tag_alpha, etype_alpha, gold_action_alpha, action_label_alpha)
23 | self.network = MainArchitecture(self.vocab, self.config)
24 | self.network.load_state_dict(torch.load(self.config.model_name))
25 | if self.config.use_gpu:
26 | self.network = self.network.cuda()
27 | self.network.eval()
28 |
29 | def prepare_data(self, batch, batch_size):
30 | config = self.config
31 | vocab = self.vocab
32 | max_edu_len = -1
33 | max_edu_num = -1
34 | for data in batch:
35 | edu_num = len(data.edus)
36 | if edu_num > max_edu_num: max_edu_num = edu_num
37 | for edu in data.edus:
38 | edu_len = len(edu.words)
39 | if edu_len > max_edu_len: max_edu_len = edu_len
40 |
41 | edu_words = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
42 | edu_types = Variable(torch.LongTensor(batch_size, max_edu_num).zero_(), requires_grad=False)
43 | edu_syntax = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len, config.syntax_dim).zero_(), requires_grad=False)
44 | word_mask = Variable(torch.Tensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
45 | edu_tags = Variable(torch.LongTensor(batch_size, max_edu_num, max_edu_len).zero_(), requires_grad=False)
46 | edu_mask = Variable(torch.Tensor(batch_size, max_edu_num).zero_(), requires_grad=False)
47 | word_denominator = Variable(torch.ones(batch_size, max_edu_num).type(torch.FloatTensor) * -1, requires_grad=False)
48 | len_edus = np.zeros([batch_size], dtype=np.int64)
49 |
50 | for idx in range(batch_size):
51 | for idy in range(len(batch[idx].edus)):
52 | len_edus[idx] = len(batch[idx].edus)
53 | edu = batch[idx].edus[idy]
54 | edu_mask[idx, idy] = 1
55 | edu_types[idx, idy] = vocab.etype_alpha.word2id(edu.etype)
56 | edu_len = len(edu.words)
57 | word_denominator[idx, idy] = edu_len
58 | for idz in range(edu_len):
59 | word = edu.words[idz]
60 | tag = edu.tags[idz]
61 | edu_words[idx, idy, idz] = vocab.word_alpha.word2id(word)
62 | edu_tags[idx, idy, idz] = vocab.tag_alpha.word2id(tag)
63 | edu_syntax[idx, idy, idz] = edu.syntax_features[idz].view(config.syntax_dim)
64 | word_mask[idx, idy, idz] = 1
65 |
66 | if config.use_gpu:
67 | edu_words = edu_words.cuda()
68 | edu_tags = edu_tags.cuda()
69 | edu_types = edu_types.cuda()
70 | edu_mask = edu_mask.cuda()
71 | word_mask = word_mask.cuda()
72 | word_denominator = word_denominator.cuda()
73 | edu_syntax = edu_syntax.cuda()
74 |
75 | return edu_words, edu_tags, edu_types, edu_mask, word_mask, len_edus, word_denominator, edu_syntax
76 |
77 | def get_edu_representation(self, data_test):
78 | words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
79 | encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
80 | return encoder_output
81 |
82 | def get_subtree(self, data_test):
83 | words, tags, etypes, edu_mask, word_mask, len_edus, word_denominator, syntax = data_test
84 | self.network.training = False
85 | encoder_output = self.network.forward_all(words, tags, etypes, edu_mask, word_mask, word_denominator, syntax)
86 | results = self.network.decode(encoder_output, [], [], len_edus)
87 | return results
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/sentence.py:
--------------------------------------------------------------------------------
1 | class Sentence(object):
2 | def __init__ (self, words, seq_chars, tags, word_ids, seq_char_ids, tag_ids, edu_ids):
3 | self.words = words
4 | self.seq_chars = seq_chars
5 | self.tags = tags
6 | self.word_ids = word_ids
7 | self.seq_char_ids = seq_char_ids
8 | self.tag_ids = tag_ids
9 | self.edu_ids = edu_ids
10 |
11 | def length(self):
12 | return len(self.words)
13 |
14 | class Instance(object):
15 | def __init__(self, sentences, syntax_features):
16 | self.edus = []
17 |
18 | cur_edu_id = 1
19 | cur_words = []
20 | cur_tags = []
21 | cur_syntax = []
22 | for idx in range(len(sentences)):
23 | sentence = sentences[idx]
24 | syntax = syntax_features[idx]
25 | for idy in range(len(sentence.words)):
26 | if sentence.edu_ids[idy] != cur_edu_id:
27 | cur_edu_id += 1
28 | self.edus.append(EDU(cur_words, cur_tags, '', cur_syntax))
29 | cur_words = []
30 | cur_tags = []
31 | cur_syntax = []
32 | cur_words.append(sentence.words[idy])
33 | cur_tags.append(sentence.tags[idy])
34 | cur_syntax.append(syntax[:,idy,:])
35 | self.edus.append(EDU(cur_words, cur_tags, '\n', cur_syntax))
36 |
37 | class EDU(object):
38 | def __init__(self, words, tags, etype, syntax_features):
39 | self.words = words
40 | self.tags = tags
41 | self.etype = etype
42 | self.syntax_features = syntax_features
43 |
44 |
--------------------------------------------------------------------------------