├── .gitignore ├── LICENSE ├── README.md ├── config.py ├── data └── download.sh ├── eval.py ├── log └── .gitignore ├── model.py ├── model_param_space.py ├── nfetc_clsc.py ├── output └── .gitignore ├── pkl └── .gitignore ├── predict.py ├── prepkl └── .gitignore ├── requirement.txt ├── task.py └── utils ├── data_utils.py ├── embedding_utils.py ├── eval_utils.py ├── logging_utils.py ├── pkl_utils.py └── prior_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *.swp 4 | utils/__pycache__/ 5 | data/corpus/* 6 | data/glove* 7 | 8 | 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NFETC-CLSC 2 | 3 | Improving Distantly-supervised Entity Typing with Compact Latent Space Clustering 4 | 5 | Paper accepted by NAACL-HLT 2019: [NFETC-CLSC]( https://arxiv.org/pdf/1904.06475.pdf) 6 | 7 | ### Prerequisites 8 | - python 3.6.0 9 | - tensorflow == 1.6.0 10 | - hyperopt 11 | - gensim 12 | - sklearn 13 | - pandas 14 | 15 | Run `pip install -r requirement.txt` to satisfy the prerequisites. 16 | 17 | 18 | ### Dataset 19 | 20 | Run `./download.sh` to download the pre-trained word embeddings. 
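For reference, a minimal end-to-end sketch (assuming the commands are run from the repository root and the preprocessed dataset described in the next section has already been placed under `./data/`; the evaluation flags are explained in the Evaluation section below):

```bash
pip install -r requirement.txt       # install the prerequisites listed above
cd data && ./download.sh && cd ..    # fetch and unzip the GloVe embeddings
python eval.py -m nfetc_bbn_NFETC_CLSC -d bbn -r 5 -p 100 -a 0.0   # example evaluation run on BBN
```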
21 | 
22 | The preprocessed dataset can be downloaded from [Google Drive](https://drive.google.com/open?id=1opjfoA0I2mOjE11kM_TYsaeq-HHO1rqv) 
23 | 
24 | Put the data under the `./data/` directory 
25 | 
26 | ### Evaluation 
27 | 
28 | Run `python eval.py -m <model_name> -d <data_name> -r <runs> -p <portion> -a <alpha>` 
29 | 
30 | The scores for each run and the average scores are recorded in a log file stored in the `log` folder 
31 | 
32 | Available `<data_name>`: `bbn`, `ontonotes` 
33 | 
34 | Available `<model_name>`: `nfetc_bbn_NFETC_CLSC`, `nfetc_ontonotes_NFETC_CLSC` 
35 | 
36 | (these can be modified in `model_param_space.py`, which also contains the detailed hyper-parameter settings) 
37 | 
38 | Available `<portion>` for noisy data: `5, 10, 15, 20, 25, 100` 
39 | 
40 | Available `<portion>` for clean data: `500, 1000, 1500, 2000, 2500` (you need to prepare the corresponding training file as mentioned before) 
41 | 
42 | `<alpha>` is the hierarchy loss factor (`default == 0.0`) 
43 | 
44 | 
45 | ### Cite 
46 | 
47 | If you find this codebase or our work useful, please cite: 
48 | 
49 | ``` 
50 | @inproceedings{chen-etal-2019-improving, 
51 | title = "Improving Distantly-supervised Entity Typing with Compact Latent Space Clustering", 
52 | author = "Chen, Bo and 
53 | Gu, Xiaotao and 
54 | Hu, Yufeng and 
55 | Tang, Siliang and 
56 | Hu, Guoping and 
57 | Zhuang, Yueting and 
58 | Ren, Xiang", 
59 | booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", 
60 | month = jun, 
61 | year = "2019", 
62 | address = "Minneapolis, Minnesota", 
63 | publisher = "Association for Computational Linguistics", 
64 | url = "https://www.aclweb.org/anthology/N19-1294", 
65 | pages = "2862--2872", 
66 | } 
67 | ``` 
68 | 
69 | Note: 
70 | 
71 | This code is based on the previous work by [Peng Xu](https://github.com/billy-inn); many thanks to him. 
72 | 
73 | Sincere thanks to [Konstantinos Kamnitsas](https://github.com/Kamnitsask) for his guidance on the CLSC implementation and his advice on the paper writing. 
74 | 
75 | 
76 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 
1 | # -------------------- PATH --------------------- 
2 | 
3 | #ROOT_PATH = "/local/data2/pxu4/TypeClassification" 
4 | ROOT_PATH = "."
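# With ROOT_PATH = ".", the constants below expect a data layout roughly like the following
# (a sketch based on the file names defined in this config, not a guarantee):
#   ./data/glove.840B.300d.txt
#   ./data/OntoNotes/{train.txt, dev.txt, test.txt, train_clean.tsv, test_clean.tsv, type.pkl}
#   ./data/BBN/{train.txt, dev.txt, test.txt, train_clean.tsv, test_clean.tsv, type.pkl}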
5 | DATA_PATH = "%s/data" % ROOT_PATH 6 | ONTONOTES_DATA_PATH = "%s/OntoNotes" % DATA_PATH 7 | BBN_DATA_PATH="%s/BBN" % DATA_PATH 8 | LOG_DIR = "%s/log" % ROOT_PATH 9 | CHECKPOINT_DIR = "%s/checkpoint" % ROOT_PATH 10 | OUTPUT_DIR = "%s/output" % ROOT_PATH 11 | PKL_DIR='./pkl' 12 | 13 | EMBEDDING_DATA = "%s/glove.840B.300d.txt" % DATA_PATH 14 | testemb='testemb' 15 | prep='prep' 16 | 17 | # -------------------- DATA ---------------------- 18 | 19 | 20 | ONTONOTES_ALL = "%s/all.txt" % ONTONOTES_DATA_PATH 21 | ONTONOTES_TRAIN = "%s/train.txt" % ONTONOTES_DATA_PATH 22 | ONTONOTES_VALID = "%s/dev.txt" % ONTONOTES_DATA_PATH 23 | ONTONOTES_TEST = "%s/test.txt" % ONTONOTES_DATA_PATH 24 | 25 | ONTONOTES_TYPE = "%s/type.pkl" % ONTONOTES_DATA_PATH 26 | ONTONOTES_TRAIN_CLEAN = "%s/train_clean.tsv" % ONTONOTES_DATA_PATH 27 | ONTONOTES_TEST_CLEAN = "%s/test_clean.tsv" % ONTONOTES_DATA_PATH 28 | 29 | BBN_ALL = "%s/all.txt" % BBN_DATA_PATH 30 | BBN_TRAIN = "%s/train.txt" % BBN_DATA_PATH 31 | BBN_VALID = "%s/dev.txt" % BBN_DATA_PATH 32 | BBN_TEST = "%s/test.txt" % BBN_DATA_PATH 33 | BBN_TRAIN_CLEAN = "%s/train_clean.tsv" % BBN_DATA_PATH 34 | BBN_TEST_CLEAN = "%s/test_clean.tsv" % BBN_DATA_PATH 35 | BBN_TYPE = "%s/type.pkl" % BBN_DATA_PATH 36 | 37 | # --------------------- PARAM ----------------------- 38 | 39 | MAX_DOCUMENT_LENGTH = 30 40 | 41 | MENTION_SIZE = 15 42 | 43 | WINDOW_SIZE = 10 44 | 45 | RANDOM_SEED = 2017 -------------------------------------------------------------------------------- /data/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "Downloading word embeddings..." 3 | wget http://nlp.stanford.edu/data/glove.840B.300d.zip 4 | unzip glove.840B.300d.zip 5 | rm glove.840B.300d.zip -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from optparse import OptionParser 2 | from task import Task 3 | import logging 4 | from utils import logging_utils 5 | from model_param_space import param_space_dict 6 | import datetime 7 | import config 8 | 9 | def parse_args(parser): 10 | parser.add_option("-m", "--model", dest="model_name", type="string") 11 | parser.add_option("-d", "--data", dest="data_name", type="string") 12 | parser.add_option("-p", "--portion", dest="portion", type=int,default=100) 13 | parser.add_option("-a", "--alpha", dest="alpha", type=float,default=0.) 
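    # Example invocation (model/data names from the README; the -o/-g/-i options defined below keep their defaults):
    #   python eval.py -m nfetc_bbn_NFETC_CLSC -d bbn -r 5 -p 100 -a 0.0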
14 | parser.add_option("-o", "--savename", dest="save_name", type="string",default='') 15 | parser.add_option("-r", "--runs", dest="runs", type="int", default=5) 16 | parser.add_option("-g", "--getfeature", dest="get_features", default=False, action="store_true")#getfeature 17 | parser.add_option("-i", "--ifretraining", default=False, action="store_true") 18 | options, args = parser.parse_args() 19 | return options, args 20 | 21 | def main(options): 22 | time_str = datetime.datetime.now().isoformat() 23 | if len(options.save_name) == 0: 24 | logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (options.model_name, 25 | options.data_name, time_str) 26 | else: 27 | logname = "Eval_[Model@%s]_[Data@%s]_%s.log" % (options.save_name, 28 | options.data_name, time_str) 29 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 30 | # else: 31 | # time_str = datetime.datetime.now().isoformat() 32 | # logname = "Final_[Model@%s]_[Data@%s]_%s.log" % (options.model_name, 33 | # options.data_name, time_str) 34 | # logger = logging_utils._get_logger(config.LOG_DIR, logname) 35 | # 36 | params_dict = param_space_dict[options.model_name] 37 | params_dict['alpha']=options.alpha 38 | task = Task(model_name=options.model_name, data_name=options.data_name, cv_runs=options.runs, 39 | params_dict=params_dict,logger=logger,portion=options.portion, 40 | save_name=options.save_name) 41 | 42 | print('-'*50+'refit'+'-'*50) 43 | task.refit() 44 | 45 | if __name__ == "__main__": 46 | parser = OptionParser() 47 | options, args = parse_args(parser) 48 | main(options) 49 | -------------------------------------------------------------------------------- /log/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *.swp 4 | utils/__pycache__/ 5 | data/corpus/* 6 | data/glove* 7 | 8 | 9 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | class Model(object): 2 | def add_placeholders(self): 3 | raise NotImplementedError("Each Model must re-implement this method.") 4 | 5 | def create_feed_dict(self, inputs_batch, labels_batch=None): 6 | raise NotImplementedError("Each Model must re-implement this method.") 7 | 8 | def add_prediction_op(self): 9 | raise NotImplementedError("Each Model must re-implement this method.") 10 | 11 | def add_loss_op(self, pred): 12 | raise NotImplementedError("Each Model must re-implement this method.") 13 | 14 | def add_training_op(self, loss): 15 | raise NotImplementedError("Each Model must re-implement this method.") 16 | 17 | def train_on_batch(self, sess, inputs_batch, labels_batch): 18 | feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch) 19 | _, loss = sess.run([self.train_op, self.loss], feed_dict=feed) 20 | return loss 21 | 22 | def predict_on_batch(self, sess, inputs_batch): 23 | feed = self.create_feed_dict(inputs_batch) 24 | predictions = sess.run(self.pred, feed_dict=feed) 25 | return predictions 26 | 27 | def build(self): 28 | self.add_placeholders() 29 | self.add_prediction_op() 30 | self.add_loss_op() 31 | self.add_training_op() 32 | -------------------------------------------------------------------------------- /model_param_space.py: -------------------------------------------------------------------------------- 1 | 2 | param_space_nfetc_ontonotes_NFETC_CLSC={ 3 | "wpe_dim": 70, 4 | "hidden_layers": 2, 5 | "hidden_size": 700, 6 | "dense_keep_prob": 0.7, 7 | 
"rnn_keep_prob": 0.6, 8 | "num_epochs": 20, 9 | "makchainlabel": 200, 10 | 'measureway': 'dot-product', 11 | "lr": 0.0006, 12 | "state_size": 1000, 13 | "l2_reg_lambda": 0.000, 14 | "batch_size": 512, 15 | "alpha": 0.25, 16 | 'useCCLPloss':True, 17 | 'sslloss':True, 18 | 'hier':True, 19 | 'cclpvar':2.0, 20 | 'makchainfeature':9.0, 21 | 'filterdata':True, 22 | 'bn':False, 23 | } 24 | 25 | 26 | param_space_nfetc_bbn_NFETC_CLSC={ 27 | "lr": 0.0007,#learning rate 28 | "state_size": 1000,# LSTM dim 29 | "l2_reg_lambda": 0.000,# l2 factor 30 | "alpha": 0.0,# control the hier loss 31 | 'useCCLPloss':True,# use the CLSC or not 32 | 'cclpvar':1.5, # the CLSC factor 33 | 'makchainfeature':13,# the max length of Markov chain 34 | "wpe_dim": 40,#position embedding dim 35 | "hidden_layers":1,#number of hidden layer of the classifier 36 | "hidden_size": 560,#hidden layer dim of the classifier 37 | "dense_keep_prob": 0.3,#dense dropout rate of the feature extractor 38 | 'rnn_dense_dropout':0.3,#useless 39 | "rnn_keep_prob": 1.0,# rnn output droput rate 40 | "batch_size": 512,# as the name 41 | "num_epochs": 20,# as the name 42 | "makchainlabel":200,# max time step of label propagation 43 | 'measureway':'dot-product',# the measurement of the distance between samples in the latent space 44 | } 45 | 46 | param_space_dict = { 47 | "nfetc_ontonotes_NFETC_CLSC":param_space_nfetc_ontonotes_NFETC_CLSC,# the best hp for NFETC_CLSC in OntoNotes 48 | 'nfetc_bbn_NFETC_CLSC':param_space_nfetc_bbn_NFETC_CLSC,# the best hp for NFETC_CLSC in BBN 49 | } 50 | 51 | int_params = [ 52 | "wpe_dim", "state_size", "batch_size", "num_epochs", "hidden_size", "hidden_layers","sedim","selayer" 53 | ] 54 | 55 | class ModelParamSpace: 56 | def __init__(self, learner_name): 57 | s = "Wrong learner name!" 
58 | assert learner_name in param_space_dict, s 59 | self.learner_name = learner_name 60 | 61 | def _build_space(self): 62 | return param_space_dict[self.learner_name] 63 | 64 | def _convert_into_param(self, param_dict): 65 | if isinstance(param_dict, dict): 66 | for k, v in param_dict.items(): 67 | if k in int_params: 68 | param_dict[k] = int(v) 69 | elif isinstance(v, list) or isinstance(v, tuple): 70 | for i in range(len(v)): 71 | self._convert_into_param(v[i]) 72 | elif isinstance(v, dict): 73 | self._convert_into_param(v) 74 | return param_dict 75 | -------------------------------------------------------------------------------- /nfetc_clsc.py: -------------------------------------------------------------------------------- 1 | from model import Model 2 | import tensorflow as tf 3 | from utils import data_utils, prior_utils 4 | from utils import eval_utils 5 | import numpy as np 6 | import config 7 | import pickle 8 | from functools import reduce 9 | 10 | 11 | tf.set_random_seed(seed=config.RANDOM_SEED) 12 | 13 | 14 | class NFETC(Model): 15 | def __init__(self, sequence_length, mention_length, num_classes, vocab_size, 16 | embedding_size, position_size, pretrained_embedding, wpe, type_info, hparams): 17 | self.sequence_length = sequence_length 18 | self.mention_length = mention_length 19 | self.num_classes = num_classes 20 | self.vocab_size = vocab_size 21 | self.embedding_size = embedding_size 22 | self.position_size = position_size 23 | self.pretrained_embedding = pretrained_embedding 24 | self.wpe = wpe 25 | 26 | self.state_size = hparams.state_size 27 | self.hidden_layers = hparams.hidden_layers 28 | self.hidden_size = hparams.hidden_size 29 | self.wpe_dim = hparams.wpe_dim 30 | self.l2_reg_lambda = hparams.l2_reg_lambda 31 | self.lr = hparams.lr 32 | 33 | self.dense_keep_prob = hparams.dense_keep_prob 34 | self.rnn_keep_prob = hparams.rnn_keep_prob 35 | 36 | self.rnn_dense_dropoutkeeper=self.dense_keep_prob 37 | self.hp=hparams 38 | self.batch_size = hparams.batch_size 39 | self.num_epochs = hparams.num_epochs 40 | #self.bn=hparams.bn 41 | 42 | self.prior = tf.Variable(prior_utils.create_prior(type_info), trainable=False, dtype=tf.float32, 43 | name="prior") # all one;no alpha 44 | self.alpha=hparams.alpha 45 | 46 | self.useCCLPloss=hparams.useCCLPloss 47 | 48 | self.makchainTimeForlabel=hparams.makchainlabel#directly use prob not propagate 49 | self.makchainTimeForfeature=hparams.makchainfeature 50 | 51 | #self.sslloss=hparams.sslloss 52 | self.measureway=hparams.measureway 53 | 54 | #self.filterdata=hparams.filterdata 55 | 56 | self.tune = tf.Variable(np.transpose(prior_utils.create_prior(type_info, hparams.alpha)), trainable=False, 57 | dtype=tf.float32, name="tune") # tr 之后,每个节点算的就是整个path的penelty 概率 58 | 59 | self.global_step = tf.Variable(0, name="global_step", trainable=False) 60 | self.build() 61 | 62 | def add_placeholders(self): 63 | self.input_words = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_words") 64 | self.input_textlen = tf.placeholder(tf.int32, [None], name="input_textlen") 65 | self.input_mentions = tf.placeholder(tf.int32, [None, self.mention_length], name="input_mentions") 66 | self.input_mentionlen = tf.placeholder(tf.int32, [None], name="input_mentionlen") 67 | self.input_positions = tf.placeholder(tf.int32, [None, self.sequence_length], name="input_positions") 68 | self.input_labels = tf.placeholder(tf.float32, [None, self.num_classes], name="input_labels") 69 | self.cclpvar = tf.placeholder(tf.float32, name="cclpvar") 70 | 71 | 
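        # Rough shape notes: input_words and input_positions are [batch, sequence_length],
        # input_mentions is [batch, mention_length], input_labels is a (possibly multi-hot)
        # [batch, num_classes] matrix, and cclpvar is the scalar weight on the CLSC loss term fed at train time.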
self.phase = tf.placeholder(tf.bool, name="phase") 72 | 73 | self.dense_dropout = tf.placeholder(tf.float32, name="dense_dropout") 74 | 75 | self.rnn_dropout = tf.placeholder(tf.float32, name="rnn_dropout") 76 | self.rnn_dense_dropout = tf.placeholder(tf.float32, name="rnn_dense_dropout") 77 | self.words = tf.placeholder(tf.float32, name="words") 78 | self.movingmax=tf.Variable(0.0,trainable=False) 79 | tmp = [i for i in range(self.mention_length)] 80 | tmp[0] = self.mention_length 81 | interval = tf.Variable(tmp, trainable=False) 82 | interval_row = tf.expand_dims(interval, 0) 83 | upper = tf.expand_dims(self.input_mentionlen - 1, 1) 84 | mask = tf.less(interval_row, upper) 85 | self.mention = tf.where(mask, self.input_mentions, tf.zeros_like(self.input_mentions)) 86 | self.mentionlen = tf.reduce_sum(tf.cast(mask, tf.int32), axis=-1) 87 | self.mentionlen = tf.cast( 88 | tf.where(tf.not_equal(self.mentionlen, tf.zeros_like(self.mentionlen)), self.mentionlen, 89 | tf.ones_like(self.mentionlen)), tf.float32) 90 | self.mentionlen = tf.expand_dims(self.mentionlen, 1) 91 | 92 | def create_feed_dict(self, input_words, input_textlen, input_mentions, input_mentionlen, input_positions, 93 | input_labels=None, phase=False, dense_dropout=1., rnn_dropout=1.,rnn_dense_dropout=1.,cclpvar=None): 94 | #print(rnn_dense_dropout) 95 | feed_dict = { 96 | self.input_words: input_words, 97 | self.input_textlen: input_textlen, 98 | self.input_mentions: input_mentions, 99 | self.input_mentionlen: input_mentionlen, 100 | self.input_positions: input_positions, 101 | self.phase: phase, 102 | self.dense_dropout: dense_dropout, 103 | self.rnn_dropout: rnn_dropout, 104 | self.rnn_dense_dropout:rnn_dense_dropout, 105 | } 106 | feed_dict[self.cclpvar] =0 107 | if input_labels is not None: 108 | feed_dict[self.input_labels] = input_labels 109 | if cclpvar is not None: 110 | feed_dict[self.cclpvar] = cclpvar 111 | return feed_dict 112 | 113 | 114 | def add_embedding(self): 115 | with tf.device('/cpu:0'), tf.name_scope("word_embedding"): 116 | W = tf.Variable(self.pretrained_embedding, trainable=False, dtype=tf.float32, name="W") 117 | self.embedded_words = tf.nn.embedding_lookup(W, self.input_words) 118 | self.embedded_mentions = tf.nn.embedding_lookup(W, self.input_mentions) 119 | self.mention_embedding = tf.divide(tf.reduce_sum(tf.nn.embedding_lookup(W, self.mention), 120 | axis=1),self.mentionlen) 121 | 122 | with tf.device('/cpu:0'), tf.name_scope("position_embedding"): 123 | W = tf.Variable(self.wpe, trainable=False, dtype=tf.float32, name="W") 124 | self.wpe_chars = tf.nn.embedding_lookup(W, self.input_positions) 125 | self.input_sentences = tf.concat([self.embedded_words, self.wpe_chars], 2) 126 | 127 | 128 | def add_hidden_layer(self, x, idx): 129 | dim = self.feature_dim if idx == 0 else self.hidden_size 130 | with tf.variable_scope("hidden_%d" % idx): 131 | W = tf.get_variable("W", shape=[dim, self.hidden_size], 132 | initializer=tf.contrib.layers.variance_scaling_initializer( 133 | seed=config.RANDOM_SEED,factor=2.0)) 134 | b = tf.get_variable("b", shape=[self.hidden_size], 135 | initializer=tf.contrib.layers.variance_scaling_initializer( 136 | seed=config.RANDOM_SEED,factor=2.0)) 137 | h = tf.nn.xw_plus_b(x, W, b) 138 | h_drop = tf.nn.dropout(tf.nn.relu(h), self.dense_dropout, seed=config.RANDOM_SEED) 139 | return h_drop 140 | 141 | def extract_last_relevant(self, outputs, seq_len): 142 | batch_size = tf.shape(outputs)[0] 143 | max_length = int(outputs.get_shape()[1]) 144 | num_units = 
int(outputs.get_shape()[2]) 145 | index = tf.range(0, batch_size) * max_length + (seq_len - 1) 146 | flat = tf.reshape(outputs, [-1, num_units]) 147 | relevant = tf.gather(flat, index) 148 | return relevant 149 | 150 | def add_prediction_op(self): 151 | self.add_embedding() 152 | self.lossesmask = 1 - self.boolize(tf.reduce_sum(self.input_labels, axis=1, keepdims=True, name='lmsk'), 153 | threhold=1.1) 154 | 155 | self.numclean = tf.clip_by_value(tf.cast( 156 | tf.reduce_sum(self.lossesmask), dtype=tf.float32), 157 | clip_value_min=1e-10, clip_value_max=1000000)#一个batch 里面clean data的数量 158 | self.allonemasknoisy = tf.tile(self.lossesmask, multiples=[1, self.num_classes])# clean data 为 1* num_classes 的tensor 159 | self.lossesmaskMatrix = self.allonemasknoisy * self.input_labels 160 | 161 | self.bsize = tf.shape(self.embedded_mentions)[0] 162 | with tf.name_scope("sentence_repr"): 163 | attention_w = tf.get_variable("attention_w", [self.state_size, 1], 164 | initializer=tf.contrib.layers.variance_scaling_initializer(seed=config.RANDOM_SEED,factor=2.0)) 165 | cell_forward = tf.contrib.rnn.LSTMCell(self.state_size) 166 | cell_backward = tf.contrib.rnn.LSTMCell(self.state_size) 167 | cell_forward = tf.contrib.rnn.DropoutWrapper(cell_forward, input_keep_prob=self.rnn_dense_dropout, 168 | output_keep_prob=self.rnn_dropout, seed=config.RANDOM_SEED) 169 | cell_backward = tf.contrib.rnn.DropoutWrapper(cell_backward, input_keep_prob=self.rnn_dense_dropout, 170 | output_keep_prob=self.rnn_dropout, seed=config.RANDOM_SEED) 171 | 172 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 173 | cell_forward, cell_backward, self.input_sentences, 174 | sequence_length=self.input_textlen, dtype=tf.float32) 175 | outputs_added = tf.nn.relu(tf.add(outputs[0], outputs[1])) 176 | alpha = tf.nn.softmax(tf.reshape(tf.matmul( 177 | tf.reshape(outputs_added, [-1, self.state_size]), 178 | attention_w), 179 | [-1, self.sequence_length])) 180 | alpha = tf.expand_dims(alpha, 1) 181 | self.sen_repr = tf.reshape(tf.squeeze(tf.matmul(alpha, outputs_added)),[self.bsize,self.state_size]) 182 | 183 | with tf.name_scope("mention_repr"): 184 | cell = tf.contrib.rnn.LSTMCell(self.state_size) 185 | cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=self.rnn_dense_dropout, 186 | output_keep_prob=self.rnn_dropout, seed=config.RANDOM_SEED) 187 | 188 | outputs, states = tf.nn.dynamic_rnn( 189 | cell, self.embedded_mentions, 190 | sequence_length=self.input_mentionlen, dtype=tf.float32) 191 | self.men_repr = self.extract_last_relevant(outputs, self.input_mentionlen) 192 | 193 | self.features = tf.concat([self.sen_repr,self.men_repr,self.mention_embedding], -1) 194 | self.feature_dim = self.state_size * 2 + self.embedding_size 195 | 196 | h_bn = self.features 197 | hcclp = tf.nn.relu(h_bn) 198 | h_output = tf.nn.dropout(hcclp, self.dense_dropout, seed=config.RANDOM_SEED) 199 | 200 | #get representation layer 201 | for i in range(self.hidden_layers): 202 | h_output = self.add_hidden_layer(h_output, i) 203 | if self.hidden_layers == 0: 204 | self.hidden_size = self.feature_dim 205 | 206 | with tf.variable_scope("typeVec",reuse=tf.AUTO_REUSE): 207 | W = tf.get_variable("W", shape=[ self.hidden_size,self.num_classes], 208 | initializer=tf.contrib.layers.variance_scaling_initializer(seed=config.RANDOM_SEED,factor=2.0)) # hidden size= 660 209 | b = tf.get_variable("b", shape=[self.num_classes], 210 | initializer=tf.contrib.layers.variance_scaling_initializer(seed=config.RANDOM_SEED,factor=2.0)) 211 | 212 | self.scores = 
tf.nn.xw_plus_b(h_output, W, b, name="scores") # [batch,num class] 213 | 214 | #将input label的父节点都找到 215 | self.labelpth=self.boolize(tf.matmul(self.input_labels, self.prior), threhold=0.0) 216 | 217 | #不加hier的概率 218 | self.proba =tf.clip_by_value( tf.nn.softmax(self.scores, axis=1),1e-10,1) 219 | #加了hier的概率 220 | self.adjusted_proba = tf.matmul(self.proba, self.tune) 221 | self.adjusted_proba = tf.clip_by_value(self.adjusted_proba, 1e-10, 1, name='adprob') 222 | 223 | # unleaked ori props 224 | self.maxtype = tf.argmax(self.proba, 1, name="maxtype") 225 | self.predictions = tf.one_hot(self.maxtype, self.num_classes, name='prediction') 226 | 227 | p = self.distanceMeasure(h_output, measureway=self.measureway) 228 | 229 | if self.useCCLPloss: 230 | choicematrix = self.input_labels 231 | self.LP_post,self.cclploss=self.calculateCClP(p,choicematrix,featurestep=self.makchainTimeForfeature) 232 | self.fi=tf.stop_gradient(self.LP_post) 233 | 234 | def calculateCClP(self,p,choicematrix,featurestep): 235 | H = tf.nn.softmax(p, axis=1, name='transMat') # vote with himself 236 | 237 | # random init 238 | fi = tf.stop_gradient( 239 | tf.random_uniform(name='fi', 240 | shape=[self.bsize, self.num_classes], 241 | minval=0.0001, maxval=1.0, 242 | seed=config.RANDOM_SEED)) 243 | fi = tf.div(fi , tf.tile(tf.reduce_sum(fi, axis=-1, keepdims=True), 244 | multiples=[1, self.num_classes]),name='oriP') 245 | 246 | # loop exit function 247 | cond = lambda fi_, distance, i: i < self.makchainTimeForlabel 248 | 249 | # loop body of LP 250 | def body(fi_, distance, i): 251 | fi_ = tf.matmul(H, fi_) * choicematrix 252 | fi_ = tf.div(fi_, 253 | tf.tile(tf.reduce_sum(fi_, axis=-1, keepdims=True), 254 | multiples=[1, self.num_classes])) 255 | i += 1 256 | return (fi_, distance, i) 257 | 258 | # loop of LP 259 | fi, _, _ = tf.while_loop(cond=cond, body=body, loop_vars=(fi, H, 0.0)) 260 | 261 | # calculate the mass of each types 262 | m = tf.reduce_sum(fi, axis=0, keep_dims=True,name='maskM') 263 | fij = tf.div(fi, tf.tile(tf.clip_by_value(m, 1e-10, 1), multiples=[tf.shape(H)[0], 1])) 264 | 265 | # T is the desirable transition matrix 266 | T = tf.matmul(fi, tf.transpose(fij), name='desTransitionMatrix') 267 | 268 | # Transition masking matrix 269 | M = tf.matmul(fi, tf.transpose(fi)) 270 | 271 | cclpLoss = 0.0 272 | 273 | # L clsc 274 | Hs = H 275 | # loop exit 276 | cclpcond = lambda H, M, Hs, i, cclpLoss,noisymask,numclean: i < featurestep 277 | 278 | i = tf.Variable(1.0, trainable=False, dtype=tf.float32) 279 | 280 | # loop body 281 | def cclpbody(H, M, Hs, i, cclpLoss,noisymask,numclean): 282 | clscKLmat=T * tf.log(tf.clip_by_value(Hs, 1e-10, 1)) 283 | m = tf.reduce_mean(tf.reduce_mean(clscKLmat)) 284 | Hs = tf.matmul((H * M), Hs) 285 | cclpLoss -= tf.div(m, i) 286 | i = i + 1 287 | return (H, M, Hs, i, cclpLoss,noisymask,numclean) 288 | 289 | H, M, Hs, i, cclpLoss,_,_ = tf.while_loop(cond=cclpcond, body=cclpbody, 290 | loop_vars=(H, M, Hs, i, cclpLoss, 291 | tf.tile(self.lossesmask, multiples=[1, self.bsize]), 292 | self.numclean)) 293 | # return LP pro and cclploss 294 | return fi,cclpLoss 295 | 296 | 297 | def distanceMeasure(self,h_output,measureway='dot-product'): 298 | ''' 299 | 距离矩阵计算方式,目前定义了 dot-product,scale dot-product,cos 300 | ''' 301 | if measureway=='dot-product': 302 | hp=h_output 303 | distance=tf.matmul(hp, tf.transpose(h_output)) / tf.sqrt(tf.cast(tf.shape(h_output)[1],dtype=tf.float32)) 304 | 305 | elif measureway=='dot-product-noscale': 306 | distance=tf.matmul(h_output, tf.transpose(h_output)) 
307 | elif measureway=='cosine': 308 | mod=tf.sqrt(tf.reduce_sum(h_output * h_output, axis=1,keep_dims=True)) 309 | hihj=tf.matmul(h_output, tf.transpose(h_output)) 310 | hi2hj2=tf.tile(mod,multiples=[1,tf.shape(mod)[0]])*tf.tile(tf.transpose(mod),multiples=[tf.shape(mod)[0],1]) 311 | coshij=tf.div(hihj,hi2hj2) 312 | distance=tf.nn.softmax(-coshij,axis=-1) 313 | else: 314 | assert False,'you must define distance function' 315 | 316 | return distance 317 | 318 | #用于bool化矩阵 319 | def boolize(self, item, threhold): 320 | return tf.cast(tf.greater(item, threhold), dtype=tf.float32) 321 | 322 | def add_loss_op(self): 323 | 324 | with tf.name_scope("loss"): 325 | self.comp = tf.Variable(0.0) 326 | 327 | proba=self.adjusted_proba 328 | 329 | #clean data loss function 330 | numclean=self.numclean 331 | 332 | losses = -tf.reduce_sum(tf.reduce_sum(tf.multiply(self.lossesmaskMatrix, 333 | tf.log(tf.clip_by_value(proba, 1e-10, 1), 334 | name='labelagreeprob'),name='ssloss'),axis=1))\ 335 | /numclean 336 | 337 | self.suploss = losses 338 | 339 | 340 | if self.useCCLPloss: 341 | print('use cclploss') 342 | losses += self.cclpvar*self.cclploss # use cclp loss 343 | 344 | self.l2_loss = tf.contrib.layers.apply_regularization( 345 | regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg_lambda), 346 | weights_list=tf.trainable_variables()) 347 | self.loss = losses + self.l2_loss 348 | 349 | def add_training_op(self): 350 | 351 | optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) 352 | self.grads_and_vars = optimizer.compute_gradients(self.loss) 353 | 354 | extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 355 | with tf.control_dependencies(extra_update_ops): 356 | self.train_op = optimizer.apply_gradients(self.grads_and_vars, global_step=self.global_step) 357 | 358 | def train_on_batch(self, sess, input_words, input_textlen, input_mentions, 359 | input_mentionlen, input_positions,input_labels,cclpvar=None): 360 | feed = self.create_feed_dict(input_words, input_textlen, input_mentions, input_mentionlen, input_positions, 361 | input_labels, True, self.dense_keep_prob, self.rnn_keep_prob, 362 | rnn_dense_dropout=self.rnn_dense_dropoutkeeper, 363 | cclpvar=cclpvar) 364 | Variablelist=[self.train_op, self.global_step, self.loss,self.l2_loss,self.suploss] 365 | Variablename=['_','step','loss','L2_Loss','CCE_loss'] 366 | 367 | if self.useCCLPloss: 368 | Variablelist.append(self.cclploss) 369 | Variablename.append('Clsc_Loss') 370 | a= sess.run( 371 | Variablelist, 372 | feed_dict=feed) 373 | step=a[1] 374 | if step: 375 | outpair=list(zip(Variablename,a)) 376 | outstring='' 377 | for k,v in outpair: 378 | if k=='_': 379 | continue 380 | outstring+=k+': '+str(v)[:8]+' ' 381 | print(outstring) 382 | 383 | def get_scores(self, preds, labels,id2type): 384 | label_path=eval_utils.label_path 385 | if type(preds) == np.ndarray: 386 | preds = [[label_path(id2type[i]) for i, x in enumerate(line) if x > 0] for line in preds] 387 | preds = [list(set(reduce(lambda x, y: x + y, line))) for line in preds] 388 | else: 389 | preds = [label_path(id2type[x]) for x in preds] 390 | 391 | def vec2type(v): 392 | s = [] 393 | for i in range(len(v)): 394 | if v[i]: 395 | s.extend(label_path(id2type[i])) 396 | return set(s) 397 | labels_test = [vec2type(x) for x in labels] # path will caculate the father node for strict acc 398 | acc = eval_utils.strict(labels_test, preds) 399 | _, _, macro = eval_utils.loose_macro(labels_test, preds) 400 | _, _, micro = eval_utils.loose_micro(labels_test, preds) 401 | 402 | return 
acc, macro, micro 403 | 404 | def predict(self, sess, test): 405 | batches = data_utils.batch_iter(test, self.batch_size, 1, shuffle=False) 406 | all_predictions = [] 407 | all_labels = [] 408 | all_maxtype = [] 409 | for batch in batches: 410 | words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch= zip(*batch) 411 | 412 | feed = self.create_feed_dict(words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch) 413 | batch_predictions, batchmaxtype = sess.run([self.predictions, self.maxtype], feed_dict=feed) 414 | if len(all_predictions) == 0: 415 | all_predictions = batch_predictions 416 | else: 417 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 418 | if len(all_maxtype) == 0: 419 | all_maxtype = batchmaxtype 420 | else: 421 | all_maxtype = np.concatenate([all_maxtype, batchmaxtype]) 422 | 423 | if len(all_labels) == 0: 424 | all_labels = np.array(labels_batch) 425 | else: 426 | all_labels = np.concatenate([all_labels, np.array(labels_batch)]) 427 | return all_predictions, all_maxtype 428 | 429 | def evaluate(self, sess, train, test): 430 | print('begin training') 431 | train_batches = data_utils.batch_iter(train, self.batch_size, self.num_epochs) 432 | 433 | data_size = len(train) 434 | num_batches_per_epoch = int((data_size - 1) / self.batch_size) + 1 435 | epoch=0 436 | for batch in train_batches: 437 | words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch= zip(*batch) 438 | self.train_on_batch(sess, words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, 439 | labels_batch, cclpvar=self.hp.cclpvar) 440 | current_step = tf.train.global_step(sess, self.global_step) 441 | if current_step % num_batches_per_epoch == 0: 442 | epoch+=1 443 | yield self.predict(sess, test) 444 | 445 | -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *.swp 4 | utils/__pycache__/ 5 | data/corpus/* 6 | data/glove* 7 | 8 | 9 | -------------------------------------------------------------------------------- /pkl/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *.swp 4 | utils/__pycache__/ 5 | data/corpus/* 6 | data/glove* 7 | 8 | 9 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from optparse import OptionParser 3 | from utils import embedding_utils, data_utils, pkl_utils 4 | import config 5 | import os 6 | import numpy as np 7 | import pandas as pd 8 | 9 | def parse_args(parser): 10 | parser.add_option("-m", "--model", dest="model_name", type="string") 11 | parser.add_option("--input", dest="input_file", type="string", default="") 12 | parser.add_option("--output", dest="output_file", type="string") 13 | parser.add_option("-e", dest="embedding", default=False, action="store_true") 14 | options, args = parser.parse_args() 15 | return options, args 16 | 17 | def get_types(model_name, input_file, output_file): 18 | checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name) 19 | type2id, typeDict = pkl_utils._load(config.WIKI_TYPE) 20 | id2type = {type2id[x]:x for x in type2id.keys()} 21 | 22 | df = pd.read_csv(input_file, sep="\t", names=["r", "e1", "x1", "y1", "e2", 
"x2", "y2", "s"]) 23 | n = df.shape[0] 24 | words1 = np.array(df.s) 25 | mentions1 = np.array(df.e1) 26 | positions1 = np.array([[x, y] for x, y in zip(df.x1, df.y1+1)]) 27 | words2 = np.array(df.s) 28 | mentions2 = np.array(df.e2) 29 | positions2 = np.array([[x, y] for x, y in zip(df.x2, df.y2+1)]) 30 | 31 | words = np.concatenate([words1, words2]) 32 | mentions = np.concatenate([mentions1, mentions2]) 33 | positions = np.concatenate([positions1, positions2]) 34 | 35 | embedding = embedding_utils.Embedding.restore(checkpoint_file) 36 | 37 | textlen = np.array([embedding.len_transform1(x) for x in words]) 38 | words = np.array([embedding.text_transform1(x) for x in words]) 39 | mentionlen = np.array([embedding.len_transform2(x) for x in mentions]) 40 | mentions = np.array([embedding.text_transform2(x) for x in mentions]) 41 | positions = np.array([embedding.position_transform(x) for x in positions]) 42 | labels = np.zeros(2*n) 43 | test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels)) 44 | 45 | graph = tf.Graph() 46 | with graph.as_default(): 47 | sess = tf.Session() 48 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 49 | saver.restore(sess, checkpoint_file) 50 | 51 | input_words = graph.get_operation_by_name("input_words").outputs[0] 52 | input_textlen = graph.get_operation_by_name("input_textlen").outputs[0] 53 | input_mentions = graph.get_operation_by_name("input_mentions").outputs[0] 54 | input_mentionlen = graph.get_operation_by_name("input_mentionlen").outputs[0] 55 | input_positions = graph.get_operation_by_name("input_positions").outputs[0] 56 | phase = graph.get_operation_by_name("phase").outputs[0] 57 | dense_dropout = graph.get_operation_by_name("dense_dropout").outputs[0] 58 | rnn_dropout = graph.get_operation_by_name("rnn_dropout").outputs[0] 59 | 60 | pred_op = graph.get_operation_by_name("output/predictions").outputs[0] 61 | batches = data_utils.batch_iter(test_set, 512, 1, shuffle=False) 62 | all_predictions = [] 63 | for batch in batches: 64 | words_batch, textlen_batch, mentions_batch, mentionlen_batch, positions_batch, labels_batch = zip(*batch) 65 | feed = { 66 | input_words: words_batch, 67 | input_textlen: textlen_batch, 68 | input_mentions: mentions_batch, 69 | input_mentionlen: mentionlen_batch, 70 | input_positions: positions_batch, 71 | phase: False, 72 | dense_dropout: 1.0, 73 | rnn_dropout: 1.0 74 | } 75 | batch_predictions = sess.run(pred_op, feed_dict=feed) 76 | all_predictions = np.concatenate([all_predictions, batch_predictions]) 77 | 78 | df["t1"] = all_predictions[:n] 79 | df["t2"] = all_predictions[n:] 80 | df["t1"] = df["t1"].map(id2type) 81 | df["t2"] = df["t2"].map(id2type) 82 | df.to_csv(output_file, sep="\t", header=False, index=False) 83 | 84 | def get_embeddings(model_name, output_file): 85 | checkpoint_file = os.path.join(config.CHECKPOINT_DIR, model_name) 86 | graph = tf.Graph() 87 | with graph.as_default(): 88 | sess = tf.Session() 89 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 90 | saver.restore(sess, checkpoint_file) 91 | 92 | embedding_op = graph.get_tensor_by_name("typeVec/W:0") 93 | type_embedding = sess.run(embedding_op) 94 | np.save(output_file, type_embedding) 95 | return type_embedding 96 | 97 | def main(options): 98 | if options.input_file != "": 99 | get_types(options.model_name, options.input_file, options.output_file) 100 | if options.embedding: 101 | #加载计算图获取W作为type emb 102 | get_embeddings(options.model_name, options.output_file) 103 | 104 | if __name__ == 
"__main__": 105 | parser = OptionParser() 106 | options, args = parse_args(parser) 107 | main(options) 108 | -------------------------------------------------------------------------------- /prepkl/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | *.swp 4 | utils/__pycache__/ 5 | data/corpus/* 6 | data/glove* 7 | 8 | 9 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.6.0 2 | hyperopt 3 | gensim 4 | sklearn 5 | pandas 6 | 7 | -------------------------------------------------------------------------------- /task.py: -------------------------------------------------------------------------------- 1 | from model_param_space import ModelParamSpace 2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval 3 | from optparse import OptionParser 4 | from utils import logging_utils, data_utils, embedding_utils, pkl_utils 5 | from utils.eval_utils import strict, loose_macro, loose_micro, label_path, complete_path 6 | import numpy as np 7 | from sklearn.model_selection import ShuffleSplit 8 | import os 9 | import config 10 | import datetime, pickle 11 | import tensorflow as tf 12 | from nfetc_clsc import NFETC 13 | 14 | class AttrDict(dict): 15 | def __init__(self, *args, **kwargs): 16 | super(AttrDict, self).__init__(*args, **kwargs) 17 | self.__dict__ = self 18 | 19 | class Task: 20 | def __init__(self, model_name, data_name, cv_runs, params_dict, logger, portion=100,save_name=''): 21 | print("Loading data...") 22 | if portion<=100:# all the data, portion% clean + all noisy 23 | self.portion = '-'+str(portion) if portion != 100 else '' 24 | else: 25 | portion/=100# only clean data, portion% clean 26 | self.portion='-'+str(int(portion))+'-clean' 27 | print('run task on: ', self.portion,' dataset: ',data_name) 28 | if data_name == "ontonotes": 29 | words_train, mentions_train, positions_train, labels_train = data_utils.load( 30 | config.ONTONOTES_TRAIN_CLEAN+self.portion) 31 | words, mentions, positions, labels = data_utils.load(config.ONTONOTES_TEST_CLEAN) 32 | type2id, typeDict = pkl_utils._load(config.ONTONOTES_TYPE) 33 | num_types = len(type2id) 34 | type_info = config.ONTONOTES_TYPE 35 | elif data_name == "bbn": 36 | words_train, mentions_train, positions_train, labels_train = data_utils.load( 37 | config.BBN_TRAIN_CLEAN+self.portion) 38 | words, mentions, positions, labels = data_utils.load(config.BBN_TEST_CLEAN) 39 | type2id, typeDict = pkl_utils._load(config.BBN_TYPE) 40 | num_types = len(type2id) 41 | type_info = config.BBN_TYPE 42 | else: 43 | assert False,'you have to specify the name of dataset with -d (ie. 
bbn/....)' 44 | self.model_name = model_name 45 | self.savename = save_name 46 | self.data_name = data_name 47 | self.cv_runs = cv_runs 48 | self.params_dict = params_dict 49 | self.hparams = AttrDict(params_dict) 50 | #self.hparams.alpha=alpha 51 | self.logger = logger 52 | 53 | self.id2type = {type2id[x]: x for x in type2id.keys()} 54 | 55 | def type2vec(types): # only terminal will be labeled 56 | tmp = np.zeros(num_types) 57 | for t in str(types).split(): 58 | if t in type2id.keys(): 59 | tmp[type2id[t]] = 1.0 60 | return tmp 61 | 62 | labels_train = np.array([type2vec(t) for t in labels_train]) # one hot vec' 63 | labels = np.array([type2vec(t) for t in labels]) 64 | 65 | tempname=self.data_name+config.testemb 66 | tempname=os.path.join(config.PKL_DIR,tempname) 67 | if os.path.exists(tempname): 68 | self.embedding = pickle.load(open(tempname, 'rb')) 69 | print('embedding load over') 70 | else: 71 | self.embedding = embedding_utils.\ 72 | Embedding.fromCorpus(config.EMBEDDING_DATA,list(words_train) + list(words), 73 | config.MAX_DOCUMENT_LENGTH, config.MENTION_SIZE) 74 | pickle.dump(self.embedding, open(tempname, 'wb')) 75 | print('embedding dump over') 76 | self.embedding.max_document_length=config.MAX_DOCUMENT_LENGTH 77 | 78 | print("Preprocessing data...") 79 | 80 | if True: 81 | textlen_train = np.array( 82 | [self.embedding.len_transform1(x) for x in words_train]) # with cut down len sequence 83 | words_train = np.array([self.embedding.text_transform1(x) for x in 84 | words_train]) # with cut down word id sequence and mask with zero 85 | mentionlen_train = np.array([self.embedding.len_transform2(x) for x in mentions_train]) # mention len 86 | mentions_train = np.array( 87 | [self.embedding.text_transform2(x) for x in mentions_train]) # mention text indexer 88 | positions_train = np.array( 89 | [self.embedding.position_transform(x) for x in positions_train]) # start ,end position 90 | print('get train data') 91 | 92 | textlen = np.array([self.embedding.len_transform1(x) for x in words]) 93 | words = np.array([self.embedding.text_transform1(x) for x in words]) # padding and cut down 94 | mentionlen = np.array([self.embedding.len_transform2(x) for x in mentions]) 95 | mentions = np.array([self.embedding.text_transform2(x) for x in mentions]) 96 | positions = np.array([self.embedding.position_transform(x) for x in positions]) 97 | print('get test data') 98 | # pickle.dump([textlen_train, words_train, mentionlen_train, mentions_train, positions_train, 99 | # textlen, words, mentionlen, mentions, positions 100 | # ], open(os.path.join(self.data_name + config.prep+self.portion, 'wb')) 101 | # print('dump preprocessed data to pkl over...') 102 | # else: 103 | # textlen_train, words_train, mentionlen_train, mentions_train, \ 104 | # positions_train, textlen, words, mentionlen, mentions, positions = pickle.load( 105 | # open(self.data_name + config.prep+self.portion, 'rb')) 106 | # print('load preprocessed data from pkl over...') 107 | 108 | #if True: 109 | ss = ShuffleSplit(n_splits=1, test_size=0.1, random_state=config.RANDOM_SEED) 110 | for test_index, valid_index in ss.split(np.zeros(len(labels)), labels): # 用index做划分 111 | textlen_test, textlen_valid = textlen[test_index], textlen[valid_index] 112 | words_test, words_valid = words[test_index], words[valid_index] 113 | mentionlen_test, mentionlen_valid = mentionlen[test_index], mentionlen[valid_index] 114 | mentions_test, mentions_valid = mentions[test_index], mentions[valid_index] 115 | positions_test, positions_valid = 
positions[test_index], positions[valid_index] 116 | labels_test, labels_valid = labels[test_index], labels[valid_index] 117 | 118 | self.train_set = list( 119 | zip(words_train, textlen_train, mentions_train, mentionlen_train, positions_train, labels_train,)) 120 | self.valid_set = list( 121 | zip(words_valid, textlen_valid, mentions_valid, mentionlen_valid, positions_valid, labels_valid,)) 122 | self.test_set = list(zip(words_test, textlen_test, mentions_test, mentionlen_test, positions_test, labels_test,)) 123 | 124 | self.full_test_set = list(zip(words, textlen, mentions, mentionlen, positions, labels,)) 125 | 126 | self.labels_test = labels_test 127 | self.labels = labels 128 | self.labels_valid = labels_valid 129 | 130 | self.num_types = num_types 131 | self.type_info = type_info 132 | self.logger.info("train set size:%d, test set size: %d" % (len(self.train_set), len(self.full_test_set))) 133 | 134 | self.model = self._get_model() 135 | self.saver = tf.train.Saver(tf.global_variables()) 136 | checkpoint_dir = os.path.abspath(config.CHECKPOINT_DIR) 137 | if not os.path.exists(checkpoint_dir): 138 | os.makedirs(checkpoint_dir) 139 | 140 | self.checkpoint_prefix = os.path.join(checkpoint_dir, self.__str__()) 141 | 142 | def __str__(self): 143 | return self.model_name + self.savename 144 | 145 | def _get_model(self): 146 | np.random.seed(config.RANDOM_SEED) 147 | #print(noisemask) 148 | kwargs = { 149 | "sequence_length": config.MAX_DOCUMENT_LENGTH, 150 | "mention_length": config.MENTION_SIZE, 151 | "num_classes": self.num_types, 152 | "vocab_size": self.embedding.vocab_size, 153 | "embedding_size": self.embedding.embedding_dim, 154 | "position_size": self.embedding.position_size, 155 | "pretrained_embedding": self.embedding.embedding, 156 | "wpe": np.random.random_sample((self.embedding.position_size, self.hparams.wpe_dim)), 157 | "type_info": self.type_info, 158 | "hparams": self.hparams, 159 | } 160 | if "nfetc" in self.model_name: 161 | return NFETC(**kwargs) 162 | else: 163 | raise AttributeError("Invalid model name!") 164 | 165 | def _print_param_dict(self, d, prefix=" ", incr_prefix=" "): 166 | for k, v in sorted(d.items()): 167 | if isinstance(v, dict): 168 | self.logger.info("%s%s:" % (prefix, k)) 169 | self.print_param_dict(v, prefix + incr_prefix, incr_prefix) 170 | else: 171 | self.logger.info("%s%s: %s" % (prefix, k, v)) 172 | 173 | def create_session(self): 174 | session_conf = tf.ConfigProto( 175 | intra_op_parallelism_threads=8, 176 | allow_soft_placement=True, 177 | log_device_placement=False) 178 | session_conf.gpu_options.allow_growth = True 179 | return tf.Session(config=session_conf) 180 | 181 | def get_scores(self, preds, target='fullset'): 182 | preds = [label_path(self.id2type[x]) for x in preds] 183 | #print(self.test_set[0]) 184 | def vec2type(v): 185 | s = [] 186 | for i in range(len(v)): 187 | if v[i]: 188 | s.extend(label_path(self.id2type[i])) 189 | return set(s) 190 | 191 | print('eval on ', target) 192 | if target == 'fullset': 193 | labels_test = [vec2type(x) for x in self.labels] # path will caculate the father node for strict acc 194 | else: 195 | labels_test = [vec2type(x) for x in self.labels_valid] 196 | words=[self.embedding.i2w(k[0]) for k in self.full_test_set] 197 | mentions = [self.embedding.i2w(k[2]) for k in self.full_test_set] 198 | acc = strict(labels_test, preds,oridata=(words,mentions),modelname=self.savename) 199 | _, _, macro = loose_macro(labels_test, preds) 200 | _, _, micro = loose_micro(labels_test, preds) 201 | return acc, macro, 
micro 202 | 203 | def refit(self): 204 | self.logger.info("Params") 205 | self._print_param_dict(self.params_dict) 206 | self.logger.info("Evaluation for each epoch") 207 | self.logger.info("\t\tEpoch\t\tAcc\t\tMacro\t\tMicro") 208 | sess = self.create_session() 209 | 210 | print('retraining times: ', self.cv_runs) 211 | sess.run(tf.global_variables_initializer()) 212 | 213 | maxbaseonvalid = () 214 | 215 | vaacclist = [] 216 | vamacrolist = [] 217 | vamicrolist = [] 218 | 219 | for i in range(self.cv_runs): 220 | if self.cv_runs > 1 and i !=0: 221 | print('reopen sess...') 222 | sess.close() 223 | sess = self.create_session() 224 | sess.run(tf.global_variables_initializer()) 225 | maxvaacc = -1 226 | epochs = 0 227 | 228 | for preds, maxtype in self.model.evaluate(sess, self.train_set, self.full_test_set): 229 | epochs += 1 230 | acc, macro, micro = self.get_scores(maxtype) 231 | vapreds, _ = self.model.predict(sess, self.valid_set) 232 | vaacc, vamacro, vamicro = self.get_scores(_, target='vatestset') 233 | # vaacc=round(vaacc,3) 234 | # vamacro=round(vamacro,3) 235 | # vamicro=round(vamicro,3) 236 | cmp=vaacc 237 | if cmp >= maxvaacc: 238 | maxvaacc = cmp 239 | maxbaseonvalid = (epochs, acc, macro, micro, maxvaacc) 240 | self.logger.info( 241 | "\tep\t%d\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % (epochs, vaacc, vamacro, vamicro, maxvaacc)) 242 | else: 243 | self.logger.info("\tep\t%d\t\t%.3f\t\t%.3f\t\t%.3f" % 244 | (epochs, vaacc, vamacro, vamicro)) 245 | 246 | vaacclist.append(maxbaseonvalid[1]) 247 | vamacrolist.append(maxbaseonvalid[2]) 248 | vamicrolist.append(maxbaseonvalid[3]) 249 | self.logger.info("\tMax\t%d\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f" % ( 250 | maxbaseonvalid[0], maxbaseonvalid[1], maxbaseonvalid[2], maxbaseonvalid[3], maxbaseonvalid[4])) 251 | 252 | #计算 验证集最大时,测试集的方差与均值 253 | meanvaacc = np.mean(vaacclist) 254 | meanvamacro = np.mean(vamacrolist) 255 | meanvamicro = np.mean(vamicrolist) 256 | stdvaacc = np.std(vaacclist) 257 | stdvamacro = np.std(vamacrolist) 258 | stdvamicro = np.std(vamicrolist) 259 | 260 | self.logger.info("\tCV\t%.1f±%.1f\t%.1f±%.1f\t%.1f±%.1f" 261 | % (meanvaacc*100, stdvaacc*100, meanvamacro*100, 262 | stdvamacro*100, meanvamicro*100, stdvamicro*100)) 263 | sess.close() 264 | 265 | def get_feature(self,dataset): 266 | if dataset=='train': 267 | ds=self.train_set 268 | elif dataset=='test': 269 | ds=self.full_test_set 270 | else: 271 | assert False,'you must give a test or training set ' 272 | checkpoint_file = self.checkpoint_prefix 273 | print('begin to reload model') 274 | sess = self.create_session() 275 | saver = tf.train.Saver() 276 | 277 | sess.run(tf.global_variables_initializer()) 278 | saver.restore(sess,checkpoint_file) 279 | self.model.getfeatures(sess,ds) 280 | 281 | def save(self, sess): 282 | 283 | path = self.saver.save(sess, self.checkpoint_prefix) 284 | self.embedding.save(self.checkpoint_prefix) 285 | print("Saved model to {}".format(path)) 286 | print('-' * 100) 287 | 288 | 289 | class TaskOptimizer: 290 | def __init__(self, model_name, data_name, cv_runs, max_evals, logger, cvonfull, savename='',portion=100): 291 | self.model_name = model_name 292 | self.data_name = data_name 293 | self.cv_runs = cv_runs 294 | self.max_evals = max_evals 295 | self.logger = logger 296 | self.cvonfull = cvonfull 297 | self.save_name = savename 298 | self.model_param_space = ModelParamSpace(self.model_name) # get the param dict via dict name 299 | self.portion=portion 300 | 301 | def _obj(self, param_dict): 302 | param_dict = 
self.model_param_space._convert_into_param(param_dict) 303 | self.task = Task(model_name=self.model_name,data_name= self.data_name, 304 | cv_runs=self.cv_runs, params_dict=param_dict, logger=self.logger, save_name=self.save_name,portion=self.portion) 305 | self.task.cv(self.cvonfull) 306 | tf.reset_default_graph() 307 | ret = { 308 | "loss": -self.task.eacc, 309 | # "attachments": { 310 | # "pacc": self.task.pacc, 311 | # # "eacc": self.task.eacc, 312 | # }, 313 | "status": STATUS_OK 314 | } 315 | return ret 316 | 317 | def run(self): 318 | trials = Trials() 319 | best = fmin(self._obj, self.model_param_space._build_space(), tpe.suggest, self.max_evals, trials) 320 | best_params = space_eval(self.model_param_space._build_space(), best) 321 | best_params = self.model_param_space._convert_into_param(best_params) 322 | trial_loss = np.asarray(trials.losses(), dtype=float) 323 | best_ind = np.argmin(trial_loss) 324 | best_loss = -trial_loss[best_ind] 325 | # best_pacc = trials.trial_attachments(trials.trials[best_ind])["pacc"] 326 | # best_eacc = trials.trial_attachments(trials.trials[best_ind])["eacc"] 327 | self.logger.info("-" * 50) 328 | self.logger.info("Best Exact Accuracy %.3f " % (best_loss,)) 329 | self.logger.info("Best Param:") 330 | self.task._print_param_dict(best_params) 331 | self.logger.info("-" * 50) 332 | 333 | 334 | def parse_args(parser): 335 | parser.add_option("-m", "--model", type="string", dest="model_name", default='nfetc') 336 | parser.add_option("-d", "--data", type="string", dest="data_name", default='wikim') 337 | parser.add_option("-e", "--eval", type="int", dest="max_evals", default=100) 338 | parser.add_option("-c", "--cv_runs", type="int", dest="cv_runs", default=3) 339 | parser.add_option("-p", "--portion", type="int", dest="portion", default=100) 340 | parser.add_option("-s", "--savename", type="string", dest="save_name", default='') 341 | parser.add_option("-f", "--cvfull", dest="cvfull", default=False, action='store_true') 342 | 343 | options, args = parser.parse_args() 344 | return options, args 345 | 346 | 347 | # python task.py -m -d -e -c 348 | def main(options): 349 | time_str = datetime.datetime.now().isoformat() 350 | logname = "[Model@%s]_[Data@%s]_[use@%s]_%s.log" % (options.model_name, options.data_name,options.save_name, time_str) 351 | logger = logging_utils._get_logger(config.LOG_DIR, logname) 352 | optimizer = TaskOptimizer(options.model_name, options.data_name, options.cv_runs, 353 | options.max_evals, logger, options.cvfull, options.save_name,options.portion) 354 | optimizer.run() 355 | 356 | 357 | if __name__ == "__main__": 358 | parser = OptionParser() 359 | options, args = parse_args(parser) 360 | main(options) 361 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | import config 5 | import tensorflow as tf 6 | import csv 7 | import math,copy 8 | def make_summary(value_dict): 9 | return tf.Summary(value=[tf.Summary.Value(tag=k, simple_value=v) for k,v in value_dict.items()]) 10 | def load(data_file): 11 | df = pd.read_csv(data_file, sep="\t", names=["p1", "p2", "words", "mentions", "types"],quoting= csv.QUOTE_NONE ) 12 | print('seq 0:\n')#第一个 13 | print('words:', df.words[0]) 14 | print('mentions:', df.mentions[0]) 15 | print('df.types:', df.types[0]) 16 | print('df.p1:', df.p1[0]) 17 | print('df.p2:', df.p2[0]) 18 | words = 
np.array(df.words) 19 | mentions = np.array(df.mentions) 20 | positions = np.array([[x, y] for x, y in zip(df.p1, df.p2)]) 21 | #print(df.types) 22 | labels = np.array(df.types) 23 | #print(labels) 24 | 25 | all_len=len(df.words) 26 | print('seq min:\n')#第n个 27 | print('words:', df.words[all_len//2]) 28 | print('mentions:', df.mentions[all_len//2]) 29 | print('df.types:', df.types[all_len//2]) 30 | print('df.p1:', df.p1[all_len//2]) 31 | print('df.p2:', df.p2[all_len//2]) 32 | return words, mentions, positions, labels 33 | 34 | def batch_iter(data, batch_size, num_epochs, shuffle=True,istraining=False): 35 | data = np.array(data) 36 | data_size = len(data) 37 | num_batches_per_epoch = int((data_size-1)/batch_size) + 1 38 | for epoch in range(num_epochs): 39 | if shuffle: 40 | np.random.seed(config.RANDOM_SEED) 41 | shuffle_indices = np.random.permutation(np.arange(data_size)) 42 | shuffled_data = data[shuffle_indices] 43 | else: 44 | shuffled_data = data 45 | for batch_num in range(num_batches_per_epoch): 46 | start_index = batch_num * batch_size 47 | end_index = min((batch_num + 1) * batch_size, data_size) 48 | yield shuffled_data[start_index:end_index] 49 | 50 | 51 | -------------------------------------------------------------------------------- /utils/embedding_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gensim 3 | import json 4 | 5 | class Embedding: 6 | def __init__(self, vocab_size, embedding_dim, word2id, id2word, embedding, 7 | max_document_length, position_size, mention_size): 8 | self.vocab_size = vocab_size 9 | self.embedding_dim = embedding_dim 10 | self.word2id = word2id 11 | self.id2word = id2word 12 | self.embedding = embedding 13 | self.max_document_length = max_document_length 14 | self.position_size = position_size 15 | self.mention_size = mention_size 16 | 17 | @classmethod 18 | def restore(cls, inpath): 19 | with open(inpath+"_args.json") as f: 20 | kwargs = json.load(f) 21 | embedding = np.load(inpath+"_embedding.npy") 22 | return cls(embedding=embedding, **kwargs) 23 | 24 | @classmethod 25 | def fromCorpus(cls, f, corpus, max_document_length, mention_size): 26 | if ".txt" in f or '.vec': 27 | model = gensim.models.KeyedVectors.load_word2vec_format(f, binary=False) 28 | else: 29 | model = gensim.models.KeyedVectors.load_word2vec_format(f, binary=True) 30 | print('glove file load over') 31 | wordSet = set(['"']) 32 | for sen in corpus: 33 | words = sen.split() 34 | for w in words: 35 | if w in model: 36 | wordSet.add(w) 37 | 38 | vocab_size = len(wordSet) 39 | print("%d unique tokens have been found!" 
40 |         embedding_dim = model.syn0.shape[1]  # syn0: raw embedding matrix in older gensim versions
41 |         # id 0 is reserved for the padding token (all-zero embedding);
42 |         # id 1 is reserved for out-of-vocabulary words (random embedding).
43 |         word2id = {"": 1}
44 |         id2word = {1: ""}
45 |         embedding = np.zeros((vocab_size+2, embedding_dim))
46 |
47 |         np.random.seed(0)
48 |         #embedding[0, :] = np.random.uniform(-1, 1, embedding_dim)
49 |         embedding[1, :] = np.random.uniform(-1, 1, embedding_dim)
50 |         for i, word in enumerate(wordSet):
51 |             word2id[word] = i+2
52 |             id2word[i+2] = word
53 |             embedding[i+2, :] = model[word]
54 |
55 |         kwargs = {}
56 |         kwargs["vocab_size"] = vocab_size + 2
57 |         kwargs["embedding_dim"] = embedding_dim
58 |         kwargs["word2id"] = word2id
59 |         kwargs["id2word"] = id2word
60 |         kwargs["embedding"] = embedding
61 |         kwargs["max_document_length"] = max_document_length
62 |         kwargs["position_size"] = max_document_length * 2 + 1
63 |         kwargs["mention_size"] = mention_size
64 |         return cls(**kwargs)
65 |
66 |     def i2w(self, idlist):
67 |         return ' '.join([self.id2word[i] for i in idlist if i != 0])
68 |
69 |     def _text_transform(self, s, maxlen):
70 |         if not isinstance(s, str):
71 |             s = ""
72 |         words = s.split()
73 |         vec = []
74 |         for w in words:
75 |             if w == "''":
76 |                 w = '"'
77 |             if w in self.word2id:
78 |                 vec.append(self.word2id[w])
79 |             else:
80 |                 #vec.append(np.random.choice(self.vocab_size-1, 1)[0]+1)
81 |                 vec.append(1)
82 |         for i in range(len(words), maxlen):
83 |             vec.append(0)
84 |         return vec[:maxlen]
85 |
86 |     def _len_transform(self, s, maxlen):
87 |         if not isinstance(s, str):
88 |             s = ""
89 |         length = len(s.split())
90 |         return min(length, maxlen)
91 |
92 |     def text_transform1(self, s):
93 |         return self._text_transform(s, self.max_document_length)
94 |
95 |     def len_transform1(self, s):
96 |         return self._len_transform(s, self.max_document_length)
97 |
98 |     def text_transform2(self, s):
99 |         return self._text_transform(s, self.mention_size)
100 |
101 |     def len_transform2(self, s):
102 |         return self._len_transform(s, self.mention_size)
103 |
104 |     def position_transform(self, s):
105 |         x, y = s[0], s[1]
106 |         y -= 1
107 |         vec = []
108 |         for i in range(self.max_document_length):
109 |             if i < x:
110 |                 vec.append(i-x)
111 |             elif i > y:
112 |                 vec.append(i-y)
113 |             else:
114 |                 vec.append(0)
115 |         vec = [np.clip(p+self.max_document_length, 0, self.position_size-1) for p in vec]
116 |         return vec
117 |
118 |     def save(self, outpath):
119 |         kwargs = {
120 |             "vocab_size": self.vocab_size,
121 |             "embedding_dim": self.embedding_dim,
122 |             "word2id": self.word2id,
123 |             "id2word": self.id2word,
124 |             "max_document_length": self.max_document_length,
125 |             "position_size": self.position_size,
126 |             "mention_size": self.mention_size,
127 |         }
128 |         with open(outpath+"_args.json", "w") as f:
129 |             json.dump(kwargs, f)
130 |         np.save(outpath+"_embedding.npy", self.embedding)
131 |
--------------------------------------------------------------------------------
/utils/eval_utils.py:
--------------------------------------------------------------------------------
1 | import copy,json,os
2 | def f1(p, r):
3 |     if p == 0. or r == 0.:
4 |         return 0.
5 |     return 2*p*r/(p+r)
6 |
7 | def label_path(t):
8 |     types = t.split("/")
9 |     if len(types) == 3:
10 |         return ["/"+types[1], t]
11 |     if len(types) == 4:
12 |         return ["/"+types[1], "/"+types[1]+"/"+types[2], t]
13 |     return [t]
14 |
15 | def complete_path(t):
16 |     v = []
17 |     for x in t:
18 |         v.extend(label_path(x))
19 |     return set(v)
20 |
21 | def strict(labels, predictions, oridata=None, modelname=''):
22 |     cnt = 0
23 |     if not modelname:
24 |         outfile = './answer/duibi_test.json'
25 |     else:
26 |         answerdir = os.path.join('answer', modelname)
27 |         if not os.path.isdir(answerdir):
28 |             os.makedirs(answerdir)
29 |         outfile = os.path.join(answerdir, 'result.json')
30 |     print("len:", len(labels), len(predictions))
31 |     for i, (label, pred) in enumerate(zip(labels, predictions)):
32 |         cnt += set(label) == set(pred)
33 |
34 |     acc = cnt/len(labels)
35 |     print("Strict Accuracy: %s" % acc)
36 |
37 |     return acc
38 |
39 | def loose_macro(labels, predictions):
40 |     p = 0.
41 |     r = 0.
42 |     for label, pred in zip(labels, predictions):
43 |         label = set(label)
44 |         pred = set(pred)
45 |         if len(pred) > 0:
46 |             p += len(label.intersection(pred))/len(pred)
47 |         if len(label) > 0:
48 |             r += len(label.intersection(pred))/len(label)
49 |     p /= len(labels)  # precision averaged over all samples
50 |     r /= len(labels)  # recall averaged over all samples
51 |     f = f1(p, r)
52 |     print("Loose Macro:")
53 |     print("Precision %s Recall %s F1 %s" % (p, r, f))
54 |     return p, r, f
55 |
56 | def loose_micro(labels, predictions):
57 |     cnt_pred = 0
58 |     cnt_label = 0
59 |     cnt_correct = 0
60 |     for label, pred in zip(labels, predictions):
61 |         label = set(label)
62 |         pred = set(pred)
63 |         cnt_pred += len(pred)
64 |         cnt_label += len(label)
65 |         cnt_correct += len(label.intersection(pred))
66 |     p = cnt_correct/cnt_pred
67 |     r = cnt_correct/cnt_label
68 |     f = f1(p, r)  # F1 computed from counts aggregated over all samples
69 |     print("Loose Micro:")
70 |     print("Precision %s Recall %s F1 %s" % (p, r, f))
71 |     return p, r, f
72 |
--------------------------------------------------------------------------------
/utils/logging_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import logging.handlers
4 |
5 | def _get_logger(logdir, logname, loglevel=logging.INFO):
6 |     fmt = "[%(asctime)s] %(levelname)s: %(message)s"
7 |     formatter = logging.Formatter(fmt)
8 |
9 |     handler = logging.handlers.RotatingFileHandler(
10 |         filename=os.path.join(logdir, logname),
11 |         maxBytes=10*1024*1024,
12 |         backupCount=10
13 |     )
14 |     handler.setFormatter(formatter)
15 |
16 |     logger = logging.getLogger("")
17 |     logger.addHandler(handler)
18 |     logger.setLevel(loglevel)
19 |     return logger
20 |
--------------------------------------------------------------------------------
/utils/pkl_utils.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | def _save(fname, data, protocol=-1):
4 |     with open(fname, "wb") as f:
5 |         pickle.dump(data, f, protocol)
6 |
7 | def _load(fname):
8 |     with open(fname, "rb") as f:
9 |         return pickle.load(f)
10 |
--------------------------------------------------------------------------------
/utils/prior_utils.py:
--------------------------------------------------------------------------------
1 | from utils import pkl_utils
2 | import sys
3 | sys.path.append("../")
4 | import config
5 | import pprint
6 | import numpy as np
7 | import collections,copy
8 |
9 | # Each column corresponds to one type: the type itself gets weight 1.0 and each of its subtypes gets weight alpha in the loss. Returns a (num_types, num_types) matrix.
10 | def create_prior(type_info, alpha=1.0):
11 |     #print(type_info)
12 |     type2id, typeDict = pkl_utils._load(type_info)
13 |     #pprint.pprint(typeDict)
14 |     num_types = len(type2id)
15 |     prior = np.zeros((num_types, num_types))
16 |     for x in type2id.keys():
17 |         tmp = np.zeros(num_types)
18 |         tmp[type2id[x]] = 1.0
19 |         for y in typeDict[x]:  # child types of x
20 |             tmp[type2id[y]] = alpha
21 |         #print(tmp)
22 |         prior[:, type2id[x]] = tmp
23 |     return prior
24 | def istopSon(s) -> bool:  # True if s is a top-level type, i.e. contains exactly one "/"
25 |     istop = False
26 |     counter = collections.Counter(s)
27 |     if counter['/'] == 1:
28 |         istop = True
29 |     return istop
30 |
31 | def makeSonFindermatrix(type_info):  # returns (self+children row matrix, children-only matrix, top-level type mask)
32 |     # print(type_info)
33 |     type2id, typeDict = pkl_utils._load(type_info)
34 |     #pprint.pprint(typeDict)
35 |     num_types = len(type2id)
36 |     prior = np.zeros((num_types, num_types))
37 |     for x in type2id.keys():
38 |         tmp = np.zeros(num_types)
39 |         tmp[type2id[x]] = 1.0
40 |         for y in typeDict[x]:  # child types of x
41 |             tmp[type2id[y]] = 1.0
42 |         prior[type2id[x], :] = tmp
43 |     #print('-'*50)
44 |     tmp = np.zeros(num_types)
45 |     for typename in typeDict.keys():
46 |         if istopSon(typename):
47 |             # print(typename)
48 |             # print(type2id[typename])
49 |             tmp[type2id[typename]] = 1
50 |     # prior[num_types,num_types]=1
51 |     fatherNotin = copy.deepcopy(prior)
52 |     for i in range(num_types):
53 |         fatherNotin[i, i] = 0
54 |     #print('-' * 50)
55 |     # id2type={val:key for key,val in type2id.items()}
56 |     # for j,i in enumerate(prior[num_types,:]):
57 |     #     if i==1:
58 |     #         print(id2type[j])
59 |     return prior, fatherNotin, tmp
60 |
61 | if __name__ == '__main__':
62 |     type_info = '.'+config.WIKIM_TYPE
63 |     print(type_info)
64 |
65 |     #pprint.pprint(create_prior(type_info)[5,:])
66 |     pprint.pprint(makeSonFindermatrix(type_info)[0][101,:])
67 |
68 |
--------------------------------------------------------------------------------
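
Usage note (illustrative, not a file in this repository): the sketch below shows one way the hierarchical prior built by `utils/prior_utils.create_prior` can be inspected. The toy `type2id`/`typeDict` pair and the file name `toy_type_info.pkl` are made up for demonstration only; the real pipeline loads the pickled type hierarchies referenced in `config.py`. Run it from the repository root so that `utils` and `config` are importable.

```python
# Illustrative sketch only: a toy (type2id, typeDict) pickle standing in for
# the real type-hierarchy files referenced in config.py.
import pickle
import numpy as np
from utils import prior_utils

type2id = {"/person": 0, "/person/artist": 1, "/organization": 2}
typeDict = {"/person": ["/person/artist"], "/person/artist": [], "/organization": []}
with open("toy_type_info.pkl", "wb") as f:
    pickle.dump((type2id, typeDict), f)

prior = prior_utils.create_prior("toy_type_info.pkl", alpha=0.5)
# Column j describes type j: 1.0 for the type itself, alpha for each subtype,
# so prior[:, 0] == [1.0, 0.5, 0.0] for "/person" with child "/person/artist".
one_hot = np.zeros(len(type2id))
one_hot[type2id["/person"]] = 1.0
smoothed = prior @ one_hot   # spread the label weight down the hierarchy
print(smoothed)              # [1.0, 0.5, 0.0]
```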