├── .gitignore ├── LICENSE ├── README.md ├── data ├── DATA.md ├── char_embed.txt ├── question.csv ├── test.csv ├── train.csv └── word_embed.txt └── src ├── __init__.py ├── config.py ├── inputs ├── __init__.py ├── data.py └── dynamic_pooling.py ├── main.py ├── models ├── __init__.py ├── base_model.py ├── bcnn.py ├── decatt.py ├── dsmm.py ├── dssm.py ├── esim.py ├── match_pyramid.py └── model_library.py ├── tf_common ├── __init__.py ├── metrics.py ├── nadam.py ├── nn_module.py └── optimizer.py └── utils ├── __init__.py ├── dist_utils.py ├── log_utils.py ├── ngram_utils.py ├── np_utils.py ├── os_utils.py ├── time_utils.py └── topk_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | .static_storage/ 55 | .media/ 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # 106 | .idea 107 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Chenglong Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow-DSMM 2 | 3 | An ongoing project implementing various Deep Semantic Matching Models (DSMMs), which are widely used for: 4 | 5 | 6 | - duplicate detection 7 | - sentence similarity 8 | - question answering 9 | - search relevance 10 | - ... 11 | 12 | ## Quickstart 13 | ### Data 14 | This project is developed around the data format provided by the [第三届魔镜杯大赛](https://www.ppdai.ai/mirror/goToMirrorDetail?mirrorId=1) competition. 15 | 16 | See `/data/DATA.md` for a description of the data format and prepare your data accordingly. 17 | Your data should be placed in the `data` directory, which currently holds a toy dataset. 18 | 19 | To run a quick demo, you can download the data from the competition link above (downloading requires registration). 20 | 21 | ### Demo 22 | ```bash 23 | python src/main.py 24 | ``` 25 | 26 | ## Supported Models 27 | 28 | ### Representation based methods 29 | - DSSM style models 30 | - DSSM: uses FastText as the encoder 31 | - CDSSM: uses TextCNN as the encoder 32 | - RDSSM: uses TextRNN/TextBiRNN as the encoder 33 | 34 | ### Interaction based methods 35 | - MatchPyramid style models 36 | - MatchPyramid: uses identity/cosine similarity/dot product as the match matrix 37 | - General MatchPyramid: uses match matrices based on various embeddings and various match scores 38 | - word embeddings 39 | - original word embedding 40 | - compressed word embedding 41 | - contextual word embedding (uses an encoder to encode contextual information) 42 | - match score 43 | - identity 44 | - cosine similarity/dot product 45 | - element product 46 | - element concat 47 | - BCNN style models 48 | - BCNN 49 | - ABCNN1 50 | - ABCNN2 51 | - ABCNN3 52 | - ESIM 53 | - DecAtt (Decomposable Attention) 54 | 55 | 56 | ## Building Blocks 57 | ### Encoder layers 58 | - FastText 59 | - TimeDistributed Dense Projection 60 | - TextCNN (Gated CNN and also Residual Gated CNN) 61 | - TextRNN/TextBiRNN with GRU and LSTM cells 62 | 63 | ### Attention layers 64 | - mean/max/min pooling 65 | - scalar-based and vector-based attention 66 | - self and context attention 67 | - multi-head attention 68 | 69 | ## Acknowledgments 70 | This project draws inspiration from the following projects: 71 | - [MatchZoo](https://github.com/faneshion/MatchZoo) 72 | - [MatchPyramid-TensorFlow](https://github.com/pl8787/MatchPyramid-TensorFlow) 73 | - [ABCNN](https://github.com/galsang/ABCNN) 74 | -------------------------------------------------------------------------------- /data/DATA.md: -------------------------------------------------------------------------------- 1 | # Data Format 2 | ## char_embed.txt 3 | This file should contain the char embeddings. 4 | 5 | Each line should be `char_id embedding_vector`. For example, 6 | ```text 7 | C1 0 0 0 0 8 | C2 0.1 0.5 0.4 0.2 9 | C3 0.8 0.2 0.9 1.0 10 | C4 0.14 0.15 0.64 0.12 11 | ``` 12 | 13 | ## word_embed.txt 14 | This file should contain the word embeddings. 15 | 16 | Each line should be `word_id embedding_vector`.
For example, 17 | ```text 18 | W1 0 0 0 0 19 | W2 0.1 0.5 0.4 0.2 20 | W3 0.8 0.2 0.9 1.0 21 | W4 0.14 0.15 0.64 0.12 22 | ``` 23 | 24 | ## question.csv 25 | This file should contain all the questions that appear in `train.csv` and `test.csv`. 26 | 27 | Each line should be `question_id,word_sequence_ids,char_sequence_ids`. For example, 28 | ```text 29 | qid,words,chars 30 | Q1,W1 W2 W3,C31 C64 C45 C85 31 | Q2,W2 W9 W7 W10 W20,C39 C58 C3 32 | Q3,W23 W91 W7 W10 W290,C19 C81 C31 33 | Q4,W25 W9 W70 W101 W210,C92 C58 C33 34 | Q5,W22 W9 W7 W130 W20,C98 C85 C35 35 | Q6,W2 W19 W87,C39 C86 C34 36 | ``` 37 | 38 | ## train.csv 39 | This file should contain the training question pairs. 40 | 41 | Each line should be `label,q1,q2`, where `q1` is the id of question 1 and `q2` is the id of question 2. `label=1` means `q1` and `q2` have the same meaning; `label=0` means they have different meanings. For example, 42 | ```text 43 | label,q1,q2 44 | 1,Q1,Q2 45 | 0,Q1,Q3 46 | 0,Q2,Q4 47 | 0,Q5,Q1 48 | 1,Q2,Q6 49 | ``` 50 | 51 | ## test.csv 52 | This file should contain the testing question pairs. 53 | 54 | Each line should be `q1,q2`, where `q1` is the id of question 1 and `q2` is the id of question 2. For example, 55 | ```text 56 | q1,q2 57 | Q2,Q3 58 | Q6,Q5 59 | ``` -------------------------------------------------------------------------------- /data/char_embed.txt: -------------------------------------------------------------------------------- 1 | C1 0 0 0 0 2 | C2 0.1 0.5 0.4 0.2 3 | C3 0.8 0.2 0.9 1.0 4 | C4 0.14 0.15 0.64 0.12 -------------------------------------------------------------------------------- /data/question.csv: -------------------------------------------------------------------------------- 1 | qid,words,chars 2 | Q1,W1 W2 W3,C31 C64 C45 C85 3 | Q2,W2 W9 W7 W10 W20,C39 C58 C3 4 | Q3,W23 W91 W7 W10 W290,C19 C81 C31 5 | Q4,W25 W9 W70 W101 W210,C92 C58 C33 6 | Q5,W22 W9 W7 W130 W20,C98 C85 C35 7 | Q6,W2 W19 W87,C39 C86 C34 -------------------------------------------------------------------------------- /data/test.csv: -------------------------------------------------------------------------------- 1 | q1,q2 2 | Q2,Q3 3 | Q6,Q5 -------------------------------------------------------------------------------- /data/train.csv: -------------------------------------------------------------------------------- 1 | label,q1,q2 2 | 1,Q1,Q2 3 | 0,Q1,Q3 4 | 0,Q2,Q4 5 | 0,Q5,Q1 6 | 1,Q2,Q6 -------------------------------------------------------------------------------- /data/word_embed.txt: -------------------------------------------------------------------------------- 1 | W1 0 0 0 0 2 | W2 0.1 0.5 0.4 0.2 3 | W3 0.8 0.2 0.9 1.0 4 | W4 0.14 0.15 0.64 0.12 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | DATA_DIR = "../data" 4 | 5 | TRAIN_FILE = DATA_DIR + "/train.csv" 6 | TEST_FILE = DATA_DIR + "/test.csv" 7 | 8 | TRAIN_FEATURES_FILE = DATA_DIR + "/train_features.npy" 9 | TEST_FEATURES_FILE = DATA_DIR + "/test_features.npy" 10 | 11 | QUESTION_FILE = DATA_DIR + "/question.csv" 12 | 13 | WORD_EMBEDDING_FILE = DATA_DIR + "/word_embed.txt" 14 | CHAR_EMBEDDING_FILE =
DATA_DIR + "/char_embed.txt" 15 | 16 | SUB_DIR = "../sub" 17 | SUB_FILE = "submission.csv" 18 | SINGLE_SUB_FILE_PATTERN = "submission_%s_%s.csv" 19 | STACKING_SUB_FILE_PATTERN = "submission_%s.csv" 20 | 21 | 22 | # missing 23 | MISSING_INDEX_WORD = 20891 24 | PADDING_INDEX_WORD = 20892 25 | 26 | MISSING_INDEX_CHAR = 3048 27 | PADDING_INDEX_CHAR = 3049 28 | 29 | # ratio 30 | POS_RATIO_OFFLINE = 0.5191087559849992 31 | POS_RATIO_ONLINE = 0.50296075348400959 32 | 33 | """ 34 | -1/(p0 + p1) * (p0 * (0*log(0+eps) + (1-0)*log(1-0-eps)) + p1 * (1*log(0+eps) + (1-1)*log(1-0-eps))) = 17.371649 35 | -1/(p0 + p1) * (p0 * log(1-eps) + p1 * log(0+eps)) = 17.371649 36 | p1/(p0 + p1) ~= -17.371649/log(eps) 37 | = -17.371649/log(1e-15) 38 | = 0.50296075348400959 39 | """ 40 | 41 | NUM_TRAIN = 254386 42 | NUM_TEST = 172956 43 | 44 | TRAIN_RATIO = 0.7 45 | 46 | SPLIT_FILE = "split.pkl" 47 | -------------------------------------------------------------------------------- /src/inputs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/inputs/__init__.py -------------------------------------------------------------------------------- /src/inputs/data.py: -------------------------------------------------------------------------------- 1 | 2 | import config 3 | import numpy as np 4 | import pandas as pd 5 | import scipy as sp 6 | from keras.preprocessing.sequence import pad_sequences 7 | 8 | 9 | def _to_ind(qid): 10 | return int(qid[1:]) 11 | 12 | 13 | def load_raw_question(): 14 | df = pd.read_csv(config.QUESTION_FILE) 15 | df["words"] = df.words.str.split(" ") 16 | df["chars"] = df.chars.str.split(" ") 17 | Q = {} 18 | Q["words"] = df["words"].values 19 | Q["chars"] = df["chars"].values 20 | return Q 21 | 22 | 23 | def load_question(params): 24 | df = pd.read_csv(config.QUESTION_FILE) 25 | df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x]) 26 | df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x]) 27 | Q = {} 28 | Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"]) 29 | Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"]) 30 | Q["words"] = pad_sequences(df["words"], 31 | maxlen=params["max_seq_len_word"], 32 | padding=params["pad_sequences_padding"], 33 | truncating=params["pad_sequences_truncating"], 34 | value=config.PADDING_INDEX_WORD) 35 | Q["chars"] = pad_sequences(df["chars"], 36 | maxlen=params["max_seq_len_char"], 37 | padding=params["pad_sequences_padding"], 38 | truncating=params["pad_sequences_truncating"], 39 | value=config.PADDING_INDEX_CHAR) 40 | return Q 41 | 42 | 43 | def load_train(): 44 | df = pd.read_csv(config.TRAIN_FILE) 45 | df["q1"] = df.q1.apply(_to_ind) 46 | df["q2"] = df.q2.apply(_to_ind) 47 | return df 48 | 49 | 50 | def load_test(): 51 | df = pd.read_csv(config.TEST_FILE) 52 | df["q1"] = df.q1.apply(_to_ind) 53 | df["q2"] = df.q2.apply(_to_ind) 54 | df["label"] = np.zeros(df.shape[0]) 55 | return df 56 | 57 | 58 | def load_embedding_matrix(embedding_file): 59 | print("read embedding from: %s " %embedding_file) 60 | d = {} 61 | n = 0 62 | with open(embedding_file, "r") as f: 63 | line = f.readline() 64 | while line: 65 | n += 1 66 | w, v = line.strip().split(" ", 1) 67 | d[int(w[1:])] = v 68 | line = f.readline() 69 | dim = len(v.split(" ")) 70 | 71 | # add two indices for missing and padding 72 | emb_matrix =
np.zeros((n+2, dim), dtype=float) 73 | for key ,val in d.items(): 74 | v = np.asarray(val.split(" "), dtype=float) 75 | emb_matrix[key] = v 76 | emb_matrix = np.array(emb_matrix, dtype=np.float32) 77 | return emb_matrix 78 | 79 | 80 | init_embedding_matrix = { 81 | "word": load_embedding_matrix(config.WORD_EMBEDDING_FILE), 82 | "char": load_embedding_matrix(config.CHAR_EMBEDDING_FILE), 83 | } -------------------------------------------------------------------------------- /src/inputs/dynamic_pooling.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | # see https://github.com/pl8787/MatchPyramid-TensorFlow 6 | def dpool_index_(batch_idx, len1_one, len2_one, max_len1, max_len2): 7 | stride1 = 1.0 * max_len1 / len1_one 8 | stride2 = 1.0 * max_len2 / len2_one 9 | idx1_one = np.arange(max_len1) / stride1 10 | idx2_one = np.arange(max_len2) / stride2 11 | mesh1, mesh2 = np.meshgrid(idx1_one, idx2_one) 12 | index_one = np.transpose(np.stack([np.ones(mesh1.shape) * batch_idx, mesh1, mesh2]), (2, 1, 0)) 13 | return index_one 14 | 15 | 16 | def dynamic_pooling_index(len1, len2, max_len1, max_len2): 17 | index = np.zeros((len(len1), max_len1, max_len2, 3), dtype=int) 18 | for i in range(len(len1)): 19 | index[i] = dpool_index_(i, len1[i], len2[i], max_len1, max_len2) 20 | return index 21 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import pickle as pkl 5 | import tensorflow as tf 6 | 7 | from optparse import OptionParser 8 | 9 | import config 10 | 11 | from inputs.data import load_question, load_train, load_test 12 | from inputs.data import init_embedding_matrix 13 | from models.model_library import get_model 14 | from utils import log_utils, os_utils, time_utils 15 | 16 | 17 | params = { 18 | "model_name": "semantic_matching", 19 | "offline_model_dir": "./weights/semantic_matching", 20 | "summary_dir": "../summary", 21 | "construct_neg": False, 22 | 23 | "augmentation_init_permutation": 0.5, 24 | "augmentation_min_permutation": 0.01, 25 | "augmentation_permutation_decay_steps": 2000, 26 | "augmentation_permutation_decay_rate": 0.975, 27 | 28 | "augmentation_init_dropout": 0.5, 29 | "augmentation_min_dropout": 0.01, 30 | "augmentation_dropout_decay_steps": 2000, 31 | "augmentation_dropout_decay_rate": 0.975, 32 | 33 | "use_features": False, 34 | "num_features": 1, 35 | 36 | "n_runs": 10, 37 | "batch_size": 128, 38 | "epoch": 50, 39 | "max_batch": -1, 40 | "l2_lambda": 0.000, 41 | 42 | # embedding 43 | "embedding_dropout": 0.3, 44 | "embedding_dim_word": init_embedding_matrix["word"].shape[1], 45 | "embedding_dim_char": init_embedding_matrix["char"].shape[1], 46 | "embedding_dim": init_embedding_matrix["word"].shape[1], 47 | "embedding_dim_compressed": 32, 48 | "embedding_trainable": True, 49 | "embedding_mask_zero": True, 50 | 51 | "max_num_word": init_embedding_matrix["word"].shape[0], 52 | "max_num_char": init_embedding_matrix["char"].shape[0], 53 | 54 | "threshold": 0.217277, 55 | "calibration": False, 56 | 57 | "max_seq_len_word": 12, 58 | "max_seq_len_char": 20, 59 | "pad_sequences_padding": "post", 60 | "pad_sequences_truncating": "post", 61 | 62 | # optimization 63 | "optimizer_type": "lazyadam", 64 | "init_lr": 0.001, 65 | "beta1": 0.9, 66 | "beta2": 0.999, 67 | "decay_steps": 2000, 68 | "decay_rate": 0.95, 69 | "schedule_decay": 0.004, 
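    # NOTE: base_model.py feeds init_lr / decay_steps / decay_rate above into tf.train.exponential_decay,
    # i.e. lr ~= init_lr * decay_rate ** (global_step / decay_steps), so with decay_rate=0.95 and
    # decay_steps=2000 the learning rate roughly halves about every 27k batches.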
70 | "random_seed": 2018, 71 | "eval_every_num_update": 5000, 72 | 73 | # semantic feature layer 74 | "encode_method": "textcnn", 75 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 76 | "attention_dim": 64, 77 | "attention_num_heads": 1, 78 | 79 | # cnn 80 | "cnn_num_layers": 1, 81 | "cnn_num_filters": 32, 82 | "cnn_filter_sizes": [1, 2, 3], 83 | "cnn_timedistributed": False, 84 | "cnn_activation": tf.nn.relu, 85 | "cnn_gated_conv": False, 86 | "cnn_residual": False, 87 | 88 | "rnn_num_units": 32, 89 | "rnn_cell_type": "gru", 90 | "rnn_num_layers": 1, 91 | 92 | # fc block 93 | "fc_type": "fc", 94 | "fc_hidden_units": [64*4, 64*2, 64], 95 | "fc_dropouts": [0, 0, 0], 96 | 97 | # True: cosine(l1, l2), sum(abs(l1 - l2)) 98 | # False: l1 * l2, abs(l1 - l2) 99 | "similarity_aggregation": False, 100 | 101 | # match pyramid 102 | "mp_num_filters": [8, 16], 103 | "mp_filter_sizes": [5, 3], 104 | "mp_activation": tf.nn.relu, 105 | "mp_dynamic_pooling": False, 106 | "mp_pool_sizes_word": [6, 3], 107 | "mp_pool_sizes_char": [10, 5], 108 | 109 | # bcnn 110 | "bcnn_num_layers": 2, 111 | "bcnn_num_filters": 16, 112 | "bcnn_filter_size": 3, 113 | "bcnn_activation": tf.nn.tanh, # tf.nn.relu with euclidean/euclidean_exp produce nan 114 | "bcnn_match_score_type": "cosine", 115 | 116 | "bcnn_mp_att_pooling": False, 117 | "bcnn_mp_num_filters": [8, 16], 118 | "bcnn_mp_filter_sizes": [5, 3], 119 | "bcnn_mp_activation": tf.nn.relu, 120 | "bcnn_mp_dynamic_pooling": False, 121 | "bcnn_mp_pool_sizes_word": [6, 3], 122 | "bcnn_mp_pool_sizes_char": [10, 5], 123 | 124 | # final layer 125 | "final_dropout": 0.3, 126 | 127 | } 128 | 129 | 130 | def get_model_data(df, features, params): 131 | X = { 132 | "q1": df.q1.values, 133 | "q2": df.q2.values, 134 | "label": df.label.values, 135 | } 136 | if params["use_features"]: 137 | X.update({ 138 | "features": features, 139 | }) 140 | params["num_features"] = X["features"].shape[1] 141 | return X 142 | 143 | 144 | def downsample(df): 145 | # downsample negative 146 | num_pos = np.sum(df.label) 147 | num_neg = int((1. / config.POS_RATIO_OFFLINE - 1.) 
* num_pos) 148 | idx_pos = np.where(df.label == 1)[0] 149 | idx_neg = np.where(df.label == 0)[0] 150 | np.random.shuffle(idx_neg) 151 | idx = np.hstack([idx_pos, idx_neg[:num_neg]]) 152 | return df.loc[idx] 153 | 154 | 155 | def get_train_valid_test_data(augmentation=False): 156 | # load data 157 | Q = load_question(params) 158 | dfTrain = load_train() 159 | dfTest = load_test() 160 | # train_features = load_feat("train") 161 | # test_features = load_feat("test") 162 | # params["num_features"] = train_features.shape[1] 163 | 164 | # load split 165 | with open(config.SPLIT_FILE, "rb") as f: 166 | train_idx, valid_idx = pkl.load(f) 167 | 168 | # validation 169 | if augmentation: 170 | dfDev = pd.read_csv(config.DATA_DIR + "/" + "dev_aug.csv") 171 | dfDev = downsample(dfDev) 172 | params["use_features"] = False 173 | params["augmentation_decay_steps"] = 50000 174 | params["decay_steps"] = 50000 175 | X_dev = get_model_data(dfDev, None, params) 176 | else: 177 | X_dev = get_model_data(dfTrain.loc[train_idx], None, params) 178 | X_valid = get_model_data(dfTrain.loc[valid_idx], None, params) 179 | 180 | # submit 181 | if augmentation: 182 | dfTrain = pd.read_csv(config.DATA_DIR + "/" + "train_aug.csv") 183 | dfTrain = downsample(dfTrain) 184 | params["use_features"] = False 185 | params["augmentation_decay_steps"] = 50000 186 | params["decay_steps"] = 50000 187 | X_train = get_model_data(dfTrain, None, params) 188 | else: 189 | X_train = get_model_data(dfTrain, None, params) 190 | X_test = get_model_data(dfTest, None, params) 191 | 192 | return X_dev, X_valid, X_train, X_test, Q 193 | 194 | 195 | def parse_args(parser): 196 | parser.add_option("-m", "--model", type="string", dest="model", 197 | help="model type", default="cdssm") 198 | parser.add_option("-a", "--augmentation", action="store_true", dest="augmentation", 199 | help="augmentation", default=False) 200 | parser.add_option("-g", "--granularity", type="string", dest="granularity", 201 | help="granularity, e.g., word or char", default="word") 202 | 203 | (options, args) = parser.parse_args() 204 | return options, args 205 | 206 | 207 | def main(options): 208 | 209 | os_utils._makedirs("../logs") 210 | os_utils._makedirs("../output") 211 | logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp()) 212 | 213 | params["granularity"] = options.granularity 214 | 215 | # save path 216 | model_name = "augmentation_%s_%s_%s"%(str(options.augmentation), options.granularity, options.model) 217 | path = config.SUB_DIR + "/" + model_name 218 | os_utils._makedirs(path) 219 | 220 | # load data 221 | X_dev, X_valid, X_train, X_test, Q = get_train_valid_test_data(options.augmentation) 222 | 223 | # validation 224 | model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix) 225 | model.fit(X_dev, Q, validation_data=X_valid, shuffle=True) 226 | y_pred_valid = model.predict_proba(X_valid, Q).flatten() 227 | # save for stacking 228 | df = pd.DataFrame({"y_pred": y_pred_valid, "y_true": X_valid["label"]}) 229 | df.to_csv(path + "/valid.csv", index=False, header=True) 230 | 231 | # submission 232 | y_proba = np.zeros((len(X_test["label"]), params["n_runs"]), dtype=np.float32) 233 | for run in range(params["n_runs"]): 234 | params["random_seed"] = run 235 | params["model_name"] = "semantic_model_%s"%str(run+1) 236 | model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix) 237 | model.fit(X_train, Q, validation_data=None, shuffle=True) 238 | y_proba[:,run] = 
model.predict_proba(X_test, Q).flatten() 239 | df = pd.DataFrame(y_proba[:,:(run+1)], columns=["y_proba_%d"%(i+1) for i in range(run+1)]) 240 | df.to_csv(path + "/test.csv", index=False, header=True) 241 | 242 | 243 | if __name__ == "__main__": 244 | 245 | parser = OptionParser() 246 | options, args = parse_args(parser) 247 | main(options) 248 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/base_model.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import log_loss 7 | 8 | import config 9 | from utils import os_utils 10 | from tf_common.optimizer import * 11 | from tf_common.nn_module import word_dropout, mlp_layer 12 | from tf_common.nn_module import encode, attend 13 | 14 | 15 | def sigmoid(x): 16 | return 1./(1.+np.exp(-x)) 17 | 18 | 19 | class BaseModel(object): 20 | def __init__(self, params, logger, init_embedding_matrix=None): 21 | self.params = params 22 | self.logger = logger 23 | self.init_embedding_matrix = init_embedding_matrix 24 | self.model_name = self.params["model_name"] 25 | self.threshold = self.params["threshold"] 26 | self.calibration_model = None 27 | # os_utils._makedirs(self.params["offline_model_dir"], force=True) 28 | 29 | self._init_tf_vars() 30 | self.matching_features_word, self.matching_features_char = self._get_matching_features() 31 | self.logits, self.proba = self._get_prediction() 32 | self.loss = self._get_loss() 33 | self.train_op = self._get_train_op() 34 | self.summary = self._get_summary() 35 | 36 | self.sess, self.saver = self._init_session() 37 | self.train_writer = tf.summary.FileWriter(self.params["summary_dir"] + '/train', self.sess.graph) 38 | self.test_writer = tf.summary.FileWriter(self.params["summary_dir"] + '/test') 39 | 40 | 41 | def _init_tf_vars(self): 42 | #### training flag 43 | self.training = tf.placeholder(tf.bool, shape=[], name="training") 44 | #### labels 45 | self.labels = tf.placeholder(tf.float32, shape=[None], name="labels") 46 | #### word 47 | self.seq_word_left = tf.placeholder(tf.int32, shape=[None, None], name="seq_word_left") 48 | self.seq_word_right = tf.placeholder(tf.int32, shape=[None, None], name="seq_word_right") 49 | #### char 50 | self.seq_char_left = tf.placeholder(tf.int32, shape=[None, None], name="seq_char_left") 51 | self.seq_char_right = tf.placeholder(tf.int32, shape=[None, None], name="seq_char_right") 52 | #### word len 53 | self.seq_len_word_left = tf.placeholder(tf.int32, shape=[None], name="seq_len_word_left") 54 | self.seq_len_word_right = tf.placeholder(tf.int32, shape=[None], name="seq_len_word_right") 55 | #### char len 56 | self.seq_len_char_left = tf.placeholder(tf.int32, shape=[None], name="seq_len_char_left") 57 | self.seq_len_char_right = tf.placeholder(tf.int32, shape=[None], name="seq_len_char_right") 58 | 59 | #### features 60 | self.features = tf.placeholder(tf.float32, shape=[None, self.params["num_features"]], name="features") 61 | 62 | #### training 63 | self.global_step = tf.Variable(0, trainable=False) 64 | self.learning_rate = 
tf.train.exponential_decay(self.params["init_lr"], self.global_step, 65 | self.params["decay_steps"], self.params["decay_rate"]) 66 | self.augmentation_dropout = tf.train.exponential_decay(self.params["augmentation_init_dropout"], self.global_step, 67 | self.params["augmentation_dropout_decay_steps"], 68 | self.params["augmentation_dropout_decay_rate"]) 69 | self.augmentation_permutation = tf.train.exponential_decay(self.params["augmentation_init_permutation"], 70 | self.global_step, 71 | self.params["augmentation_permutation_decay_steps"], 72 | self.params["augmentation_permutation_decay_rate"]) 73 | 74 | 75 | def _get_embedding_matrix(self, granularity="word"): 76 | if self.init_embedding_matrix[granularity] is None: 77 | std = 0.1 78 | minval = -std 79 | maxval = std 80 | emb_matrix = tf.Variable( 81 | tf.random_uniform( 82 | [self.params["max_num_%s" % granularity] + 1, self.params["embedding_dim_%s" % granularity]], 83 | minval, maxval, 84 | seed=self.params["random_seed"], 85 | dtype=tf.float32)) 86 | else: 87 | emb_matrix = tf.Variable(self.init_embedding_matrix[granularity], 88 | trainable=self.params["embedding_trainable"]) 89 | return emb_matrix 90 | 91 | 92 | def _semantic_feature_layer(self, seq_input, seq_len, granularity="word", reuse=False): 93 | assert granularity in ["char", "word"] 94 | #### embed 95 | emb_matrix = self._get_embedding_matrix(granularity) 96 | emb_seq = tf.nn.embedding_lookup(emb_matrix, seq_input) 97 | 98 | #### dropout 99 | random_seed = np.random.randint(10000000) 100 | emb_seq = word_dropout(emb_seq, 101 | training=self.training, 102 | dropout=self.params["embedding_dropout"], 103 | seed=random_seed) 104 | 105 | #### encode 106 | input_dim = self.params["embedding_dim"] 107 | enc_seq = encode(emb_seq, method=self.params["encode_method"], 108 | input_dim=input_dim, 109 | params=self.params, 110 | sequence_length=seq_len, 111 | mask_zero=self.params["embedding_mask_zero"], 112 | scope_name=self.model_name + "enc_seq_%s"%granularity, reuse=reuse, 113 | training=self.training) 114 | 115 | #### attend 116 | feature_dim = self.params["encode_dim"] 117 | context = None 118 | att_seq = attend(enc_seq, context=context, 119 | encode_dim=self.params["encode_dim"], 120 | feature_dim=feature_dim, 121 | attention_dim=self.params["attention_dim"], 122 | method=self.params["attend_method"], 123 | scope_name=self.model_name + "att_seq_%s"%granularity, 124 | reuse=reuse, num_heads=self.params["attention_num_heads"]) 125 | 126 | #### MLP nonlinear projection 127 | sem_seq = mlp_layer(att_seq, fc_type=self.params["fc_type"], 128 | hidden_units=self.params["fc_hidden_units"], 129 | dropouts=self.params["fc_dropouts"], 130 | scope_name=self.model_name + "sem_seq_%s"%granularity, 131 | reuse=reuse, 132 | training=self.training, 133 | seed=self.params["random_seed"]) 134 | 135 | return emb_seq, enc_seq, att_seq, sem_seq 136 | 137 | 138 | def _interaction_semantic_feature_layer(self, seq_input_left, seq_input_right, seq_len_left, seq_len_right, granularity="word"): 139 | assert granularity in ["char", "word"] 140 | #### embed 141 | emb_matrix = self._get_embedding_matrix(granularity) 142 | emb_seq_left = tf.nn.embedding_lookup(emb_matrix, seq_input_left) 143 | emb_seq_right = tf.nn.embedding_lookup(emb_matrix, seq_input_right) 144 | 145 | #### dropout 146 | random_seed = np.random.randint(10000000) 147 | emb_seq_left = word_dropout(emb_seq_left, 148 | training=self.training, 149 | dropout=self.params["embedding_dropout"], 150 | seed=random_seed) 151 | random_seed = 
np.random.randint(10000000) 152 | emb_seq_right = word_dropout(emb_seq_right, 153 | training=self.training, 154 | dropout=self.params["embedding_dropout"], 155 | seed=random_seed) 156 | 157 | #### encode 158 | input_dim = self.params["embedding_dim"] 159 | enc_seq_left = encode(emb_seq_left, method=self.params["encode_method"], 160 | input_dim=input_dim, 161 | params=self.params, 162 | sequence_length=seq_len_left, 163 | mask_zero=self.params["embedding_mask_zero"], 164 | scope_name=self.model_name + "enc_seq_%s"%granularity, reuse=False, 165 | training=self.training) 166 | enc_seq_right = encode(emb_seq_right, method=self.params["encode_method"], 167 | input_dim=input_dim, 168 | params=self.params, 169 | sequence_length=seq_len_right, 170 | mask_zero=self.params["embedding_mask_zero"], 171 | scope_name=self.model_name + "enc_seq_%s" % granularity, reuse=True, 172 | training=self.training) 173 | 174 | #### attend 175 | # [batchsize, s1, s2] 176 | att_mat = tf.einsum("abd,acd->abc", enc_seq_left, enc_seq_right) 177 | feature_dim = self.params["encode_dim"] + self.params["max_seq_len_%s"%granularity] 178 | att_seq_left = attend(enc_seq_left, context=att_mat, feature_dim=feature_dim, 179 | method=self.params["attend_method"], 180 | scope_name=self.model_name + "att_seq_%s"%granularity, 181 | reuse=False) 182 | att_seq_right = attend(enc_seq_right, context=tf.transpose(att_mat), feature_dim=feature_dim, 183 | method=self.params["attend_method"], 184 | scope_name=self.model_name + "att_seq_%s" % granularity, 185 | reuse=True) 186 | 187 | #### MLP nonlinear projection 188 | sem_seq_left = mlp_layer(att_seq_left, fc_type=self.params["fc_type"], 189 | hidden_units=self.params["fc_hidden_units"], 190 | dropouts=self.params["fc_dropouts"], 191 | scope_name=self.model_name + "sem_seq_%s"%granularity, 192 | reuse=False, 193 | training=self.training, 194 | seed=self.params["random_seed"]) 195 | sem_seq_right = mlp_layer(att_seq_right, fc_type=self.params["fc_type"], 196 | hidden_units=self.params["fc_hidden_units"], 197 | dropouts=self.params["fc_dropouts"], 198 | scope_name=self.model_name + "sem_seq_%s" % granularity, 199 | reuse=True, 200 | training=self.training, 201 | seed=self.params["random_seed"]) 202 | 203 | return emb_seq_left, enc_seq_left, att_seq_left, sem_seq_left, \ 204 | emb_seq_right, enc_seq_right, att_seq_right, sem_seq_right 205 | 206 | 207 | def _get_matching_features(self): 208 | pass 209 | 210 | 211 | def _get_prediction(self): 212 | with tf.name_scope(self.model_name + "/"): 213 | with tf.name_scope("prediction"): 214 | lst = [] 215 | if "word" in self.params["granularity"]: 216 | lst.append(self.matching_features_word) 217 | if "char" in self.params["granularity"]: 218 | lst.append(self.matching_features_char) 219 | if self.params["use_features"]: 220 | out_0 = mlp_layer(self.features, fc_type=self.params["fc_type"], 221 | hidden_units=self.params["fc_hidden_units"], 222 | dropouts=self.params["fc_dropouts"], 223 | scope_name=self.model_name + "mlp_features", 224 | reuse=False, 225 | training=self.training, 226 | seed=self.params["random_seed"]) 227 | lst.append(out_0) 228 | out = tf.concat(lst, axis=-1) 229 | out = tf.layers.Dropout(self.params["final_dropout"])(out, training=self.training) 230 | out = mlp_layer(out, fc_type=self.params["fc_type"], 231 | hidden_units=self.params["fc_hidden_units"], 232 | dropouts=self.params["fc_dropouts"], 233 | scope_name=self.model_name + "mlp", 234 | reuse=False, 235 | training=self.training, 236 | seed=self.params["random_seed"]) 237 | 
logits = tf.layers.dense(out, 1, activation=None, 238 | kernel_initializer=tf.glorot_uniform_initializer( 239 | seed=self.params["random_seed"]), 240 | name=self.model_name + "logits") 241 | logits = tf.squeeze(logits, axis=1) 242 | proba = tf.nn.sigmoid(logits) 243 | 244 | return logits, proba 245 | 246 | 247 | def _get_loss(self): 248 | with tf.name_scope(self.model_name + "/"): 249 | with tf.name_scope("loss"): 250 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.logits) 251 | loss = tf.reduce_mean(loss, name="log_loss") 252 | if self.params["l2_lambda"] > 0: 253 | l2_losses = tf.add_n( 254 | [tf.nn.l2_loss(v) for v in tf.trainable_variables() if "bias" not in v.name]) * self.params[ 255 | "l2_lambda"] 256 | loss = loss + l2_losses 257 | return loss 258 | 259 | 260 | def _get_train_op(self): 261 | with tf.name_scope(self.model_name + "/"): 262 | with tf.name_scope("optimization"): 263 | if self.params["optimizer_type"] == "lazynadam": 264 | optimizer = LazyNadamOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], 265 | beta2=self.params["beta2"], epsilon=1e-8, 266 | schedule_decay=self.params["schedule_decay"]) 267 | elif self.params["optimizer_type"] == "adam": 268 | optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, 269 | beta1=self.params["beta1"], 270 | beta2=self.params["beta2"], epsilon=1e-8) 271 | elif self.params["optimizer_type"] == "lazyadam": 272 | optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=self.learning_rate, 273 | beta1=self.params["beta1"], 274 | beta2=self.params["beta2"], epsilon=1e-8) 275 | elif self.params["optimizer_type"] == "adagrad": 276 | optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, 277 | initial_accumulator_value=1e-7) 278 | elif self.params["optimizer_type"] == "adadelta": 279 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate) 280 | elif self.params["optimizer_type"] == "gd": 281 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) 282 | elif self.params["optimizer_type"] == "momentum": 283 | optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95) 284 | elif self.params["optimizer_type"] == "rmsprop": 285 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, decay=0.9, 286 | momentum=0.9, epsilon=1e-8) 287 | elif self.params["optimizer_type"] == "lazypowersign": 288 | optimizer = LazyPowerSignOptimizer(learning_rate=self.learning_rate) 289 | elif self.params["optimizer_type"] == "lazyaddsign": 290 | optimizer = LazyAddSignOptimizer(learning_rate=self.learning_rate) 291 | elif self.params["optimizer_type"] == "lazyamsgrad": 292 | optimizer = LazyAMSGradOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], 293 | beta2=self.params["beta2"], epsilon=1e-8) 294 | 295 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 296 | with tf.control_dependencies(update_ops): 297 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 298 | return train_op 299 | 300 | 301 | def _get_summary(self): 302 | with tf.name_scope(self.model_name + "/"): 303 | tf.summary.scalar("augmentation_dropout", self.augmentation_dropout) 304 | tf.summary.scalar("logloss", self.loss) 305 | tf.summary.scalar("lr", self.learning_rate) 306 | # error: https://blog.csdn.net/u012436149/article/details/53894364 307 | # summary = tf.summary.merge_all() 308 | summary = tf.summary.merge( 309 | tf.get_collection(tf.GraphKeys.SUMMARIES, self.model_name) 310 
| ) 311 | return summary 312 | 313 | 314 | def _init_session(self): 315 | config = tf.ConfigProto(device_count={"gpu": 1}) 316 | config.gpu_options.allow_growth = True 317 | config.intra_op_parallelism_threads = 4 318 | config.inter_op_parallelism_threads = 4 319 | sess = tf.Session(config=config) 320 | sess.run(tf.global_variables_initializer()) 321 | # max_to_keep=None, keep all the models 322 | saver = tf.train.Saver(max_to_keep=None) 323 | return sess, saver 324 | 325 | 326 | def save_session(self): 327 | self.saver.save(self.sess, self.params["offline_model_dir"] + "/model.checkpoint") 328 | 329 | 330 | def restore_session(self): 331 | self.saver.restore(self.sess, self.params["offline_model_dir"] + "/model.checkpoint") 332 | 333 | 334 | def _get_batch_index(self, seq, step): 335 | n = len(seq) 336 | res = [] 337 | for i in range(0, n, step): 338 | res.append(seq[i:i + step]) 339 | # last batch 340 | if len(res) * step < n: 341 | res.append(seq[len(res) * step:]) 342 | return res 343 | 344 | 345 | def _get_pos_neg_ind(self, label): 346 | length = len(label) 347 | pos_ind_tmp = np.where(label == 1)[0] 348 | inds = np.zeros((len(pos_ind_tmp) * length, 2), dtype=int) 349 | inds[:, 0] = np.tile(pos_ind_tmp, length) 350 | inds[:, 1] = list(range(length)) * len(pos_ind_tmp) 351 | mask = inds[:, 0] != inds[:, 1] 352 | pos_ind = inds[mask, 0] 353 | neg_ind = inds[mask, 1] 354 | return pos_ind, neg_ind 355 | 356 | 357 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 358 | if training: 359 | if construct_neg: 360 | q1 = X["q1"][idx] 361 | q2 = X["q2"][idx] 362 | # for label=1 sample, construct negative sample within batch 363 | pos_ind, neg_ind = self._get_pos_neg_ind(X["label"][idx]) 364 | # original & symmetric 365 | feed_dict = { 366 | self.seq_word_left: np.vstack([Q["words"][q1], 367 | Q["words"][X["q1"][idx[pos_ind]]], 368 | Q["words"][X["q1"][idx[neg_ind]]], 369 | Q["words"][q2], 370 | Q["words"][X["q2"][idx[neg_ind]]], 371 | Q["words"][X["q2"][idx[pos_ind]]] 372 | ]), 373 | self.seq_word_right: np.vstack([Q["words"][q2], 374 | Q["words"][X["q2"][idx[neg_ind]]], 375 | Q["words"][X["q2"][idx[pos_ind]]], 376 | Q["words"][q1], 377 | Q["words"][X["q1"][idx[pos_ind]]], 378 | Q["words"][X["q1"][idx[neg_ind]]], 379 | ]), 380 | self.seq_char_left: np.vstack([Q["chars"][q1], 381 | Q["chars"][X["q1"][idx[pos_ind]]], 382 | Q["chars"][X["q1"][idx[neg_ind]]], 383 | Q["chars"][q2], 384 | Q["chars"][X["q2"][idx[neg_ind]]], 385 | Q["chars"][X["q2"][idx[pos_ind]]] 386 | ]), 387 | self.seq_char_right: np.vstack([Q["chars"][q2], 388 | Q["chars"][X["q2"][idx[neg_ind]]], 389 | Q["chars"][X["q2"][idx[pos_ind]]], 390 | Q["chars"][q1], 391 | Q["chars"][X["q1"][idx[pos_ind]]], 392 | Q["chars"][X["q1"][idx[neg_ind]]] 393 | ]), 394 | self.labels: np.hstack([X["label"][idx], 395 | np.zeros(len(pos_ind)), 396 | np.zeros(len(pos_ind)), 397 | X["label"][idx], 398 | np.zeros(len(pos_ind)), 399 | np.zeros(len(pos_ind)) 400 | ]), 401 | self.training: training, 402 | } 403 | else: 404 | q1 = X["q1"][idx] 405 | q2 = X["q2"][idx] 406 | feed_dict = { 407 | self.seq_word_left: np.vstack([Q["words"][q1], 408 | Q["words"][q2], 409 | ]), 410 | self.seq_word_right: np.vstack([Q["words"][q2], 411 | Q["words"][q1], 412 | ]), 413 | self.seq_char_left: np.vstack([Q["chars"][q1], 414 | Q["chars"][q2], 415 | ]), 416 | self.seq_char_right: np.vstack([Q["chars"][q2], 417 | Q["chars"][q1], 418 | ]), 419 | self.seq_len_word_left: np.hstack([Q["seq_len_word"][q1], 420 | Q["seq_len_word"][q2], 421 
| ]), 422 | self.seq_len_word_right: np.hstack([Q["seq_len_word"][q2], 423 | Q["seq_len_word"][q1], 424 | ]), 425 | self.seq_len_char_left: np.hstack([Q["seq_len_char"][q1], 426 | Q["seq_len_char"][q2], 427 | ]), 428 | self.seq_len_char_right: np.hstack([Q["seq_len_char"][q2], 429 | Q["seq_len_char"][q1], 430 | ]), 431 | self.labels: np.hstack([X["label"][idx], 432 | X["label"][idx], 433 | ]), 434 | self.training: training, 435 | } 436 | if self.params["use_features"]: 437 | feed_dict.update({ 438 | self.features: np.vstack([X["features"][idx], 439 | X["features"][idx], 440 | ]), 441 | }) 442 | elif not symmetric: 443 | q1 = X["q1"][idx] 444 | q2 = X["q2"][idx] 445 | feed_dict = { 446 | self.seq_word_left: Q["words"][q1], 447 | self.seq_word_right: Q["words"][q2], 448 | self.seq_char_left: Q["chars"][q1], 449 | self.seq_char_right: Q["chars"][q2], 450 | self.seq_len_word_left: Q["seq_len_word"][q1], 451 | self.seq_len_word_right: Q["seq_len_word"][q2], 452 | self.seq_len_char_left: Q["seq_len_char"][q1], 453 | self.seq_len_char_right: Q["seq_len_char"][q2], 454 | self.labels: X["label"][idx], 455 | self.training: training, 456 | } 457 | if self.params["use_features"]: 458 | feed_dict.update({ 459 | self.features: X["features"][idx], 460 | }) 461 | else: 462 | q1 = X["q1"][idx] 463 | q2 = X["q2"][idx] 464 | feed_dict = { 465 | self.seq_word_left: np.vstack([Q["words"][q1], 466 | Q["words"][q2], 467 | ]), 468 | self.seq_word_right: np.vstack([Q["words"][q2], 469 | Q["words"][q1], 470 | ]), 471 | self.seq_char_left: np.vstack([Q["chars"][q1], 472 | Q["chars"][q2], 473 | ]), 474 | self.seq_char_right: np.vstack([Q["chars"][q2], 475 | Q["chars"][q1], 476 | ]), 477 | self.seq_len_word_left: np.hstack([Q["seq_len_word"][q1], 478 | Q["seq_len_word"][q2], 479 | ]), 480 | self.seq_len_word_right: np.hstack([Q["seq_len_word"][q2], 481 | Q["seq_len_word"][q1], 482 | ]), 483 | self.seq_len_char_left: np.hstack([Q["seq_len_char"][q1], 484 | Q["seq_len_char"][q2], 485 | ]), 486 | self.seq_len_char_right: np.hstack([Q["seq_len_char"][q2], 487 | Q["seq_len_char"][q1], 488 | ]), 489 | self.labels: np.hstack([X["label"][idx], 490 | X["label"][idx], 491 | ]), 492 | self.training: training, 493 | } 494 | if self.params["use_features"]: 495 | feed_dict.update({ 496 | self.features: np.vstack([X["features"][idx], 497 | X["features"][idx], 498 | ]), 499 | }) 500 | # augmentation 501 | if training: 502 | if self.params["augmentation_init_dropout"] > 0: 503 | self._dropout_augmentation(feed_dict) 504 | if self.params["augmentation_init_permutation"]: 505 | self._permutation_augmentation(feed_dict) 506 | 507 | return feed_dict 508 | 509 | 510 | def _dropout(self, val_arr, ind_arr, p, value): 511 | new_arr = np.array(val_arr) 512 | drop = np.empty(val_arr.shape, dtype=np.bool) 513 | for i in range(val_arr.shape[0]): 514 | drop[i, :ind_arr[i]] = np.random.choice([True, False], ind_arr[i], p=[p, 1 - p]) 515 | new_arr[drop] = value 516 | return new_arr 517 | 518 | 519 | def _dropout_augmentation(self, feed_dict): 520 | p = self.sess.run(self.augmentation_dropout) 521 | if p <= self.params["augmentation_min_dropout"]: 522 | return 523 | 524 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_word_left], 525 | ind_arr=feed_dict[self.seq_len_word_left], 526 | p=p, value=config.MISSING_INDEX_WORD) 527 | feed_dict[self.seq_word_left] = np.vstack([ 528 | feed_dict[self.seq_word_left], 529 | dropout_data, 530 | ]) 531 | 532 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_word_right], 533 | 
ind_arr=feed_dict[self.seq_len_word_right], 534 | p=p, value=config.MISSING_INDEX_WORD) 535 | feed_dict[self.seq_word_right] = np.vstack([ 536 | feed_dict[self.seq_word_right], 537 | dropout_data, 538 | ]) 539 | 540 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_char_left], 541 | ind_arr=feed_dict[self.seq_len_char_left], 542 | p=p, value=config.MISSING_INDEX_CHAR) 543 | feed_dict[self.seq_char_left] = np.vstack([ 544 | feed_dict[self.seq_char_left], 545 | dropout_data, 546 | ]) 547 | 548 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_char_right], 549 | ind_arr=feed_dict[self.seq_len_char_right], 550 | p=p, value=config.MISSING_INDEX_CHAR) 551 | feed_dict[self.seq_char_right] = np.vstack([ 552 | feed_dict[self.seq_char_right], 553 | dropout_data, 554 | ]) 555 | 556 | # double others 557 | feed_dict[self.seq_len_word_left] = np.tile(feed_dict[self.seq_len_word_left], 2) 558 | feed_dict[self.seq_len_word_right] = np.tile(feed_dict[self.seq_len_word_right], 2) 559 | feed_dict[self.seq_len_char_left] = np.tile(feed_dict[self.seq_len_char_left], 2) 560 | feed_dict[self.seq_len_char_right] = np.tile(feed_dict[self.seq_len_char_right], 2) 561 | feed_dict[self.labels] = np.tile(feed_dict[self.labels], 2) 562 | if self.params["use_features"]: 563 | feed_dict[self.features] = np.tile(feed_dict[self.features], [2, 1]) 564 | 565 | 566 | def _permutation(self, val_arr, ind_arr, p): 567 | if np.random.random() < p: 568 | new_arr = np.array(val_arr) 569 | for i in range(val_arr.shape[0]): 570 | new_arr[i, :ind_arr[i]] = np.random.permutation(new_arr[i,:ind_arr[i]]) 571 | return new_arr 572 | else: 573 | return val_arr 574 | 575 | 576 | def _permutation_augmentation(self, feed_dict): 577 | p = self.sess.run(self.augmentation_permutation) 578 | if p <= self.params["augmentation_min_permutation"]: 579 | return 580 | 581 | feed_dict[self.seq_word_left] = np.vstack([ 582 | feed_dict[self.seq_word_left], 583 | self._permutation(feed_dict[self.seq_word_left], feed_dict[self.seq_len_word_left], p), 584 | ]) 585 | feed_dict[self.seq_word_right] = np.vstack([ 586 | feed_dict[self.seq_word_right], 587 | self._permutation(feed_dict[self.seq_word_right], feed_dict[self.seq_len_word_right], p), 588 | ]) 589 | feed_dict[self.seq_char_left] = np.vstack([ 590 | feed_dict[self.seq_char_left], 591 | self._permutation(feed_dict[self.seq_char_left], feed_dict[self.seq_len_char_left], p), 592 | ]) 593 | feed_dict[self.seq_char_right] = np.vstack([ 594 | feed_dict[self.seq_char_right], 595 | self._permutation(feed_dict[self.seq_char_right], feed_dict[self.seq_len_char_right], p), 596 | ]) 597 | # double others 598 | feed_dict[self.seq_len_word_left] = np.tile(feed_dict[self.seq_len_word_left], 2) 599 | feed_dict[self.seq_len_word_right] = np.tile(feed_dict[self.seq_len_word_right], 2) 600 | feed_dict[self.seq_len_char_left] = np.tile(feed_dict[self.seq_len_char_left], 2) 601 | feed_dict[self.seq_len_char_right] = np.tile(feed_dict[self.seq_len_char_right], 2) 602 | feed_dict[self.labels] = np.tile(feed_dict[self.labels], 2) 603 | if self.params["use_features"]: 604 | feed_dict[self.features] = np.tile(feed_dict[self.features], [2, 1]) 605 | 606 | 607 | def fit(self, X, Q, validation_data=None, shuffle=False, total_epoch=None): 608 | start_time = time.time() 609 | l = X["label"].shape[0] 610 | self.logger.info("fit on %d sample" % l) 611 | self.logger.info("max_batch: %d" % self.params["max_batch"]) 612 | if validation_data is not None: 613 | self.logger.info("mean: %.5f"%np.mean(validation_data["label"])) 
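        # each training batch is fed symmetrically by _get_feed_dict, i.e. every pair appears as both
        # (q1, q2) and (q2, q1); the train-loss logged below is an exponential moving average of the
        # per-batch loss (loss_decay = 0.9), not a raw per-batch value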
614 | train_idx_shuffle = np.arange(l) 615 | total_loss = 0. 616 | loss_decay = 0.9 617 | global_step = self.sess.run(self.global_step) 618 | if total_epoch is None: 619 | total_epoch = self.params["epoch"] 620 | for epoch in range(total_epoch): 621 | self.logger.info("epoch: %d" % (epoch + 1)) 622 | np.random.seed(epoch) 623 | if shuffle: 624 | np.random.shuffle(train_idx_shuffle) 625 | dropout_p = self.sess.run(self.augmentation_dropout) 626 | batch_size = self.params["batch_size"] 627 | if dropout_p <= self.params["augmentation_min_dropout"]: 628 | batch_size *= 2 629 | batches = self._get_batch_index(train_idx_shuffle, batch_size) 630 | for i, idx in enumerate(batches): 631 | feed_dict = self._get_feed_dict(X, idx, Q, construct_neg=self.params["construct_neg"], training=True) 632 | loss, lr, opt, summary, global_step = self.sess.run((self.loss, self.learning_rate, self.train_op, self.summary, self.global_step), feed_dict=feed_dict) 633 | self.train_writer.add_summary(summary, global_step) 634 | total_loss = loss_decay * total_loss + (1. - loss_decay) * loss 635 | if validation_data is not None and (self.params["eval_every_num_update"] > 0) and (global_step % self.params["eval_every_num_update"] == 0): 636 | y_valid = validation_data["label"] 637 | y_proba, y_proba_cal = self._predict_proba(validation_data, Q, fit_calibration=self.params["calibration"]) 638 | valid_loss = log_loss(y_valid, y_proba, eps=1e-15) 639 | valid_loss_cal = log_loss(y_valid, y_proba_cal, eps=1e-15) 640 | summary = tf.Summary() 641 | summary.value.add(tag="logloss", simple_value=valid_loss) 642 | self.test_writer.add_summary(summary, global_step) 643 | self.logger.info( 644 | "[epoch-%d, batch-%d] train-loss=%.5f, valid-loss=%.5f, valid-loss-cal=%.5f, valid-proba=%.5f, predict-proba=%.5f, predict-proba-cal=%.5f, lr=%.5f [%.1f s]" % ( 645 | epoch + 1, global_step, total_loss, valid_loss, valid_loss_cal, 646 | np.mean(y_valid), np.mean(y_proba), np.mean(y_proba_cal), lr, time.time() - start_time)) 647 | else: 648 | self.logger.info("[epoch-%d, batch-%d] train-loss=%.5f, lr=%.5f [%.1f s]" % ( 649 | epoch + 1, global_step, total_loss, 650 | lr, time.time() - start_time)) 651 | if global_step >= self.params["max_batch"] and self.params["max_batch"] > 0: 652 | break 653 | if global_step >= self.params["max_batch"] and self.params["max_batch"] > 0: 654 | break 655 | 656 | 657 | def _predict_node(self, X, Q, node): 658 | l = X["label"].shape[0] 659 | train_idx = np.arange(l) 660 | batches = self._get_batch_index(train_idx, self.params["batch_size"]) 661 | y_pred = [] 662 | y_pred_append = y_pred.append 663 | for idx in batches: 664 | feed_dict = self._get_feed_dict(X, idx, Q, training=False, symmetric=True) 665 | pred = self.sess.run(node, feed_dict=feed_dict) 666 | n = int(pred.shape[0]/2) 667 | pred = (pred[:n] + pred[n:])/2. 
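            # the symmetric feed dict stacks (q1, q2) on top of (q2, q1), so the two halves of `pred`
            # score the same pairs in both orders; averaging the halves makes the prediction order-invariant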
668 | y_pred_append(pred) 669 | y_pred = np.hstack(y_pred).reshape((-1, 1)).astype(np.float64) 670 | return y_pred 671 | 672 | 673 | def _predict_proba(self, X, Q, fit_calibration=False): 674 | y_logit = self._predict_node(X, Q, self.logits) 675 | y_proba = sigmoid(y_logit) 676 | y_proba_cal = y_proba 677 | if fit_calibration: 678 | y_valid = X["label"] 679 | self.calibration_model = LogisticRegression() 680 | self.calibration_model.fit(y_logit, y_valid) 681 | if self.calibration_model is not None: 682 | y_proba_cal = self.calibration_model.predict_proba(y_logit)[:,1] 683 | return y_proba, y_proba_cal 684 | 685 | 686 | def predict_proba(self, X, Q): 687 | _, y_proba_cal = self._predict_proba(X, Q, fit_calibration=False) 688 | return y_proba_cal 689 | 690 | 691 | def predict(self, X, Q): 692 | proba = self.predict_proba(X, Q) 693 | y = np.array(proba > self.threshold, dtype=int) 694 | return y 695 | -------------------------------------------------------------------------------- /src/models/bcnn.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from inputs.dynamic_pooling import dynamic_pooling_index 7 | from models.base_model import BaseModel 8 | from tf_common import metrics 9 | 10 | 11 | class BCNNBaseModel(BaseModel): 12 | def __init__(self, params, logger, init_embedding_matrix): 13 | super(BCNNBaseModel, self).__init__(params, logger, init_embedding_matrix) 14 | 15 | 16 | def _init_tf_vars(self): 17 | super(BCNNBaseModel, self)._init_tf_vars() 18 | self.dpool_index_word = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_word"], 19 | self.params["max_seq_len_word"], 3], 20 | name="dpool_index_word") 21 | self.dpool_index_char = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_char"], 22 | self.params["max_seq_len_char"], 3], 23 | name="dpool_index_char") 24 | 25 | 26 | def _padding(self, x, name): 27 | # x: [batch, s, d, 1] 28 | # x => [batch, s+w*2-2, d, 1] 29 | w = self.params["bcnn_filter_size"] 30 | return tf.pad(x, np.array([[0, 0], [w - 1, w - 1], [0, 0], [0, 0]]), "CONSTANT", name) 31 | 32 | 33 | def _make_attention_matrix(self, x1, x2): 34 | # x1: [batch, s1, d, 1] 35 | # x2: [batch, s2, d, 1] 36 | # match score 37 | if "euclidean" in self.params["bcnn_match_score_type"]: 38 | # x1 => [batch, s1, 1, d] 39 | # x2 => [batch, 1, s2, d] 40 | x1_ = tf.transpose(x1, perm=[0, 1, 3, 2]) 41 | x2_ = tf.transpose(x2, perm=[0, 3, 1, 2]) 42 | euclidean = tf.sqrt(tf.reduce_sum(tf.square(x1_ - x2_), axis=-1)) 43 | if "exp" in self.params["bcnn_match_score_type"]: 44 | # exp(-euclidean / (2. * beta)) (can produce nan) 45 | # from Convolutional Neural Network for Paraphrase Identification 46 | beta = 2. 47 | att = tf.exp(-euclidean / (2. * beta)) 48 | else: 49 | # euclidean distance (can produce nan) 50 | att = 1. / (1. + euclidean) 51 | elif self.params["bcnn_match_score_type"] == "cosine": 52 | # cosine similarity 53 | x1_ = tf.nn.l2_normalize(x1, dim=2) 54 | x2_ = tf.nn.l2_normalize(x2, dim=2) 55 | sim = tf.einsum("abcd,aecd->abe", x1_, x2_) # value in [-1, 1] 56 | att = (1. + sim) / 2.
# value in [0, 1] 57 | return att 58 | 59 | 60 | def _convolution(self, x, d, name, reuse=False): 61 | # conv: [batch, s+w-1, 1, d] 62 | conv = tf.layers.conv2d( 63 | inputs=x, 64 | filters=self.params["bcnn_num_filters"], 65 | kernel_size=(self.params["bcnn_filter_size"], d), 66 | padding="valid", 67 | activation=self.params["bcnn_activation"], 68 | strides=1, 69 | reuse=reuse, 70 | name=name) 71 | 72 | # [batch, s+w-1, d, 1] 73 | return tf.transpose(conv, perm=[0, 1, 3, 2]) 74 | 75 | 76 | def _w_ap(self, x, attention, name): 77 | # x: [batch, s+w-1, d, 1] 78 | # attention: [batch, s+w-1] 79 | if attention is not None: 80 | attention = tf.expand_dims(tf.expand_dims(attention, axis=-1), axis=-1) 81 | x2 = x * attention 82 | else: 83 | x2 = x 84 | w_ap = tf.layers.average_pooling2d( 85 | inputs=x2, 86 | pool_size=(self.params["bcnn_filter_size"], 1), 87 | strides=1, 88 | padding="valid", 89 | name=name) 90 | if attention is not None: 91 | w_ap = w_ap * self.params["bcnn_filter_size"] 92 | 93 | return w_ap 94 | 95 | 96 | def _all_ap(self, x, seq_len, name): 97 | if "input" in name: 98 | pool_width = seq_len 99 | d = self.params["embedding_dim"] 100 | else: 101 | pool_width = seq_len + self.params["bcnn_filter_size"] - 1 102 | d = self.params["bcnn_num_filters"] 103 | 104 | all_ap = tf.layers.average_pooling2d( 105 | inputs=x, 106 | pool_size=(pool_width, 1), 107 | strides=1, 108 | padding="valid", 109 | name=name) 110 | all_ap_reshaped = tf.reshape(all_ap, [-1, d]) 111 | 112 | return all_ap_reshaped 113 | 114 | 115 | def _expand_input(self, x1, x2, att_mat, seq_len, d, name): 116 | # att_mat: [batch, s, s] 117 | aW = tf.get_variable(name=name, shape=(seq_len, d)) 118 | 119 | # [batch, s, s] * [s,d] => [batch, s, d] 120 | # expand dims => [batch, s, d, 1] 121 | x1_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", att_mat, aW), -1) 122 | x2_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW), -1) 123 | 124 | # [batch, s, d, 2] 125 | x1 = tf.concat([x1, x1_a], axis=3) 126 | x2 = tf.concat([x2, x2_a], axis=3) 127 | 128 | return x1, x2 129 | 130 | 131 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index, granularity="word"): 132 | return None, None, None, None, None 133 | 134 | 135 | def _mp_cnn_layer(self, cross, dpool_index, filters, kernel_size, pool_size, strides, name): 136 | cross_conv = tf.layers.conv2d( 137 | inputs=cross, 138 | filters=filters, 139 | kernel_size=kernel_size, 140 | padding="same", 141 | activation=self.params["bcnn_mp_activation"], 142 | strides=1, 143 | reuse=False, 144 | name=name+"cross_conv") 145 | if self.params["bcnn_mp_dynamic_pooling"] and dpool_index is not None: 146 | cross_conv = tf.gather_nd(cross_conv, dpool_index) 147 | cross_pool = tf.layers.max_pooling2d( 148 | inputs=cross_conv, 149 | pool_size=pool_size, 150 | strides=strides, 151 | padding="valid", 152 | name=name+"cross_pool") 153 | return cross_pool 154 | 155 | def _bcnn_semantic_feature_layer(self, seq_left, seq_right, dpool_index=None, granularity="word"): 156 | name = self.model_name + granularity 157 | seq_len = self.params["max_seq_len_%s" % granularity] 158 | # [batch, s, d] => [batch, s, d, 1] 159 | seq_left = tf.expand_dims(seq_left, axis=-1) 160 | seq_right = tf.expand_dims(seq_right, axis=-1) 161 | 162 | left_ap_list = [None] * (self.params["bcnn_num_layers"] + 1) 163 | right_ap_list = [None] * (self.params["bcnn_num_layers"] + 1) 164 | left_ap_list[0] = self._all_ap(x=seq_left, seq_len=seq_len, name=name + "global_pooling_input_left") 165 | right_ap_list[0] 
= self._all_ap(x=seq_right, seq_len=seq_len, name=name + "global_pooling_input_right") 166 | 167 | x1 = seq_left 168 | x2 = seq_right 169 | d = self.params["embedding_dim"] 170 | outputs = [] 171 | for layer in range(self.params["bcnn_num_layers"]): 172 | x1, left_ap_list[layer + 1], x2, right_ap_list[layer + 1], att_pooled = self._bcnn_cnn_layer(x1=x1, x2=x2, 173 | seq_len=seq_len, 174 | d=d, 175 | name=name + "cnn_layer_%d" % ( 176 | layer + 1), 177 | dpool_index=dpool_index, 178 | granularity=granularity) 179 | d = self.params["bcnn_num_filters"] 180 | if self.params["bcnn_mp_att_pooling"] and att_pooled is not None: 181 | outputs.append(att_pooled) 182 | 183 | for l, r in zip(left_ap_list, right_ap_list): 184 | outputs.append(metrics.cosine_similarity(l, r, self.params["similarity_aggregation"])) 185 | outputs.append(metrics.dot_product(l, r, self.params["similarity_aggregation"])) 186 | outputs.append(metrics.euclidean_distance(l, r, self.params["similarity_aggregation"])) 187 | return tf.concat(outputs, axis=-1) 188 | 189 | 190 | def _get_attention_matrix_pooled_features(self, att_mat, seq_len, dpool_index, granularity, name): 191 | # get attention matrix pooled features (as in sec. 5.3.1) 192 | att_mat0 = tf.expand_dims(att_mat, axis=3) 193 | # conv-pool layer 1 194 | filters = self.params["bcnn_mp_num_filters"][0] 195 | kernel_size = self.params["bcnn_mp_filter_sizes"][0] 196 | # seq_len = seq_len + self.params["bcnn_filter_size"] - 1 197 | pool_size0 = self.params["bcnn_mp_pool_sizes_%s" % granularity][0] 198 | pool_sizes = [seq_len / pool_size0, seq_len / pool_size0] 199 | strides = [seq_len / pool_size0, seq_len / pool_size0] 200 | conv1 = self._mp_cnn_layer(att_mat0, dpool_index, filters, kernel_size, pool_sizes, strides, 201 | name=self.model_name + name + granularity + "1") 202 | conv1_flatten = tf.reshape(conv1, [-1, self.params["mp_num_filters"][0] * (pool_size0 * pool_size0)]) 203 | 204 | # conv-pool layer 2 205 | filters = self.params["bcnn_mp_num_filters"][1] 206 | kernel_size = self.params["bcnn_mp_filter_sizes"][1] 207 | pool_size1 = self.params["bcnn_mp_pool_sizes_%s" % granularity][1] 208 | pool_sizes = [pool_size0 / pool_size1, pool_size0 / pool_size1] 209 | strides = [pool_size0 / pool_size1, pool_size0 / pool_size1] 210 | conv2 = self._mp_cnn_layer(conv1, None, filters, kernel_size, pool_sizes, strides, 211 | name=self.model_name + name + granularity + "2") 212 | conv2_flatten = tf.reshape(conv2, [-1, self.params["mp_num_filters"][1] * (pool_size1 * pool_size1)]) 213 | 214 | return conv2_flatten 215 | 216 | 217 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 218 | feed_dict = super(BCNNBaseModel, self)._get_feed_dict(X, idx, Q, construct_neg, training, symmetric) 219 | if self.params["mp_dynamic_pooling"]: 220 | dpool_index_word = dynamic_pooling_index(feed_dict[self.seq_len_word_left], 221 | feed_dict[self.seq_len_word_right], 222 | self.params["max_seq_len_word"], 223 | self.params["max_seq_len_word"]) 224 | dpool_index_char = dynamic_pooling_index(feed_dict[self.seq_len_char_left], 225 | feed_dict[self.seq_len_char_right], 226 | self.params["max_seq_len_char"], 227 | self.params["max_seq_len_char"]) 228 | feed_dict.update({ 229 | self.dpool_index_word: dpool_index_word, 230 | self.dpool_index_char: dpool_index_char, 231 | }) 232 | return feed_dict 233 | 234 | 235 | def _get_matching_features(self): 236 | with tf.name_scope(self.model_name): 237 | tf.set_random_seed(self.params["random_seed"]) 238 | 239 | with 
tf.name_scope("word_network"): 240 | if self.params["attend_method"] == "context-attention": 241 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 242 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 243 | self._interaction_semantic_feature_layer( 244 | self.seq_word_left, 245 | self.seq_word_right, 246 | self.seq_len_word_left, 247 | self.seq_len_word_right, 248 | granularity="word") 249 | else: 250 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 251 | self._semantic_feature_layer( 252 | self.seq_word_left, 253 | self.seq_len_word_left, 254 | granularity="word", reuse=False) 255 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 256 | self._semantic_feature_layer( 257 | self.seq_word_right, 258 | self.seq_len_word_right, 259 | granularity="word", reuse=True) 260 | sim_word = self._bcnn_semantic_feature_layer(emb_seq_word_left, emb_seq_word_right, self.dpool_index_word, granularity="word") 261 | 262 | with tf.name_scope("char_network"): 263 | if self.params["attend_method"] == "context-attention": 264 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 265 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 266 | self._interaction_semantic_feature_layer( 267 | self.seq_char_left, 268 | self.seq_char_right, 269 | self.seq_len_char_left, 270 | self.seq_len_char_right, 271 | granularity="char") 272 | else: 273 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 274 | self._semantic_feature_layer( 275 | self.seq_char_left, 276 | self.seq_len_char_left, 277 | granularity="char", reuse=False) 278 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 279 | self._semantic_feature_layer( 280 | self.seq_char_right, 281 | self.seq_len_char_right, 282 | granularity="char", reuse=True) 283 | sim_char = self._bcnn_semantic_feature_layer(emb_seq_char_left, emb_seq_char_right, self.dpool_index_char, granularity="char") 284 | 285 | with tf.name_scope("matching_features"): 286 | matching_features_word = sim_word 287 | matching_features_char = sim_char 288 | 289 | return matching_features_word, matching_features_char 290 | 291 | 292 | class BCNN(BCNNBaseModel): 293 | def __init__(self, params, logger, init_embedding_matrix): 294 | p = copy(params) 295 | p["model_name"] = p["model_name"] + "bcnn" 296 | super(BCNN, self).__init__(p, logger, init_embedding_matrix) 297 | 298 | 299 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 300 | # x1, x2 = [batch, s, d, 1] 301 | # att_mat0: [batch, s, s] 302 | att_mat0 = self._make_attention_matrix(x1, x2) 303 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 304 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 305 | 306 | left_attention, right_attention = None, None 307 | 308 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 309 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 310 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 311 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 312 | 313 | # get attention matrix pooled features (as in sec. 
5.3.1) 314 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 315 | 316 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 317 | 318 | 319 | class ABCNN1(BCNNBaseModel): 320 | def __init__(self, params, logger, init_embedding_matrix): 321 | p = copy(params) 322 | p["model_name"] = p["model_name"] + "abcnn1" 323 | super(ABCNN1, self).__init__(p, logger, init_embedding_matrix) 324 | 325 | 326 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 327 | # x1, x2 = [batch, s, d, 1] 328 | # att_mat0: [batch, s, s] 329 | att_mat0 = self._make_attention_matrix(x1, x2) 330 | x1, x2 = self._expand_input(x1, x2, att_mat0, seq_len, d, name=name+"expand_input") 331 | 332 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 333 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 334 | 335 | left_attention, right_attention = None, None 336 | 337 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 338 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 339 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 340 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 341 | 342 | # get attention matrix pooled features (as in sec. 5.3.1) 343 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 344 | 345 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 346 | 347 | 348 | class ABCNN2(BCNNBaseModel): 349 | def __init__(self, params, logger, init_embedding_matrix): 350 | p = copy(params) 351 | p["model_name"] = p["model_name"] + "abcnn2" 352 | super(ABCNN2, self).__init__(p, logger, init_embedding_matrix) 353 | 354 | 355 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 356 | # x1, x2 = [batch, s, d, 1] 357 | att_mat0 = self._make_attention_matrix(x1, x2) 358 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 359 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 360 | 361 | # [batch, s+w-1, s+w-1] 362 | att_mat1 = self._make_attention_matrix(left_conv, right_conv) 363 | # [batch, s+w-1], [batch, s+w-1] 364 | left_attention, right_attention = tf.reduce_sum(att_mat1, axis=2), tf.reduce_sum(att_mat1, axis=1) 365 | 366 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 367 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 368 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 369 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 370 | 371 | # get attention matrix pooled features (as in sec. 
5.3.1) 372 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 373 | 374 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 375 | 376 | 377 | class ABCNN3(BCNNBaseModel): 378 | def __init__(self, params, logger, init_embedding_matrix): 379 | p = copy(params) 380 | p["model_name"] = p["model_name"] + "abcnn3" 381 | super(ABCNN3, self).__init__(p, logger, init_embedding_matrix) 382 | 383 | 384 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 385 | # x1, x2 = [batch, s, d, 1] 386 | # att_mat0: [batch, s, s 387 | att_mat0 = self._make_attention_matrix(x1, x2) 388 | x1, x2 = self._expand_input(x1, x2, att_mat0, seq_len, d, name=name + "expand_input") 389 | 390 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 391 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 392 | 393 | # [batch, s+w-1, s+w-1] 394 | att_mat1 = self._make_attention_matrix(left_conv, right_conv) 395 | # [batch, s+w-1], [batch, s+w-1] 396 | left_attention, right_attention = tf.reduce_sum(att_mat1, axis=2), tf.reduce_sum(att_mat1, axis=1) 397 | 398 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 399 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 400 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 401 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 402 | 403 | # get attention matrix pooled features (as in sec. 5.3.1) 404 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 405 | 406 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 407 | -------------------------------------------------------------------------------- /src/models/decatt.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | 4 | from models.esim import ESIMDecAttBaseModel 5 | 6 | 7 | class DecAtt(ESIMDecAttBaseModel): 8 | def __init__(self, params, logger, init_embedding_matrix=None): 9 | p = copy(params) 10 | # model config 11 | p.update({ 12 | "model_name": p["model_name"] + "dec_att", 13 | "encode_method": "project", 14 | "attend_method": ["ave", "max", "min", "self-attention"], 15 | 16 | "project_type": "fc", 17 | "project_hidden_units": [64 * 4, 64 * 2, 64], 18 | "project_dropouts": [0, 0, 0], 19 | 20 | # fc block 21 | "fc_type": "fc", 22 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 23 | "fc_dropouts": [0, 0, 0], 24 | }) 25 | super(DecAtt, self).__init__(p, logger, init_embedding_matrix) 26 | -------------------------------------------------------------------------------- /src/models/dsmm.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from models.bcnn import BCNN, ABCNN1, ABCNN2, ABCNN3 6 | from models.esim import ESIMDecAttBaseModel 7 | from models.match_pyramid import MatchPyramidBaseModel 8 | from tf_common import metrics 9 | from tf_common.nn_module import mlp_layer 10 | 11 | 12 | class DSMM(MatchPyramidBaseModel, ESIMDecAttBaseModel, BCNN): 13 | def __init__(self, params, logger, init_embedding_matrix=None): 14 | p = copy(params) 15 | p["model_name"] = 
p["model_name"] + "dsmm" 16 | super(DSMM, self).__init__(p, logger, init_embedding_matrix) 17 | 18 | 19 | def _get_matching_features(self): 20 | with tf.name_scope(self.model_name): 21 | tf.set_random_seed(self.params["random_seed"]) 22 | 23 | with tf.name_scope("word_network"): 24 | if self.params["attend_method"] == "context-attention": 25 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 26 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 27 | self._interaction_semantic_feature_layer( 28 | self.seq_word_left, 29 | self.seq_word_right, 30 | self.seq_len_word_left, 31 | self.seq_len_word_right, 32 | granularity="word") 33 | else: 34 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 35 | self._semantic_feature_layer( 36 | self.seq_word_left, 37 | self.seq_len_word_left, 38 | granularity="word", reuse=False) 39 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 40 | self._semantic_feature_layer( 41 | self.seq_word_right, 42 | self.seq_len_word_right, 43 | granularity="word", reuse=True) 44 | 45 | #### matching 46 | # match score 47 | sim_word = tf.concat([ 48 | metrics.cosine_similarity(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 49 | metrics.dot_product(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 50 | metrics.euclidean_distance(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 51 | # self._canberra_score(sem_seq_word_left, sem_seq_word_right), 52 | ], axis=-1) 53 | 54 | # match pyramid 55 | match_matrix_word = self._get_match_matrix(self.seq_word_left, emb_seq_word_left, enc_seq_word_left, 56 | self.seq_word_right, emb_seq_word_right, enc_seq_word_right, 57 | granularity="word") 58 | mp_word = self._mp_semantic_feature_layer(match_matrix_word, 59 | self.dpool_index_word, 60 | granularity="word") 61 | 62 | # esim 63 | esim_word = self._esim_semantic_feature_layer(emb_seq_word_left, 64 | emb_seq_word_right, 65 | self.seq_len_word_left, 66 | self.seq_len_word_right, 67 | granularity="word") 68 | 69 | # bcnn 70 | bcnn_word = self._bcnn_semantic_feature_layer(emb_seq_word_left, 71 | emb_seq_word_right, 72 | granularity="word") 73 | 74 | # dense 75 | deep_in_word = tf.concat([sem_seq_word_left, sem_seq_word_right], axis=-1) 76 | deep_word = mlp_layer(deep_in_word, fc_type=self.params["fc_type"], 77 | hidden_units=self.params["fc_hidden_units"], 78 | dropouts=self.params["fc_dropouts"], 79 | scope_name=self.model_name + "deep_word", 80 | reuse=False, 81 | training=self.training, 82 | seed=self.params["random_seed"]) 83 | 84 | with tf.name_scope("char_network"): 85 | if self.params["attend_method"] == "context-attention": 86 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 87 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 88 | self._interaction_semantic_feature_layer( 89 | self.seq_char_left, 90 | self.seq_char_right, 91 | self.seq_len_char_left, 92 | self.seq_len_char_right, 93 | granularity="char") 94 | else: 95 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 96 | self._semantic_feature_layer( 97 | self.seq_char_left, 98 | self.seq_len_char_left, 99 | granularity="char", reuse=False) 100 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 101 | self._semantic_feature_layer( 102 | self.seq_char_right, 103 | self.seq_len_char_right, 104 | 
granularity="char", reuse=True) 105 | 106 | # match score 107 | sim_char = tf.concat([ 108 | metrics.cosine_similarity(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 109 | metrics.dot_product(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 110 | metrics.euclidean_distance(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 111 | # self._canberra_score(sem_seq_char_left, sem_seq_char_right), 112 | ], axis=-1) 113 | 114 | # match pyramid 115 | match_matrix_char = self._get_match_matrix(self.seq_char_left, emb_seq_char_left, enc_seq_char_left, 116 | self.seq_char_right, emb_seq_char_right, enc_seq_char_right, 117 | granularity="char") 118 | mp_char = self._mp_semantic_feature_layer(match_matrix_char, 119 | self.dpool_index_char, 120 | granularity="char") 121 | 122 | # esim 123 | esim_char = self._esim_semantic_feature_layer(emb_seq_char_left, 124 | emb_seq_char_right, 125 | self.seq_len_char_left, 126 | self.seq_len_char_right, 127 | granularity="char") 128 | 129 | # bcnn 130 | bcnn_char = self._bcnn_semantic_feature_layer(emb_seq_char_left, 131 | emb_seq_char_right, 132 | granularity="char") 133 | 134 | # dense 135 | deep_in_char = tf.concat([sem_seq_char_left, sem_seq_char_right], axis=-1) 136 | deep_char = mlp_layer(deep_in_char, fc_type=self.params["fc_type"], 137 | hidden_units=self.params["fc_hidden_units"], 138 | dropouts=self.params["fc_dropouts"], 139 | scope_name=self.model_name + "deep_char", 140 | reuse=False, 141 | training=self.training, 142 | seed=self.params["random_seed"]) 143 | 144 | with tf.name_scope("matching_features"): 145 | matching_features_word = tf.concat([ 146 | sim_word, mp_word, esim_word, bcnn_word, deep_word,# sem_seq_word_left, sem_seq_word_right, 147 | ], axis=-1) 148 | matching_features_char = tf.concat([ 149 | sim_char, mp_char, esim_char, bcnn_char, deep_char,# sem_seq_char_left, sem_seq_char_right, 150 | ], axis=-1) 151 | 152 | return matching_features_word, matching_features_char 153 | -------------------------------------------------------------------------------- /src/models/dssm.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from models.base_model import BaseModel 6 | from tf_common import metrics 7 | 8 | 9 | class DSSMBaseModel(BaseModel): 10 | def __init__(self, params, logger, init_embedding_matrix=None): 11 | super(DSSMBaseModel, self).__init__(params, logger, init_embedding_matrix) 12 | 13 | 14 | def _get_matching_features(self): 15 | with tf.name_scope(self.model_name): 16 | tf.set_random_seed(self.params["random_seed"]) 17 | 18 | with tf.name_scope("word_network"): 19 | if self.params["attend_method"] == "context-attention": 20 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 21 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 22 | self._interaction_semantic_feature_layer( 23 | self.seq_word_left, 24 | self.seq_word_right, 25 | self.seq_len_word_left, 26 | self.seq_len_word_right, 27 | granularity="word") 28 | else: 29 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 30 | self._semantic_feature_layer( 31 | self.seq_word_left, 32 | self.seq_len_word_left, 33 | granularity="word", reuse=False) 34 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 35 | self._semantic_feature_layer( 36 | self.seq_word_right, 37 | 
self.seq_len_word_right, 38 | granularity="word", reuse=True) 39 | # match score 40 | sim_word = tf.concat([ 41 | metrics.cosine_similarity(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 42 | metrics.dot_product(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 43 | metrics.euclidean_distance(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 44 | # self._canberra_score(sem_seq_word_left, sem_seq_word_right), 45 | ], axis=-1) 46 | 47 | with tf.name_scope("char_network"): 48 | if self.params["attend_method"] == "context-attention": 49 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 50 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 51 | self._interaction_semantic_feature_layer( 52 | self.seq_char_left, 53 | self.seq_char_right, 54 | self.seq_len_char_left, 55 | self.seq_len_char_right, 56 | granularity="char") 57 | else: 58 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 59 | self._semantic_feature_layer( 60 | self.seq_char_left, 61 | self.seq_len_char_left, 62 | granularity="char", reuse=False) 63 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 64 | self._semantic_feature_layer( 65 | self.seq_char_right, 66 | self.seq_len_char_right, 67 | granularity="char", reuse=True) 68 | # match score 69 | sim_char = tf.concat([ 70 | metrics.cosine_similarity(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 71 | metrics.dot_product(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 72 | metrics.euclidean_distance(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 73 | # self._canberra_score(sem_seq_char_left, sem_seq_char_right), 74 | ], axis=-1) 75 | 76 | with tf.name_scope("matching_features"): 77 | matching_features_word = sim_word 78 | matching_features_char = sim_char 79 | 80 | return matching_features_word, matching_features_char 81 | 82 | 83 | class DSSM(DSSMBaseModel): 84 | def __init__(self, params, logger, init_embedding_matrix=None): 85 | p = copy(params) 86 | # model config 87 | p.update({ 88 | "model_name": p["model_name"] + "dssm", 89 | "encode_method": "fasttext", 90 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 91 | 92 | # fc block 93 | "fc_type": "fc", 94 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 95 | "fc_dropouts": [0, 0, 0], 96 | }) 97 | super(DSSM, self).__init__(p, logger, init_embedding_matrix) 98 | 99 | 100 | class CDSSM(DSSMBaseModel): 101 | def __init__(self, params, logger, init_embedding_matrix=None): 102 | p = copy(params) 103 | # model config 104 | p.update({ 105 | "model_name": p["model_name"] + "cdssm", 106 | "encode_method": "textcnn", 107 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 108 | 109 | # cnn 110 | "cnn_num_layers": 1, 111 | "cnn_num_filters": 32, 112 | "cnn_filter_sizes": [1, 2, 3], 113 | "cnn_timedistributed": False, 114 | "cnn_activation": tf.nn.relu, 115 | "cnn_gated_conv": False, 116 | "cnn_residual": False, 117 | 118 | # fc block 119 | "fc_type": "fc", 120 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 121 | "fc_dropouts": [0, 0, 0], 122 | }) 123 | super(CDSSM, self).__init__(p, logger, init_embedding_matrix) 124 | 125 | 126 | class RDSSM(DSSMBaseModel): 127 | def __init__(self, params, logger, init_embedding_matrix=None): 128 | p = copy(params) 129 | # model config 130 | p.update({ 131 | "model_name": 
p["model_name"] + "rdssm", 132 | "encode_method": "textbirnn", 133 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 134 | 135 | # rnn 136 | "rnn_num_units": 32, 137 | "rnn_cell_type": "gru", 138 | "rnn_num_layers": 1, 139 | 140 | # fc block 141 | "fc_type": "fc", 142 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 143 | "fc_dropouts": [0, 0, 0], 144 | }) 145 | super(RDSSM, self).__init__(p, logger, init_embedding_matrix) 146 | -------------------------------------------------------------------------------- /src/models/esim.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from models.base_model import BaseModel 7 | from tf_common.nn_module import word_dropout 8 | from tf_common.nn_module import encode, attend 9 | 10 | 11 | class ESIMDecAttBaseModel(BaseModel): 12 | """ 13 | Implementation of base model of ESIM and DecAtt 14 | The difference between them lies in the encoder they use. 15 | - ESIM: BiLSTM 16 | - DecAtt: timedistributed dense projection 17 | 18 | Reference 19 | Paper: 20 | - ESIM: Enhanced LSTM for Natural Language Inference 21 | - DecAtt: A Decomposable Attention Model for Natural Language Inference 22 | Keras: 23 | https://www.kaggle.com/lamdang/dl-models 24 | Pytorch: 25 | https://github.com/lanwuwei/SPM_toolkit 26 | """ 27 | def __init__(self, params, logger, init_embedding_matrix=None): 28 | super(ESIMDecAttBaseModel, self).__init__(params, logger, init_embedding_matrix) 29 | 30 | 31 | def _soft_attention_alignment(self, x1, x2): 32 | "Align text representation with neural soft attention" 33 | # x1: [b, s1, d] 34 | # x2: [b, s2, d] 35 | # att: [b, s1, s2] 36 | att = tf.einsum("abd,acd->abc", x1, x2) 37 | w_att_1 = tf.nn.softmax(att, dim=1) 38 | w_att_2 = tf.nn.softmax(att, dim=2) 39 | x2_att = tf.einsum("abd,abc->acd", x1, w_att_1) 40 | x1_att = tf.einsum("abd,acb->acd", x2, w_att_2) 41 | return x1_att, x2_att 42 | 43 | 44 | def _esim_semantic_feature_layer(self, emb_seq_left, emb_seq_right, seq_len_left, seq_len_right, granularity="word"): 45 | # for sharing embedding with other sub-graph 46 | # #### embed 47 | # emb_matrix = self._get_embedding_matrix(granularity) 48 | # emb_seq_left = tf.nn.embedding_lookup(emb_matrix, seq_input_left) 49 | # emb_seq_right = tf.nn.embedding_lookup(emb_matrix, seq_input_right) 50 | # 51 | # #### dropout 52 | # random_seed = np.random.randint(10000000) 53 | # emb_seq_left = word_dropout(emb_seq_left, 54 | # training=self.training, 55 | # dropout=self.params["embedding_dropout"], 56 | # seed=random_seed) 57 | # random_seed = np.random.randint(10000000) 58 | # emb_seq_right = word_dropout(emb_seq_right, 59 | # training=self.training, 60 | # dropout=self.params["embedding_dropout"], 61 | # seed=random_seed) 62 | 63 | #### encode 64 | input_dim = self.params["embedding_dim"] 65 | enc_seq_left = encode(emb_seq_left, method=self.params["encode_method"], 66 | input_dim=input_dim, 67 | params=self.params, 68 | sequence_length=seq_len_left, 69 | mask_zero=self.params["embedding_mask_zero"], 70 | scope_name=self.model_name + "esim_enc_seq_%s" % granularity, reuse=False, 71 | training=self.training) 72 | enc_seq_right = encode(emb_seq_right, method=self.params["encode_method"], 73 | input_dim=input_dim, 74 | params=self.params, 75 | sequence_length=seq_len_right, 76 | mask_zero=self.params["embedding_mask_zero"], 77 | scope_name=self.model_name + "esim_enc_seq_%s" % granularity, reuse=True, 78 | 
training=self.training) 79 | 80 | #### align 81 | ali_seq_left, ali_seq_right = self._soft_attention_alignment(enc_seq_left, enc_seq_right) 82 | 83 | #### compose 84 | com_seq_left = tf.concat([ 85 | enc_seq_left, 86 | ali_seq_left, 87 | enc_seq_left * ali_seq_left, 88 | enc_seq_left - ali_seq_left, 89 | ], axis=-1) 90 | com_seq_right = tf.concat([ 91 | enc_seq_right, 92 | ali_seq_right, 93 | enc_seq_right * ali_seq_right, 94 | enc_seq_right - ali_seq_right, 95 | ], axis=-1) 96 | 97 | input_dim = self.params["encode_dim"] * 4 98 | compare_seq_left = encode(com_seq_left, method=self.params["encode_method"], 99 | input_dim=input_dim, 100 | params=self.params, 101 | sequence_length=seq_len_left, 102 | mask_zero=self.params["embedding_mask_zero"], 103 | scope_name=self.model_name + "compare_seq_%s" % granularity, reuse=False, 104 | training=self.training) 105 | compare_seq_right = encode(com_seq_right, method=self.params["encode_method"], 106 | input_dim=input_dim, 107 | params=self.params, 108 | sequence_length=seq_len_right, 109 | mask_zero=self.params["embedding_mask_zero"], 110 | scope_name=self.model_name + "compare_seq_%s" % granularity, reuse=True, 111 | training=self.training) 112 | 113 | #### attend 114 | feature_dim = self.params["encode_dim"] 115 | att_seq_left = attend(compare_seq_left, context=None, 116 | encode_dim=self.params["encode_dim"], 117 | feature_dim=feature_dim, 118 | attention_dim=self.params["attention_dim"], 119 | method=self.params["attend_method"], 120 | scope_name=self.model_name + "agg_seq_%s" % granularity, 121 | reuse=False, num_heads=self.params["attention_num_heads"]) 122 | att_seq_right = attend(compare_seq_right, context=None, 123 | encode_dim=self.params["encode_dim"], 124 | feature_dim=feature_dim, 125 | attention_dim=self.params["attention_dim"], 126 | method=self.params["attend_method"], 127 | scope_name=self.model_name + "agg_seq_%s" % granularity, 128 | reuse=True, num_heads=self.params["attention_num_heads"]) 129 | return tf.concat([att_seq_left, att_seq_right], axis=-1) 130 | 131 | 132 | def _get_matching_features(self): 133 | with tf.name_scope(self.model_name): 134 | tf.set_random_seed(self.params["random_seed"]) 135 | 136 | with tf.name_scope("word_network"): 137 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 138 | self._semantic_feature_layer( 139 | self.seq_word_left, 140 | self.seq_len_word_left, 141 | granularity="word", reuse=False) 142 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 143 | self._semantic_feature_layer( 144 | self.seq_word_right, 145 | self.seq_len_word_right, 146 | granularity="word", reuse=True) 147 | sim_word = self._esim_semantic_feature_layer( 148 | emb_seq_word_left, 149 | emb_seq_word_right, 150 | self.seq_len_word_left, 151 | self.seq_len_word_right, 152 | granularity="word") 153 | 154 | with tf.name_scope("char_network"): 155 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 156 | self._semantic_feature_layer( 157 | self.seq_char_left, 158 | self.seq_len_char_left, 159 | granularity="char", reuse=False) 160 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 161 | self._semantic_feature_layer( 162 | self.seq_char_right, 163 | self.seq_len_char_right, 164 | granularity="char", reuse=True) 165 | sim_char = self._esim_semantic_feature_layer( 166 | emb_seq_char_left, 167 | emb_seq_char_right, 168 | self.seq_len_char_left, 169 | self.seq_len_char_right, 170 | granularity="char") 171 | 172 
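# Summary of the feature pipeline implemented by _esim_semantic_feature_layer above
# (descriptive note; sim_word / sim_char are the outputs of that method):
#   1. encode  : shared encoder over the left/right embeddings (reuse=True on the right)
#   2. align   : att[b, i, j] = <enc_left[b, i], enc_right[b, j]>, softmax over the
#                opposite axis, weighted sums give ali_seq_left / ali_seq_right
#   3. compose : concat [enc, ali, enc * ali, enc - ali] per side (4 * encode_dim)
#   4. compare : a second shared encoder over the composed sequences
#   5. attend  : pooling/attention per side, then concat of both sides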
| with tf.name_scope("matching_features"): 173 | matching_features_word = sim_word 174 | matching_features_char = sim_char 175 | 176 | return matching_features_word, matching_features_char 177 | 178 | 179 | class ESIM(ESIMDecAttBaseModel): 180 | def __init__(self, params, logger, init_embedding_matrix=None): 181 | p = copy(params) 182 | # model config 183 | p.update({ 184 | "model_name": p["model_name"] + "esim", 185 | "encode_method": "textbirnn", 186 | "attend_method": ["ave", "max", "min", "self-attention"], 187 | 188 | # rnn 189 | "rnn_num_units": 32, 190 | "rnn_cell_type": "gru", 191 | "rnn_num_layers": 1, 192 | 193 | # fc block 194 | "fc_type": "fc", 195 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 196 | "fc_dropouts": [0, 0, 0], 197 | }) 198 | super(ESIMDecAttBaseModel, self).__init__(p, logger, init_embedding_matrix) 199 | -------------------------------------------------------------------------------- /src/models/match_pyramid.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from inputs.dynamic_pooling import dynamic_pooling_index 6 | from models.base_model import BaseModel 7 | 8 | 9 | class MatchPyramidBaseModel(BaseModel): 10 | def __init__(self, params, logger, init_embedding_matrix=None): 11 | super(MatchPyramidBaseModel, self).__init__(params, logger, init_embedding_matrix) 12 | 13 | 14 | def _init_tf_vars(self): 15 | super(MatchPyramidBaseModel, self)._init_tf_vars() 16 | self.dpool_index_word = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_word"], 17 | self.params["max_seq_len_word"], 3], 18 | name="dpool_index_word") 19 | self.dpool_index_char = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_char"], 20 | self.params["max_seq_len_char"], 3], 21 | name="dpool_index_char") 22 | 23 | 24 | def _get_match_matrix(self, seq_left, emb_seq_left, enc_seq_left, seq_right, emb_seq_right, enc_seq_right, 25 | granularity="word"): 26 | # 1. word embedding 27 | # 1.1 dot product: [batchsize, s1, s2, 1] 28 | match_matrix_dot_product = tf.expand_dims( 29 | tf.einsum("abd,acd->abc", emb_seq_left, emb_seq_right), axis=-1) 30 | # 1.2 identity: [batchsize, s1, s2, 1] 31 | match_matrix_identity = tf.expand_dims(tf.cast( 32 | tf.equal( 33 | tf.expand_dims(seq_left, 2), 34 | tf.expand_dims(seq_right, 1) 35 | ), tf.float32), axis=-1) 36 | 37 | # 2. 
compressed word embedding 38 | eW = tf.get_variable("eW_%s" % (self.model_name + granularity), 39 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.2, dtype=tf.float32), 40 | dtype=tf.float32, 41 | shape=[self.params["embedding_dim_%s" % granularity], 42 | self.params["embedding_dim_compressed"]]) 43 | emb_seq_com_left = tf.einsum("abd,dc->abc", emb_seq_left, eW) 44 | emb_seq_com_right = tf.einsum("abd,dc->abc", emb_seq_right, eW) 45 | # 2.1 dot product: [batchsize, s1, s2, 1] 46 | match_matrix_dot_product_com = tf.expand_dims( 47 | tf.einsum("abd,acd->abc", emb_seq_com_left, emb_seq_com_right), axis=-1) 48 | # 2.2 element product: [batchsize, s1, s2, d] 49 | match_matrix_element_product_com = tf.expand_dims(emb_seq_com_left, 2) * tf.expand_dims( 50 | emb_seq_com_right, 1) 51 | # 2.3 element concat: [batchsize, s1, s2, 2*d] 52 | match_matrix_element_concat_com = tf.concat([ 53 | tf.tile(tf.expand_dims(emb_seq_com_left, 2), [1, 1, self.params["max_seq_len_%s" % granularity], 1]), 54 | tf.tile(tf.expand_dims(emb_seq_com_right, 1), [1, self.params["max_seq_len_%s" % granularity], 1, 1]), 55 | ], axis=-1) 56 | 57 | # 3. contextual word embedding 58 | # 3.1 dot product: [batchsize, s1, s2, 1] 59 | match_matrix_dot_product_ctx = tf.expand_dims( 60 | tf.einsum("abd,acd->abc", enc_seq_left, enc_seq_right), axis=-1) 61 | # 2.2 element product: [batchsize, s1, s2, d] 62 | match_matrix_element_product_ctx = tf.expand_dims(enc_seq_left, 2) * tf.expand_dims( 63 | enc_seq_right, 1) 64 | # 2.3 element concat: [batchsize, s1, s2, 2*d] 65 | match_matrix_element_concat_ctx = tf.concat([ 66 | tf.tile(tf.expand_dims(enc_seq_left, 2), [1, 1, self.params["max_seq_len_%s" % granularity], 1]), 67 | tf.tile(tf.expand_dims(enc_seq_right, 1), [1, self.params["max_seq_len_%s" % granularity], 1, 1]), 68 | ], axis=-1) 69 | 70 | match_matrix = tf.concat([ 71 | match_matrix_dot_product, 72 | match_matrix_identity, 73 | match_matrix_dot_product_com, 74 | match_matrix_element_product_com, 75 | match_matrix_element_concat_com, 76 | match_matrix_dot_product_ctx, 77 | match_matrix_element_product_ctx, 78 | match_matrix_element_concat_ctx, 79 | ], axis=-1) 80 | return match_matrix 81 | 82 | 83 | def _mp_cnn_layer(self, cross, dpool_index, filters, kernel_size, pool_size, strides, name): 84 | cross_conv = tf.layers.conv2d( 85 | inputs=cross, 86 | filters=filters, 87 | kernel_size=kernel_size, 88 | padding="same", 89 | activation=self.params["mp_activation"], 90 | strides=1, 91 | reuse=False, 92 | name=name+"cross_conv") 93 | if self.params["mp_dynamic_pooling"] and dpool_index is not None: 94 | cross_conv = tf.gather_nd(cross_conv, dpool_index) 95 | cross_pool = tf.layers.max_pooling2d( 96 | inputs=cross_conv, 97 | pool_size=pool_size, 98 | strides=strides, 99 | padding="valid", 100 | name=name+"cross_pool") 101 | return cross_pool 102 | 103 | 104 | def _mp_semantic_feature_layer(self, match_matrix, dpool_index, granularity="word"): 105 | 106 | # conv-pool layer 1 107 | filters = self.params["mp_num_filters"][0] 108 | kernel_size = self.params["mp_filter_sizes"][0] 109 | seq_len = self.params["max_seq_len_%s" % granularity] 110 | pool_size0 = self.params["mp_pool_sizes_%s" % granularity][0] 111 | pool_sizes = [seq_len / pool_size0, seq_len / pool_size0] 112 | strides = [seq_len / pool_size0, seq_len / pool_size0] 113 | conv1 = self._mp_cnn_layer(match_matrix, dpool_index, filters, kernel_size, pool_sizes, strides, name=self.model_name+granularity+"1") 114 | conv1_flatten = tf.reshape(conv1, [-1, 
self.params["mp_num_filters"][0] * (pool_size0 * pool_size0)]) 115 | 116 | # conv-pool layer 2 117 | filters = self.params["mp_num_filters"][1] 118 | kernel_size = self.params["mp_filter_sizes"][1] 119 | pool_size1 = self.params["mp_pool_sizes_%s" % granularity][1] 120 | pool_sizes = [pool_size0 / pool_size1, pool_size0 / pool_size1] 121 | strides = [pool_size0 / pool_size1, pool_size0 / pool_size1] 122 | conv2 = self._mp_cnn_layer(conv1, None, filters, kernel_size, pool_sizes, strides, name=self.model_name + granularity + "2") 123 | conv2_flatten = tf.reshape(conv2, [-1, self.params["mp_num_filters"][1] * (pool_size1 * pool_size1)]) 124 | 125 | # cross = tf.concat([conv1_flatten, conv2_flatten], axis=-1) 126 | 127 | return conv2_flatten 128 | 129 | 130 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 131 | feed_dict = super(MatchPyramidBaseModel, self)._get_feed_dict(X, idx, Q, construct_neg, training, symmetric) 132 | if self.params["mp_dynamic_pooling"]: 133 | dpool_index_word = dynamic_pooling_index(feed_dict[self.seq_len_word_left], 134 | feed_dict[self.seq_len_word_right], 135 | self.params["max_seq_len_word"], 136 | self.params["max_seq_len_word"]) 137 | dpool_index_char = dynamic_pooling_index(feed_dict[self.seq_len_char_left], 138 | feed_dict[self.seq_len_char_right], 139 | self.params["max_seq_len_char"], 140 | self.params["max_seq_len_char"]) 141 | feed_dict.update({ 142 | self.dpool_index_word: dpool_index_word, 143 | self.dpool_index_char: dpool_index_char, 144 | }) 145 | return feed_dict 146 | 147 | 148 | class MatchPyramid(MatchPyramidBaseModel): 149 | def __init__(self, params, logger, init_embedding_matrix=None): 150 | p = copy(params) 151 | p["model_name"] = p["model_name"] + "match_pyramid" 152 | super(MatchPyramid, self).__init__(p, logger, init_embedding_matrix) 153 | 154 | 155 | def _get_matching_features(self): 156 | with tf.name_scope(self.model_name): 157 | tf.set_random_seed(self.params["random_seed"]) 158 | 159 | with tf.name_scope("word_network"): 160 | if self.params["attend_method"] == "context-attention": 161 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 162 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 163 | self._interaction_semantic_feature_layer( 164 | self.seq_word_left, 165 | self.seq_word_right, 166 | self.seq_len_word_left, 167 | self.seq_len_word_right, 168 | granularity="word") 169 | else: 170 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 171 | self._semantic_feature_layer( 172 | self.seq_word_left, 173 | self.seq_len_word_left, 174 | granularity="word", reuse=False) 175 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 176 | self._semantic_feature_layer( 177 | self.seq_word_right, 178 | self.seq_len_word_right, 179 | granularity="word", reuse=True) 180 | match_matrix_word = tf.einsum("abd,acd->abc", emb_seq_word_left, emb_seq_word_right) 181 | match_matrix_word = tf.expand_dims(match_matrix_word, axis=-1) 182 | sim_word = self._mp_semantic_feature_layer(match_matrix_word, self.dpool_index_word, 183 | granularity="word") 184 | 185 | with tf.name_scope("char_network"): 186 | if self.params["attend_method"] == "context-attention": 187 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 188 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 189 | self._interaction_semantic_feature_layer( 190 | 
self.seq_char_left, 191 | self.seq_char_right, 192 | self.seq_len_char_left, 193 | self.seq_len_char_right, 194 | granularity="char") 195 | else: 196 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 197 | self._semantic_feature_layer( 198 | self.seq_char_left, 199 | self.seq_len_char_left, 200 | granularity="char", reuse=False) 201 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 202 | self._semantic_feature_layer( 203 | self.seq_char_right, 204 | self.seq_len_char_right, 205 | granularity="char", reuse=True) 206 | match_matrix_char = tf.einsum("abd,acd->abc", emb_seq_char_left, emb_seq_char_right) 207 | match_matrix_char = tf.expand_dims(match_matrix_char, axis=-1) 208 | sim_char = self._mp_semantic_feature_layer(match_matrix_char, self.dpool_index_char, 209 | granularity="char") 210 | with tf.name_scope("matching_features"): 211 | matching_features_word = sim_word 212 | matching_features_char = sim_char 213 | 214 | return matching_features_word, matching_features_char 215 | 216 | 217 | class GMatchPyramid(MatchPyramidBaseModel): 218 | def __init__(self, params, logger, init_embedding_matrix=None): 219 | p = copy(params) 220 | # model config 221 | p.update({ 222 | "model_name": p["model_name"] + "g_match_pyramid", 223 | "encode_method": "textcnn", 224 | "attend_method": ["ave", "max", "min", "self-attention"], 225 | 226 | # cnn 227 | "cnn_num_layers": 1, 228 | "cnn_num_filters": 32, 229 | "cnn_filter_sizes": [1, 2, 3], 230 | "cnn_timedistributed": False, 231 | "cnn_activation": tf.nn.relu, 232 | "cnn_gated_conv": True, 233 | "cnn_residual": True, 234 | 235 | # fc block 236 | "fc_type": "fc", 237 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 238 | "fc_dropouts": [0, 0, 0], 239 | }) 240 | super(GMatchPyramid, self).__init__(p, logger, init_embedding_matrix) 241 | 242 | 243 | def _get_matching_features(self): 244 | with tf.name_scope(self.model_name): 245 | tf.set_random_seed(self.params["random_seed"]) 246 | 247 | with tf.name_scope("word_network"): 248 | if self.params["attend_method"] == "context-attention": 249 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 250 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 251 | self._interaction_semantic_feature_layer( 252 | self.seq_word_left, 253 | self.seq_word_right, 254 | self.seq_len_word_left, 255 | self.seq_len_word_right, 256 | granularity="word") 257 | else: 258 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 259 | self._semantic_feature_layer( 260 | self.seq_word_left, 261 | self.seq_len_word_left, 262 | granularity="word", reuse=False) 263 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 264 | self._semantic_feature_layer( 265 | self.seq_word_right, 266 | self.seq_len_word_right, 267 | granularity="word", reuse=True) 268 | 269 | match_matrix_word = self._get_match_matrix(self.seq_word_left, emb_seq_word_left, enc_seq_word_left, 270 | self.seq_word_right, emb_seq_word_right, enc_seq_word_right, 271 | granularity="word") 272 | sim_word = self._mp_semantic_feature_layer(match_matrix_word, self.dpool_index_word, granularity="word") 273 | 274 | with tf.name_scope("char_network"): 275 | if self.params["attend_method"] == "context-attention": 276 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 277 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 278 | 
self._interaction_semantic_feature_layer( 279 | self.seq_char_left, 280 | self.seq_char_right, 281 | self.seq_len_char_left, 282 | self.seq_len_char_right, 283 | granularity="char") 284 | else: 285 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 286 | self._semantic_feature_layer( 287 | self.seq_char_left, 288 | self.seq_len_char_left, 289 | granularity="char", reuse=False) 290 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 291 | self._semantic_feature_layer( 292 | self.seq_char_right, 293 | self.seq_len_char_right, 294 | granularity="char", reuse=True) 295 | 296 | match_matrix_char = self._get_match_matrix(self.seq_char_left, emb_seq_char_left, enc_seq_char_left, 297 | self.seq_char_right, emb_seq_char_right, enc_seq_char_right, 298 | granularity="char") 299 | sim_char = self._mp_semantic_feature_layer(match_matrix_char, self.dpool_index_char, 300 | granularity="char") 301 | 302 | with tf.name_scope("matching_features"): 303 | matching_features_word = sim_word 304 | matching_features_char = sim_char 305 | 306 | return matching_features_word, matching_features_char 307 | -------------------------------------------------------------------------------- /src/models/model_library.py: -------------------------------------------------------------------------------- 1 | 2 | from models.bcnn import BCNN, ABCNN1, ABCNN2, ABCNN3 3 | from models.decatt import DecAtt 4 | from models.dssm import DSSM, CDSSM, RDSSM 5 | from models.dsmm import DSMM 6 | from models.esim import ESIM 7 | from models.match_pyramid import MatchPyramid, GMatchPyramid 8 | 9 | 10 | def get_model(model_type): 11 | if model_type == "dssm": 12 | return DSSM 13 | elif model_type == "cdssm": 14 | return CDSSM 15 | elif model_type == "rdssm": 16 | return RDSSM 17 | elif model_type == "match_pyramid": 18 | return MatchPyramid 19 | elif model_type == "g_match_pyramid": 20 | return GMatchPyramid 21 | elif model_type == "dsmm": 22 | return DSMM 23 | elif model_type == "bcnn": 24 | return BCNN 25 | elif model_type == "abcnn1": 26 | return ABCNN1 27 | elif model_type == "abcnn2": 28 | return ABCNN2 29 | elif model_type == "abcnn3": 30 | return ABCNN3 31 | elif model_type == "esim": 32 | return ESIM 33 | elif model_type == "decatt": 34 | return DecAtt 35 | else: 36 | return DSMM 37 | -------------------------------------------------------------------------------- /src/tf_common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/tf_common/__init__.py -------------------------------------------------------------------------------- /src/tf_common/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | 5 | def cosine_similarity(v1, v2, aggregation=True): 6 | v1_n = tf.nn.l2_normalize(v1, dim=1) 7 | v2_n = tf.nn.l2_normalize(v2, dim=1) 8 | if aggregation: 9 | s = tf.reduce_sum(v1_n * v2_n, axis=1, keep_dims=True) 10 | else: 11 | s = v1_n * v2_n 12 | return s 13 | 14 | 15 | def dot_product(v1, v2, aggregation=True): 16 | if aggregation: 17 | s = tf.reduce_sum(v1 * v2, axis=1, keep_dims=True) 18 | else: 19 | s = v1 * v2 20 | return s 21 | 22 | 23 | def euclidean_distance(v1, v2, aggregation=True): 24 | if aggregation: 25 | s = tf.sqrt(tf.reduce_sum(tf.square(v1 - v2), axis=1, keep_dims=True)) 26 | else: 27 | s = tf.abs(v1 - v2) 28 | return s 29 | 30 | 31 
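# Hypothetical usage sketch (not part of the original module): the metrics above are
# typically concatenated into a small match-score feature vector, as the DSSM/DSMM
# models do with params["similarity_aggregation"]. Assumes the module-level
# `import tensorflow as tf` and TF 1.x graph mode.
def _match_score_features_sketch(v1, v2, aggregation=True):
    # v1, v2: [batch, dim] semantic vectors for the left/right sequences.
    # With aggregation=True each metric yields shape [batch, 1], so the
    # concatenation below is a [batch, 3] feature tensor.
    return tf.concat([
        cosine_similarity(v1, v2, aggregation),
        dot_product(v1, v2, aggregation),
        euclidean_distance(v1, v2, aggregation),
    ], axis=-1)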
| def euclidean_score(v1, v2, aggregation=True): 32 | s = euclidean_distance(v1, v2, aggregation) 33 | return 1. / (1. + s) 34 | 35 | 36 | def canberra_score(v1, v2, aggregation=True): 37 | if aggregation: 38 | s = tf.reduce_sum(tf.abs(v1 - v2) / (v1 + v2), axis=1, keep_dims=True) 39 | else: 40 | s = tf.abs(v1 - v2) / (v1 + v2) 41 | return s -------------------------------------------------------------------------------- /src/tf_common/nadam.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | from tensorflow.python.eager import context 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.ops import array_ops 6 | from tensorflow.python.ops import control_flow_ops 7 | from tensorflow.python.ops import math_ops 8 | from tensorflow.python.ops import resource_variable_ops 9 | from tensorflow.python.ops import state_ops 10 | from tensorflow.python.ops import variable_scope 11 | from tensorflow.python.training import optimizer 12 | from tensorflow.python.training import training_ops 13 | 14 | 15 | class NadamOptimizer(optimizer.Optimizer): 16 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 17 | schedule_decay=0.004, use_locking=False, name="Nadam"): 18 | super(NadamOptimizer, self).__init__(use_locking, name) 19 | self._lr = learning_rate 20 | self._beta1 = beta1 21 | self._beta2 = beta2 22 | self._epsilon = epsilon 23 | self._schedule_decay = schedule_decay 24 | # momentum cache decay 25 | self._momentum_cache_decay = tf.cast(0.96, tf.float32) 26 | self._momentum_cache_const = tf.pow(self._momentum_cache_decay, 1. * schedule_decay) 27 | 28 | # Tensor versions of the constructor arguments, created in _prepare(). 29 | self._lr_t = None 30 | self._beta1_t = None 31 | self._beta2_t = None 32 | self._epsilon_t = None 33 | self._schedule_decay_t = None 34 | 35 | # Variables to accumulate the powers of the beta parameters. 36 | # Created in _create_slots when we know the variables to optimize. 37 | self._beta1_power = None 38 | self._beta2_power = None 39 | self._iterations = None 40 | self._m_schedule = None 41 | 42 | # Created in SparseApply if needed. 43 | self._updated_lr = None 44 | 45 | 46 | def _prepare(self): 47 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 48 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 49 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 50 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 51 | self._schedule_decay_t = ops.convert_to_tensor(self._schedule_decay, name="schedule_decay") 52 | 53 | def _create_slots(self, var_list): 54 | # Create the beta1 and beta2 accumulators on the same device as the first 55 | # variable. Sort the var_list to make sure this device is consistent across 56 | # workers (these need to go on the same PS, otherwise some updates are 57 | # silently ignored). 
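# Bookkeeping created below (mirrors the Keras Nadam referenced in the comments of this file):
#   beta1_power, beta2_power : running products of beta1 / beta2 for bias correction
#   iterations               : step counter t (starts at 0, incremented in _finish)
#   m_schedule               : running product of the warming momentum coefficients
#                              mu_t = beta1 * (1 - 0.5 * 0.96**(t * schedule_decay)),
#                              used by the sparse update in _apply_sparse
#   m, v (one slot per var)  : first / second moment estimates
# The dense path delegates to training_ops.apply_adam with use_nesterov=True and
# does not consume m_schedule.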
58 | first_var = min(var_list, key=lambda x: x.name) 59 | 60 | create_new = self._iterations is None 61 | if not create_new and context.in_graph_mode(): 62 | create_new = (self._iterations.graph is not first_var.graph) 63 | 64 | if create_new: 65 | with ops.colocate_with(first_var): 66 | self._beta1_power = variable_scope.variable(self._beta1, 67 | name="beta1_power", 68 | trainable=False) 69 | self._beta2_power = variable_scope.variable(self._beta2, 70 | name="beta2_power", 71 | trainable=False) 72 | self._iterations = variable_scope.variable(0., 73 | name="iterations", 74 | trainable=False) 75 | self._m_schedule = variable_scope.variable(1., 76 | name="m_schedule", 77 | trainable=False) 78 | # Create slots for the first and second moments. 79 | for v in var_list: 80 | self._zeros_slot(v, "m", self._name) 81 | self._zeros_slot(v, "v", self._name) 82 | 83 | def _get_momentum_cache(self, schedule_decay_t, t): 84 | return tf.pow(self._momentum_cache_decay, t * schedule_decay_t) 85 | # return beta1_t * (1. - 0.5 * (tf.pow(self._momentum_cache_decay, t * schedule_decay_t))) 86 | 87 | 88 | """very slow 89 | we simply use the nadam update rule without warming momentum schedule 90 | def _apply_dense(self, grad, var): 91 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 92 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 93 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 94 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 95 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 96 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 97 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 98 | 99 | # Due to the recommendations in [2], i.e. warming momentum schedule 100 | # see keras Nadam 101 | momentum_cache_t = self._get_momentum_cache(beta1_t, schedule_decay_t, t) 102 | momentum_cache_t_1 = self._get_momentum_cache(beta1_t, schedule_decay_t, t+1.) 103 | m_schedule_new = m_schedule * momentum_cache_t 104 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 105 | 106 | # the following equations given in [1] 107 | # m_t = beta1 * m + (1 - beta1) * g_t 108 | m = self.get_slot(var, "m") 109 | m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking) 110 | g_prime = grad / (1. - m_schedule_new) 111 | m_t_prime = m_t / (1. - m_schedule_next) 112 | m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime 113 | 114 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 115 | v = self.get_slot(var, "v") 116 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 117 | v_t_prime = v_t / (1. 
- tf.pow(beta2_t, t)) 118 | 119 | var_update = state_ops.assign_sub(var, 120 | lr_t * m_t_bar / (tf.sqrt(v_t_prime) + epsilon_t), 121 | use_locking=self._use_locking) 122 | 123 | return control_flow_ops.group(*[var_update, m_t, v_t]) 124 | """ 125 | # nadam update rule without warming momentum schedule 126 | def _apply_dense(self, grad, var): 127 | m = self.get_slot(var, "m") 128 | v = self.get_slot(var, "v") 129 | return training_ops.apply_adam( 130 | var, 131 | m, 132 | v, 133 | math_ops.cast(self._beta1_power, var.dtype.base_dtype), 134 | math_ops.cast(self._beta2_power, var.dtype.base_dtype), 135 | math_ops.cast(self._lr_t, var.dtype.base_dtype), 136 | math_ops.cast(self._beta1_t, var.dtype.base_dtype), 137 | math_ops.cast(self._beta2_t, var.dtype.base_dtype), 138 | math_ops.cast(self._epsilon_t, var.dtype.base_dtype), 139 | grad, 140 | use_locking=self._use_locking, 141 | use_nesterov=True).op 142 | 143 | def _resource_apply_dense(self, grad, var): 144 | m = self.get_slot(var, "m") 145 | v = self.get_slot(var, "v") 146 | return training_ops.resource_apply_adam( 147 | var.handle, 148 | m.handle, 149 | v.handle, 150 | math_ops.cast(self._beta1_power, grad.dtype.base_dtype), 151 | math_ops.cast(self._beta2_power, grad.dtype.base_dtype), 152 | math_ops.cast(self._lr_t, grad.dtype.base_dtype), 153 | math_ops.cast(self._beta1_t, grad.dtype.base_dtype), 154 | math_ops.cast(self._beta2_t, grad.dtype.base_dtype), 155 | math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), 156 | grad, 157 | use_locking=self._use_locking, 158 | use_nesterov=True) 159 | 160 | # keras Nadam update rule 161 | def _apply_sparse(self, grad, var): 162 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 163 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 164 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 165 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 166 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 167 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 168 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 169 | 170 | # Due to the recommendations in [2], i.e. warming momentum schedule 171 | momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t) 172 | momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power) 173 | momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const) 174 | m_schedule_new = m_schedule * momentum_cache_t 175 | m_schedule_next = m_schedule_new * momentum_cache_t_1 176 | 177 | # the following equations given in [1] 178 | # m_t = beta1 * m + (1 - beta1) * g_t 179 | m = self.get_slot(var, "m") 180 | m_t = state_ops.scatter_update(m, grad.indices, 181 | beta1_t * array_ops.gather(m, grad.indices) + 182 | (1. - beta1_t) * grad.values, 183 | use_locking=self._use_locking) 184 | g_prime_slice = grad.values / (1. - m_schedule_new) 185 | m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next) 186 | m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice 187 | 188 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 189 | v = self.get_slot(var, "v") 190 | v_t = state_ops.scatter_update(v, grad.indices, 191 | beta2_t * array_ops.gather(v, grad.indices) + 192 | (1. - beta2_t) * tf.square(grad.values), 193 | use_locking=self._use_locking) 194 | v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. 
- tf.pow(beta2_t, t)) 195 | 196 | var_update = state_ops.scatter_sub(var, grad.indices, 197 | lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 198 | use_locking=self._use_locking) 199 | 200 | return control_flow_ops.group(*[var_update, m_t, v_t]) 201 | 202 | def _finish(self, update_ops, name_scope): 203 | # Update the power accumulators. 204 | with ops.control_dependencies(update_ops): 205 | with ops.colocate_with(self._iterations): 206 | update_beta1 = self._beta1_power.assign( 207 | self._beta1_power * self._beta1_t, 208 | use_locking=self._use_locking) 209 | update_beta2 = self._beta2_power.assign( 210 | self._beta2_power * self._beta2_t, 211 | use_locking=self._use_locking) 212 | t = self._iterations + 1. 213 | update_iterations = self._iterations.assign(t, use_locking=self._use_locking) 214 | momentum_cache_power = self._get_momentum_cache(self._schedule_decay_t, t) 215 | momentum_cache_t = self._beta1_t * (1. - 0.5 * momentum_cache_power) 216 | update_m_schedule = self._m_schedule.assign( 217 | self._m_schedule * momentum_cache_t, 218 | use_locking=self._use_locking) 219 | return control_flow_ops.group( 220 | *update_ops + [update_beta1, update_beta2] + [update_iterations, update_m_schedule], 221 | name=name_scope) -------------------------------------------------------------------------------- /src/tf_common/nn_module.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | """ 6 | https://explosion.ai/blog/deep-learning-formula-nlp 7 | embed -> encode -> attend -> predict 8 | """ 9 | def batch_normalization(x, training, name): 10 | # with tf.variable_scope(name, reuse=) 11 | bn_train = tf.layers.batch_normalization(x, training=True, reuse=None, name=name) 12 | bn_inference = tf.layers.batch_normalization(x, training=False, reuse=True, name=name) 13 | z = tf.cond(training, lambda: bn_train, lambda: bn_inference) 14 | return z 15 | 16 | 17 | #### Step 1 18 | def embed(x, size, dim, seed=0, flatten=False, reduce_sum=False): 19 | # std = np.sqrt(2 / dim) 20 | std = 0.001 21 | minval = -std 22 | maxval = std 23 | emb = tf.Variable(tf.random_uniform([size, dim], minval, maxval, dtype=tf.float32, seed=seed)) 24 | # None * max_seq_len * embed_dim 25 | out = tf.nn.embedding_lookup(emb, x) 26 | if flatten: 27 | out = tf.layers.flatten(out) 28 | if reduce_sum: 29 | out = tf.reduce_sum(out, axis=1) 30 | return out 31 | 32 | 33 | def embed_subword(x, size, dim, sequence_length, seed=0, mask_zero=False, maxlen=None): 34 | # std = np.sqrt(2 / dim) 35 | std = 0.001 36 | minval = -std 37 | maxval = std 38 | emb = tf.Variable(tf.random_uniform([size, dim], minval, maxval, dtype=tf.float32, seed=seed)) 39 | # None * max_seq_len * max_word_len * embed_dim 40 | out = tf.nn.embedding_lookup(emb, x) 41 | if mask_zero: 42 | # word_len: None * max_seq_len 43 | # mask: shape=None * max_seq_len * max_word_len 44 | mask = tf.sequence_mask(sequence_length, maxlen) 45 | mask = tf.expand_dims(mask, axis=-1) 46 | mask = tf.cast(mask, tf.float32) 47 | out = out * mask 48 | # None * max_seq_len * embed_dim 49 | # according to facebook subword paper, it's sum 50 | out = tf.reduce_sum(out, axis=2) 51 | return out 52 | 53 | 54 | def word_dropout(x, training, dropout=0, seed=0): 55 | # word dropout (dropout the entire embedding for some words) 56 | """ 57 | tf.layers.Dropout doesn't work as it can't switch training or inference 58 | """ 59 | if dropout > 0: 60 | input_shape = tf.shape(x) 61 | noise_shape = 
[input_shape[0], input_shape[1], 1] 62 | x = tf.layers.Dropout(rate=dropout, noise_shape=noise_shape, seed=seed)(x, training=training) 63 | return x 64 | 65 | 66 | #### Step 2 67 | def fasttext(x): 68 | return x 69 | 70 | 71 | # Language Modeling with Gated Convolutional Networks 72 | # https://github.com/anantzoid/Language-Modeling-GatedCNN 73 | def gated_conv1d_op(inputs, filters=8, kernel_size=3, padding="same", activation=None, strides=1, reuse=False, name=""): 74 | conv_linear = tf.layers.conv1d( 75 | inputs=inputs, 76 | filters=filters, 77 | kernel_size=kernel_size, 78 | padding="same", 79 | activation=None, 80 | strides=strides, 81 | reuse=reuse, 82 | name=name+"_linear") 83 | conv_gated = tf.layers.conv1d( 84 | inputs=inputs, 85 | filters=filters, 86 | kernel_size=kernel_size, 87 | padding="same", 88 | activation=tf.nn.sigmoid, 89 | strides=strides, 90 | reuse=reuse, 91 | name=name+"_gated") 92 | conv = conv_linear * conv_gated 93 | return conv 94 | 95 | 96 | def residual_gated_conv1d_op(inputs, filters=8, kernel_size=3, padding="same", activation=None, strides=1, reuse=False, name=""): 97 | conv_linear = tf.layers.conv1d( 98 | inputs=inputs, 99 | filters=filters, 100 | kernel_size=kernel_size, 101 | padding="same", 102 | activation=None, 103 | strides=strides, 104 | reuse=reuse, 105 | name=name+"_linear") 106 | conv_gated = tf.layers.conv1d( 107 | inputs=inputs, 108 | filters=filters, 109 | kernel_size=kernel_size, 110 | padding="same", 111 | activation=tf.nn.sigmoid, 112 | strides=strides, 113 | reuse=reuse, 114 | name=name+"_gated") 115 | conv = inputs * (1. - conv_gated) + conv_linear * conv_gated 116 | return conv 117 | 118 | 119 | def _textcnn(x, conv_op, num_filters=8, filter_sizes=[2, 3], bn=False, training=False, 120 | timedistributed=False, scope_name="textcnn", reuse=False, activation=tf.nn.relu): 121 | # x: None * step_dim * embed_dim 122 | conv_blocks = [] 123 | for i, filter_size in enumerate(filter_sizes): 124 | scope_name_i = "%s_textcnn_%s"%(str(scope_name), str(filter_size)) 125 | with tf.variable_scope(scope_name_i, reuse=reuse): 126 | if timedistributed: 127 | input_shape = tf.shape(x) 128 | step_dim = input_shape[1] 129 | embed_dim = input_shape[2] 130 | x = tf.transpose(x, [0, 2, 1]) 131 | # None * embed_dim * step_dim 132 | x = tf.reshape(x, [input_shape[0] * embed_dim, step_dim, 1]) 133 | conv = conv_op( 134 | inputs=x, 135 | filters=1, 136 | kernel_size=filter_size, 137 | padding="same", 138 | activation=activation, 139 | strides=1, 140 | reuse=reuse, 141 | name=scope_name_i) 142 | conv = tf.reshape(conv, [input_shape[0], embed_dim, step_dim]) 143 | conv = tf.transpose(conv, [0, 2, 1]) 144 | else: 145 | conv = conv_op( 146 | inputs=x, 147 | filters=num_filters, 148 | kernel_size=filter_size, 149 | padding="same", 150 | activation=activation, 151 | strides=1, 152 | reuse=reuse, 153 | name=scope_name_i) 154 | if bn: 155 | conv = tf.layers.BatchNormalization()(conv, training) 156 | # conv = activation(conv) 157 | conv_blocks.append(conv) 158 | if len(conv_blocks) > 1: 159 | z = tf.concat(conv_blocks, axis=-1) 160 | else: 161 | z = conv_blocks[0] 162 | return z 163 | 164 | 165 | def textcnn(x, num_layers=2, num_filters=8, filter_sizes=[2, 3], bn=False, training=False, 166 | timedistributed=False, scope_name="textcnn", reuse=False, activation=tf.nn.relu, 167 | gated_conv=False, residual=False): 168 | if gated_conv: 169 | if residual: 170 | conv_op = residual_gated_conv1d_op 171 | else: 172 | conv_op = gated_conv1d_op 173 | else: 174 | conv_op = tf.layers.conv1d 
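    # Note on the three conv_op variants selected above:
    #   tf.layers.conv1d         -> plain 1-D convolution, `activation` applied as usual
    #   gated_conv1d_op          -> Gated Linear Unit: conv_linear * sigmoid(conv_gate)
    #   residual_gated_conv1d_op -> highway-style mix: x * (1 - gate) + conv_linear * gate,
    #                               letting a layer fall back to the identity mapping
    # (both gated variants ignore the `activation` argument passed in by _textcnn)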
175 | conv_blocks = [] 176 | for i in range(num_layers): 177 | scope_name_i = "%s_textcnn_layer_%s" % (str(scope_name), str(i)) 178 | x = _textcnn(x, conv_op, num_filters, filter_sizes, bn, training, timedistributed, scope_name_i, reuse, activation) 179 | conv_blocks.append(x) 180 | if len(conv_blocks) > 1: 181 | z = tf.concat(conv_blocks, axis=-1) 182 | else: 183 | z = conv_blocks[0] 184 | return z 185 | 186 | 187 | def textrnn(x, num_units, cell_type, sequence_length, num_layers=1, mask_zero=False, scope_name="textrnn", reuse=False): 188 | for i in range(num_layers): 189 | scope_name_i = "%s_textrnn_%s_%s_%s" % (str(scope_name), cell_type, str(i), str(num_units)) 190 | with tf.variable_scope(scope_name_i, reuse=reuse): 191 | if cell_type == "gru": 192 | cell_fw = tf.nn.rnn_cell.GRUCell(num_units) 193 | elif cell_type == "lstm": 194 | cell_fw = tf.nn.rnn_cell.LSTMCell(num_units) 195 | if mask_zero: 196 | x, _ = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=sequence_length, scope=scope_name_i) 197 | else: 198 | x, _ = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=None, scope=scope_name_i) 199 | return x 200 | 201 | 202 | def textbirnn(x, num_units, cell_type, sequence_length, num_layers=1, mask_zero=False, scope_name="textbirnn", reuse=False): 203 | for i in range(num_layers): 204 | scope_name_i = "%s_textbirnn_%s_%s_%s" % (str(scope_name), cell_type, str(i), str(num_units)) 205 | with tf.variable_scope(scope_name_i, reuse=reuse): 206 | if cell_type == "gru": 207 | cell_fw = tf.nn.rnn_cell.GRUCell(num_units) 208 | cell_bw = tf.nn.rnn_cell.GRUCell(num_units) 209 | elif cell_type == "lstm": 210 | cell_fw = tf.nn.rnn_cell.LSTMCell(num_units) 211 | cell_bw = tf.nn.rnn_cell.LSTMCell(num_units) 212 | if mask_zero: 213 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 214 | cell_fw, cell_bw, x, dtype=tf.float32, sequence_length=sequence_length, scope=scope_name_i) 215 | else: 216 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 217 | cell_fw, cell_bw, x, dtype=tf.float32, sequence_length=None, scope=scope_name_i) 218 | x = tf.concat([output_fw, output_bw], axis=-1) 219 | return x 220 | 221 | 222 | 223 | def encode(x, method, params, input_dim, 224 | sequence_length=None, mask_zero=False, 225 | scope_name="encode", reuse=False, 226 | training=False, seed=0): 227 | """ 228 | :param x: shape=(None,seqlen,dim) 229 | :param params: 230 | :return: shape=(None,seqlen,dim) 231 | """ 232 | out_list = [] 233 | params["encode_dim"] = 0 234 | for m in method.split("+"): 235 | if m == "fasttext": 236 | dim_f = input_dim # params["embedding_dim"] 237 | z = fasttext(x) 238 | out_list.append(z) 239 | params["encode_dim"] += dim_f 240 | elif m == "project": 241 | dim_p = params["project_hidden_units"][-1] 242 | step_dim = tf.shape(x)[1] 243 | z = tf.reshape(x, [-1, input_dim]) 244 | z = mlp_layer(z, fc_type=params["project_type"], 245 | hidden_units=params["project_hidden_units"], 246 | dropouts=params["project_dropouts"], 247 | scope_name=scope_name, 248 | reuse=reuse, 249 | training=training, 250 | seed=params["random_seed"]) 251 | z = tf.reshape(z, [-1, step_dim, params["project_hidden_units"][-1]]) 252 | out_list.append(z) 253 | params["encode_dim"] += dim_p 254 | elif m == "textcnn": 255 | dim_c = params["cnn_num_layers"] * len(params["cnn_filter_sizes"]) * params["cnn_num_filters"] 256 | z = textcnn(x, num_layers=params["cnn_num_layers"], num_filters=params["cnn_num_filters"], filter_sizes=params["cnn_filter_sizes"], 257 | 
timedistributed=params["cnn_timedistributed"], scope_name=scope_name, reuse=reuse)
258 |             out_list.append(z)
259 |             params["encode_dim"] += dim_c
260 |         elif m == "textrnn":
261 |             dim_r = params["rnn_num_units"]
262 |             z = textrnn(x, num_units=params["rnn_num_units"], cell_type=params["rnn_cell_type"], num_layers=params["rnn_num_layers"],
263 |                         sequence_length=sequence_length, mask_zero=mask_zero, scope_name=scope_name, reuse=reuse)
264 |             out_list.append(z)
265 |             params["encode_dim"] += dim_r
266 |         elif m == "textbirnn":
267 |             dim_b = params["rnn_num_units"] * 2
268 |             z = textbirnn(x, num_units=params["rnn_num_units"], cell_type=params["rnn_cell_type"], num_layers=params["rnn_num_layers"],
269 |                           sequence_length=sequence_length, mask_zero=mask_zero, scope_name=scope_name, reuse=reuse)
270 |             out_list.append(z)
271 |             params["encode_dim"] += dim_b
272 |     z = tf.concat(out_list, axis=-1)
273 |     return z
274 | 
275 | 
276 | def scalar_attention(x, encode_dim, feature_dim, attention_dim, sequence_length=None,
277 |                      mask_zero=False, maxlen=None, epsilon=1e-8, seed=0, scope_name="attention", reuse=False):
278 |     """
279 |     :param x: [batchsize, s, feature_dim]
280 |     :param encode_dim: dim of encoder output
281 |     :param feature_dim: dim of x (for self-attention, x is the encoder output;
282 |         for context-attention, x is the concat of encoder output and contextual info)
283 |     :param sequence_length:
284 |     :param mask_zero:
285 |     :param maxlen:
286 |     :param epsilon:
287 |     :param seed:
288 |     :param scope_name:
289 |     :param reuse:
290 |     :return: [batchsize, s, 1]
291 |     """
292 |     with tf.variable_scope(scope_name, reuse=reuse):
293 |         # W1: [feature_dim]
294 |         W1 = tf.get_variable("W1_%s" % scope_name,
295 |                              initializer=tf.truncated_normal_initializer(
296 |                                  mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed),
297 |                              dtype=tf.float32,
298 |                              shape=[feature_dim])
299 |         # b1: [1]
300 |         b1 = tf.get_variable("b1_%s" % scope_name,
301 |                              initializer=tf.truncated_normal_initializer(
302 |                                  mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed),
303 |                              dtype=tf.float32,
304 |                              shape=[1])
305 |         e = tf.einsum("bsf,f->bs", x, W1) + \
306 |             tf.expand_dims(b1, axis=1)
307 |         a = tf.exp(e)
308 | 
309 |         # apply mask after the exp.
will be re-normalized next 310 | if mask_zero: 311 | # None * s 312 | mask = tf.sequence_mask(sequence_length, maxlen) 313 | mask = tf.cast(mask, tf.float32) 314 | a = a * mask 315 | 316 | # in some cases especially in the early stages of training the sum may be almost zero 317 | s = tf.reduce_sum(a, axis=1, keep_dims=True) 318 | a /= tf.cast(s + epsilon, tf.float32) 319 | a = tf.expand_dims(a, axis=-1) 320 | 321 | return a 322 | 323 | 324 | # vector-based attention proposed in the following paper 325 | # Enhancing Sentence Embedding with Generalized Pooling 326 | def vector_attention(x, encode_dim, feature_dim, attention_dim, sequence_length=None, 327 | mask_zero=False, maxlen=None, epsilon=1e-8, seed=0, 328 | scope_name="attention", reuse=False): 329 | """ 330 | :param x: [batchsize, s, feature_dim] 331 | :param encode_dim: dim of encoder output 332 | :param feature_dim: dim of x (for self-attention, x is the encoder output; 333 | for context-attention, x is the concat of encoder output and contextual info) 334 | :param sequence_length: 335 | :param mask_zero: 336 | :param maxlen: 337 | :param epsilon: 338 | :param seed: 339 | :param scope_name: 340 | :param reuse: 341 | :return: [batchsize, s, encode_dim] 342 | """ 343 | with tf.variable_scope(scope_name, reuse=reuse): 344 | # W1: [attention_dim, feature_dim] 345 | W1 = tf.get_variable("W1_%s" % scope_name, 346 | initializer=tf.truncated_normal_initializer( 347 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 348 | dtype=tf.float32, 349 | shape=[attention_dim, feature_dim]) 350 | # b1: [attention_dim] 351 | b1 = tf.get_variable("b1_%s" % scope_name, 352 | initializer=tf.truncated_normal_initializer( 353 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 354 | dtype=tf.float32, 355 | shape=[attention_dim]) 356 | # W2: [encode_dim, attention_dim] 357 | W2 = tf.get_variable("W2_%s" % scope_name, 358 | initializer=tf.truncated_normal_initializer( 359 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 360 | dtype=tf.float32, 361 | shape=[encode_dim, attention_dim]) 362 | # b2: [encode_dim] 363 | b2 = tf.get_variable("b2_%s" % scope_name, 364 | initializer=tf.truncated_normal_initializer( 365 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 366 | dtype=tf.float32, 367 | shape=[encode_dim]) 368 | # [batchsize, attention_dim, s] 369 | e = tf.nn.relu( 370 | tf.einsum("bsf,af->bas", x, W1) + \ 371 | tf.expand_dims(tf.expand_dims(b1, axis=0), axis=-1)) 372 | # [batchsize, s, encode_dim] 373 | e = tf.einsum("bas,ea->bse", e, W2) + \ 374 | tf.expand_dims(tf.expand_dims(b2, axis=0), axis=0) 375 | a = tf.exp(e) 376 | 377 | # apply mask after the exp. 
will be re-normalized next 378 | if mask_zero: 379 | # [batchsize, s, 1] 380 | mask = tf.sequence_mask(sequence_length, maxlen) 381 | mask = tf.expand_dims(tf.cast(mask, tf.float32), axis=-1) 382 | a = a * mask 383 | 384 | # in some cases especially in the early stages of training the sum may be almost zero 385 | s = tf.reduce_sum(a, axis=1, keep_dims=True) 386 | a /= tf.cast(s + epsilon, tf.float32) 387 | 388 | return a 389 | 390 | 391 | def _attend(x, sequence_length=None, method="ave", context=None, encode_dim=None, 392 | feature_dim=None, attention_dim=None, mask_zero=False, maxlen=None, 393 | bn=False, training=False, seed=0, scope_name="attention", reuse=False, 394 | num_heads=1): 395 | if method == "ave": 396 | if mask_zero: 397 | # None * step_dim 398 | mask = tf.sequence_mask(sequence_length, maxlen) 399 | mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1)) 400 | mask = tf.cast(mask, tf.float32) 401 | z = tf.reduce_sum(x * mask, axis=1) 402 | l = tf.reduce_sum(mask, axis=1) 403 | # in some cases especially in the early stages of training the sum may be almost zero 404 | epsilon = 1e-8 405 | z /= tf.cast(l + epsilon, tf.float32) 406 | else: 407 | z = tf.reduce_mean(x, axis=1) 408 | elif method == "sum": 409 | if mask_zero: 410 | # None * step_dim 411 | mask = tf.sequence_mask(sequence_length, maxlen) 412 | mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1)) 413 | mask = tf.cast(mask, tf.float32) 414 | z = tf.reduce_sum(x * mask, axis=1) 415 | else: 416 | z = tf.reduce_sum(x, axis=1) 417 | elif method == "max": 418 | if mask_zero: 419 | # None * step_dim 420 | mask = tf.sequence_mask(sequence_length, maxlen) 421 | mask = tf.expand_dims(mask, axis=-1) 422 | mask = tf.tile(mask, (1, 1, tf.shape(x)[2])) 423 | masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)), 424 | tf.ones_like(x) * -np.inf, x) # if masked assume value is -inf 425 | z = tf.reduce_max(masked_data, axis=1) 426 | else: 427 | z = tf.reduce_max(x, axis=1) 428 | elif method == "min": 429 | if mask_zero: 430 | # None * step_dim 431 | mask = tf.sequence_mask(sequence_length, maxlen) 432 | mask = tf.expand_dims(mask, axis=-1) 433 | mask = tf.tile(mask, (1, 1, tf.shape(x)[2])) 434 | masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)), 435 | tf.ones_like(x) * np.inf, x) # if masked assume value is -inf 436 | z = tf.reduce_min(masked_data, axis=1) 437 | else: 438 | z = tf.reduce_min(x, axis=1) 439 | elif "attention" in method: 440 | if context is not None: 441 | y = tf.concat([x, context], axis=-1) 442 | else: 443 | y = x 444 | zs = [] 445 | for i in range(num_heads): 446 | if "vector" in method: 447 | a = vector_attention(y, encode_dim, feature_dim, attention_dim, sequence_length, mask_zero, maxlen, seed=seed, scope_name=scope_name+str(i), reuse=reuse) 448 | else: 449 | a = scalar_attention(y, encode_dim, feature_dim, attention_dim, sequence_length, mask_zero, maxlen, seed=seed, scope_name=scope_name+str(i), reuse=reuse) 450 | zs.append(tf.reduce_sum(x * a, axis=1)) 451 | z = tf.concat(zs, axis=-1) 452 | if bn: 453 | z = tf.layers.BatchNormalization()(z, training=training) 454 | return z 455 | 456 | 457 | def attend(x, sequence_length=None, method="ave", context=None, encode_dim=None, 458 | feature_dim=None, attention_dim=None, mask_zero=False, maxlen=None, 459 | bn=False, training=False, seed=0, scope_name="attention", reuse=False, 460 | num_heads=1): 461 | if isinstance(method, list): 462 | outputs = [None]*len(method) 463 | for i,m in enumerate(method): 464 | outputs[i] = _attend(x, sequence_length, m, context, 
encode_dim, feature_dim, attention_dim, mask_zero, maxlen, 465 | bn, training, seed, scope_name+m, reuse, num_heads) 466 | return tf.concat(outputs, axis=-1) 467 | else: 468 | return _attend(x, sequence_length, method, context, encode_dim, feature_dim, attention_dim, mask_zero, maxlen, 469 | bn, training, seed, scope_name+method, reuse, num_heads) 470 | 471 | 472 | #### Step 4 473 | def _dense_block_mode1(x, hidden_units, dropouts, densenet=False, scope_name="dense_block", reuse=False, training=False, seed=0, bn=False): 474 | """ 475 | :param x: 476 | :param hidden_units: 477 | :param dropouts: 478 | :param densenet: enable densenet 479 | :return: 480 | Ref: https://github.com/titu1994/DenseNet 481 | """ 482 | for i, (h, d) in enumerate(zip(hidden_units, dropouts)): 483 | scope_name_i = "%s-dense_block_mode1-%s"%(str(scope_name), str(i)) 484 | with tf.variable_scope(scope_name, reuse=reuse): 485 | z = tf.layers.dense(x, h, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * i), 486 | reuse=reuse, 487 | name=scope_name_i) 488 | if bn: 489 | z = batch_normalization(z, training=training, name=scope_name_i+"-bn") 490 | z = tf.nn.relu(z) 491 | z = tf.layers.Dropout(d, seed=seed * i)(z, training=training) if d > 0 else z 492 | if densenet: 493 | x = tf.concat([x, z], axis=-1) 494 | else: 495 | x = z 496 | return x 497 | 498 | 499 | def _dense_block_mode2(x, hidden_units, dropouts, densenet=False, training=False, seed=0, bn=False, name="dense_block"): 500 | """ 501 | :param x: 502 | :param hidden_units: 503 | :param dropouts: 504 | :param densenet: enable densenet 505 | :return: 506 | Ref: https://github.com/titu1994/DenseNet 507 | """ 508 | for i, (h, d) in enumerate(zip(hidden_units, dropouts)): 509 | if bn: 510 | z = batch_normalization(x, training=training, name=name + "-" + str(i)) 511 | z = tf.nn.relu(z) 512 | z = tf.layers.Dropout(d, seed=seed * i)(z, training=training) if d > 0 else z 513 | z = tf.layers.Dense(h, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * i), dtype=tf.float32, 514 | bias_initializer=tf.zeros_initializer())(z) 515 | if densenet: 516 | x = tf.concat([x, z], axis=-1) 517 | else: 518 | x = z 519 | return x 520 | 521 | 522 | def dense_block(x, hidden_units, dropouts, densenet=False, scope_name="dense_block", reuse=False, training=False, seed=0, bn=False): 523 | return _dense_block_mode1(x, hidden_units, dropouts, densenet, scope_name, reuse, training, seed, bn) 524 | 525 | 526 | def _resnet_branch_mode1(x, hidden_units, dropouts, training, seed=0): 527 | h1, h2, h3 = hidden_units 528 | dr1, dr2, dr3 = dropouts 529 | name = "resnet_block" 530 | # branch 2 531 | x2 = tf.layers.Dense(h1, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 2), dtype=tf.float32, 532 | bias_initializer=tf.zeros_initializer())(x) 533 | x2 = tf.layers.BatchNormalization()(x2, training=training) 534 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(1)) 535 | x2 = tf.nn.relu(x2) 536 | x2 = tf.layers.Dropout(dr1, seed=seed * 1)(x2, training=training) if dr1 > 0 else x2 537 | 538 | x2 = tf.layers.Dense(h2, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 3), dtype=tf.float32, 539 | bias_initializer=tf.zeros_initializer())(x2) 540 | x2 = tf.layers.BatchNormalization()(x2, training=training) 541 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(2)) 542 | x2 = tf.nn.relu(x2) 543 | x2 = tf.layers.Dropout(dr2, seed=seed * 2)(x2, training=training) if dr2 > 0 else x2 544 | 545 | x2 = tf.layers.Dense(h3, 
kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 4), dtype=tf.float32, 546 | bias_initializer=tf.zeros_initializer())(x2) 547 | x2 = tf.layers.BatchNormalization()(x2, training=training) 548 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(3)) 549 | 550 | return x2 551 | 552 | 553 | def _resnet_block_mode1(x, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0): 554 | """A block that has a dense layer at shortcut. 555 | # Arguments 556 | input_tensor: input tensor 557 | kernel_size: default 3, the kernel size of middle conv layer at main path 558 | filters: list of integers, the filters of 3 conv layer at main path 559 | stage: integer, current stage label, used for generating layer names 560 | block: 'a','b'..., current block label, used for generating layer names 561 | # Returns 562 | Output tensor for the block. 563 | Note that from stage 3, the first conv layer at main path is with strides=(2,2) 564 | And the shortcut should have strides=(2,2) as well 565 | """ 566 | h1, h2, h3 = hidden_units 567 | dr1, dr2, dr3 = dropouts 568 | name = "resnet_block" 569 | xs = [] 570 | # branch 0 571 | if dense_shortcut: 572 | x0 = tf.layers.Dense(h3, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 1), dtype=tf.float32, 573 | bias_initializer=tf.zeros_initializer())(x) 574 | x0 = tf.layers.BatchNormalization()(x0, training=training) 575 | # x0 = batch_normalization(x0, training=training, name=name + "-" + str(0)) 576 | xs.append(x0) 577 | else: 578 | xs.append(x) 579 | 580 | # branch 1 ~ cardinality 581 | for i in range(cardinality): 582 | xs.append(_resnet_branch_mode1(x, hidden_units, dropouts, training, seed)) 583 | 584 | x = tf.add_n(xs) 585 | x = tf.nn.relu(x) 586 | x = tf.layers.Dropout(dr3, seed=seed * 4)(x, training=training) if dr3 > 0 else x 587 | return x 588 | 589 | 590 | def _resnet_branch_mode2(x, hidden_units, dropouts, training=False, seed=0, scope_name="_resnet_branch_mode2", reuse=False): 591 | h1, h2, h3 = hidden_units 592 | dr1, dr2, dr3 = dropouts 593 | # name = "resnet" 594 | with tf.variable_scope(scope_name, reuse=reuse): 595 | # branch 2: bn-relu->weight 596 | x2 = tf.layers.BatchNormalization()(x) 597 | # x2 = batch_normalization(x, training=training, name=scope_name + "-bn-" + str(1)) 598 | x2 = tf.nn.relu(x2) 599 | x2 = tf.layers.Dropout(dr1)(x2, training=training) if dr1 > 0 else x2 600 | x2 = tf.layers.dense(x2, h1, kernel_initializer=tf.glorot_uniform_initializer(seed * 1), 601 | bias_initializer=tf.zeros_initializer(), 602 | name=scope_name+"-dense-"+str(1), 603 | reuse=reuse) 604 | 605 | x2 = tf.layers.BatchNormalization()(x2) 606 | # x2 = batch_normalization(x2, training=training, name=scope_name + "-bn-" + str(2)) 607 | x2 = tf.nn.relu(x2) 608 | x2 = tf.layers.Dropout(dr2)(x2, training=training) if dr2 > 0 else x2 609 | x2 = tf.layers.dense(x2, h2, kernel_initializer=tf.glorot_uniform_initializer(seed * 2), 610 | bias_initializer=tf.zeros_initializer(), 611 | name=scope_name + "-dense-" + str(2), 612 | reuse=reuse) 613 | 614 | x2 = tf.layers.BatchNormalization()(x2) 615 | # x2 = batch_normalization(x2, training=training, name=scope_name + "-bn-" + str(3)) 616 | x2 = tf.nn.relu(x2) 617 | x2 = tf.layers.Dropout(dr3)(x2, training=training) if dr3 > 0 else x2 618 | x2 = tf.layers.dense(x2, h3, kernel_initializer=tf.glorot_uniform_initializer(seed * 3), 619 | bias_initializer=tf.zeros_initializer(), 620 | name=scope_name + "-dense-" + str(3), 621 | reuse=reuse) 622 | 623 | return x2 624 | 
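# ---------------------------------------------------------------------------
# Illustrative sketch (NOT part of the original module): how the building
# blocks above fit the "embed -> encode -> attend -> predict" recipe quoted
# at the top of this file. All names, shapes and hyper-parameters below
# (vocab_size, embed_dim, maxlen, padding id 0, the "sketch_*" scopes) are
# assumptions made only for this example.
def _pipeline_sketch(word_ids, training, vocab_size=1000, embed_dim=32, maxlen=20):
    # word_ids: int tensor of shape None * maxlen, assumed to be padded with id 0
    seq_len = tf.reduce_sum(tf.cast(tf.not_equal(word_ids, 0), tf.int32), axis=1)
    params = {
        "cnn_num_layers": 1,
        "cnn_num_filters": 8,
        "cnn_filter_sizes": [2, 3],
        "cnn_timedistributed": False,
    }
    # embed: None * maxlen * embed_dim
    emb = embed(word_ids, size=vocab_size, dim=embed_dim)
    # encode: None * maxlen * encode_dim; encode() writes the resulting
    # encode_dim back into params["encode_dim"]
    enc = encode(emb, method="textcnn", params=params, input_dim=embed_dim,
                 sequence_length=seq_len, mask_zero=True,
                 scope_name="sketch_encode", training=training)
    # attend: None * encode_dim (masked average pooling over the time axis)
    feat = attend(enc, sequence_length=seq_len, method="ave",
                  encode_dim=params["encode_dim"], feature_dim=params["encode_dim"],
                  attention_dim=16, mask_zero=True, maxlen=maxlen,
                  training=training, scope_name="sketch_attend")
    # predict: a small fully-connected head; dense_block applies ReLU after
    # each layer, so a final linear layer / loss would normally go on top
    out = mlp_layer(feat, fc_type="fc", hidden_units=[16, 8], dropouts=[0.0, 0.0],
                    scope_name="sketch_mlp", training=training)
    return out
# ---------------------------------------------------------------------------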
625 | 626 | def _resnet_block_mode2(x, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0, 627 | scope_name="_resnet_block_mode2", reuse=False): 628 | """A block that has a dense layer at shortcut. 629 | # Arguments 630 | input_tensor: input tensor 631 | kernel_size: default 3, the kernel size of middle conv layer at main path 632 | filters: list of integers, the filters of 3 conv layer at main path 633 | stage: integer, current stage label, used for generating layer names 634 | block: 'a','b'..., current block label, used for generating layer names 635 | # Returns 636 | Output tensor for the block. 637 | Note that from stage 3, the first conv layer at main path is with strides=(2,2) 638 | And the shortcut should have strides=(2,2) as well 639 | """ 640 | h1, h2, h3 = hidden_units 641 | dr1, dr2, dr3 = dropouts 642 | 643 | xs = [] 644 | # branch 0 645 | if dense_shortcut: 646 | with tf.variable_scope(scope_name, reuse=reuse): 647 | x0 = tf.layers.dense(x, h3, kernel_initializer=tf.glorot_uniform_initializer(seed * 1), 648 | bias_initializer=tf.zeros_initializer(), 649 | reuse=reuse, 650 | name=scope_name+"-dense-"+str("0")) 651 | xs.append(x0) 652 | else: 653 | xs.append(x) 654 | 655 | # branch 1 ~ cardinality 656 | for i in range(cardinality): 657 | xs.append(_resnet_branch_mode2(x, hidden_units, dropouts, training, seed, scope_name, reuse)) 658 | 659 | x = tf.add_n(xs) 660 | return x 661 | 662 | 663 | def resnet_block(input_tensor, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0, 664 | scope_name="resnet_block", reuse=False): 665 | return _resnet_block_mode2(input_tensor, hidden_units, dropouts, cardinality, dense_shortcut, training, seed, 666 | scope_name, reuse) 667 | 668 | 669 | def mlp_layer(input, fc_type, hidden_units, dropouts, scope_name, reuse=False, training=False, seed=0): 670 | if fc_type == "fc": 671 | output = dense_block(input, hidden_units=hidden_units, dropouts=dropouts, 672 | densenet=False, scope_name=scope_name, 673 | reuse=reuse, 674 | training=training, seed=seed) 675 | elif fc_type == "densenet": 676 | output = dense_block(input, hidden_units=hidden_units, dropouts=dropouts, 677 | densenet=True, scope_name=scope_name, 678 | reuse=reuse, 679 | training=training, seed=seed) 680 | elif fc_type == "resnet": 681 | output = resnet_block(input, hidden_units=hidden_units, dropouts=dropouts, 682 | cardinality=1, dense_shortcut=True, training=training, 683 | reuse=reuse, 684 | seed=seed, 685 | scope_name=scope_name) 686 | return output 687 | -------------------------------------------------------------------------------- /src/tf_common/optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&cad=rja&uact=8&ved=0ahUKEwih7-6VlejYAhWGS98KHWeLCWQQFgg3MAE&url=https%3A%2F%2Fwww.bigdatarepublic.nl%2Fcustom-optimizer-in-tensorflow%2F&usg=AOvVaw3jmxRDqr2pkGRLvX6rNJrl 4 | """ 5 | 6 | import tensorflow as tf 7 | from tensorflow.python.framework import constant_op 8 | from tensorflow.python.ops import random_ops 9 | from tensorflow.python.eager import context 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.ops import array_ops 12 | from tensorflow.python.ops import control_flow_ops 13 | from tensorflow.python.ops import math_ops 14 | from tensorflow.python.ops import state_ops 15 | from tensorflow.python.ops import variable_scope 16 | from tensorflow.python.training import 
optimizer 17 | from tensorflow.python.training import training_ops 18 | 19 | 20 | class LazyPowerSignOptimizer(optimizer.Optimizer): 21 | """Implementation of PowerSign. 22 | See [Bello et. al., 2017](https://arxiv.org/abs/1709.07417) 23 | @@__init__ 24 | """ 25 | 26 | def __init__(self, learning_rate=0.001, alpha=0.01, beta=0.5, use_locking=False, name="PowerSign"): 27 | super(LazyPowerSignOptimizer, self).__init__(use_locking, name) 28 | self._lr = learning_rate 29 | self._alpha = alpha 30 | self._beta = beta 31 | 32 | # Tensor versions of the constructor arguments, created in _prepare(). 33 | self._lr_t = None 34 | self._alpha_t = None 35 | self._beta_t = None 36 | 37 | def _prepare(self): 38 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 39 | self._alpha_t = ops.convert_to_tensor(self._beta, name="alpha_t") 40 | self._beta_t = ops.convert_to_tensor(self._beta, name="beta_t") 41 | 42 | def _create_slots(self, var_list): 43 | # Create slots for the first and second moments. 44 | for v in var_list: 45 | self._zeros_slot(v, "m", self._name) 46 | 47 | def _apply_dense(self, grad, var): 48 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 49 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 50 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 51 | 52 | eps = 1e-7 # cap for moving average 53 | 54 | m = self.get_slot(var, "m") 55 | m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad))) 56 | 57 | var_update = state_ops.assign_sub(var, lr_t * grad * tf.exp( 58 | tf.log(alpha_t) * tf.sign(grad) * tf.sign(m_t))) # Update 'ref' by subtracting 'value 59 | # Create an op that groups multiple operations. 60 | # When this op finishes, all ops in input have finished 61 | return control_flow_ops.group(*[var_update, m_t]) 62 | 63 | def _apply_sparse(self, grad, var): 64 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 65 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 66 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 67 | 68 | eps = 1e-7 # cap for moving average 69 | 70 | m = self.get_slot(var, "m") 71 | m_slice = tf.gather(m, grad.indices) 72 | m_t = state_ops.scatter_update(m, grad.indices, 73 | tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values))) 74 | m_t_slice = tf.gather(m_t, grad.indices) 75 | 76 | var_update = state_ops.scatter_sub(var, grad.indices, lr_t * grad.values * tf.exp( 77 | tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice))) # Update 'ref' by subtracting 'value 78 | # Create an op that groups multiple operations. 79 | # When this op finishes, all ops in input have finished 80 | return control_flow_ops.group(*[var_update, m_t]) 81 | 82 | 83 | class LazyAddSignOptimizer(optimizer.Optimizer): 84 | """Implementation of AddSign. 85 | See [Bello et. al., 2017](https://arxiv.org/abs/1709.07417) 86 | @@__init__ 87 | """ 88 | 89 | def __init__(self, learning_rate=1.001, alpha=0.01, beta=0.5, use_locking=False, name="AddSign"): 90 | super(LazyAddSignOptimizer, self).__init__(use_locking, name) 91 | self._lr = learning_rate 92 | self._alpha = alpha 93 | self._beta = beta 94 | 95 | # Tensor versions of the constructor arguments, created in _prepare(). 
96 | self._lr_t = None 97 | self._alpha_t = None 98 | self._beta_t = None 99 | 100 | def _prepare(self): 101 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 102 | self._alpha_t = ops.convert_to_tensor(self._beta, name="beta_t") 103 | self._beta_t = ops.convert_to_tensor(self._beta, name="beta_t") 104 | 105 | def _create_slots(self, var_list): 106 | # Create slots for the first and second moments. 107 | for v in var_list: 108 | self._zeros_slot(v, "m", self._name) 109 | 110 | def _apply_dense(self, grad, var): 111 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 112 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 113 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 114 | 115 | eps = 1e-7 # cap for moving average 116 | 117 | m = self.get_slot(var, "m") 118 | m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad))) 119 | 120 | var_update = state_ops.assign_sub(var, lr_t * grad * (1.0 + alpha_t * tf.sign(grad) * tf.sign(m_t))) 121 | # Create an op that groups multiple operations 122 | # When this op finishes, all ops in input have finished 123 | return control_flow_ops.group(*[var_update, m_t]) 124 | 125 | def _apply_sparse(self, grad, var): 126 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 127 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 128 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 129 | 130 | eps = 1e-7 # cap for moving average 131 | 132 | m = self.get_slot(var, "m") 133 | m_slice = tf.gather(m, grad.indices) 134 | m_t = state_ops.scatter_update(m, grad.indices, 135 | tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values))) 136 | m_t_slice = tf.gather(m_t, grad.indices) 137 | 138 | var_update = state_ops.scatter_sub(var, grad.indices, 139 | lr_t * grad.values * ( 140 | 1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice))) 141 | 142 | # Create an op that groups multiple operations 143 | # When this op finishes, all ops in input have finished 144 | return control_flow_ops.group(*[var_update, m_t]) 145 | 146 | 147 | class LazyAMSGradOptimizer(optimizer.Optimizer): 148 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 149 | use_locking=False, name="AMSGrad"): 150 | super(LazyAMSGradOptimizer, self).__init__(use_locking, name) 151 | self._lr = learning_rate 152 | self._beta1 = beta1 153 | self._beta2 = beta2 154 | self._epsilon = epsilon 155 | 156 | # Tensor versions of the constructor arguments, created in _prepare(). 157 | self._lr_t = None 158 | self._beta1_t = None 159 | self._beta2_t = None 160 | self._epsilon_t = None 161 | 162 | def _prepare(self): 163 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 164 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 165 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 166 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 167 | 168 | def _create_slots(self, var_list): 169 | # Create slots for the first and second moments. 
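        # AMSGrad also keeps a "v_prime" slot: the element-wise running maximum of the
        # second-moment estimate v. The update divides by sqrt(v_prime) instead of sqrt(v),
        # so the effective step size never increases (Reddi et al., 2018).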
170 | for v in var_list: 171 | self._zeros_slot(v, "m", self._name) 172 | self._zeros_slot(v, "v", self._name) 173 | self._zeros_slot(v, "v_prime", self._name) 174 | 175 | def _apply_dense(self, grad, var): 176 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 177 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 178 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 179 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 180 | 181 | # the following equations given in [1] 182 | # m_t = beta1 * m + (1 - beta1) * g_t 183 | m = self.get_slot(var, "m") 184 | m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking) 185 | 186 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 187 | v = self.get_slot(var, "v") 188 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 189 | v_prime = self.get_slot(var, "v_prime") 190 | v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t)) 191 | 192 | var_update = state_ops.assign_sub(var, 193 | lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t), 194 | use_locking=self._use_locking) 195 | 196 | return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime]) 197 | 198 | # keras Nadam update rule 199 | def _apply_sparse(self, grad, var): 200 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 201 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 202 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 203 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 204 | 205 | # the following equations given in [1] 206 | # m_t = beta1 * m + (1 - beta1) * g_t 207 | m = self.get_slot(var, "m") 208 | m_t = state_ops.scatter_update(m, grad.indices, 209 | beta1_t * array_ops.gather(m, grad.indices) + 210 | (1. - beta1_t) * grad.values, 211 | use_locking=self._use_locking) 212 | m_t_slice = tf.gather(m_t, grad.indices) 213 | 214 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 215 | v = self.get_slot(var, "v") 216 | v_t = state_ops.scatter_update(v, grad.indices, 217 | beta2_t * array_ops.gather(v, grad.indices) + 218 | (1. - beta2_t) * tf.square(grad.values), 219 | use_locking=self._use_locking) 220 | v_prime = self.get_slot(var, "v_prime") 221 | v_t_slice = tf.gather(v_t, grad.indices) 222 | v_prime_slice = tf.gather(v_prime, grad.indices) 223 | v_t_prime = state_ops.scatter_update(v_prime, grad.indices, tf.maximum(v_prime_slice, v_t_slice)) 224 | 225 | v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices) 226 | var_update = state_ops.scatter_sub(var, grad.indices, 227 | lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 228 | use_locking=self._use_locking) 229 | 230 | return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime]) 231 | 232 | 233 | class LazyNadamOptimizer(optimizer.Optimizer): 234 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 235 | schedule_decay=0.004, use_locking=False, name="Nadam"): 236 | super(LazyNadamOptimizer, self).__init__(use_locking, name) 237 | self._lr = learning_rate 238 | self._beta1 = beta1 239 | self._beta2 = beta2 240 | self._epsilon = epsilon 241 | self._schedule_decay = schedule_decay 242 | # momentum cache decay 243 | self._momentum_cache_decay = tf.cast(0.96, tf.float32) 244 | self._momentum_cache_const = tf.pow(self._momentum_cache_decay, 1. * schedule_decay) 245 | 246 | # Tensor versions of the constructor arguments, created in _prepare(). 
247 | self._lr_t = None 248 | self._beta1_t = None 249 | self._beta2_t = None 250 | self._epsilon_t = None 251 | self._schedule_decay_t = None 252 | 253 | # Variables to accumulate the powers of the beta parameters. 254 | # Created in _create_slots when we know the variables to optimize. 255 | self._beta1_power = None 256 | self._beta2_power = None 257 | self._iterations = None 258 | self._m_schedule = None 259 | 260 | # Created in SparseApply if needed. 261 | self._updated_lr = None 262 | 263 | def _prepare(self): 264 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 265 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 266 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 267 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 268 | self._schedule_decay_t = ops.convert_to_tensor(self._schedule_decay, name="schedule_decay") 269 | 270 | def _create_slots(self, var_list): 271 | # Create the beta1 and beta2 accumulators on the same device as the first 272 | # variable. Sort the var_list to make sure this device is consistent across 273 | # workers (these need to go on the same PS, otherwise some updates are 274 | # silently ignored). 275 | first_var = min(var_list, key=lambda x: x.name) 276 | 277 | create_new = self._iterations is None 278 | if not create_new and context.in_graph_mode(): 279 | create_new = (self._iterations.graph is not first_var.graph) 280 | 281 | if create_new: 282 | with ops.colocate_with(first_var): 283 | self._beta1_power = variable_scope.variable(self._beta1, 284 | name="beta1_power", 285 | trainable=False) 286 | self._beta2_power = variable_scope.variable(self._beta2, 287 | name="beta2_power", 288 | trainable=False) 289 | self._iterations = variable_scope.variable(0., 290 | name="iterations", 291 | trainable=False) 292 | self._m_schedule = variable_scope.variable(1., 293 | name="m_schedule", 294 | trainable=False) 295 | # Create slots for the first and second moments. 296 | for v in var_list: 297 | self._zeros_slot(v, "m", self._name) 298 | self._zeros_slot(v, "v", self._name) 299 | 300 | def _get_momentum_cache(self, schedule_decay_t, t): 301 | return tf.pow(self._momentum_cache_decay, t * schedule_decay_t) 302 | # return beta1_t * (1. - 0.5 * (tf.pow(self._momentum_cache_decay, t * schedule_decay_t))) 303 | 304 | """very slow 305 | we simply use the nadam update rule without warming momentum schedule 306 | def _apply_dense(self, grad, var): 307 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 308 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 309 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 310 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 311 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 312 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 313 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 314 | 315 | # Due to the recommendations in [2], i.e. warming momentum schedule 316 | # see keras Nadam 317 | momentum_cache_t = self._get_momentum_cache(beta1_t, schedule_decay_t, t) 318 | momentum_cache_t_1 = self._get_momentum_cache(beta1_t, schedule_decay_t, t+1.) 319 | m_schedule_new = m_schedule * momentum_cache_t 320 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 321 | 322 | # the following equations given in [1] 323 | # m_t = beta1 * m + (1 - beta1) * g_t 324 | m = self.get_slot(var, "m") 325 | m_t = state_ops.assign(m, beta1_t * m + (1. 
- beta1_t) * grad, use_locking=self._use_locking) 326 | g_prime = grad / (1. - m_schedule_new) 327 | m_t_prime = m_t / (1. - m_schedule_next) 328 | m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime 329 | 330 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 331 | v = self.get_slot(var, "v") 332 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 333 | v_t_prime = v_t / (1. - tf.pow(beta2_t, t)) 334 | 335 | var_update = state_ops.assign_sub(var, 336 | lr_t * m_t_bar / (tf.sqrt(v_t_prime) + epsilon_t), 337 | use_locking=self._use_locking) 338 | 339 | return control_flow_ops.group(*[var_update, m_t, v_t]) 340 | """ 341 | 342 | # nadam update rule without warming momentum schedule 343 | def _apply_dense(self, grad, var): 344 | m = self.get_slot(var, "m") 345 | v = self.get_slot(var, "v") 346 | return training_ops.apply_adam( 347 | var, 348 | m, 349 | v, 350 | math_ops.cast(self._beta1_power, var.dtype.base_dtype), 351 | math_ops.cast(self._beta2_power, var.dtype.base_dtype), 352 | math_ops.cast(self._lr_t, var.dtype.base_dtype), 353 | math_ops.cast(self._beta1_t, var.dtype.base_dtype), 354 | math_ops.cast(self._beta2_t, var.dtype.base_dtype), 355 | math_ops.cast(self._epsilon_t, var.dtype.base_dtype), 356 | grad, 357 | use_locking=self._use_locking, 358 | use_nesterov=True).op 359 | 360 | def _resource_apply_dense(self, grad, var): 361 | m = self.get_slot(var, "m") 362 | v = self.get_slot(var, "v") 363 | return training_ops.resource_apply_adam( 364 | var.handle, 365 | m.handle, 366 | v.handle, 367 | math_ops.cast(self._beta1_power, grad.dtype.base_dtype), 368 | math_ops.cast(self._beta2_power, grad.dtype.base_dtype), 369 | math_ops.cast(self._lr_t, grad.dtype.base_dtype), 370 | math_ops.cast(self._beta1_t, grad.dtype.base_dtype), 371 | math_ops.cast(self._beta2_t, grad.dtype.base_dtype), 372 | math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), 373 | grad, 374 | use_locking=self._use_locking, 375 | use_nesterov=True) 376 | 377 | # keras Nadam update rule 378 | def _apply_sparse(self, grad, var): 379 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 380 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 381 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 382 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 383 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 384 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 385 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 386 | 387 | # Due to the recommendations in [2], i.e. warming momentum schedule 388 | momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t) 389 | momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power) 390 | momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const) 391 | m_schedule_new = m_schedule * momentum_cache_t 392 | m_schedule_next = m_schedule_new * momentum_cache_t_1 393 | 394 | # the following equations given in [1] 395 | # m_t = beta1 * m + (1 - beta1) * g_t 396 | m = self.get_slot(var, "m") 397 | m_t = state_ops.scatter_update(m, grad.indices, 398 | beta1_t * array_ops.gather(m, grad.indices) + 399 | (1. - beta1_t) * grad.values, 400 | use_locking=self._use_locking) 401 | g_prime_slice = grad.values / (1. - m_schedule_new) 402 | m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next) 403 | m_t_bar_slice = (1. 
- momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice 404 | 405 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 406 | v = self.get_slot(var, "v") 407 | v_t = state_ops.scatter_update(v, grad.indices, 408 | beta2_t * array_ops.gather(v, grad.indices) + 409 | (1. - beta2_t) * tf.square(grad.values), 410 | use_locking=self._use_locking) 411 | v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t)) 412 | 413 | var_update = state_ops.scatter_sub(var, grad.indices, 414 | lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 415 | use_locking=self._use_locking) 416 | 417 | return control_flow_ops.group(*[var_update, m_t, v_t]) 418 | 419 | def _finish(self, update_ops, name_scope): 420 | # Update the power accumulators. 421 | with ops.control_dependencies(update_ops): 422 | with ops.colocate_with(self._iterations): 423 | update_beta1 = self._beta1_power.assign( 424 | self._beta1_power * self._beta1_t, 425 | use_locking=self._use_locking) 426 | update_beta2 = self._beta2_power.assign( 427 | self._beta2_power * self._beta2_t, 428 | use_locking=self._use_locking) 429 | t = self._iterations + 1. 430 | update_iterations = self._iterations.assign(t, use_locking=self._use_locking) 431 | momentum_cache_power = self._get_momentum_cache(self._schedule_decay_t, t) 432 | momentum_cache_t = self._beta1_t * (1. - 0.5 * momentum_cache_power) 433 | update_m_schedule = self._m_schedule.assign( 434 | self._m_schedule * momentum_cache_t, 435 | use_locking=self._use_locking) 436 | return control_flow_ops.group( 437 | *update_ops + [update_beta1, update_beta2] + [update_iterations, update_m_schedule], 438 | name=name_scope) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for distance computation 5 | """ 6 | 7 | import warnings 8 | warnings.filterwarnings("ignore") 9 | import numpy as np 10 | try: 11 | import lzma 12 | import Levenshtein 13 | except: 14 | pass 15 | from difflib import SequenceMatcher 16 | from rouge import Rouge 17 | from utils import ngram_utils, np_utils 18 | 19 | 20 | def _edit_dist(str1, str2): 21 | try: 22 | # very fast 23 | # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed 24 | # d = Levenshtein.ratio(str1, str2) 25 | d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2))) 26 | except: 27 | # https://docs.python.org/2/library/difflib.html 28 | d = 1. 
- SequenceMatcher(lambda x: x==" ", str1, str2).ratio() 29 | return d 30 | 31 | 32 | def _longest_match_size(str1, str2): 33 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 34 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 35 | return match.size 36 | 37 | 38 | def _longest_match_ratio(str1, str2): 39 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 40 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 41 | return np_utils._try_divide(match.size, min(len(str1), len(str2))) 42 | 43 | 44 | def _common_num(s1, s2): 45 | c = 0 46 | for s1_ in s1: 47 | for s2_ in s2: 48 | if s1_ == s2_: 49 | c += 1 50 | return c 51 | 52 | 53 | def _count_stats(s1, s2): 54 | # length 55 | l1 = len(s1) 56 | l2 = len(s2) 57 | len_diff = np_utils._try_divide(np.abs(l1-l2), (l1+l2)/2.) 58 | 59 | # set 60 | s1_set = set(s1) 61 | s2_set = set(s2) 62 | 63 | # unique length 64 | l1_unique = len(s1_set) 65 | l2_unique = len(s2_set) 66 | len_diff_unique = np_utils._try_divide(np.abs(l1_unique-l2_unique), (l1_unique+l2_unique)/2.) 67 | 68 | # unique ratio 69 | r1_unique = np_utils._try_divide(l1_unique, l1) 70 | r2_unique = np_utils._try_divide(l2_unique, l2) 71 | 72 | # jaccard coef 73 | li = len(s1_set.intersection(s2_set)) 74 | lu = len(s1_set.union(s2_set)) 75 | jaccard_coef = np_utils._try_divide(li, lu) 76 | 77 | # dice coef 78 | dice_coef = np_utils._try_divide(li, l1_unique + l2_unique) 79 | 80 | # common number 81 | common_ = _common_num(s1, s2) 82 | common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.) 83 | common_ratio_max = np_utils._try_divide(common_, min(l1, l2)) 84 | common_ratio_min = np_utils._try_divide(common_, max(l1, l2)) 85 | 86 | # over all features 87 | f = [l1, l2, len_diff, 88 | l1_unique, l2_unique, len_diff_unique, 89 | r1_unique, r2_unique, 90 | li, lu, jaccard_coef, dice_coef, 91 | common_, common_ratio_avg, common_ratio_max, common_ratio_min 92 | ] 93 | return np.array(f, dtype=np.float32) 94 | 95 | 96 | rouge = Rouge() 97 | def _get_rouge_feat(s1, s2): 98 | if isinstance(s1, list): 99 | s1 = " ".join(s1) 100 | if isinstance(s2, list): 101 | s2 = " ".join(s2) 102 | scores = rouge.get_scores(s1, s2) 103 | feat = [] 104 | for k,v in scores[0].items(): 105 | feat.extend(v.values()) 106 | return np.array(feat, dtype=np.float32) 107 | 108 | 109 | def _get_bleu(s1, s2): 110 | count_dict={} 111 | count_dict_clip={} 112 | #1. count for each token at predict sentence side. 113 | for token in s1: 114 | if token not in count_dict: 115 | count_dict[token]=1 116 | else: 117 | count_dict[token]=count_dict[token]+1 118 | count=np.sum([value for key,value in count_dict.items()]) 119 | 120 | #2.count for tokens existing in predict sentence for target sentence side. 121 | for token in s2: 122 | if token in count_dict: 123 | if token not in count_dict_clip: 124 | count_dict_clip[token]=1 125 | else: 126 | count_dict_clip[token]=count_dict_clip[token]+1 127 | 128 | #3. 
clip value to ceiling value for that token 129 | count_dict_clip={key:(value if value<=count_dict[key] else count_dict[key]) for key,value in count_dict_clip.items()} 130 | count_clip=np.sum([value for key,value in count_dict_clip.items()]) 131 | result=float(count_clip)/(float(count)+0.00000001) 132 | return result 133 | 134 | 135 | def _get_bleu_feat(s1, s2, ngrams=3): 136 | if isinstance(s1, str): 137 | s1 = s1.split(" ") 138 | if isinstance(s2, str): 139 | s2 = s2.split(" ") 140 | feat = [] 141 | for ngram in range(ngrams+1): 142 | s1_ngram = ngram_utils._ngrams(s1, ngram+1, "_") 143 | s2_ngram = ngram_utils._ngrams(s2, ngram+1, "_") 144 | feat.append(_get_bleu(s1_ngram, s2_ngram)) 145 | return np.array(feat, dtype=np.float32) 146 | 147 | 148 | 149 | if __name__ == "__main__": 150 | s1 = ["W1", "W2", "W3", "W4", "W10"] 151 | s2 = ["W1", "W2", "W4", "W6", "W8"] 152 | print(_count_stats(s1, s2)) 153 | print(_edit_dist(s1, s2)) 154 | print(_longest_match_size(s1, s2)) 155 | print(_longest_match_ratio(s1, s2)) 156 | print(_get_rouge_feat(s1, s2)) 157 | print(_get_bleu_feat(s1, s2)) -------------------------------------------------------------------------------- /src/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import logging 4 | import logging.handlers 5 | 6 | 7 | def _get_logger(logdir, logname, loglevel=logging.INFO): 8 | fmt = "[%(asctime)s] %(levelname)s: %(message)s" 9 | formatter = logging.Formatter(fmt) 10 | 11 | handler = logging.handlers.RotatingFileHandler( 12 | filename=os.path.join(logdir, logname), 13 | maxBytes=2 * 1024 * 1024 * 1024, 14 | backupCount=10) 15 | handler.setFormatter(formatter) 16 | 17 | logger = logging.getLogger("") 18 | logger.addHandler(handler) 19 | logger.setLevel(loglevel) 20 | return logger 21 | -------------------------------------------------------------------------------- /src/utils/ngram_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for ngram 5 | """ 6 | 7 | 8 | def _unigrams(words): 9 | """ 10 | Input: a list of words, e.g., ["I", "am", "Denny"] 11 | Output: a list of unigram 12 | """ 13 | assert type(words) == list 14 | return words 15 | 16 | 17 | def _bigrams(words, join_string, skip=0): 18 | """ 19 | Input: a list of words, e.g., ["I", "am", "Denny"] 20 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 21 | I use _ as join_string for this example. 22 | """ 23 | assert type(words) == list 24 | L = len(words) 25 | if L > 1: 26 | lst = [] 27 | for i in range(L-1): 28 | for k in range(1,skip+2): 29 | if i+k < L: 30 | lst.append( join_string.join([words[i], words[i+k]]) ) 31 | else: 32 | # set it as unigram 33 | lst = _unigrams(words) 34 | return lst 35 | 36 | 37 | def _trigrams(words, join_string, skip=0): 38 | """ 39 | Input: a list of words, e.g., ["I", "am", "Denny"] 40 | Output: a list of trigram, e.g., ["I_am_Denny"] 41 | I use _ as join_string for this example. 
42 | """ 43 | assert type(words) == list 44 | L = len(words) 45 | if L > 2: 46 | lst = [] 47 | for i in range(L-2): 48 | for k1 in range(1,skip+2): 49 | for k2 in range(1,skip+2): 50 | if i+k1 < L and i+k1+k2 < L: 51 | lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) ) 52 | else: 53 | # set it as bigram 54 | lst = _bigrams(words, join_string, skip) 55 | return lst 56 | 57 | 58 | def _fourgrams(words, join_string): 59 | """ 60 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 61 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 62 | I use _ as join_string for this example. 63 | """ 64 | assert type(words) == list 65 | L = len(words) 66 | if L > 3: 67 | lst = [] 68 | for i in range(L-3): 69 | lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) ) 70 | else: 71 | # set it as trigram 72 | lst = _trigrams(words, join_string) 73 | return lst 74 | 75 | 76 | def _uniterms(words): 77 | return _unigrams(words) 78 | 79 | 80 | def _biterms(words, join_string): 81 | """ 82 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 83 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 84 | I use _ as join_string for this example. 85 | """ 86 | assert type(words) == list 87 | L = len(words) 88 | if L > 1: 89 | lst = [] 90 | for i in range(L-1): 91 | for j in range(i+1,L): 92 | lst.append( join_string.join([words[i], words[j]]) ) 93 | else: 94 | # set it as uniterm 95 | lst = _uniterms(words) 96 | return lst 97 | 98 | 99 | def _triterms(words, join_string): 100 | """ 101 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 102 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 103 | I use _ as join_string for this example. 104 | """ 105 | assert type(words) == list 106 | L = len(words) 107 | if L > 2: 108 | lst = [] 109 | for i in range(L-2): 110 | for j in range(i+1,L-1): 111 | for k in range(j+1,L): 112 | lst.append( join_string.join([words[i], words[j], words[k]]) ) 113 | else: 114 | # set it as biterm 115 | lst = _biterms(words, join_string) 116 | return lst 117 | 118 | 119 | def _fourterms(words, join_string): 120 | """ 121 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 122 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 123 | I use _ as join_string for this example. 
124 | """ 125 | assert type(words) == list 126 | L = len(words) 127 | if L > 3: 128 | lst = [] 129 | for i in range(L-3): 130 | for j in range(i+1,L-2): 131 | for k in range(j+1,L-1): 132 | for l in range(k+1,L): 133 | lst.append( join_string.join([words[i], words[j], words[k], words[l]]) ) 134 | else: 135 | # set it as triterm 136 | lst = _triterms(words, join_string) 137 | return lst 138 | 139 | 140 | _ngram_str_map = { 141 | 1: "Unigram", 142 | 2: "Bigram", 143 | 3: "Trigram", 144 | 4: "Fourgram", 145 | 5: "Fivegram", 146 | 12: "UBgram", 147 | 123: "UBTgram", 148 | } 149 | 150 | 151 | def _ngrams(words, ngram, join_string=" "): 152 | """wrapper for ngram""" 153 | if ngram == 1: 154 | return _unigrams(words) 155 | elif ngram == 2: 156 | return _bigrams(words, join_string) 157 | elif ngram == 3: 158 | return _trigrams(words, join_string) 159 | elif ngram == 4: 160 | return _fourgrams(words, join_string) 161 | elif ngram == 12: 162 | unigram = _unigrams(words) 163 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 164 | return unigram + bigram 165 | elif ngram == 123: 166 | unigram = _unigrams(words) 167 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 168 | trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3] 169 | return unigram + bigram + trigram 170 | 171 | 172 | _nterm_str_map = { 173 | 1: "Uniterm", 174 | 2: "Biterm", 175 | 3: "Triterm", 176 | 4: "Fourterm", 177 | 5: "Fiveterm", 178 | } 179 | 180 | 181 | def _nterms(words, nterm, join_string=" "): 182 | """wrapper for nterm""" 183 | if nterm == 1: 184 | return _uniterms(words) 185 | elif nterm == 2: 186 | return _biterms(words, join_string) 187 | elif nterm == 3: 188 | return _triterms(words, join_string) 189 | elif nterm == 4: 190 | return _fourterms(words, join_string) 191 | 192 | 193 | if __name__ == "__main__": 194 | 195 | text = "I am Denny boy ha" 196 | words = text.split(" ") 197 | 198 | assert _ngrams(words, 1) == ["I", "am", "Denny", "boy", "ha"] 199 | assert _ngrams(words, 2) == ["I am", "am Denny", "Denny boy", "boy ha"] 200 | assert _ngrams(words, 3) == ["I am Denny", "am Denny boy", "Denny boy ha"] 201 | assert _ngrams(words, 4) == ["I am Denny boy", "am Denny boy ha"] 202 | 203 | assert _nterms(words, 1) == ["I", "am", "Denny", "boy", "ha"] 204 | assert _nterms(words, 2) == ["I am", "I Denny", "I boy", "I ha", "am Denny", "am boy", "am ha", "Denny boy", "Denny ha", "boy ha"] 205 | assert _nterms(words, 3) == ["I am Denny", "I am boy", "I am ha", "I Denny boy", "I Denny ha", "I boy ha", "am Denny boy", "am Denny ha", "am boy ha", "Denny boy ha"] 206 | assert _nterms(words, 4) == ["I am Denny boy", "I am Denny ha", "I am boy ha", "I Denny boy ha", "am Denny boy ha"] -------------------------------------------------------------------------------- /src/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _try_divide(x, y, val=0.0): 5 | """try to divide two numbers""" 6 | if y != 0.0: 7 | val = float(x) / y 8 | return val 9 | -------------------------------------------------------------------------------- /src/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import shutil 4 | 5 | 6 | def _makedirs(dir, force=False): 7 | if os.path.exists(dir): 8 | if force: 9 | shutil.rmtree(dir) 10 | os.makedirs(dir) 11 | else: 12 | os.makedirs(dir) 13 | 
-------------------------------------------------------------------------------- /src/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | 4 | 5 | def _timestamp(): 6 | now = datetime.datetime.now() 7 | now_str = now.strftime("%Y%m%d%H%M") 8 | return now_str -------------------------------------------------------------------------------- /src/utils/topk_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import defaultdict 3 | from random import randint 4 | 5 | 6 | # Bucket Sort 7 | # Time: O(n + klogk) ~ O(n + nlogn) 8 | # Space: O(n) 9 | class BucketSort(object): 10 | def topKFrequent(self, words, k): 11 | counts = defaultdict(int) 12 | for ws in words: 13 | for w in ws: 14 | counts[w] += 1 15 | 16 | buckets = [[] for _ in range(sum(counts.values()) + 1)]  # one independent list per bucket; [[]] * n would alias a single shared list 17 | for i, count in counts.items(): 18 | buckets[count].append(i) 19 | 20 | result = [] 21 | # result_append = result.append 22 | for i in reversed(range(len(buckets))): 23 | for j in range(len(buckets[i])): 24 | # slower 25 | # result_append(buckets[i][j]) 26 | result.append(buckets[i][j]) 27 | if len(result) == k: 28 | return result 29 | return result 30 | 31 | 32 | # Quick Select 33 | # Time: O(n) ~ O(n^2), O(n) on average. 34 | # Space: O(n) 35 | class QuickSelect(object): 36 | def topKFrequent(self, words, k): 37 | """ 38 | :type words: List[List[str]] 39 | :type k: int 40 | :rtype: List[str] 41 | """ 42 | counts = defaultdict(int) 43 | for ws in words: 44 | for w in ws: 45 | counts[w] += 1 46 | p = [] 47 | for key, val in counts.items(): 48 | p.append((-val, key)) 49 | self.kthElement(p, k) 50 | 51 | result = [] 52 | sorted_p = sorted(p[:k]) 53 | for i in range(k): 54 | result.append(sorted_p[i][1]) 55 | return result 56 | 57 | def kthElement(self, nums, k): # O(n) on average 58 | def PartitionAroundPivot(left, right, pivot_idx, nums): 59 | pivot_value = nums[pivot_idx] 60 | new_pivot_idx = left 61 | nums[pivot_idx], nums[right] = nums[right], nums[pivot_idx] 62 | for i in range(left, right): 63 | if nums[i] < pivot_value: 64 | nums[i], nums[new_pivot_idx] = nums[new_pivot_idx], nums[i] 65 | new_pivot_idx += 1 66 | 67 | nums[right], nums[new_pivot_idx] = nums[new_pivot_idx], nums[right] 68 | return new_pivot_idx 69 | 70 | left, right = 0, len(nums) - 1 71 | while left <= right: 72 | pivot_idx = randint(left, right) 73 | new_pivot_idx = PartitionAroundPivot(left, right, pivot_idx, nums) 74 | if new_pivot_idx == k - 1: 75 | return 76 | elif new_pivot_idx > k - 1: 77 | right = new_pivot_idx - 1 78 | else: # new_pivot_idx < k - 1. 79 | left = new_pivot_idx + 1 80 | 81 | 82 | top_k_selector = BucketSort() --------------------------------------------------------------------------------
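For reference, a minimal usage sketch of the selectors in `topk_utils.py`: both expect a list of token lists (e.g., tokenized questions) and return the `k` most frequent tokens. The sample data and the import path below are assumptions for illustration only.

```python
# Hypothetical input: a list of tokenized sentences, as consumed by topKFrequent.
from utils.topk_utils import BucketSort, QuickSelect  # import path is an assumption

docs = [
    ["W1", "W2", "W2", "W3"],
    ["W2", "W3", "W3", "W3"],
    ["W1", "W2"],
]

# Token counts here are W1:2, W2:4, W3:4, so the top-2 tokens are W2 and W3.
print(BucketSort().topKFrequent(docs, 2))   # ['W2', 'W3']
print(QuickSelect().topKFrequent(docs, 2))  # ['W2', 'W3'] (tie-breaking may differ in general)
```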