├── .gitignore ├── LICENSE ├── README.md ├── data ├── DATA.md ├── char_embed.txt ├── question.csv ├── test.csv ├── train.csv └── word_embed.txt └── src ├── __init__.py ├── config.py ├── inputs ├── __init__.py ├── data.py └── dynamic_pooling.py ├── main.py ├── models ├── __init__.py ├── base_model.py ├── bcnn.py ├── decatt.py ├── dsmm.py ├── dssm.py ├── esim.py ├── match_pyramid.py └── model_library.py ├── tf_common ├── __init__.py ├── metrics.py ├── nadam.py ├── nn_module.py └── optimizer.py └── utils ├── __init__.py ├── dist_utils.py ├── log_utils.py ├── ngram_utils.py ├── np_utils.py ├── os_utils.py ├── time_utils.py └── topk_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | *.py[cod] 3 | *$py.class 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | .static_storage/ 55 | .media/ 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | # 106 | .idea 107 | __pycache__ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Chenglong Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tensorflow-DSMM 2 | 3 | An ongoing project implementing various Deep Semantic Matching Models (DSMMs), which are widely used for: 4 | 5 | 6 | - duplicate detection 7 | - sentence similarity 8 | - question answering 9 | - search relevance 10 | - ... 11 | 12 | ## Quickstart 13 | ### Data 14 | This project is developed around the data format provided by the [第三届魔镜杯大赛](https://www.ppdai.ai/mirror/goToMirrorDetail?mirrorId=1) competition. 15 | 16 | See `/data/DATA.md` for a description of the data format and prepare your data accordingly. 17 | Your data should be placed in the `data` directory, which currently holds a toy dataset. 18 | 19 | To run a quick demo, you can download the data from the competition link above (downloading requires registration). 20 | 21 | ### Demo 22 | ```bash 23 | python src/main.py 24 | ``` 25 | 26 | ## Supported Models 27 | 28 | ### Representation based methods 29 | - DSSM style models 30 | - DSSM: uses FastText as the encoder 31 | - CDSSM: uses TextCNN as the encoder 32 | - RDSSM: uses TextRNN/TextBiRNN as the encoder 33 | 34 | ### Interaction based methods 35 | - MatchPyramid style models 36 | - MatchPyramid: uses identity/cosine similarity/dot product as the match matrix 37 | - General MatchPyramid: uses match matrices based on various embeddings and various match scores 38 | - word embeddings 39 | - original word embedding 40 | - compressed word embedding 41 | - contextual word embedding (uses an encoder to encode contextual information) 42 | - match score 43 | - identity 44 | - cosine similarity/dot product 45 | - element product 46 | - element concat 47 | - BCNN style models 48 | - BCNN 49 | - ABCNN1 50 | - ABCNN2 51 | - ABCNN3 52 | - ESIM 53 | - DecAtt (Decomposable Attention) 54 | 55 | 56 | ## Building Blocks 57 | ### Encoder layers 58 | - FastText 59 | - TimeDistributed Dense Projection 60 | - TextCNN (Gated CNN and also Residual Gated CNN) 61 | - TextRNN/TextBiRNN with GRU and LSTM cells 62 | 63 | ### Attention layers 64 | - mean/max/min pooling 65 | - scalar-based and vector-based attention 66 | - self and context attention 67 | - multi-head attention 68 | 69 | ## Acknowledgments 70 | This project draws inspiration from the following projects: 71 | - [MatchZoo](https://github.com/faneshion/MatchZoo) 72 | - [MatchPyramid-TensorFlow](https://github.com/pl8787/MatchPyramid-TensorFlow) 73 | - [ABCNN](https://github.com/galsang/ABCNN) 74 | -------------------------------------------------------------------------------- /data/DATA.md: -------------------------------------------------------------------------------- 1 | # Data Format 2 | ## char_embed.txt 3 | This file should contain the char embeddings. 4 | 5 | Each line should be `char_id embedding_vector`. For example, 6 | ```text 7 | C1 0 0 0 0 8 | C2 0.1 0.5 0.4 0.2 9 | C3 0.8 0.2 0.9 1.0 10 | C4 0.14 0.15 0.64 0.12 11 | ``` 12 | 13 | ## word_embed.txt 14 | This file should contain the word embeddings. 15 | 16 | Each line should be `word_id embedding_vector`.
For example, 17 | ```text 18 | W1 0 0 0 0 19 | W2 0.1 0.5 0.4 0.2 20 | W3 0.8 0.2 0.9 1.0 21 | W4 0.14 0.15 0.64 0.12 22 | ``` 23 | 24 | ## question.csv 25 | This file should contain all the questions that appear in `train.csv` and `test.csv`. 26 | 27 | Each line should be `question_id,word_sequence_ids,char_sequence_ids`. For example, 28 | ```text 29 | qid,words,chars 30 | Q1,W1 W2 W3,C31 C64 C45 C85 31 | Q2,W2 W9 W7 W10 W20,C39 C58 C3 32 | Q3,W23 W91 W7 W10 W290,C19 C81 C31 33 | Q4,W25 W9 W70 W101 W210,C92 C58 C33 34 | Q5,W22 W9 W7 W130 W20,C98 C85 C35 35 | Q6,W2 W19 W87,C39 C86 C34 36 | ``` 37 | 38 | ## train.csv 39 | This file should contain the training question pairs. 40 | 41 | Each line should be `label,q1,q2`, where `q1` is the id of question 1 and `q2` is the id of question 2. `label=1` means `q1` and `q2` have the same meaning; `label=0` means they have different meanings. For example, 42 | ```text 43 | label,q1,q2 44 | 1,Q1,Q2 45 | 0,Q1,Q3 46 | 0,Q2,Q4 47 | 0,Q5,Q1 48 | 1,Q2,Q6 49 | ``` 50 | 51 | ## test.csv 52 | This file should contain the testing question pairs. 53 | 54 | Each line should be `q1,q2`, where `q1` is the id of question 1 and `q2` is the id of question 2. For example, 55 | ```text 56 | q1,q2 57 | Q2,Q3 58 | Q6,Q5 59 | ``` -------------------------------------------------------------------------------- /data/char_embed.txt: -------------------------------------------------------------------------------- 1 | C1 0 0 0 0 2 | C2 0.1 0.5 0.4 0.2 3 | C3 0.8 0.2 0.9 1.0 4 | C4 0.14 0.15 0.64 0.12 -------------------------------------------------------------------------------- /data/question.csv: -------------------------------------------------------------------------------- 1 | qid,words,chars 2 | Q1,W1 W2 W3,C31 C64 C45 C85 3 | Q2,W2 W9 W7 W10 W20,C39 C58 C3 4 | Q3,W23 W91 W7 W10 W290,C19 C81 C31 5 | Q4,W25 W9 W70 W101 W210,C92 C58 C33 6 | Q5,W22 W9 W7 W130 W20,C98 C85 C35 7 | Q6,W2 W19 W87,C39 C86 C34 -------------------------------------------------------------------------------- /data/test.csv: -------------------------------------------------------------------------------- 1 | q1,q2 2 | Q2,Q3 3 | Q6,Q5 -------------------------------------------------------------------------------- /data/train.csv: -------------------------------------------------------------------------------- 1 | label,q1,q2 2 | 1,Q1,Q2 3 | 0,Q1,Q3 4 | 0,Q2,Q4 5 | 0,Q5,Q1 6 | 1,Q2,Q6 -------------------------------------------------------------------------------- /data/word_embed.txt: -------------------------------------------------------------------------------- 1 | W1 0 0 0 0 2 | W2 0.1 0.5 0.4 0.2 3 | W3 0.8 0.2 0.9 1.0 4 | W4 0.14 0.15 0.64 0.12 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/__init__.py -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | DATA_DIR = "../data" 4 | 5 | TRAIN_FILE = DATA_DIR + "/train.csv" 6 | TEST_FILE = DATA_DIR + "/test.csv" 7 | 8 | TRAIN_FEATURES_FILE = DATA_DIR + "/train_features.npy" 9 | TEST_FEATURES_FILE = DATA_DIR + "/test_features.npy" 10 | 11 | QUESTION_FILE = DATA_DIR + "/question.csv" 12 | 13 | WORD_EMBEDDING_FILE = DATA_DIR + "/word_embed.txt" 14 | CHAR_EMBEDDING_FILE =
DATA_DIR + "/char_embed.txt" 15 | 16 | SUB_DIR = "../sub" 17 | SUB_FILE = "submission.csv" 18 | SINGLE_SUB_FILE_PATTERN = "submission_%s_%s.csv" 19 | STACKING_SUB_FILE_PATTERN = "submission_%s.csv" 20 | 21 | 22 | # missing 23 | MISSING_INDEX_WORD = 20891 24 | PADDING_INDEX_WORD = 20892 25 | 26 | MISSING_INDEX_CHAR = 3048 27 | PADDING_INDEX_CHAR = 3049 28 | 29 | # ratio 30 | POS_RATIO_OFFLINE = 0.5191087559849992 31 | POS_RATIO_ONLINE = 0.50296075348400959 32 | 33 | """ 34 | -1/(p0 + p1) * (p0 * (0*log(0+eps) + (1-0)*log(1-0-eps)) + p1 * (1*log(0+eps) + (1-1)*log(1-0-eps))) = 17.371649 35 | -1/(p0 + p1) * (p0 * log(1-eps) + p1 * log(0+eps)) = 17.371649 36 | p1/(p0 + p1) ~= -17.371649/log(eps) 37 | = -17.371649/log(1e-15) 38 | = 0.50296075348400959 39 | """ 40 | 41 | NUM_TRAIN = 254386 42 | NUM_TEST = 172956 43 | 44 | TRAIN_RATIO = 0.7 45 | 46 | SPLIT_FILE = "split.pkl" 47 | -------------------------------------------------------------------------------- /src/inputs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/inputs/__init__.py -------------------------------------------------------------------------------- /src/inputs/data.py: -------------------------------------------------------------------------------- 1 | 2 | import config 3 | import numpy as np 4 | import pandas as pd 5 | import scipy as sp 6 | from keras.preprocessing.sequence import pad_sequences 7 | 8 | 9 | def _to_ind(qid): 10 | return int(qid[1:]) 11 | 12 | 13 | def load_raw_question(): 14 | df = pd.read_csv(config.QUESTION_FILE) 15 | df["words"] = df.words.str.split(" ") 16 | df["chars"] = df.chars.str.split(" ") 17 | Q = {} 18 | Q["words"] = df["words"].values 19 | Q["chars"] = df["chars"].values 20 | return Q 21 | 22 | 23 | def load_question(params): 24 | df = pd.read_csv(config.QUESTION_FILE) 25 | df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x]) 26 | df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x]) 27 | Q = {} 28 | Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"]) 29 | Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"]) 30 | Q["words"] = pad_sequences(df["words"], 31 | maxlen=params["max_seq_len_word"], 32 | padding=params["pad_sequences_padding"], 33 | truncating=params["pad_sequences_truncating"], 34 | value=config.PADDING_INDEX_WORD) 35 | Q["chars"] = pad_sequences(df["chars"], 36 | maxlen=params["max_seq_len_char"], 37 | padding=params["pad_sequences_padding"], 38 | truncating=params["pad_sequences_truncating"], 39 | value=config.PADDING_INDEX_CHAR) 40 | return Q 41 | 42 | 43 | def load_train(): 44 | df = pd.read_csv(config.TRAIN_FILE) 45 | df["q1"] = df.q1.apply(_to_ind) 46 | df["q2"] = df.q2.apply(_to_ind) 47 | return df 48 | 49 | 50 | def load_test(): 51 | df = pd.read_csv(config.TEST_FILE) 52 | df["q1"] = df.q1.apply(_to_ind) 53 | df["q2"] = df.q2.apply(_to_ind) 54 | df["label"] = np.zeros(df.shape[0]) 55 | return df 56 | 57 | 58 | def load_embedding_matrix(embedding_file): 59 | print("read embedding from: %s " %embedding_file) 60 | d = {} 61 | n = 0 62 | with open(embedding_file, "r") as f: 63 | line = f.readline() 64 | while line: 65 | n += 1 66 | w, v = line.strip().split(" ", 1) 67 | d[int(w[1:])] = v 68 | line = f.readline() 69 | dim = len(v.split(" ")) 70 | 71 | # add two indices for missing and padding 72 | emb_matrix =
np.zeros((n+2, dim), dtype=float) 73 | for key ,val in d.items(): 74 | v = np.asarray(val.split(" "), dtype=float) 75 | emb_matrix[key] = v 76 | emb_matrix = np.array(emb_matrix, dtype=np.float32) 77 | return emb_matrix 78 | 79 | 80 | init_embedding_matrix = { 81 | "word": load_embedding_matrix(config.WORD_EMBEDDING_FILE), 82 | "char": load_embedding_matrix(config.CHAR_EMBEDDING_FILE), 83 | } -------------------------------------------------------------------------------- /src/inputs/dynamic_pooling.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | # see https://github.com/pl8787/MatchPyramid-TensorFlow 6 | def dpool_index_(batch_idx, len1_one, len2_one, max_len1, max_len2): 7 | stride1 = 1.0 * max_len1 / len1_one 8 | stride2 = 1.0 * max_len2 / len2_one 9 | idx1_one = np.arange(max_len1) / stride1 10 | idx2_one = np.arange(max_len2) / stride2 11 | mesh1, mesh2 = np.meshgrid(idx1_one, idx2_one) 12 | index_one = np.transpose(np.stack([np.ones(mesh1.shape) * batch_idx, mesh1, mesh2]), (2, 1, 0)) 13 | return index_one 14 | 15 | 16 | def dynamic_pooling_index(len1, len2, max_len1, max_len2): 17 | index = np.zeros((len(len1), max_len1, max_len2, 3), dtype=int) 18 | for i in range(len(len1)): 19 | index[i] = dpool_index_(i, len1[i], len2[i], max_len1, max_len2) 20 | return index 21 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | import pickle as pkl 5 | import tensorflow as tf 6 | 7 | from optparse import OptionParser 8 | 9 | import config 10 | 11 | from inputs.data import load_question, load_train, load_test 12 | from inputs.data import init_embedding_matrix 13 | from models.model_library import get_model 14 | from utils import log_utils, os_utils, time_utils 15 | 16 | 17 | params = { 18 | "model_name": "semantic_matching", 19 | "offline_model_dir": "./weights/semantic_matching", 20 | "summary_dir": "../summary", 21 | "construct_neg": False, 22 | 23 | "augmentation_init_permutation": 0.5, 24 | "augmentation_min_permutation": 0.01, 25 | "augmentation_permutation_decay_steps": 2000, 26 | "augmentation_permutation_decay_rate": 0.975, 27 | 28 | "augmentation_init_dropout": 0.5, 29 | "augmentation_min_dropout": 0.01, 30 | "augmentation_dropout_decay_steps": 2000, 31 | "augmentation_dropout_decay_rate": 0.975, 32 | 33 | "use_features": False, 34 | "num_features": 1, 35 | 36 | "n_runs": 10, 37 | "batch_size": 128, 38 | "epoch": 50, 39 | "max_batch": -1, 40 | "l2_lambda": 0.000, 41 | 42 | # embedding 43 | "embedding_dropout": 0.3, 44 | "embedding_dim_word": init_embedding_matrix["word"].shape[1], 45 | "embedding_dim_char": init_embedding_matrix["char"].shape[1], 46 | "embedding_dim": init_embedding_matrix["word"].shape[1], 47 | "embedding_dim_compressed": 32, 48 | "embedding_trainable": True, 49 | "embedding_mask_zero": True, 50 | 51 | "max_num_word": init_embedding_matrix["word"].shape[0], 52 | "max_num_char": init_embedding_matrix["char"].shape[0], 53 | 54 | "threshold": 0.217277, 55 | "calibration": False, 56 | 57 | "max_seq_len_word": 12, 58 | "max_seq_len_char": 20, 59 | "pad_sequences_padding": "post", 60 | "pad_sequences_truncating": "post", 61 | 62 | # optimization 63 | "optimizer_type": "lazyadam", 64 | "init_lr": 0.001, 65 | "beta1": 0.9, 66 | "beta2": 0.999, 67 | "decay_steps": 2000, 68 | "decay_rate": 0.95, 69 | "schedule_decay": 0.004, 
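    # NOTE: base_model.py feeds init_lr / decay_steps / decay_rate above into tf.train.exponential_decay,
    # i.e. lr ~= init_lr * decay_rate ** (global_step / decay_steps), so with decay_rate=0.95 and
    # decay_steps=2000 the learning rate roughly halves about every 27k batches.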
70 | "random_seed": 2018, 71 | "eval_every_num_update": 5000, 72 | 73 | # semantic feature layer 74 | "encode_method": "textcnn", 75 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 76 | "attention_dim": 64, 77 | "attention_num_heads": 1, 78 | 79 | # cnn 80 | "cnn_num_layers": 1, 81 | "cnn_num_filters": 32, 82 | "cnn_filter_sizes": [1, 2, 3], 83 | "cnn_timedistributed": False, 84 | "cnn_activation": tf.nn.relu, 85 | "cnn_gated_conv": False, 86 | "cnn_residual": False, 87 | 88 | "rnn_num_units": 32, 89 | "rnn_cell_type": "gru", 90 | "rnn_num_layers": 1, 91 | 92 | # fc block 93 | "fc_type": "fc", 94 | "fc_hidden_units": [64*4, 64*2, 64], 95 | "fc_dropouts": [0, 0, 0], 96 | 97 | # True: cosine(l1, l2), sum(abs(l1 - l2)) 98 | # False: l1 * l2, abs(l1 - l2) 99 | "similarity_aggregation": False, 100 | 101 | # match pyramid 102 | "mp_num_filters": [8, 16], 103 | "mp_filter_sizes": [5, 3], 104 | "mp_activation": tf.nn.relu, 105 | "mp_dynamic_pooling": False, 106 | "mp_pool_sizes_word": [6, 3], 107 | "mp_pool_sizes_char": [10, 5], 108 | 109 | # bcnn 110 | "bcnn_num_layers": 2, 111 | "bcnn_num_filters": 16, 112 | "bcnn_filter_size": 3, 113 | "bcnn_activation": tf.nn.tanh, # tf.nn.relu with euclidean/euclidean_exp produce nan 114 | "bcnn_match_score_type": "cosine", 115 | 116 | "bcnn_mp_att_pooling": False, 117 | "bcnn_mp_num_filters": [8, 16], 118 | "bcnn_mp_filter_sizes": [5, 3], 119 | "bcnn_mp_activation": tf.nn.relu, 120 | "bcnn_mp_dynamic_pooling": False, 121 | "bcnn_mp_pool_sizes_word": [6, 3], 122 | "bcnn_mp_pool_sizes_char": [10, 5], 123 | 124 | # final layer 125 | "final_dropout": 0.3, 126 | 127 | } 128 | 129 | 130 | def get_model_data(df, features, params): 131 | X = { 132 | "q1": df.q1.values, 133 | "q2": df.q2.values, 134 | "label": df.label.values, 135 | } 136 | if params["use_features"]: 137 | X.update({ 138 | "features": features, 139 | }) 140 | params["num_features"] = X["features"].shape[1] 141 | return X 142 | 143 | 144 | def downsample(df): 145 | # downsample negative 146 | num_pos = np.sum(df.label) 147 | num_neg = int((1. / config.POS_RATIO_OFFLINE - 1.) 
* num_pos) 148 | idx_pos = np.where(df.label == 1)[0] 149 | idx_neg = np.where(df.label == 0)[0] 150 | np.random.shuffle(idx_neg) 151 | idx = np.hstack([idx_pos, idx_neg[:num_neg]]) 152 | return df.loc[idx] 153 | 154 | 155 | def get_train_valid_test_data(augmentation=False): 156 | # load data 157 | Q = load_question(params) 158 | dfTrain = load_train() 159 | dfTest = load_test() 160 | # train_features = load_feat("train") 161 | # test_features = load_feat("test") 162 | # params["num_features"] = train_features.shape[1] 163 | 164 | # load split 165 | with open(config.SPLIT_FILE, "rb") as f: 166 | train_idx, valid_idx = pkl.load(f) 167 | 168 | # validation 169 | if augmentation: 170 | dfDev = pd.read_csv(config.DATA_DIR + "/" + "dev_aug.csv") 171 | dfDev = downsample(dfDev) 172 | params["use_features"] = False 173 | params["augmentation_decay_steps"] = 50000 174 | params["decay_steps"] = 50000 175 | X_dev = get_model_data(dfDev, None, params) 176 | else: 177 | X_dev = get_model_data(dfTrain.loc[train_idx], None, params) 178 | X_valid = get_model_data(dfTrain.loc[valid_idx], None, params) 179 | 180 | # submit 181 | if augmentation: 182 | dfTrain = pd.read_csv(config.DATA_DIR + "/" + "train_aug.csv") 183 | dfTrain = downsample(dfTrain) 184 | params["use_features"] = False 185 | params["augmentation_decay_steps"] = 50000 186 | params["decay_steps"] = 50000 187 | X_train = get_model_data(dfTrain, None, params) 188 | else: 189 | X_train = get_model_data(dfTrain, None, params) 190 | X_test = get_model_data(dfTest, None, params) 191 | 192 | return X_dev, X_valid, X_train, X_test, Q 193 | 194 | 195 | def parse_args(parser): 196 | parser.add_option("-m", "--model", type="string", dest="model", 197 | help="model type", default="cdssm") 198 | parser.add_option("-a", "--augmentation", action="store_true", dest="augmentation", 199 | help="augmentation", default=False) 200 | parser.add_option("-g", "--granularity", type="string", dest="granularity", 201 | help="granularity, e.g., word or char", default="word") 202 | 203 | (options, args) = parser.parse_args() 204 | return options, args 205 | 206 | 207 | def main(options): 208 | 209 | os_utils._makedirs("../logs") 210 | os_utils._makedirs("../output") 211 | logger = log_utils._get_logger("../logs", "tf-%s.log" % time_utils._timestamp()) 212 | 213 | params["granularity"] = options.granularity 214 | 215 | # save path 216 | model_name = "augmentation_%s_%s_%s"%(str(options.augmentation), options.granularity, options.model) 217 | path = config.SUB_DIR + "/" + model_name 218 | os_utils._makedirs(path) 219 | 220 | # load data 221 | X_dev, X_valid, X_train, X_test, Q = get_train_valid_test_data(options.augmentation) 222 | 223 | # validation 224 | model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix) 225 | model.fit(X_dev, Q, validation_data=X_valid, shuffle=True) 226 | y_pred_valid = model.predict_proba(X_valid, Q).flatten() 227 | # save for stacking 228 | df = pd.DataFrame({"y_pred": y_pred_valid, "y_true": X_valid["label"]}) 229 | df.to_csv(path + "/valid.csv", index=False, header=True) 230 | 231 | # submission 232 | y_proba = np.zeros((len(X_test["label"]), params["n_runs"]), dtype=np.float32) 233 | for run in range(params["n_runs"]): 234 | params["random_seed"] = run 235 | params["model_name"] = "semantic_model_%s"%str(run+1) 236 | model = get_model(options.model)(params, logger, init_embedding_matrix=init_embedding_matrix) 237 | model.fit(X_train, Q, validation_data=None, shuffle=True) 238 | y_proba[:,run] = 
model.predict_proba(X_test, Q).flatten() 239 | df = pd.DataFrame(y_proba[:,:(run+1)], columns=["y_proba_%d"%(i+1) for i in range(run+1)]) 240 | df.to_csv(path + "/test.csv", index=False, header=True) 241 | 242 | 243 | if __name__ == "__main__": 244 | 245 | parser = OptionParser() 246 | options, args = parse_args(parser) 247 | main(options) 248 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/models/__init__.py -------------------------------------------------------------------------------- /src/models/base_model.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import log_loss 7 | 8 | import config 9 | from utils import os_utils 10 | from tf_common.optimizer import * 11 | from tf_common.nn_module import word_dropout, mlp_layer 12 | from tf_common.nn_module import encode, attend 13 | 14 | 15 | def sigmoid(x): 16 | return 1./(1.+np.exp(-x)) 17 | 18 | 19 | class BaseModel(object): 20 | def __init__(self, params, logger, init_embedding_matrix=None): 21 | self.params = params 22 | self.logger = logger 23 | self.init_embedding_matrix = init_embedding_matrix 24 | self.model_name = self.params["model_name"] 25 | self.threshold = self.params["threshold"] 26 | self.calibration_model = None 27 | # os_utils._makedirs(self.params["offline_model_dir"], force=True) 28 | 29 | self._init_tf_vars() 30 | self.matching_features_word, self.matching_features_char = self._get_matching_features() 31 | self.logits, self.proba = self._get_prediction() 32 | self.loss = self._get_loss() 33 | self.train_op = self._get_train_op() 34 | self.summary = self._get_summary() 35 | 36 | self.sess, self.saver = self._init_session() 37 | self.train_writer = tf.summary.FileWriter(self.params["summary_dir"] + '/train', self.sess.graph) 38 | self.test_writer = tf.summary.FileWriter(self.params["summary_dir"] + '/test') 39 | 40 | 41 | def _init_tf_vars(self): 42 | #### training flag 43 | self.training = tf.placeholder(tf.bool, shape=[], name="training") 44 | #### labels 45 | self.labels = tf.placeholder(tf.float32, shape=[None], name="labels") 46 | #### word 47 | self.seq_word_left = tf.placeholder(tf.int32, shape=[None, None], name="seq_word_left") 48 | self.seq_word_right = tf.placeholder(tf.int32, shape=[None, None], name="seq_word_right") 49 | #### char 50 | self.seq_char_left = tf.placeholder(tf.int32, shape=[None, None], name="seq_char_left") 51 | self.seq_char_right = tf.placeholder(tf.int32, shape=[None, None], name="seq_char_right") 52 | #### word len 53 | self.seq_len_word_left = tf.placeholder(tf.int32, shape=[None], name="seq_len_word_left") 54 | self.seq_len_word_right = tf.placeholder(tf.int32, shape=[None], name="seq_len_word_right") 55 | #### char len 56 | self.seq_len_char_left = tf.placeholder(tf.int32, shape=[None], name="seq_len_char_left") 57 | self.seq_len_char_right = tf.placeholder(tf.int32, shape=[None], name="seq_len_char_right") 58 | 59 | #### features 60 | self.features = tf.placeholder(tf.float32, shape=[None, self.params["num_features"]], name="features") 61 | 62 | #### training 63 | self.global_step = tf.Variable(0, trainable=False) 64 | self.learning_rate = 
tf.train.exponential_decay(self.params["init_lr"], self.global_step, 65 | self.params["decay_steps"], self.params["decay_rate"]) 66 | self.augmentation_dropout = tf.train.exponential_decay(self.params["augmentation_init_dropout"], self.global_step, 67 | self.params["augmentation_dropout_decay_steps"], 68 | self.params["augmentation_dropout_decay_rate"]) 69 | self.augmentation_permutation = tf.train.exponential_decay(self.params["augmentation_init_permutation"], 70 | self.global_step, 71 | self.params["augmentation_permutation_decay_steps"], 72 | self.params["augmentation_permutation_decay_rate"]) 73 | 74 | 75 | def _get_embedding_matrix(self, granularity="word"): 76 | if self.init_embedding_matrix[granularity] is None: 77 | std = 0.1 78 | minval = -std 79 | maxval = std 80 | emb_matrix = tf.Variable( 81 | tf.random_uniform( 82 | [self.params["max_num_%s" % granularity] + 1, self.params["embedding_dim_%s" % granularity]], 83 | minval, maxval, 84 | seed=self.params["random_seed"], 85 | dtype=tf.float32)) 86 | else: 87 | emb_matrix = tf.Variable(self.init_embedding_matrix[granularity], 88 | trainable=self.params["embedding_trainable"]) 89 | return emb_matrix 90 | 91 | 92 | def _semantic_feature_layer(self, seq_input, seq_len, granularity="word", reuse=False): 93 | assert granularity in ["char", "word"] 94 | #### embed 95 | emb_matrix = self._get_embedding_matrix(granularity) 96 | emb_seq = tf.nn.embedding_lookup(emb_matrix, seq_input) 97 | 98 | #### dropout 99 | random_seed = np.random.randint(10000000) 100 | emb_seq = word_dropout(emb_seq, 101 | training=self.training, 102 | dropout=self.params["embedding_dropout"], 103 | seed=random_seed) 104 | 105 | #### encode 106 | input_dim = self.params["embedding_dim"] 107 | enc_seq = encode(emb_seq, method=self.params["encode_method"], 108 | input_dim=input_dim, 109 | params=self.params, 110 | sequence_length=seq_len, 111 | mask_zero=self.params["embedding_mask_zero"], 112 | scope_name=self.model_name + "enc_seq_%s"%granularity, reuse=reuse, 113 | training=self.training) 114 | 115 | #### attend 116 | feature_dim = self.params["encode_dim"] 117 | context = None 118 | att_seq = attend(enc_seq, context=context, 119 | encode_dim=self.params["encode_dim"], 120 | feature_dim=feature_dim, 121 | attention_dim=self.params["attention_dim"], 122 | method=self.params["attend_method"], 123 | scope_name=self.model_name + "att_seq_%s"%granularity, 124 | reuse=reuse, num_heads=self.params["attention_num_heads"]) 125 | 126 | #### MLP nonlinear projection 127 | sem_seq = mlp_layer(att_seq, fc_type=self.params["fc_type"], 128 | hidden_units=self.params["fc_hidden_units"], 129 | dropouts=self.params["fc_dropouts"], 130 | scope_name=self.model_name + "sem_seq_%s"%granularity, 131 | reuse=reuse, 132 | training=self.training, 133 | seed=self.params["random_seed"]) 134 | 135 | return emb_seq, enc_seq, att_seq, sem_seq 136 | 137 | 138 | def _interaction_semantic_feature_layer(self, seq_input_left, seq_input_right, seq_len_left, seq_len_right, granularity="word"): 139 | assert granularity in ["char", "word"] 140 | #### embed 141 | emb_matrix = self._get_embedding_matrix(granularity) 142 | emb_seq_left = tf.nn.embedding_lookup(emb_matrix, seq_input_left) 143 | emb_seq_right = tf.nn.embedding_lookup(emb_matrix, seq_input_right) 144 | 145 | #### dropout 146 | random_seed = np.random.randint(10000000) 147 | emb_seq_left = word_dropout(emb_seq_left, 148 | training=self.training, 149 | dropout=self.params["embedding_dropout"], 150 | seed=random_seed) 151 | random_seed = 
np.random.randint(10000000) 152 | emb_seq_right = word_dropout(emb_seq_right, 153 | training=self.training, 154 | dropout=self.params["embedding_dropout"], 155 | seed=random_seed) 156 | 157 | #### encode 158 | input_dim = self.params["embedding_dim"] 159 | enc_seq_left = encode(emb_seq_left, method=self.params["encode_method"], 160 | input_dim=input_dim, 161 | params=self.params, 162 | sequence_length=seq_len_left, 163 | mask_zero=self.params["embedding_mask_zero"], 164 | scope_name=self.model_name + "enc_seq_%s"%granularity, reuse=False, 165 | training=self.training) 166 | enc_seq_right = encode(emb_seq_right, method=self.params["encode_method"], 167 | input_dim=input_dim, 168 | params=self.params, 169 | sequence_length=seq_len_right, 170 | mask_zero=self.params["embedding_mask_zero"], 171 | scope_name=self.model_name + "enc_seq_%s" % granularity, reuse=True, 172 | training=self.training) 173 | 174 | #### attend 175 | # [batchsize, s1, s2] 176 | att_mat = tf.einsum("abd,acd->abc", enc_seq_left, enc_seq_right) 177 | feature_dim = self.params["encode_dim"] + self.params["max_seq_len_%s"%granularity] 178 | att_seq_left = attend(enc_seq_left, context=att_mat, feature_dim=feature_dim, 179 | method=self.params["attend_method"], 180 | scope_name=self.model_name + "att_seq_%s"%granularity, 181 | reuse=False) 182 | att_seq_right = attend(enc_seq_right, context=tf.transpose(att_mat), feature_dim=feature_dim, 183 | method=self.params["attend_method"], 184 | scope_name=self.model_name + "att_seq_%s" % granularity, 185 | reuse=True) 186 | 187 | #### MLP nonlinear projection 188 | sem_seq_left = mlp_layer(att_seq_left, fc_type=self.params["fc_type"], 189 | hidden_units=self.params["fc_hidden_units"], 190 | dropouts=self.params["fc_dropouts"], 191 | scope_name=self.model_name + "sem_seq_%s"%granularity, 192 | reuse=False, 193 | training=self.training, 194 | seed=self.params["random_seed"]) 195 | sem_seq_right = mlp_layer(att_seq_right, fc_type=self.params["fc_type"], 196 | hidden_units=self.params["fc_hidden_units"], 197 | dropouts=self.params["fc_dropouts"], 198 | scope_name=self.model_name + "sem_seq_%s" % granularity, 199 | reuse=True, 200 | training=self.training, 201 | seed=self.params["random_seed"]) 202 | 203 | return emb_seq_left, enc_seq_left, att_seq_left, sem_seq_left, \ 204 | emb_seq_right, enc_seq_right, att_seq_right, sem_seq_right 205 | 206 | 207 | def _get_matching_features(self): 208 | pass 209 | 210 | 211 | def _get_prediction(self): 212 | with tf.name_scope(self.model_name + "/"): 213 | with tf.name_scope("prediction"): 214 | lst = [] 215 | if "word" in self.params["granularity"]: 216 | lst.append(self.matching_features_word) 217 | if "char" in self.params["granularity"]: 218 | lst.append(self.matching_features_char) 219 | if self.params["use_features"]: 220 | out_0 = mlp_layer(self.features, fc_type=self.params["fc_type"], 221 | hidden_units=self.params["fc_hidden_units"], 222 | dropouts=self.params["fc_dropouts"], 223 | scope_name=self.model_name + "mlp_features", 224 | reuse=False, 225 | training=self.training, 226 | seed=self.params["random_seed"]) 227 | lst.append(out_0) 228 | out = tf.concat(lst, axis=-1) 229 | out = tf.layers.Dropout(self.params["final_dropout"])(out, training=self.training) 230 | out = mlp_layer(out, fc_type=self.params["fc_type"], 231 | hidden_units=self.params["fc_hidden_units"], 232 | dropouts=self.params["fc_dropouts"], 233 | scope_name=self.model_name + "mlp", 234 | reuse=False, 235 | training=self.training, 236 | seed=self.params["random_seed"]) 237 | 
logits = tf.layers.dense(out, 1, activation=None, 238 | kernel_initializer=tf.glorot_uniform_initializer( 239 | seed=self.params["random_seed"]), 240 | name=self.model_name + "logits") 241 | logits = tf.squeeze(logits, axis=1) 242 | proba = tf.nn.sigmoid(logits) 243 | 244 | return logits, proba 245 | 246 | 247 | def _get_loss(self): 248 | with tf.name_scope(self.model_name + "/"): 249 | with tf.name_scope("loss"): 250 | loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.labels, logits=self.logits) 251 | loss = tf.reduce_mean(loss, name="log_loss") 252 | if self.params["l2_lambda"] > 0: 253 | l2_losses = tf.add_n( 254 | [tf.nn.l2_loss(v) for v in tf.trainable_variables() if "bias" not in v.name]) * self.params[ 255 | "l2_lambda"] 256 | loss = loss + l2_losses 257 | return loss 258 | 259 | 260 | def _get_train_op(self): 261 | with tf.name_scope(self.model_name + "/"): 262 | with tf.name_scope("optimization"): 263 | if self.params["optimizer_type"] == "lazynadam": 264 | optimizer = LazyNadamOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], 265 | beta2=self.params["beta2"], epsilon=1e-8, 266 | schedule_decay=self.params["schedule_decay"]) 267 | elif self.params["optimizer_type"] == "adam": 268 | optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, 269 | beta1=self.params["beta1"], 270 | beta2=self.params["beta2"], epsilon=1e-8) 271 | elif self.params["optimizer_type"] == "lazyadam": 272 | optimizer = tf.contrib.opt.LazyAdamOptimizer(learning_rate=self.learning_rate, 273 | beta1=self.params["beta1"], 274 | beta2=self.params["beta2"], epsilon=1e-8) 275 | elif self.params["optimizer_type"] == "adagrad": 276 | optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, 277 | initial_accumulator_value=1e-7) 278 | elif self.params["optimizer_type"] == "adadelta": 279 | optimizer = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate) 280 | elif self.params["optimizer_type"] == "gd": 281 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) 282 | elif self.params["optimizer_type"] == "momentum": 283 | optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95) 284 | elif self.params["optimizer_type"] == "rmsprop": 285 | optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate, decay=0.9, 286 | momentum=0.9, epsilon=1e-8) 287 | elif self.params["optimizer_type"] == "lazypowersign": 288 | optimizer = LazyPowerSignOptimizer(learning_rate=self.learning_rate) 289 | elif self.params["optimizer_type"] == "lazyaddsign": 290 | optimizer = LazyAddSignOptimizer(learning_rate=self.learning_rate) 291 | elif self.params["optimizer_type"] == "lazyamsgrad": 292 | optimizer = LazyAMSGradOptimizer(learning_rate=self.learning_rate, beta1=self.params["beta1"], 293 | beta2=self.params["beta2"], epsilon=1e-8) 294 | 295 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 296 | with tf.control_dependencies(update_ops): 297 | train_op = optimizer.minimize(self.loss, global_step=self.global_step) 298 | return train_op 299 | 300 | 301 | def _get_summary(self): 302 | with tf.name_scope(self.model_name + "/"): 303 | tf.summary.scalar("augmentation_dropout", self.augmentation_dropout) 304 | tf.summary.scalar("logloss", self.loss) 305 | tf.summary.scalar("lr", self.learning_rate) 306 | # error: https://blog.csdn.net/u012436149/article/details/53894364 307 | # summary = tf.summary.merge_all() 308 | summary = tf.summary.merge( 309 | tf.get_collection(tf.GraphKeys.SUMMARIES, self.model_name) 310 
| ) 311 | return summary 312 | 313 | 314 | def _init_session(self): 315 | config = tf.ConfigProto(device_count={"gpu": 1}) 316 | config.gpu_options.allow_growth = True 317 | config.intra_op_parallelism_threads = 4 318 | config.inter_op_parallelism_threads = 4 319 | sess = tf.Session(config=config) 320 | sess.run(tf.global_variables_initializer()) 321 | # max_to_keep=None, keep all the models 322 | saver = tf.train.Saver(max_to_keep=None) 323 | return sess, saver 324 | 325 | 326 | def save_session(self): 327 | self.saver.save(self.sess, self.params["offline_model_dir"] + "/model.checkpoint") 328 | 329 | 330 | def restore_session(self): 331 | self.saver.restore(self.sess, self.params["offline_model_dir"] + "/model.checkpoint") 332 | 333 | 334 | def _get_batch_index(self, seq, step): 335 | n = len(seq) 336 | res = [] 337 | for i in range(0, n, step): 338 | res.append(seq[i:i + step]) 339 | # last batch 340 | if len(res) * step < n: 341 | res.append(seq[len(res) * step:]) 342 | return res 343 | 344 | 345 | def _get_pos_neg_ind(self, label): 346 | length = len(label) 347 | pos_ind_tmp = np.where(label == 1)[0] 348 | inds = np.zeros((len(pos_ind_tmp) * length, 2), dtype=int) 349 | inds[:, 0] = np.tile(pos_ind_tmp, length) 350 | inds[:, 1] = list(range(length)) * len(pos_ind_tmp) 351 | mask = inds[:, 0] != inds[:, 1] 352 | pos_ind = inds[mask, 0] 353 | neg_ind = inds[mask, 1] 354 | return pos_ind, neg_ind 355 | 356 | 357 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 358 | if training: 359 | if construct_neg: 360 | q1 = X["q1"][idx] 361 | q2 = X["q2"][idx] 362 | # for label=1 sample, construct negative sample within batch 363 | pos_ind, neg_ind = self._get_pos_neg_ind(X["label"][idx]) 364 | # original & symmetric 365 | feed_dict = { 366 | self.seq_word_left: np.vstack([Q["words"][q1], 367 | Q["words"][X["q1"][idx[pos_ind]]], 368 | Q["words"][X["q1"][idx[neg_ind]]], 369 | Q["words"][q2], 370 | Q["words"][X["q2"][idx[neg_ind]]], 371 | Q["words"][X["q2"][idx[pos_ind]]] 372 | ]), 373 | self.seq_word_right: np.vstack([Q["words"][q2], 374 | Q["words"][X["q2"][idx[neg_ind]]], 375 | Q["words"][X["q2"][idx[pos_ind]]], 376 | Q["words"][q1], 377 | Q["words"][X["q1"][idx[pos_ind]]], 378 | Q["words"][X["q1"][idx[neg_ind]]], 379 | ]), 380 | self.seq_char_left: np.vstack([Q["chars"][q1], 381 | Q["chars"][X["q1"][idx[pos_ind]]], 382 | Q["chars"][X["q1"][idx[neg_ind]]], 383 | Q["chars"][q2], 384 | Q["chars"][X["q2"][idx[neg_ind]]], 385 | Q["chars"][X["q2"][idx[pos_ind]]] 386 | ]), 387 | self.seq_char_right: np.vstack([Q["chars"][q2], 388 | Q["chars"][X["q2"][idx[neg_ind]]], 389 | Q["chars"][X["q2"][idx[pos_ind]]], 390 | Q["chars"][q1], 391 | Q["chars"][X["q1"][idx[pos_ind]]], 392 | Q["chars"][X["q1"][idx[neg_ind]]] 393 | ]), 394 | self.labels: np.hstack([X["label"][idx], 395 | np.zeros(len(pos_ind)), 396 | np.zeros(len(pos_ind)), 397 | X["label"][idx], 398 | np.zeros(len(pos_ind)), 399 | np.zeros(len(pos_ind)) 400 | ]), 401 | self.training: training, 402 | } 403 | else: 404 | q1 = X["q1"][idx] 405 | q2 = X["q2"][idx] 406 | feed_dict = { 407 | self.seq_word_left: np.vstack([Q["words"][q1], 408 | Q["words"][q2], 409 | ]), 410 | self.seq_word_right: np.vstack([Q["words"][q2], 411 | Q["words"][q1], 412 | ]), 413 | self.seq_char_left: np.vstack([Q["chars"][q1], 414 | Q["chars"][q2], 415 | ]), 416 | self.seq_char_right: np.vstack([Q["chars"][q2], 417 | Q["chars"][q1], 418 | ]), 419 | self.seq_len_word_left: np.hstack([Q["seq_len_word"][q1], 420 | Q["seq_len_word"][q2], 421 
| ]), 422 | self.seq_len_word_right: np.hstack([Q["seq_len_word"][q2], 423 | Q["seq_len_word"][q1], 424 | ]), 425 | self.seq_len_char_left: np.hstack([Q["seq_len_char"][q1], 426 | Q["seq_len_char"][q2], 427 | ]), 428 | self.seq_len_char_right: np.hstack([Q["seq_len_char"][q2], 429 | Q["seq_len_char"][q1], 430 | ]), 431 | self.labels: np.hstack([X["label"][idx], 432 | X["label"][idx], 433 | ]), 434 | self.training: training, 435 | } 436 | if self.params["use_features"]: 437 | feed_dict.update({ 438 | self.features: np.vstack([X["features"][idx], 439 | X["features"][idx], 440 | ]), 441 | }) 442 | elif not symmetric: 443 | q1 = X["q1"][idx] 444 | q2 = X["q2"][idx] 445 | feed_dict = { 446 | self.seq_word_left: Q["words"][q1], 447 | self.seq_word_right: Q["words"][q2], 448 | self.seq_char_left: Q["chars"][q1], 449 | self.seq_char_right: Q["chars"][q2], 450 | self.seq_len_word_left: Q["seq_len_word"][q1], 451 | self.seq_len_word_right: Q["seq_len_word"][q2], 452 | self.seq_len_char_left: Q["seq_len_char"][q1], 453 | self.seq_len_char_right: Q["seq_len_char"][q2], 454 | self.labels: X["label"][idx], 455 | self.training: training, 456 | } 457 | if self.params["use_features"]: 458 | feed_dict.update({ 459 | self.features: X["features"][idx], 460 | }) 461 | else: 462 | q1 = X["q1"][idx] 463 | q2 = X["q2"][idx] 464 | feed_dict = { 465 | self.seq_word_left: np.vstack([Q["words"][q1], 466 | Q["words"][q2], 467 | ]), 468 | self.seq_word_right: np.vstack([Q["words"][q2], 469 | Q["words"][q1], 470 | ]), 471 | self.seq_char_left: np.vstack([Q["chars"][q1], 472 | Q["chars"][q2], 473 | ]), 474 | self.seq_char_right: np.vstack([Q["chars"][q2], 475 | Q["chars"][q1], 476 | ]), 477 | self.seq_len_word_left: np.hstack([Q["seq_len_word"][q1], 478 | Q["seq_len_word"][q2], 479 | ]), 480 | self.seq_len_word_right: np.hstack([Q["seq_len_word"][q2], 481 | Q["seq_len_word"][q1], 482 | ]), 483 | self.seq_len_char_left: np.hstack([Q["seq_len_char"][q1], 484 | Q["seq_len_char"][q2], 485 | ]), 486 | self.seq_len_char_right: np.hstack([Q["seq_len_char"][q2], 487 | Q["seq_len_char"][q1], 488 | ]), 489 | self.labels: np.hstack([X["label"][idx], 490 | X["label"][idx], 491 | ]), 492 | self.training: training, 493 | } 494 | if self.params["use_features"]: 495 | feed_dict.update({ 496 | self.features: np.vstack([X["features"][idx], 497 | X["features"][idx], 498 | ]), 499 | }) 500 | # augmentation 501 | if training: 502 | if self.params["augmentation_init_dropout"] > 0: 503 | self._dropout_augmentation(feed_dict) 504 | if self.params["augmentation_init_permutation"]: 505 | self._permutation_augmentation(feed_dict) 506 | 507 | return feed_dict 508 | 509 | 510 | def _dropout(self, val_arr, ind_arr, p, value): 511 | new_arr = np.array(val_arr) 512 | drop = np.empty(val_arr.shape, dtype=np.bool) 513 | for i in range(val_arr.shape[0]): 514 | drop[i, :ind_arr[i]] = np.random.choice([True, False], ind_arr[i], p=[p, 1 - p]) 515 | new_arr[drop] = value 516 | return new_arr 517 | 518 | 519 | def _dropout_augmentation(self, feed_dict): 520 | p = self.sess.run(self.augmentation_dropout) 521 | if p <= self.params["augmentation_min_dropout"]: 522 | return 523 | 524 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_word_left], 525 | ind_arr=feed_dict[self.seq_len_word_left], 526 | p=p, value=config.MISSING_INDEX_WORD) 527 | feed_dict[self.seq_word_left] = np.vstack([ 528 | feed_dict[self.seq_word_left], 529 | dropout_data, 530 | ]) 531 | 532 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_word_right], 533 | 
ind_arr=feed_dict[self.seq_len_word_right], 534 | p=p, value=config.MISSING_INDEX_WORD) 535 | feed_dict[self.seq_word_right] = np.vstack([ 536 | feed_dict[self.seq_word_right], 537 | dropout_data, 538 | ]) 539 | 540 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_char_left], 541 | ind_arr=feed_dict[self.seq_len_char_left], 542 | p=p, value=config.MISSING_INDEX_CHAR) 543 | feed_dict[self.seq_char_left] = np.vstack([ 544 | feed_dict[self.seq_char_left], 545 | dropout_data, 546 | ]) 547 | 548 | dropout_data = self._dropout(val_arr=feed_dict[self.seq_char_right], 549 | ind_arr=feed_dict[self.seq_len_char_right], 550 | p=p, value=config.MISSING_INDEX_CHAR) 551 | feed_dict[self.seq_char_right] = np.vstack([ 552 | feed_dict[self.seq_char_right], 553 | dropout_data, 554 | ]) 555 | 556 | # double others 557 | feed_dict[self.seq_len_word_left] = np.tile(feed_dict[self.seq_len_word_left], 2) 558 | feed_dict[self.seq_len_word_right] = np.tile(feed_dict[self.seq_len_word_right], 2) 559 | feed_dict[self.seq_len_char_left] = np.tile(feed_dict[self.seq_len_char_left], 2) 560 | feed_dict[self.seq_len_char_right] = np.tile(feed_dict[self.seq_len_char_right], 2) 561 | feed_dict[self.labels] = np.tile(feed_dict[self.labels], 2) 562 | if self.params["use_features"]: 563 | feed_dict[self.features] = np.tile(feed_dict[self.features], [2, 1]) 564 | 565 | 566 | def _permutation(self, val_arr, ind_arr, p): 567 | if np.random.random() < p: 568 | new_arr = np.array(val_arr) 569 | for i in range(val_arr.shape[0]): 570 | new_arr[i, :ind_arr[i]] = np.random.permutation(new_arr[i,:ind_arr[i]]) 571 | return new_arr 572 | else: 573 | return val_arr 574 | 575 | 576 | def _permutation_augmentation(self, feed_dict): 577 | p = self.sess.run(self.augmentation_permutation) 578 | if p <= self.params["augmentation_min_permutation"]: 579 | return 580 | 581 | feed_dict[self.seq_word_left] = np.vstack([ 582 | feed_dict[self.seq_word_left], 583 | self._permutation(feed_dict[self.seq_word_left], feed_dict[self.seq_len_word_left], p), 584 | ]) 585 | feed_dict[self.seq_word_right] = np.vstack([ 586 | feed_dict[self.seq_word_right], 587 | self._permutation(feed_dict[self.seq_word_right], feed_dict[self.seq_len_word_right], p), 588 | ]) 589 | feed_dict[self.seq_char_left] = np.vstack([ 590 | feed_dict[self.seq_char_left], 591 | self._permutation(feed_dict[self.seq_char_left], feed_dict[self.seq_len_char_left], p), 592 | ]) 593 | feed_dict[self.seq_char_right] = np.vstack([ 594 | feed_dict[self.seq_char_right], 595 | self._permutation(feed_dict[self.seq_char_right], feed_dict[self.seq_len_char_right], p), 596 | ]) 597 | # double others 598 | feed_dict[self.seq_len_word_left] = np.tile(feed_dict[self.seq_len_word_left], 2) 599 | feed_dict[self.seq_len_word_right] = np.tile(feed_dict[self.seq_len_word_right], 2) 600 | feed_dict[self.seq_len_char_left] = np.tile(feed_dict[self.seq_len_char_left], 2) 601 | feed_dict[self.seq_len_char_right] = np.tile(feed_dict[self.seq_len_char_right], 2) 602 | feed_dict[self.labels] = np.tile(feed_dict[self.labels], 2) 603 | if self.params["use_features"]: 604 | feed_dict[self.features] = np.tile(feed_dict[self.features], [2, 1]) 605 | 606 | 607 | def fit(self, X, Q, validation_data=None, shuffle=False, total_epoch=None): 608 | start_time = time.time() 609 | l = X["label"].shape[0] 610 | self.logger.info("fit on %d sample" % l) 611 | self.logger.info("max_batch: %d" % self.params["max_batch"]) 612 | if validation_data is not None: 613 | self.logger.info("mean: %.5f"%np.mean(validation_data["label"])) 
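        # each training batch is fed symmetrically by _get_feed_dict, i.e. every pair appears as both
        # (q1, q2) and (q2, q1); the train-loss logged below is an exponential moving average of the
        # per-batch loss (loss_decay = 0.9), not a raw per-batch value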
614 | train_idx_shuffle = np.arange(l) 615 | total_loss = 0. 616 | loss_decay = 0.9 617 | global_step = self.sess.run(self.global_step) 618 | if total_epoch is None: 619 | total_epoch = self.params["epoch"] 620 | for epoch in range(total_epoch): 621 | self.logger.info("epoch: %d" % (epoch + 1)) 622 | np.random.seed(epoch) 623 | if shuffle: 624 | np.random.shuffle(train_idx_shuffle) 625 | dropout_p = self.sess.run(self.augmentation_dropout) 626 | batch_size = self.params["batch_size"] 627 | if dropout_p <= self.params["augmentation_min_dropout"]: 628 | batch_size *= 2 629 | batches = self._get_batch_index(train_idx_shuffle, batch_size) 630 | for i, idx in enumerate(batches): 631 | feed_dict = self._get_feed_dict(X, idx, Q, construct_neg=self.params["construct_neg"], training=True) 632 | loss, lr, opt, summary, global_step = self.sess.run((self.loss, self.learning_rate, self.train_op, self.summary, self.global_step), feed_dict=feed_dict) 633 | self.train_writer.add_summary(summary, global_step) 634 | total_loss = loss_decay * total_loss + (1. - loss_decay) * loss 635 | if validation_data is not None and (self.params["eval_every_num_update"] > 0) and (global_step % self.params["eval_every_num_update"] == 0): 636 | y_valid = validation_data["label"] 637 | y_proba, y_proba_cal = self._predict_proba(validation_data, Q, fit_calibration=self.params["calibration"]) 638 | valid_loss = log_loss(y_valid, y_proba, eps=1e-15) 639 | valid_loss_cal = log_loss(y_valid, y_proba_cal, eps=1e-15) 640 | summary = tf.Summary() 641 | summary.value.add(tag="logloss", simple_value=valid_loss) 642 | self.test_writer.add_summary(summary, global_step) 643 | self.logger.info( 644 | "[epoch-%d, batch-%d] train-loss=%.5f, valid-loss=%.5f, valid-loss-cal=%.5f, valid-proba=%.5f, predict-proba=%.5f, predict-proba-cal=%.5f, lr=%.5f [%.1f s]" % ( 645 | epoch + 1, global_step, total_loss, valid_loss, valid_loss_cal, 646 | np.mean(y_valid), np.mean(y_proba), np.mean(y_proba_cal), lr, time.time() - start_time)) 647 | else: 648 | self.logger.info("[epoch-%d, batch-%d] train-loss=%.5f, lr=%.5f [%.1f s]" % ( 649 | epoch + 1, global_step, total_loss, 650 | lr, time.time() - start_time)) 651 | if global_step >= self.params["max_batch"] and self.params["max_batch"] > 0: 652 | break 653 | if global_step >= self.params["max_batch"] and self.params["max_batch"] > 0: 654 | break 655 | 656 | 657 | def _predict_node(self, X, Q, node): 658 | l = X["label"].shape[0] 659 | train_idx = np.arange(l) 660 | batches = self._get_batch_index(train_idx, self.params["batch_size"]) 661 | y_pred = [] 662 | y_pred_append = y_pred.append 663 | for idx in batches: 664 | feed_dict = self._get_feed_dict(X, idx, Q, training=False, symmetric=True) 665 | pred = self.sess.run(node, feed_dict=feed_dict) 666 | n = int(pred.shape[0]/2) 667 | pred = (pred[:n] + pred[n:])/2. 
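            # the symmetric feed dict stacks (q1, q2) on top of (q2, q1), so the two halves of `pred`
            # score the same pairs in both orders; averaging the halves makes the prediction order-invariant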
668 | y_pred_append(pred) 669 | y_pred = np.hstack(y_pred).reshape((-1, 1)).astype(np.float64) 670 | return y_pred 671 | 672 | 673 | def _predict_proba(self, X, Q, fit_calibration=False): 674 | y_logit = self._predict_node(X, Q, self.logits) 675 | y_proba = sigmoid(y_logit) 676 | y_proba_cal = y_proba 677 | if fit_calibration: 678 | y_valid = X["label"] 679 | self.calibration_model = LogisticRegression() 680 | self.calibration_model.fit(y_logit, y_valid) 681 | if self.calibration_model is not None: 682 | y_proba_cal = self.calibration_model.predict_proba(y_logit)[:,1] 683 | return y_proba, y_proba_cal 684 | 685 | 686 | def predict_proba(self, X, Q): 687 | _, y_proba_cal = self._predict_proba(X, Q, fit_calibration=False) 688 | return y_proba_cal 689 | 690 | 691 | def predict(self, X, Q): 692 | proba = self.predict_proba(X, Q) 693 | y = np.array(proba > self.threshold, dtype=int) 694 | return y 695 | -------------------------------------------------------------------------------- /src/models/bcnn.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from inputs.dynamic_pooling import dynamic_pooling_index 7 | from models.base_model import BaseModel 8 | from tf_common import metrics 9 | 10 | 11 | class BCNNBaseModel(BaseModel): 12 | def __init__(self, params, logger, init_embedding_matrix): 13 | super(BCNNBaseModel, self).__init__(params, logger, init_embedding_matrix) 14 | 15 | 16 | def _init_tf_vars(self): 17 | super(BCNNBaseModel, self)._init_tf_vars() 18 | self.dpool_index_word = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_word"], 19 | self.params["max_seq_len_word"], 3], 20 | name="dpool_index_word") 21 | self.dpool_index_char = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_char"], 22 | self.params["max_seq_len_char"], 3], 23 | name="dpool_index_char") 24 | 25 | 26 | def _padding(self, x, name): 27 | # x: [batch, s, d, 1] 28 | # x => [batch, s+w*2-2, d, 1] 29 | w = self.params["bcnn_filter_size"] 30 | return tf.pad(x, np.array([[0, 0], [w - 1, w - 1], [0, 0], [0, 0]]), "CONSTANT", name) 31 | 32 | 33 | def _make_attention_matrix(self, x1, x2): 34 | # x1: [batch, s1, d, 1] 35 | # x2: [batch, s2, d, 1] 36 | # match score 37 | if "euclidean" in self.params["bcnn_match_score_type"]: 38 | # x1 => [batch, s1, 1, d] 39 | # x2 => [batch, 1, s2, d] 40 | x1_ = tf.transpose(x1, perm=[0, 1, 3, 2]) 41 | x2_ = tf.transpose(x2, perm=[0, 3, 1, 2]) 42 | euclidean = tf.sqrt(tf.reduce_sum(tf.square(x1_ - x2_), axis=-1)) 43 | if "exp" in self.params["bcnn_match_score_type"]: 44 | # exp(-euclidean / (2. * beta)) (can produce nan) 45 | # from Convolutional Neural Network for Paraphrase Identification 46 | beta = 2. 47 | att = tf.exp(-euclidean / (2. * beta)) 48 | else: 49 | # euclidean distance (can produce nan) 50 | att = 1. / (1. + euclidean) 51 | elif self.params["bcnn_match_score_type"] == "cosine": 52 | # cosine similarity 53 | x1_ = tf.nn.l2_normalize(x1, dim=2) 54 | x2_ = tf.nn.l2_normalize(x2, dim=2) 55 | sim = tf.einsum("abcd,aecd->abe", x1_, x2_) # value in [-1, 1] 56 | att = (1. + sim) / 2.
# value in [0, 1] 57 | return att 58 | 59 | 60 | def _convolution(self, x, d, name, reuse=False): 61 | # conv: [batch, s+w-1, 1, d] 62 | conv = tf.layers.conv2d( 63 | inputs=x, 64 | filters=self.params["bcnn_num_filters"], 65 | kernel_size=(self.params["bcnn_filter_size"], d), 66 | padding="valid", 67 | activation=self.params["bcnn_activation"], 68 | strides=1, 69 | reuse=reuse, 70 | name=name) 71 | 72 | # [batch, s+w-1, d, 1] 73 | return tf.transpose(conv, perm=[0, 1, 3, 2]) 74 | 75 | 76 | def _w_ap(self, x, attention, name): 77 | # x: [batch, s+w-1, d, 1] 78 | # attention: [batch, s+w-1] 79 | if attention is not None: 80 | attention = tf.expand_dims(tf.expand_dims(attention, axis=-1), axis=-1) 81 | x2 = x * attention 82 | else: 83 | x2 = x 84 | w_ap = tf.layers.average_pooling2d( 85 | inputs=x2, 86 | pool_size=(self.params["bcnn_filter_size"], 1), 87 | strides=1, 88 | padding="valid", 89 | name=name) 90 | if attention is not None: 91 | w_ap = w_ap * self.params["bcnn_filter_size"] 92 | 93 | return w_ap 94 | 95 | 96 | def _all_ap(self, x, seq_len, name): 97 | if "input" in name: 98 | pool_width = seq_len 99 | d = self.params["embedding_dim"] 100 | else: 101 | pool_width = seq_len + self.params["bcnn_filter_size"] - 1 102 | d = self.params["bcnn_num_filters"] 103 | 104 | all_ap = tf.layers.average_pooling2d( 105 | inputs=x, 106 | pool_size=(pool_width, 1), 107 | strides=1, 108 | padding="valid", 109 | name=name) 110 | all_ap_reshaped = tf.reshape(all_ap, [-1, d]) 111 | 112 | return all_ap_reshaped 113 | 114 | 115 | def _expand_input(self, x1, x2, att_mat, seq_len, d, name): 116 | # att_mat: [batch, s, s] 117 | aW = tf.get_variable(name=name, shape=(seq_len, d)) 118 | 119 | # [batch, s, s] * [s,d] => [batch, s, d] 120 | # expand dims => [batch, s, d, 1] 121 | x1_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", att_mat, aW), -1) 122 | x2_a = tf.expand_dims(tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW), -1) 123 | 124 | # [batch, s, d, 2] 125 | x1 = tf.concat([x1, x1_a], axis=3) 126 | x2 = tf.concat([x2, x2_a], axis=3) 127 | 128 | return x1, x2 129 | 130 | 131 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index, granularity="word"): 132 | return None, None, None, None, None 133 | 134 | 135 | def _mp_cnn_layer(self, cross, dpool_index, filters, kernel_size, pool_size, strides, name): 136 | cross_conv = tf.layers.conv2d( 137 | inputs=cross, 138 | filters=filters, 139 | kernel_size=kernel_size, 140 | padding="same", 141 | activation=self.params["bcnn_mp_activation"], 142 | strides=1, 143 | reuse=False, 144 | name=name+"cross_conv") 145 | if self.params["bcnn_mp_dynamic_pooling"] and dpool_index is not None: 146 | cross_conv = tf.gather_nd(cross_conv, dpool_index) 147 | cross_pool = tf.layers.max_pooling2d( 148 | inputs=cross_conv, 149 | pool_size=pool_size, 150 | strides=strides, 151 | padding="valid", 152 | name=name+"cross_pool") 153 | return cross_pool 154 | 155 | def _bcnn_semantic_feature_layer(self, seq_left, seq_right, dpool_index=None, granularity="word"): 156 | name = self.model_name + granularity 157 | seq_len = self.params["max_seq_len_%s" % granularity] 158 | # [batch, s, d] => [batch, s, d, 1] 159 | seq_left = tf.expand_dims(seq_left, axis=-1) 160 | seq_right = tf.expand_dims(seq_right, axis=-1) 161 | 162 | left_ap_list = [None] * (self.params["bcnn_num_layers"] + 1) 163 | right_ap_list = [None] * (self.params["bcnn_num_layers"] + 1) 164 | left_ap_list[0] = self._all_ap(x=seq_left, seq_len=seq_len, name=name + "global_pooling_input_left") 165 | right_ap_list[0] 
= self._all_ap(x=seq_right, seq_len=seq_len, name=name + "global_pooling_input_right") 166 | 167 | x1 = seq_left 168 | x2 = seq_right 169 | d = self.params["embedding_dim"] 170 | outputs = [] 171 | for layer in range(self.params["bcnn_num_layers"]): 172 | x1, left_ap_list[layer + 1], x2, right_ap_list[layer + 1], att_pooled = self._bcnn_cnn_layer(x1=x1, x2=x2, 173 | seq_len=seq_len, 174 | d=d, 175 | name=name + "cnn_layer_%d" % ( 176 | layer + 1), 177 | dpool_index=dpool_index, 178 | granularity=granularity) 179 | d = self.params["bcnn_num_filters"] 180 | if self.params["bcnn_mp_att_pooling"] and att_pooled is not None: 181 | outputs.append(att_pooled) 182 | 183 | for l, r in zip(left_ap_list, right_ap_list): 184 | outputs.append(metrics.cosine_similarity(l, r, self.params["similarity_aggregation"])) 185 | outputs.append(metrics.dot_product(l, r, self.params["similarity_aggregation"])) 186 | outputs.append(metrics.euclidean_distance(l, r, self.params["similarity_aggregation"])) 187 | return tf.concat(outputs, axis=-1) 188 | 189 | 190 | def _get_attention_matrix_pooled_features(self, att_mat, seq_len, dpool_index, granularity, name): 191 | # get attention matrix pooled features (as in sec. 5.3.1) 192 | att_mat0 = tf.expand_dims(att_mat, axis=3) 193 | # conv-pool layer 1 194 | filters = self.params["bcnn_mp_num_filters"][0] 195 | kernel_size = self.params["bcnn_mp_filter_sizes"][0] 196 | # seq_len = seq_len + self.params["bcnn_filter_size"] - 1 197 | pool_size0 = self.params["bcnn_mp_pool_sizes_%s" % granularity][0] 198 | pool_sizes = [seq_len / pool_size0, seq_len / pool_size0] 199 | strides = [seq_len / pool_size0, seq_len / pool_size0] 200 | conv1 = self._mp_cnn_layer(att_mat0, dpool_index, filters, kernel_size, pool_sizes, strides, 201 | name=self.model_name + name + granularity + "1") 202 | conv1_flatten = tf.reshape(conv1, [-1, self.params["mp_num_filters"][0] * (pool_size0 * pool_size0)]) 203 | 204 | # conv-pool layer 2 205 | filters = self.params["bcnn_mp_num_filters"][1] 206 | kernel_size = self.params["bcnn_mp_filter_sizes"][1] 207 | pool_size1 = self.params["bcnn_mp_pool_sizes_%s" % granularity][1] 208 | pool_sizes = [pool_size0 / pool_size1, pool_size0 / pool_size1] 209 | strides = [pool_size0 / pool_size1, pool_size0 / pool_size1] 210 | conv2 = self._mp_cnn_layer(conv1, None, filters, kernel_size, pool_sizes, strides, 211 | name=self.model_name + name + granularity + "2") 212 | conv2_flatten = tf.reshape(conv2, [-1, self.params["mp_num_filters"][1] * (pool_size1 * pool_size1)]) 213 | 214 | return conv2_flatten 215 | 216 | 217 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 218 | feed_dict = super(BCNNBaseModel, self)._get_feed_dict(X, idx, Q, construct_neg, training, symmetric) 219 | if self.params["mp_dynamic_pooling"]: 220 | dpool_index_word = dynamic_pooling_index(feed_dict[self.seq_len_word_left], 221 | feed_dict[self.seq_len_word_right], 222 | self.params["max_seq_len_word"], 223 | self.params["max_seq_len_word"]) 224 | dpool_index_char = dynamic_pooling_index(feed_dict[self.seq_len_char_left], 225 | feed_dict[self.seq_len_char_right], 226 | self.params["max_seq_len_char"], 227 | self.params["max_seq_len_char"]) 228 | feed_dict.update({ 229 | self.dpool_index_word: dpool_index_word, 230 | self.dpool_index_char: dpool_index_char, 231 | }) 232 | return feed_dict 233 | 234 | 235 | def _get_matching_features(self): 236 | with tf.name_scope(self.model_name): 237 | tf.set_random_seed(self.params["random_seed"]) 238 | 239 | with 
tf.name_scope("word_network"): 240 | if self.params["attend_method"] == "context-attention": 241 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 242 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 243 | self._interaction_semantic_feature_layer( 244 | self.seq_word_left, 245 | self.seq_word_right, 246 | self.seq_len_word_left, 247 | self.seq_len_word_right, 248 | granularity="word") 249 | else: 250 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 251 | self._semantic_feature_layer( 252 | self.seq_word_left, 253 | self.seq_len_word_left, 254 | granularity="word", reuse=False) 255 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 256 | self._semantic_feature_layer( 257 | self.seq_word_right, 258 | self.seq_len_word_right, 259 | granularity="word", reuse=True) 260 | sim_word = self._bcnn_semantic_feature_layer(emb_seq_word_left, emb_seq_word_right, self.dpool_index_word, granularity="word") 261 | 262 | with tf.name_scope("char_network"): 263 | if self.params["attend_method"] == "context-attention": 264 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 265 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 266 | self._interaction_semantic_feature_layer( 267 | self.seq_char_left, 268 | self.seq_char_right, 269 | self.seq_len_char_left, 270 | self.seq_len_char_right, 271 | granularity="char") 272 | else: 273 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 274 | self._semantic_feature_layer( 275 | self.seq_char_left, 276 | self.seq_len_char_left, 277 | granularity="char", reuse=False) 278 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 279 | self._semantic_feature_layer( 280 | self.seq_char_right, 281 | self.seq_len_char_right, 282 | granularity="char", reuse=True) 283 | sim_char = self._bcnn_semantic_feature_layer(emb_seq_char_left, emb_seq_char_right, self.dpool_index_char, granularity="char") 284 | 285 | with tf.name_scope("matching_features"): 286 | matching_features_word = sim_word 287 | matching_features_char = sim_char 288 | 289 | return matching_features_word, matching_features_char 290 | 291 | 292 | class BCNN(BCNNBaseModel): 293 | def __init__(self, params, logger, init_embedding_matrix): 294 | p = copy(params) 295 | p["model_name"] = p["model_name"] + "bcnn" 296 | super(BCNN, self).__init__(p, logger, init_embedding_matrix) 297 | 298 | 299 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 300 | # x1, x2 = [batch, s, d, 1] 301 | # att_mat0: [batch, s, s] 302 | att_mat0 = self._make_attention_matrix(x1, x2) 303 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 304 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 305 | 306 | left_attention, right_attention = None, None 307 | 308 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 309 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 310 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 311 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 312 | 313 | # get attention matrix pooled features (as in sec. 
5.3.1) 314 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 315 | 316 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 317 | 318 | 319 | class ABCNN1(BCNNBaseModel): 320 | def __init__(self, params, logger, init_embedding_matrix): 321 | p = copy(params) 322 | p["model_name"] = p["model_name"] + "abcnn1" 323 | super(ABCNN1, self).__init__(p, logger, init_embedding_matrix) 324 | 325 | 326 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 327 | # x1, x2 = [batch, s, d, 1] 328 | # att_mat0: [batch, s, s] 329 | att_mat0 = self._make_attention_matrix(x1, x2) 330 | x1, x2 = self._expand_input(x1, x2, att_mat0, seq_len, d, name=name+"expand_input") 331 | 332 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 333 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 334 | 335 | left_attention, right_attention = None, None 336 | 337 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 338 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 339 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 340 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 341 | 342 | # get attention matrix pooled features (as in sec. 5.3.1) 343 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 344 | 345 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 346 | 347 | 348 | class ABCNN2(BCNNBaseModel): 349 | def __init__(self, params, logger, init_embedding_matrix): 350 | p = copy(params) 351 | p["model_name"] = p["model_name"] + "abcnn2" 352 | super(ABCNN2, self).__init__(p, logger, init_embedding_matrix) 353 | 354 | 355 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 356 | # x1, x2 = [batch, s, d, 1] 357 | att_mat0 = self._make_attention_matrix(x1, x2) 358 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 359 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 360 | 361 | # [batch, s+w-1, s+w-1] 362 | att_mat1 = self._make_attention_matrix(left_conv, right_conv) 363 | # [batch, s+w-1], [batch, s+w-1] 364 | left_attention, right_attention = tf.reduce_sum(att_mat1, axis=2), tf.reduce_sum(att_mat1, axis=1) 365 | 366 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 367 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 368 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 369 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 370 | 371 | # get attention matrix pooled features (as in sec. 
5.3.1) 372 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 373 | 374 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 375 | 376 | 377 | class ABCNN3(BCNNBaseModel): 378 | def __init__(self, params, logger, init_embedding_matrix): 379 | p = copy(params) 380 | p["model_name"] = p["model_name"] + "abcnn3" 381 | super(ABCNN3, self).__init__(p, logger, init_embedding_matrix) 382 | 383 | 384 | def _bcnn_cnn_layer(self, x1, x2, seq_len, d, name, dpool_index=None, granularity="word"): 385 | # x1, x2 = [batch, s, d, 1] 386 | # att_mat0: [batch, s, s 387 | att_mat0 = self._make_attention_matrix(x1, x2) 388 | x1, x2 = self._expand_input(x1, x2, att_mat0, seq_len, d, name=name + "expand_input") 389 | 390 | left_conv = self._convolution(x=self._padding(x1, name=name+"padding_left"), d=d, name=name+"conv", reuse=False) 391 | right_conv = self._convolution(x=self._padding(x2, name=name+"padding_right"), d=d, name=name+"conv", reuse=True) 392 | 393 | # [batch, s+w-1, s+w-1] 394 | att_mat1 = self._make_attention_matrix(left_conv, right_conv) 395 | # [batch, s+w-1], [batch, s+w-1] 396 | left_attention, right_attention = tf.reduce_sum(att_mat1, axis=2), tf.reduce_sum(att_mat1, axis=1) 397 | 398 | left_wp = self._w_ap(x=left_conv, attention=left_attention, name=name+"attention_pooling_left") 399 | left_ap = self._all_ap(x=left_conv, seq_len=seq_len, name=name+"global_pooling_left") 400 | right_wp = self._w_ap(x=right_conv, attention=right_attention, name=name+"attention_pooling_right") 401 | right_ap = self._all_ap(x=right_conv, seq_len=seq_len, name=name+"global_pooling_right") 402 | 403 | # get attention matrix pooled features (as in sec. 5.3.1) 404 | att_mat0_pooled = self._get_attention_matrix_pooled_features(att_mat0, seq_len, dpool_index, granularity, name+"att_pooled") 405 | 406 | return left_wp, left_ap, right_wp, right_ap, att_mat0_pooled 407 | -------------------------------------------------------------------------------- /src/models/decatt.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | 4 | from models.esim import ESIMDecAttBaseModel 5 | 6 | 7 | class DecAtt(ESIMDecAttBaseModel): 8 | def __init__(self, params, logger, init_embedding_matrix=None): 9 | p = copy(params) 10 | # model config 11 | p.update({ 12 | "model_name": p["model_name"] + "dec_att", 13 | "encode_method": "project", 14 | "attend_method": ["ave", "max", "min", "self-attention"], 15 | 16 | "project_type": "fc", 17 | "project_hidden_units": [64 * 4, 64 * 2, 64], 18 | "project_dropouts": [0, 0, 0], 19 | 20 | # fc block 21 | "fc_type": "fc", 22 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 23 | "fc_dropouts": [0, 0, 0], 24 | }) 25 | super(DecAtt, self).__init__(p, logger, init_embedding_matrix) 26 | -------------------------------------------------------------------------------- /src/models/dsmm.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from models.bcnn import BCNN, ABCNN1, ABCNN2, ABCNN3 6 | from models.esim import ESIMDecAttBaseModel 7 | from models.match_pyramid import MatchPyramidBaseModel 8 | from tf_common import metrics 9 | from tf_common.nn_module import mlp_layer 10 | 11 | 12 | class DSMM(MatchPyramidBaseModel, ESIMDecAttBaseModel, BCNN): 13 | def __init__(self, params, logger, init_embedding_matrix=None): 14 | p = copy(params) 15 | p["model_name"] = 
p["model_name"] + "dsmm" 16 | super(DSMM, self).__init__(p, logger, init_embedding_matrix) 17 | 18 | 19 | def _get_matching_features(self): 20 | with tf.name_scope(self.model_name): 21 | tf.set_random_seed(self.params["random_seed"]) 22 | 23 | with tf.name_scope("word_network"): 24 | if self.params["attend_method"] == "context-attention": 25 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 26 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 27 | self._interaction_semantic_feature_layer( 28 | self.seq_word_left, 29 | self.seq_word_right, 30 | self.seq_len_word_left, 31 | self.seq_len_word_right, 32 | granularity="word") 33 | else: 34 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 35 | self._semantic_feature_layer( 36 | self.seq_word_left, 37 | self.seq_len_word_left, 38 | granularity="word", reuse=False) 39 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 40 | self._semantic_feature_layer( 41 | self.seq_word_right, 42 | self.seq_len_word_right, 43 | granularity="word", reuse=True) 44 | 45 | #### matching 46 | # match score 47 | sim_word = tf.concat([ 48 | metrics.cosine_similarity(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 49 | metrics.dot_product(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 50 | metrics.euclidean_distance(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 51 | # self._canberra_score(sem_seq_word_left, sem_seq_word_right), 52 | ], axis=-1) 53 | 54 | # match pyramid 55 | match_matrix_word = self._get_match_matrix(self.seq_word_left, emb_seq_word_left, enc_seq_word_left, 56 | self.seq_word_right, emb_seq_word_right, enc_seq_word_right, 57 | granularity="word") 58 | mp_word = self._mp_semantic_feature_layer(match_matrix_word, 59 | self.dpool_index_word, 60 | granularity="word") 61 | 62 | # esim 63 | esim_word = self._esim_semantic_feature_layer(emb_seq_word_left, 64 | emb_seq_word_right, 65 | self.seq_len_word_left, 66 | self.seq_len_word_right, 67 | granularity="word") 68 | 69 | # bcnn 70 | bcnn_word = self._bcnn_semantic_feature_layer(emb_seq_word_left, 71 | emb_seq_word_right, 72 | granularity="word") 73 | 74 | # dense 75 | deep_in_word = tf.concat([sem_seq_word_left, sem_seq_word_right], axis=-1) 76 | deep_word = mlp_layer(deep_in_word, fc_type=self.params["fc_type"], 77 | hidden_units=self.params["fc_hidden_units"], 78 | dropouts=self.params["fc_dropouts"], 79 | scope_name=self.model_name + "deep_word", 80 | reuse=False, 81 | training=self.training, 82 | seed=self.params["random_seed"]) 83 | 84 | with tf.name_scope("char_network"): 85 | if self.params["attend_method"] == "context-attention": 86 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 87 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 88 | self._interaction_semantic_feature_layer( 89 | self.seq_char_left, 90 | self.seq_char_right, 91 | self.seq_len_char_left, 92 | self.seq_len_char_right, 93 | granularity="char") 94 | else: 95 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 96 | self._semantic_feature_layer( 97 | self.seq_char_left, 98 | self.seq_len_char_left, 99 | granularity="char", reuse=False) 100 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 101 | self._semantic_feature_layer( 102 | self.seq_char_right, 103 | self.seq_len_char_right, 104 | 
granularity="char", reuse=True) 105 | 106 | # match score 107 | sim_char = tf.concat([ 108 | metrics.cosine_similarity(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 109 | metrics.dot_product(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 110 | metrics.euclidean_distance(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 111 | # self._canberra_score(sem_seq_char_left, sem_seq_char_right), 112 | ], axis=-1) 113 | 114 | # match pyramid 115 | match_matrix_char = self._get_match_matrix(self.seq_char_left, emb_seq_char_left, enc_seq_char_left, 116 | self.seq_char_right, emb_seq_char_right, enc_seq_char_right, 117 | granularity="char") 118 | mp_char = self._mp_semantic_feature_layer(match_matrix_char, 119 | self.dpool_index_char, 120 | granularity="char") 121 | 122 | # esim 123 | esim_char = self._esim_semantic_feature_layer(emb_seq_char_left, 124 | emb_seq_char_right, 125 | self.seq_len_char_left, 126 | self.seq_len_char_right, 127 | granularity="char") 128 | 129 | # bcnn 130 | bcnn_char = self._bcnn_semantic_feature_layer(emb_seq_char_left, 131 | emb_seq_char_right, 132 | granularity="char") 133 | 134 | # dense 135 | deep_in_char = tf.concat([sem_seq_char_left, sem_seq_char_right], axis=-1) 136 | deep_char = mlp_layer(deep_in_char, fc_type=self.params["fc_type"], 137 | hidden_units=self.params["fc_hidden_units"], 138 | dropouts=self.params["fc_dropouts"], 139 | scope_name=self.model_name + "deep_char", 140 | reuse=False, 141 | training=self.training, 142 | seed=self.params["random_seed"]) 143 | 144 | with tf.name_scope("matching_features"): 145 | matching_features_word = tf.concat([ 146 | sim_word, mp_word, esim_word, bcnn_word, deep_word,# sem_seq_word_left, sem_seq_word_right, 147 | ], axis=-1) 148 | matching_features_char = tf.concat([ 149 | sim_char, mp_char, esim_char, bcnn_char, deep_char,# sem_seq_char_left, sem_seq_char_right, 150 | ], axis=-1) 151 | 152 | return matching_features_word, matching_features_char 153 | -------------------------------------------------------------------------------- /src/models/dssm.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from models.base_model import BaseModel 6 | from tf_common import metrics 7 | 8 | 9 | class DSSMBaseModel(BaseModel): 10 | def __init__(self, params, logger, init_embedding_matrix=None): 11 | super(DSSMBaseModel, self).__init__(params, logger, init_embedding_matrix) 12 | 13 | 14 | def _get_matching_features(self): 15 | with tf.name_scope(self.model_name): 16 | tf.set_random_seed(self.params["random_seed"]) 17 | 18 | with tf.name_scope("word_network"): 19 | if self.params["attend_method"] == "context-attention": 20 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 21 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 22 | self._interaction_semantic_feature_layer( 23 | self.seq_word_left, 24 | self.seq_word_right, 25 | self.seq_len_word_left, 26 | self.seq_len_word_right, 27 | granularity="word") 28 | else: 29 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 30 | self._semantic_feature_layer( 31 | self.seq_word_left, 32 | self.seq_len_word_left, 33 | granularity="word", reuse=False) 34 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 35 | self._semantic_feature_layer( 36 | self.seq_word_right, 37 | 
self.seq_len_word_right, 38 | granularity="word", reuse=True) 39 | # match score 40 | sim_word = tf.concat([ 41 | metrics.cosine_similarity(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 42 | metrics.dot_product(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 43 | metrics.euclidean_distance(sem_seq_word_left, sem_seq_word_right, self.params["similarity_aggregation"]), 44 | # self._canberra_score(sem_seq_word_left, sem_seq_word_right), 45 | ], axis=-1) 46 | 47 | with tf.name_scope("char_network"): 48 | if self.params["attend_method"] == "context-attention": 49 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 50 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 51 | self._interaction_semantic_feature_layer( 52 | self.seq_char_left, 53 | self.seq_char_right, 54 | self.seq_len_char_left, 55 | self.seq_len_char_right, 56 | granularity="char") 57 | else: 58 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 59 | self._semantic_feature_layer( 60 | self.seq_char_left, 61 | self.seq_len_char_left, 62 | granularity="char", reuse=False) 63 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 64 | self._semantic_feature_layer( 65 | self.seq_char_right, 66 | self.seq_len_char_right, 67 | granularity="char", reuse=True) 68 | # match score 69 | sim_char = tf.concat([ 70 | metrics.cosine_similarity(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 71 | metrics.dot_product(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 72 | metrics.euclidean_distance(sem_seq_char_left, sem_seq_char_right, self.params["similarity_aggregation"]), 73 | # self._canberra_score(sem_seq_char_left, sem_seq_char_right), 74 | ], axis=-1) 75 | 76 | with tf.name_scope("matching_features"): 77 | matching_features_word = sim_word 78 | matching_features_char = sim_char 79 | 80 | return matching_features_word, matching_features_char 81 | 82 | 83 | class DSSM(DSSMBaseModel): 84 | def __init__(self, params, logger, init_embedding_matrix=None): 85 | p = copy(params) 86 | # model config 87 | p.update({ 88 | "model_name": p["model_name"] + "dssm", 89 | "encode_method": "fasttext", 90 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 91 | 92 | # fc block 93 | "fc_type": "fc", 94 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 95 | "fc_dropouts": [0, 0, 0], 96 | }) 97 | super(DSSM, self).__init__(p, logger, init_embedding_matrix) 98 | 99 | 100 | class CDSSM(DSSMBaseModel): 101 | def __init__(self, params, logger, init_embedding_matrix=None): 102 | p = copy(params) 103 | # model config 104 | p.update({ 105 | "model_name": p["model_name"] + "cdssm", 106 | "encode_method": "textcnn", 107 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 108 | 109 | # cnn 110 | "cnn_num_layers": 1, 111 | "cnn_num_filters": 32, 112 | "cnn_filter_sizes": [1, 2, 3], 113 | "cnn_timedistributed": False, 114 | "cnn_activation": tf.nn.relu, 115 | "cnn_gated_conv": False, 116 | "cnn_residual": False, 117 | 118 | # fc block 119 | "fc_type": "fc", 120 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 121 | "fc_dropouts": [0, 0, 0], 122 | }) 123 | super(CDSSM, self).__init__(p, logger, init_embedding_matrix) 124 | 125 | 126 | class RDSSM(DSSMBaseModel): 127 | def __init__(self, params, logger, init_embedding_matrix=None): 128 | p = copy(params) 129 | # model config 130 | p.update({ 131 | "model_name": 
p["model_name"] + "rdssm", 132 | "encode_method": "textbirnn", 133 | "attend_method": ["ave", "max", "min", "self-scalar-attention"], 134 | 135 | # rnn 136 | "rnn_num_units": 32, 137 | "rnn_cell_type": "gru", 138 | "rnn_num_layers": 1, 139 | 140 | # fc block 141 | "fc_type": "fc", 142 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 143 | "fc_dropouts": [0, 0, 0], 144 | }) 145 | super(RDSSM, self).__init__(p, logger, init_embedding_matrix) 146 | -------------------------------------------------------------------------------- /src/models/esim.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from models.base_model import BaseModel 7 | from tf_common.nn_module import word_dropout 8 | from tf_common.nn_module import encode, attend 9 | 10 | 11 | class ESIMDecAttBaseModel(BaseModel): 12 | """ 13 | Implementation of base model of ESIM and DecAtt 14 | The difference between them lies in the encoder they use. 15 | - ESIM: BiLSTM 16 | - DecAtt: timedistributed dense projection 17 | 18 | Reference 19 | Paper: 20 | - ESIM: Enhanced LSTM for Natural Language Inference 21 | - DecAtt: A Decomposable Attention Model for Natural Language Inference 22 | Keras: 23 | https://www.kaggle.com/lamdang/dl-models 24 | Pytorch: 25 | https://github.com/lanwuwei/SPM_toolkit 26 | """ 27 | def __init__(self, params, logger, init_embedding_matrix=None): 28 | super(ESIMDecAttBaseModel, self).__init__(params, logger, init_embedding_matrix) 29 | 30 | 31 | def _soft_attention_alignment(self, x1, x2): 32 | "Align text representation with neural soft attention" 33 | # x1: [b, s1, d] 34 | # x2: [b, s2, d] 35 | # att: [b, s1, s2] 36 | att = tf.einsum("abd,acd->abc", x1, x2) 37 | w_att_1 = tf.nn.softmax(att, dim=1) 38 | w_att_2 = tf.nn.softmax(att, dim=2) 39 | x2_att = tf.einsum("abd,abc->acd", x1, w_att_1) 40 | x1_att = tf.einsum("abd,acb->acd", x2, w_att_2) 41 | return x1_att, x2_att 42 | 43 | 44 | def _esim_semantic_feature_layer(self, emb_seq_left, emb_seq_right, seq_len_left, seq_len_right, granularity="word"): 45 | # for sharing embedding with other sub-graph 46 | # #### embed 47 | # emb_matrix = self._get_embedding_matrix(granularity) 48 | # emb_seq_left = tf.nn.embedding_lookup(emb_matrix, seq_input_left) 49 | # emb_seq_right = tf.nn.embedding_lookup(emb_matrix, seq_input_right) 50 | # 51 | # #### dropout 52 | # random_seed = np.random.randint(10000000) 53 | # emb_seq_left = word_dropout(emb_seq_left, 54 | # training=self.training, 55 | # dropout=self.params["embedding_dropout"], 56 | # seed=random_seed) 57 | # random_seed = np.random.randint(10000000) 58 | # emb_seq_right = word_dropout(emb_seq_right, 59 | # training=self.training, 60 | # dropout=self.params["embedding_dropout"], 61 | # seed=random_seed) 62 | 63 | #### encode 64 | input_dim = self.params["embedding_dim"] 65 | enc_seq_left = encode(emb_seq_left, method=self.params["encode_method"], 66 | input_dim=input_dim, 67 | params=self.params, 68 | sequence_length=seq_len_left, 69 | mask_zero=self.params["embedding_mask_zero"], 70 | scope_name=self.model_name + "esim_enc_seq_%s" % granularity, reuse=False, 71 | training=self.training) 72 | enc_seq_right = encode(emb_seq_right, method=self.params["encode_method"], 73 | input_dim=input_dim, 74 | params=self.params, 75 | sequence_length=seq_len_right, 76 | mask_zero=self.params["embedding_mask_zero"], 77 | scope_name=self.model_name + "esim_enc_seq_%s" % granularity, reuse=True, 78 | 
training=self.training) 79 | 80 | #### align 81 | ali_seq_left, ali_seq_right = self._soft_attention_alignment(enc_seq_left, enc_seq_right) 82 | 83 | #### compose 84 | com_seq_left = tf.concat([ 85 | enc_seq_left, 86 | ali_seq_left, 87 | enc_seq_left * ali_seq_left, 88 | enc_seq_left - ali_seq_left, 89 | ], axis=-1) 90 | com_seq_right = tf.concat([ 91 | enc_seq_right, 92 | ali_seq_right, 93 | enc_seq_right * ali_seq_right, 94 | enc_seq_right - ali_seq_right, 95 | ], axis=-1) 96 | 97 | input_dim = self.params["encode_dim"] * 4 98 | compare_seq_left = encode(com_seq_left, method=self.params["encode_method"], 99 | input_dim=input_dim, 100 | params=self.params, 101 | sequence_length=seq_len_left, 102 | mask_zero=self.params["embedding_mask_zero"], 103 | scope_name=self.model_name + "compare_seq_%s" % granularity, reuse=False, 104 | training=self.training) 105 | compare_seq_right = encode(com_seq_right, method=self.params["encode_method"], 106 | input_dim=input_dim, 107 | params=self.params, 108 | sequence_length=seq_len_right, 109 | mask_zero=self.params["embedding_mask_zero"], 110 | scope_name=self.model_name + "compare_seq_%s" % granularity, reuse=True, 111 | training=self.training) 112 | 113 | #### attend 114 | feature_dim = self.params["encode_dim"] 115 | att_seq_left = attend(compare_seq_left, context=None, 116 | encode_dim=self.params["encode_dim"], 117 | feature_dim=feature_dim, 118 | attention_dim=self.params["attention_dim"], 119 | method=self.params["attend_method"], 120 | scope_name=self.model_name + "agg_seq_%s" % granularity, 121 | reuse=False, num_heads=self.params["attention_num_heads"]) 122 | att_seq_right = attend(compare_seq_right, context=None, 123 | encode_dim=self.params["encode_dim"], 124 | feature_dim=feature_dim, 125 | attention_dim=self.params["attention_dim"], 126 | method=self.params["attend_method"], 127 | scope_name=self.model_name + "agg_seq_%s" % granularity, 128 | reuse=True, num_heads=self.params["attention_num_heads"]) 129 | return tf.concat([att_seq_left, att_seq_right], axis=-1) 130 | 131 | 132 | def _get_matching_features(self): 133 | with tf.name_scope(self.model_name): 134 | tf.set_random_seed(self.params["random_seed"]) 135 | 136 | with tf.name_scope("word_network"): 137 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 138 | self._semantic_feature_layer( 139 | self.seq_word_left, 140 | self.seq_len_word_left, 141 | granularity="word", reuse=False) 142 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 143 | self._semantic_feature_layer( 144 | self.seq_word_right, 145 | self.seq_len_word_right, 146 | granularity="word", reuse=True) 147 | sim_word = self._esim_semantic_feature_layer( 148 | emb_seq_word_left, 149 | emb_seq_word_right, 150 | self.seq_len_word_left, 151 | self.seq_len_word_right, 152 | granularity="word") 153 | 154 | with tf.name_scope("char_network"): 155 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 156 | self._semantic_feature_layer( 157 | self.seq_char_left, 158 | self.seq_len_char_left, 159 | granularity="char", reuse=False) 160 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 161 | self._semantic_feature_layer( 162 | self.seq_char_right, 163 | self.seq_len_char_right, 164 | granularity="char", reuse=True) 165 | sim_char = self._esim_semantic_feature_layer( 166 | emb_seq_char_left, 167 | emb_seq_char_right, 168 | self.seq_len_char_left, 169 | self.seq_len_char_right, 170 | granularity="char") 171 | 172 
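# Summary of the feature pipeline implemented by _esim_semantic_feature_layer above
# (descriptive note; sim_word / sim_char are the outputs of that method):
#   1. encode  : shared encoder over the left/right embeddings (reuse=True on the right)
#   2. align   : att[b, i, j] = <enc_left[b, i], enc_right[b, j]>, softmax over the
#                opposite axis, weighted sums give ali_seq_left / ali_seq_right
#   3. compose : concat [enc, ali, enc * ali, enc - ali] per side (4 * encode_dim)
#   4. compare : a second shared encoder over the composed sequences
#   5. attend  : pooling/attention per side, then concat of both sides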
| with tf.name_scope("matching_features"): 173 | matching_features_word = sim_word 174 | matching_features_char = sim_char 175 | 176 | return matching_features_word, matching_features_char 177 | 178 | 179 | class ESIM(ESIMDecAttBaseModel): 180 | def __init__(self, params, logger, init_embedding_matrix=None): 181 | p = copy(params) 182 | # model config 183 | p.update({ 184 | "model_name": p["model_name"] + "esim", 185 | "encode_method": "textbirnn", 186 | "attend_method": ["ave", "max", "min", "self-attention"], 187 | 188 | # rnn 189 | "rnn_num_units": 32, 190 | "rnn_cell_type": "gru", 191 | "rnn_num_layers": 1, 192 | 193 | # fc block 194 | "fc_type": "fc", 195 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 196 | "fc_dropouts": [0, 0, 0], 197 | }) 198 | super(ESIMDecAttBaseModel, self).__init__(p, logger, init_embedding_matrix) 199 | -------------------------------------------------------------------------------- /src/models/match_pyramid.py: -------------------------------------------------------------------------------- 1 | 2 | from copy import copy 3 | import tensorflow as tf 4 | 5 | from inputs.dynamic_pooling import dynamic_pooling_index 6 | from models.base_model import BaseModel 7 | 8 | 9 | class MatchPyramidBaseModel(BaseModel): 10 | def __init__(self, params, logger, init_embedding_matrix=None): 11 | super(MatchPyramidBaseModel, self).__init__(params, logger, init_embedding_matrix) 12 | 13 | 14 | def _init_tf_vars(self): 15 | super(MatchPyramidBaseModel, self)._init_tf_vars() 16 | self.dpool_index_word = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_word"], 17 | self.params["max_seq_len_word"], 3], 18 | name="dpool_index_word") 19 | self.dpool_index_char = tf.placeholder(tf.int32, shape=[None, self.params["max_seq_len_char"], 20 | self.params["max_seq_len_char"], 3], 21 | name="dpool_index_char") 22 | 23 | 24 | def _get_match_matrix(self, seq_left, emb_seq_left, enc_seq_left, seq_right, emb_seq_right, enc_seq_right, 25 | granularity="word"): 26 | # 1. word embedding 27 | # 1.1 dot product: [batchsize, s1, s2, 1] 28 | match_matrix_dot_product = tf.expand_dims( 29 | tf.einsum("abd,acd->abc", emb_seq_left, emb_seq_right), axis=-1) 30 | # 1.2 identity: [batchsize, s1, s2, 1] 31 | match_matrix_identity = tf.expand_dims(tf.cast( 32 | tf.equal( 33 | tf.expand_dims(seq_left, 2), 34 | tf.expand_dims(seq_right, 1) 35 | ), tf.float32), axis=-1) 36 | 37 | # 2. 
compressed word embedding 38 | eW = tf.get_variable("eW_%s" % (self.model_name + granularity), 39 | initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.2, dtype=tf.float32), 40 | dtype=tf.float32, 41 | shape=[self.params["embedding_dim_%s" % granularity], 42 | self.params["embedding_dim_compressed"]]) 43 | emb_seq_com_left = tf.einsum("abd,dc->abc", emb_seq_left, eW) 44 | emb_seq_com_right = tf.einsum("abd,dc->abc", emb_seq_right, eW) 45 | # 2.1 dot product: [batchsize, s1, s2, 1] 46 | match_matrix_dot_product_com = tf.expand_dims( 47 | tf.einsum("abd,acd->abc", emb_seq_com_left, emb_seq_com_right), axis=-1) 48 | # 2.2 element product: [batchsize, s1, s2, d] 49 | match_matrix_element_product_com = tf.expand_dims(emb_seq_com_left, 2) * tf.expand_dims( 50 | emb_seq_com_right, 1) 51 | # 2.3 element concat: [batchsize, s1, s2, 2*d] 52 | match_matrix_element_concat_com = tf.concat([ 53 | tf.tile(tf.expand_dims(emb_seq_com_left, 2), [1, 1, self.params["max_seq_len_%s" % granularity], 1]), 54 | tf.tile(tf.expand_dims(emb_seq_com_right, 1), [1, self.params["max_seq_len_%s" % granularity], 1, 1]), 55 | ], axis=-1) 56 | 57 | # 3. contextual word embedding 58 | # 3.1 dot product: [batchsize, s1, s2, 1] 59 | match_matrix_dot_product_ctx = tf.expand_dims( 60 | tf.einsum("abd,acd->abc", enc_seq_left, enc_seq_right), axis=-1) 61 | # 2.2 element product: [batchsize, s1, s2, d] 62 | match_matrix_element_product_ctx = tf.expand_dims(enc_seq_left, 2) * tf.expand_dims( 63 | enc_seq_right, 1) 64 | # 2.3 element concat: [batchsize, s1, s2, 2*d] 65 | match_matrix_element_concat_ctx = tf.concat([ 66 | tf.tile(tf.expand_dims(enc_seq_left, 2), [1, 1, self.params["max_seq_len_%s" % granularity], 1]), 67 | tf.tile(tf.expand_dims(enc_seq_right, 1), [1, self.params["max_seq_len_%s" % granularity], 1, 1]), 68 | ], axis=-1) 69 | 70 | match_matrix = tf.concat([ 71 | match_matrix_dot_product, 72 | match_matrix_identity, 73 | match_matrix_dot_product_com, 74 | match_matrix_element_product_com, 75 | match_matrix_element_concat_com, 76 | match_matrix_dot_product_ctx, 77 | match_matrix_element_product_ctx, 78 | match_matrix_element_concat_ctx, 79 | ], axis=-1) 80 | return match_matrix 81 | 82 | 83 | def _mp_cnn_layer(self, cross, dpool_index, filters, kernel_size, pool_size, strides, name): 84 | cross_conv = tf.layers.conv2d( 85 | inputs=cross, 86 | filters=filters, 87 | kernel_size=kernel_size, 88 | padding="same", 89 | activation=self.params["mp_activation"], 90 | strides=1, 91 | reuse=False, 92 | name=name+"cross_conv") 93 | if self.params["mp_dynamic_pooling"] and dpool_index is not None: 94 | cross_conv = tf.gather_nd(cross_conv, dpool_index) 95 | cross_pool = tf.layers.max_pooling2d( 96 | inputs=cross_conv, 97 | pool_size=pool_size, 98 | strides=strides, 99 | padding="valid", 100 | name=name+"cross_pool") 101 | return cross_pool 102 | 103 | 104 | def _mp_semantic_feature_layer(self, match_matrix, dpool_index, granularity="word"): 105 | 106 | # conv-pool layer 1 107 | filters = self.params["mp_num_filters"][0] 108 | kernel_size = self.params["mp_filter_sizes"][0] 109 | seq_len = self.params["max_seq_len_%s" % granularity] 110 | pool_size0 = self.params["mp_pool_sizes_%s" % granularity][0] 111 | pool_sizes = [seq_len / pool_size0, seq_len / pool_size0] 112 | strides = [seq_len / pool_size0, seq_len / pool_size0] 113 | conv1 = self._mp_cnn_layer(match_matrix, dpool_index, filters, kernel_size, pool_sizes, strides, name=self.model_name+granularity+"1") 114 | conv1_flatten = tf.reshape(conv1, [-1, 
self.params["mp_num_filters"][0] * (pool_size0 * pool_size0)]) 115 | 116 | # conv-pool layer 2 117 | filters = self.params["mp_num_filters"][1] 118 | kernel_size = self.params["mp_filter_sizes"][1] 119 | pool_size1 = self.params["mp_pool_sizes_%s" % granularity][1] 120 | pool_sizes = [pool_size0 / pool_size1, pool_size0 / pool_size1] 121 | strides = [pool_size0 / pool_size1, pool_size0 / pool_size1] 122 | conv2 = self._mp_cnn_layer(conv1, None, filters, kernel_size, pool_sizes, strides, name=self.model_name + granularity + "2") 123 | conv2_flatten = tf.reshape(conv2, [-1, self.params["mp_num_filters"][1] * (pool_size1 * pool_size1)]) 124 | 125 | # cross = tf.concat([conv1_flatten, conv2_flatten], axis=-1) 126 | 127 | return conv2_flatten 128 | 129 | 130 | def _get_feed_dict(self, X, idx, Q, construct_neg=False, training=False, symmetric=False): 131 | feed_dict = super(MatchPyramidBaseModel, self)._get_feed_dict(X, idx, Q, construct_neg, training, symmetric) 132 | if self.params["mp_dynamic_pooling"]: 133 | dpool_index_word = dynamic_pooling_index(feed_dict[self.seq_len_word_left], 134 | feed_dict[self.seq_len_word_right], 135 | self.params["max_seq_len_word"], 136 | self.params["max_seq_len_word"]) 137 | dpool_index_char = dynamic_pooling_index(feed_dict[self.seq_len_char_left], 138 | feed_dict[self.seq_len_char_right], 139 | self.params["max_seq_len_char"], 140 | self.params["max_seq_len_char"]) 141 | feed_dict.update({ 142 | self.dpool_index_word: dpool_index_word, 143 | self.dpool_index_char: dpool_index_char, 144 | }) 145 | return feed_dict 146 | 147 | 148 | class MatchPyramid(MatchPyramidBaseModel): 149 | def __init__(self, params, logger, init_embedding_matrix=None): 150 | p = copy(params) 151 | p["model_name"] = p["model_name"] + "match_pyramid" 152 | super(MatchPyramid, self).__init__(p, logger, init_embedding_matrix) 153 | 154 | 155 | def _get_matching_features(self): 156 | with tf.name_scope(self.model_name): 157 | tf.set_random_seed(self.params["random_seed"]) 158 | 159 | with tf.name_scope("word_network"): 160 | if self.params["attend_method"] == "context-attention": 161 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 162 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 163 | self._interaction_semantic_feature_layer( 164 | self.seq_word_left, 165 | self.seq_word_right, 166 | self.seq_len_word_left, 167 | self.seq_len_word_right, 168 | granularity="word") 169 | else: 170 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 171 | self._semantic_feature_layer( 172 | self.seq_word_left, 173 | self.seq_len_word_left, 174 | granularity="word", reuse=False) 175 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 176 | self._semantic_feature_layer( 177 | self.seq_word_right, 178 | self.seq_len_word_right, 179 | granularity="word", reuse=True) 180 | match_matrix_word = tf.einsum("abd,acd->abc", emb_seq_word_left, emb_seq_word_right) 181 | match_matrix_word = tf.expand_dims(match_matrix_word, axis=-1) 182 | sim_word = self._mp_semantic_feature_layer(match_matrix_word, self.dpool_index_word, 183 | granularity="word") 184 | 185 | with tf.name_scope("char_network"): 186 | if self.params["attend_method"] == "context-attention": 187 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 188 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 189 | self._interaction_semantic_feature_layer( 190 | 
self.seq_char_left, 191 | self.seq_char_right, 192 | self.seq_len_char_left, 193 | self.seq_len_char_right, 194 | granularity="char") 195 | else: 196 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 197 | self._semantic_feature_layer( 198 | self.seq_char_left, 199 | self.seq_len_char_left, 200 | granularity="char", reuse=False) 201 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 202 | self._semantic_feature_layer( 203 | self.seq_char_right, 204 | self.seq_len_char_right, 205 | granularity="char", reuse=True) 206 | match_matrix_char = tf.einsum("abd,acd->abc", emb_seq_char_left, emb_seq_char_right) 207 | match_matrix_char = tf.expand_dims(match_matrix_char, axis=-1) 208 | sim_char = self._mp_semantic_feature_layer(match_matrix_char, self.dpool_index_char, 209 | granularity="char") 210 | with tf.name_scope("matching_features"): 211 | matching_features_word = sim_word 212 | matching_features_char = sim_char 213 | 214 | return matching_features_word, matching_features_char 215 | 216 | 217 | class GMatchPyramid(MatchPyramidBaseModel): 218 | def __init__(self, params, logger, init_embedding_matrix=None): 219 | p = copy(params) 220 | # model config 221 | p.update({ 222 | "model_name": p["model_name"] + "g_match_pyramid", 223 | "encode_method": "textcnn", 224 | "attend_method": ["ave", "max", "min", "self-attention"], 225 | 226 | # cnn 227 | "cnn_num_layers": 1, 228 | "cnn_num_filters": 32, 229 | "cnn_filter_sizes": [1, 2, 3], 230 | "cnn_timedistributed": False, 231 | "cnn_activation": tf.nn.relu, 232 | "cnn_gated_conv": True, 233 | "cnn_residual": True, 234 | 235 | # fc block 236 | "fc_type": "fc", 237 | "fc_hidden_units": [64 * 4, 64 * 2, 64], 238 | "fc_dropouts": [0, 0, 0], 239 | }) 240 | super(GMatchPyramid, self).__init__(p, logger, init_embedding_matrix) 241 | 242 | 243 | def _get_matching_features(self): 244 | with tf.name_scope(self.model_name): 245 | tf.set_random_seed(self.params["random_seed"]) 246 | 247 | with tf.name_scope("word_network"): 248 | if self.params["attend_method"] == "context-attention": 249 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left, \ 250 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 251 | self._interaction_semantic_feature_layer( 252 | self.seq_word_left, 253 | self.seq_word_right, 254 | self.seq_len_word_left, 255 | self.seq_len_word_right, 256 | granularity="word") 257 | else: 258 | emb_seq_word_left, enc_seq_word_left, att_seq_word_left, sem_seq_word_left = \ 259 | self._semantic_feature_layer( 260 | self.seq_word_left, 261 | self.seq_len_word_left, 262 | granularity="word", reuse=False) 263 | emb_seq_word_right, enc_seq_word_right, att_seq_word_right, sem_seq_word_right = \ 264 | self._semantic_feature_layer( 265 | self.seq_word_right, 266 | self.seq_len_word_right, 267 | granularity="word", reuse=True) 268 | 269 | match_matrix_word = self._get_match_matrix(self.seq_word_left, emb_seq_word_left, enc_seq_word_left, 270 | self.seq_word_right, emb_seq_word_right, enc_seq_word_right, 271 | granularity="word") 272 | sim_word = self._mp_semantic_feature_layer(match_matrix_word, self.dpool_index_word, granularity="word") 273 | 274 | with tf.name_scope("char_network"): 275 | if self.params["attend_method"] == "context-attention": 276 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left, \ 277 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 278 | 
self._interaction_semantic_feature_layer( 279 | self.seq_char_left, 280 | self.seq_char_right, 281 | self.seq_len_char_left, 282 | self.seq_len_char_right, 283 | granularity="char") 284 | else: 285 | emb_seq_char_left, enc_seq_char_left, att_seq_char_left, sem_seq_char_left = \ 286 | self._semantic_feature_layer( 287 | self.seq_char_left, 288 | self.seq_len_char_left, 289 | granularity="char", reuse=False) 290 | emb_seq_char_right, enc_seq_char_right, att_seq_char_right, sem_seq_char_right = \ 291 | self._semantic_feature_layer( 292 | self.seq_char_right, 293 | self.seq_len_char_right, 294 | granularity="char", reuse=True) 295 | 296 | match_matrix_char = self._get_match_matrix(self.seq_char_left, emb_seq_char_left, enc_seq_char_left, 297 | self.seq_char_right, emb_seq_char_right, enc_seq_char_right, 298 | granularity="char") 299 | sim_char = self._mp_semantic_feature_layer(match_matrix_char, self.dpool_index_char, 300 | granularity="char") 301 | 302 | with tf.name_scope("matching_features"): 303 | matching_features_word = sim_word 304 | matching_features_char = sim_char 305 | 306 | return matching_features_word, matching_features_char 307 | -------------------------------------------------------------------------------- /src/models/model_library.py: -------------------------------------------------------------------------------- 1 | 2 | from models.bcnn import BCNN, ABCNN1, ABCNN2, ABCNN3 3 | from models.decatt import DecAtt 4 | from models.dssm import DSSM, CDSSM, RDSSM 5 | from models.dsmm import DSMM 6 | from models.esim import ESIM 7 | from models.match_pyramid import MatchPyramid, GMatchPyramid 8 | 9 | 10 | def get_model(model_type): 11 | if model_type == "dssm": 12 | return DSSM 13 | elif model_type == "cdssm": 14 | return CDSSM 15 | elif model_type == "rdssm": 16 | return RDSSM 17 | elif model_type == "match_pyramid": 18 | return MatchPyramid 19 | elif model_type == "g_match_pyramid": 20 | return GMatchPyramid 21 | elif model_type == "dsmm": 22 | return DSMM 23 | elif model_type == "bcnn": 24 | return BCNN 25 | elif model_type == "abcnn1": 26 | return ABCNN1 27 | elif model_type == "abcnn2": 28 | return ABCNN2 29 | elif model_type == "abcnn3": 30 | return ABCNN3 31 | elif model_type == "esim": 32 | return ESIM 33 | elif model_type == "decatt": 34 | return DecAtt 35 | else: 36 | return DSMM 37 | -------------------------------------------------------------------------------- /src/tf_common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/tf_common/__init__.py -------------------------------------------------------------------------------- /src/tf_common/metrics.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | 5 | def cosine_similarity(v1, v2, aggregation=True): 6 | v1_n = tf.nn.l2_normalize(v1, dim=1) 7 | v2_n = tf.nn.l2_normalize(v2, dim=1) 8 | if aggregation: 9 | s = tf.reduce_sum(v1_n * v2_n, axis=1, keep_dims=True) 10 | else: 11 | s = v1_n * v2_n 12 | return s 13 | 14 | 15 | def dot_product(v1, v2, aggregation=True): 16 | if aggregation: 17 | s = tf.reduce_sum(v1 * v2, axis=1, keep_dims=True) 18 | else: 19 | s = v1 * v2 20 | return s 21 | 22 | 23 | def euclidean_distance(v1, v2, aggregation=True): 24 | if aggregation: 25 | s = tf.sqrt(tf.reduce_sum(tf.square(v1 - v2), axis=1, keep_dims=True)) 26 | else: 27 | s = tf.abs(v1 - v2) 28 | return s 29 | 30 | 31 
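# Hypothetical usage sketch (not part of the original module): the metrics above are
# typically concatenated into a small match-score feature vector, as the DSSM/DSMM
# models do with params["similarity_aggregation"]. Assumes the module-level
# `import tensorflow as tf` and TF 1.x graph mode.
def _match_score_features_sketch(v1, v2, aggregation=True):
    # v1, v2: [batch, dim] semantic vectors for the left/right sequences.
    # With aggregation=True each metric yields shape [batch, 1], so the
    # concatenation below is a [batch, 3] feature tensor.
    return tf.concat([
        cosine_similarity(v1, v2, aggregation),
        dot_product(v1, v2, aggregation),
        euclidean_distance(v1, v2, aggregation),
    ], axis=-1)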
| def euclidean_score(v1, v2, aggregation=True): 32 | s = euclidean_distance(v1, v2, aggregation) 33 | return 1. / (1. + s) 34 | 35 | 36 | def canberra_score(v1, v2, aggregation=True): 37 | if aggregation: 38 | s = tf.reduce_sum(tf.abs(v1 - v2) / (v1 + v2), axis=1, keep_dims=True) 39 | else: 40 | s = tf.abs(v1 - v2) / (v1 + v2) 41 | return s -------------------------------------------------------------------------------- /src/tf_common/nadam.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | from tensorflow.python.eager import context 4 | from tensorflow.python.framework import ops 5 | from tensorflow.python.ops import array_ops 6 | from tensorflow.python.ops import control_flow_ops 7 | from tensorflow.python.ops import math_ops 8 | from tensorflow.python.ops import resource_variable_ops 9 | from tensorflow.python.ops import state_ops 10 | from tensorflow.python.ops import variable_scope 11 | from tensorflow.python.training import optimizer 12 | from tensorflow.python.training import training_ops 13 | 14 | 15 | class NadamOptimizer(optimizer.Optimizer): 16 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 17 | schedule_decay=0.004, use_locking=False, name="Nadam"): 18 | super(NadamOptimizer, self).__init__(use_locking, name) 19 | self._lr = learning_rate 20 | self._beta1 = beta1 21 | self._beta2 = beta2 22 | self._epsilon = epsilon 23 | self._schedule_decay = schedule_decay 24 | # momentum cache decay 25 | self._momentum_cache_decay = tf.cast(0.96, tf.float32) 26 | self._momentum_cache_const = tf.pow(self._momentum_cache_decay, 1. * schedule_decay) 27 | 28 | # Tensor versions of the constructor arguments, created in _prepare(). 29 | self._lr_t = None 30 | self._beta1_t = None 31 | self._beta2_t = None 32 | self._epsilon_t = None 33 | self._schedule_decay_t = None 34 | 35 | # Variables to accumulate the powers of the beta parameters. 36 | # Created in _create_slots when we know the variables to optimize. 37 | self._beta1_power = None 38 | self._beta2_power = None 39 | self._iterations = None 40 | self._m_schedule = None 41 | 42 | # Created in SparseApply if needed. 43 | self._updated_lr = None 44 | 45 | 46 | def _prepare(self): 47 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 48 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 49 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 50 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 51 | self._schedule_decay_t = ops.convert_to_tensor(self._schedule_decay, name="schedule_decay") 52 | 53 | def _create_slots(self, var_list): 54 | # Create the beta1 and beta2 accumulators on the same device as the first 55 | # variable. Sort the var_list to make sure this device is consistent across 56 | # workers (these need to go on the same PS, otherwise some updates are 57 | # silently ignored). 
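# Bookkeeping created below (mirrors the Keras Nadam referenced in the comments of this file):
#   beta1_power, beta2_power : running products of beta1 / beta2 for bias correction
#   iterations               : step counter t (starts at 0, incremented in _finish)
#   m_schedule               : running product of the warming momentum coefficients
#                              mu_t = beta1 * (1 - 0.5 * 0.96**(t * schedule_decay)),
#                              used by the sparse update in _apply_sparse
#   m, v (one slot per var)  : first / second moment estimates
# The dense path delegates to training_ops.apply_adam with use_nesterov=True and
# does not consume m_schedule.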
58 | first_var = min(var_list, key=lambda x: x.name) 59 | 60 | create_new = self._iterations is None 61 | if not create_new and context.in_graph_mode(): 62 | create_new = (self._iterations.graph is not first_var.graph) 63 | 64 | if create_new: 65 | with ops.colocate_with(first_var): 66 | self._beta1_power = variable_scope.variable(self._beta1, 67 | name="beta1_power", 68 | trainable=False) 69 | self._beta2_power = variable_scope.variable(self._beta2, 70 | name="beta2_power", 71 | trainable=False) 72 | self._iterations = variable_scope.variable(0., 73 | name="iterations", 74 | trainable=False) 75 | self._m_schedule = variable_scope.variable(1., 76 | name="m_schedule", 77 | trainable=False) 78 | # Create slots for the first and second moments. 79 | for v in var_list: 80 | self._zeros_slot(v, "m", self._name) 81 | self._zeros_slot(v, "v", self._name) 82 | 83 | def _get_momentum_cache(self, schedule_decay_t, t): 84 | return tf.pow(self._momentum_cache_decay, t * schedule_decay_t) 85 | # return beta1_t * (1. - 0.5 * (tf.pow(self._momentum_cache_decay, t * schedule_decay_t))) 86 | 87 | 88 | """very slow 89 | we simply use the nadam update rule without warming momentum schedule 90 | def _apply_dense(self, grad, var): 91 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 92 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 93 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 94 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 95 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 96 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 97 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 98 | 99 | # Due to the recommendations in [2], i.e. warming momentum schedule 100 | # see keras Nadam 101 | momentum_cache_t = self._get_momentum_cache(beta1_t, schedule_decay_t, t) 102 | momentum_cache_t_1 = self._get_momentum_cache(beta1_t, schedule_decay_t, t+1.) 103 | m_schedule_new = m_schedule * momentum_cache_t 104 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 105 | 106 | # the following equations given in [1] 107 | # m_t = beta1 * m + (1 - beta1) * g_t 108 | m = self.get_slot(var, "m") 109 | m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking) 110 | g_prime = grad / (1. - m_schedule_new) 111 | m_t_prime = m_t / (1. - m_schedule_next) 112 | m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime 113 | 114 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 115 | v = self.get_slot(var, "v") 116 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 117 | v_t_prime = v_t / (1. 
- tf.pow(beta2_t, t)) 118 | 119 | var_update = state_ops.assign_sub(var, 120 | lr_t * m_t_bar / (tf.sqrt(v_t_prime) + epsilon_t), 121 | use_locking=self._use_locking) 122 | 123 | return control_flow_ops.group(*[var_update, m_t, v_t]) 124 | """ 125 | # nadam update rule without warming momentum schedule 126 | def _apply_dense(self, grad, var): 127 | m = self.get_slot(var, "m") 128 | v = self.get_slot(var, "v") 129 | return training_ops.apply_adam( 130 | var, 131 | m, 132 | v, 133 | math_ops.cast(self._beta1_power, var.dtype.base_dtype), 134 | math_ops.cast(self._beta2_power, var.dtype.base_dtype), 135 | math_ops.cast(self._lr_t, var.dtype.base_dtype), 136 | math_ops.cast(self._beta1_t, var.dtype.base_dtype), 137 | math_ops.cast(self._beta2_t, var.dtype.base_dtype), 138 | math_ops.cast(self._epsilon_t, var.dtype.base_dtype), 139 | grad, 140 | use_locking=self._use_locking, 141 | use_nesterov=True).op 142 | 143 | def _resource_apply_dense(self, grad, var): 144 | m = self.get_slot(var, "m") 145 | v = self.get_slot(var, "v") 146 | return training_ops.resource_apply_adam( 147 | var.handle, 148 | m.handle, 149 | v.handle, 150 | math_ops.cast(self._beta1_power, grad.dtype.base_dtype), 151 | math_ops.cast(self._beta2_power, grad.dtype.base_dtype), 152 | math_ops.cast(self._lr_t, grad.dtype.base_dtype), 153 | math_ops.cast(self._beta1_t, grad.dtype.base_dtype), 154 | math_ops.cast(self._beta2_t, grad.dtype.base_dtype), 155 | math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), 156 | grad, 157 | use_locking=self._use_locking, 158 | use_nesterov=True) 159 | 160 | # keras Nadam update rule 161 | def _apply_sparse(self, grad, var): 162 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 163 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 164 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 165 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 166 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 167 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 168 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 169 | 170 | # Due to the recommendations in [2], i.e. warming momentum schedule 171 | momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t) 172 | momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power) 173 | momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const) 174 | m_schedule_new = m_schedule * momentum_cache_t 175 | m_schedule_next = m_schedule_new * momentum_cache_t_1 176 | 177 | # the following equations given in [1] 178 | # m_t = beta1 * m + (1 - beta1) * g_t 179 | m = self.get_slot(var, "m") 180 | m_t = state_ops.scatter_update(m, grad.indices, 181 | beta1_t * array_ops.gather(m, grad.indices) + 182 | (1. - beta1_t) * grad.values, 183 | use_locking=self._use_locking) 184 | g_prime_slice = grad.values / (1. - m_schedule_new) 185 | m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next) 186 | m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice 187 | 188 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 189 | v = self.get_slot(var, "v") 190 | v_t = state_ops.scatter_update(v, grad.indices, 191 | beta2_t * array_ops.gather(v, grad.indices) + 192 | (1. - beta2_t) * tf.square(grad.values), 193 | use_locking=self._use_locking) 194 | v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. 
- tf.pow(beta2_t, t)) 195 | 196 | var_update = state_ops.scatter_sub(var, grad.indices, 197 | lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 198 | use_locking=self._use_locking) 199 | 200 | return control_flow_ops.group(*[var_update, m_t, v_t]) 201 | 202 | def _finish(self, update_ops, name_scope): 203 | # Update the power accumulators. 204 | with ops.control_dependencies(update_ops): 205 | with ops.colocate_with(self._iterations): 206 | update_beta1 = self._beta1_power.assign( 207 | self._beta1_power * self._beta1_t, 208 | use_locking=self._use_locking) 209 | update_beta2 = self._beta2_power.assign( 210 | self._beta2_power * self._beta2_t, 211 | use_locking=self._use_locking) 212 | t = self._iterations + 1. 213 | update_iterations = self._iterations.assign(t, use_locking=self._use_locking) 214 | momentum_cache_power = self._get_momentum_cache(self._schedule_decay_t, t) 215 | momentum_cache_t = self._beta1_t * (1. - 0.5 * momentum_cache_power) 216 | update_m_schedule = self._m_schedule.assign( 217 | self._m_schedule * momentum_cache_t, 218 | use_locking=self._use_locking) 219 | return control_flow_ops.group( 220 | *update_ops + [update_beta1, update_beta2] + [update_iterations, update_m_schedule], 221 | name=name_scope) -------------------------------------------------------------------------------- /src/tf_common/nn_module.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | """ 6 | https://explosion.ai/blog/deep-learning-formula-nlp 7 | embed -> encode -> attend -> predict 8 | """ 9 | def batch_normalization(x, training, name): 10 | # with tf.variable_scope(name, reuse=) 11 | bn_train = tf.layers.batch_normalization(x, training=True, reuse=None, name=name) 12 | bn_inference = tf.layers.batch_normalization(x, training=False, reuse=True, name=name) 13 | z = tf.cond(training, lambda: bn_train, lambda: bn_inference) 14 | return z 15 | 16 | 17 | #### Step 1 18 | def embed(x, size, dim, seed=0, flatten=False, reduce_sum=False): 19 | # std = np.sqrt(2 / dim) 20 | std = 0.001 21 | minval = -std 22 | maxval = std 23 | emb = tf.Variable(tf.random_uniform([size, dim], minval, maxval, dtype=tf.float32, seed=seed)) 24 | # None * max_seq_len * embed_dim 25 | out = tf.nn.embedding_lookup(emb, x) 26 | if flatten: 27 | out = tf.layers.flatten(out) 28 | if reduce_sum: 29 | out = tf.reduce_sum(out, axis=1) 30 | return out 31 | 32 | 33 | def embed_subword(x, size, dim, sequence_length, seed=0, mask_zero=False, maxlen=None): 34 | # std = np.sqrt(2 / dim) 35 | std = 0.001 36 | minval = -std 37 | maxval = std 38 | emb = tf.Variable(tf.random_uniform([size, dim], minval, maxval, dtype=tf.float32, seed=seed)) 39 | # None * max_seq_len * max_word_len * embed_dim 40 | out = tf.nn.embedding_lookup(emb, x) 41 | if mask_zero: 42 | # word_len: None * max_seq_len 43 | # mask: shape=None * max_seq_len * max_word_len 44 | mask = tf.sequence_mask(sequence_length, maxlen) 45 | mask = tf.expand_dims(mask, axis=-1) 46 | mask = tf.cast(mask, tf.float32) 47 | out = out * mask 48 | # None * max_seq_len * embed_dim 49 | # according to facebook subword paper, it's sum 50 | out = tf.reduce_sum(out, axis=2) 51 | return out 52 | 53 | 54 | def word_dropout(x, training, dropout=0, seed=0): 55 | # word dropout (dropout the entire embedding for some words) 56 | """ 57 | tf.layers.Dropout doesn't work as it can't switch training or inference 58 | """ 59 | if dropout > 0: 60 | input_shape = tf.shape(x) 61 | noise_shape = 
[input_shape[0], input_shape[1], 1] 62 | x = tf.layers.Dropout(rate=dropout, noise_shape=noise_shape, seed=seed)(x, training=training) 63 | return x 64 | 65 | 66 | #### Step 2 67 | def fasttext(x): 68 | return x 69 | 70 | 71 | # Language Modeling with Gated Convolutional Networks 72 | # https://github.com/anantzoid/Language-Modeling-GatedCNN 73 | def gated_conv1d_op(inputs, filters=8, kernel_size=3, padding="same", activation=None, strides=1, reuse=False, name=""): 74 | conv_linear = tf.layers.conv1d( 75 | inputs=inputs, 76 | filters=filters, 77 | kernel_size=kernel_size, 78 | padding="same", 79 | activation=None, 80 | strides=strides, 81 | reuse=reuse, 82 | name=name+"_linear") 83 | conv_gated = tf.layers.conv1d( 84 | inputs=inputs, 85 | filters=filters, 86 | kernel_size=kernel_size, 87 | padding="same", 88 | activation=tf.nn.sigmoid, 89 | strides=strides, 90 | reuse=reuse, 91 | name=name+"_gated") 92 | conv = conv_linear * conv_gated 93 | return conv 94 | 95 | 96 | def residual_gated_conv1d_op(inputs, filters=8, kernel_size=3, padding="same", activation=None, strides=1, reuse=False, name=""): 97 | conv_linear = tf.layers.conv1d( 98 | inputs=inputs, 99 | filters=filters, 100 | kernel_size=kernel_size, 101 | padding="same", 102 | activation=None, 103 | strides=strides, 104 | reuse=reuse, 105 | name=name+"_linear") 106 | conv_gated = tf.layers.conv1d( 107 | inputs=inputs, 108 | filters=filters, 109 | kernel_size=kernel_size, 110 | padding="same", 111 | activation=tf.nn.sigmoid, 112 | strides=strides, 113 | reuse=reuse, 114 | name=name+"_gated") 115 | conv = inputs * (1. - conv_gated) + conv_linear * conv_gated 116 | return conv 117 | 118 | 119 | def _textcnn(x, conv_op, num_filters=8, filter_sizes=[2, 3], bn=False, training=False, 120 | timedistributed=False, scope_name="textcnn", reuse=False, activation=tf.nn.relu): 121 | # x: None * step_dim * embed_dim 122 | conv_blocks = [] 123 | for i, filter_size in enumerate(filter_sizes): 124 | scope_name_i = "%s_textcnn_%s"%(str(scope_name), str(filter_size)) 125 | with tf.variable_scope(scope_name_i, reuse=reuse): 126 | if timedistributed: 127 | input_shape = tf.shape(x) 128 | step_dim = input_shape[1] 129 | embed_dim = input_shape[2] 130 | x = tf.transpose(x, [0, 2, 1]) 131 | # None * embed_dim * step_dim 132 | x = tf.reshape(x, [input_shape[0] * embed_dim, step_dim, 1]) 133 | conv = conv_op( 134 | inputs=x, 135 | filters=1, 136 | kernel_size=filter_size, 137 | padding="same", 138 | activation=activation, 139 | strides=1, 140 | reuse=reuse, 141 | name=scope_name_i) 142 | conv = tf.reshape(conv, [input_shape[0], embed_dim, step_dim]) 143 | conv = tf.transpose(conv, [0, 2, 1]) 144 | else: 145 | conv = conv_op( 146 | inputs=x, 147 | filters=num_filters, 148 | kernel_size=filter_size, 149 | padding="same", 150 | activation=activation, 151 | strides=1, 152 | reuse=reuse, 153 | name=scope_name_i) 154 | if bn: 155 | conv = tf.layers.BatchNormalization()(conv, training) 156 | # conv = activation(conv) 157 | conv_blocks.append(conv) 158 | if len(conv_blocks) > 1: 159 | z = tf.concat(conv_blocks, axis=-1) 160 | else: 161 | z = conv_blocks[0] 162 | return z 163 | 164 | 165 | def textcnn(x, num_layers=2, num_filters=8, filter_sizes=[2, 3], bn=False, training=False, 166 | timedistributed=False, scope_name="textcnn", reuse=False, activation=tf.nn.relu, 167 | gated_conv=False, residual=False): 168 | if gated_conv: 169 | if residual: 170 | conv_op = residual_gated_conv1d_op 171 | else: 172 | conv_op = gated_conv1d_op 173 | else: 174 | conv_op = tf.layers.conv1d 
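    # Note on the three conv_op variants selected above:
    #   tf.layers.conv1d         -> plain 1-D convolution, `activation` applied as usual
    #   gated_conv1d_op          -> Gated Linear Unit: conv_linear * sigmoid(conv_gate)
    #   residual_gated_conv1d_op -> highway-style mix: x * (1 - gate) + conv_linear * gate,
    #                               letting a layer fall back to the identity mapping
    # (both gated variants ignore the `activation` argument passed in by _textcnn)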
175 | conv_blocks = [] 176 | for i in range(num_layers): 177 | scope_name_i = "%s_textcnn_layer_%s" % (str(scope_name), str(i)) 178 | x = _textcnn(x, conv_op, num_filters, filter_sizes, bn, training, timedistributed, scope_name_i, reuse, activation) 179 | conv_blocks.append(x) 180 | if len(conv_blocks) > 1: 181 | z = tf.concat(conv_blocks, axis=-1) 182 | else: 183 | z = conv_blocks[0] 184 | return z 185 | 186 | 187 | def textrnn(x, num_units, cell_type, sequence_length, num_layers=1, mask_zero=False, scope_name="textrnn", reuse=False): 188 | for i in range(num_layers): 189 | scope_name_i = "%s_textrnn_%s_%s_%s" % (str(scope_name), cell_type, str(i), str(num_units)) 190 | with tf.variable_scope(scope_name_i, reuse=reuse): 191 | if cell_type == "gru": 192 | cell_fw = tf.nn.rnn_cell.GRUCell(num_units) 193 | elif cell_type == "lstm": 194 | cell_fw = tf.nn.rnn_cell.LSTMCell(num_units) 195 | if mask_zero: 196 | x, _ = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=sequence_length, scope=scope_name_i) 197 | else: 198 | x, _ = tf.nn.dynamic_rnn(cell_fw, x, dtype=tf.float32, sequence_length=None, scope=scope_name_i) 199 | return x 200 | 201 | 202 | def textbirnn(x, num_units, cell_type, sequence_length, num_layers=1, mask_zero=False, scope_name="textbirnn", reuse=False): 203 | for i in range(num_layers): 204 | scope_name_i = "%s_textbirnn_%s_%s_%s" % (str(scope_name), cell_type, str(i), str(num_units)) 205 | with tf.variable_scope(scope_name_i, reuse=reuse): 206 | if cell_type == "gru": 207 | cell_fw = tf.nn.rnn_cell.GRUCell(num_units) 208 | cell_bw = tf.nn.rnn_cell.GRUCell(num_units) 209 | elif cell_type == "lstm": 210 | cell_fw = tf.nn.rnn_cell.LSTMCell(num_units) 211 | cell_bw = tf.nn.rnn_cell.LSTMCell(num_units) 212 | if mask_zero: 213 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 214 | cell_fw, cell_bw, x, dtype=tf.float32, sequence_length=sequence_length, scope=scope_name_i) 215 | else: 216 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 217 | cell_fw, cell_bw, x, dtype=tf.float32, sequence_length=None, scope=scope_name_i) 218 | x = tf.concat([output_fw, output_bw], axis=-1) 219 | return x 220 | 221 | 222 | 223 | def encode(x, method, params, input_dim, 224 | sequence_length=None, mask_zero=False, 225 | scope_name="encode", reuse=False, 226 | training=False, seed=0): 227 | """ 228 | :param x: shape=(None,seqlen,dim) 229 | :param params: 230 | :return: shape=(None,seqlen,dim) 231 | """ 232 | out_list = [] 233 | params["encode_dim"] = 0 234 | for m in method.split("+"): 235 | if m == "fasttext": 236 | dim_f = input_dim # params["embedding_dim"] 237 | z = fasttext(x) 238 | out_list.append(z) 239 | params["encode_dim"] += dim_f 240 | elif m == "project": 241 | dim_p = params["project_hidden_units"][-1] 242 | step_dim = tf.shape(x)[1] 243 | z = tf.reshape(x, [-1, input_dim]) 244 | z = mlp_layer(z, fc_type=params["project_type"], 245 | hidden_units=params["project_hidden_units"], 246 | dropouts=params["project_dropouts"], 247 | scope_name=scope_name, 248 | reuse=reuse, 249 | training=training, 250 | seed=params["random_seed"]) 251 | z = tf.reshape(z, [-1, step_dim, params["project_hidden_units"][-1]]) 252 | out_list.append(z) 253 | params["encode_dim"] += dim_p 254 | elif m == "textcnn": 255 | dim_c = params["cnn_num_layers"] * len(params["cnn_filter_sizes"]) * params["cnn_num_filters"] 256 | z = textcnn(x, num_layers=params["cnn_num_layers"], num_filters=params["cnn_num_filters"], filter_sizes=params["cnn_filter_sizes"], 257 | 
timedistributed=params["cnn_timedistributed"], scope_name=scope_name, reuse=reuse)
258 |             out_list.append(z)
259 |             params["encode_dim"] += dim_c
260 |         elif m == "textrnn":
261 |             dim_r = params["rnn_num_units"]
262 |             z = textrnn(x, num_units=params["rnn_num_units"], cell_type=params["rnn_cell_type"], num_layers=params["rnn_num_layers"],
263 |                         sequence_length=sequence_length, mask_zero=mask_zero, scope_name=scope_name, reuse=reuse)
264 |             out_list.append(z)
265 |             params["encode_dim"] += dim_r
266 |         elif m == "textbirnn":
267 |             dim_b = params["rnn_num_units"] * 2
268 |             z = textbirnn(x, num_units=params["rnn_num_units"], cell_type=params["rnn_cell_type"], num_layers=params["rnn_num_layers"],
269 |                           sequence_length=sequence_length, mask_zero=mask_zero, scope_name=scope_name, reuse=reuse)
270 |             out_list.append(z)
271 |             params["encode_dim"] += dim_b
272 |     z = tf.concat(out_list, axis=-1)
273 |     return z
274 | 
275 | 
276 | def scalar_attention(x, encode_dim, feature_dim, attention_dim, sequence_length=None,
277 |                      mask_zero=False, maxlen=None, epsilon=1e-8, seed=0, scope_name="attention", reuse=False):
278 |     """
279 |     :param x: [batchsize, s, feature_dim]
280 |     :param encode_dim: dim of encoder output
281 |     :param feature_dim: dim of x (for self-attention, x is the encoder output;
282 |         for context-attention, x is the concat of encoder output and contextual info)
283 |     :param sequence_length:
284 |     :param mask_zero:
285 |     :param maxlen:
286 |     :param epsilon:
287 |     :param seed:
288 |     :param scope_name:
289 |     :param reuse:
290 |     :return: [batchsize, s, 1]
291 |     """
292 |     with tf.variable_scope(scope_name, reuse=reuse):
293 |         # W1: [feature_dim]
294 |         W1 = tf.get_variable("W1_%s" % scope_name,
295 |                              initializer=tf.truncated_normal_initializer(
296 |                                  mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed),
297 |                              dtype=tf.float32,
298 |                              shape=[feature_dim])
299 |         # b1: [1]
300 |         b1 = tf.get_variable("b1_%s" % scope_name,
301 |                              initializer=tf.truncated_normal_initializer(
302 |                                  mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed),
303 |                              dtype=tf.float32,
304 |                              shape=[1])
305 |         e = tf.einsum("bsf,f->bs", x, W1) + \
306 |             tf.expand_dims(b1, axis=1)
307 |         a = tf.exp(e)
308 | 
309 |         # apply mask after the exp.
will be re-normalized next 310 | if mask_zero: 311 | # None * s 312 | mask = tf.sequence_mask(sequence_length, maxlen) 313 | mask = tf.cast(mask, tf.float32) 314 | a = a * mask 315 | 316 | # in some cases especially in the early stages of training the sum may be almost zero 317 | s = tf.reduce_sum(a, axis=1, keep_dims=True) 318 | a /= tf.cast(s + epsilon, tf.float32) 319 | a = tf.expand_dims(a, axis=-1) 320 | 321 | return a 322 | 323 | 324 | # vector-based attention proposed in the following paper 325 | # Enhancing Sentence Embedding with Generalized Pooling 326 | def vector_attention(x, encode_dim, feature_dim, attention_dim, sequence_length=None, 327 | mask_zero=False, maxlen=None, epsilon=1e-8, seed=0, 328 | scope_name="attention", reuse=False): 329 | """ 330 | :param x: [batchsize, s, feature_dim] 331 | :param encode_dim: dim of encoder output 332 | :param feature_dim: dim of x (for self-attention, x is the encoder output; 333 | for context-attention, x is the concat of encoder output and contextual info) 334 | :param sequence_length: 335 | :param mask_zero: 336 | :param maxlen: 337 | :param epsilon: 338 | :param seed: 339 | :param scope_name: 340 | :param reuse: 341 | :return: [batchsize, s, encode_dim] 342 | """ 343 | with tf.variable_scope(scope_name, reuse=reuse): 344 | # W1: [attention_dim, feature_dim] 345 | W1 = tf.get_variable("W1_%s" % scope_name, 346 | initializer=tf.truncated_normal_initializer( 347 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 348 | dtype=tf.float32, 349 | shape=[attention_dim, feature_dim]) 350 | # b1: [attention_dim] 351 | b1 = tf.get_variable("b1_%s" % scope_name, 352 | initializer=tf.truncated_normal_initializer( 353 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 354 | dtype=tf.float32, 355 | shape=[attention_dim]) 356 | # W2: [encode_dim, attention_dim] 357 | W2 = tf.get_variable("W2_%s" % scope_name, 358 | initializer=tf.truncated_normal_initializer( 359 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 360 | dtype=tf.float32, 361 | shape=[encode_dim, attention_dim]) 362 | # b2: [encode_dim] 363 | b2 = tf.get_variable("b2_%s" % scope_name, 364 | initializer=tf.truncated_normal_initializer( 365 | mean=0.0, stddev=0.2, dtype=tf.float32, seed=seed), 366 | dtype=tf.float32, 367 | shape=[encode_dim]) 368 | # [batchsize, attention_dim, s] 369 | e = tf.nn.relu( 370 | tf.einsum("bsf,af->bas", x, W1) + \ 371 | tf.expand_dims(tf.expand_dims(b1, axis=0), axis=-1)) 372 | # [batchsize, s, encode_dim] 373 | e = tf.einsum("bas,ea->bse", e, W2) + \ 374 | tf.expand_dims(tf.expand_dims(b2, axis=0), axis=0) 375 | a = tf.exp(e) 376 | 377 | # apply mask after the exp. 
will be re-normalized next 378 | if mask_zero: 379 | # [batchsize, s, 1] 380 | mask = tf.sequence_mask(sequence_length, maxlen) 381 | mask = tf.expand_dims(tf.cast(mask, tf.float32), axis=-1) 382 | a = a * mask 383 | 384 | # in some cases especially in the early stages of training the sum may be almost zero 385 | s = tf.reduce_sum(a, axis=1, keep_dims=True) 386 | a /= tf.cast(s + epsilon, tf.float32) 387 | 388 | return a 389 | 390 | 391 | def _attend(x, sequence_length=None, method="ave", context=None, encode_dim=None, 392 | feature_dim=None, attention_dim=None, mask_zero=False, maxlen=None, 393 | bn=False, training=False, seed=0, scope_name="attention", reuse=False, 394 | num_heads=1): 395 | if method == "ave": 396 | if mask_zero: 397 | # None * step_dim 398 | mask = tf.sequence_mask(sequence_length, maxlen) 399 | mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1)) 400 | mask = tf.cast(mask, tf.float32) 401 | z = tf.reduce_sum(x * mask, axis=1) 402 | l = tf.reduce_sum(mask, axis=1) 403 | # in some cases especially in the early stages of training the sum may be almost zero 404 | epsilon = 1e-8 405 | z /= tf.cast(l + epsilon, tf.float32) 406 | else: 407 | z = tf.reduce_mean(x, axis=1) 408 | elif method == "sum": 409 | if mask_zero: 410 | # None * step_dim 411 | mask = tf.sequence_mask(sequence_length, maxlen) 412 | mask = tf.reshape(mask, (-1, tf.shape(x)[1], 1)) 413 | mask = tf.cast(mask, tf.float32) 414 | z = tf.reduce_sum(x * mask, axis=1) 415 | else: 416 | z = tf.reduce_sum(x, axis=1) 417 | elif method == "max": 418 | if mask_zero: 419 | # None * step_dim 420 | mask = tf.sequence_mask(sequence_length, maxlen) 421 | mask = tf.expand_dims(mask, axis=-1) 422 | mask = tf.tile(mask, (1, 1, tf.shape(x)[2])) 423 | masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)), 424 | tf.ones_like(x) * -np.inf, x) # if masked assume value is -inf 425 | z = tf.reduce_max(masked_data, axis=1) 426 | else: 427 | z = tf.reduce_max(x, axis=1) 428 | elif method == "min": 429 | if mask_zero: 430 | # None * step_dim 431 | mask = tf.sequence_mask(sequence_length, maxlen) 432 | mask = tf.expand_dims(mask, axis=-1) 433 | mask = tf.tile(mask, (1, 1, tf.shape(x)[2])) 434 | masked_data = tf.where(tf.equal(mask, tf.zeros_like(mask)), 435 | tf.ones_like(x) * np.inf, x) # if masked assume value is -inf 436 | z = tf.reduce_min(masked_data, axis=1) 437 | else: 438 | z = tf.reduce_min(x, axis=1) 439 | elif "attention" in method: 440 | if context is not None: 441 | y = tf.concat([x, context], axis=-1) 442 | else: 443 | y = x 444 | zs = [] 445 | for i in range(num_heads): 446 | if "vector" in method: 447 | a = vector_attention(y, encode_dim, feature_dim, attention_dim, sequence_length, mask_zero, maxlen, seed=seed, scope_name=scope_name+str(i), reuse=reuse) 448 | else: 449 | a = scalar_attention(y, encode_dim, feature_dim, attention_dim, sequence_length, mask_zero, maxlen, seed=seed, scope_name=scope_name+str(i), reuse=reuse) 450 | zs.append(tf.reduce_sum(x * a, axis=1)) 451 | z = tf.concat(zs, axis=-1) 452 | if bn: 453 | z = tf.layers.BatchNormalization()(z, training=training) 454 | return z 455 | 456 | 457 | def attend(x, sequence_length=None, method="ave", context=None, encode_dim=None, 458 | feature_dim=None, attention_dim=None, mask_zero=False, maxlen=None, 459 | bn=False, training=False, seed=0, scope_name="attention", reuse=False, 460 | num_heads=1): 461 | if isinstance(method, list): 462 | outputs = [None]*len(method) 463 | for i,m in enumerate(method): 464 | outputs[i] = _attend(x, sequence_length, m, context, 
encode_dim, feature_dim, attention_dim, mask_zero, maxlen, 465 | bn, training, seed, scope_name+m, reuse, num_heads) 466 | return tf.concat(outputs, axis=-1) 467 | else: 468 | return _attend(x, sequence_length, method, context, encode_dim, feature_dim, attention_dim, mask_zero, maxlen, 469 | bn, training, seed, scope_name+method, reuse, num_heads) 470 | 471 | 472 | #### Step 4 473 | def _dense_block_mode1(x, hidden_units, dropouts, densenet=False, scope_name="dense_block", reuse=False, training=False, seed=0, bn=False): 474 | """ 475 | :param x: 476 | :param hidden_units: 477 | :param dropouts: 478 | :param densenet: enable densenet 479 | :return: 480 | Ref: https://github.com/titu1994/DenseNet 481 | """ 482 | for i, (h, d) in enumerate(zip(hidden_units, dropouts)): 483 | scope_name_i = "%s-dense_block_mode1-%s"%(str(scope_name), str(i)) 484 | with tf.variable_scope(scope_name, reuse=reuse): 485 | z = tf.layers.dense(x, h, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * i), 486 | reuse=reuse, 487 | name=scope_name_i) 488 | if bn: 489 | z = batch_normalization(z, training=training, name=scope_name_i+"-bn") 490 | z = tf.nn.relu(z) 491 | z = tf.layers.Dropout(d, seed=seed * i)(z, training=training) if d > 0 else z 492 | if densenet: 493 | x = tf.concat([x, z], axis=-1) 494 | else: 495 | x = z 496 | return x 497 | 498 | 499 | def _dense_block_mode2(x, hidden_units, dropouts, densenet=False, training=False, seed=0, bn=False, name="dense_block"): 500 | """ 501 | :param x: 502 | :param hidden_units: 503 | :param dropouts: 504 | :param densenet: enable densenet 505 | :return: 506 | Ref: https://github.com/titu1994/DenseNet 507 | """ 508 | for i, (h, d) in enumerate(zip(hidden_units, dropouts)): 509 | if bn: 510 | z = batch_normalization(x, training=training, name=name + "-" + str(i)) 511 | z = tf.nn.relu(z) 512 | z = tf.layers.Dropout(d, seed=seed * i)(z, training=training) if d > 0 else z 513 | z = tf.layers.Dense(h, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * i), dtype=tf.float32, 514 | bias_initializer=tf.zeros_initializer())(z) 515 | if densenet: 516 | x = tf.concat([x, z], axis=-1) 517 | else: 518 | x = z 519 | return x 520 | 521 | 522 | def dense_block(x, hidden_units, dropouts, densenet=False, scope_name="dense_block", reuse=False, training=False, seed=0, bn=False): 523 | return _dense_block_mode1(x, hidden_units, dropouts, densenet, scope_name, reuse, training, seed, bn) 524 | 525 | 526 | def _resnet_branch_mode1(x, hidden_units, dropouts, training, seed=0): 527 | h1, h2, h3 = hidden_units 528 | dr1, dr2, dr3 = dropouts 529 | name = "resnet_block" 530 | # branch 2 531 | x2 = tf.layers.Dense(h1, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 2), dtype=tf.float32, 532 | bias_initializer=tf.zeros_initializer())(x) 533 | x2 = tf.layers.BatchNormalization()(x2, training=training) 534 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(1)) 535 | x2 = tf.nn.relu(x2) 536 | x2 = tf.layers.Dropout(dr1, seed=seed * 1)(x2, training=training) if dr1 > 0 else x2 537 | 538 | x2 = tf.layers.Dense(h2, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 3), dtype=tf.float32, 539 | bias_initializer=tf.zeros_initializer())(x2) 540 | x2 = tf.layers.BatchNormalization()(x2, training=training) 541 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(2)) 542 | x2 = tf.nn.relu(x2) 543 | x2 = tf.layers.Dropout(dr2, seed=seed * 2)(x2, training=training) if dr2 > 0 else x2 544 | 545 | x2 = tf.layers.Dense(h3, 
kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 4), dtype=tf.float32, 546 | bias_initializer=tf.zeros_initializer())(x2) 547 | x2 = tf.layers.BatchNormalization()(x2, training=training) 548 | # x2 = batch_normalization(x2, training=training, name=name + "-" + str(3)) 549 | 550 | return x2 551 | 552 | 553 | def _resnet_block_mode1(x, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0): 554 | """A block that has a dense layer at shortcut. 555 | # Arguments 556 | input_tensor: input tensor 557 | kernel_size: default 3, the kernel size of middle conv layer at main path 558 | filters: list of integers, the filters of 3 conv layer at main path 559 | stage: integer, current stage label, used for generating layer names 560 | block: 'a','b'..., current block label, used for generating layer names 561 | # Returns 562 | Output tensor for the block. 563 | Note that from stage 3, the first conv layer at main path is with strides=(2,2) 564 | And the shortcut should have strides=(2,2) as well 565 | """ 566 | h1, h2, h3 = hidden_units 567 | dr1, dr2, dr3 = dropouts 568 | name = "resnet_block" 569 | xs = [] 570 | # branch 0 571 | if dense_shortcut: 572 | x0 = tf.layers.Dense(h3, kernel_initializer=tf.glorot_uniform_initializer(seed=seed * 1), dtype=tf.float32, 573 | bias_initializer=tf.zeros_initializer())(x) 574 | x0 = tf.layers.BatchNormalization()(x0, training=training) 575 | # x0 = batch_normalization(x0, training=training, name=name + "-" + str(0)) 576 | xs.append(x0) 577 | else: 578 | xs.append(x) 579 | 580 | # branch 1 ~ cardinality 581 | for i in range(cardinality): 582 | xs.append(_resnet_branch_mode1(x, hidden_units, dropouts, training, seed)) 583 | 584 | x = tf.add_n(xs) 585 | x = tf.nn.relu(x) 586 | x = tf.layers.Dropout(dr3, seed=seed * 4)(x, training=training) if dr3 > 0 else x 587 | return x 588 | 589 | 590 | def _resnet_branch_mode2(x, hidden_units, dropouts, training=False, seed=0, scope_name="_resnet_branch_mode2", reuse=False): 591 | h1, h2, h3 = hidden_units 592 | dr1, dr2, dr3 = dropouts 593 | # name = "resnet" 594 | with tf.variable_scope(scope_name, reuse=reuse): 595 | # branch 2: bn-relu->weight 596 | x2 = tf.layers.BatchNormalization()(x) 597 | # x2 = batch_normalization(x, training=training, name=scope_name + "-bn-" + str(1)) 598 | x2 = tf.nn.relu(x2) 599 | x2 = tf.layers.Dropout(dr1)(x2, training=training) if dr1 > 0 else x2 600 | x2 = tf.layers.dense(x2, h1, kernel_initializer=tf.glorot_uniform_initializer(seed * 1), 601 | bias_initializer=tf.zeros_initializer(), 602 | name=scope_name+"-dense-"+str(1), 603 | reuse=reuse) 604 | 605 | x2 = tf.layers.BatchNormalization()(x2) 606 | # x2 = batch_normalization(x2, training=training, name=scope_name + "-bn-" + str(2)) 607 | x2 = tf.nn.relu(x2) 608 | x2 = tf.layers.Dropout(dr2)(x2, training=training) if dr2 > 0 else x2 609 | x2 = tf.layers.dense(x2, h2, kernel_initializer=tf.glorot_uniform_initializer(seed * 2), 610 | bias_initializer=tf.zeros_initializer(), 611 | name=scope_name + "-dense-" + str(2), 612 | reuse=reuse) 613 | 614 | x2 = tf.layers.BatchNormalization()(x2) 615 | # x2 = batch_normalization(x2, training=training, name=scope_name + "-bn-" + str(3)) 616 | x2 = tf.nn.relu(x2) 617 | x2 = tf.layers.Dropout(dr3)(x2, training=training) if dr3 > 0 else x2 618 | x2 = tf.layers.dense(x2, h3, kernel_initializer=tf.glorot_uniform_initializer(seed * 3), 619 | bias_initializer=tf.zeros_initializer(), 620 | name=scope_name + "-dense-" + str(3), 621 | reuse=reuse) 622 | 623 | return x2 624 | 
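# ---------------------------------------------------------------------------
# Illustrative sketch (NOT part of the original module): how the building
# blocks above fit the "embed -> encode -> attend -> predict" recipe quoted
# at the top of this file. All names, shapes and hyper-parameters below
# (vocab_size, embed_dim, maxlen, padding id 0, the "sketch_*" scopes) are
# assumptions made only for this example.
def _pipeline_sketch(word_ids, training, vocab_size=1000, embed_dim=32, maxlen=20):
    # word_ids: int tensor of shape None * maxlen, assumed to be padded with id 0
    seq_len = tf.reduce_sum(tf.cast(tf.not_equal(word_ids, 0), tf.int32), axis=1)
    params = {
        "cnn_num_layers": 1,
        "cnn_num_filters": 8,
        "cnn_filter_sizes": [2, 3],
        "cnn_timedistributed": False,
    }
    # embed: None * maxlen * embed_dim
    emb = embed(word_ids, size=vocab_size, dim=embed_dim)
    # encode: None * maxlen * encode_dim; encode() writes the resulting
    # encode_dim back into params["encode_dim"]
    enc = encode(emb, method="textcnn", params=params, input_dim=embed_dim,
                 sequence_length=seq_len, mask_zero=True,
                 scope_name="sketch_encode", training=training)
    # attend: None * encode_dim (masked average pooling over the time axis)
    feat = attend(enc, sequence_length=seq_len, method="ave",
                  encode_dim=params["encode_dim"], feature_dim=params["encode_dim"],
                  attention_dim=16, mask_zero=True, maxlen=maxlen,
                  training=training, scope_name="sketch_attend")
    # predict: a small fully-connected head; dense_block applies ReLU after
    # each layer, so a final linear layer / loss would normally go on top
    out = mlp_layer(feat, fc_type="fc", hidden_units=[16, 8], dropouts=[0.0, 0.0],
                    scope_name="sketch_mlp", training=training)
    return out
# ---------------------------------------------------------------------------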
625 | 626 | def _resnet_block_mode2(x, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0, 627 | scope_name="_resnet_block_mode2", reuse=False): 628 | """A block that has a dense layer at shortcut. 629 | # Arguments 630 | input_tensor: input tensor 631 | kernel_size: default 3, the kernel size of middle conv layer at main path 632 | filters: list of integers, the filters of 3 conv layer at main path 633 | stage: integer, current stage label, used for generating layer names 634 | block: 'a','b'..., current block label, used for generating layer names 635 | # Returns 636 | Output tensor for the block. 637 | Note that from stage 3, the first conv layer at main path is with strides=(2,2) 638 | And the shortcut should have strides=(2,2) as well 639 | """ 640 | h1, h2, h3 = hidden_units 641 | dr1, dr2, dr3 = dropouts 642 | 643 | xs = [] 644 | # branch 0 645 | if dense_shortcut: 646 | with tf.variable_scope(scope_name, reuse=reuse): 647 | x0 = tf.layers.dense(x, h3, kernel_initializer=tf.glorot_uniform_initializer(seed * 1), 648 | bias_initializer=tf.zeros_initializer(), 649 | reuse=reuse, 650 | name=scope_name+"-dense-"+str("0")) 651 | xs.append(x0) 652 | else: 653 | xs.append(x) 654 | 655 | # branch 1 ~ cardinality 656 | for i in range(cardinality): 657 | xs.append(_resnet_branch_mode2(x, hidden_units, dropouts, training, seed, scope_name, reuse)) 658 | 659 | x = tf.add_n(xs) 660 | return x 661 | 662 | 663 | def resnet_block(input_tensor, hidden_units, dropouts, cardinality=1, dense_shortcut=False, training=False, seed=0, 664 | scope_name="resnet_block", reuse=False): 665 | return _resnet_block_mode2(input_tensor, hidden_units, dropouts, cardinality, dense_shortcut, training, seed, 666 | scope_name, reuse) 667 | 668 | 669 | def mlp_layer(input, fc_type, hidden_units, dropouts, scope_name, reuse=False, training=False, seed=0): 670 | if fc_type == "fc": 671 | output = dense_block(input, hidden_units=hidden_units, dropouts=dropouts, 672 | densenet=False, scope_name=scope_name, 673 | reuse=reuse, 674 | training=training, seed=seed) 675 | elif fc_type == "densenet": 676 | output = dense_block(input, hidden_units=hidden_units, dropouts=dropouts, 677 | densenet=True, scope_name=scope_name, 678 | reuse=reuse, 679 | training=training, seed=seed) 680 | elif fc_type == "resnet": 681 | output = resnet_block(input, hidden_units=hidden_units, dropouts=dropouts, 682 | cardinality=1, dense_shortcut=True, training=training, 683 | reuse=reuse, 684 | seed=seed, 685 | scope_name=scope_name) 686 | return output 687 | -------------------------------------------------------------------------------- /src/tf_common/optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=2&cad=rja&uact=8&ved=0ahUKEwih7-6VlejYAhWGS98KHWeLCWQQFgg3MAE&url=https%3A%2F%2Fwww.bigdatarepublic.nl%2Fcustom-optimizer-in-tensorflow%2F&usg=AOvVaw3jmxRDqr2pkGRLvX6rNJrl 4 | """ 5 | 6 | import tensorflow as tf 7 | from tensorflow.python.framework import constant_op 8 | from tensorflow.python.ops import random_ops 9 | from tensorflow.python.eager import context 10 | from tensorflow.python.framework import ops 11 | from tensorflow.python.ops import array_ops 12 | from tensorflow.python.ops import control_flow_ops 13 | from tensorflow.python.ops import math_ops 14 | from tensorflow.python.ops import state_ops 15 | from tensorflow.python.ops import variable_scope 16 | from tensorflow.python.training import 
optimizer 17 | from tensorflow.python.training import training_ops 18 | 19 | 20 | class LazyPowerSignOptimizer(optimizer.Optimizer): 21 | """Implementation of PowerSign. 22 | See [Bello et. al., 2017](https://arxiv.org/abs/1709.07417) 23 | @@__init__ 24 | """ 25 | 26 | def __init__(self, learning_rate=0.001, alpha=0.01, beta=0.5, use_locking=False, name="PowerSign"): 27 | super(LazyPowerSignOptimizer, self).__init__(use_locking, name) 28 | self._lr = learning_rate 29 | self._alpha = alpha 30 | self._beta = beta 31 | 32 | # Tensor versions of the constructor arguments, created in _prepare(). 33 | self._lr_t = None 34 | self._alpha_t = None 35 | self._beta_t = None 36 | 37 | def _prepare(self): 38 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 39 | self._alpha_t = ops.convert_to_tensor(self._beta, name="alpha_t") 40 | self._beta_t = ops.convert_to_tensor(self._beta, name="beta_t") 41 | 42 | def _create_slots(self, var_list): 43 | # Create slots for the first and second moments. 44 | for v in var_list: 45 | self._zeros_slot(v, "m", self._name) 46 | 47 | def _apply_dense(self, grad, var): 48 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 49 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 50 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 51 | 52 | eps = 1e-7 # cap for moving average 53 | 54 | m = self.get_slot(var, "m") 55 | m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad))) 56 | 57 | var_update = state_ops.assign_sub(var, lr_t * grad * tf.exp( 58 | tf.log(alpha_t) * tf.sign(grad) * tf.sign(m_t))) # Update 'ref' by subtracting 'value 59 | # Create an op that groups multiple operations. 60 | # When this op finishes, all ops in input have finished 61 | return control_flow_ops.group(*[var_update, m_t]) 62 | 63 | def _apply_sparse(self, grad, var): 64 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 65 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 66 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 67 | 68 | eps = 1e-7 # cap for moving average 69 | 70 | m = self.get_slot(var, "m") 71 | m_slice = tf.gather(m, grad.indices) 72 | m_t = state_ops.scatter_update(m, grad.indices, 73 | tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values))) 74 | m_t_slice = tf.gather(m_t, grad.indices) 75 | 76 | var_update = state_ops.scatter_sub(var, grad.indices, lr_t * grad.values * tf.exp( 77 | tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice))) # Update 'ref' by subtracting 'value 78 | # Create an op that groups multiple operations. 79 | # When this op finishes, all ops in input have finished 80 | return control_flow_ops.group(*[var_update, m_t]) 81 | 82 | 83 | class LazyAddSignOptimizer(optimizer.Optimizer): 84 | """Implementation of AddSign. 85 | See [Bello et. al., 2017](https://arxiv.org/abs/1709.07417) 86 | @@__init__ 87 | """ 88 | 89 | def __init__(self, learning_rate=1.001, alpha=0.01, beta=0.5, use_locking=False, name="AddSign"): 90 | super(LazyAddSignOptimizer, self).__init__(use_locking, name) 91 | self._lr = learning_rate 92 | self._alpha = alpha 93 | self._beta = beta 94 | 95 | # Tensor versions of the constructor arguments, created in _prepare(). 
96 | self._lr_t = None 97 | self._alpha_t = None 98 | self._beta_t = None 99 | 100 | def _prepare(self): 101 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 102 | self._alpha_t = ops.convert_to_tensor(self._beta, name="beta_t") 103 | self._beta_t = ops.convert_to_tensor(self._beta, name="beta_t") 104 | 105 | def _create_slots(self, var_list): 106 | # Create slots for the first and second moments. 107 | for v in var_list: 108 | self._zeros_slot(v, "m", self._name) 109 | 110 | def _apply_dense(self, grad, var): 111 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 112 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 113 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 114 | 115 | eps = 1e-7 # cap for moving average 116 | 117 | m = self.get_slot(var, "m") 118 | m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad))) 119 | 120 | var_update = state_ops.assign_sub(var, lr_t * grad * (1.0 + alpha_t * tf.sign(grad) * tf.sign(m_t))) 121 | # Create an op that groups multiple operations 122 | # When this op finishes, all ops in input have finished 123 | return control_flow_ops.group(*[var_update, m_t]) 124 | 125 | def _apply_sparse(self, grad, var): 126 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 127 | beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype) 128 | alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype) 129 | 130 | eps = 1e-7 # cap for moving average 131 | 132 | m = self.get_slot(var, "m") 133 | m_slice = tf.gather(m, grad.indices) 134 | m_t = state_ops.scatter_update(m, grad.indices, 135 | tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values))) 136 | m_t_slice = tf.gather(m_t, grad.indices) 137 | 138 | var_update = state_ops.scatter_sub(var, grad.indices, 139 | lr_t * grad.values * ( 140 | 1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice))) 141 | 142 | # Create an op that groups multiple operations 143 | # When this op finishes, all ops in input have finished 144 | return control_flow_ops.group(*[var_update, m_t]) 145 | 146 | 147 | class LazyAMSGradOptimizer(optimizer.Optimizer): 148 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 149 | use_locking=False, name="AMSGrad"): 150 | super(LazyAMSGradOptimizer, self).__init__(use_locking, name) 151 | self._lr = learning_rate 152 | self._beta1 = beta1 153 | self._beta2 = beta2 154 | self._epsilon = epsilon 155 | 156 | # Tensor versions of the constructor arguments, created in _prepare(). 157 | self._lr_t = None 158 | self._beta1_t = None 159 | self._beta2_t = None 160 | self._epsilon_t = None 161 | 162 | def _prepare(self): 163 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 164 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 165 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 166 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 167 | 168 | def _create_slots(self, var_list): 169 | # Create slots for the first and second moments. 
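        # AMSGrad also keeps a "v_prime" slot: the element-wise running maximum of the
        # second-moment estimate v. The update divides by sqrt(v_prime) instead of sqrt(v),
        # so the effective step size never increases (Reddi et al., 2018).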
170 | for v in var_list: 171 | self._zeros_slot(v, "m", self._name) 172 | self._zeros_slot(v, "v", self._name) 173 | self._zeros_slot(v, "v_prime", self._name) 174 | 175 | def _apply_dense(self, grad, var): 176 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 177 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 178 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 179 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 180 | 181 | # the following equations given in [1] 182 | # m_t = beta1 * m + (1 - beta1) * g_t 183 | m = self.get_slot(var, "m") 184 | m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking) 185 | 186 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 187 | v = self.get_slot(var, "v") 188 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 189 | v_prime = self.get_slot(var, "v_prime") 190 | v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t)) 191 | 192 | var_update = state_ops.assign_sub(var, 193 | lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t), 194 | use_locking=self._use_locking) 195 | 196 | return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime]) 197 | 198 | # keras Nadam update rule 199 | def _apply_sparse(self, grad, var): 200 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 201 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 202 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 203 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 204 | 205 | # the following equations given in [1] 206 | # m_t = beta1 * m + (1 - beta1) * g_t 207 | m = self.get_slot(var, "m") 208 | m_t = state_ops.scatter_update(m, grad.indices, 209 | beta1_t * array_ops.gather(m, grad.indices) + 210 | (1. - beta1_t) * grad.values, 211 | use_locking=self._use_locking) 212 | m_t_slice = tf.gather(m_t, grad.indices) 213 | 214 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 215 | v = self.get_slot(var, "v") 216 | v_t = state_ops.scatter_update(v, grad.indices, 217 | beta2_t * array_ops.gather(v, grad.indices) + 218 | (1. - beta2_t) * tf.square(grad.values), 219 | use_locking=self._use_locking) 220 | v_prime = self.get_slot(var, "v_prime") 221 | v_t_slice = tf.gather(v_t, grad.indices) 222 | v_prime_slice = tf.gather(v_prime, grad.indices) 223 | v_t_prime = state_ops.scatter_update(v_prime, grad.indices, tf.maximum(v_prime_slice, v_t_slice)) 224 | 225 | v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices) 226 | var_update = state_ops.scatter_sub(var, grad.indices, 227 | lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 228 | use_locking=self._use_locking) 229 | 230 | return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime]) 231 | 232 | 233 | class LazyNadamOptimizer(optimizer.Optimizer): 234 | def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, epsilon=1e-8, 235 | schedule_decay=0.004, use_locking=False, name="Nadam"): 236 | super(LazyNadamOptimizer, self).__init__(use_locking, name) 237 | self._lr = learning_rate 238 | self._beta1 = beta1 239 | self._beta2 = beta2 240 | self._epsilon = epsilon 241 | self._schedule_decay = schedule_decay 242 | # momentum cache decay 243 | self._momentum_cache_decay = tf.cast(0.96, tf.float32) 244 | self._momentum_cache_const = tf.pow(self._momentum_cache_decay, 1. * schedule_decay) 245 | 246 | # Tensor versions of the constructor arguments, created in _prepare(). 
247 | self._lr_t = None 248 | self._beta1_t = None 249 | self._beta2_t = None 250 | self._epsilon_t = None 251 | self._schedule_decay_t = None 252 | 253 | # Variables to accumulate the powers of the beta parameters. 254 | # Created in _create_slots when we know the variables to optimize. 255 | self._beta1_power = None 256 | self._beta2_power = None 257 | self._iterations = None 258 | self._m_schedule = None 259 | 260 | # Created in SparseApply if needed. 261 | self._updated_lr = None 262 | 263 | def _prepare(self): 264 | self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate") 265 | self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1") 266 | self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2") 267 | self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon") 268 | self._schedule_decay_t = ops.convert_to_tensor(self._schedule_decay, name="schedule_decay") 269 | 270 | def _create_slots(self, var_list): 271 | # Create the beta1 and beta2 accumulators on the same device as the first 272 | # variable. Sort the var_list to make sure this device is consistent across 273 | # workers (these need to go on the same PS, otherwise some updates are 274 | # silently ignored). 275 | first_var = min(var_list, key=lambda x: x.name) 276 | 277 | create_new = self._iterations is None 278 | if not create_new and context.in_graph_mode(): 279 | create_new = (self._iterations.graph is not first_var.graph) 280 | 281 | if create_new: 282 | with ops.colocate_with(first_var): 283 | self._beta1_power = variable_scope.variable(self._beta1, 284 | name="beta1_power", 285 | trainable=False) 286 | self._beta2_power = variable_scope.variable(self._beta2, 287 | name="beta2_power", 288 | trainable=False) 289 | self._iterations = variable_scope.variable(0., 290 | name="iterations", 291 | trainable=False) 292 | self._m_schedule = variable_scope.variable(1., 293 | name="m_schedule", 294 | trainable=False) 295 | # Create slots for the first and second moments. 296 | for v in var_list: 297 | self._zeros_slot(v, "m", self._name) 298 | self._zeros_slot(v, "v", self._name) 299 | 300 | def _get_momentum_cache(self, schedule_decay_t, t): 301 | return tf.pow(self._momentum_cache_decay, t * schedule_decay_t) 302 | # return beta1_t * (1. - 0.5 * (tf.pow(self._momentum_cache_decay, t * schedule_decay_t))) 303 | 304 | """very slow 305 | we simply use the nadam update rule without warming momentum schedule 306 | def _apply_dense(self, grad, var): 307 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 308 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 309 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 310 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 311 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 312 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 313 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 314 | 315 | # Due to the recommendations in [2], i.e. warming momentum schedule 316 | # see keras Nadam 317 | momentum_cache_t = self._get_momentum_cache(beta1_t, schedule_decay_t, t) 318 | momentum_cache_t_1 = self._get_momentum_cache(beta1_t, schedule_decay_t, t+1.) 319 | m_schedule_new = m_schedule * momentum_cache_t 320 | m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 321 | 322 | # the following equations given in [1] 323 | # m_t = beta1 * m + (1 - beta1) * g_t 324 | m = self.get_slot(var, "m") 325 | m_t = state_ops.assign(m, beta1_t * m + (1. 
- beta1_t) * grad, use_locking=self._use_locking) 326 | g_prime = grad / (1. - m_schedule_new) 327 | m_t_prime = m_t / (1. - m_schedule_next) 328 | m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime 329 | 330 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 331 | v = self.get_slot(var, "v") 332 | v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking) 333 | v_t_prime = v_t / (1. - tf.pow(beta2_t, t)) 334 | 335 | var_update = state_ops.assign_sub(var, 336 | lr_t * m_t_bar / (tf.sqrt(v_t_prime) + epsilon_t), 337 | use_locking=self._use_locking) 338 | 339 | return control_flow_ops.group(*[var_update, m_t, v_t]) 340 | """ 341 | 342 | # nadam update rule without warming momentum schedule 343 | def _apply_dense(self, grad, var): 344 | m = self.get_slot(var, "m") 345 | v = self.get_slot(var, "v") 346 | return training_ops.apply_adam( 347 | var, 348 | m, 349 | v, 350 | math_ops.cast(self._beta1_power, var.dtype.base_dtype), 351 | math_ops.cast(self._beta2_power, var.dtype.base_dtype), 352 | math_ops.cast(self._lr_t, var.dtype.base_dtype), 353 | math_ops.cast(self._beta1_t, var.dtype.base_dtype), 354 | math_ops.cast(self._beta2_t, var.dtype.base_dtype), 355 | math_ops.cast(self._epsilon_t, var.dtype.base_dtype), 356 | grad, 357 | use_locking=self._use_locking, 358 | use_nesterov=True).op 359 | 360 | def _resource_apply_dense(self, grad, var): 361 | m = self.get_slot(var, "m") 362 | v = self.get_slot(var, "v") 363 | return training_ops.resource_apply_adam( 364 | var.handle, 365 | m.handle, 366 | v.handle, 367 | math_ops.cast(self._beta1_power, grad.dtype.base_dtype), 368 | math_ops.cast(self._beta2_power, grad.dtype.base_dtype), 369 | math_ops.cast(self._lr_t, grad.dtype.base_dtype), 370 | math_ops.cast(self._beta1_t, grad.dtype.base_dtype), 371 | math_ops.cast(self._beta2_t, grad.dtype.base_dtype), 372 | math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), 373 | grad, 374 | use_locking=self._use_locking, 375 | use_nesterov=True) 376 | 377 | # keras Nadam update rule 378 | def _apply_sparse(self, grad, var): 379 | t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1. 380 | m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype) 381 | lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) 382 | beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) 383 | beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) 384 | epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) 385 | schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype) 386 | 387 | # Due to the recommendations in [2], i.e. warming momentum schedule 388 | momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t) 389 | momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power) 390 | momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const) 391 | m_schedule_new = m_schedule * momentum_cache_t 392 | m_schedule_next = m_schedule_new * momentum_cache_t_1 393 | 394 | # the following equations given in [1] 395 | # m_t = beta1 * m + (1 - beta1) * g_t 396 | m = self.get_slot(var, "m") 397 | m_t = state_ops.scatter_update(m, grad.indices, 398 | beta1_t * array_ops.gather(m, grad.indices) + 399 | (1. - beta1_t) * grad.values, 400 | use_locking=self._use_locking) 401 | g_prime_slice = grad.values / (1. - m_schedule_new) 402 | m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next) 403 | m_t_bar_slice = (1. 
- momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice 404 | 405 | # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 406 | v = self.get_slot(var, "v") 407 | v_t = state_ops.scatter_update(v, grad.indices, 408 | beta2_t * array_ops.gather(v, grad.indices) + 409 | (1. - beta2_t) * tf.square(grad.values), 410 | use_locking=self._use_locking) 411 | v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t)) 412 | 413 | var_update = state_ops.scatter_sub(var, grad.indices, 414 | lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t), 415 | use_locking=self._use_locking) 416 | 417 | return control_flow_ops.group(*[var_update, m_t, v_t]) 418 | 419 | def _finish(self, update_ops, name_scope): 420 | # Update the power accumulators. 421 | with ops.control_dependencies(update_ops): 422 | with ops.colocate_with(self._iterations): 423 | update_beta1 = self._beta1_power.assign( 424 | self._beta1_power * self._beta1_t, 425 | use_locking=self._use_locking) 426 | update_beta2 = self._beta2_power.assign( 427 | self._beta2_power * self._beta2_t, 428 | use_locking=self._use_locking) 429 | t = self._iterations + 1. 430 | update_iterations = self._iterations.assign(t, use_locking=self._use_locking) 431 | momentum_cache_power = self._get_momentum_cache(self._schedule_decay_t, t) 432 | momentum_cache_t = self._beta1_t * (1. - 0.5 * momentum_cache_power) 433 | update_m_schedule = self._m_schedule.assign( 434 | self._m_schedule * momentum_cache_t, 435 | use_locking=self._use_locking) 436 | return control_flow_ops.group( 437 | *update_ops + [update_beta1, update_beta2] + [update_iterations, update_m_schedule], 438 | name=name_scope) -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenglongChen/tensorflow-DSMM/52a499a162f3837aa11bb1bb4c1029accfe5743d/src/utils/__init__.py -------------------------------------------------------------------------------- /src/utils/dist_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for distance computation 5 | """ 6 | 7 | import warnings 8 | warnings.filterwarnings("ignore") 9 | import numpy as np 10 | try: 11 | import lzma 12 | import Levenshtein 13 | except: 14 | pass 15 | from difflib import SequenceMatcher 16 | from rouge import Rouge 17 | from utils import ngram_utils, np_utils 18 | 19 | 20 | def _edit_dist(str1, str2): 21 | try: 22 | # very fast 23 | # http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed 24 | # d = Levenshtein.ratio(str1, str2) 25 | d = Levenshtein.distance(str1, str2)/float(max(len(str1),len(str2))) 26 | except: 27 | # https://docs.python.org/2/library/difflib.html 28 | d = 1. 
- SequenceMatcher(lambda x: x==" ", str1, str2).ratio() 29 | return d 30 | 31 | 32 | def _longest_match_size(str1, str2): 33 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 34 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 35 | return match.size 36 | 37 | 38 | def _longest_match_ratio(str1, str2): 39 | sq = SequenceMatcher(lambda x: x==" ", str1, str2) 40 | match = sq.find_longest_match(0, len(str1), 0, len(str2)) 41 | return np_utils._try_divide(match.size, min(len(str1), len(str2))) 42 | 43 | 44 | def _common_num(s1, s2): 45 | c = 0 46 | for s1_ in s1: 47 | for s2_ in s2: 48 | if s1_ == s2_: 49 | c += 1 50 | return c 51 | 52 | 53 | def _count_stats(s1, s2): 54 | # length 55 | l1 = len(s1) 56 | l2 = len(s2) 57 | len_diff = np_utils._try_divide(np.abs(l1-l2), (l1+l2)/2.) 58 | 59 | # set 60 | s1_set = set(s1) 61 | s2_set = set(s2) 62 | 63 | # unique length 64 | l1_unique = len(s1_set) 65 | l2_unique = len(s2_set) 66 | len_diff_unique = np_utils._try_divide(np.abs(l1_unique-l2_unique), (l1_unique+l2_unique)/2.) 67 | 68 | # unique ratio 69 | r1_unique = np_utils._try_divide(l1_unique, l1) 70 | r2_unique = np_utils._try_divide(l2_unique, l2) 71 | 72 | # jaccard coef 73 | li = len(s1_set.intersection(s2_set)) 74 | lu = len(s1_set.union(s2_set)) 75 | jaccard_coef = np_utils._try_divide(li, lu) 76 | 77 | # dice coef 78 | dice_coef = np_utils._try_divide(li, l1_unique + l2_unique) 79 | 80 | # common number 81 | common_ = _common_num(s1, s2) 82 | common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.) 83 | common_ratio_max = np_utils._try_divide(common_, min(l1, l2)) 84 | common_ratio_min = np_utils._try_divide(common_, max(l1, l2)) 85 | 86 | # over all features 87 | f = [l1, l2, len_diff, 88 | l1_unique, l2_unique, len_diff_unique, 89 | r1_unique, r2_unique, 90 | li, lu, jaccard_coef, dice_coef, 91 | common_, common_ratio_avg, common_ratio_max, common_ratio_min 92 | ] 93 | return np.array(f, dtype=np.float32) 94 | 95 | 96 | rouge = Rouge() 97 | def _get_rouge_feat(s1, s2): 98 | if isinstance(s1, list): 99 | s1 = " ".join(s1) 100 | if isinstance(s2, list): 101 | s2 = " ".join(s2) 102 | scores = rouge.get_scores(s1, s2) 103 | feat = [] 104 | for k,v in scores[0].items(): 105 | feat.extend(v.values()) 106 | return np.array(feat, dtype=np.float32) 107 | 108 | 109 | def _get_bleu(s1, s2): 110 | count_dict={} 111 | count_dict_clip={} 112 | #1. count for each token at predict sentence side. 113 | for token in s1: 114 | if token not in count_dict: 115 | count_dict[token]=1 116 | else: 117 | count_dict[token]=count_dict[token]+1 118 | count=np.sum([value for key,value in count_dict.items()]) 119 | 120 | #2.count for tokens existing in predict sentence for target sentence side. 121 | for token in s2: 122 | if token in count_dict: 123 | if token not in count_dict_clip: 124 | count_dict_clip[token]=1 125 | else: 126 | count_dict_clip[token]=count_dict_clip[token]+1 127 | 128 | #3. 
clip value to ceiling value for that token 129 | count_dict_clip={key:(value if value<=count_dict[key] else count_dict[key]) for key,value in count_dict_clip.items()} 130 | count_clip=np.sum([value for key,value in count_dict_clip.items()]) 131 | result=float(count_clip)/(float(count)+0.00000001) 132 | return result 133 | 134 | 135 | def _get_bleu_feat(s1, s2, ngrams=3): 136 | if isinstance(s1, str): 137 | s1 = s1.split(" ") 138 | if isinstance(s2, str): 139 | s2 = s2.split(" ") 140 | feat = [] 141 | for ngram in range(ngrams+1): 142 | s1_ngram = ngram_utils._ngrams(s1, ngram+1, "_") 143 | s2_ngram = ngram_utils._ngrams(s2, ngram+1, "_") 144 | feat.append(_get_bleu(s1_ngram, s2_ngram)) 145 | return np.array(feat, dtype=np.float32) 146 | 147 | 148 | 149 | if __name__ == "__main__": 150 | s1 = ["W1", "W2", "W3", "W4", "W10"] 151 | s2 = ["W1", "W2", "W4", "W6", "W8"] 152 | print(_count_stats(s1, s2)) 153 | print(_edit_dist(s1, s2)) 154 | print(_longest_match_size(s1, s2)) 155 | print(_longest_match_ratio(s1, s2)) 156 | print(_get_rouge_feat(s1, s2)) 157 | print(_get_bleu_feat(s1, s2)) -------------------------------------------------------------------------------- /src/utils/log_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import logging 4 | import logging.handlers 5 | 6 | 7 | def _get_logger(logdir, logname, loglevel=logging.INFO): 8 | fmt = "[%(asctime)s] %(levelname)s: %(message)s" 9 | formatter = logging.Formatter(fmt) 10 | 11 | handler = logging.handlers.RotatingFileHandler( 12 | filename=os.path.join(logdir, logname), 13 | maxBytes=2 * 1024 * 1024 * 1024, 14 | backupCount=10) 15 | handler.setFormatter(formatter) 16 | 17 | logger = logging.getLogger("") 18 | logger.addHandler(handler) 19 | logger.setLevel(loglevel) 20 | return logger 21 | -------------------------------------------------------------------------------- /src/utils/ngram_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: Chenglong Chen 4 | @brief: utils for ngram 5 | """ 6 | 7 | 8 | def _unigrams(words): 9 | """ 10 | Input: a list of words, e.g., ["I", "am", "Denny"] 11 | Output: a list of unigram 12 | """ 13 | assert type(words) == list 14 | return words 15 | 16 | 17 | def _bigrams(words, join_string, skip=0): 18 | """ 19 | Input: a list of words, e.g., ["I", "am", "Denny"] 20 | Output: a list of bigram, e.g., ["I_am", "am_Denny"] 21 | I use _ as join_string for this example. 22 | """ 23 | assert type(words) == list 24 | L = len(words) 25 | if L > 1: 26 | lst = [] 27 | for i in range(L-1): 28 | for k in range(1,skip+2): 29 | if i+k < L: 30 | lst.append( join_string.join([words[i], words[i+k]]) ) 31 | else: 32 | # set it as unigram 33 | lst = _unigrams(words) 34 | return lst 35 | 36 | 37 | def _trigrams(words, join_string, skip=0): 38 | """ 39 | Input: a list of words, e.g., ["I", "am", "Denny"] 40 | Output: a list of trigram, e.g., ["I_am_Denny"] 41 | I use _ as join_string for this example. 
42 | """ 43 | assert type(words) == list 44 | L = len(words) 45 | if L > 2: 46 | lst = [] 47 | for i in range(L-2): 48 | for k1 in range(1,skip+2): 49 | for k2 in range(1,skip+2): 50 | if i+k1 < L and i+k1+k2 < L: 51 | lst.append( join_string.join([words[i], words[i+k1], words[i+k1+k2]]) ) 52 | else: 53 | # set it as bigram 54 | lst = _bigrams(words, join_string, skip) 55 | return lst 56 | 57 | 58 | def _fourgrams(words, join_string): 59 | """ 60 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 61 | Output: a list of trigram, e.g., ["I_am_Denny_boy"] 62 | I use _ as join_string for this example. 63 | """ 64 | assert type(words) == list 65 | L = len(words) 66 | if L > 3: 67 | lst = [] 68 | for i in range(L-3): 69 | lst.append( join_string.join([words[i], words[i+1], words[i+2], words[i+3]]) ) 70 | else: 71 | # set it as trigram 72 | lst = _trigrams(words, join_string) 73 | return lst 74 | 75 | 76 | def _uniterms(words): 77 | return _unigrams(words) 78 | 79 | 80 | def _biterms(words, join_string): 81 | """ 82 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 83 | Output: a list of biterm, e.g., ["I_am", "I_Denny", "I_boy", "am_Denny", "am_boy", "Denny_boy"] 84 | I use _ as join_string for this example. 85 | """ 86 | assert type(words) == list 87 | L = len(words) 88 | if L > 1: 89 | lst = [] 90 | for i in range(L-1): 91 | for j in range(i+1,L): 92 | lst.append( join_string.join([words[i], words[j]]) ) 93 | else: 94 | # set it as uniterm 95 | lst = _uniterms(words) 96 | return lst 97 | 98 | 99 | def _triterms(words, join_string): 100 | """ 101 | Input: a list of words, e.g., ["I", "am", "Denny", "boy"] 102 | Output: a list of triterm, e.g., ["I_am_Denny", "I_am_boy", "I_Denny_boy", "am_Denny_boy"] 103 | I use _ as join_string for this example. 104 | """ 105 | assert type(words) == list 106 | L = len(words) 107 | if L > 2: 108 | lst = [] 109 | for i in range(L-2): 110 | for j in range(i+1,L-1): 111 | for k in range(j+1,L): 112 | lst.append( join_string.join([words[i], words[j], words[k]]) ) 113 | else: 114 | # set it as biterm 115 | lst = _biterms(words, join_string) 116 | return lst 117 | 118 | 119 | def _fourterms(words, join_string): 120 | """ 121 | Input: a list of words, e.g., ["I", "am", "Denny", "boy", "ha"] 122 | Output: a list of fourterm, e.g., ["I_am_Denny_boy", "I_am_Denny_ha", "I_am_boy_ha", "I_Denny_boy_ha", "am_Denny_boy_ha"] 123 | I use _ as join_string for this example. 
124 | """ 125 | assert type(words) == list 126 | L = len(words) 127 | if L > 3: 128 | lst = [] 129 | for i in range(L-3): 130 | for j in range(i+1,L-2): 131 | for k in range(j+1,L-1): 132 | for l in range(k+1,L): 133 | lst.append( join_string.join([words[i], words[j], words[k], words[l]]) ) 134 | else: 135 | # set it as triterm 136 | lst = _triterms(words, join_string) 137 | return lst 138 | 139 | 140 | _ngram_str_map = { 141 | 1: "Unigram", 142 | 2: "Bigram", 143 | 3: "Trigram", 144 | 4: "Fourgram", 145 | 5: "Fivegram", 146 | 12: "UBgram", 147 | 123: "UBTgram", 148 | } 149 | 150 | 151 | def _ngrams(words, ngram, join_string=" "): 152 | """wrapper for ngram""" 153 | if ngram == 1: 154 | return _unigrams(words) 155 | elif ngram == 2: 156 | return _bigrams(words, join_string) 157 | elif ngram == 3: 158 | return _trigrams(words, join_string) 159 | elif ngram == 4: 160 | return _fourgrams(words, join_string) 161 | elif ngram == 12: 162 | unigram = _unigrams(words) 163 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 164 | return unigram + bigram 165 | elif ngram == 123: 166 | unigram = _unigrams(words) 167 | bigram = [x for x in _bigrams(words, join_string) if len(x.split(join_string)) == 2] 168 | trigram = [x for x in _trigrams(words, join_string) if len(x.split(join_string)) == 3] 169 | return unigram + bigram + trigram 170 | 171 | 172 | _nterm_str_map = { 173 | 1: "Uniterm", 174 | 2: "Biterm", 175 | 3: "Triterm", 176 | 4: "Fourterm", 177 | 5: "Fiveterm", 178 | } 179 | 180 | 181 | def _nterms(words, nterm, join_string=" "): 182 | """wrapper for nterm""" 183 | if nterm == 1: 184 | return _uniterms(words) 185 | elif nterm == 2: 186 | return _biterms(words, join_string) 187 | elif nterm == 3: 188 | return _triterms(words, join_string) 189 | elif nterm == 4: 190 | return _fourterms(words, join_string) 191 | 192 | 193 | if __name__ == "__main__": 194 | 195 | text = "I am Denny boy ha" 196 | words = text.split(" ") 197 | 198 | assert _ngrams(words, 1) == ["I", "am", "Denny", "boy", "ha"] 199 | assert _ngrams(words, 2) == ["I am", "am Denny", "Denny boy", "boy ha"] 200 | assert _ngrams(words, 3) == ["I am Denny", "am Denny boy", "Denny boy ha"] 201 | assert _ngrams(words, 4) == ["I am Denny boy", "am Denny boy ha"] 202 | 203 | assert _nterms(words, 1) == ["I", "am", "Denny", "boy", "ha"] 204 | assert _nterms(words, 2) == ["I am", "I Denny", "I boy", "I ha", "am Denny", "am boy", "am ha", "Denny boy", "Denny ha", "boy ha"] 205 | assert _nterms(words, 3) == ["I am Denny", "I am boy", "I am ha", "I Denny boy", "I Denny ha", "I boy ha", "am Denny boy", "am Denny ha", "am boy ha", "Denny boy ha"] 206 | assert _nterms(words, 4) == ["I am Denny boy", "I am Denny ha", "I am boy ha", "I Denny boy ha", "am Denny boy ha"] -------------------------------------------------------------------------------- /src/utils/np_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _try_divide(x, y, val=0.0): 5 | """try to divide two numbers""" 6 | if y != 0.0: 7 | val = float(x) / y 8 | return val 9 | -------------------------------------------------------------------------------- /src/utils/os_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import shutil 4 | 5 | 6 | def _makedirs(dir, force=False): 7 | if os.path.exists(dir): 8 | if force: 9 | shutil.rmtree(dir) 10 | os.makedirs(dir) 11 | else: 12 | os.makedirs(dir) 13 | 
-------------------------------------------------------------------------------- /src/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | 4 | 5 | def _timestamp(): 6 | now = datetime.datetime.now() 7 | now_str = now.strftime("%Y%m%d%H%M") 8 | return now_str -------------------------------------------------------------------------------- /src/utils/topk_utils.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import defaultdict 3 | from random import randint 4 | 5 | 6 | # Bucket Sort 7 | # Time: O(n + klogk) ~ O(n + nlogn) 8 | # Space: O(n) 9 | class BucketSort(object): 10 | def topKFrequent(self, words, k): 11 | counts = defaultdict(int) 12 | for ws in words: 13 | for w in ws: 14 | counts[w] += 1 15 | 16 | buckets = [[] for _ in range(sum(counts.values()) + 1)]  # one independent list per bucket; [[]] * n would alias a single shared list 17 | for i, count in counts.items(): 18 | buckets[count].append(i) 19 | 20 | result = [] 21 | # result_append = result.append 22 | for i in reversed(range(len(buckets))): 23 | for j in range(len(buckets[i])): 24 | # slower 25 | # result_append(buckets[i][j]) 26 | result.append(buckets[i][j]) 27 | if len(result) == k: 28 | return result 29 | return result 30 | 31 | 32 | # Quick Select 33 | # Time: O(n) ~ O(n^2), O(n) on average. 34 | # Space: O(n) 35 | class QuickSelect(object): 36 | def topKFrequent(self, words, k): 37 | """ 38 | :type words: List[List[str]] 39 | :type k: int 40 | :rtype: List[str] 41 | """ 42 | counts = defaultdict(int) 43 | for ws in words: 44 | for w in ws: 45 | counts[w] += 1 46 | p = [] 47 | for key, val in counts.items(): 48 | p.append((-val, key)) 49 | self.kthElement(p, k) 50 | 51 | result = [] 52 | sorted_p = sorted(p[:k]) 53 | for i in range(k): 54 | result.append(sorted_p[i][1]) 55 | return result 56 | 57 | def kthElement(self, nums, k): # O(n) on average 58 | def PartitionAroundPivot(left, right, pivot_idx, nums): 59 | pivot_value = nums[pivot_idx] 60 | new_pivot_idx = left 61 | nums[pivot_idx], nums[right] = nums[right], nums[pivot_idx] 62 | for i in range(left, right): 63 | if nums[i] < pivot_value: 64 | nums[i], nums[new_pivot_idx] = nums[new_pivot_idx], nums[i] 65 | new_pivot_idx += 1 66 | 67 | nums[right], nums[new_pivot_idx] = nums[new_pivot_idx], nums[right] 68 | return new_pivot_idx 69 | 70 | left, right = 0, len(nums) - 1 71 | while left <= right: 72 | pivot_idx = randint(left, right) 73 | new_pivot_idx = PartitionAroundPivot(left, right, pivot_idx, nums) 74 | if new_pivot_idx == k - 1: 75 | return 76 | elif new_pivot_idx > k - 1: 77 | right = new_pivot_idx - 1 78 | else: # new_pivot_idx < k - 1. 79 | left = new_pivot_idx + 1 80 | 81 | 82 | top_k_selector = BucketSort() --------------------------------------------------------------------------------
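For reference, a minimal usage sketch of the selectors in `topk_utils.py`: both expect a list of token lists (e.g., tokenized questions) and return the `k` most frequent tokens. The sample data and the import path below are assumptions for illustration only.

```python
# Hypothetical input: a list of tokenized sentences, as consumed by topKFrequent.
from utils.topk_utils import BucketSort, QuickSelect  # import path is an assumption

docs = [
    ["W1", "W2", "W2", "W3"],
    ["W2", "W3", "W3", "W3"],
    ["W1", "W2"],
]

# Token counts here are W1:2, W2:4, W3:4, so the top-2 tokens are W2 and W3.
print(BucketSort().topKFrequent(docs, 2))   # ['W2', 'W3']
print(QuickSelect().topKFrequent(docs, 2))  # ['W2', 'W3'] (tie-breaking may differ in general)
```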