├── .gitignore ├── LICENSE ├── README.md ├── docs └── pics │ ├── framework.png │ └── logo.png ├── examples ├── run_bert_classifier.py ├── run_pnasnet_classifier.py └── tutorial │ ├── ALBERTBinaryClassifier.ipynb │ ├── ALBERTClassifier.ipynb │ ├── ALBERTLM.ipynb │ ├── ALBERTMRC.ipynb │ ├── ALBERTSeqClassifier.ipynb │ ├── AdaBERTClassifier.ipynb │ ├── BERTBinaryClassifier.ipynb │ ├── BERTCRFCascadeNER.ipynb │ ├── BERTCRFNER.ipynb │ ├── BERTClassifier.ipynb │ ├── BERTLM.ipynb │ ├── BERTMRC.ipynb │ ├── BERTNER.ipynb │ ├── BERTRegressor.ipynb │ ├── BERTSeqClassifier.ipynb │ ├── BERTSeqCrossClassifier.ipynb │ ├── BERTTmpBinaryClassifier.ipynb │ ├── BERTVerifierMRC.ipynb │ ├── BiRNNClassifier.ipynb │ ├── ELECTRABinaryClassifier.ipynb │ ├── ELECTRAClassifier.ipynb │ ├── ELECTRALM.ipynb │ ├── ELECTRAMRC.ipynb │ ├── ELECTRASeqClassifier.ipynb │ ├── FastBERTClassifier.ipynb │ ├── GPT2LM.ipynb │ ├── MotianClassifier.ipynb │ ├── PNasNetClassifier.ipynb │ ├── PerformerClassifier.ipynb │ ├── RNNClassifier.ipynb │ ├── RecBERT2LM.ipynb │ ├── RecBERT3LM.ipynb │ ├── RecBERTLM.ipynb │ ├── RetroReaderMRC.ipynb │ ├── RoBERTaBinaryClassifier.ipynb │ ├── RoBERTaClassifier.ipynb │ ├── RoBERTaLM.ipynb │ ├── RoBERTaMRC.ipynb │ ├── RoBERTaSeqClassifier.ipynb │ ├── SANetMRC.ipynb │ ├── SQPLM.ipynb │ ├── SemBERTClassifier.ipynb │ ├── TextCNNClassifier.ipynb │ ├── TinyBERTBinaryClassifier.ipynb │ ├── TinyBERTClassifier.ipynb │ ├── TransformerMT.ipynb │ ├── UDAClassifier.ipynb │ ├── UniLM.ipynb │ ├── UniLMPrompt.ipynb │ ├── VAELM.ipynb │ ├── WideDeepClassifier.ipynb │ ├── WideDeepRegressor.ipynb │ ├── XLNetBinaryClassifier.ipynb │ └── XLNetClassifier.ipynb ├── ref ├── albert_config.json ├── bert_config.json ├── spiece.model ├── vocab.txt └── xlnet_config.json ├── setup.py └── uf ├── __init__.py ├── apps ├── __init__.py ├── _base_ │ ├── __init__.py │ ├── _base_.py │ ├── _base_binary_classifier.py │ ├── _base_classifier.py │ ├── _base_lm.py │ ├── _base_mrc.py │ ├── _base_mt.py │ ├── _base_ner.py │ ├── _base_regressor.py │ └── _base_seq_classifier.py ├── adabert │ ├── __init__.py │ ├── adabert.py │ └── adabert_classifier.py ├── albert │ ├── __init__.py │ ├── albert.py │ ├── albert_binary_classifier.py │ ├── albert_classifier.py │ ├── albert_lm.py │ ├── albert_mrc.py │ └── albert_seq_classifier.py ├── bert │ ├── __init__.py │ ├── bert.py │ ├── bert_binary_classifier.py │ ├── bert_classifier.py │ ├── bert_crf_cascade_ner.py │ ├── bert_crf_ner.py │ ├── bert_lm.py │ ├── bert_mrc.py │ ├── bert_ner.py │ ├── bert_regressor.py │ ├── bert_seq_classifier.py │ ├── bert_seq_cross_classifier.py │ ├── bert_tmp_binary_classifier.py │ └── bert_verifier_mrc.py ├── chatbot │ ├── __init__.py │ ├── chatbot.py │ └── chatbot_mt.py ├── crf │ ├── __init__.py │ └── crf.py ├── dilated │ ├── __init__.py │ ├── dilated.py │ └── dilated_lm.py ├── electra │ ├── __init__.py │ ├── electra.py │ ├── electra_binary_classifier.py │ ├── electra_classifier.py │ ├── electra_lm.py │ ├── electra_mrc.py │ └── electra_seq_classifier.py ├── fastbert │ ├── __init__.py │ ├── fastbert.py │ └── fastbert_classifier.py ├── gpt2 │ ├── __init__.py │ ├── gpt2.py │ └── gpt2_lm.py ├── motian │ ├── __init__.py │ ├── motian.py │ └── motian_classifier.py ├── nasnet │ ├── __init__.py │ ├── nasnet.py │ ├── nasnet_utils.py │ ├── pnasnet.py │ └── pnasnet_classifier.py ├── performer │ ├── __init__.py │ ├── performer.py │ └── performer_classifier.py ├── recbert │ ├── __init__.py │ ├── recbert.py │ ├── recbert2.py │ ├── recbert2_lm.py │ ├── recbert3.py │ ├── recbert3_lm.py │ └── 
recbert_lm.py ├── retroreader │ ├── __init__.py │ ├── retroreader.py │ └── retroreader_mrc.py ├── rnn │ ├── __init__.py │ ├── bi_rnn.py │ ├── bi_rnn_classifier.py │ ├── rnn.py │ └── rnn_classifier.py ├── roberta │ ├── __init__.py │ ├── roberta.py │ ├── roberta_binary_classifier.py │ ├── roberta_classifier.py │ ├── roberta_lm.py │ ├── roberta_mrc.py │ └── roberta_seq_classifier.py ├── sanet │ ├── __init__.py │ ├── sanet.py │ └── sanet_mrc.py ├── sembert │ ├── __init__.py │ ├── sembert.py │ └── sembert_classifier.py ├── spe │ ├── __init__.py │ ├── spe.py │ └── spe_lm.py ├── sqp │ ├── __init__.py │ ├── sqp.py │ └── sqp_lm.py ├── stockbert │ ├── __init__.py │ ├── stockbert.py │ └── stockbert_classifier.py ├── textcnn │ ├── __init__.py │ ├── textcnn.py │ └── textcnn_classifier.py ├── tinybert │ ├── __init__.py │ ├── tinybert.py │ ├── tinybert_binary_classifier.py │ └── tinybert_classifier.py ├── transformer │ ├── __init__.py │ ├── transformer.py │ └── transformer_mt.py ├── uda │ ├── __init__.py │ ├── uda.py │ └── uda_classifier.py ├── unilm │ ├── __init__.py │ ├── unilm.py │ ├── unilm_lm.py │ └── unilm_prompt.py ├── util.py ├── vae │ ├── __init__.py │ ├── vae.py │ └── vae_lm.py ├── widedeep │ ├── __init__.py │ ├── widedeep.py │ ├── widedeep_classifier.py │ └── widedeep_regressor.py └── xlnet │ ├── __init__.py │ ├── xlnet.py │ ├── xlnet_binary_classifier.py │ ├── xlnet_classifier.py │ ├── xlnet_lm.py │ └── xlnet_seq_classifier.py ├── com ├── __init__.py ├── cache.py ├── checkpoint.py ├── com.py ├── graph.py ├── parallel.py ├── resource.py ├── text.py └── tfrecords.py ├── core.py ├── opt.py ├── task ├── __init__.py ├── _base_.py ├── export.py ├── infer.py ├── init.py ├── score.py ├── train.py └── train_adversarial.py ├── third.py └── token ├── __init__.py ├── sentencepiece.py └── wordpiece.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # 隐藏文件 3 | .* 4 | */.* 5 | */*/.* 6 | */*/*/.* 7 | 8 | # 压缩文件 9 | *.tar.gz 10 | *.zip 11 | 12 | # 安装生成文件 13 | build 14 | dist 15 | uf.egg-info 16 | 17 | # 无关文件 18 | */__pycache__ 19 | */*/__pycache__ 20 | */*/*/__pycache__ 21 | */*/*/*/__pycache__ 22 | docs/*.pptx 23 | *.sh 24 | PLAN.md 25 | modify.py 26 | tmp.* 27 | test.py 28 | replace.py 29 | alarm.mp3 30 | log 31 | tf_slim 32 | data 33 | checkpoint 34 | pretrained 35 | pnasnet 36 | -------------------------------------------------------------------------------- /docs/pics/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/docs/pics/framework.png -------------------------------------------------------------------------------- /docs/pics/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/docs/pics/logo.png -------------------------------------------------------------------------------- /examples/run_bert_classifier.py: -------------------------------------------------------------------------------- 1 | import uf 2 | import numpy as np 3 | 4 | 5 | def get_best_f1(probs, labels, label_index=1): 6 | """ Calculate the best f1 by scanning over probabilities. 
""" 7 | 8 | assert len(probs) == len(labels) 9 | probs = np.array(probs) 10 | labels = np.array(labels) 11 | 12 | # initialize metrics 13 | n = np.sum(labels == label_index) 14 | tp = n 15 | fp = len(labels) - n 16 | fn = 0 17 | tn = 0 18 | accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 19 | precision = tp / max(tp + fp, 1) 20 | recall = tp / max(tp + fn, 1) 21 | f1 = 2 * precision * recall / max(precision + recall, 1) 22 | threshold = 0 23 | 24 | ids = sorted(range(len(probs)), key=lambda i: probs[i]) 25 | for i in ids: 26 | prob = probs[i] 27 | label = labels[i] 28 | if label == label_index: 29 | tp -= 1 30 | fn += 1 31 | elif label != label_index: 32 | fp -= 1 33 | tn += 1 34 | 35 | _accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 36 | _precision = tp / max(tp + fp, 1) 37 | _recall = tp / max(tp + fn, 1) 38 | _f1 = 2 * _precision * _recall / max(_precision + _recall, 1) 39 | if _f1 > f1: 40 | accuracy = _accuracy 41 | precision = _precision 42 | recall = _recall 43 | f1 = _f1 44 | threshold = prob 45 | return (n, accuracy, precision, recall, f1, threshold) 46 | 47 | 48 | def main(): 49 | 50 | uf.set_log("./log") 51 | 52 | # load data 53 | X, y = [], [] 54 | X_dev, y_dev = [], [] 55 | with open("sst-2/train.txt", encoding="utf-8") as f: 56 | for i, line in enumerate(f): 57 | if i == 0: # ignore title 58 | continue 59 | query, label = line.strip("\n").split("\t") 60 | X.append(query) 61 | y.append(int(label)) 62 | with open("sst-2/dev.txt", encoding="utf-8") as f: 63 | for i, line in enumerate(f): 64 | if i == 0: # ignore title 65 | continue 66 | query, label = line.strip("\n").split("\t") 67 | X_dev.append(query) 68 | y_dev.append(int(label)) 69 | 70 | # modeling 71 | checkpoint_dir = "pretrained/bert-base-zh" 72 | model = uf.BERTClassifier( 73 | config_file=f"{checkpoint_dir}/bert_config.json", 74 | vocab_file=f"{checkpoint_dir}/vocab.txt", 75 | max_seq_length=32, 76 | label_size=2, 77 | init_checkpoint=checkpoint_dir, 78 | output_dir="bert", 79 | gpu_ids="0") 80 | 81 | # training 82 | for epoch in range(3): 83 | model.fit( 84 | X, y, 85 | batch_size=64, 86 | target_steps=-(epoch + 1), 87 | total_steps=-3, 88 | print_per_secs=5, 89 | save_per_steps=3000) 90 | model.localize("bp.%d" % epoch) 91 | 92 | # validation 93 | probs = model.predict(X_dev)["probs"] 94 | for i in range(2): 95 | n, acc, pre, rec, f1, thresh = get_best_f1(probs=probs[:, i], labels=y_dev, label_index=i) 96 | print("[dev] label %d (%d): accuracy %.3f, precision %.3f, recall %.3f, best_f1 %.3f, thresh >%s" 97 | % (i, n, acc, pre, rec, f1, thresh)) 98 | 99 | print("Application finished.") 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /examples/run_pnasnet_classifier.py: -------------------------------------------------------------------------------- 1 | import uf 2 | import pickle 3 | import numpy as np 4 | 5 | 6 | def get_best_f1(probs, labels, label_index=1): 7 | """ Calculate the best f1 by scanning over probabilities. 
""" 8 | assert len(probs) == len(labels) 9 | probs = np.array(probs) 10 | labels = np.array(labels) 11 | 12 | # initialize metrics 13 | n = np.sum(labels == label_index) 14 | tp = n 15 | fp = len(labels) - n 16 | fn = 0 17 | tn = 0 18 | accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 19 | precision = tp / max(tp + fp, 1) 20 | recall = tp / max(tp + fn, 1) 21 | f1 = 2 * precision * recall / max(precision + recall, 1) 22 | threshold = 0 23 | 24 | ids = sorted(range(len(probs)), key=lambda i: probs[i]) 25 | for i in ids: 26 | prob = probs[i] 27 | label = labels[i] 28 | if label == label_index: 29 | tp -= 1 30 | fn += 1 31 | elif label != label_index: 32 | fp -= 1 33 | tn += 1 34 | 35 | _accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 36 | _precision = tp / max(tp + fp, 1) 37 | _recall = tp / max(tp + fn, 1) 38 | _f1 = 2 * _precision * _recall / max(_precision + _recall, 1) 39 | if _f1 > f1: 40 | accuracy = _accuracy 41 | precision = _precision 42 | recall = _recall 43 | f1 = _f1 44 | threshold = prob 45 | return (n, accuracy, precision, recall, f1, threshold) 46 | 47 | 48 | def main(): 49 | 50 | uf.set_log("./log") 51 | 52 | # load data 53 | with open("data/cifar-10/batches.meta", "rb") as f: 54 | id2label = pickle.load(f)["label_names"] 55 | X, y = [], [] 56 | X_dev, y_dev = [], [] 57 | for i in range(1, 6): 58 | with open(f"data/cifar-10/data_batch_{i}", "rb") as f: 59 | data = pickle.load(f, encoding="bytes") 60 | for j in range(len(data[b"data"])): 61 | image = data[b"data"][j] 62 | image = np.reshape(image, [3, 32, 32]) 63 | image = np.transpose(image, [1, 2, 0]) 64 | X.append(image) 65 | y.append(data[b"labels"][j]) 66 | with open("data/cifar-10/test_batch", "rb") as f: 67 | data = pickle.load(f, encoding="bytes") 68 | for j in range(len(data[b"data"])): 69 | image = data[b"data"][j] 70 | image = np.reshape(image, [3, 32, 32]) 71 | image = np.transpose(image, [1, 2, 0]) 72 | X_dev.append(image) 73 | y_dev.append(data[b"labels"][j]) 74 | print(f"X: {len(X)}") 75 | print(f"X_dev: {len(X_dev)}") 76 | 77 | # modeling 78 | model = uf.PNasNetClassifier( 79 | label_size=len(id2label), 80 | init_checkpoint="pretrained/pnasnet5-mobile", 81 | output_dir="pnasnet", 82 | gpu_ids="0", 83 | model_size="mobile", 84 | data_format="NHWC") 85 | 86 | # training 87 | for epoch in range(3): 88 | model.fit( 89 | X, y, 90 | batch_size=64, 91 | target_steps=-(epoch + 1), 92 | total_steps=-3, 93 | print_per_secs=5, 94 | save_per_steps=3000) 95 | model.localize("bp.%d" % epoch, into_file=".unif") 96 | 97 | # validation 98 | probs = model.predict(X_dev)["probs"] 99 | for i in range(2): 100 | n, acc, pre, rec, f1, thresh = get_best_f1(probs=probs[:, i], labels=y_dev, label_index=i) 101 | print("[dev] label %d (%d): accuracy %.3f, precision %.3f, recall %.3f, best_f1 %.3f, thresh >%s" 102 | % (i, n, acc, pre, rec, f1, thresh)) 103 | 104 | print("Application finished.") 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /examples/tutorial/TextCNNClassifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "invalid-animation", 6 | "metadata": {}, 7 | "source": [ 8 | "# TextCNNClassifier\n", 9 | "\n", 10 | "可用的中文预训练参数:暂无" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "impossible-professor", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": 
"stream", 22 | "text": [ 23 | "v2.5.0\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import uf\n", 29 | "\n", 30 | "print(uf.__version__)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "minimal-cambodia", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "uf.TextCNNClassifier(\n", 44 | " vocab_file=\"../../ref/vocab.txt\",\n", 45 | " max_seq_length=128,\n", 46 | " label_size=None,\n", 47 | " init_checkpoint=None,\n", 48 | " output_dir=None,\n", 49 | " gpu_ids=\"0\",\n", 50 | " filter_sizes=\"2,4,6\",\n", 51 | " num_channels=6,\n", 52 | " hidden_size=256,\n", 53 | " do_lower_case=True,\n", 54 | " truncate_method=\"LIFO\",\n", 55 | ")\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "model = uf.TextCNNClassifier(\"../../ref/vocab.txt\", gpu_ids=\"0\")\n", 61 | "print(model)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "forty-marathon", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "X = [\"天亮以前说再见\", \"笑着泪流满面\", \"去迎接应该你的\", \"更好的明天\"]\n", 72 | "y = [1, 0, 2, 0]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "looking-attempt", 78 | "metadata": {}, 79 | "source": [ 80 | "# 训练" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "id": "sharing-macintosh", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "WARNING:tensorflow:From c:\\Users\\Luv_d\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\tensorflow\\python\\util\\dispatch.py:1176: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 94 | "Instructions for updating:\n", 95 | "Please use `rate` instead of `keep_prob`. 
Rate should be set to `rate = 1 - keep_prob`.\n", 96 | "INFO:tensorflow:Build graph with 16,281,825 parameters (among which 5,427,275 are trainable)\n", 97 | "INFO:tensorflow:Running local_init_op\n", 98 | "INFO:tensorflow:Done running local_init_op\n", 99 | "INFO:tensorflow:Running training on 4 samples (step 0 -> 20)\n", 100 | "INFO:tensorflow:step 1, accuracy 0.2500, loss 1.103519, 6.12 steps/sec, 24.46 examples/sec\n", 101 | "INFO:tensorflow:step 4, accuracy 0.7500, loss 0.582421, 28.88 steps/sec, 115.54 examples/sec\n", 102 | "INFO:tensorflow:step 7, accuracy 1.0000, loss 0.080866, 26.82 steps/sec, 107.29 examples/sec\n", 103 | "INFO:tensorflow:step 10, accuracy 1.0000, loss 0.096210, 24.48 steps/sec, 97.92 examples/sec\n", 104 | "INFO:tensorflow:step 13, accuracy 1.0000, loss 0.000033, 20.54 steps/sec, 82.17 examples/sec\n", 105 | "INFO:tensorflow:step 16, accuracy 1.0000, loss 0.000003, 21.19 steps/sec, 84.75 examples/sec\n", 106 | "INFO:tensorflow:step 18, accuracy 1.0000, loss 0.000004, 19.25 steps/sec, 76.99 examples/sec\n", 107 | "INFO:tensorflow:step 20, accuracy 1.0000, loss 0.000252, 21.41 steps/sec, 85.64 examples/sec\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "model.fit(X, y, total_steps=20, learning_rate=0.01) # 模型较小,可以适当提高学习率" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "studied-mechanism", 118 | "metadata": {}, 119 | "source": [ 120 | "# 推理" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "id": "funky-diversity", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "INFO:tensorflow:Running inference on 4 samples\n", 134 | "INFO:tensorflow:process 100.0%, 126.93 examples/sec\n" 135 | ] 136 | }, 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "{'preds': [1, 0, 2, 0],\n", 141 | " 'probs': array([[1.9521560e-03, 9.9782097e-01, 2.2688659e-04],\n", 142 | " [9.9999988e-01, 1.2660193e-08, 6.8851620e-08],\n", 143 | " [3.9597539e-09, 1.1635332e-10, 1.0000000e+00],\n", 144 | " [9.9992132e-01, 3.3130198e-06, 7.5329801e-05]], dtype=float32)}" 145 | ] 146 | }, 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "model.predict(X)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "every-professor", 159 | "metadata": {}, 160 | "source": [ 161 | "# 评分" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 6, 167 | "id": "great-alpha", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "INFO:tensorflow:Running scoring on 4 samples\n", 175 | "INFO:tensorflow:process 100.0%, 163.93 examples/sec\n" 176 | ] 177 | }, 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "{'accuracy': 1.0, 'loss': 2.2112392e-05}" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "model.score(X, y)" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3.9.13 64-bit", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.9.13" 211 | }, 212 | "vscode": { 
213 |    "interpreter": {
214 |     "hash": "265fd6f62f200408acbbeae0248f34bed9f93569a643842b7a25d2cd76cae5e5"
215 |    }
216 |   }
217 |  },
218 |  "nbformat": 4,
219 |  "nbformat_minor": 5
220 | }
221 | 
--------------------------------------------------------------------------------
/ref/albert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0,
3 |   "hidden_act": "relu",
4 |   "hidden_dropout_prob": 0,
5 |   "embedding_size": 128,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "num_hidden_groups": 1,
13 |   "net_structure_type": 0,
14 |   "layers_to_keep": [],
15 |   "gap_size": 0,
16 |   "num_memory_blocks": 0,
17 |   "inner_group_num": 1,
18 |   "down_scale_factor": 1,
19 |   "type_vocab_size": 2,
20 |   "vocab_size": 21128
21 | }
22 | 
--------------------------------------------------------------------------------
/ref/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "pooler_fc_size": 768,
13 |   "pooler_num_attention_heads": 12,
14 |   "pooler_num_fc_layers": 3,
15 |   "pooler_size_per_head": 128,
16 |   "pooler_type": "first_token_transform",
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21128
19 | }
20 | 
--------------------------------------------------------------------------------
/ref/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/ref/spiece.model
--------------------------------------------------------------------------------
/ref/xlnet_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "d_head": 64,
3 |   "d_inner": 3072,
4 |   "d_model": 768,
5 |   "ff_activation": "relu",
6 |   "n_head": 12,
7 |   "n_layer": 12,
8 |   "n_token": 32000,
9 |   "untie_r": true
10 | }
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Build guide:
3 | 
4 | In the current directory, run `python setup.py install`. If the
5 | operation is not authorized, try `python setup.py install --user`.
6 | 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | 11 | setup( 12 | name="uf", 13 | version="v2.5.21", 14 | description="Unified framework for NLP tasks.", 15 | url="https://github.com/geyingli/unif", 16 | long_description=open("README.md", "r", encoding="utf-8").read(), 17 | long_description_content_type="text/markdown", 18 | author="Geying Li", 19 | author_email="luv_dusk@163.com", 20 | license="Apache-2.0", 21 | packages=find_packages(), 22 | install_requires=[ 23 | "numpy", 24 | ], 25 | extras_require={ 26 | "cpu": ["tensorflow>=1.11.0"], 27 | "gpu": ["tensorflow-gpu>=1.11.0"], 28 | }, 29 | python_requires=">=3.6.0", 30 | classifiers=[ 31 | "Operating System :: OS Independent", 32 | "License :: OSI Approved :: Apache Software License", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.6", 35 | "Programming Language :: Python :: 3.7", 36 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 37 | ], 38 | keywords=( 39 | "bert xlnet electra nlp tensorflow classification generation " 40 | "question-answering machine-reading-comprehension " 41 | "translation sequence-labeling" 42 | ), 43 | ) 44 | -------------------------------------------------------------------------------- /uf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = "v2.5.21" 3 | 4 | # loading models 5 | from .apps import * 6 | 7 | from .com import MultiProcess 8 | from .com import restore 9 | from .com import load 10 | from .com import download 11 | from .com import download_all 12 | from .com import get_checkpoint_path 13 | from .com import get_assignment_map 14 | from .com import list_variables 15 | from .com import list_resources 16 | from .com import set_verbosity 17 | from .com import set_log 18 | 19 | set_verbosity() 20 | 21 | __all__ = [ 22 | "MultiProcess", 23 | "restore", 24 | "load", 25 | "download", 26 | "download_all", 27 | "get_checkpoint_path", 28 | "get_assignment_map", 29 | "list_variables", 30 | "list_resources", 31 | "set_verbosity", 32 | "set_log", 33 | ] 34 | -------------------------------------------------------------------------------- /uf/apps/__init__.py: -------------------------------------------------------------------------------- 1 | from ..com import unimported_module 2 | 3 | from .bert.bert_lm import BERTLM 4 | from .roberta.roberta_lm import RoBERTaLM 5 | from .albert.albert_lm import ALBERTLM 6 | from .electra.electra_lm import ELECTRALM 7 | from .dilated.dilated_lm import DilatedLM 8 | from .recbert.recbert_lm import RecBERTLM 9 | from .recbert.recbert2_lm import RecBERT2LM 10 | from .recbert.recbert3_lm import RecBERT3LM 11 | from .vae.vae_lm import VAELM 12 | from .spe.spe_lm import SPELM 13 | from .gpt2.gpt2_lm import GPT2LM 14 | from .unilm.unilm_lm import UniLM 15 | from .unilm.unilm_prompt import UniLMPrompt 16 | from .sqp.sqp_lm import SQPLM 17 | from .textcnn.textcnn_classifier import TextCNNClassifier 18 | from .rnn.rnn_classifier import RNNClassifier 19 | from .rnn.bi_rnn_classifier import BiRNNClassifier 20 | from .bert.bert_classifier import BERTClassifier 21 | from .roberta.roberta_classifier import RoBERTaClassifier 22 | from .albert.albert_classifier import ALBERTClassifier 23 | from .electra.electra_classifier import ELECTRAClassifier 24 | from .widedeep.widedeep_classifier import WideDeepClassifier 25 | from .sembert.sembert_classifier import SemBERTClassifier 26 | from .performer.performer_classifier import PerformerClassifier 27 | from 
.uda.uda_classifier import UDAClassifier 28 | from .motian.motian_classifier import MotianClassifier 29 | from .tinybert.tinybert_classifier import TinyBERTClassifier 30 | from .tinybert.tinybert_binary_classifier import TinyBERTBinaryClassifier 31 | from .fastbert.fastbert_classifier import FastBERTClassifier 32 | from .adabert.adabert_classifier import AdaBERTClassifier 33 | from .stockbert.stockbert_classifier import StockBERTClassifier 34 | from .bert.bert_binary_classifier import BERTBinaryClassifier 35 | from .bert.bert_tmp_binary_classifier import BERTTmpBinaryClassifier 36 | from .roberta.roberta_binary_classifier import RoBERTaBinaryClassifier 37 | from .albert.albert_binary_classifier import ALBERTBinaryClassifier 38 | from .electra.electra_binary_classifier import ELECTRABinaryClassifier 39 | from .bert.bert_seq_classifier import BERTSeqClassifier 40 | from .roberta.roberta_seq_classifier import RoBERTaSeqClassifier 41 | from .albert.albert_seq_classifier import ALBERTSeqClassifier 42 | from .electra.electra_seq_classifier import ELECTRASeqClassifier 43 | from .bert.bert_seq_cross_classifier import BERTSeqCrossClassifier 44 | from .bert.bert_regressor import BERTRegressor 45 | from .widedeep.widedeep_regressor import WideDeepRegressor 46 | from .bert.bert_ner import BERTNER 47 | from .bert.bert_crf_ner import BERTCRFNER 48 | from .bert.bert_crf_cascade_ner import BERTCRFCascadeNER 49 | from .bert.bert_mrc import BERTMRC 50 | from .bert.bert_verifier_mrc import BERTVerifierMRC 51 | from .roberta.roberta_mrc import RoBERTaMRC 52 | from .albert.albert_mrc import ALBERTMRC 53 | from .electra.electra_mrc import ELECTRAMRC 54 | from .retroreader.retroreader_mrc import RetroReaderMRC 55 | from .sanet.sanet_mrc import SANetMRC 56 | from .transformer.transformer_mt import TransformerMT 57 | from .chatbot.chatbot_mt import ChatbotMT 58 | try: 59 | from .xlnet.xlnet_classifier import XLNetClassifier 60 | from .xlnet.xlnet_binary_classifier import XLNetBinaryClassifier 61 | except (ModuleNotFoundError, ImportError): 62 | XLNetClassifier = unimported_module( 63 | "XLNetClassifier", 64 | "Module `sentencepiece` is required to launch XLNetClassifier. " 65 | "Try `pip install sentencepiece` or build from source." 66 | ) 67 | XLNetBinaryClassifier = unimported_module( 68 | "XLNetBinaryClassifier", 69 | "Module `sentencepiece` is required to launch XLNetBinaryClassifier. " 70 | "Try `pip install sentencepiece` or build from source." 71 | ) 72 | try: 73 | from .nasnet.pnasnet_classifier import PNasNetClassifier 74 | except (ModuleNotFoundError, ImportError): 75 | PNasNetClassifier = unimported_module( 76 | "PNasNetClassifier", 77 | "Module `tf_slim` is required to launch PNasNetClassifier. " 78 | "Try `pip install tf_slim` or build from source." 
79 | ) 80 | 81 | del unimported_module 82 | 83 | 84 | __all__ = [ 85 | "BERTLM", 86 | "RoBERTaLM", 87 | "ALBERTLM", 88 | "ELECTRALM", 89 | "VAELM", 90 | "GPT2LM", 91 | "UniLM", 92 | "TextCNNClassifier", 93 | "RNNClassifier", 94 | "BiRNNClassifier", 95 | "BERTClassifier", 96 | "XLNetClassifier", 97 | "RoBERTaClassifier", 98 | "ALBERTClassifier", 99 | "ELECTRAClassifier", 100 | "WideDeepClassifier", 101 | "SemBERTClassifier", 102 | "UDAClassifier", 103 | "PerformerClassifier", 104 | "TinyBERTClassifier", 105 | "TinyBERTBinaryClassifier", 106 | "FastBERTClassifier", 107 | "BERTBinaryClassifier", 108 | "XLNetBinaryClassifier", 109 | "RoBERTaBinaryClassifier", 110 | "ALBERTBinaryClassifier", 111 | "ELECTRABinaryClassifier", 112 | "BERTSeqClassifier", 113 | "RoBERTaSeqClassifier", 114 | "ALBERTSeqClassifier", 115 | "ELECTRASeqClassifier", 116 | "BERTSeqCrossClassifier", 117 | "BERTRegressor", 118 | "WideDeepRegressor", 119 | "BERTNER", 120 | "BERTCRFNER", 121 | "BERTCRFCascadeNER", 122 | "BERTMRC", 123 | "BERTVerifierMRC", 124 | "RoBERTaMRC", 125 | "ALBERTMRC", 126 | "ELECTRAMRC", 127 | "RetroReaderMRC", 128 | "SANetMRC", 129 | "TransformerMT", 130 | "PNasNetClassifier", 131 | 132 | # trial 133 | "DilatedLM", 134 | "RecBERTLM", 135 | "RecBERT2LM", 136 | "RecBERT3LM", 137 | "SPELM", 138 | "StockBERTClassifier", 139 | "AdaBERTClassifier", 140 | "ChatbotMT", 141 | "UniLMPrompt", 142 | "MotianClassifier", 143 | "SQPLM", 144 | "BERTTmpBinaryClassifier", 145 | ] 146 | -------------------------------------------------------------------------------- /uf/apps/_base_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/_base_/__init__.py -------------------------------------------------------------------------------- /uf/apps/_base_/_base_.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | class BaseEncoder: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | 8 | def get_pooled_output(self, *args, **kwargs): 9 | raise NotImplementedError() 10 | 11 | def get_sequence_output(self, *args, **kwargs): 12 | raise NotImplementedError() 13 | 14 | 15 | class BaseDecoder: 16 | def __init__(self, *args, **kwargs): 17 | 18 | # scalar of total loss, used for back propagation 19 | self.train_loss = None 20 | 21 | # supervised tensors of each example 22 | self.tensors = collections.OrderedDict() 23 | 24 | def get_forward_outputs(self): 25 | return (self.train_loss, self.tensors) 26 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_lm.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from ...core import BaseModule 4 | 5 | 6 | class LMModule(BaseModule): 7 | """ Application class of language modeling (LM). 
""" 8 | 9 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 10 | "max_seq_length": "An integer that defines max sequence length of input tokens", 11 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 12 | } 13 | 14 | def fit_from_tfrecords( 15 | self, 16 | batch_size=32, 17 | learning_rate=5e-5, 18 | target_steps=None, 19 | total_steps=1000000, 20 | warmup_ratio=0.01, # 默认值不同 21 | print_per_secs=0.1, 22 | save_per_steps=10000, 23 | tfrecords_files=None, 24 | n_jobs=None, 25 | **kwargs, 26 | ): 27 | super().fit_from_tfrecords( 28 | batch_size, 29 | learning_rate, 30 | target_steps, 31 | total_steps, 32 | warmup_ratio, 33 | print_per_secs, 34 | save_per_steps, 35 | tfrecords_files, 36 | n_jobs, 37 | **kwargs, 38 | ) 39 | fit_from_tfrecords.__doc__ = BaseModule.fit_from_tfrecords.__doc__ 40 | 41 | def fit( 42 | self, 43 | X=None, y=None, sample_weight=None, X_tokenized=None, 44 | batch_size=32, 45 | learning_rate=5e-5, 46 | target_steps=None, 47 | total_steps=1000000, 48 | warmup_ratio=0.01, # 默认值不同 49 | print_per_secs=0.1, 50 | save_per_steps=10000, 51 | **kwargs, 52 | ): 53 | super().fit( 54 | X, y, sample_weight, X_tokenized, 55 | batch_size, 56 | learning_rate, 57 | target_steps, 58 | total_steps, 59 | warmup_ratio, 60 | print_per_secs, 61 | save_per_steps, 62 | **kwargs, 63 | ) 64 | fit.__doc__ = BaseModule.fit.__doc__ 65 | 66 | def score(self, *args, **kwargs): 67 | raise AttributeError("`score` method is not supported for unsupervised language modeling (LM) modules.") 68 | 69 | def _convert_x(self, x, tokenized): 70 | """ Convert text sample. """ 71 | 72 | # deal with untokenized inputs 73 | if not tokenized: 74 | 75 | # deal with general inputs 76 | if isinstance(x, str): 77 | return [self.tokenizer.tokenize(x)] 78 | 79 | # deal with multiple inputs 80 | return [self.tokenizer.tokenize(seg) for seg in x] 81 | 82 | # deal with tokenized inputs 83 | if isinstance(x[0], str): 84 | return [copy.deepcopy(x)] 85 | 86 | # deal with tokenized and multiple inputs 87 | return copy.deepcopy(x) 88 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_mt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from ...core import BaseModule 5 | from ... import com 6 | 7 | 8 | class MTModule(BaseModule): 9 | """ Application class of machine translation (MT). """ 10 | 11 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 12 | "source_max_seq_length": "An integer that defines max sequence length of source language tokens", 13 | "target_max_seq_length": "An integer that defines max sequence length of target language tokens", 14 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 15 | } 16 | 17 | def _get_bleu(self, preds, labels, mask, max_gram=4): 18 | """ Bilingual evaluation understudy. 
""" 19 | eos_id = self.tokenizer.convert_tokens_to_ids([""])[0] 20 | 21 | bleus = [] 22 | for _preds, _labels, _mask in zip(preds, labels, mask): 23 | 24 | # preprocess 25 | for i in range(len(_preds)): 26 | if _preds[i] == eos_id: 27 | _preds = _preds[:i+1] 28 | break 29 | _labels = _labels[:int(np.sum(_mask)) - 1] # remove 30 | 31 | power = 0 32 | for n in range(max_gram): 33 | ngrams = [] 34 | nominator = 0 35 | denominator = 0 36 | 37 | for i in range(len(_labels) - n): 38 | ngram = _labels[i:i+1+n].tolist() 39 | if ngram in ngrams: 40 | continue 41 | cand_count = len(com.find_all_boyer_moore(_preds, ngram)) 42 | ref_count = len(com.find_all_boyer_moore(_labels, ngram)) 43 | nominator += min(cand_count, ref_count) 44 | denominator += cand_count 45 | ngrams.append(ngram) 46 | 47 | power += 1 / (n + 1) * np.log(nominator / (denominator + 1e-6) + 1e-6) 48 | 49 | _bleu = np.exp(power) 50 | if len(_preds) >= len(_labels): 51 | _bleu *= np.exp(1 - len(_labels) / len(_preds)) 52 | bleus.append(_bleu) 53 | 54 | return np.mean(bleus) 55 | 56 | def _get_rouge(self, preds, labels, mask, max_gram=4): 57 | """ Recall-Oriented Understudy for Gisting Evaluation. """ 58 | eos_id = self.tokenizer.convert_tokens_to_ids([""])[0] 59 | 60 | rouges = [] 61 | for _preds, _labels, _mask in zip(preds, labels, mask): 62 | 63 | # preprocess 64 | for i in range(len(_preds)): 65 | if _preds[i] == eos_id: 66 | _preds = _preds[:i+1] 67 | break 68 | _labels = _labels[:int(np.sum(_mask)) - 1] # remove 69 | 70 | nominator = 0 71 | denominator = 0 72 | for n in range(max_gram): 73 | ngrams = [] 74 | 75 | for i in range(len(_labels) - n): 76 | ngram = _labels[i:i+1+n].tolist() 77 | if ngram in ngrams: 78 | continue 79 | nominator += len(com.find_all_boyer_moore(_preds, ngram)) 80 | denominator += len(com.find_all_boyer_moore(_labels, ngram)) 81 | ngrams.append(ngram) 82 | 83 | _rouge = nominator / denominator if denominator else 0 84 | rouges.append(_rouge) 85 | 86 | return np.mean(rouges) 87 | 88 | def _convert_x(self, x, tokenized): 89 | 90 | # deal with untokenized inputs 91 | if not tokenized: 92 | 93 | # deal with general inputs 94 | if isinstance(x, str): 95 | return self.tokenizer.tokenize(x) 96 | 97 | # deal with tokenized inputs 98 | elif isinstance(x[0], str): 99 | return copy.deepcopy(x) 100 | 101 | # deal with tokenized and multiple inputs 102 | raise ValueError("Machine translation module only supports single sentence inputs.") 103 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_regressor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from ._base_ import BaseDecoder 5 | from ...core import BaseModule 6 | from ... import com 7 | from ...third import tf 8 | from .. 
import util 9 | 10 | 11 | class RegDecoder(BaseDecoder): 12 | def __init__( 13 | self, 14 | is_training, 15 | input_tensor, 16 | label_floats, 17 | label_size=2, 18 | sample_weight=None, 19 | scope="reg", 20 | hidden_dropout_prob=0.1, 21 | initializer_range=0.02, 22 | trainable=True, 23 | **kwargs, 24 | ): 25 | super().__init__(**kwargs) 26 | 27 | if kwargs.get("is_logits"): 28 | logits = input_tensor 29 | else: 30 | if kwargs.get("return_hidden"): 31 | self.tensors["hidden"] = input_tensor 32 | 33 | with tf.variable_scope(scope): 34 | output_layer = util.dropout(input_tensor, hidden_dropout_prob if is_training else 0.0) 35 | intermediate_output = tf.layers.dense( 36 | output_layer, 37 | label_size * 4, 38 | use_bias=False, 39 | kernel_initializer=util.create_initializer(initializer_range), 40 | trainable=trainable, 41 | ) 42 | logits = tf.layers.dense( 43 | intermediate_output, 44 | label_size, 45 | use_bias=False, 46 | kernel_initializer=util.create_initializer(initializer_range), 47 | trainable=trainable, 48 | name="probs", 49 | ) 50 | 51 | self.tensors["probs"] = logits 52 | 53 | per_example_loss = util.mean_squared_error(logits, label_floats, **kwargs) 54 | if sample_weight is not None: 55 | per_example_loss *= sample_weight 56 | self.tensors["losses"] = per_example_loss 57 | self.train_loss = tf.reduce_mean(per_example_loss) 58 | 59 | 60 | class RegressorModule(BaseModule): 61 | """ Application class of regression. """ 62 | 63 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 64 | "max_seq_length": "An integer that defines max sequence length of input tokens", 65 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 66 | } 67 | 68 | def _convert_x(self, x, tokenized): 69 | """ Convert text sample. """ 70 | 71 | # deal with untokenized inputs 72 | if not tokenized: 73 | 74 | # deal with general inputs 75 | if isinstance(x, str): 76 | return [self.tokenizer.tokenize(x)] 77 | 78 | # deal with multiple inputs 79 | return [self.tokenizer.tokenize(seg) for seg in x] 80 | 81 | # deal with tokenized inputs 82 | if isinstance(x[0], str): 83 | return [copy.deepcopy(x)] 84 | 85 | # deal with tokenized and multiple inputs 86 | return copy.deepcopy(x) 87 | 88 | def _convert_y(self, y): 89 | 90 | sample = y[0] 91 | if isinstance(sample, list): 92 | self.label_size = len(sample) 93 | elif isinstance(sample, float) or isinstance(sample, int) or isinstance(sample, str): 94 | self.label_size = 1 95 | 96 | label_floats = [] 97 | for idx, sample in enumerate(y): 98 | try: 99 | if isinstance(sample, list): 100 | _label_floats = [float(label) for label in sample] 101 | elif isinstance(sample, float) or isinstance(sample, int) or isinstance(sample, str): 102 | _label_floats = [float(sample)] 103 | except Exception as e: 104 | raise ValueError("Wrong label format (%s): %s. 
An example: y = [[0.12, 0.09], [-0.53, 0.98], ...]" % (sample, e)) 105 | label_floats.append(_label_floats) 106 | 107 | return label_floats 108 | 109 | def _get_fit_ops(self, from_tfrecords=False): 110 | ops = [self.tensors["probs"]] 111 | if from_tfrecords: 112 | ops.extend([self.placeholders["label_floats"]]) 113 | return ops 114 | 115 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 116 | 117 | if from_tfrecords: 118 | batch_labels = output_arrays[-1] 119 | else: 120 | batch_labels = feed_dict[self.placeholders["label_floats"]] 121 | 122 | # mse 123 | batch_preds = output_arrays[0] 124 | mse = np.mean(np.square(batch_preds - batch_labels)) 125 | 126 | info = "" 127 | info += ", mse %.6f" % mse 128 | 129 | return info 130 | 131 | def _get_predict_ops(self): 132 | return [self.tensors["probs"]] 133 | 134 | def _get_predict_outputs(self, output_arrays, n_inputs): 135 | 136 | # probs 137 | probs = com.transform(output_arrays[0], n_inputs) 138 | 139 | outputs = {} 140 | outputs["probs"] = probs 141 | 142 | return outputs 143 | 144 | def _get_score_ops(self): 145 | return [self.tensors["probs"], self.tensors["losses"]] 146 | 147 | def _get_score_outputs(self, output_arrays, n_inputs): 148 | 149 | # mse 150 | probs = com.transform(output_arrays[0], n_inputs) 151 | labels = self.data["label_floats"] 152 | mse = np.mean(np.square(probs - labels)) 153 | 154 | outputs = {} 155 | outputs["mse"] = mse 156 | 157 | return outputs 158 | 159 | -------------------------------------------------------------------------------- /uf/apps/adabert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/adabert/__init__.py -------------------------------------------------------------------------------- /uf/apps/adabert/adabert_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .adabert import AdaBERTClsDistillor 4 | from .._base_._base_classifier import ClassifierModule 5 | from ..bert.bert_classifier import BERTClassifier 6 | from ..bert.bert import BERTConfig 7 | from ...token import WordPieceTokenizer 8 | from ...third import tf 9 | 10 | 11 | class AdaBERTClassifier(BERTClassifier, ClassifierModule): 12 | """ Single-label classifier on AdaBERT, a distillation model. 
""" 13 | 14 | def __init__( 15 | self, 16 | config_file, 17 | vocab_file, 18 | max_seq_length=128, 19 | label_size=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | drop_pooler=False, 24 | k_max=4, 25 | num_intermediates=3, 26 | embedding_size=128, 27 | temp_decay_steps=18000, 28 | model_l2_reg=3e-4, 29 | arch_l2_reg=1e-3, 30 | loss_gamma=0.8, 31 | loss_beta=4.0, 32 | do_lower_case=True, 33 | truncate_method="LIFO", 34 | ): 35 | self.__init_args__ = locals() 36 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 37 | 38 | self.max_seq_length = max_seq_length 39 | self.label_size = label_size 40 | self.truncate_method = truncate_method 41 | self._drop_pooler = drop_pooler 42 | self._k_max = k_max 43 | self._num_intermediates = num_intermediates 44 | self._embedding_size = embedding_size 45 | self._temp_decay_steps = temp_decay_steps 46 | self._model_l2_reg = model_l2_reg 47 | self._arch_l2_reg = arch_l2_reg 48 | self._loss_gamma = loss_gamma 49 | self._loss_beta = loss_beta 50 | 51 | self.bert_config = BERTConfig.from_json_file(config_file) 52 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 53 | self.decay_power = "unsupported" 54 | 55 | assert label_size, ("`label_size` can't be None.") 56 | if "[CLS]" not in self.tokenizer.vocab: 57 | self.tokenizer.add("[CLS]") 58 | self.bert_config.vocab_size += 1 59 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 60 | if "[SEP]" not in self.tokenizer.vocab: 61 | self.tokenizer.add("[SEP]") 62 | self.bert_config.vocab_size += 1 63 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 64 | 65 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 66 | self._assert_legal(X, y, sample_weight, X_tokenized) 67 | 68 | n_inputs = None 69 | data = {} 70 | 71 | # convert X 72 | if X is not None or X_tokenized is not None: 73 | tokenized = False if X is not None else X_tokenized 74 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 75 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 76 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 77 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 78 | n_inputs = len(input_ids) 79 | 80 | if n_inputs < self.batch_size: 81 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 82 | 83 | if y is not None: 84 | # convert y and sample_weight 85 | label_ids = self._convert_y(y) 86 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 87 | 88 | # convert sample_weight 89 | if is_training or y is not None: 90 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 91 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 92 | 93 | return data 94 | 95 | def _forward(self, is_training, placeholders, **kwargs): 96 | 97 | model = AdaBERTClsDistillor( 98 | bert_config=self.bert_config, 99 | is_training=is_training, 100 | input_ids=placeholders["input_ids"], 101 | input_mask=placeholders["input_mask"], 102 | segment_ids=placeholders["segment_ids"], 103 | label_ids=placeholders.get("label_ids"), 104 | sample_weight=placeholders.get("sample_weight"), 105 | drop_pooler=self._drop_pooler, 106 | label_size=self.label_size, 107 | k_max=self._k_max, 108 | num_intermediates=self._num_intermediates, 109 | embedding_size=self._embedding_size , 110 | temp_decay_steps=self._temp_decay_steps, 111 | model_l2_reg=self._model_l2_reg, 112 | arch_l2_reg=self._arch_l2_reg, 113 | 
loss_gamma=self._loss_gamma, 114 | loss_beta=self._loss_beta, 115 | **kwargs, 116 | ) 117 | train_loss, tensors = model.get_forward_outputs() 118 | return train_loss, tensors 119 | 120 | def _get_fit_ops(self, from_tfrecords=False): 121 | return [self.tensors["losses"]] 122 | 123 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 124 | 125 | # loss 126 | batch_losses = output_arrays[0] 127 | loss = np.mean(batch_losses) 128 | 129 | info = "" 130 | info += ", distill loss %.6f" % loss 131 | 132 | return info 133 | -------------------------------------------------------------------------------- /uf/apps/albert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/albert/__init__.py -------------------------------------------------------------------------------- /uf/apps/albert/albert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 3 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 9 | """ Multi-label classifier on ALBERT. """ 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | label_weight=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | drop_pooler=False, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.label_weight = label_weight 31 | self.truncate_method = truncate_method 32 | self._drop_pooler = drop_pooler 33 | 34 | self.albert_config = ALBERTConfig.from_json_file(config_file) 35 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 36 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 37 | 38 | if "[CLS]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[CLS]") 40 | self.albert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | self.albert_config.vocab_size += 1 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def _forward(self, is_training, placeholders, **kwargs): 48 | 49 | encoder = ALBERTEncoder( 50 | albert_config=self.albert_config, 51 | is_training=is_training, 52 | input_ids=placeholders["input_ids"], 53 | input_mask=placeholders["input_mask"], 54 | segment_ids=placeholders["segment_ids"], 55 | drop_pooler=self._drop_pooler, 56 | **kwargs, 57 | ) 58 | encoder_output = encoder.get_pooled_output() 59 | decoder = BinaryClsDecoder( 60 | is_training=is_training, 61 | input_tensor=encoder_output, 62 | label_ids=placeholders["label_ids"], 63 | label_size=self.label_size, 64 | sample_weight=placeholders.get("sample_weight"), 65 | label_weight=self.label_weight, 66 | scope="cls/seq_relationship", 67 | **kwargs, 68 | ) 69 | train_loss, tensors = decoder.get_forward_outputs() 70 | return train_loss, tensors 71 | 
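Editor's note: the ALBERT binary (multi-label) classifier above is driven through the same `fit` / `predict` / `score` interface as the other UNIF application classes. The sketch below shows one way to wire it up, mirroring the patterns in `examples/` and the tutorial notebooks. It is a minimal sketch, not code from the repository: the checkpoint directory is hypothetical, and the multi-hot label layout for `y` (as well as the `"probs"` output key) is an assumption taken over from the single-label examples; the exact label format is defined by the binary-classifier base class, which is not shown here.

import uf

# Hypothetical checkpoint directory -- point this at a real ALBERT checkpoint.
ckpt_dir = "pretrained/albert-base-zh"

model = uf.ALBERTBinaryClassifier(
    config_file=f"{ckpt_dir}/albert_config.json",
    vocab_file=f"{ckpt_dir}/vocab.txt",
    max_seq_length=64,
    label_size=3,                    # number of independent binary labels
    init_checkpoint=ckpt_dir,
    output_dir="albert_binary",
    gpu_ids="0",
)

X = ["天亮以前说再见", "笑着泪流满面", "去迎接应该你的", "更好的明天"]
y = [[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]]   # assumed multi-hot labels, one row per sample

model.fit(X, y, total_steps=20)                    # short demo run, as in the tutorial notebooks
probs = model.predict(X)["probs"]                  # per-label probabilities, shape [len(X), label_size]
print(probs)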
-------------------------------------------------------------------------------- /uf/apps/albert/albert_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTClassifier(BERTClassifier, ClassifierModule): 9 | """ Single-label classifier on ALBERT. """ 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | drop_pooler=False, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | self._drop_pooler = drop_pooler 31 | 32 | self.albert_config = ALBERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.albert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.albert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def _forward(self, is_training, placeholders, **kwargs): 46 | 47 | encoder = ALBERTEncoder( 48 | albert_config=self.albert_config, 49 | is_training=is_training, 50 | input_ids=placeholders["input_ids"], 51 | input_mask=placeholders["input_mask"], 52 | segment_ids=placeholders["segment_ids"], 53 | drop_pooler=self._drop_pooler, 54 | **kwargs, 55 | ) 56 | encoder_output = encoder.get_pooled_output() 57 | decoder = ClsDecoder( 58 | is_training=is_training, 59 | input_tensor=encoder_output, 60 | label_ids=placeholders["label_ids"], 61 | label_size=self.label_size, 62 | sample_weight=placeholders.get("sample_weight"), 63 | scope="cls/seq_relationship", 64 | **kwargs, 65 | ) 66 | train_loss, tensors = decoder.get_forward_outputs() 67 | return train_loss, tensors 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /uf/apps/albert/albert_mrc.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from ..bert.bert_mrc import BERTMRC 3 | from .._base_._base_mrc import MRCDecoder, MRCModule 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTMRC(BERTMRC, MRCModule): 9 | """ Machine reading comprehension on ALBERT. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=256, 16 | init_checkpoint=None, 17 | output_dir=None, 18 | gpu_ids=None, 19 | do_lower_case=True, 20 | truncate_method="longer-FO", 21 | ): 22 | self.__init_args__ = locals() 23 | super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 24 | 25 | self.max_seq_length = max_seq_length 26 | self.truncate_method = truncate_method 27 | self._do_lower_case = do_lower_case 28 | 29 | self.albert_config = ALBERTConfig.from_json_file(config_file) 30 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 31 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 32 | 33 | if "[CLS]" not in self.tokenizer.vocab: 34 | self.tokenizer.add("[CLS]") 35 | self.albert_config.vocab_size += 1 36 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 37 | if "[SEP]" not in self.tokenizer.vocab: 38 | self.tokenizer.add("[SEP]") 39 | self.albert_config.vocab_size += 1 40 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 41 | 42 | def _forward(self, is_training, placeholders, **kwargs): 43 | 44 | encoder = ALBERTEncoder( 45 | albert_config=self.albert_config, 46 | is_training=is_training, 47 | input_ids=placeholders["input_ids"], 48 | input_mask=placeholders["input_mask"], 49 | segment_ids=placeholders["segment_ids"], 50 | **kwargs, 51 | ) 52 | encoder_output = encoder.get_sequence_output() 53 | decoder = MRCDecoder( 54 | is_training=is_training, 55 | input_tensor=encoder_output, 56 | label_ids=placeholders["label_ids"], 57 | sample_weight=placeholders.get("sample_weight"), 58 | scope="mrc", 59 | **kwargs, 60 | ) 61 | train_loss, tensors = decoder.get_forward_outputs() 62 | return train_loss, tensors 63 | -------------------------------------------------------------------------------- /uf/apps/albert/albert_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 3 | from ..bert.bert_seq_classifier import BERTSeqClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 9 | """ Sequence labeling classifier on ALBERT. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.albert_config = ALBERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.albert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.albert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = ALBERTEncoder( 46 | albert_config=self.albert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | **kwargs, 52 | ) 53 | encoder_output = encoder.get_sequence_output() 54 | decoder = SeqClsDecoder( 55 | is_training=is_training, 56 | input_tensor=encoder_output, 57 | input_mask=placeholders["input_mask"], 58 | label_ids=placeholders["label_ids"], 59 | label_size=self.label_size, 60 | sample_weight=placeholders.get("sample_weight"), 61 | scope="cls/sequence", 62 | **kwargs, 63 | ) 64 | train_loss, tensors = decoder.get_forward_outputs() 65 | return train_loss, tensors 66 | -------------------------------------------------------------------------------- /uf/apps/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/bert/__init__.py -------------------------------------------------------------------------------- /uf/apps/bert/bert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTBinaryClassifier(BinaryClassifierModule): 11 | """ Multi-label classifier on BERT. 
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | label_weight=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | drop_pooler=False, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.label_weight = label_weight 33 | self.truncate_method = truncate_method 34 | self._drop_pooler = drop_pooler 35 | 36 | self.bert_config = BERTConfig.from_json_file(config_file) 37 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 38 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 39 | 40 | if "[CLS]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[CLS]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 44 | if "[SEP]" not in self.tokenizer.vocab: 45 | self.tokenizer.add("[SEP]") 46 | self.bert_config.vocab_size += 1 47 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 48 | 49 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 50 | self._assert_legal(X, y, sample_weight, X_tokenized) 51 | 52 | if is_training: 53 | assert y is not None, "`y` can't be None." 54 | if is_parallel: 55 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 56 | 57 | n_inputs = None 58 | data = {} 59 | 60 | # convert X 61 | if X is not None or X_tokenized is not None: 62 | tokenized = False if X is not None else X_tokenized 63 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 64 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 65 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 66 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 67 | n_inputs = len(input_ids) 68 | 69 | if n_inputs < self.batch_size: 70 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 71 | 72 | # convert y 73 | if y is not None: 74 | label_ids = self._convert_y(y) 75 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 76 | 77 | # convert sample_weight 78 | if is_training or y is not None: 79 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 80 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 81 | 82 | return data 83 | 84 | def _convert_X(self, X_target, tokenized): 85 | 86 | # tokenize input texts 87 | segment_input_tokens = [] 88 | for idx, sample in enumerate(X_target): 89 | try: 90 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 91 | except Exception as e: 92 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 93 | 94 | input_ids = [] 95 | input_mask = [] 96 | segment_ids = [] 97 | for idx, segments in enumerate(segment_input_tokens): 98 | _input_tokens = ["[CLS]"] 99 | _input_ids = [] 100 | _input_mask = [1] 101 | _segment_ids = [0] 102 | 103 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 104 | for s_id, segment in enumerate(segments): 105 | _segment_id = min(s_id, 1) 106 | _input_tokens.extend(segment + ["[SEP]"]) 107 | _input_mask.extend([1] * (len(segment) + 1)) 108 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 109 | 110 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 111 | 112 | # padding 113 | for _ in range(self.max_seq_length - len(_input_ids)): 114 | _input_ids.append(0) 115 | _input_mask.append(0) 116 | _segment_ids.append(0) 117 | 118 | input_ids.append(_input_ids) 119 | input_mask.append(_input_mask) 120 | segment_ids.append(_segment_ids) 121 | 122 | return input_ids, input_mask, segment_ids 123 | 124 | def _set_placeholders(self, **kwargs): 125 | self.placeholders = { 126 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 127 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 128 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 129 | "label_ids": tf.placeholder(tf.int32, [None, self.label_size], "label_ids"), 130 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 131 | } 132 | 133 | def _forward(self, is_training, placeholders, **kwargs): 134 | 135 | encoder = BERTEncoder( 136 | bert_config=self.bert_config, 137 | is_training=is_training, 138 | input_ids=placeholders["input_ids"], 139 | input_mask=placeholders["input_mask"], 140 | segment_ids=placeholders["segment_ids"], 141 | drop_pooler=self._drop_pooler, 142 | **kwargs, 143 | ) 144 | encoder_output = encoder.get_pooled_output() 145 | decoder = BinaryClsDecoder( 146 | is_training=is_training, 147 | input_tensor=encoder_output, 148 | label_ids=placeholders["label_ids"], 149 | label_size=self.label_size, 150 | sample_weight=placeholders.get("sample_weight"), 151 | label_weight=self.label_weight, 152 | scope="cls/seq_relationship", 153 | **kwargs, 154 | ) 155 | train_loss, tensors = decoder.get_forward_outputs() 156 | return train_loss, tensors 157 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTClassifier(ClassifierModule): 11 | """ Single-label classifier on BERT. 
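Illustrative usage sketch; it assumes the class is re-exported by `uf/__init__.py` and that the base `ClassifierModule` supplies sklearn-style `fit`/`predict`; paths and data are placeholders:

    import uf

    model = uf.BERTClassifier("./ref/bert_config.json", "./ref/vocab.txt", label_size=2)
    X = ["The movie was great.", "Terrible service."]   # one raw text (or list of text segments) per sample
    y = [1, 0]                                          # one integer label id per sample
    model.fit(X, y)
    outputs = model.predict(X)                          # assumed to return what the module's _get_predict_outputs builds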
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | drop_pooler=False, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._drop_pooler = drop_pooler 33 | 34 | self.bert_config = BERTConfig.from_json_file(config_file) 35 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 36 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 37 | 38 | if "[CLS]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[CLS]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | self.bert_config.vocab_size += 1 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 48 | self._assert_legal(X, y, sample_weight, X_tokenized) 49 | 50 | if is_training: 51 | assert y is not None, "`y` can't be None." 52 | if is_parallel: 53 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 54 | 55 | n_inputs = None 56 | data = {} 57 | 58 | # convert X 59 | if X is not None or X_tokenized is not None: 60 | tokenized = False if X is not None else X_tokenized 61 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 62 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 63 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 64 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 65 | n_inputs = len(input_ids) 66 | 67 | if n_inputs < self.batch_size: 68 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 69 | 70 | # convert y 71 | if y is not None: 72 | label_ids = self._convert_y(y) 73 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 74 | 75 | # convert sample_weight 76 | if is_training or y is not None: 77 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 78 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 79 | 80 | return data 81 | 82 | def _convert_X(self, X_target, tokenized): 83 | 84 | # tokenize input texts 85 | segment_input_tokens = [] 86 | for idx, sample in enumerate(X_target): 87 | try: 88 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 89 | except Exception as e: 90 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 91 | 92 | input_ids = [] 93 | input_mask = [] 94 | segment_ids = [] 95 | for idx, segments in enumerate(segment_input_tokens): 96 | _input_tokens = ["[CLS]"] 97 | _input_ids = [] 98 | _input_mask = [1] 99 | _segment_ids = [0] 100 | 101 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 102 | for s_id, segment in enumerate(segments): 103 | _segment_id = min(s_id, 1) 104 | _input_tokens.extend(segment + ["[SEP]"]) 105 | _input_mask.extend([1] * (len(segment) + 1)) 106 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 107 | 108 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 109 | 110 | # padding 111 | for _ in range(self.max_seq_length - len(_input_ids)): 112 | _input_ids.append(0) 113 | _input_mask.append(0) 114 | _segment_ids.append(0) 115 | 116 | input_ids.append(_input_ids) 117 | input_mask.append(_input_mask) 118 | segment_ids.append(_segment_ids) 119 | 120 | return input_ids, input_mask, segment_ids 121 | 122 | def _set_placeholders(self, **kwargs): 123 | self.placeholders = { 124 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 125 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 126 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 127 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 128 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 129 | } 130 | 131 | def _forward(self, is_training, placeholders, **kwargs): 132 | 133 | encoder = BERTEncoder( 134 | bert_config=self.bert_config, 135 | is_training=is_training, 136 | input_ids=placeholders["input_ids"], 137 | input_mask=placeholders["input_mask"], 138 | segment_ids=placeholders["segment_ids"], 139 | drop_pooler=self._drop_pooler, 140 | **kwargs, 141 | ) 142 | encoder_output = encoder.get_pooled_output() 143 | decoder = ClsDecoder( 144 | is_training=is_training, 145 | input_tensor=encoder_output, 146 | label_ids=placeholders["label_ids"], 147 | label_size=self.label_size, 148 | sample_weight=placeholders.get("sample_weight"), 149 | scope="cls/seq_relationship", 150 | **kwargs, 151 | ) 152 | train_loss, tensors = decoder.get_forward_outputs() 153 | return train_loss, tensors 154 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_crf_ner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder 4 | from .bert_ner import BERTNER 5 | from .._base_._base_ner import NERModule 6 | from ..crf.crf import CRFDecoder, viterbi_decode 7 | from ... import com 8 | 9 | 10 | class BERTCRFNER(BERTNER, NERModule): 11 | """ Named entity recognization on BERT with CRF. 
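Illustrative usage sketch; the constructor is inherited from BERTNER and is assumed to accept `config_file`/`vocab_file` like the other modules, and `predict` is assumed to return the dict assembled in `_get_predict_outputs` below; arguments are hypothetical:

    import uf

    model = uf.BERTCRFNER("./ref/bert_config.json", "./ref/vocab.txt",
                          init_checkpoint="/path/to/finetuned/ckpt")   # hypothetical arguments
    outputs = model.predict(["李雷和韩梅梅在北京工作"])
    print(outputs["preds"])    # entity strings recovered from each input text
    print(outputs["logits"])   # per-token scores that are Viterbi-decoded with the CRF transition matrix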
""" 12 | 13 | def _forward(self, is_training, placeholders, **kwargs): 14 | 15 | encoder = BERTEncoder( 16 | bert_config=self.bert_config, 17 | is_training=is_training, 18 | input_ids=placeholders["input_ids"], 19 | input_mask=placeholders["input_mask"], 20 | segment_ids=placeholders["segment_ids"], 21 | **kwargs, 22 | ) 23 | encoder_output = encoder.get_sequence_output() 24 | decoder = CRFDecoder( 25 | is_training=is_training, 26 | input_tensor=encoder_output, 27 | input_mask=placeholders["input_mask"], 28 | label_ids=placeholders["label_ids"], 29 | label_size=5, 30 | sample_weight=placeholders.get("sample_weight"), 31 | scope="cls/sequence", 32 | **kwargs, 33 | ) 34 | train_loss, tensors = decoder.get_forward_outputs() 35 | return train_loss, tensors 36 | 37 | def _get_fit_ops(self, from_tfrecords=False): 38 | ops = [self.tensors["logits"], self.tensors["transition_matrix"], self.tensors["losses"]] 39 | if from_tfrecords: 40 | ops.extend([self.placeholders["input_mask"], self.placeholders["label_ids"]]) 41 | return ops 42 | 43 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 44 | 45 | if from_tfrecords: 46 | batch_mask = output_arrays[-2] 47 | batch_labels = output_arrays[-1] 48 | else: 49 | batch_mask = feed_dict[self.placeholders["input_mask"]] 50 | batch_labels = feed_dict[self.placeholders["label_ids"]] 51 | 52 | # f1 53 | batch_logits = output_arrays[0] 54 | batch_transition_matrix = output_arrays[1] 55 | batch_input_length = np.sum(batch_mask, axis=-1) 56 | batch_preds = [] 57 | for logit, seq_len in zip(batch_logits, batch_input_length): 58 | viterbi_seq, _ = viterbi_decode(logit[:seq_len], batch_transition_matrix) 59 | batch_preds.append(viterbi_seq) 60 | f1_token, f1_entity = self._get_f1(batch_preds, batch_labels, batch_mask) 61 | 62 | # loss 63 | batch_losses = output_arrays[2] 64 | loss = np.mean(batch_losses) 65 | 66 | info = "" 67 | info += ", f1/token %.4f" % f1_token 68 | info += ", f1/entity %.4f" % f1_entity 69 | info += ", loss %.6f" % loss 70 | 71 | return info 72 | 73 | def _get_predict_ops(self): 74 | return [self.tensors["logits"], self.tensors["transition_matrix"]] 75 | 76 | def _get_predict_outputs(self, output_arrays, n_inputs): 77 | 78 | # preds 79 | logits = com.transform(output_arrays[0], n_inputs) 80 | transition_matrix = output_arrays[1][0] 81 | tokens = self.data[com.BACKUP_DATA + "input_tokens"] 82 | mask = self.data["input_mask"] 83 | text = self.data[com.BACKUP_DATA + "X_target"] 84 | tokenized = self.data[com.BACKUP_DATA + "tokenized"][0] 85 | preds = [] 86 | for i in range(len(logits)): 87 | _logits = logits[i] 88 | _tokens = tokens[i] 89 | _mask = mask[i] 90 | _text = text[i] 91 | 92 | _input_length = int(np.sum(_mask)) 93 | _viterbi_seq, _ = viterbi_decode(_logits[:_input_length], transition_matrix) 94 | _entities = self._get_entities(_viterbi_seq) 95 | _preds = [] 96 | if not _entities: 97 | preds.append(_preds) 98 | continue 99 | 100 | if not tokenized: 101 | if isinstance(_text, list): 102 | _text = " ".join(_text) 103 | _mapping_start, _mapping_end = com.align_tokens_with_text(_tokens, _text, self._do_lower_case) 104 | 105 | for _entity in _entities: 106 | _start, _end = _entity[0], _entity[1] 107 | if tokenized: 108 | _entity_tokens = _tokens[_start: _end + 1] 109 | _preds.append(_entity_tokens) 110 | else: 111 | try: 112 | _text_start = _mapping_start[_start] 113 | _text_end = _mapping_end[_end] 114 | except Exception: 115 | continue 116 | _entity_text = _text[_text_start: _text_end] 117 | _preds.append(_entity_text) 
118 | preds.append(_preds) 119 | 120 | # probs 121 | probs = logits 122 | 123 | outputs = {} 124 | outputs["preds"] = preds 125 | outputs["logits"] = probs 126 | 127 | return outputs 128 | 129 | def _get_score_ops(self): 130 | return [self.tensors["logits"], self.tensors["transition_matrix"], self.tensors["losses"]] 131 | 132 | def _get_score_outputs(self, output_arrays, n_inputs): 133 | 134 | # f1 135 | logits = com.transform(output_arrays[0], n_inputs) 136 | transition_matrix = output_arrays[1][0] 137 | mask = self.data["input_mask"] 138 | labels = self.data["label_ids"] 139 | input_length = np.sum(mask, axis=-1) 140 | preds = [] 141 | for logit, seq_len in zip(logits, input_length): 142 | viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_matrix) 143 | preds.append(viterbi_seq) 144 | f1_token, f1_entity = self._get_f1(preds, labels, mask) 145 | 146 | # loss 147 | losses = com.transform(output_arrays[2], n_inputs) 148 | loss = np.mean(losses) 149 | 150 | outputs = {} 151 | outputs["f1/token"] = f1_token 152 | outputs["f1/entity"] = f1_entity 153 | outputs["loss"] = loss 154 | 155 | return outputs 156 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_seq_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTSeqClassifier(SeqClassifierModule): 11 | """ Sequence labeling classifier on BERT. """ 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 46 | self._assert_legal(X, y, sample_weight, X_tokenized) 47 | 48 | if is_training: 49 | assert y is not None, "`y` can't be None." 50 | if is_parallel: 51 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 
52 | 53 | n_inputs = None 54 | data = {} 55 | 56 | # convert X 57 | if X is not None or X_tokenized is not None: 58 | tokenized = False if X is not None else X_tokenized 59 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 60 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 61 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 62 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | if y is not None: 69 | # convert y and sample_weight 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | input_ids = [] 82 | input_mask = [] 83 | segment_ids = [] 84 | 85 | # tokenize input texts 86 | for idx, sample in enumerate(X_target): 87 | _input_tokens = self._convert_x(sample, tokenized) 88 | 89 | com.truncate_segments([_input_tokens], self.max_seq_length, truncate_method=self.truncate_method) 90 | 91 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 92 | _input_mask = [1 for _ in range(len(_input_tokens))] 93 | _segment_ids = [0 for _ in range(len(_input_tokens))] 94 | 95 | # padding 96 | for _ in range(self.max_seq_length - len(_input_ids)): 97 | _input_ids.append(0) 98 | _input_mask.append(0) 99 | _segment_ids.append(0) 100 | 101 | input_ids.append(_input_ids) 102 | input_mask.append(_input_mask) 103 | segment_ids.append(_segment_ids) 104 | 105 | return input_ids, input_mask, segment_ids 106 | 107 | def _set_placeholders(self, **kwargs): 108 | self.placeholders = { 109 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 110 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 111 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 112 | "label_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "label_ids"), 113 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 114 | } 115 | 116 | def _forward(self, is_training, placeholders, **kwargs): 117 | 118 | encoder = BERTEncoder( 119 | bert_config=self.bert_config, 120 | is_training=is_training, 121 | input_ids=placeholders["input_ids"], 122 | input_mask=placeholders["input_mask"], 123 | segment_ids=placeholders["segment_ids"], 124 | **kwargs, 125 | ) 126 | encoder_output = encoder.get_sequence_output() 127 | decoder = SeqClsDecoder( 128 | is_training=is_training, 129 | input_tensor=encoder_output, 130 | input_mask=placeholders["input_mask"], 131 | label_ids=placeholders["label_ids"], 132 | label_size=self.label_size, 133 | sample_weight=placeholders.get("sample_weight"), 134 | scope="cls/sequence", 135 | **kwargs, 136 | ) 137 | train_loss, tensors = decoder.get_forward_outputs() 138 | return train_loss, tensors 139 | -------------------------------------------------------------------------------- /uf/apps/chatbot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/chatbot/__init__.py 
-------------------------------------------------------------------------------- /uf/apps/chatbot/chatbot_mt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .chatbot import Chatbot 4 | from ..transformer.transformer_mt import TransformerMT 5 | from .._base_._base_mt import MTModule 6 | 7 | 8 | class ChatbotMT(TransformerMT, MTModule): 9 | """ Chatbot. """ 10 | 11 | def _forward(self, is_training, placeholders, **kwargs): 12 | 13 | model = Chatbot( 14 | vocab_size=len(self.tokenizer.vocab), 15 | is_training=is_training, 16 | source_ids=placeholders["source_ids"], 17 | target_ids=placeholders["target_ids"], 18 | sos_id=self.tokenizer.convert_tokens_to_ids([""])[0], 19 | sample_weight=placeholders.get("sample_weight"), 20 | hidden_size=self._hidden_size, 21 | num_blocks=self._num_hidden_layers, 22 | num_attention_heads=self._num_attention_heads, 23 | **kwargs, 24 | ) 25 | self.transition_matrix = model.transition_matrix 26 | train_loss, tensors = model.get_forward_outputs() 27 | return train_loss, tensors 28 | -------------------------------------------------------------------------------- /uf/apps/crf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/crf/__init__.py -------------------------------------------------------------------------------- /uf/apps/dilated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/dilated/__init__.py -------------------------------------------------------------------------------- /uf/apps/electra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/electra/__init__.py -------------------------------------------------------------------------------- /uf/apps/electra/electra_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 2 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRABinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 9 | """ Multi-label classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | label_weight=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.label_weight = label_weight 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def _forward(self, is_training, placeholders, **kwargs): 46 | 47 | encoder = BERTEncoder( 48 | bert_config=self.bert_config, 49 | is_training=is_training, 50 | input_ids=placeholders["input_ids"], 51 | input_mask=placeholders["input_mask"], 52 | segment_ids=placeholders["segment_ids"], 53 | scope="electra", 54 | drop_pooler=True, 55 | **kwargs, 56 | ) 57 | encoder_output = encoder.get_pooled_output() 58 | decoder = BinaryClsDecoder( 59 | is_training=is_training, 60 | input_tensor=encoder_output, 61 | label_ids=placeholders["label_ids"], 62 | label_size=self.label_size, 63 | sample_weight=placeholders.get("sample_weight"), 64 | label_weight=self.label_weight, 65 | scope="cls/seq_relationship", 66 | **kwargs, 67 | ) 68 | train_loss, tensors = decoder.get_forward_outputs() 69 | return train_loss, tensors 70 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 2 | from ..bert.bert_classifier import BERTClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRAClassifier(BERTClassifier, ClassifierModule): 9 | """ Single-label classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.bert_config = BERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.bert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = BERTEncoder( 46 | bert_config=self.bert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | scope="electra", 52 | drop_pooler=True, 53 | **kwargs, 54 | ) 55 | encoder_output = encoder.get_pooled_output() 56 | decoder = ClsDecoder( 57 | is_training=is_training, 58 | input_tensor=encoder_output, 59 | label_ids=placeholders["label_ids"], 60 | label_size=self.label_size, 61 | sample_weight=placeholders.get("sample_weight"), 62 | scope="cls/seq_relationship", 63 | **kwargs, 64 | ) 65 | train_loss, tensors = decoder.get_forward_outputs() 66 | return train_loss, tensors 67 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_mrc.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_mrc import MRCDecoder, MRCModule 2 | from ..bert.bert_mrc import BERTMRC 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRAMRC(BERTMRC, MRCModule): 9 | """ Machine reading comprehension on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=256, 16 | init_checkpoint=None, 17 | output_dir=None, 18 | gpu_ids=None, 19 | do_lower_case=True, 20 | truncate_method="longer-FO", 21 | ): 22 | self.__init_args__ = locals() 23 | super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 24 | 25 | self.max_seq_length = max_seq_length 26 | self.truncate_method = truncate_method 27 | self._do_lower_case = do_lower_case 28 | 29 | self.bert_config = BERTConfig.from_json_file(config_file) 30 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 31 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 32 | 33 | if "[CLS]" not in self.tokenizer.vocab: 34 | self.tokenizer.add("[CLS]") 35 | self.bert_config.vocab_size += 1 36 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 37 | if "[SEP]" not in self.tokenizer.vocab: 38 | self.tokenizer.add("[SEP]") 39 | self.bert_config.vocab_size += 1 40 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 41 | 42 | def _forward(self, is_training, placeholders, **kwargs): 43 | 44 | encoder = BERTEncoder( 45 | bert_config=self.bert_config, 46 | is_training=is_training, 47 | input_ids=placeholders["input_ids"], 48 | input_mask=placeholders["input_mask"], 49 | segment_ids=placeholders["segment_ids"], 50 | scope="electra", 51 | drop_pooler=True, 52 | **kwargs, 53 | ) 54 | encoder_output = encoder.get_sequence_output() 55 | decoder = MRCDecoder( 56 | is_training=is_training, 57 | input_tensor=encoder_output, 58 | label_ids=placeholders["label_ids"], 59 | sample_weight=placeholders.get("sample_weight"), 60 | scope="mrc", 61 | **kwargs, 62 | ) 63 | train_loss, tensors = decoder.get_forward_outputs() 64 | return train_loss, tensors 65 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 2 | from ..bert.bert_seq_classifier import BERTSeqClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRASeqClassifier(BERTSeqClassifier, SeqClassifierModule): 9 | """ Sequence labeling classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.bert_config = BERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.bert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = BERTEncoder( 46 | bert_config=self.bert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | scope="electra", 52 | drop_pooler=True, 53 | **kwargs, 54 | ) 55 | encoder_output = encoder.get_sequence_output() 56 | decoder = SeqClsDecoder( 57 | is_training=is_training, 58 | input_tensor=encoder_output, 59 | input_mask=placeholders["input_mask"], 60 | label_ids=placeholders["label_ids"], 61 | label_size=self.label_size, 62 | sample_weight=placeholders.get("sample_weight"), 63 | scope="cls/sequence", 64 | **kwargs, 65 | ) 66 | train_loss, tensors = decoder.get_forward_outputs() 67 | return train_loss, tensors 68 | -------------------------------------------------------------------------------- /uf/apps/fastbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/fastbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/gpt2/__init__.py -------------------------------------------------------------------------------- /uf/apps/motian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/motian/__init__.py -------------------------------------------------------------------------------- /uf/apps/motian/motian_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .motian import MotianEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class MotianClassifier(ClassifierModule): 11 | """ Single-label classifier on Motian. 
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 46 | self._assert_legal(X, y, sample_weight, X_tokenized) 47 | 48 | if is_training: 49 | assert y is not None, "`y` can't be None." 50 | if is_parallel: 51 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 52 | 53 | n_inputs = None 54 | data = {} 55 | 56 | # convert X 57 | if X is not None or X_tokenized is not None: 58 | tokenized = False if X is not None else X_tokenized 59 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 60 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 61 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 62 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | input_mask = [] 92 | segment_ids = [] 93 | for idx, segments in enumerate(segment_input_tokens): 94 | _input_tokens = ["[CLS]"] 95 | _input_ids = [] 96 | _input_mask = [1] 97 | _segment_ids = [0] 98 | 99 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 100 | for s_id, segment in enumerate(segments): 101 | _segment_id = min(s_id, 1) 102 | _input_tokens.extend(segment + ["[SEP]"]) 103 | _input_mask.extend([1] * (len(segment) + 1)) 104 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 105 | 106 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 107 | 108 | # padding 109 | for _ in range(self.max_seq_length - len(_input_ids)): 110 | _input_ids.append(0) 111 | _input_mask.append(0) 112 | _segment_ids.append(0) 113 | 114 | input_ids.append(_input_ids) 115 | input_mask.append(_input_mask) 116 | segment_ids.append(_segment_ids) 117 | 118 | return input_ids, input_mask, segment_ids 119 | 120 | def _set_placeholders(self, **kwargs): 121 | self.placeholders = { 122 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 123 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 124 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 125 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 126 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 127 | } 128 | 129 | def _forward(self, is_training, placeholders, **kwargs): 130 | 131 | encoder = MotianEncoder( 132 | config=self.bert_config, 133 | is_training=is_training, 134 | input_ids=placeholders["input_ids"], 135 | input_mask=placeholders["input_mask"], 136 | token_type_ids=placeholders["segment_ids"], 137 | **kwargs, 138 | ) 139 | encoder_output = encoder.get_pooled_output() 140 | decoder = ClsDecoder( 141 | is_training=is_training, 142 | input_tensor=encoder_output, 143 | label_ids=placeholders["label_ids"], 144 | label_size=self.label_size, 145 | sample_weight=placeholders.get("sample_weight"), 146 | scope="cls/seq_relationship", 147 | **kwargs, 148 | ) 149 | train_loss, tensors = decoder.get_forward_outputs() 150 | return train_loss, tensors 151 | -------------------------------------------------------------------------------- /uf/apps/nasnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/nasnet/__init__.py -------------------------------------------------------------------------------- /uf/apps/nasnet/pnasnet_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | from .pnasnet import build_pnasnet_mobile, build_pnasnet_large, get_decay_power 5 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 6 | from ...third import tf 7 | 8 | 9 | class PNasNetClassifier(ClassifierModule): 10 | """ Single-label classifier on PNasNet. 
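Illustrative usage sketch; it assumes re-export via `uf/__init__.py` and the shared `fit`/`predict` interface. Inputs are raw image arrays: convert() below casts them to uint8 and resizes every channel to the resolution implied by `model_size` (224 for "mobile", 331 for "large"):

    import numpy as np
    import uf

    model = uf.PNasNetClassifier(label_size=10, model_size="mobile")
    X = [np.random.randint(0, 256, size=(300, 400, 3), dtype=np.uint8)]   # one HxWxC (NHWC) image per sample
    y = [3]
    model.fit(X, y)
    preds = model.predict(X)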
""" 11 | 12 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 13 | "label_size": "An integer that defines number of possible labels of outputs", 14 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 15 | } 16 | 17 | def __init__( 18 | self, 19 | label_size=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | model_size="large", 24 | data_format="NHWC", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.label_size = label_size 30 | self.model_size = model_size 31 | self.data_format = data_format 32 | 33 | assert model_size in ("mobile", "large"), (f"Invalid `model_size`: {model_size}. Pick one from \"mobile\" and \"large\".") 34 | assert data_format in ("NHWC", "NCHW"), (f"Unsupported `data_format`: {data_format}. Piack one from \"NHWC\" and \"NCHW\"") 35 | self._image_size = 224 if model_size == "mobile" else 331 36 | self.decay_power = get_decay_power(model_size) 37 | 38 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 39 | self._assert_legal(X, y, sample_weight, X_tokenized) 40 | 41 | assert not X_tokenized, "%s does not support text input." % self.__class__.__name__ 42 | if is_training: 43 | assert y is not None, "`y` can't be None." 44 | if is_parallel: 45 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 46 | 47 | n_inputs = None 48 | data = {} 49 | 50 | # convert X 51 | if X is not None: 52 | input_ids = self._convert_X(X) 53 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 54 | n_inputs = len(input_ids) 55 | 56 | if n_inputs < self.batch_size: 57 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 58 | 59 | # convert y 60 | if y is not None: 61 | label_ids = self._convert_y(y) 62 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 63 | 64 | # convert sample_weight 65 | if is_training or y is not None: 66 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 67 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 68 | 69 | return data 70 | 71 | def _convert_X(self, X): 72 | 73 | # convert to numpy array 74 | image_arrays = [] 75 | for idx, sample in enumerate(X): 76 | try: 77 | image_arrays.append(self._convert_x(sample)) 78 | except Exception as e: 79 | raise ValueError("Wrong input format (image %d): %s." 
% (idx, e)) 80 | 81 | return np.array(image_arrays) 82 | 83 | def _convert_x(self, x): 84 | 85 | # format 86 | x = np.array(x).astype(np.uint8) 87 | 88 | # interpolate 89 | if self.data_format == "NHWC": 90 | x = np.array([ 91 | np.asarray(Image.fromarray(x[:, :, k]).resize((self._image_size, self._image_size))) 92 | for k in range(3) 93 | ]) 94 | elif self.data_format == "NCHW": 95 | x = np.array([ 96 | np.asarray(Image.fromarray(x[k, :, :]).resize((self._image_size, self._image_size))) 97 | for k in range(3) 98 | ]) 99 | 100 | # transpose 101 | x = np.transpose(x, [1, 2, 0]) 102 | 103 | return x 104 | 105 | def _set_placeholders(self, **kwargs): 106 | self.placeholders = { 107 | "input_ids": tf.placeholder(tf.float32, [None, self._image_size, self._image_size, 3], "input_ids"), 108 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 109 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 110 | } 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | if self.model_size == "mobile": 115 | logits, _ = build_pnasnet_mobile( 116 | images=placeholders["input_ids"], num_classes=self.label_size, 117 | is_training=is_training, final_endpoint=None, 118 | ) 119 | elif self.model_size == "large": 120 | logits, _ = build_pnasnet_large( 121 | images=placeholders["input_ids"], num_classes=self.label_size, 122 | is_training=is_training, final_endpoint=None, 123 | ) 124 | decoder = ClsDecoder( 125 | is_training, 126 | input_tensor=logits, 127 | label_ids=placeholders["label_ids"], 128 | is_logits=True, 129 | label_size=self.label_size, 130 | sample_weight=placeholders.get("sample_weight"), 131 | ) 132 | train_loss, tensors = decoder.get_forward_outputs() 133 | return train_loss, tensors 134 | -------------------------------------------------------------------------------- /uf/apps/performer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/performer/__init__.py -------------------------------------------------------------------------------- /uf/apps/performer/performer_classifier.py: -------------------------------------------------------------------------------- 1 | from .performer import PerformerEncoder 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ..bert.bert import BERTConfig, get_decay_power 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | 8 | 9 | class PerformerClassifier(BERTClassifier, ClassifierModule): 10 | """ Single-label classifier on Performer. 
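Illustrative construction sketch; the BERT-format config/vocab paths are placeholders, and `fit`/`predict` are assumed to follow BERTClassifier, whose data pipeline this class inherits:

    import uf

    model = uf.PerformerClassifier(
        config_file="./ref/bert_config.json",
        vocab_file="./ref/vocab.txt",
        label_size=2,
        kernel_transformation="relu",   # passed straight through to PerformerEncoder
        nb_random_features=128,         # illustrative; the __init__ default below is 1
    )
    model.fit(["good", "bad"], [1, 0])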
""" 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | kernel_transformation="relu", 22 | nb_random_features=1, 23 | drop_pooler=False, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.truncate_method = truncate_method 33 | self._drop_pooler = drop_pooler 34 | self._kernel_transformation = kernel_transformation 35 | self._nb_random_features = nb_random_features 36 | 37 | self.bert_config = BERTConfig.from_json_file(config_file) 38 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 39 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 40 | 41 | if "[CLS]" not in self.tokenizer.vocab: 42 | self.tokenizer.add("[CLS]") 43 | self.bert_config.vocab_size += 1 44 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 45 | if "[SEP]" not in self.tokenizer.vocab: 46 | self.tokenizer.add("[SEP]") 47 | self.bert_config.vocab_size += 1 48 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 49 | 50 | def _forward(self, is_training, placeholders, **kwargs): 51 | 52 | encoder = PerformerEncoder( 53 | bert_config=self.bert_config, 54 | is_training=is_training, 55 | input_ids=placeholders["input_ids"], 56 | input_mask=placeholders["input_mask"], 57 | segment_ids=placeholders["segment_ids"], 58 | kernel_transformation=self._kernel_transformation, 59 | nb_random_features=self._nb_random_features, 60 | drop_pooler=self._drop_pooler, 61 | **kwargs, 62 | ) 63 | encoder_output = encoder.get_pooled_output() 64 | decoder = ClsDecoder( 65 | is_training=is_training, 66 | input_tensor=encoder_output, 67 | label_ids=placeholders["label_ids"], 68 | label_size=self.label_size, 69 | sample_weight=placeholders.get("sample_weight"), 70 | scope="cls/seq_relationship", 71 | **kwargs, 72 | ) 73 | train_loss, tensors = decoder.get_forward_outputs() 74 | return train_loss, tensors 75 | -------------------------------------------------------------------------------- /uf/apps/recbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/recbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/retroreader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/retroreader/__init__.py -------------------------------------------------------------------------------- /uf/apps/rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/rnn/__init__.py -------------------------------------------------------------------------------- /uf/apps/rnn/bi_rnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import rnn 2 | from tensorflow.python.ops import rnn_cell 3 | 4 | from .._base_._base_ import BaseEncoder 5 | from .. 
import util 6 | from ...third import tf 7 | 8 | 9 | class BiRNNEncoder(BaseEncoder): 10 | def __init__( 11 | self, 12 | is_training, 13 | input_ids, 14 | seq_length, 15 | vocab_size, 16 | rnn_core="lstm", 17 | hidden_size=128, 18 | scope="rnn", 19 | trainable=True, 20 | **kwargs, 21 | ): 22 | dropout_rate = 0.0 23 | if is_training: 24 | dropout_rate = 0.1 25 | half_hidden_size = hidden_size // 2 26 | self.rnn_core = rnn_core 27 | 28 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 29 | batch_size = input_shape[0] 30 | max_seq_length = input_shape[1] 31 | 32 | 33 | with tf.variable_scope(scope): 34 | 35 | # embedding 36 | embedding_output, _ = util.embedding_lookup( 37 | input_ids=input_ids, 38 | vocab_size=vocab_size, 39 | batch_size=batch_size, 40 | max_seq_length=max_seq_length, 41 | embeddings=kwargs.get("tilda_embeddings"), 42 | embedding_size=hidden_size, 43 | word_embedding_name="word_embeddings", 44 | trainable=trainable, 45 | ) 46 | 47 | # rnn core 48 | if rnn_core == "rnn": 49 | cell_fw = rnn_cell.BasicRNNCell(num_units=half_hidden_size, trainable=trainable) 50 | cell_bw = rnn_cell.BasicRNNCell(num_units=half_hidden_size, trainable=trainable) 51 | elif rnn_core == "lstm": 52 | cell_fw = rnn_cell.LSTMCell(num_units=half_hidden_size, trainable=trainable) 53 | cell_bw = rnn_cell.LSTMCell(num_units=half_hidden_size, trainable=trainable) 54 | elif rnn_core == "gru": 55 | cell_fw = rnn_cell.GRUCell(num_units=half_hidden_size, trainable=trainable) 56 | cell_bw = rnn_cell.GRUCell(num_units=half_hidden_size, trainable=trainable) 57 | dropout_cell_fw = rnn_cell.DropoutWrapper(cell_fw, state_keep_prob=1 - dropout_rate) 58 | dropout_cell_bw = rnn_cell.DropoutWrapper(cell_bw, state_keep_prob=1 - dropout_rate) 59 | 60 | # inputs: [batch_size, max_seq_length, hidden_size] 61 | # outputs: ([batch_size, max_seq_length, half_hidden_size], [batch_size, max_seq_length, half_hidden_size]) 62 | outputs, self.last_states = rnn.bidirectional_dynamic_rnn( 63 | cell_fw=dropout_cell_fw, 64 | cell_bw=dropout_cell_bw, 65 | inputs=embedding_output, 66 | sequence_length=seq_length, 67 | dtype=tf.float32, 68 | ) 69 | self.outputs = tf.concat(outputs, axis=2) 70 | 71 | def get_pooled_output(self): 72 | return self.outputs[:, 0, :] 73 | 74 | def get_sequence_output(self): 75 | return self.outputs 76 | 77 | 78 | def get_decay_power(): 79 | return { 80 | "word_embeddings": 2, 81 | "/bidirectional_rnn/": 1, 82 | "cls/": 0, 83 | } -------------------------------------------------------------------------------- /uf/apps/rnn/bi_rnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bi_rnn import BiRNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BiRNNClassifier(ClassifierModule): 11 | """ Single-label classifier on bidirectional RNN/LSTM/GRU. 
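Illustrative usage sketch; it assumes re-export via `uf/__init__.py` and the shared `fit`/`predict` interface. No config file is needed because the encoder hyper-parameters are passed directly:

    import uf

    model = uf.BiRNNClassifier("./ref/vocab.txt", label_size=2, rnn_core="gru", hidden_size=256)
    X = ["not bad at all", "would not recommend"]
    y = [1, 0]
    model.fit(X, y)
    print(model.predict(X))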
""" 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | rnn_core="lstm", 22 | hidden_size=256, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._rnn_core = rnn_core 33 | self._hidden_size = hidden_size 34 | 35 | assert rnn_core in ("rnn", "lstm", "gru"), (f"Invalid `rnn_core`: {rnn_core}. Pick one from \"rnn\", \"lstm\" and \"gru\".") 36 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 37 | self.decay_power = get_decay_power() 38 | 39 | if "[CLS]" not in self.tokenizer.vocab: 40 | self.tokenizer.add("[CLS]") 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 45 | 46 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 47 | self._assert_legal(X, y, sample_weight, X_tokenized) 48 | 49 | if is_training: 50 | assert y is not None, "`y` can't be None." 51 | if is_parallel: 52 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 53 | 54 | n_inputs = None 55 | data = {} 56 | 57 | # convert X 58 | if X is not None or X_tokenized is not None: 59 | tokenized = False if X is not None else X_tokenized 60 | input_ids, seq_length = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 61 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 62 | data["seq_length"] = np.array(seq_length, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | seq_length = [] 92 | for idx, segments in enumerate(segment_input_tokens): 93 | _input_tokens = ["[CLS]"] 94 | _input_ids = [] 95 | _seq_length = 0 96 | 97 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 98 | for segment in segments: 99 | _input_tokens.extend(segment + ["[SEP]"]) 100 | 101 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 102 | _seq_length = len(_input_ids) 103 | 104 | # padding 105 | _input_ids += [0] * (self.max_seq_length - len(_input_ids)) 106 | 107 | input_ids.append(_input_ids) 108 | seq_length.append(_seq_length) 109 | 110 | return input_ids, seq_length 111 | 112 | def _set_placeholders(self, **kwargs): 113 | self.placeholders = { 114 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 115 | "seq_length": tf.placeholder(tf.int32, [None], "seq_length"), 116 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 117 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 118 | } 119 | 120 | def _forward(self, is_training, placeholders, **kwargs): 121 | 122 | encoder = BiRNNEncoder( 123 | is_training=is_training, 124 | input_ids=placeholders["input_ids"], 125 | seq_length=placeholders["seq_length"], 126 | vocab_size=len(self.tokenizer.vocab), 127 | rnn_core=self._rnn_core, 128 | hidden_size=self._hidden_size, 129 | scope=self._rnn_core, 130 | trainable=True, 131 | **kwargs, 132 | ) 133 | encoder_output = encoder.get_pooled_output() 134 | decoder = ClsDecoder( 135 | is_training=is_training, 136 | input_tensor=encoder_output, 137 | label_ids=placeholders["label_ids"], 138 | label_size=self.label_size, 139 | sample_weight=placeholders.get("sample_weight"), 140 | scope="cls/seq_relationship", 141 | **kwargs, 142 | ) 143 | train_loss, tensors = decoder.get_forward_outputs() 144 | return train_loss, tensors 145 | -------------------------------------------------------------------------------- /uf/apps/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import rnn 2 | from tensorflow.python.ops import rnn_cell 3 | 4 | from .._base_._base_ import BaseEncoder 5 | from .. 
import util 6 | from ...third import tf 7 | 8 | 9 | class RNNEncoder(BaseEncoder): 10 | def __init__( 11 | self, 12 | is_training, 13 | input_ids, 14 | seq_length, 15 | vocab_size, 16 | rnn_core="lstm", 17 | hidden_size=128, 18 | scope="rnn", 19 | trainable=True, 20 | **kwargs, 21 | ): 22 | dropout_rate = 0.0 23 | if is_training: 24 | dropout_rate = 0.1 25 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 26 | batch_size = input_shape[0] 27 | max_seq_length = input_shape[1] 28 | 29 | self.rnn_core = rnn_core 30 | 31 | with tf.variable_scope(scope): 32 | 33 | # embedding 34 | embedding_output, _ = util.embedding_lookup( 35 | input_ids=input_ids, 36 | vocab_size=vocab_size, 37 | batch_size=batch_size, 38 | max_seq_length=max_seq_length, 39 | embeddings=kwargs.get("tilda_embeddings"), 40 | embedding_size=hidden_size, 41 | word_embedding_name="word_embeddings", 42 | trainable=trainable, 43 | ) 44 | 45 | # rnn core 46 | if rnn_core == "rnn": 47 | cell = rnn_cell.BasicRNNCell(num_units=hidden_size, trainable=trainable) 48 | elif rnn_core == "lstm": 49 | cell = rnn_cell.LSTMCell(num_units=hidden_size, trainable=trainable) 50 | elif rnn_core == "gru": 51 | cell = rnn_cell.GRUCell(num_units=hidden_size, trainable=trainable) 52 | dropout_cell = rnn_cell.DropoutWrapper(cell, state_keep_prob=1 - dropout_rate) 53 | 54 | # inputs: [batch_size, max_seq_length, hidden_size] 55 | # outputs: [batch_size, max_seq_length, hidden_size] 56 | self.outputs, self.last_states = rnn.dynamic_rnn( 57 | cell=dropout_cell, 58 | inputs=embedding_output, 59 | sequence_length=seq_length, 60 | dtype=tf.float32, 61 | ) 62 | 63 | def get_pooled_output(self): 64 | if self.rnn_core == "lstm": 65 | return self.last_states[-1] # ([batch_size, hidden_size], [batch_size, hidden_size]) 66 | return self.last_states # [batch_size, hidden_size] 67 | 68 | def get_sequence_output(self): 69 | return self.outputs 70 | 71 | 72 | def get_decay_power(): 73 | return { 74 | "word_embeddings": 2, 75 | "/rnn/": 1, 76 | "cls/": 0, 77 | } -------------------------------------------------------------------------------- /uf/apps/rnn/rnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .rnn import RNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class RNNClassifier(ClassifierModule): 11 | """ Single-label classifier on RNN/LSTM/GRU. """ 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | rnn_core="lstm", 22 | hidden_size=128, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._rnn_core = rnn_core 33 | self._hidden_size = hidden_size 34 | 35 | assert rnn_core in ("rnn", "lstm", "gru"), (f"Invalid `rnn_core`: {rnn_core}. 
Pick one from \"rnn\", \"lstm\" and \"gru\".") 36 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 37 | self.decay_power = get_decay_power() 38 | 39 | if "[SEP]" not in self.tokenizer.vocab: 40 | self.tokenizer.add("[SEP]") 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 44 | self._assert_legal(X, y, sample_weight, X_tokenized) 45 | 46 | if is_training: 47 | assert y is not None, "`y` can't be None." 48 | if is_parallel: 49 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 50 | 51 | n_inputs = None 52 | data = {} 53 | 54 | # convert X 55 | if X is not None or X_tokenized is not None: 56 | tokenized = False if X is not None else X_tokenized 57 | input_ids, seq_length = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 58 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 59 | data["seq_length"] = np.array(seq_length, dtype=np.int32) 60 | n_inputs = len(input_ids) 61 | 62 | if n_inputs < self.batch_size: 63 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 64 | 65 | # convert y 66 | if y is not None: 67 | label_ids = self._convert_y(y) 68 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 69 | 70 | # convert sample_weight 71 | if is_training or y is not None: 72 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 73 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 74 | 75 | return data 76 | 77 | def _convert_X(self, X_target, tokenized): 78 | 79 | # tokenize input texts 80 | segment_input_tokens = [] 81 | for idx, sample in enumerate(X_target): 82 | try: 83 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 84 | except Exception as e: 85 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 86 | 87 | input_ids = [] 88 | seq_length = [] 89 | for idx, segments in enumerate(segment_input_tokens): 90 | _input_tokens = [] 91 | _input_ids = [] 92 | _seq_length = 0 93 | 94 | com.truncate_segments(segments, self.max_seq_length - len(segments), truncate_method=self.truncate_method) 95 | for segment in segments: 96 | _input_tokens.extend(segment + ["[SEP]"]) 97 | 98 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 99 | _seq_length = len(_input_ids) 100 | 101 | # padding 102 | _input_ids += [0] * (self.max_seq_length - len(_input_ids)) 103 | 104 | input_ids.append(_input_ids) 105 | seq_length.append(_seq_length) 106 | 107 | return input_ids, seq_length 108 | 109 | def _set_placeholders(self, **kwargs): 110 | self.placeholders = { 111 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 112 | "seq_length": tf.placeholder(tf.int32, [None], "seq_length"), 113 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 114 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 115 | } 116 | 117 | def _forward(self, is_training, placeholders, **kwargs): 118 | 119 | encoder = RNNEncoder( 120 | is_training=is_training, 121 | input_ids=placeholders["input_ids"], 122 | seq_length=placeholders["seq_length"], 123 | vocab_size=len(self.tokenizer.vocab), 124 | rnn_core=self._rnn_core, 125 | hidden_size=self._hidden_size, 126 | scope=self._rnn_core, 127 | trainable=True, 128 | **kwargs, 129 | ) 130 | encoder_output = encoder.get_pooled_output() 131 | decoder = ClsDecoder( 132 | is_training=is_training, 133 | input_tensor=encoder_output, 134 | label_ids=placeholders["label_ids"], 135 | label_size=self.label_size, 136 | sample_weight=placeholders.get("sample_weight"), 137 | scope="cls/seq_relationship", 138 | **kwargs, 139 | ) 140 | train_loss, tensors = decoder.get_forward_outputs() 141 | return train_loss, tensors 142 | -------------------------------------------------------------------------------- /uf/apps/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/roberta/__init__.py -------------------------------------------------------------------------------- /uf/apps/roberta/roberta.py: -------------------------------------------------------------------------------- 1 | """ RoBERTa. """ 2 | 3 | 4 | def create_instances_from_document(all_documents, document_index, max_seq_length): 5 | document = all_documents[document_index] 6 | instances = [] 7 | 8 | current_chunk = [] 9 | current_length = 0 10 | i = 0 11 | while i < len(document): 12 | segment = document[i] 13 | current_chunk.extend(segment) 14 | current_length += len(segment) 15 | i += 1 16 | if current_length >= max_seq_length: 17 | instances.append([current_chunk]) 18 | current_chunk = [] 19 | current_length = 0 20 | if current_chunk: 21 | instances.append([current_chunk]) 22 | 23 | return instances 24 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_binary_classifier import BinaryClassifierModule 2 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 3 | 4 | 5 | class RoBERTaBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 6 | """ Multi-label classifier on RoBERTa. 
""" 7 | pass -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_classifier import ClassifierModule 2 | from ..bert.bert_classifier import BERTClassifier 3 | 4 | 5 | class RoBERTaClassifier(BERTClassifier, ClassifierModule): 6 | """ Single-label classifier on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_mrc.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_mrc import MRCModule 2 | from ..bert.bert_mrc import BERTMRC 3 | 4 | 5 | class RoBERTaMRC(BERTMRC, MRCModule): 6 | """ Machine reading comprehension on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_seq_classifier import SeqClassifierModule 2 | from ..bert.bert_seq_classifier import BERTSeqClassifier 3 | 4 | 5 | class RoBERTaSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 6 | """ Sequence labeling classifier on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/sanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sanet/__init__.py -------------------------------------------------------------------------------- /uf/apps/sembert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sembert/__init__.py -------------------------------------------------------------------------------- /uf/apps/sembert/sembert.py: -------------------------------------------------------------------------------- 1 | """ SemBERT decoder. """ 2 | 3 | from time import perf_counter 4 | from ...third import tf 5 | from .._base_._base_ import BaseDecoder 6 | from ..bert.bert import BERTEncoder 7 | from .. 
import util 8 | 9 | 10 | class SemBERTDecoder(BaseDecoder): 11 | def __init__(self, 12 | bert_config, 13 | is_training, 14 | input_tensor, 15 | input_mask, 16 | sem_features, 17 | label_ids, 18 | max_seq_length, 19 | feature_size, 20 | label_size=2, 21 | sample_weight=None, 22 | scope="cls/seq_relationship", 23 | hidden_dropout_prob=0.1, 24 | initializer_range=0.02, 25 | trainable=True, 26 | **kwargs): 27 | super().__init__(**kwargs) 28 | 29 | if kwargs.get("return_hidden"): 30 | self.tensors["hidden"] = input_tensor 31 | 32 | input_shape = util.get_shape_list(input_tensor) 33 | batch_size = input_shape[0] 34 | hidden_size = input_shape[-1] 35 | with tf.variable_scope("sem"): 36 | feature_embeddings = tf.get_variable( 37 | name="feature_embeddings", 38 | shape=[feature_size + 3, hidden_size], # for [PAD], [CLS], [SEP] 39 | initializer=util.create_initializer(initializer_range), 40 | trainable=trainable) 41 | sem_output = tf.gather( 42 | feature_embeddings, sem_features) # [B, N, H] 43 | 44 | attention_heads = [] 45 | with tf.variable_scope("self"): 46 | attention_mask = BERTEncoder.create_attention_mask_from_input_mask( 47 | input_mask, batch_size, max_seq_length) 48 | (attention_head, _) = BERTEncoder.attention_layer( 49 | from_tensor=sem_output, 50 | to_tensor=sem_output, 51 | attention_mask=attention_mask, 52 | num_attention_heads=bert_config.num_attention_heads, 53 | size_per_head=(hidden_size // bert_config.num_attention_heads), 54 | attention_probs_dropout_prob=hidden_dropout_prob if is_training else 0.0, 55 | initializer_range=initializer_range, 56 | do_return_2d_tensor=False, 57 | batch_size=batch_size, 58 | from_max_seq_length=max_seq_length, 59 | to_max_seq_length=max_seq_length, 60 | trainable=trainable) 61 | attention_heads.append(attention_head) 62 | 63 | if len(attention_heads) == 1: 64 | attention_output = attention_heads[0] 65 | else: 66 | attention_output = tf.concat(attention_heads, axis=-1) 67 | 68 | attention_output = attention_output[:, 0, :] # [B, H] 69 | input_tensor = util.layer_norm( 70 | attention_output + input_tensor, 71 | trainable=trainable) 72 | 73 | with tf.variable_scope(scope): 74 | output_weights = tf.get_variable( 75 | "output_weights", 76 | shape=[label_size, hidden_size], 77 | initializer=util.create_initializer(initializer_range), 78 | trainable=trainable) 79 | output_bias = tf.get_variable( 80 | "output_bias", 81 | shape=[label_size], 82 | initializer=tf.zeros_initializer(), 83 | trainable=trainable) 84 | 85 | output_layer = util.dropout( 86 | input_tensor, hidden_dropout_prob if is_training else 0.0) 87 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 88 | logits = tf.nn.bias_add(logits, output_bias) 89 | 90 | self.tensors["preds"] = tf.argmax(logits, axis=-1) 91 | self.tensors["probs"] = tf.nn.softmax(logits, axis=-1, name="probs") 92 | 93 | per_example_loss = util.cross_entropy(logits, label_ids, label_size, **kwargs) 94 | if sample_weight is not None: 95 | per_example_loss *= sample_weight 96 | 97 | self.tensors["losses"] = per_example_loss 98 | self.train_loss = tf.reduce_mean(per_example_loss) 99 | 100 | 101 | def get_decay_power(num_hidden_layers): 102 | decay_power = { 103 | "/embeddings": num_hidden_layers + 2, 104 | "sem/": 2, 105 | "/pooler/": 1, 106 | "cls/": 0, 107 | } 108 | for layer_idx in range(num_hidden_layers): 109 | decay_power["/layer_%d/" % layer_idx] = num_hidden_layers - layer_idx + 1 110 | return decay_power 111 | -------------------------------------------------------------------------------- 
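Note on the decay-power maps: the `get_decay_power()` helpers above (and their counterparts in the other app modules, e.g. rnn.py and textcnn.py earlier in this listing) only build a mapping from variable-name patterns to integer powers; the map is presumably consumed by the optimizer setup elsewhere in the package (e.g. uf/opt.py or uf/core.py, neither shown in this excerpt). As a rough illustration only, assuming a multiplicative layer-wise learning-rate decay of the form lr * decay_rate ** power (an assumption for clarity, not the verbatim library logic), the lookup could work like the sketch below; the helper name `layerwise_lr` and the decay_rate value are hypothetical.

def layerwise_lr(base_lr, decay_rate, decay_power, var_name):
    """Pick a per-variable learning rate from a decay-power map (illustrative sketch only)."""
    for pattern, power in decay_power.items():
        if pattern in var_name:
            # deeper layers get a larger power, hence a smaller effective learning rate
            return base_lr * (decay_rate ** power)
    return base_lr  # unmatched variables keep the base learning rate

# With the SemBERT map above and a hypothetical decay_rate of 0.85:
#   variables under "cls/"  -> power 0                     -> full learning rate
#   variables under "sem/"  -> power 2                     -> base_lr * 0.85 ** 2
#   embedding variables     -> power num_hidden_layers + 2 -> smallest learning rate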
/uf/apps/spe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/spe/__init__.py -------------------------------------------------------------------------------- /uf/apps/sqp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sqp/__init__.py -------------------------------------------------------------------------------- /uf/apps/stockbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/stockbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/stockbert/stockbert.py: -------------------------------------------------------------------------------- 1 | """ SemBERT decoder. """ 2 | 3 | import copy 4 | 5 | from ...third import tf 6 | from .._base_._base_ import BaseEncoder 7 | from ..bert.bert import BERTEncoder 8 | from .. import util 9 | 10 | 11 | class StockBERTEncoder(BERTEncoder, BaseEncoder): 12 | def __init__(self, 13 | bert_config, 14 | is_training, 15 | input_values, 16 | input_mask, 17 | scope="stock_bert", 18 | drop_pooler=False, 19 | trainable=True, 20 | **kwargs): 21 | 22 | bert_config = copy.deepcopy(bert_config) 23 | if not is_training: 24 | bert_config.hidden_dropout_prob = 0.0 25 | bert_config.attention_probs_dropout_prob = 0.0 26 | 27 | input_shape = util.get_shape_list(input_values, expected_rank=3) 28 | batch_size = input_shape[0] 29 | max_seq_length = input_shape[1] + 1 30 | 31 | with tf.variable_scope(scope): 32 | with tf.variable_scope("embeddings"): 33 | 34 | self.embedding_output = self.embedding_preprocessor( 35 | input_values=input_values, 36 | batch_size=batch_size, 37 | embedding_size=bert_config.hidden_size, 38 | initializer_range=bert_config.initializer_range, 39 | name="cls_embedding", 40 | trainable=trainable) 41 | 42 | # Add positional embeddings and token type embeddings 43 | # layer normalize and perform dropout. 
44 | self.embedding_output = self.embedding_postprocessor( 45 | input_tensor=self.embedding_output, 46 | batch_size=batch_size, 47 | max_seq_length=max_seq_length, 48 | hidden_size=bert_config.hidden_size, 49 | use_token_type=False, 50 | segment_ids=None, 51 | token_type_vocab_size=bert_config.type_vocab_size, 52 | token_type_embedding_name="token_type_embeddings", 53 | use_position_embeddings=True, 54 | position_embedding_name="position_embeddings", 55 | initializer_range=bert_config.initializer_range, 56 | max_position_embeddings=\ 57 | bert_config.max_position_embeddings, 58 | dropout_prob=bert_config.hidden_dropout_prob, 59 | trainable=trainable) 60 | 61 | with tf.variable_scope("encoder"): 62 | attention_mask = self.create_attention_mask_from_input_mask( 63 | input_mask, batch_size, max_seq_length) 64 | 65 | # stacked transformers 66 | self.all_encoder_layers = self.transformer_model( 67 | input_tensor=self.embedding_output, 68 | batch_size=batch_size, 69 | max_seq_length=max_seq_length, 70 | attention_mask=attention_mask, 71 | hidden_size=bert_config.hidden_size, 72 | num_hidden_layers=bert_config.num_hidden_layers, 73 | num_attention_heads=bert_config.num_attention_heads, 74 | intermediate_size=bert_config.intermediate_size, 75 | intermediate_act_fn=util.get_activation( 76 | bert_config.hidden_act), 77 | hidden_dropout_prob=bert_config.hidden_dropout_prob, 78 | attention_probs_dropout_prob=\ 79 | bert_config.attention_probs_dropout_prob, 80 | initializer_range=bert_config.initializer_range, 81 | trainable=trainable) 82 | 83 | self.sequence_output = self.all_encoder_layers[-1] 84 | with tf.variable_scope("pooler"): 85 | first_token_tensor = self.sequence_output[:, 0, :] 86 | 87 | # trick: ignore the fully connected layer 88 | if drop_pooler: 89 | self.pooled_output = first_token_tensor 90 | else: 91 | self.pooled_output = tf.layers.dense( 92 | first_token_tensor, 93 | bert_config.hidden_size, 94 | activation=tf.tanh, 95 | kernel_initializer=util.create_initializer( 96 | bert_config.initializer_range), 97 | trainable=trainable) 98 | 99 | def embedding_preprocessor(self, 100 | input_values, 101 | batch_size=None, 102 | embedding_size=128, 103 | initializer_range=0.02, 104 | name="cls_embedding", 105 | dtype=tf.float32, 106 | trainable=True): 107 | 108 | with tf.variable_scope(name): 109 | input_values = util.layer_norm( 110 | input_values, 111 | trainable=trainable) 112 | linear_output = tf.layers.dense( 113 | input_values, 114 | embedding_size, 115 | activation=None, 116 | name="dense", 117 | kernel_initializer=util.create_initializer(initializer_range), 118 | trainable=trainable) 119 | 120 | cls_embedding = tf.get_variable( 121 | name="cls", 122 | shape=[1, 1, embedding_size], 123 | initializer=util.create_initializer(initializer_range), 124 | dtype=dtype, 125 | trainable=trainable) 126 | cls_output = tf.tile(cls_embedding, [batch_size, 1, 1]) 127 | 128 | output = tf.concat([cls_output, linear_output], axis=1) 129 | return output 130 | -------------------------------------------------------------------------------- /uf/apps/textcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/textcnn/__init__.py -------------------------------------------------------------------------------- /uf/apps/textcnn/textcnn.py: -------------------------------------------------------------------------------- 1 | """ Convolutional neural network on texture analysis. 
""" 2 | 3 | from ...third import tf 4 | from .._base_._base_ import BaseEncoder 5 | from .. import util 6 | 7 | 8 | class TextCNNEncoder(BaseEncoder): 9 | def __init__(self, 10 | vocab_size, 11 | filter_sizes, 12 | num_channels, 13 | is_training, 14 | input_ids, 15 | scope="text_cnn", 16 | embedding_size=256, 17 | dropout_prob=0.1, 18 | trainable=True, 19 | **kwargs): 20 | 21 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 22 | batch_size = input_shape[0] 23 | max_seq_length = input_shape[1] 24 | 25 | if isinstance(filter_sizes, str): 26 | filter_sizes = filter_sizes.split(",") 27 | assert isinstance(filter_sizes, list), ( 28 | "`filter_sizes` should be a list of integers or a string " 29 | "seperated with commas.") 30 | 31 | with tf.variable_scope(scope): 32 | with tf.variable_scope("embeddings"): 33 | 34 | embedding_table = kwargs.get("tilda_embeddings") 35 | if embedding_table is None: 36 | embedding_table = tf.get_variable( 37 | name="word_embeddings", 38 | shape=[vocab_size, embedding_size], 39 | initializer=util.create_initializer(0.02), 40 | dtype=tf.float32, 41 | trainable=trainable) 42 | 43 | flat_input_ids = tf.reshape(input_ids, [-1]) 44 | output = tf.gather( 45 | embedding_table, flat_input_ids, name="embedding_look_up") 46 | output = tf.reshape( 47 | output, [batch_size, max_seq_length, embedding_size]) 48 | 49 | output_expanded = tf.expand_dims(output, -1) 50 | 51 | # Create a convolution + maxpool layer for each filter size 52 | pooled_outputs = [] 53 | for i, filter_size in enumerate(filter_sizes): 54 | with tf.variable_scope("conv_%s" % filter_size): 55 | 56 | # Convolution Layer 57 | W = tf.get_variable( 58 | name="W", 59 | shape=[int(filter_size), embedding_size, 1, num_channels], 60 | initializer=tf.truncated_normal_initializer(0.1), 61 | dtype=tf.float32, 62 | trainable=trainable) 63 | b = tf.get_variable( 64 | name="b", 65 | shape=[num_channels], 66 | initializer=tf.constant_initializer(0.1), 67 | dtype=tf.float32, 68 | trainable=trainable) 69 | conv = tf.nn.conv2d( 70 | output_expanded, W, 71 | strides=[1, 1, 1, 1], 72 | padding="VALID", 73 | name="conv") 74 | 75 | # Apply nonlinearity 76 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 77 | 78 | # Maxpooling over the outputs 79 | pooled = tf.nn.max_pool( 80 | h, 81 | ksize=[1, max_seq_length - int(filter_size) + 1, 1, 1], 82 | strides=[1, 1, 1, 1], 83 | padding="VALID", 84 | name="pool") 85 | pooled_outputs.append(pooled) 86 | 87 | num_channels_total = num_channels * len(filter_sizes) 88 | h_pool = tf.concat(pooled_outputs, 3) 89 | h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total]) 90 | 91 | with tf.name_scope("dropout"): 92 | self.pooled_output = util.dropout(h_pool_flat, dropout_prob) 93 | 94 | def get_pooled_output(self): 95 | """ Returns a tensor with shape [batch_size, hidden_size]. """ 96 | return self.pooled_output 97 | 98 | 99 | def get_decay_power(): 100 | decay_power = { 101 | "/embeddings": 2, 102 | "/conv_": 1, 103 | "cls/": 0, 104 | } 105 | return decay_power 106 | -------------------------------------------------------------------------------- /uf/apps/textcnn/textcnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .textcnn import TextCNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... 
import com 8 | 9 | 10 | class TextCNNClassifier(ClassifierModule): 11 | """ Single-label classifier on TextCNN. """ 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | filter_sizes="2,4,6", 22 | num_channels=6, 23 | hidden_size=256, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.truncate_method = truncate_method 33 | self._filter_sizes = filter_sizes 34 | self._num_channels = num_channels 35 | self._hidden_size = hidden_size 36 | 37 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 38 | self.decay_power = get_decay_power() 39 | 40 | if "[CLS]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[CLS]") 42 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 43 | if "[SEP]" not in self.tokenizer.vocab: 44 | self.tokenizer.add("[SEP]") 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 48 | self._assert_legal(X, y, sample_weight, X_tokenized) 49 | 50 | if is_training: 51 | assert y is not None, "`y` can't be None." 52 | if is_parallel: 53 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 54 | 55 | n_inputs = None 56 | data = {} 57 | 58 | # convert X 59 | if X is not None or X_tokenized is not None: 60 | tokenized = False if X is not None else X_tokenized 61 | input_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 62 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | for idx, segments in enumerate(segment_input_tokens): 92 | _input_tokens = ["[CLS]"] 93 | _input_ids = [] 94 | 95 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 96 | for s_id, segment in enumerate(segments): 97 | _input_tokens.extend(segment + ["[SEP]"]) 98 | 99 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 100 | 101 | # padding 102 | for _ in range(self.max_seq_length - len(_input_ids)): 103 | _input_ids.append(0) 104 | 105 | input_ids.append(_input_ids) 106 | 107 | return input_ids 108 | 109 | def _set_placeholders(self, **kwargs): 110 | self.placeholders = { 111 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 112 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 113 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 114 | } 115 | 116 | def _forward(self, is_training, placeholders, **kwargs): 117 | 118 | encoder = TextCNNEncoder( 119 | vocab_size=len(self.tokenizer.vocab), 120 | filter_sizes=self._filter_sizes, 121 | num_channels=self._num_channels, 122 | is_training=is_training, 123 | input_ids=placeholders["input_ids"], 124 | embedding_size=self._hidden_size, 125 | **kwargs, 126 | ) 127 | encoder_output = encoder.get_pooled_output() 128 | decoder = ClsDecoder( 129 | is_training=is_training, 130 | input_tensor=encoder_output, 131 | label_ids=placeholders["label_ids"], 132 | label_size=self.label_size, 133 | sample_weight=placeholders.get("sample_weight"), 134 | scope="cls/seq_relationship", 135 | **kwargs, 136 | ) 137 | train_loss, tensors = decoder.get_forward_outputs() 138 | return train_loss, tensors 139 | -------------------------------------------------------------------------------- /uf/apps/tinybert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/tinybert/__init__.py -------------------------------------------------------------------------------- /uf/apps/tinybert/tinybert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | from .._base_._base_binary_classifier import BinaryClassifierModule 6 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 7 | from ..bert.bert import BERTConfig 8 | from .tinybert import TinyBERTBinaryClsDistillor 9 | from ...token import WordPieceTokenizer 10 | from ...third import tf 11 | 12 | 13 | class TinyBERTBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 14 | """ Multi-label classifier on TinyBERT, a distillation model. 
""" 15 | 16 | def __init__( 17 | self, 18 | config_file, 19 | vocab_file, 20 | max_seq_length=128, 21 | label_size=None, 22 | init_checkpoint=None, 23 | output_dir=None, 24 | gpu_ids=None, 25 | drop_pooler=False, 26 | hidden_size=384, 27 | num_hidden_layers=4, 28 | do_lower_case=True, 29 | truncate_method="LIFO", 30 | ): 31 | self.__init_args__ = locals() 32 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 33 | 34 | self.max_seq_length = max_seq_length 35 | self.label_size = label_size 36 | self.truncate_method = truncate_method 37 | self._drop_pooler = drop_pooler 38 | 39 | self.bert_config = BERTConfig.from_json_file(config_file) 40 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 41 | self.decay_power = "unsupported" 42 | 43 | self.student_config = copy.deepcopy(self.bert_config) 44 | self.student_config.hidden_size = hidden_size 45 | self.student_config.intermediate_size = 4 * hidden_size 46 | self.student_config.num_hidden_layers = num_hidden_layers 47 | 48 | assert label_size, ("`label_size` can't be None.") 49 | if "[CLS]" not in self.tokenizer.vocab: 50 | self.tokenizer.add("[CLS]") 51 | self.bert_config.vocab_size += 1 52 | self.student_config.vocab_size += 1 53 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 54 | if "[SEP]" not in self.tokenizer.vocab: 55 | self.tokenizer.add("[SEP]") 56 | self.bert_config.vocab_size += 1 57 | self.student_config.vocab_size += 1 58 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 59 | 60 | def to_bert(self, save_dir): 61 | """ Isolate student tiny_bert out of traing graph. """ 62 | if not self._session_built: 63 | raise ValueError("Init, fit, predict or score before saving checkpoint.") 64 | 65 | tf.gfile.MakeDirs(save_dir) 66 | 67 | tf.logging.info("Saving checkpoint into %s/bert_model.ckpt" % (save_dir)) 68 | self.init_checkpoint = save_dir + "/bert_model.ckpt" 69 | 70 | assignment_map = {} 71 | for var in self.global_variables: 72 | if var.name.startswith("tiny/"): 73 | assignment_map[var.name.replace("tiny/", "")[:-2]] = var 74 | saver = tf.train.Saver(assignment_map, max_to_keep=1000000) 75 | saver.save(self.sess, self.init_checkpoint) 76 | 77 | self.student_config.to_json_file(os.path.join(save_dir, "bert_config.json")) 78 | 79 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 80 | self._assert_legal(X, y, sample_weight, X_tokenized) 81 | 82 | if is_training: 83 | assert y is None, "Training of %s is unsupervised. `y` should be None." 
% self.__class__.__name__ 84 | 85 | n_inputs = None 86 | data = {} 87 | 88 | # convert X 89 | if X is not None or X_tokenized is not None: 90 | tokenized = False if X is not None else X_tokenized 91 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 92 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 93 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 94 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 95 | n_inputs = len(input_ids) 96 | 97 | if n_inputs < self.batch_size: 98 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 99 | 100 | # convert y 101 | if y is not None: 102 | label_ids = self._convert_y(y) 103 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 104 | 105 | # convert sample_weight 106 | if is_training or y is not None: 107 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 108 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 109 | 110 | return data 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | model = TinyBERTBinaryClsDistillor( 115 | student_config=self.student_config, 116 | bert_config=self.bert_config, 117 | is_training=is_training, 118 | input_ids=placeholders["input_ids"], 119 | input_mask=placeholders["input_mask"], 120 | segment_ids=placeholders["segment_ids"], 121 | label_ids=placeholders.get("label_ids"), 122 | sample_weight=placeholders.get("sample_weight"), 123 | drop_pooler=self._drop_pooler, 124 | label_size=self.label_size, 125 | **kwargs, 126 | ) 127 | train_loss, tensors = model.get_forward_outputs() 128 | return train_loss, tensors 129 | 130 | def _get_fit_ops(self, from_tfrecords=False): 131 | return [self.tensors["losses"]] 132 | 133 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 134 | 135 | # loss 136 | batch_losses = output_arrays[0] 137 | loss = np.mean(batch_losses) 138 | 139 | info = "" 140 | info += ", distill loss %.6f" % loss 141 | 142 | return info 143 | -------------------------------------------------------------------------------- /uf/apps/tinybert/tinybert_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | from .tinybert import TinyBERTClsDistillor 6 | from .._base_._base_classifier import ClassifierModule 7 | from ..bert.bert_classifier import BERTClassifier 8 | from ..bert.bert import BERTConfig 9 | from ...token import WordPieceTokenizer 10 | from ...third import tf 11 | 12 | 13 | class TinyBERTClassifier(BERTClassifier, ClassifierModule): 14 | """ Single-label classifier on TinyBERT, a distillation model. 
""" 15 | 16 | def __init__( 17 | self, 18 | config_file, 19 | vocab_file, 20 | max_seq_length=128, 21 | label_size=None, 22 | init_checkpoint=None, 23 | output_dir=None, 24 | gpu_ids=None, 25 | drop_pooler=False, 26 | hidden_size=384, 27 | num_hidden_layers=4, 28 | do_lower_case=True, 29 | truncate_method="LIFO", 30 | ): 31 | self.__init_args__ = locals() 32 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 33 | 34 | self.max_seq_length = max_seq_length 35 | self.label_size = label_size 36 | self.truncate_method = truncate_method 37 | self._drop_pooler = drop_pooler 38 | 39 | self.bert_config = BERTConfig.from_json_file(config_file) 40 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 41 | self.decay_power = "unsupported" 42 | 43 | self.student_config = copy.deepcopy(self.bert_config) 44 | self.student_config.hidden_size = hidden_size 45 | self.student_config.intermediate_size = 4 * hidden_size 46 | self.student_config.num_hidden_layers = num_hidden_layers 47 | 48 | assert label_size, ("`label_size` can't be None.") 49 | if "[CLS]" not in self.tokenizer.vocab: 50 | self.tokenizer.add("[CLS]") 51 | self.bert_config.vocab_size += 1 52 | self.student_config.vocab_size += 1 53 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 54 | if "[SEP]" not in self.tokenizer.vocab: 55 | self.tokenizer.add("[SEP]") 56 | self.bert_config.vocab_size += 1 57 | self.student_config.vocab_size += 1 58 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 59 | 60 | def to_bert(self, save_dir): 61 | """ Isolate student tiny_bert out of traing graph. """ 62 | if not self._session_built: 63 | raise ValueError("Init, fit, predict or score before saving checkpoint.") 64 | 65 | tf.gfile.MakeDirs(save_dir) 66 | 67 | tf.logging.info("Saving checkpoint into %s/bert_model.ckpt" % save_dir) 68 | self.init_checkpoint = save_dir + "/bert_model.ckpt" 69 | 70 | assignment_map = {} 71 | for var in self.global_variables: 72 | if var.name.startswith("tiny/"): 73 | assignment_map[var.name.replace("tiny/", "")[:-2]] = var 74 | saver = tf.train.Saver(assignment_map, max_to_keep=1000000) 75 | saver.save(self.sess, self.init_checkpoint) 76 | 77 | self.student_config.to_json_file(os.path.join(save_dir, "bert_config.json")) 78 | 79 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 80 | self._assert_legal(X, y, sample_weight, X_tokenized) 81 | 82 | if is_training: 83 | assert y is None, "Training of %s is unsupervised. `y` should be None." 
% self.__class__.__name__ 84 | 85 | n_inputs = None 86 | data = {} 87 | 88 | # convert X 89 | if X is not None or X_tokenized is not None: 90 | tokenized = False if X is not None else X_tokenized 91 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 92 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 93 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 94 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 95 | n_inputs = len(input_ids) 96 | 97 | if n_inputs < self.batch_size: 98 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 99 | 100 | if y is not None: 101 | # convert y and sample_weight 102 | label_ids = self._convert_y(y) 103 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 104 | 105 | # convert sample_weight 106 | if is_training or y is not None: 107 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 108 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 109 | 110 | return data 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | model = TinyBERTClsDistillor( 115 | student_config=self.student_config, 116 | bert_config=self.bert_config, 117 | is_training=is_training, 118 | input_ids=placeholders["input_ids"], 119 | input_mask=placeholders["input_mask"], 120 | segment_ids=placeholders["segment_ids"], 121 | label_ids=placeholders.get("label_ids"), 122 | sample_weight=placeholders.get("sample_weight"), 123 | drop_pooler=self._drop_pooler, 124 | label_size=self.label_size, 125 | **kwargs, 126 | ) 127 | train_loss, tensors = model.get_forward_outputs() 128 | return train_loss, tensors 129 | 130 | def _get_fit_ops(self, from_tfrecords=False): 131 | return [self.tensors["losses"]] 132 | 133 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 134 | 135 | # loss 136 | batch_losses = output_arrays[0] 137 | loss = np.mean(batch_losses) 138 | 139 | info = "" 140 | info += ", distill loss %.6f" % loss 141 | 142 | return info 143 | -------------------------------------------------------------------------------- /uf/apps/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/transformer/__init__.py -------------------------------------------------------------------------------- /uf/apps/uda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/uda/__init__.py -------------------------------------------------------------------------------- /uf/apps/unilm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/unilm/__init__.py -------------------------------------------------------------------------------- /uf/apps/vae/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/vae/__init__.py -------------------------------------------------------------------------------- /uf/apps/widedeep/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/widedeep/__init__.py -------------------------------------------------------------------------------- /uf/apps/xlnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/xlnet/__init__.py -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 4 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 5 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 6 | from ...token import SentencePieceTokenizer 7 | from ...third import tf 8 | from ... import com 9 | 10 | 11 | class XLNetBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 12 | """ Multi-label classifier on XLNet. """ 13 | 14 | def __init__( 15 | self, 16 | config_file, 17 | spm_file, 18 | max_seq_length=128, 19 | label_size=None, 20 | label_weight=None, 21 | init_checkpoint=None, 22 | output_dir=None, 23 | gpu_ids=None, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.label_weight = label_weight 33 | self.truncate_method = truncate_method 34 | 35 | self.xlnet_config = XLNetConfig(json_path=config_file) 36 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 37 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 38 | 39 | def _convert_X(self, X_target, tokenized): 40 | 41 | # tokenize input texts 42 | segment_input_tokens = [] 43 | for idx, sample in enumerate(X_target): 44 | try: 45 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 46 | except Exception as e: 47 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 48 | 49 | input_ids = [] 50 | input_mask = [] 51 | segment_ids = [] 52 | for idx, segments in enumerate(segment_input_tokens): 53 | _input_ids = [] 54 | _input_mask = [] 55 | _segment_ids = [] 56 | 57 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 58 | 59 | for s_id, segment in enumerate(segments): 60 | _segment_id = min(s_id, 1) 61 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 62 | _input_mask.extend([0] * (len(segment) + 1)) 63 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 64 | 65 | _input_ids.append(CLS_ID) 66 | _input_mask.append(0) 67 | _segment_ids.append(SEG_ID_CLS) 68 | 69 | # padding 70 | if len(_input_ids) < self.max_seq_length: 71 | delta_len = self.max_seq_length - len(_input_ids) 72 | _input_ids = [0] * delta_len + _input_ids 73 | _input_mask = [1] * delta_len + _input_mask 74 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 75 | 76 | input_ids.append(_input_ids) 77 | input_mask.append(_input_mask) 78 | segment_ids.append(_segment_ids) 79 | 80 | return input_ids, input_mask, segment_ids 81 | 82 | def _forward(self, is_training, placeholders, **kwargs): 83 | 84 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 85 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 86 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 87 | 88 | encoder = XLNetEncoder( 89 | xlnet_config=self.xlnet_config, 90 | is_training=is_training, 91 | input_ids=input_ids, 92 | seg_ids=segment_ids, 93 | input_mask=input_mask, 94 | **kwargs, 95 | ) 96 | encoder_output = encoder.get_pooled_output() 97 | decoder = BinaryClsDecoder( 98 | is_training=is_training, 99 | input_tensor=encoder_output, 100 | label_ids=placeholders["label_ids"], 101 | label_size=self.label_size, 102 | sample_weight=placeholders.get("sample_weight"), 103 | label_weight=self.label_weight, 104 | scope="cls/seq_relationship", 105 | **kwargs, 106 | ) 107 | train_loss, tensors = decoder.get_forward_outputs() 108 | return train_loss, tensors 109 | -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_classifier.py: -------------------------------------------------------------------------------- 1 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ...token import SentencePieceTokenizer 5 | from ...third import tf 6 | from ... import com 7 | 8 | 9 | class XLNetClassifier(BERTClassifier, ClassifierModule): 10 | """ Single-label classifier on XLNet. 
""" 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | spm_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | 31 | self.xlnet_config = XLNetConfig(json_path=config_file) 32 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 33 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 34 | 35 | def _convert_X(self, X_target, tokenized): 36 | 37 | # tokenize input texts 38 | segment_input_tokens = [] 39 | for idx, sample in enumerate(X_target): 40 | try: 41 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 42 | except Exception as e: 43 | raise ValueError("Wrong input format (%s): %s." % (sample, e)) 44 | 45 | input_ids = [] 46 | input_mask = [] 47 | segment_ids = [] 48 | for idx, segments in enumerate(segment_input_tokens): 49 | _input_ids = [] 50 | _input_mask = [] 51 | _segment_ids = [] 52 | 53 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 54 | 55 | for s_id, segment in enumerate(segments): 56 | _segment_id = min(s_id, 1) 57 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 58 | _input_mask.extend([0] * (len(segment) + 1)) 59 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 60 | 61 | _input_ids.append(CLS_ID) 62 | _input_mask.append(0) 63 | _segment_ids.append(SEG_ID_CLS) 64 | 65 | # padding 66 | if len(_input_ids) < self.max_seq_length: 67 | delta_len = self.max_seq_length - len(_input_ids) 68 | _input_ids = [0] * delta_len + _input_ids 69 | _input_mask = [1] * delta_len + _input_mask 70 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 71 | 72 | input_ids.append(_input_ids) 73 | input_mask.append(_input_mask) 74 | segment_ids.append(_segment_ids) 75 | 76 | return input_ids, input_mask, segment_ids 77 | 78 | def _forward(self, is_training, placeholders, **kwargs): 79 | 80 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 81 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 82 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 83 | 84 | encoder = XLNetEncoder( 85 | xlnet_config=self.xlnet_config, 86 | is_training=is_training, 87 | input_ids=input_ids, 88 | seg_ids=segment_ids, 89 | input_mask=input_mask, 90 | **kwargs, 91 | ) 92 | encoder_output = encoder.get_pooled_output() 93 | decoder = ClsDecoder( 94 | is_training=is_training, 95 | input_tensor=encoder_output, 96 | label_ids=placeholders["label_ids"], 97 | label_size=self.label_size, 98 | sample_weight=placeholders.get("sample_weight"), 99 | scope="cls/seq_relationship", 100 | **kwargs, 101 | ) 102 | train_loss, tensors = decoder.get_forward_outputs() 103 | return train_loss, tensors 104 | -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 2 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 3 | from ..bert.bert_seq_classifier import BERTSeqClassifier 4 | from ...token import 
SentencePieceTokenizer 5 | from ...third import tf 6 | from ... import com 7 | 8 | 9 | class XLNetSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 10 | """ Sequence labeling classifier on XLNet. """ 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | spm_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | 31 | self.xlnet_config = XLNetConfig(json_path=config_file) 32 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 33 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 34 | 35 | def _convert_X(self, X_target, tokenized): 36 | 37 | # tokenize input texts 38 | segment_input_tokens = [] 39 | for idx, sample in enumerate(X_target): 40 | try: 41 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 42 | except Exception as e: 43 | raise ValueError("Wrong input format (%s): %s." % (sample, e)) 44 | 45 | input_ids = [] 46 | input_mask = [] 47 | segment_ids = [] 48 | for idx, segments in enumerate(segment_input_tokens): 49 | _input_ids = [] 50 | _input_mask = [] 51 | _segment_ids = [] 52 | 53 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 54 | 55 | for s_id, segment in enumerate(segments): 56 | _segment_id = min(s_id, 1) 57 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 58 | _input_mask.extend([1] * (len(segment) + 1)) 59 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 60 | 61 | _input_ids.append(CLS_ID) 62 | _input_mask.append(1) 63 | _segment_ids.append(SEG_ID_CLS) 64 | 65 | # padding 66 | if len(_input_ids) < self.max_seq_length: 67 | delta_len = self.max_seq_length - len(_input_ids) 68 | _input_ids = [0] * delta_len + _input_ids 69 | _input_mask = [0] * delta_len + _input_mask # it's 1 in source code 70 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 71 | 72 | input_ids.append(_input_ids) 73 | input_mask.append(_input_mask) 74 | segment_ids.append(_segment_ids) 75 | 76 | return input_ids, input_mask, segment_ids 77 | 78 | def _forward(self, is_training, placeholders, **kwargs): 79 | 80 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 81 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 82 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 83 | 84 | encoder = XLNetEncoder( 85 | xlnet_config=self.xlnet_config, 86 | is_training=is_training, 87 | input_ids=input_ids, 88 | seg_ids=segment_ids, 89 | input_mask=input_mask, 90 | **kwargs, 91 | ) 92 | encoder_output = encoder.get_sequence_output() 93 | decoder = SeqClsDecoder( 94 | is_training=is_training, 95 | input_tensor=encoder_output, 96 | input_mask=placeholders["input_mask"], 97 | label_ids=placeholders["label_ids"], 98 | label_size=self.label_size, 99 | sample_weight=placeholders.get("sample_weight"), 100 | scope="cls/sequence", 101 | **kwargs, 102 | ) 103 | train_loss, tensors = decoder.get_forward_outputs() 104 | return train_loss, tensors 105 | -------------------------------------------------------------------------------- /uf/com/__init__.py: -------------------------------------------------------------------------------- 1 | from .cache import * 2 | from 
.checkpoint import * 3 | from .graph import * 4 | from .parallel import * 5 | from .resource import * 6 | from .text import * 7 | from .tfrecords import * 8 | from .com import * 9 | -------------------------------------------------------------------------------- /uf/com/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import collections 5 | 6 | from .. import apps 7 | from ..third import tf 8 | 9 | 10 | def restore(key, from_file="./.unif", **kwargs): 11 | """ Load model from configurations saved in local file. 12 | 13 | Args: 14 | key: string. Unique name of configuration to load. 15 | from_file: string. The path of configuration file. 16 | """ 17 | tf.logging.info("Loading model `%s` from %s" % (key, from_file)) 18 | 19 | if not os.path.exists(from_file): 20 | raise ValueError("No file found with `%s`." % from_file) 21 | from_fp = open(from_file, encoding="utf-8") 22 | from_json = json.load(from_fp) 23 | from_fp.close() 24 | if key not in from_json.keys(): 25 | raise ValueError("No key `%s`." % key) 26 | _from_json = from_json[key] 27 | 28 | # restore configuration 29 | model_name = _from_json["model"] 30 | init_args = collections.OrderedDict() 31 | if "__init__" in _from_json: # unif >= beta v2.1.35 32 | zips = _from_json["__init__"].items() 33 | elif "keys" in _from_json: # unif < beta v2.1.35 34 | zips = zip(_from_json["keys"], _from_json["values"]) 35 | else: 36 | raise ValueError("Wrong format.") 37 | 38 | from_dir = os.path.dirname(from_file) 39 | if from_dir == "": 40 | from_dir = "." 41 | for arg, value in zips: 42 | 43 | # convert from relative path 44 | if arg == "init_checkpoint" or arg.endswith("_dir") or arg.endswith("_file"): 45 | if isinstance(value, str) and not value.startswith("/"): 46 | value = get_simplified_path(from_dir + "/" + value) 47 | 48 | if arg in kwargs: 49 | value = kwargs[arg] 50 | init_args[arg] = value 51 | model = apps.__dict__[model_name](**init_args) 52 | 53 | # restore attributes 54 | for arg, value in _from_json.get("__dict__", {}).items(): 55 | model.__dict__[arg] = value 56 | 57 | return model 58 | 59 | 60 | def load(key, cache_file="./.cache", **kwargs): 61 | """ Load model from configurations saved in cache file. 62 | 63 | NOTE: This function is deprecated and not upgraded, 64 | retained only for compatibility with older versions. 65 | Try `uf.restore()` instead. 66 | """ 67 | return restore(key, from_file=cache_file, **kwargs) 68 | 69 | 70 | def get_init_values(model): 71 | values = [] 72 | for arg in model.__class__.__init__.__code__.co_varnames[1:]: 73 | try: 74 | value = model.__getattribute__(arg) 75 | except Exception: 76 | value = model.__init_args__[arg] 77 | values.append(value) 78 | return values 79 | 80 | 81 | def get_relative_path(source, target): 82 | source = source.replace("\\", "/") 83 | target = target.replace("\\", "/") 84 | 85 | if source.startswith("/"): 86 | raise ValueError("Not a relative path: %s." % source) 87 | if target.startswith("/"): 88 | raise ValueError("Not a relative path: %s." 
% target) 89 | 90 | output = get_reverse_path(source) + "/" + target 91 | output = get_simplified_path(output) 92 | return output 93 | 94 | 95 | def get_simplified_path(path): 96 | path = path.replace("\\", "/") 97 | while True: 98 | res = re.findall("[^/]+/[.][.]/", path) 99 | res = [item for item in res if item != "../../" and item != "./../"] 100 | if res: 101 | path = path.replace(res[0], "") 102 | else: 103 | return path.replace("/./", "/") 104 | 105 | 106 | def get_reverse_path(path): 107 | path = path.replace("\\", "/") 108 | 109 | if path.startswith("/"): 110 | raise ValueError("Not a relative path.") 111 | 112 | output = "" 113 | 114 | if os.path.isdir(path): 115 | if path.endswith("/"): 116 | path = path[:-1] 117 | else: 118 | path = os.path.dirname(path) 119 | 120 | if path == "": 121 | return "." 122 | 123 | cwd = os.getcwd() 124 | for seg in path.split("/"): 125 | if seg == ".": 126 | pass 127 | elif seg == "..": 128 | output = "/" + cwd.split("/")[-1] + output 129 | cwd = os.path.dirname(cwd) 130 | else: 131 | output = "/.." + output 132 | cwd += "/" + seg 133 | 134 | output = output[1:] 135 | 136 | if output == "": 137 | return "." 138 | 139 | return output 140 | -------------------------------------------------------------------------------- /uf/com/checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from ..third import tf 5 | 6 | 7 | def get_checkpoint_path(path): 8 | """ If detected no checkpoint file, return None. """ 9 | 10 | # get directory 11 | dir_name = path if os.path.isdir(path) else os.path.dirname(path) 12 | if not dir_name: 13 | dir_name = "." 14 | 15 | # get file 16 | if not os.path.isdir(path): 17 | prefix = path.strip("/").split("/")[-1] 18 | 19 | # find checkpoint 20 | if os.path.isfile(f"{dir_name}/{prefix}.index"): 21 | return f"{dir_name}/{prefix}" 22 | 23 | # stop to avoid error 24 | return None 25 | 26 | # get file from record file 27 | if os.path.exists(f"{dir_name}/checkpoint"): 28 | with open(f"{dir_name}/checkpoint") as f: 29 | line = f.readline() 30 | try: 31 | prefix = re.findall("model_checkpoint_path: \"(.+?)\"", line)[0] 32 | if os.path.exists(f"{dir_name}/{prefix}.index"): 33 | return f"{dir_name}/{prefix}" 34 | except IndexError: 35 | pass 36 | 37 | # find file with largest step 38 | files = [] 39 | for file in os.listdir(dir_name): 40 | if not file.endswith(".index"): 41 | continue 42 | prefix = file.replace(".index", "") 43 | step = 0 44 | try: 45 | step = int(prefix.split("-")[-1]) 46 | except: 47 | pass 48 | files.append((step, file)) 49 | if files: 50 | files.sort(key=lambda x: x[0], reverse=True) 51 | prefix = files[0][1].replace(".index", "") 52 | return f"{dir_name}/{prefix}" 53 | 54 | # find no checkpoint 55 | return None 56 | 57 | 58 | def get_assignment_map(checkpoint_file, variables, continual=False, show_matched=False): 59 | """ Carefully designed so as to fulfil any personalized needs. 
""" 60 | assignment_map = {} 61 | 62 | # read local variables 63 | name_to_variable = {} 64 | for var in variables: 65 | name = var.name 66 | res = re.match("^(.*):\\d+$", name) 67 | if res is not None: 68 | name = res.group(1) 69 | if not continual: 70 | if "global_step" in name \ 71 | or "/adam" in name \ 72 | or "/Adam" in name \ 73 | or "/lamb" in name: 74 | continue 75 | name_to_variable[name] = var 76 | 77 | # read checkpoint variables 78 | init_vars = tf.train.list_variables(checkpoint_file) 79 | inited_vars = {} 80 | for name_shape in init_vars: 81 | (from_name, from_shape) = (name_shape[0], name_shape[1]) 82 | 83 | to_name = from_name 84 | if to_name not in name_to_variable or \ 85 | name_to_variable[to_name].shape.as_list() != from_shape: 86 | if show_matched: 87 | tf.logging.info("checkpoint_file contains <%s>", from_name) 88 | continue 89 | if show_matched: 90 | tf.logging.info("checkpoint_file contains <%s>, matched", from_name) 91 | assignment_map[from_name] = name_to_variable[to_name] 92 | inited_vars[to_name] = 1 93 | 94 | # further feedback 95 | uninited_vars = {} 96 | for var in variables: 97 | if var.name[:-2] not in inited_vars: 98 | if var.name[:-2].endswith("_m") or var.name[:-2].endswith("_v"): 99 | continue 100 | if show_matched: 101 | tf.logging.info("unmatched parameter %s", var) 102 | uninited_vars[var.name[:-2]] = var 103 | return (assignment_map, uninited_vars) 104 | 105 | 106 | def list_variables(checkpoint): 107 | checkpoint_path = get_checkpoint_path(checkpoint) 108 | if not checkpoint_path: 109 | raise ValueError( 110 | "Checkpoint file \"%s\" does not exist. " 111 | "Make sure you pass correct value to " 112 | "`checkpoint`." % checkpoint 113 | ) 114 | return tf.train.list_variables(checkpoint_path) 115 | -------------------------------------------------------------------------------- /uf/com/com.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import numpy as np 4 | 5 | from ..third import tf 6 | 7 | PACK_DIR = os.path.dirname(__file__) 8 | 9 | 10 | class Null: 11 | """ A null class for keeping code compatible when hanging out. """ 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | def __enter__(self, *args, **kwargs): 16 | pass 17 | 18 | def __exit__(self, *args, **kwargs): 19 | pass 20 | 21 | 22 | def unimported_module(name, message): 23 | """ Returns an invalid module where error occurs only when being called. """ 24 | 25 | class UnimportedModule: 26 | def __init__(self, *args, **kwargs): 27 | raise ImportError(message) 28 | return UnimportedModule 29 | 30 | 31 | def warning(func): 32 | """ A function wrapper to avoid application crash. """ 33 | def wrapper(*args, **kwargs): 34 | try: 35 | func(*args, **kwargs) 36 | except Exception as e: 37 | tf.logging.warning(e) 38 | return wrapper 39 | 40 | 41 | def set_verbosity(level=2): 42 | """ Set exposure level of detail information. """ 43 | if level == 2: 44 | tf.logging.set_verbosity(tf.logging.INFO) 45 | elif level == 1: 46 | tf.logging.set_verbosity(tf.logging.WARN) 47 | elif level == 0: 48 | tf.logging.set_verbosity(tf.logging.ERROR) 49 | else: 50 | raise ValueError( 51 | "Invalid value: %s. Pick from `0`, `1` and `2`. " 52 | "The larger the value, the more information will be printed." % level 53 | ) 54 | 55 | 56 | def set_log(log_file): 57 | """ Set logging file. 
""" 58 | log = logging.getLogger("tensorflow") 59 | log.setLevel(logging.INFO) 60 | fh = logging.FileHandler(log_file) 61 | fh.setLevel(logging.INFO) 62 | log.addHandler(fh) 63 | 64 | 65 | def truncate_segments(segments, max_seq_length, truncate_method="LIFO"): 66 | """ Truncate sequence segments to avoid the overall length exceeds the `max_seq_length`. """ 67 | total_seq_length = sum([len(segment) for segment in segments]) 68 | if total_seq_length <= max_seq_length: 69 | return 70 | if truncate_method not in ("longer-FO", "FIFO", "LIFO"): 71 | raise ValueError("Invalid value for `truncate_method`. Pick one from `FIFO`, `LIFO` and `longer-FO`.") 72 | 73 | n = 0 74 | if truncate_method == "FIFO": 75 | index = 0 76 | while n < total_seq_length - max_seq_length: 77 | if not segments[index]: 78 | index += 1 79 | continue 80 | segments[index].pop(0) 81 | n += 1 82 | elif truncate_method == "LIFO": 83 | index = len(segments) - 1 84 | while n < total_seq_length - max_seq_length: 85 | if not segments[index]: 86 | index -= 1 87 | continue 88 | segments[index].pop() 89 | n += 1 90 | else: 91 | while n < total_seq_length - max_seq_length: 92 | max(segments, key=lambda x: len(x)).pop() 93 | n += 1 94 | 95 | def transform(output_arrays, n_inputs=None): 96 | """ Transform raw outputs. """ 97 | 98 | # consolidate different batches 99 | if isinstance(output_arrays[0], np.ndarray): 100 | if len(output_arrays[0].shape) == 1: # 1D 101 | out = np.hstack(output_arrays) 102 | else: # 2D/3D/... 103 | out = np.vstack(output_arrays) 104 | return out[:n_inputs] if n_inputs else out 105 | 106 | # flatten 107 | elif isinstance(output_arrays[0], list): 108 | out = [item for output_array in output_arrays for item in output_array] 109 | return out[:n_inputs] if n_inputs else out 110 | 111 | else: 112 | return output_arrays 113 | -------------------------------------------------------------------------------- /uf/com/graph.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | 4 | from ..third import tf 5 | 6 | 7 | def get_grad_and_param(variables, grads, param_name): 8 | for (grad, param) in zip(grads, variables): 9 | if param_name in param.name: 10 | return (grad, param) 11 | return None, None 12 | 13 | 14 | def get_param(variables, param_name): 15 | for param in variables: 16 | if param_name in param.name: 17 | return param 18 | return None 19 | 20 | 21 | def get_param_name(param): 22 | res = re.match("^(.*):\\d+$", param.name) 23 | if res is not None: 24 | param_name = res.group(1) 25 | return param_name 26 | 27 | 28 | def count_params(global_variables, trainable_variables): 29 | def get_params(variable): 30 | _tuple = tuple(map(int, variable.shape)) 31 | if not _tuple: 32 | return 0 33 | return np.prod(_tuple) 34 | n_global = 0 35 | for variable in global_variables: 36 | n_global += get_params(variable) 37 | n_trainable = 0 38 | for variable in trainable_variables: 39 | n_trainable += get_params(variable) 40 | tf.logging.info( 41 | "Build graph with %s parameters (among which %s are trainable)" 42 | % (format(int(n_global), ","), format(int(n_trainable), ",")) 43 | ) 44 | 45 | 46 | def scale_grad(grad, scalar): 47 | if grad is None: 48 | return None 49 | 50 | if grad.__str__().startswith("IndexedSlices"): 51 | return tf.IndexedSlices(values=grad.values * scalar, indices=grad.indices, dense_shape=grad.dense_shape) 52 | else: 53 | return grad * scalar 54 | 55 | 56 | def add_n_grads(split_grads): 57 | split_grads = [grad for grad in split_grads if 
grad is not None] 58 | if len(split_grads) == 1: 59 | return split_grads[0] 60 | 61 | # Dealing with IndexedSlices for large-dimensional embedding 62 | # matrix. The gradient of an embedding matrix is not a tensor, 63 | # but a tuple-like object named `IndexedSlices`, for this one, 64 | # we need to take special processings. 65 | if split_grads[0].__str__().startswith("IndexedSlices"): 66 | 67 | values = tf.concat([grad.values for grad in split_grads], axis=0) 68 | indices = tf.concat([grad.indices for grad in split_grads], axis=0) 69 | dense_shape = split_grads[0].dense_shape 70 | 71 | return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape) 72 | 73 | return tf.add_n(split_grads) 74 | 75 | 76 | def average_n_grads(split_grads): 77 | split_grads = [grad for grad in split_grads if grad is not None] 78 | if not split_grads: 79 | return None 80 | if len(split_grads) == 1: 81 | return split_grads[0] 82 | 83 | # Dealing with IndexedSlices for large-dimensional embedding 84 | # matrix. The gradient of an embedding matrix is not a tensor, 85 | # but a tuple-like object named `IndexedSlices`, for this one, 86 | # we need to take special processings. 87 | if split_grads[0].__str__().startswith("IndexedSlices"): 88 | 89 | values = tf.divide(tf.concat([grad.values for grad in split_grads], axis=0), len(split_grads)) 90 | indices = tf.concat([grad.indices for grad in split_grads], axis=0) 91 | dense_shape = split_grads[0].dense_shape 92 | 93 | return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape) 94 | 95 | return tf.divide(tf.add_n(split_grads), len(split_grads)) 96 | 97 | 98 | def update_global_params(variables, global_step, optimizer, grads): 99 | assert len(grads) == len(variables) 100 | update_op = optimizer.apply_gradients(zip(grads, variables), global_step=global_step) 101 | return tf.group(update_op) 102 | -------------------------------------------------------------------------------- /uf/com/parallel.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | from ..third import tf 4 | 5 | 6 | class MultiProcessInstance(): 7 | def __init__(self): 8 | self.n = 1 9 | self.pool = None 10 | 11 | mp = MultiProcessInstance() 12 | 13 | 14 | class MultiProcess: 15 | def __init__(self, n_process="auto"): 16 | n_cpu = multiprocessing.cpu_count() 17 | if n_process != "auto": 18 | assert n_process <= n_cpu, ("Invalid value of `n_process`. It can not exceed the num of cpu cores in the device: %d." % n_cpu) 19 | else: 20 | n_process = n_cpu 21 | self.n = n_process 22 | 23 | def __enter__(self): 24 | if self.n > 1: 25 | mp.pool = multiprocessing.Pool(self.n) 26 | mp.n = self.n 27 | 28 | def __exit__(self, *args, **kwargs): 29 | if mp.pool is not None: 30 | mp.pool.close() 31 | mp.pool.join() 32 | mp.pool = None 33 | mp.n = 1 34 | 35 | 36 | def parallel_convert_single_process(args): 37 | bucket_id = args[0] 38 | app_class = args[1] 39 | mapping = args[2] 40 | data = args[3] 41 | is_training = args[4] 42 | 43 | # Verbosity of tensorflow in new process will be set to default, 44 | # for this reason we just have to silence the logging and don"t 45 | # have to care about the recovery. 
46 | tf.logging.set_verbosity(tf.logging.FATAL) 47 | model = app_class(*mapping) 48 | 49 | data = model.convert( 50 | data["X"], data["y"], data["sample_weight"], data["X_tokenized"], 51 | is_training, True, 52 | ) 53 | return (bucket_id, data) 54 | -------------------------------------------------------------------------------- /uf/com/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | 5 | def convert_tokens_to_text(tokens): 6 | words = [""] 7 | for _token in tokens: 8 | if _token.startswith("##"): 9 | words[-1] += _token[2:] 10 | else: 11 | words.append(_token) 12 | text = " ".join(words) 13 | 14 | # remove spaces 15 | if len(text) >= 3: 16 | i = 1 17 | while i < len(text) - 1: 18 | if is_whitespace(text[i]): 19 | _last = text[i - 1] 20 | _next = text[i + 1] 21 | 22 | # remove space between chars and punctuations 23 | if not is_english_char(_last) or not is_english_char(_next): 24 | text = text.replace("%s%s%s" % (_last, text[i], _next), "%s%s" % (_last, _next)) 25 | i += 1 26 | 27 | return text.strip() 28 | 29 | 30 | def align_tokens_with_text(tokens, text, lower_case): 31 | if lower_case: 32 | text = text.lower() 33 | 34 | i = 0 35 | j = 0 36 | max_j = len(text) 37 | mapping_start = [] 38 | mapping_end = [] 39 | while i < len(tokens): 40 | token = tokens[i] 41 | token = token.replace("##", "") 42 | if text[j:].startswith(token): 43 | mapping_start.append(j) 44 | mapping_end.append(j + len(token)) 45 | i += 1 46 | j += len(token) 47 | elif token not in text[j:]: # [CLS], [SEP], None, some Japanese signs 48 | mapping_start.append(j) 49 | if token in ("[CLS]", "[SEP]"): 50 | mapping_end.append(j) 51 | else: 52 | mapping_end.append(j + len(token)) 53 | i += 1 54 | else: 55 | j += 1 56 | if j >= max_j: 57 | break 58 | 59 | for _ in range(len(tokens) - len(mapping_start)): 60 | mapping_start.append(max_j + 1000) 61 | mapping_end.append(max_j + 1000) 62 | 63 | return mapping_start, mapping_end 64 | 65 | 66 | def find_boyer_moore(T, P, start=0): 67 | """ BM algorithm for string match. """ 68 | 69 | n, m = len(T), len(P) 70 | last = {} 71 | for k in range(m): 72 | last[P[k]] = k 73 | 74 | # align end of pattern at index m-1 of text 75 | i = start + m - 1 76 | k = m - 1 77 | while i < n: 78 | if T[i] == P[k]: 79 | if k == 0: 80 | return i 81 | i -= 1 82 | k -= 1 83 | else: 84 | j = last.get(T[i], -1) 85 | i += m - min(k, j + 1) 86 | k = m - 1 87 | return -1 88 | 89 | 90 | def find_all_boyer_moore(T, P): 91 | start_ids = [] 92 | start = 0 93 | while True: 94 | start_position = find_boyer_moore( 95 | T, P, start=start) 96 | if start_position == -1: 97 | break 98 | start_ids.append(start_position) 99 | start = start_position + len(P) 100 | return start_ids 101 | 102 | 103 | def is_english_char(char): 104 | if re.findall("[a-zA-Z]", char): 105 | return True 106 | return False 107 | 108 | 109 | def is_numeric_char(char): 110 | if re.findall(r"[\d]", char): 111 | return True 112 | return False 113 | 114 | 115 | def is_whitespace(char): 116 | """Checks whether `chars` is a whitespace character.""" 117 | 118 | # \t, \n, and \r are technically contorl characters but we treat them 119 | # as whitespace since they are generally considered as such. 
120 | if char in (" ", "\t", "\n", "\r"): 121 | return True 122 | cat = unicodedata.category(char) 123 | if cat == "Zs": 124 | return True 125 | return False 126 | 127 | 128 | def is_control(char): 129 | """Checks whether `chars` is a control character.""" 130 | 131 | # These are technically control characters but we count them as whitespace 132 | # characters. 133 | if char in ("\t", "\n", "\r"): 134 | return False 135 | cat = unicodedata.category(char) 136 | if cat in ("Cc", "Cf"): 137 | return True 138 | return False 139 | 140 | 141 | def is_punctuation(char): 142 | """Checks whether `chars` is a punctuation character.""" 143 | ord_id = ord(char) 144 | 145 | # We treat all non-letter/number ASCII as punctuation. 146 | # Characters such as "^", "$", and "`" are not in the Unicode 147 | # Punctuation class but we treat them as punctuation anyways, for 148 | # consistency. 149 | if (ord_id >= 33 and ord_id <= 47) or \ 150 | (ord_id >= 58 and ord_id <= 64) or \ 151 | (ord_id >= 91 and ord_id <= 96) or \ 152 | (ord_id >= 123 and ord_id <= 126): 153 | return True 154 | cat = unicodedata.category(char) 155 | if cat.startswith("P"): 156 | return True 157 | return False 158 | 159 | 160 | def is_chinese_char(ord_id): 161 | """Checks whether ord_id is the codepoint of a CJK character.""" 162 | # This defines a `Chinese character` as anything in the CJK 163 | # Unicode block: 164 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 165 | # 166 | # Note that the CJK Unicode block is NOT all Japanese and 167 | # Korean characters, despite its name. The modern Korean Hangul 168 | # alphabet is a different block, as is Japanese Hiragana and 169 | # Katakana. Those alphabets are used to write space-separated 170 | # words, so they are not treated specially and handled like the 171 | # all of the other languages. 172 | if (ord_id >= 0x4E00 and ord_id <= 0x9FFF) or \ 173 | (ord_id >= 0x3400 and ord_id <= 0x4DBF) or \ 174 | (ord_id >= 0x20000 and ord_id <= 0x2A6DF) or \ 175 | (ord_id >= 0x2A700 and ord_id <= 0x2B73F) or \ 176 | (ord_id >= 0x2B740 and ord_id <= 0x2B81F) or \ 177 | (ord_id >= 0x2B820 and ord_id <= 0x2CEAF) or \ 178 | (ord_id >= 0xF900 and ord_id <= 0xFAFF) or \ 179 | (ord_id >= 0x2F800 and ord_id <= 0x2FA1F): 180 | return True 181 | return False 182 | -------------------------------------------------------------------------------- /uf/com/tfrecords.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | from ..third import tf 4 | 5 | BACKUP_DATA = "ex:" # data with the prefix `ex:` will not be fed into Tensorflow graph 6 | 7 | 8 | def write_tfrecords(data, tfrecords_file): 9 | """ Write data into tfrecords file. 
""" 10 | 11 | writer = tf.python_io.TFRecordWriter(tfrecords_file) 12 | keys = [] 13 | values = [] 14 | for key, value in data.items(): 15 | if key.startswith(BACKUP_DATA): 16 | continue 17 | keys.append(key) 18 | values.append(value) 19 | examples = zip(*values) 20 | 21 | for example in examples: 22 | features = collections.OrderedDict() 23 | for i, value in enumerate(example): 24 | if isinstance(value, int): 25 | features[keys[i]] = create_int_feature([value]) 26 | elif isinstance(value, float): 27 | features[keys[i]] = create_float_feature([value]) 28 | elif value.dtype.name.startswith("int"): 29 | features[keys[i]] = create_int_feature(value.tolist()) 30 | elif value.dtype.name.startswith("float"): 31 | features[keys[i]] = create_float_feature(value.tolist()) 32 | else: 33 | raise ValueError("Invalid data type: %s." % type(value)) 34 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 35 | writer.write(tf_example.SerializeToString()) 36 | 37 | 38 | def get_tfrecords_keys(tfrecords_file): 39 | """ Read keys from tfrecords file. """ 40 | iterator = tf.python_io.tf_record_iterator(tfrecords_file) 41 | record = next(iterator) 42 | example = tf.train.Example() 43 | example.ParseFromString(record) 44 | return list(example.features.feature.keys()) 45 | 46 | 47 | def get_tfrecords_length(tfrecords_files): 48 | """ Count number of data in tfrecords files. """ 49 | n = 0 50 | for tfrecords_file in tfrecords_files: 51 | for _ in tf.python_io.tf_record_iterator(tfrecords_file): 52 | n += 1 53 | return n 54 | 55 | 56 | def convert_placeholder_to_feature(placeholder): 57 | """ Convert `PlaceHolder` for feeding data in memory into `FixedLenFeature` for local TFRecords. """ 58 | if placeholder.dtype.name.startswith("int"): 59 | dtype = tf.int64 60 | elif placeholder.dtype.name.startswith("float"): 61 | dtype = tf.float32 62 | else: 63 | raise ValueError(f"Unsupported dtype: {placeholder.dtype}.") 64 | return tf.FixedLenFeature(list(placeholder.shape)[1:], dtype) 65 | 66 | 67 | def create_int_feature(values): 68 | """ Convert list of values into tf-serializable Int64. """ 69 | if not isinstance(values, list): 70 | values = [values] 71 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=values)) 72 | return feature 73 | 74 | 75 | def create_float_feature(values): 76 | """ Convert list of values into tf-serializable Float. """ 77 | if not isinstance(values, list): 78 | values = [values] 79 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=values)) 80 | return feature 81 | -------------------------------------------------------------------------------- /uf/task/__init__.py: -------------------------------------------------------------------------------- 1 | from .init import Initialization 2 | from .train import Training 3 | from .train_adversarial import AdversarialTraining 4 | from .infer import Inference 5 | from .score import Scoring 6 | from .export import Exportation 7 | 8 | 9 | __all__ = [ 10 | "Training", 11 | "AdversarialTraining", 12 | "Initialization", 13 | "Inference", 14 | "Scoring", 15 | "Exportation", 16 | ] 17 | -------------------------------------------------------------------------------- /uf/task/_base_.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | from ..third import tf 6 | from .. import com 7 | 8 | 9 | class Task: 10 | """ Parent class of all tasks. 
11 | 12 | This is an internal class that does not provide interface for outside requests.""" 13 | 14 | def __init__(self, module): 15 | self.module = module 16 | 17 | @abstractmethod 18 | def run(self, *args, **kwargs): 19 | raise NotImplementedError() 20 | 21 | def _build_graph(self): 22 | """ Build computation graph. """ 23 | self.module._graph_mode = "infer" 24 | self.module._set_placeholders() 25 | _, self.module.tensors = self.module._parallel_forward(is_training=False) 26 | 27 | def _init_session(self, ignore_checkpoint=False): 28 | """ Initialize Tensorflow session. """ 29 | com.count_params(self.module.global_variables, self.module.trainable_variables) 30 | 31 | if self.module._gpu_ids: 32 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(self.module._gpu_ids) 33 | else: 34 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPUs 35 | config = tf.ConfigProto( 36 | allow_soft_placement=True, 37 | gpu_options=tf.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=1.0), 38 | ) 39 | self.module.sess = tf.Session(graph=self.module.graph, config=config) 40 | self._init_variables(self.module.global_variables, ignore_checkpoint=ignore_checkpoint) 41 | self.module._session_built = True 42 | 43 | def _init_variables(self, variables, ignore_checkpoint=False): 44 | """ Initialize variables in the session. """ 45 | 46 | # randomly initialize variables 47 | tf.logging.info("Running local_init_op") 48 | local_init_op = tf.variables_initializer(variables) 49 | self.module.sess.run(local_init_op) 50 | self.module._inited_vars |= set(variables) 51 | tf.logging.info("Done running local_init_op") 52 | 53 | # read from checkpoint file 54 | if not ignore_checkpoint and self.module.init_checkpoint: 55 | checkpoint_path = com.get_checkpoint_path(self.module.init_checkpoint) 56 | if not checkpoint_path: 57 | raise ValueError( 58 | "Checkpoint file \"%s\" does not exist. Make sure you pass correct value to " 59 | "`init_checkpoint`." 60 | % self.module.init_checkpoint 61 | ) 62 | self.module.init_checkpoint = checkpoint_path # rectified path replacement 63 | 64 | # `continual` means we tend to succeed the training step and momentums variables " 65 | # "stored in the checkpoint file 66 | continual = os.path.dirname(checkpoint_path) == self.module.output_dir 67 | if continual: 68 | self.module.step = int(checkpoint_path.split("-")[-1]) 69 | 70 | # build a bridge between the variables in checkpoint file and the variables in the graph 71 | (assignment_map, uninited_vars) = com.get_assignment_map(checkpoint_path, variables, continual=continual) 72 | self.module.assignment_map = assignment_map 73 | self.module.uninited_vars = uninited_vars 74 | 75 | if uninited_vars: 76 | tf.logging.info( 77 | "%d (out of %d) local variables failed to match up with the checkpoint file. " 78 | "Check more details through `.uninited_vars`." 79 | % (len(uninited_vars), len(assignment_map) + len(uninited_vars)) 80 | ) 81 | 82 | if not self.module.assignment_map: # no variables to restore 83 | return 84 | loader = tf.train.Saver(self.module.assignment_map) 85 | loader.restore(self.module.sess, checkpoint_path) 86 | 87 | if "_global_step" in self.module.__dict__: 88 | self.module.sess.run(tf.assign(self.module._global_step, self.module.step)) 89 | 90 | def _build_feed_dict(self): 91 | """ Build `feed dict` for the current batch of data. 
""" 92 | 93 | feed_dict = {} 94 | for key, data in self.module.data.items(): 95 | if key.startswith(com.BACKUP_DATA): # not to feed 96 | continue 97 | 98 | # move pointer and form the batch 99 | ptr = self._ptr 100 | batch = data[ptr: ptr + self.module.batch_size] 101 | ptr += self.module.batch_size 102 | 103 | # fill up the batch 104 | while len(batch) < self.module.batch_size: 105 | ptr = self.module.batch_size - len(batch) 106 | remainder = data[:ptr] 107 | concat_func = np.vstack if len(batch.shape) > 1 else np.hstack 108 | batch = concat_func((batch, remainder)) 109 | 110 | placeholder = self.module.placeholders[key] 111 | feed_dict[placeholder] = batch 112 | 113 | self._ptr = ptr 114 | return feed_dict 115 | -------------------------------------------------------------------------------- /uf/task/export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | from ..third import tf 6 | from ._base_ import Task 7 | 8 | 9 | class Exportation(Task): 10 | """ Export model into PB file. """ 11 | 12 | def run(self, export_dir, rename_inputs=None, rename_outputs=None, ignore_inputs=None, ignore_outputs=None): 13 | 14 | # build graph 15 | self._build_graph() 16 | 17 | # init session 18 | if not self.module._session_built: 19 | self._init_session() 20 | 21 | def set_input(key, value): 22 | inputs[key] = tf.saved_model.utils.build_tensor_info(value) 23 | tf.logging.info("Register Input: %s, %s, %s" % (key, value.shape.as_list(), value.dtype.name)) 24 | 25 | def set_output(key, value): 26 | outputs[key] = tf.saved_model.utils.build_tensor_info(value) 27 | tf.logging.info("Register Output: %s, %s, %s" % (key, value.shape.as_list(), value.dtype.name)) 28 | 29 | # define inputs 30 | inputs = {} 31 | if not ignore_inputs: 32 | ignore_inputs = [] 33 | for key, value in list(self.module.placeholders.items()): 34 | if key in ignore_inputs: 35 | continue 36 | if rename_inputs and key in rename_inputs: 37 | key = rename_inputs[key] 38 | set_input(key, value) 39 | 40 | # define outputs 41 | outputs = {} 42 | if not ignore_outputs: 43 | ignore_outputs = [] 44 | for key, value in self.module.tensors.items(): 45 | if key in ignore_outputs: 46 | continue 47 | if rename_outputs and key in rename_outputs: 48 | key = rename_outputs[key] 49 | set_output(key, value) 50 | 51 | # build signature 52 | signature = tf.saved_model.signature_def_utils.build_signature_def( 53 | inputs, outputs, tf.saved_model.signature_constants.PREDICT_METHOD_NAME, 54 | ) 55 | signature_def_map = {"predict": signature} 56 | tf.logging.info("Register Signature: predict") 57 | 58 | legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op") 59 | builder_path = os.path.join(export_dir, time.strftime("%Y%m%d%H%M%S")) 60 | 61 | # solve the path problem 62 | if sys.platform.startswith("win"): 63 | builder_path = builder_path.replace("/", "\\") 64 | 65 | # exportation 66 | try: 67 | builder = tf.saved_model.builder.SavedModelBuilder(builder_path) 68 | builder.add_meta_graph_and_variables( 69 | self.module.sess, 70 | [tf.saved_model.tag_constants.SERVING], 71 | signature_def_map=signature_def_map, 72 | legacy_init_op=legacy_init_op, 73 | ) 74 | except ValueError: 75 | raise ValueError( 76 | "Twice exportation is not allowed. Try `.save()` and " 77 | "`.reset()` method to save and reset the graph before " 78 | "next exportation." 
79 | ) 80 | builder.save() 81 | -------------------------------------------------------------------------------- /uf/task/infer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from ..third import tf 4 | from ._base_ import Task 5 | 6 | 7 | class Inference(Task): 8 | """ Inference, as its name. """ 9 | 10 | def run(self): 11 | 12 | # confirm inputs 13 | n_inputs = len(list(self.module.data.values())[0]) 14 | if not n_inputs: 15 | raise ValueError("0 input samples recognized.") 16 | 17 | # build graph 18 | if self.module._graph_mode != "infer" and not self.module._debug: 19 | self._build_graph() 20 | 21 | # init session 22 | if not self.module._session_built: 23 | self._init_session() 24 | 25 | tf.logging.info("Running inference on %d samples", n_inputs) 26 | 27 | # inference 28 | self._ptr = 0 29 | last_tic = time.time() 30 | last_step = 0 31 | batch_outputs = [] 32 | total_steps = (n_inputs - 1) // self.module.batch_size + 1 33 | for step in range(total_steps): 34 | last_tic, last_step = self._predict_one_batch( 35 | step + 1, last_tic, last_step, total_steps, batch_outputs, 36 | ) 37 | 38 | output_arrays = list(zip(*batch_outputs)) 39 | return self.module._get_predict_outputs(output_arrays, n_inputs) 40 | 41 | def _predict_one_batch(self, step, last_tic, last_step, total_steps, batch_outputs): 42 | feed_dict = self._build_feed_dict() 43 | predict_ops = self.module._get_predict_ops() 44 | output_arrays = self.module.sess.run(predict_ops, feed_dict=feed_dict) 45 | batch_outputs.append(output_arrays) 46 | 47 | # print 48 | diff_tic = time.time() - last_tic 49 | process = step / total_steps 50 | if (diff_tic > 10 and process >= 0.005) or step == total_steps: 51 | info = "process %.1f%%" % (process * 100) 52 | 53 | # print inference efficiency 54 | info += ", %.2f examples/sec" % ((step - last_step) / diff_tic * self.module.batch_size) 55 | 56 | tf.logging.info(info) 57 | last_tic = time.time() 58 | last_step = step 59 | 60 | return last_tic, last_step 61 | -------------------------------------------------------------------------------- /uf/task/init.py: -------------------------------------------------------------------------------- 1 | from ..third import tf 2 | from ._base_ import Task 3 | 4 | 5 | class Initialization(Task): 6 | """ Initialze the model, make it ready for inference. """ 7 | 8 | def run(self, reinit_all, ignore_checkpoint): 9 | 10 | # build graph 11 | if self.module._graph_mode is None: 12 | self._build_graph() 13 | 14 | # init session 15 | if reinit_all or not self.module._session_built: 16 | self._init_session(ignore_checkpoint=ignore_checkpoint) 17 | 18 | # init uninitialized variables 19 | else: 20 | variables = [] 21 | for var in self.module.global_variables: 22 | if var not in self.module._inited_vars: 23 | variables.append(var) 24 | if variables: 25 | self._init_variables(variables, ignore_checkpoint=ignore_checkpoint) 26 | else: 27 | tf.logging.info( 28 | "Global variables already initialized. To re-initialize all, " 29 | "pass `reinit_all` to True." 30 | ) 31 | -------------------------------------------------------------------------------- /uf/task/score.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from ..third import tf 4 | from ._base_ import Task 5 | 6 | 7 | class Scoring(Task): 8 | """ Infer the data and score the performance. 
""" 9 | 10 | def run(self): 11 | 12 | # confirm inputs 13 | n_inputs = len(list(self.module.data.values())[0]) 14 | if not n_inputs: 15 | raise ValueError("0 input samples recognized.") 16 | 17 | # build graph 18 | if self.module._graph_mode != "infer" and not self.module._debug: 19 | self._build_graph() 20 | 21 | # init session 22 | if not self.module._session_built: 23 | self._init_session() 24 | 25 | tf.logging.info("Running scoring on %d samples", n_inputs) 26 | 27 | # scoring 28 | self._ptr = 0 29 | last_tic = time.time() 30 | last_step = 0 31 | batch_outputs = [] 32 | total_steps = (n_inputs - 1) // self.module.batch_size + 1 33 | for step in range(total_steps): 34 | last_tic, last_step = self._score_one_batch( 35 | step + 1, last_tic, last_step, total_steps, batch_outputs, 36 | ) 37 | 38 | output_arrays = list(zip(*batch_outputs)) 39 | return self.module._get_score_outputs(output_arrays, n_inputs) 40 | 41 | def _score_one_batch(self, step, last_tic, last_step, total_steps, batch_outputs): 42 | feed_dict = self._build_feed_dict() 43 | score_ops = self.module._get_score_ops() 44 | output_arrays = self.module.sess.run(score_ops, feed_dict=feed_dict) 45 | batch_outputs.append(output_arrays) 46 | 47 | # print 48 | diff_tic = time.time() - last_tic 49 | process = step / total_steps 50 | if (diff_tic > 10 and process >= 0.005) or step == total_steps: 51 | info = "process %.1f%%" % (process * 100) 52 | 53 | # print scoring efficiency 54 | info += ", %.2f examples/sec" % ((step - last_step) / diff_tic * self.module.batch_size) 55 | 56 | tf.logging.info(info) 57 | last_tic = time.time() 58 | last_step = step 59 | 60 | return last_tic, last_step 61 | -------------------------------------------------------------------------------- /uf/third.py: -------------------------------------------------------------------------------- 1 | """ Version control of dependencies. """ 2 | 3 | import tensorflow as tf 4 | 5 | 6 | if tf.__version__.startswith("2"): 7 | import tensorflow.compat.v1 as tf 8 | tf.disable_eager_execution() 9 | -------------------------------------------------------------------------------- /uf/token/__init__.py: -------------------------------------------------------------------------------- 1 | from .wordpiece import WordPieceTokenizer 2 | try: 3 | from .sentencepiece import SentencePieceTokenizer 4 | except: 5 | pass 6 | 7 | __all__ = [ 8 | "WordPieceTokenizer", 9 | "SentencePieceTokenizer", 10 | ] 11 | -------------------------------------------------------------------------------- /uf/token/sentencepiece.py: -------------------------------------------------------------------------------- 1 | """ SentencePiece tokenizer class. 2 | Code revised from XLNet team's implementation of XLNet. 3 | See `https://github.com/zihangdai/xlnet`. 4 | """ 5 | 6 | import os 7 | import unicodedata 8 | from sentencepiece import SentencePieceProcessor 9 | 10 | 11 | class SentencePieceTokenizer: 12 | def __init__(self, spm_file, do_lower_case=True): 13 | if not os.path.exists(spm_file): 14 | raise ValueError( 15 | "Can't find spm_file \"%s\". " 16 | "Please pass the correct path of sentence-piece model file, " 17 | "e.g.`spiece.model`." 
% spm_file 18 | ) 19 | self.processor = SentencePieceProcessor() 20 | self.processor.Load(spm_file) 21 | self.do_lower_case = do_lower_case 22 | 23 | def tokenize(self, text): 24 | text = preprocess_text(text, lower=self.do_lower_case) 25 | pieces = encode_pieces(self.processor, text, sample=False) 26 | return pieces 27 | 28 | def convert_tokens_to_ids(self, tokens): 29 | return [self.processor.PieceToId(piece) for piece in tokens] 30 | 31 | def convert_ids_to_tokens(self, ids): 32 | pieces = [self.processor.IdToPiece(_id) for _id in ids] 33 | return pieces 34 | 35 | 36 | def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False): 37 | if remove_space: 38 | outputs = " ".join(inputs.strip().split()) 39 | else: 40 | outputs = inputs 41 | outputs = outputs.replace("``", '"').replace("''", '"') 42 | 43 | if not keep_accents: 44 | outputs = unicodedata.normalize("NFKD", outputs) 45 | outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) 46 | if lower: 47 | outputs = outputs.lower() 48 | 49 | return outputs 50 | 51 | 52 | def encode_pieces(sp_model, text, sample=False): 53 | 54 | if not sample: 55 | pieces = sp_model.EncodeAsPieces(text) 56 | else: 57 | pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) 58 | new_pieces = [] 59 | for piece in pieces: 60 | if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): 61 | cur_pieces = sp_model.EncodeAsPieces(piece[:-1].replace("▁", "")) 62 | if piece[0] != "▁" and cur_pieces[0][0] == "▁": 63 | if len(cur_pieces[0]) == 1: 64 | cur_pieces = cur_pieces[1:] 65 | else: 66 | cur_pieces[0] = cur_pieces[0][1:] 67 | cur_pieces.append(piece[-1]) 68 | new_pieces.extend(cur_pieces) 69 | else: 70 | new_pieces.append(piece) 71 | return new_pieces 72 | --------------------------------------------------------------------------------
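The tokenizer above is only importable when the `sentencepiece` package is installed (the guarded import in `uf/token/__init__.py` silently skips it otherwise). A minimal usage sketch, assuming the bundled `ref/spiece.model` file is used as the model:

from uf.token import SentencePieceTokenizer

# Load a SentencePiece model file; the repository ships one under ref/.
tokenizer = SentencePieceTokenizer("ref/spiece.model", do_lower_case=True)

# tokenize() first normalizes (and lowercases) the text via preprocess_text(),
# then segments it into sub-word pieces with encode_pieces().
pieces = tokenizer.tokenize("UNIF makes fine-tuning easier.")
ids = tokenizer.convert_tokens_to_ids(pieces)
tokens = tokenizer.convert_ids_to_tokens(ids)
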
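For reference, a usage sketch of the configuration-restore helper defined in `uf/com/cache.py` (the source itself points users at `uf.restore()`). The key `"demo_bert"`, the `./.unif` contents and the checkpoint path are hypothetical; the file is whatever a previously cached model wrote out.

import uf

# Rebuild a model from the configuration stored under a (hypothetical) key.
model = uf.restore("demo_bert", from_file="./.unif")

# Keyword arguments override the stored __init__ arguments, e.g. to point the
# restored model at a different checkpoint (hypothetical path).
model = uf.restore("demo_bert", from_file="./.unif", init_checkpoint="./another_ckpt")
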
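A sketch of how `get_checkpoint_path` in `uf/com/checkpoint.py` resolves a checkpoint prefix, assuming the wildcard re-exports in `uf/com/__init__.py`; the directory layout is made up.

from uf.com import get_checkpoint_path

# Suppose ./output_dir holds model.ckpt-8000.index / .data files and, optionally,
# a "checkpoint" record file written by tf.train.Saver.
get_checkpoint_path("./output_dir")                  # -> "./output_dir/model.ckpt-8000"
get_checkpoint_path("./output_dir/model.ckpt-8000")  # -> same prefix, found via the ".index" probe
get_checkpoint_path("./somewhere_empty")             # -> None when no checkpoint is detected
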
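And a sketch of the in-place truncation helper `truncate_segments` from `uf/com/com.py`, using made-up token-id lists (total length 8, truncated to a maximum of 6):

from uf.com import truncate_segments

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6)                               # default "LIFO": trim from the end of the last segment
# segments is now [[1, 2, 3, 4, 5], [6]]

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6, truncate_method="FIFO")       # trim from the front of the first segment
# segments is now [[3, 4, 5], [6, 7, 8]]

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6, truncate_method="longer-FO")  # always trim the currently longest segment
# segments is now [[1, 2, 3], [6, 7, 8]]
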