├── .gitignore ├── LICENSE ├── README.md ├── docs └── pics │ ├── framework.png │ └── logo.png ├── examples ├── run_bert_classifier.py ├── run_pnasnet_classifier.py └── tutorial │ ├── ALBERTBinaryClassifier.ipynb │ ├── ALBERTClassifier.ipynb │ ├── ALBERTLM.ipynb │ ├── ALBERTMRC.ipynb │ ├── ALBERTSeqClassifier.ipynb │ ├── AdaBERTClassifier.ipynb │ ├── BERTBinaryClassifier.ipynb │ ├── BERTCRFCascadeNER.ipynb │ ├── BERTCRFNER.ipynb │ ├── BERTClassifier.ipynb │ ├── BERTLM.ipynb │ ├── BERTMRC.ipynb │ ├── BERTNER.ipynb │ ├── BERTRegressor.ipynb │ ├── BERTSeqClassifier.ipynb │ ├── BERTSeqCrossClassifier.ipynb │ ├── BERTTmpBinaryClassifier.ipynb │ ├── BERTVerifierMRC.ipynb │ ├── BiRNNClassifier.ipynb │ ├── ELECTRABinaryClassifier.ipynb │ ├── ELECTRAClassifier.ipynb │ ├── ELECTRALM.ipynb │ ├── ELECTRAMRC.ipynb │ ├── ELECTRASeqClassifier.ipynb │ ├── FastBERTClassifier.ipynb │ ├── GPT2LM.ipynb │ ├── MotianClassifier.ipynb │ ├── PNasNetClassifier.ipynb │ ├── PerformerClassifier.ipynb │ ├── RNNClassifier.ipynb │ ├── RecBERT2LM.ipynb │ ├── RecBERT3LM.ipynb │ ├── RecBERTLM.ipynb │ ├── RetroReaderMRC.ipynb │ ├── RoBERTaBinaryClassifier.ipynb │ ├── RoBERTaClassifier.ipynb │ ├── RoBERTaLM.ipynb │ ├── RoBERTaMRC.ipynb │ ├── RoBERTaSeqClassifier.ipynb │ ├── SANetMRC.ipynb │ ├── SQPLM.ipynb │ ├── SemBERTClassifier.ipynb │ ├── TextCNNClassifier.ipynb │ ├── TinyBERTBinaryClassifier.ipynb │ ├── TinyBERTClassifier.ipynb │ ├── TransformerMT.ipynb │ ├── UDAClassifier.ipynb │ ├── UniLM.ipynb │ ├── UniLMPrompt.ipynb │ ├── VAELM.ipynb │ ├── WideDeepClassifier.ipynb │ ├── WideDeepRegressor.ipynb │ ├── XLNetBinaryClassifier.ipynb │ └── XLNetClassifier.ipynb ├── ref ├── albert_config.json ├── bert_config.json ├── spiece.model ├── vocab.txt └── xlnet_config.json ├── setup.py └── uf ├── __init__.py ├── apps ├── __init__.py ├── _base_ │ ├── __init__.py │ ├── _base_.py │ ├── _base_binary_classifier.py │ ├── _base_classifier.py │ ├── _base_lm.py │ ├── _base_mrc.py │ ├── _base_mt.py │ ├── _base_ner.py │ ├── _base_regressor.py │ └── _base_seq_classifier.py ├── adabert │ ├── __init__.py │ ├── adabert.py │ └── adabert_classifier.py ├── albert │ ├── __init__.py │ ├── albert.py │ ├── albert_binary_classifier.py │ ├── albert_classifier.py │ ├── albert_lm.py │ ├── albert_mrc.py │ └── albert_seq_classifier.py ├── bert │ ├── __init__.py │ ├── bert.py │ ├── bert_binary_classifier.py │ ├── bert_classifier.py │ ├── bert_crf_cascade_ner.py │ ├── bert_crf_ner.py │ ├── bert_lm.py │ ├── bert_mrc.py │ ├── bert_ner.py │ ├── bert_regressor.py │ ├── bert_seq_classifier.py │ ├── bert_seq_cross_classifier.py │ ├── bert_tmp_binary_classifier.py │ └── bert_verifier_mrc.py ├── chatbot │ ├── __init__.py │ ├── chatbot.py │ └── chatbot_mt.py ├── crf │ ├── __init__.py │ └── crf.py ├── dilated │ ├── __init__.py │ ├── dilated.py │ └── dilated_lm.py ├── electra │ ├── __init__.py │ ├── electra.py │ ├── electra_binary_classifier.py │ ├── electra_classifier.py │ ├── electra_lm.py │ ├── electra_mrc.py │ └── electra_seq_classifier.py ├── fastbert │ ├── __init__.py │ ├── fastbert.py │ └── fastbert_classifier.py ├── gpt2 │ ├── __init__.py │ ├── gpt2.py │ └── gpt2_lm.py ├── motian │ ├── __init__.py │ ├── motian.py │ └── motian_classifier.py ├── nasnet │ ├── __init__.py │ ├── nasnet.py │ ├── nasnet_utils.py │ ├── pnasnet.py │ └── pnasnet_classifier.py ├── performer │ ├── __init__.py │ ├── performer.py │ └── performer_classifier.py ├── recbert │ ├── __init__.py │ ├── recbert.py │ ├── recbert2.py │ ├── recbert2_lm.py │ ├── recbert3.py │ ├── recbert3_lm.py │ └── 
recbert_lm.py ├── retroreader │ ├── __init__.py │ ├── retroreader.py │ └── retroreader_mrc.py ├── rnn │ ├── __init__.py │ ├── bi_rnn.py │ ├── bi_rnn_classifier.py │ ├── rnn.py │ └── rnn_classifier.py ├── roberta │ ├── __init__.py │ ├── roberta.py │ ├── roberta_binary_classifier.py │ ├── roberta_classifier.py │ ├── roberta_lm.py │ ├── roberta_mrc.py │ └── roberta_seq_classifier.py ├── sanet │ ├── __init__.py │ ├── sanet.py │ └── sanet_mrc.py ├── sembert │ ├── __init__.py │ ├── sembert.py │ └── sembert_classifier.py ├── spe │ ├── __init__.py │ ├── spe.py │ └── spe_lm.py ├── sqp │ ├── __init__.py │ ├── sqp.py │ └── sqp_lm.py ├── stockbert │ ├── __init__.py │ ├── stockbert.py │ └── stockbert_classifier.py ├── textcnn │ ├── __init__.py │ ├── textcnn.py │ └── textcnn_classifier.py ├── tinybert │ ├── __init__.py │ ├── tinybert.py │ ├── tinybert_binary_classifier.py │ └── tinybert_classifier.py ├── transformer │ ├── __init__.py │ ├── transformer.py │ └── transformer_mt.py ├── uda │ ├── __init__.py │ ├── uda.py │ └── uda_classifier.py ├── unilm │ ├── __init__.py │ ├── unilm.py │ ├── unilm_lm.py │ └── unilm_prompt.py ├── util.py ├── vae │ ├── __init__.py │ ├── vae.py │ └── vae_lm.py ├── widedeep │ ├── __init__.py │ ├── widedeep.py │ ├── widedeep_classifier.py │ └── widedeep_regressor.py └── xlnet │ ├── __init__.py │ ├── xlnet.py │ ├── xlnet_binary_classifier.py │ ├── xlnet_classifier.py │ ├── xlnet_lm.py │ └── xlnet_seq_classifier.py ├── com ├── __init__.py ├── cache.py ├── checkpoint.py ├── com.py ├── graph.py ├── parallel.py ├── resource.py ├── text.py └── tfrecords.py ├── core.py ├── opt.py ├── task ├── __init__.py ├── _base_.py ├── export.py ├── infer.py ├── init.py ├── score.py ├── train.py └── train_adversarial.py ├── third.py └── token ├── __init__.py ├── sentencepiece.py └── wordpiece.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # 隐藏文件 3 | .* 4 | */.* 5 | */*/.* 6 | */*/*/.* 7 | 8 | # 压缩文件 9 | *.tar.gz 10 | *.zip 11 | 12 | # 安装生成文件 13 | build 14 | dist 15 | uf.egg-info 16 | 17 | # 无关文件 18 | */__pycache__ 19 | */*/__pycache__ 20 | */*/*/__pycache__ 21 | */*/*/*/__pycache__ 22 | docs/*.pptx 23 | *.sh 24 | PLAN.md 25 | modify.py 26 | tmp.* 27 | test.py 28 | replace.py 29 | alarm.mp3 30 | log 31 | tf_slim 32 | data 33 | checkpoint 34 | pretrained 35 | pnasnet 36 | -------------------------------------------------------------------------------- /docs/pics/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/docs/pics/framework.png -------------------------------------------------------------------------------- /docs/pics/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/docs/pics/logo.png -------------------------------------------------------------------------------- /examples/run_bert_classifier.py: -------------------------------------------------------------------------------- 1 | import uf 2 | import numpy as np 3 | 4 | 5 | def get_best_f1(probs, labels, label_index=1): 6 | """ Calculate the best f1 by scanning over probabilities. 
""" 7 | 8 | assert len(probs) == len(labels) 9 | probs = np.array(probs) 10 | labels = np.array(labels) 11 | 12 | # initialize metrics 13 | n = np.sum(labels == label_index) 14 | tp = n 15 | fp = len(labels) - n 16 | fn = 0 17 | tn = 0 18 | accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 19 | precision = tp / max(tp + fp, 1) 20 | recall = tp / max(tp + fn, 1) 21 | f1 = 2 * precision * recall / max(precision + recall, 1) 22 | threshold = 0 23 | 24 | ids = sorted(range(len(probs)), key=lambda i: probs[i]) 25 | for i in ids: 26 | prob = probs[i] 27 | label = labels[i] 28 | if label == label_index: 29 | tp -= 1 30 | fn += 1 31 | elif label != label_index: 32 | fp -= 1 33 | tn += 1 34 | 35 | _accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 36 | _precision = tp / max(tp + fp, 1) 37 | _recall = tp / max(tp + fn, 1) 38 | _f1 = 2 * _precision * _recall / max(_precision + _recall, 1) 39 | if _f1 > f1: 40 | accuracy = _accuracy 41 | precision = _precision 42 | recall = _recall 43 | f1 = _f1 44 | threshold = prob 45 | return (n, accuracy, precision, recall, f1, threshold) 46 | 47 | 48 | def main(): 49 | 50 | uf.set_log("./log") 51 | 52 | # load data 53 | X, y = [], [] 54 | X_dev, y_dev = [], [] 55 | with open("sst-2/train.txt", encoding="utf-8") as f: 56 | for i, line in enumerate(f): 57 | if i == 0: # ignore title 58 | continue 59 | query, label = line.strip("\n").split("\t") 60 | X.append(query) 61 | y.append(int(label)) 62 | with open("sst-2/dev.txt", encoding="utf-8") as f: 63 | for i, line in enumerate(f): 64 | if i == 0: # ignore title 65 | continue 66 | query, label = line.strip("\n").split("\t") 67 | X_dev.append(query) 68 | y_dev.append(int(label)) 69 | 70 | # modeling 71 | checkpoint_dir = "pretrained/bert-base-zh" 72 | model = uf.BERTClassifier( 73 | config_file=f"{checkpoint_dir}/bert_config.json", 74 | vocab_file=f"{checkpoint_dir}/vocab.txt", 75 | max_seq_length=32, 76 | label_size=2, 77 | init_checkpoint=checkpoint_dir, 78 | output_dir="bert", 79 | gpu_ids="0") 80 | 81 | # training 82 | for epoch in range(3): 83 | model.fit( 84 | X, y, 85 | batch_size=64, 86 | target_steps=-(epoch + 1), 87 | total_steps=-3, 88 | print_per_secs=5, 89 | save_per_steps=3000) 90 | model.localize("bp.%d" % epoch) 91 | 92 | # validation 93 | probs = model.predict(X_dev)["probs"] 94 | for i in range(2): 95 | n, acc, pre, rec, f1, thresh = get_best_f1(probs=probs[:, i], labels=y_dev, label_index=i) 96 | print("[dev] label %d (%d): accuracy %.3f, precision %.3f, recall %.3f, best_f1 %.3f, thresh >%s" 97 | % (i, n, acc, pre, rec, f1, thresh)) 98 | 99 | print("Application finished.") 100 | 101 | 102 | if __name__ == "__main__": 103 | main() 104 | -------------------------------------------------------------------------------- /examples/run_pnasnet_classifier.py: -------------------------------------------------------------------------------- 1 | import uf 2 | import pickle 3 | import numpy as np 4 | 5 | 6 | def get_best_f1(probs, labels, label_index=1): 7 | """ Calculate the best f1 by scanning over probabilities. 
""" 8 | assert len(probs) == len(labels) 9 | probs = np.array(probs) 10 | labels = np.array(labels) 11 | 12 | # initialize metrics 13 | n = np.sum(labels == label_index) 14 | tp = n 15 | fp = len(labels) - n 16 | fn = 0 17 | tn = 0 18 | accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 19 | precision = tp / max(tp + fp, 1) 20 | recall = tp / max(tp + fn, 1) 21 | f1 = 2 * precision * recall / max(precision + recall, 1) 22 | threshold = 0 23 | 24 | ids = sorted(range(len(probs)), key=lambda i: probs[i]) 25 | for i in ids: 26 | prob = probs[i] 27 | label = labels[i] 28 | if label == label_index: 29 | tp -= 1 30 | fn += 1 31 | elif label != label_index: 32 | fp -= 1 33 | tn += 1 34 | 35 | _accuracy = (tp + tn) / max(tp + tn + fp + fn, 1) 36 | _precision = tp / max(tp + fp, 1) 37 | _recall = tp / max(tp + fn, 1) 38 | _f1 = 2 * _precision * _recall / max(_precision + _recall, 1) 39 | if _f1 > f1: 40 | accuracy = _accuracy 41 | precision = _precision 42 | recall = _recall 43 | f1 = _f1 44 | threshold = prob 45 | return (n, accuracy, precision, recall, f1, threshold) 46 | 47 | 48 | def main(): 49 | 50 | uf.set_log("./log") 51 | 52 | # load data 53 | with open("data/cifar-10/batches.meta", "rb") as f: 54 | id2label = pickle.load(f)["label_names"] 55 | X, y = [], [] 56 | X_dev, y_dev = [], [] 57 | for i in range(1, 6): 58 | with open(f"data/cifar-10/data_batch_{i}", "rb") as f: 59 | data = pickle.load(f, encoding="bytes") 60 | for j in range(len(data[b"data"])): 61 | image = data[b"data"][j] 62 | image = np.reshape(image, [3, 32, 32]) 63 | image = np.transpose(image, [1, 2, 0]) 64 | X.append(image) 65 | y.append(data[b"labels"][j]) 66 | with open("data/cifar-10/test_batch", "rb") as f: 67 | data = pickle.load(f, encoding="bytes") 68 | for j in range(len(data[b"data"])): 69 | image = data[b"data"][j] 70 | image = np.reshape(image, [3, 32, 32]) 71 | image = np.transpose(image, [1, 2, 0]) 72 | X_dev.append(image) 73 | y_dev.append(data[b"labels"][j]) 74 | print(f"X: {len(X)}") 75 | print(f"X_dev: {len(X_dev)}") 76 | 77 | # modeling 78 | model = uf.PNasNetClassifier( 79 | label_size=len(id2label), 80 | init_checkpoint="pretrained/pnasnet5-mobile", 81 | output_dir="pnasnet", 82 | gpu_ids="0", 83 | model_size="mobile", 84 | data_format="NHWC") 85 | 86 | # training 87 | for epoch in range(3): 88 | model.fit( 89 | X, y, 90 | batch_size=64, 91 | target_steps=-(epoch + 1), 92 | total_steps=-3, 93 | print_per_secs=5, 94 | save_per_steps=3000) 95 | model.localize("bp.%d" % epoch, into_file=".unif") 96 | 97 | # validation 98 | probs = model.predict(X_dev)["probs"] 99 | for i in range(2): 100 | n, acc, pre, rec, f1, thresh = get_best_f1(probs=probs[:, i], labels=y_dev, label_index=i) 101 | print("[dev] label %d (%d): accuracy %.3f, precision %.3f, recall %.3f, best_f1 %.3f, thresh >%s" 102 | % (i, n, acc, pre, rec, f1, thresh)) 103 | 104 | print("Application finished.") 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /examples/tutorial/TextCNNClassifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "invalid-animation", 6 | "metadata": {}, 7 | "source": [ 8 | "# TextCNNClassifier\n", 9 | "\n", 10 | "可用的中文预训练参数:暂无" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "impossible-professor", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": 
"stream", 22 | "text": [ 23 | "v2.5.0\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "import uf\n", 29 | "\n", 30 | "print(uf.__version__)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "id": "minimal-cambodia", 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "uf.TextCNNClassifier(\n", 44 | " vocab_file=\"../../ref/vocab.txt\",\n", 45 | " max_seq_length=128,\n", 46 | " label_size=None,\n", 47 | " init_checkpoint=None,\n", 48 | " output_dir=None,\n", 49 | " gpu_ids=\"0\",\n", 50 | " filter_sizes=\"2,4,6\",\n", 51 | " num_channels=6,\n", 52 | " hidden_size=256,\n", 53 | " do_lower_case=True,\n", 54 | " truncate_method=\"LIFO\",\n", 55 | ")\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "model = uf.TextCNNClassifier(\"../../ref/vocab.txt\", gpu_ids=\"0\")\n", 61 | "print(model)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "forty-marathon", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "X = [\"天亮以前说再见\", \"笑着泪流满面\", \"去迎接应该你的\", \"更好的明天\"]\n", 72 | "y = [1, 0, 2, 0]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "looking-attempt", 78 | "metadata": {}, 79 | "source": [ 80 | "# 训练" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "id": "sharing-macintosh", 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "WARNING:tensorflow:From c:\\Users\\Luv_d\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\tensorflow\\python\\util\\dispatch.py:1176: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.\n", 94 | "Instructions for updating:\n", 95 | "Please use `rate` instead of `keep_prob`. 
Rate should be set to `rate = 1 - keep_prob`.\n", 96 | "INFO:tensorflow:Build graph with 16,281,825 parameters (among which 5,427,275 are trainable)\n", 97 | "INFO:tensorflow:Running local_init_op\n", 98 | "INFO:tensorflow:Done running local_init_op\n", 99 | "INFO:tensorflow:Running training on 4 samples (step 0 -> 20)\n", 100 | "INFO:tensorflow:step 1, accuracy 0.2500, loss 1.103519, 6.12 steps/sec, 24.46 examples/sec\n", 101 | "INFO:tensorflow:step 4, accuracy 0.7500, loss 0.582421, 28.88 steps/sec, 115.54 examples/sec\n", 102 | "INFO:tensorflow:step 7, accuracy 1.0000, loss 0.080866, 26.82 steps/sec, 107.29 examples/sec\n", 103 | "INFO:tensorflow:step 10, accuracy 1.0000, loss 0.096210, 24.48 steps/sec, 97.92 examples/sec\n", 104 | "INFO:tensorflow:step 13, accuracy 1.0000, loss 0.000033, 20.54 steps/sec, 82.17 examples/sec\n", 105 | "INFO:tensorflow:step 16, accuracy 1.0000, loss 0.000003, 21.19 steps/sec, 84.75 examples/sec\n", 106 | "INFO:tensorflow:step 18, accuracy 1.0000, loss 0.000004, 19.25 steps/sec, 76.99 examples/sec\n", 107 | "INFO:tensorflow:step 20, accuracy 1.0000, loss 0.000252, 21.41 steps/sec, 85.64 examples/sec\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "model.fit(X, y, total_steps=20, learning_rate=0.01) # 模型较小,可以适当提高学习率" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "studied-mechanism", 118 | "metadata": {}, 119 | "source": [ 120 | "# 推理" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "id": "funky-diversity", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "INFO:tensorflow:Running inference on 4 samples\n", 134 | "INFO:tensorflow:process 100.0%, 126.93 examples/sec\n" 135 | ] 136 | }, 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "{'preds': [1, 0, 2, 0],\n", 141 | " 'probs': array([[1.9521560e-03, 9.9782097e-01, 2.2688659e-04],\n", 142 | " [9.9999988e-01, 1.2660193e-08, 6.8851620e-08],\n", 143 | " [3.9597539e-09, 1.1635332e-10, 1.0000000e+00],\n", 144 | " [9.9992132e-01, 3.3130198e-06, 7.5329801e-05]], dtype=float32)}" 145 | ] 146 | }, 147 | "execution_count": 5, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "model.predict(X)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "id": "every-professor", 159 | "metadata": {}, 160 | "source": [ 161 | "# 评分" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 6, 167 | "id": "great-alpha", 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "INFO:tensorflow:Running scoring on 4 samples\n", 175 | "INFO:tensorflow:process 100.0%, 163.93 examples/sec\n" 176 | ] 177 | }, 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "{'accuracy': 1.0, 'loss': 2.2112392e-05}" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "model.score(X, y)" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3.9.13 64-bit", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.9.13" 211 | }, 212 | "vscode": { 
213 |    "interpreter": {
214 |     "hash": "265fd6f62f200408acbbeae0248f34bed9f93569a643842b7a25d2cd76cae5e5"
215 |    }
216 |   }
217 |  },
218 |  "nbformat": 4,
219 |  "nbformat_minor": 5
220 | }
221 | 
--------------------------------------------------------------------------------
/ref/albert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0,
3 |   "hidden_act": "relu",
4 |   "hidden_dropout_prob": 0,
5 |   "embedding_size": 128,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "num_hidden_groups": 1,
13 |   "net_structure_type": 0,
14 |   "layers_to_keep": [],
15 |   "gap_size": 0,
16 |   "num_memory_blocks": 0,
17 |   "inner_group_num": 1,
18 |   "down_scale_factor": 1,
19 |   "type_vocab_size": 2,
20 |   "vocab_size": 21128
21 | }
22 | 
--------------------------------------------------------------------------------
/ref/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 768,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 512,
10 |   "num_attention_heads": 12,
11 |   "num_hidden_layers": 12,
12 |   "pooler_fc_size": 768,
13 |   "pooler_num_attention_heads": 12,
14 |   "pooler_num_fc_layers": 3,
15 |   "pooler_size_per_head": 128,
16 |   "pooler_type": "first_token_transform",
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21128
19 | }
20 | 
--------------------------------------------------------------------------------
/ref/spiece.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/ref/spiece.model
--------------------------------------------------------------------------------
/ref/xlnet_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "d_head": 64,
3 |   "d_inner": 3072,
4 |   "d_model": 768,
5 |   "ff_activation": "relu",
6 |   "n_head": 12,
7 |   "n_layer": 12,
8 |   "n_token": 32000,
9 |   "untie_r": true
10 | }
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Build guide:
3 | 
4 | In the current directory, run `python setup.py install`. If the
5 | operation is not authorized, try `python setup.py install --user`.
6 | 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | 11 | setup( 12 | name="uf", 13 | version="v2.5.21", 14 | description="Unified framework for NLP tasks.", 15 | url="https://github.com/geyingli/unif", 16 | long_description=open("README.md", "r", encoding="utf-8").read(), 17 | long_description_content_type="text/markdown", 18 | author="Geying Li", 19 | author_email="luv_dusk@163.com", 20 | license="Apache-2.0", 21 | packages=find_packages(), 22 | install_requires=[ 23 | "numpy", 24 | ], 25 | extras_require={ 26 | "cpu": ["tensorflow>=1.11.0"], 27 | "gpu": ["tensorflow-gpu>=1.11.0"], 28 | }, 29 | python_requires=">=3.6.0", 30 | classifiers=[ 31 | "Operating System :: OS Independent", 32 | "License :: OSI Approved :: Apache Software License", 33 | "Programming Language :: Python :: 3", 34 | "Programming Language :: Python :: 3.6", 35 | "Programming Language :: Python :: 3.7", 36 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 37 | ], 38 | keywords=( 39 | "bert xlnet electra nlp tensorflow classification generation " 40 | "question-answering machine-reading-comprehension " 41 | "translation sequence-labeling" 42 | ), 43 | ) 44 | -------------------------------------------------------------------------------- /uf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = "v2.5.21" 3 | 4 | # loading models 5 | from .apps import * 6 | 7 | from .com import MultiProcess 8 | from .com import restore 9 | from .com import load 10 | from .com import download 11 | from .com import download_all 12 | from .com import get_checkpoint_path 13 | from .com import get_assignment_map 14 | from .com import list_variables 15 | from .com import list_resources 16 | from .com import set_verbosity 17 | from .com import set_log 18 | 19 | set_verbosity() 20 | 21 | __all__ = [ 22 | "MultiProcess", 23 | "restore", 24 | "load", 25 | "download", 26 | "download_all", 27 | "get_checkpoint_path", 28 | "get_assignment_map", 29 | "list_variables", 30 | "list_resources", 31 | "set_verbosity", 32 | "set_log", 33 | ] 34 | -------------------------------------------------------------------------------- /uf/apps/__init__.py: -------------------------------------------------------------------------------- 1 | from ..com import unimported_module 2 | 3 | from .bert.bert_lm import BERTLM 4 | from .roberta.roberta_lm import RoBERTaLM 5 | from .albert.albert_lm import ALBERTLM 6 | from .electra.electra_lm import ELECTRALM 7 | from .dilated.dilated_lm import DilatedLM 8 | from .recbert.recbert_lm import RecBERTLM 9 | from .recbert.recbert2_lm import RecBERT2LM 10 | from .recbert.recbert3_lm import RecBERT3LM 11 | from .vae.vae_lm import VAELM 12 | from .spe.spe_lm import SPELM 13 | from .gpt2.gpt2_lm import GPT2LM 14 | from .unilm.unilm_lm import UniLM 15 | from .unilm.unilm_prompt import UniLMPrompt 16 | from .sqp.sqp_lm import SQPLM 17 | from .textcnn.textcnn_classifier import TextCNNClassifier 18 | from .rnn.rnn_classifier import RNNClassifier 19 | from .rnn.bi_rnn_classifier import BiRNNClassifier 20 | from .bert.bert_classifier import BERTClassifier 21 | from .roberta.roberta_classifier import RoBERTaClassifier 22 | from .albert.albert_classifier import ALBERTClassifier 23 | from .electra.electra_classifier import ELECTRAClassifier 24 | from .widedeep.widedeep_classifier import WideDeepClassifier 25 | from .sembert.sembert_classifier import SemBERTClassifier 26 | from .performer.performer_classifier import PerformerClassifier 27 | from 
.uda.uda_classifier import UDAClassifier 28 | from .motian.motian_classifier import MotianClassifier 29 | from .tinybert.tinybert_classifier import TinyBERTClassifier 30 | from .tinybert.tinybert_binary_classifier import TinyBERTBinaryClassifier 31 | from .fastbert.fastbert_classifier import FastBERTClassifier 32 | from .adabert.adabert_classifier import AdaBERTClassifier 33 | from .stockbert.stockbert_classifier import StockBERTClassifier 34 | from .bert.bert_binary_classifier import BERTBinaryClassifier 35 | from .bert.bert_tmp_binary_classifier import BERTTmpBinaryClassifier 36 | from .roberta.roberta_binary_classifier import RoBERTaBinaryClassifier 37 | from .albert.albert_binary_classifier import ALBERTBinaryClassifier 38 | from .electra.electra_binary_classifier import ELECTRABinaryClassifier 39 | from .bert.bert_seq_classifier import BERTSeqClassifier 40 | from .roberta.roberta_seq_classifier import RoBERTaSeqClassifier 41 | from .albert.albert_seq_classifier import ALBERTSeqClassifier 42 | from .electra.electra_seq_classifier import ELECTRASeqClassifier 43 | from .bert.bert_seq_cross_classifier import BERTSeqCrossClassifier 44 | from .bert.bert_regressor import BERTRegressor 45 | from .widedeep.widedeep_regressor import WideDeepRegressor 46 | from .bert.bert_ner import BERTNER 47 | from .bert.bert_crf_ner import BERTCRFNER 48 | from .bert.bert_crf_cascade_ner import BERTCRFCascadeNER 49 | from .bert.bert_mrc import BERTMRC 50 | from .bert.bert_verifier_mrc import BERTVerifierMRC 51 | from .roberta.roberta_mrc import RoBERTaMRC 52 | from .albert.albert_mrc import ALBERTMRC 53 | from .electra.electra_mrc import ELECTRAMRC 54 | from .retroreader.retroreader_mrc import RetroReaderMRC 55 | from .sanet.sanet_mrc import SANetMRC 56 | from .transformer.transformer_mt import TransformerMT 57 | from .chatbot.chatbot_mt import ChatbotMT 58 | try: 59 | from .xlnet.xlnet_classifier import XLNetClassifier 60 | from .xlnet.xlnet_binary_classifier import XLNetBinaryClassifier 61 | except (ModuleNotFoundError, ImportError): 62 | XLNetClassifier = unimported_module( 63 | "XLNetClassifier", 64 | "Module `sentencepiece` is required to launch XLNetClassifier. " 65 | "Try `pip install sentencepiece` or build from source." 66 | ) 67 | XLNetBinaryClassifier = unimported_module( 68 | "XLNetBinaryClassifier", 69 | "Module `sentencepiece` is required to launch XLNetBinaryClassifier. " 70 | "Try `pip install sentencepiece` or build from source." 71 | ) 72 | try: 73 | from .nasnet.pnasnet_classifier import PNasNetClassifier 74 | except (ModuleNotFoundError, ImportError): 75 | PNasNetClassifier = unimported_module( 76 | "PNasNetClassifier", 77 | "Module `tf_slim` is required to launch PNasNetClassifier. " 78 | "Try `pip install tf_slim` or build from source." 
79 | ) 80 | 81 | del unimported_module 82 | 83 | 84 | __all__ = [ 85 | "BERTLM", 86 | "RoBERTaLM", 87 | "ALBERTLM", 88 | "ELECTRALM", 89 | "VAELM", 90 | "GPT2LM", 91 | "UniLM", 92 | "TextCNNClassifier", 93 | "RNNClassifier", 94 | "BiRNNClassifier", 95 | "BERTClassifier", 96 | "XLNetClassifier", 97 | "RoBERTaClassifier", 98 | "ALBERTClassifier", 99 | "ELECTRAClassifier", 100 | "WideDeepClassifier", 101 | "SemBERTClassifier", 102 | "UDAClassifier", 103 | "PerformerClassifier", 104 | "TinyBERTClassifier", 105 | "TinyBERTBinaryClassifier", 106 | "FastBERTClassifier", 107 | "BERTBinaryClassifier", 108 | "XLNetBinaryClassifier", 109 | "RoBERTaBinaryClassifier", 110 | "ALBERTBinaryClassifier", 111 | "ELECTRABinaryClassifier", 112 | "BERTSeqClassifier", 113 | "RoBERTaSeqClassifier", 114 | "ALBERTSeqClassifier", 115 | "ELECTRASeqClassifier", 116 | "BERTSeqCrossClassifier", 117 | "BERTRegressor", 118 | "WideDeepRegressor", 119 | "BERTNER", 120 | "BERTCRFNER", 121 | "BERTCRFCascadeNER", 122 | "BERTMRC", 123 | "BERTVerifierMRC", 124 | "RoBERTaMRC", 125 | "ALBERTMRC", 126 | "ELECTRAMRC", 127 | "RetroReaderMRC", 128 | "SANetMRC", 129 | "TransformerMT", 130 | "PNasNetClassifier", 131 | 132 | # trial 133 | "DilatedLM", 134 | "RecBERTLM", 135 | "RecBERT2LM", 136 | "RecBERT3LM", 137 | "SPELM", 138 | "StockBERTClassifier", 139 | "AdaBERTClassifier", 140 | "ChatbotMT", 141 | "UniLMPrompt", 142 | "MotianClassifier", 143 | "SQPLM", 144 | "BERTTmpBinaryClassifier", 145 | ] 146 | -------------------------------------------------------------------------------- /uf/apps/_base_/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/_base_/__init__.py -------------------------------------------------------------------------------- /uf/apps/_base_/_base_.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | 4 | class BaseEncoder: 5 | def __init__(self, *args, **kwargs): 6 | pass 7 | 8 | def get_pooled_output(self, *args, **kwargs): 9 | raise NotImplementedError() 10 | 11 | def get_sequence_output(self, *args, **kwargs): 12 | raise NotImplementedError() 13 | 14 | 15 | class BaseDecoder: 16 | def __init__(self, *args, **kwargs): 17 | 18 | # scalar of total loss, used for back propagation 19 | self.train_loss = None 20 | 21 | # supervised tensors of each example 22 | self.tensors = collections.OrderedDict() 23 | 24 | def get_forward_outputs(self): 25 | return (self.train_loss, self.tensors) 26 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_lm.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from ...core import BaseModule 4 | 5 | 6 | class LMModule(BaseModule): 7 | """ Application class of language modeling (LM). 
""" 8 | 9 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 10 | "max_seq_length": "An integer that defines max sequence length of input tokens", 11 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 12 | } 13 | 14 | def fit_from_tfrecords( 15 | self, 16 | batch_size=32, 17 | learning_rate=5e-5, 18 | target_steps=None, 19 | total_steps=1000000, 20 | warmup_ratio=0.01, # 默认值不同 21 | print_per_secs=0.1, 22 | save_per_steps=10000, 23 | tfrecords_files=None, 24 | n_jobs=None, 25 | **kwargs, 26 | ): 27 | super().fit_from_tfrecords( 28 | batch_size, 29 | learning_rate, 30 | target_steps, 31 | total_steps, 32 | warmup_ratio, 33 | print_per_secs, 34 | save_per_steps, 35 | tfrecords_files, 36 | n_jobs, 37 | **kwargs, 38 | ) 39 | fit_from_tfrecords.__doc__ = BaseModule.fit_from_tfrecords.__doc__ 40 | 41 | def fit( 42 | self, 43 | X=None, y=None, sample_weight=None, X_tokenized=None, 44 | batch_size=32, 45 | learning_rate=5e-5, 46 | target_steps=None, 47 | total_steps=1000000, 48 | warmup_ratio=0.01, # 默认值不同 49 | print_per_secs=0.1, 50 | save_per_steps=10000, 51 | **kwargs, 52 | ): 53 | super().fit( 54 | X, y, sample_weight, X_tokenized, 55 | batch_size, 56 | learning_rate, 57 | target_steps, 58 | total_steps, 59 | warmup_ratio, 60 | print_per_secs, 61 | save_per_steps, 62 | **kwargs, 63 | ) 64 | fit.__doc__ = BaseModule.fit.__doc__ 65 | 66 | def score(self, *args, **kwargs): 67 | raise AttributeError("`score` method is not supported for unsupervised language modeling (LM) modules.") 68 | 69 | def _convert_x(self, x, tokenized): 70 | """ Convert text sample. """ 71 | 72 | # deal with untokenized inputs 73 | if not tokenized: 74 | 75 | # deal with general inputs 76 | if isinstance(x, str): 77 | return [self.tokenizer.tokenize(x)] 78 | 79 | # deal with multiple inputs 80 | return [self.tokenizer.tokenize(seg) for seg in x] 81 | 82 | # deal with tokenized inputs 83 | if isinstance(x[0], str): 84 | return [copy.deepcopy(x)] 85 | 86 | # deal with tokenized and multiple inputs 87 | return copy.deepcopy(x) 88 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_mt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from ...core import BaseModule 5 | from ... import com 6 | 7 | 8 | class MTModule(BaseModule): 9 | """ Application class of machine translation (MT). """ 10 | 11 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 12 | "source_max_seq_length": "An integer that defines max sequence length of source language tokens", 13 | "target_max_seq_length": "An integer that defines max sequence length of target language tokens", 14 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 15 | } 16 | 17 | def _get_bleu(self, preds, labels, mask, max_gram=4): 18 | """ Bilingual evaluation understudy. 
""" 19 | eos_id = self.tokenizer.convert_tokens_to_ids([""])[0] 20 | 21 | bleus = [] 22 | for _preds, _labels, _mask in zip(preds, labels, mask): 23 | 24 | # preprocess 25 | for i in range(len(_preds)): 26 | if _preds[i] == eos_id: 27 | _preds = _preds[:i+1] 28 | break 29 | _labels = _labels[:int(np.sum(_mask)) - 1] # remove 30 | 31 | power = 0 32 | for n in range(max_gram): 33 | ngrams = [] 34 | nominator = 0 35 | denominator = 0 36 | 37 | for i in range(len(_labels) - n): 38 | ngram = _labels[i:i+1+n].tolist() 39 | if ngram in ngrams: 40 | continue 41 | cand_count = len(com.find_all_boyer_moore(_preds, ngram)) 42 | ref_count = len(com.find_all_boyer_moore(_labels, ngram)) 43 | nominator += min(cand_count, ref_count) 44 | denominator += cand_count 45 | ngrams.append(ngram) 46 | 47 | power += 1 / (n + 1) * np.log(nominator / (denominator + 1e-6) + 1e-6) 48 | 49 | _bleu = np.exp(power) 50 | if len(_preds) >= len(_labels): 51 | _bleu *= np.exp(1 - len(_labels) / len(_preds)) 52 | bleus.append(_bleu) 53 | 54 | return np.mean(bleus) 55 | 56 | def _get_rouge(self, preds, labels, mask, max_gram=4): 57 | """ Recall-Oriented Understudy for Gisting Evaluation. """ 58 | eos_id = self.tokenizer.convert_tokens_to_ids([""])[0] 59 | 60 | rouges = [] 61 | for _preds, _labels, _mask in zip(preds, labels, mask): 62 | 63 | # preprocess 64 | for i in range(len(_preds)): 65 | if _preds[i] == eos_id: 66 | _preds = _preds[:i+1] 67 | break 68 | _labels = _labels[:int(np.sum(_mask)) - 1] # remove 69 | 70 | nominator = 0 71 | denominator = 0 72 | for n in range(max_gram): 73 | ngrams = [] 74 | 75 | for i in range(len(_labels) - n): 76 | ngram = _labels[i:i+1+n].tolist() 77 | if ngram in ngrams: 78 | continue 79 | nominator += len(com.find_all_boyer_moore(_preds, ngram)) 80 | denominator += len(com.find_all_boyer_moore(_labels, ngram)) 81 | ngrams.append(ngram) 82 | 83 | _rouge = nominator / denominator if denominator else 0 84 | rouges.append(_rouge) 85 | 86 | return np.mean(rouges) 87 | 88 | def _convert_x(self, x, tokenized): 89 | 90 | # deal with untokenized inputs 91 | if not tokenized: 92 | 93 | # deal with general inputs 94 | if isinstance(x, str): 95 | return self.tokenizer.tokenize(x) 96 | 97 | # deal with tokenized inputs 98 | elif isinstance(x[0], str): 99 | return copy.deepcopy(x) 100 | 101 | # deal with tokenized and multiple inputs 102 | raise ValueError("Machine translation module only supports single sentence inputs.") 103 | -------------------------------------------------------------------------------- /uf/apps/_base_/_base_regressor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from ._base_ import BaseDecoder 5 | from ...core import BaseModule 6 | from ... import com 7 | from ...third import tf 8 | from .. 
import util 9 | 10 | 11 | class RegDecoder(BaseDecoder): 12 | def __init__( 13 | self, 14 | is_training, 15 | input_tensor, 16 | label_floats, 17 | label_size=2, 18 | sample_weight=None, 19 | scope="reg", 20 | hidden_dropout_prob=0.1, 21 | initializer_range=0.02, 22 | trainable=True, 23 | **kwargs, 24 | ): 25 | super().__init__(**kwargs) 26 | 27 | if kwargs.get("is_logits"): 28 | logits = input_tensor 29 | else: 30 | if kwargs.get("return_hidden"): 31 | self.tensors["hidden"] = input_tensor 32 | 33 | with tf.variable_scope(scope): 34 | output_layer = util.dropout(input_tensor, hidden_dropout_prob if is_training else 0.0) 35 | intermediate_output = tf.layers.dense( 36 | output_layer, 37 | label_size * 4, 38 | use_bias=False, 39 | kernel_initializer=util.create_initializer(initializer_range), 40 | trainable=trainable, 41 | ) 42 | logits = tf.layers.dense( 43 | intermediate_output, 44 | label_size, 45 | use_bias=False, 46 | kernel_initializer=util.create_initializer(initializer_range), 47 | trainable=trainable, 48 | name="probs", 49 | ) 50 | 51 | self.tensors["probs"] = logits 52 | 53 | per_example_loss = util.mean_squared_error(logits, label_floats, **kwargs) 54 | if sample_weight is not None: 55 | per_example_loss *= sample_weight 56 | self.tensors["losses"] = per_example_loss 57 | self.train_loss = tf.reduce_mean(per_example_loss) 58 | 59 | 60 | class RegressorModule(BaseModule): 61 | """ Application class of regression. """ 62 | 63 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 64 | "max_seq_length": "An integer that defines max sequence length of input tokens", 65 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 66 | } 67 | 68 | def _convert_x(self, x, tokenized): 69 | """ Convert text sample. """ 70 | 71 | # deal with untokenized inputs 72 | if not tokenized: 73 | 74 | # deal with general inputs 75 | if isinstance(x, str): 76 | return [self.tokenizer.tokenize(x)] 77 | 78 | # deal with multiple inputs 79 | return [self.tokenizer.tokenize(seg) for seg in x] 80 | 81 | # deal with tokenized inputs 82 | if isinstance(x[0], str): 83 | return [copy.deepcopy(x)] 84 | 85 | # deal with tokenized and multiple inputs 86 | return copy.deepcopy(x) 87 | 88 | def _convert_y(self, y): 89 | 90 | sample = y[0] 91 | if isinstance(sample, list): 92 | self.label_size = len(sample) 93 | elif isinstance(sample, float) or isinstance(sample, int) or isinstance(sample, str): 94 | self.label_size = 1 95 | 96 | label_floats = [] 97 | for idx, sample in enumerate(y): 98 | try: 99 | if isinstance(sample, list): 100 | _label_floats = [float(label) for label in sample] 101 | elif isinstance(sample, float) or isinstance(sample, int) or isinstance(sample, str): 102 | _label_floats = [float(sample)] 103 | except Exception as e: 104 | raise ValueError("Wrong label format (%s): %s. 
An example: y = [[0.12, 0.09], [-0.53, 0.98], ...]" % (sample, e)) 105 | label_floats.append(_label_floats) 106 | 107 | return label_floats 108 | 109 | def _get_fit_ops(self, from_tfrecords=False): 110 | ops = [self.tensors["probs"]] 111 | if from_tfrecords: 112 | ops.extend([self.placeholders["label_floats"]]) 113 | return ops 114 | 115 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 116 | 117 | if from_tfrecords: 118 | batch_labels = output_arrays[-1] 119 | else: 120 | batch_labels = feed_dict[self.placeholders["label_floats"]] 121 | 122 | # mse 123 | batch_preds = output_arrays[0] 124 | mse = np.mean(np.square(batch_preds - batch_labels)) 125 | 126 | info = "" 127 | info += ", mse %.6f" % mse 128 | 129 | return info 130 | 131 | def _get_predict_ops(self): 132 | return [self.tensors["probs"]] 133 | 134 | def _get_predict_outputs(self, output_arrays, n_inputs): 135 | 136 | # probs 137 | probs = com.transform(output_arrays[0], n_inputs) 138 | 139 | outputs = {} 140 | outputs["probs"] = probs 141 | 142 | return outputs 143 | 144 | def _get_score_ops(self): 145 | return [self.tensors["probs"], self.tensors["losses"]] 146 | 147 | def _get_score_outputs(self, output_arrays, n_inputs): 148 | 149 | # mse 150 | probs = com.transform(output_arrays[0], n_inputs) 151 | labels = self.data["label_floats"] 152 | mse = np.mean(np.square(probs - labels)) 153 | 154 | outputs = {} 155 | outputs["mse"] = mse 156 | 157 | return outputs 158 | 159 | -------------------------------------------------------------------------------- /uf/apps/adabert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/adabert/__init__.py -------------------------------------------------------------------------------- /uf/apps/adabert/adabert_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .adabert import AdaBERTClsDistillor 4 | from .._base_._base_classifier import ClassifierModule 5 | from ..bert.bert_classifier import BERTClassifier 6 | from ..bert.bert import BERTConfig 7 | from ...token import WordPieceTokenizer 8 | from ...third import tf 9 | 10 | 11 | class AdaBERTClassifier(BERTClassifier, ClassifierModule): 12 | """ Single-label classifier on AdaBERT, a distillation model. 
""" 13 | 14 | def __init__( 15 | self, 16 | config_file, 17 | vocab_file, 18 | max_seq_length=128, 19 | label_size=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | drop_pooler=False, 24 | k_max=4, 25 | num_intermediates=3, 26 | embedding_size=128, 27 | temp_decay_steps=18000, 28 | model_l2_reg=3e-4, 29 | arch_l2_reg=1e-3, 30 | loss_gamma=0.8, 31 | loss_beta=4.0, 32 | do_lower_case=True, 33 | truncate_method="LIFO", 34 | ): 35 | self.__init_args__ = locals() 36 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 37 | 38 | self.max_seq_length = max_seq_length 39 | self.label_size = label_size 40 | self.truncate_method = truncate_method 41 | self._drop_pooler = drop_pooler 42 | self._k_max = k_max 43 | self._num_intermediates = num_intermediates 44 | self._embedding_size = embedding_size 45 | self._temp_decay_steps = temp_decay_steps 46 | self._model_l2_reg = model_l2_reg 47 | self._arch_l2_reg = arch_l2_reg 48 | self._loss_gamma = loss_gamma 49 | self._loss_beta = loss_beta 50 | 51 | self.bert_config = BERTConfig.from_json_file(config_file) 52 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 53 | self.decay_power = "unsupported" 54 | 55 | assert label_size, ("`label_size` can't be None.") 56 | if "[CLS]" not in self.tokenizer.vocab: 57 | self.tokenizer.add("[CLS]") 58 | self.bert_config.vocab_size += 1 59 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 60 | if "[SEP]" not in self.tokenizer.vocab: 61 | self.tokenizer.add("[SEP]") 62 | self.bert_config.vocab_size += 1 63 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 64 | 65 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 66 | self._assert_legal(X, y, sample_weight, X_tokenized) 67 | 68 | n_inputs = None 69 | data = {} 70 | 71 | # convert X 72 | if X is not None or X_tokenized is not None: 73 | tokenized = False if X is not None else X_tokenized 74 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 75 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 76 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 77 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 78 | n_inputs = len(input_ids) 79 | 80 | if n_inputs < self.batch_size: 81 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 82 | 83 | if y is not None: 84 | # convert y and sample_weight 85 | label_ids = self._convert_y(y) 86 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 87 | 88 | # convert sample_weight 89 | if is_training or y is not None: 90 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 91 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 92 | 93 | return data 94 | 95 | def _forward(self, is_training, placeholders, **kwargs): 96 | 97 | model = AdaBERTClsDistillor( 98 | bert_config=self.bert_config, 99 | is_training=is_training, 100 | input_ids=placeholders["input_ids"], 101 | input_mask=placeholders["input_mask"], 102 | segment_ids=placeholders["segment_ids"], 103 | label_ids=placeholders.get("label_ids"), 104 | sample_weight=placeholders.get("sample_weight"), 105 | drop_pooler=self._drop_pooler, 106 | label_size=self.label_size, 107 | k_max=self._k_max, 108 | num_intermediates=self._num_intermediates, 109 | embedding_size=self._embedding_size , 110 | temp_decay_steps=self._temp_decay_steps, 111 | model_l2_reg=self._model_l2_reg, 112 | arch_l2_reg=self._arch_l2_reg, 113 | 
loss_gamma=self._loss_gamma, 114 | loss_beta=self._loss_beta, 115 | **kwargs, 116 | ) 117 | train_loss, tensors = model.get_forward_outputs() 118 | return train_loss, tensors 119 | 120 | def _get_fit_ops(self, from_tfrecords=False): 121 | return [self.tensors["losses"]] 122 | 123 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 124 | 125 | # loss 126 | batch_losses = output_arrays[0] 127 | loss = np.mean(batch_losses) 128 | 129 | info = "" 130 | info += ", distill loss %.6f" % loss 131 | 132 | return info 133 | -------------------------------------------------------------------------------- /uf/apps/albert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/albert/__init__.py -------------------------------------------------------------------------------- /uf/apps/albert/albert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 3 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 9 | """ Multi-label classifier on ALBERT. """ 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | label_weight=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | drop_pooler=False, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.label_weight = label_weight 31 | self.truncate_method = truncate_method 32 | self._drop_pooler = drop_pooler 33 | 34 | self.albert_config = ALBERTConfig.from_json_file(config_file) 35 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 36 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 37 | 38 | if "[CLS]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[CLS]") 40 | self.albert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | self.albert_config.vocab_size += 1 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def _forward(self, is_training, placeholders, **kwargs): 48 | 49 | encoder = ALBERTEncoder( 50 | albert_config=self.albert_config, 51 | is_training=is_training, 52 | input_ids=placeholders["input_ids"], 53 | input_mask=placeholders["input_mask"], 54 | segment_ids=placeholders["segment_ids"], 55 | drop_pooler=self._drop_pooler, 56 | **kwargs, 57 | ) 58 | encoder_output = encoder.get_pooled_output() 59 | decoder = BinaryClsDecoder( 60 | is_training=is_training, 61 | input_tensor=encoder_output, 62 | label_ids=placeholders["label_ids"], 63 | label_size=self.label_size, 64 | sample_weight=placeholders.get("sample_weight"), 65 | label_weight=self.label_weight, 66 | scope="cls/seq_relationship", 67 | **kwargs, 68 | ) 69 | train_loss, tensors = decoder.get_forward_outputs() 70 | return train_loss, tensors 71 | 
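Editor's note: the ALBERT binary (multi-label) classifier above is driven through the same `fit` / `predict` / `score` interface as the other UNIF application classes. The sketch below shows one way to wire it up, mirroring the patterns in `examples/` and the tutorial notebooks. It is a minimal sketch, not code from the repository: the checkpoint directory is hypothetical, and the multi-hot label layout for `y` (as well as the `"probs"` output key) is an assumption taken over from the single-label examples; the exact label format is defined by the binary-classifier base class, which is not shown here.

import uf

# Hypothetical checkpoint directory -- point this at a real ALBERT checkpoint.
ckpt_dir = "pretrained/albert-base-zh"

model = uf.ALBERTBinaryClassifier(
    config_file=f"{ckpt_dir}/albert_config.json",
    vocab_file=f"{ckpt_dir}/vocab.txt",
    max_seq_length=64,
    label_size=3,                    # number of independent binary labels
    init_checkpoint=ckpt_dir,
    output_dir="albert_binary",
    gpu_ids="0",
)

X = ["天亮以前说再见", "笑着泪流满面", "去迎接应该你的", "更好的明天"]
y = [[1, 0, 0], [0, 1, 1], [0, 0, 1], [1, 1, 0]]   # assumed multi-hot labels, one row per sample

model.fit(X, y, total_steps=20)                    # short demo run, as in the tutorial notebooks
probs = model.predict(X)["probs"]                  # per-label probabilities, shape [len(X), label_size]
print(probs)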
-------------------------------------------------------------------------------- /uf/apps/albert/albert_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTClassifier(BERTClassifier, ClassifierModule): 9 | """ Single-label classifier on ALBERT. """ 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | drop_pooler=False, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | self._drop_pooler = drop_pooler 31 | 32 | self.albert_config = ALBERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.albert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.albert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def _forward(self, is_training, placeholders, **kwargs): 46 | 47 | encoder = ALBERTEncoder( 48 | albert_config=self.albert_config, 49 | is_training=is_training, 50 | input_ids=placeholders["input_ids"], 51 | input_mask=placeholders["input_mask"], 52 | segment_ids=placeholders["segment_ids"], 53 | drop_pooler=self._drop_pooler, 54 | **kwargs, 55 | ) 56 | encoder_output = encoder.get_pooled_output() 57 | decoder = ClsDecoder( 58 | is_training=is_training, 59 | input_tensor=encoder_output, 60 | label_ids=placeholders["label_ids"], 61 | label_size=self.label_size, 62 | sample_weight=placeholders.get("sample_weight"), 63 | scope="cls/seq_relationship", 64 | **kwargs, 65 | ) 66 | train_loss, tensors = decoder.get_forward_outputs() 67 | return train_loss, tensors 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /uf/apps/albert/albert_mrc.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from ..bert.bert_mrc import BERTMRC 3 | from .._base_._base_mrc import MRCDecoder, MRCModule 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTMRC(BERTMRC, MRCModule): 9 | """ Machine reading comprehension on ALBERT. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=256, 16 | init_checkpoint=None, 17 | output_dir=None, 18 | gpu_ids=None, 19 | do_lower_case=True, 20 | truncate_method="longer-FO", 21 | ): 22 | self.__init_args__ = locals() 23 | super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 24 | 25 | self.max_seq_length = max_seq_length 26 | self.truncate_method = truncate_method 27 | self._do_lower_case = do_lower_case 28 | 29 | self.albert_config = ALBERTConfig.from_json_file(config_file) 30 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 31 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 32 | 33 | if "[CLS]" not in self.tokenizer.vocab: 34 | self.tokenizer.add("[CLS]") 35 | self.albert_config.vocab_size += 1 36 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 37 | if "[SEP]" not in self.tokenizer.vocab: 38 | self.tokenizer.add("[SEP]") 39 | self.albert_config.vocab_size += 1 40 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 41 | 42 | def _forward(self, is_training, placeholders, **kwargs): 43 | 44 | encoder = ALBERTEncoder( 45 | albert_config=self.albert_config, 46 | is_training=is_training, 47 | input_ids=placeholders["input_ids"], 48 | input_mask=placeholders["input_mask"], 49 | segment_ids=placeholders["segment_ids"], 50 | **kwargs, 51 | ) 52 | encoder_output = encoder.get_sequence_output() 53 | decoder = MRCDecoder( 54 | is_training=is_training, 55 | input_tensor=encoder_output, 56 | label_ids=placeholders["label_ids"], 57 | sample_weight=placeholders.get("sample_weight"), 58 | scope="mrc", 59 | **kwargs, 60 | ) 61 | train_loss, tensors = decoder.get_forward_outputs() 62 | return train_loss, tensors 63 | -------------------------------------------------------------------------------- /uf/apps/albert/albert_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .albert import ALBERTEncoder, ALBERTConfig, get_decay_power 2 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 3 | from ..bert.bert_seq_classifier import BERTSeqClassifier 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ALBERTSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 9 | """ Sequence labeling classifier on ALBERT. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.albert_config = ALBERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.albert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.albert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.albert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = ALBERTEncoder( 46 | albert_config=self.albert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | **kwargs, 52 | ) 53 | encoder_output = encoder.get_sequence_output() 54 | decoder = SeqClsDecoder( 55 | is_training=is_training, 56 | input_tensor=encoder_output, 57 | input_mask=placeholders["input_mask"], 58 | label_ids=placeholders["label_ids"], 59 | label_size=self.label_size, 60 | sample_weight=placeholders.get("sample_weight"), 61 | scope="cls/sequence", 62 | **kwargs, 63 | ) 64 | train_loss, tensors = decoder.get_forward_outputs() 65 | return train_loss, tensors 66 | -------------------------------------------------------------------------------- /uf/apps/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/bert/__init__.py -------------------------------------------------------------------------------- /uf/apps/bert/bert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTBinaryClassifier(BinaryClassifierModule): 11 | """ Multi-label classifier on BERT. 
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | label_weight=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | drop_pooler=False, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.label_weight = label_weight 33 | self.truncate_method = truncate_method 34 | self._drop_pooler = drop_pooler 35 | 36 | self.bert_config = BERTConfig.from_json_file(config_file) 37 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 38 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 39 | 40 | if "[CLS]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[CLS]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 44 | if "[SEP]" not in self.tokenizer.vocab: 45 | self.tokenizer.add("[SEP]") 46 | self.bert_config.vocab_size += 1 47 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 48 | 49 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 50 | self._assert_legal(X, y, sample_weight, X_tokenized) 51 | 52 | if is_training: 53 | assert y is not None, "`y` can't be None." 54 | if is_parallel: 55 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 56 | 57 | n_inputs = None 58 | data = {} 59 | 60 | # convert X 61 | if X is not None or X_tokenized is not None: 62 | tokenized = False if X is not None else X_tokenized 63 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 64 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 65 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 66 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 67 | n_inputs = len(input_ids) 68 | 69 | if n_inputs < self.batch_size: 70 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 71 | 72 | # convert y 73 | if y is not None: 74 | label_ids = self._convert_y(y) 75 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 76 | 77 | # convert sample_weight 78 | if is_training or y is not None: 79 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 80 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 81 | 82 | return data 83 | 84 | def _convert_X(self, X_target, tokenized): 85 | 86 | # tokenize input texts 87 | segment_input_tokens = [] 88 | for idx, sample in enumerate(X_target): 89 | try: 90 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 91 | except Exception as e: 92 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 93 | 94 | input_ids = [] 95 | input_mask = [] 96 | segment_ids = [] 97 | for idx, segments in enumerate(segment_input_tokens): 98 | _input_tokens = ["[CLS]"] 99 | _input_ids = [] 100 | _input_mask = [1] 101 | _segment_ids = [0] 102 | 103 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 104 | for s_id, segment in enumerate(segments): 105 | _segment_id = min(s_id, 1) 106 | _input_tokens.extend(segment + ["[SEP]"]) 107 | _input_mask.extend([1] * (len(segment) + 1)) 108 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 109 | 110 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 111 | 112 | # padding 113 | for _ in range(self.max_seq_length - len(_input_ids)): 114 | _input_ids.append(0) 115 | _input_mask.append(0) 116 | _segment_ids.append(0) 117 | 118 | input_ids.append(_input_ids) 119 | input_mask.append(_input_mask) 120 | segment_ids.append(_segment_ids) 121 | 122 | return input_ids, input_mask, segment_ids 123 | 124 | def _set_placeholders(self, **kwargs): 125 | self.placeholders = { 126 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 127 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 128 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 129 | "label_ids": tf.placeholder(tf.int32, [None, self.label_size], "label_ids"), 130 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 131 | } 132 | 133 | def _forward(self, is_training, placeholders, **kwargs): 134 | 135 | encoder = BERTEncoder( 136 | bert_config=self.bert_config, 137 | is_training=is_training, 138 | input_ids=placeholders["input_ids"], 139 | input_mask=placeholders["input_mask"], 140 | segment_ids=placeholders["segment_ids"], 141 | drop_pooler=self._drop_pooler, 142 | **kwargs, 143 | ) 144 | encoder_output = encoder.get_pooled_output() 145 | decoder = BinaryClsDecoder( 146 | is_training=is_training, 147 | input_tensor=encoder_output, 148 | label_ids=placeholders["label_ids"], 149 | label_size=self.label_size, 150 | sample_weight=placeholders.get("sample_weight"), 151 | label_weight=self.label_weight, 152 | scope="cls/seq_relationship", 153 | **kwargs, 154 | ) 155 | train_loss, tensors = decoder.get_forward_outputs() 156 | return train_loss, tensors 157 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTClassifier(ClassifierModule): 11 | """ Single-label classifier on BERT. 
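Illustrative usage sketch; it assumes the class is re-exported by `uf/__init__.py` and that the base `ClassifierModule` supplies sklearn-style `fit`/`predict`; paths and data are placeholders:

    import uf

    model = uf.BERTClassifier("./ref/bert_config.json", "./ref/vocab.txt", label_size=2)
    X = ["The movie was great.", "Terrible service."]   # one raw text (or list of text segments) per sample
    y = [1, 0]                                          # one integer label id per sample
    model.fit(X, y)
    outputs = model.predict(X)                          # assumed to return what the module's _get_predict_outputs builds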
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | drop_pooler=False, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._drop_pooler = drop_pooler 33 | 34 | self.bert_config = BERTConfig.from_json_file(config_file) 35 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 36 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 37 | 38 | if "[CLS]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[CLS]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | self.bert_config.vocab_size += 1 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 48 | self._assert_legal(X, y, sample_weight, X_tokenized) 49 | 50 | if is_training: 51 | assert y is not None, "`y` can't be None." 52 | if is_parallel: 53 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 54 | 55 | n_inputs = None 56 | data = {} 57 | 58 | # convert X 59 | if X is not None or X_tokenized is not None: 60 | tokenized = False if X is not None else X_tokenized 61 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 62 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 63 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 64 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 65 | n_inputs = len(input_ids) 66 | 67 | if n_inputs < self.batch_size: 68 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 69 | 70 | # convert y 71 | if y is not None: 72 | label_ids = self._convert_y(y) 73 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 74 | 75 | # convert sample_weight 76 | if is_training or y is not None: 77 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 78 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 79 | 80 | return data 81 | 82 | def _convert_X(self, X_target, tokenized): 83 | 84 | # tokenize input texts 85 | segment_input_tokens = [] 86 | for idx, sample in enumerate(X_target): 87 | try: 88 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 89 | except Exception as e: 90 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 91 | 92 | input_ids = [] 93 | input_mask = [] 94 | segment_ids = [] 95 | for idx, segments in enumerate(segment_input_tokens): 96 | _input_tokens = ["[CLS]"] 97 | _input_ids = [] 98 | _input_mask = [1] 99 | _segment_ids = [0] 100 | 101 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 102 | for s_id, segment in enumerate(segments): 103 | _segment_id = min(s_id, 1) 104 | _input_tokens.extend(segment + ["[SEP]"]) 105 | _input_mask.extend([1] * (len(segment) + 1)) 106 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 107 | 108 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 109 | 110 | # padding 111 | for _ in range(self.max_seq_length - len(_input_ids)): 112 | _input_ids.append(0) 113 | _input_mask.append(0) 114 | _segment_ids.append(0) 115 | 116 | input_ids.append(_input_ids) 117 | input_mask.append(_input_mask) 118 | segment_ids.append(_segment_ids) 119 | 120 | return input_ids, input_mask, segment_ids 121 | 122 | def _set_placeholders(self, **kwargs): 123 | self.placeholders = { 124 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 125 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 126 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 127 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 128 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 129 | } 130 | 131 | def _forward(self, is_training, placeholders, **kwargs): 132 | 133 | encoder = BERTEncoder( 134 | bert_config=self.bert_config, 135 | is_training=is_training, 136 | input_ids=placeholders["input_ids"], 137 | input_mask=placeholders["input_mask"], 138 | segment_ids=placeholders["segment_ids"], 139 | drop_pooler=self._drop_pooler, 140 | **kwargs, 141 | ) 142 | encoder_output = encoder.get_pooled_output() 143 | decoder = ClsDecoder( 144 | is_training=is_training, 145 | input_tensor=encoder_output, 146 | label_ids=placeholders["label_ids"], 147 | label_size=self.label_size, 148 | sample_weight=placeholders.get("sample_weight"), 149 | scope="cls/seq_relationship", 150 | **kwargs, 151 | ) 152 | train_loss, tensors = decoder.get_forward_outputs() 153 | return train_loss, tensors 154 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_crf_ner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder 4 | from .bert_ner import BERTNER 5 | from .._base_._base_ner import NERModule 6 | from ..crf.crf import CRFDecoder, viterbi_decode 7 | from ... import com 8 | 9 | 10 | class BERTCRFNER(BERTNER, NERModule): 11 | """ Named entity recognization on BERT with CRF. 
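Illustrative usage sketch; the constructor is inherited from BERTNER and is assumed to accept `config_file`/`vocab_file` like the other modules, and `predict` is assumed to return the dict assembled in `_get_predict_outputs` below; arguments are hypothetical:

    import uf

    model = uf.BERTCRFNER("./ref/bert_config.json", "./ref/vocab.txt",
                          init_checkpoint="/path/to/finetuned/ckpt")   # hypothetical arguments
    outputs = model.predict(["李雷和韩梅梅在北京工作"])
    print(outputs["preds"])    # entity strings recovered from each input text
    print(outputs["logits"])   # per-token scores that are Viterbi-decoded with the CRF transition matrix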
""" 12 | 13 | def _forward(self, is_training, placeholders, **kwargs): 14 | 15 | encoder = BERTEncoder( 16 | bert_config=self.bert_config, 17 | is_training=is_training, 18 | input_ids=placeholders["input_ids"], 19 | input_mask=placeholders["input_mask"], 20 | segment_ids=placeholders["segment_ids"], 21 | **kwargs, 22 | ) 23 | encoder_output = encoder.get_sequence_output() 24 | decoder = CRFDecoder( 25 | is_training=is_training, 26 | input_tensor=encoder_output, 27 | input_mask=placeholders["input_mask"], 28 | label_ids=placeholders["label_ids"], 29 | label_size=5, 30 | sample_weight=placeholders.get("sample_weight"), 31 | scope="cls/sequence", 32 | **kwargs, 33 | ) 34 | train_loss, tensors = decoder.get_forward_outputs() 35 | return train_loss, tensors 36 | 37 | def _get_fit_ops(self, from_tfrecords=False): 38 | ops = [self.tensors["logits"], self.tensors["transition_matrix"], self.tensors["losses"]] 39 | if from_tfrecords: 40 | ops.extend([self.placeholders["input_mask"], self.placeholders["label_ids"]]) 41 | return ops 42 | 43 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 44 | 45 | if from_tfrecords: 46 | batch_mask = output_arrays[-2] 47 | batch_labels = output_arrays[-1] 48 | else: 49 | batch_mask = feed_dict[self.placeholders["input_mask"]] 50 | batch_labels = feed_dict[self.placeholders["label_ids"]] 51 | 52 | # f1 53 | batch_logits = output_arrays[0] 54 | batch_transition_matrix = output_arrays[1] 55 | batch_input_length = np.sum(batch_mask, axis=-1) 56 | batch_preds = [] 57 | for logit, seq_len in zip(batch_logits, batch_input_length): 58 | viterbi_seq, _ = viterbi_decode(logit[:seq_len], batch_transition_matrix) 59 | batch_preds.append(viterbi_seq) 60 | f1_token, f1_entity = self._get_f1(batch_preds, batch_labels, batch_mask) 61 | 62 | # loss 63 | batch_losses = output_arrays[2] 64 | loss = np.mean(batch_losses) 65 | 66 | info = "" 67 | info += ", f1/token %.4f" % f1_token 68 | info += ", f1/entity %.4f" % f1_entity 69 | info += ", loss %.6f" % loss 70 | 71 | return info 72 | 73 | def _get_predict_ops(self): 74 | return [self.tensors["logits"], self.tensors["transition_matrix"]] 75 | 76 | def _get_predict_outputs(self, output_arrays, n_inputs): 77 | 78 | # preds 79 | logits = com.transform(output_arrays[0], n_inputs) 80 | transition_matrix = output_arrays[1][0] 81 | tokens = self.data[com.BACKUP_DATA + "input_tokens"] 82 | mask = self.data["input_mask"] 83 | text = self.data[com.BACKUP_DATA + "X_target"] 84 | tokenized = self.data[com.BACKUP_DATA + "tokenized"][0] 85 | preds = [] 86 | for i in range(len(logits)): 87 | _logits = logits[i] 88 | _tokens = tokens[i] 89 | _mask = mask[i] 90 | _text = text[i] 91 | 92 | _input_length = int(np.sum(_mask)) 93 | _viterbi_seq, _ = viterbi_decode(_logits[:_input_length], transition_matrix) 94 | _entities = self._get_entities(_viterbi_seq) 95 | _preds = [] 96 | if not _entities: 97 | preds.append(_preds) 98 | continue 99 | 100 | if not tokenized: 101 | if isinstance(_text, list): 102 | _text = " ".join(_text) 103 | _mapping_start, _mapping_end = com.align_tokens_with_text(_tokens, _text, self._do_lower_case) 104 | 105 | for _entity in _entities: 106 | _start, _end = _entity[0], _entity[1] 107 | if tokenized: 108 | _entity_tokens = _tokens[_start: _end + 1] 109 | _preds.append(_entity_tokens) 110 | else: 111 | try: 112 | _text_start = _mapping_start[_start] 113 | _text_end = _mapping_end[_end] 114 | except Exception: 115 | continue 116 | _entity_text = _text[_text_start: _text_end] 117 | _preds.append(_entity_text) 
118 | preds.append(_preds) 119 | 120 | # probs 121 | probs = logits 122 | 123 | outputs = {} 124 | outputs["preds"] = preds 125 | outputs["logits"] = probs 126 | 127 | return outputs 128 | 129 | def _get_score_ops(self): 130 | return [self.tensors["logits"], self.tensors["transition_matrix"], self.tensors["losses"]] 131 | 132 | def _get_score_outputs(self, output_arrays, n_inputs): 133 | 134 | # f1 135 | logits = com.transform(output_arrays[0], n_inputs) 136 | transition_matrix = output_arrays[1][0] 137 | mask = self.data["input_mask"] 138 | labels = self.data["label_ids"] 139 | input_length = np.sum(mask, axis=-1) 140 | preds = [] 141 | for logit, seq_len in zip(logits, input_length): 142 | viterbi_seq, _ = viterbi_decode(logit[:seq_len], transition_matrix) 143 | preds.append(viterbi_seq) 144 | f1_token, f1_entity = self._get_f1(preds, labels, mask) 145 | 146 | # loss 147 | losses = com.transform(output_arrays[2], n_inputs) 148 | loss = np.mean(losses) 149 | 150 | outputs = {} 151 | outputs["f1/token"] = f1_token 152 | outputs["f1/entity"] = f1_entity 153 | outputs["loss"] = loss 154 | 155 | return outputs 156 | -------------------------------------------------------------------------------- /uf/apps/bert/bert_seq_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bert import BERTEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BERTSeqClassifier(SeqClassifierModule): 11 | """ Sequence labeling classifier on BERT. """ 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 46 | self._assert_legal(X, y, sample_weight, X_tokenized) 47 | 48 | if is_training: 49 | assert y is not None, "`y` can't be None." 50 | if is_parallel: 51 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 
52 | 53 | n_inputs = None 54 | data = {} 55 | 56 | # convert X 57 | if X is not None or X_tokenized is not None: 58 | tokenized = False if X is not None else X_tokenized 59 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 60 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 61 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 62 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | if y is not None: 69 | # convert y and sample_weight 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | input_ids = [] 82 | input_mask = [] 83 | segment_ids = [] 84 | 85 | # tokenize input texts 86 | for idx, sample in enumerate(X_target): 87 | _input_tokens = self._convert_x(sample, tokenized) 88 | 89 | com.truncate_segments([_input_tokens], self.max_seq_length, truncate_method=self.truncate_method) 90 | 91 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 92 | _input_mask = [1 for _ in range(len(_input_tokens))] 93 | _segment_ids = [0 for _ in range(len(_input_tokens))] 94 | 95 | # padding 96 | for _ in range(self.max_seq_length - len(_input_ids)): 97 | _input_ids.append(0) 98 | _input_mask.append(0) 99 | _segment_ids.append(0) 100 | 101 | input_ids.append(_input_ids) 102 | input_mask.append(_input_mask) 103 | segment_ids.append(_segment_ids) 104 | 105 | return input_ids, input_mask, segment_ids 106 | 107 | def _set_placeholders(self, **kwargs): 108 | self.placeholders = { 109 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 110 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 111 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 112 | "label_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "label_ids"), 113 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 114 | } 115 | 116 | def _forward(self, is_training, placeholders, **kwargs): 117 | 118 | encoder = BERTEncoder( 119 | bert_config=self.bert_config, 120 | is_training=is_training, 121 | input_ids=placeholders["input_ids"], 122 | input_mask=placeholders["input_mask"], 123 | segment_ids=placeholders["segment_ids"], 124 | **kwargs, 125 | ) 126 | encoder_output = encoder.get_sequence_output() 127 | decoder = SeqClsDecoder( 128 | is_training=is_training, 129 | input_tensor=encoder_output, 130 | input_mask=placeholders["input_mask"], 131 | label_ids=placeholders["label_ids"], 132 | label_size=self.label_size, 133 | sample_weight=placeholders.get("sample_weight"), 134 | scope="cls/sequence", 135 | **kwargs, 136 | ) 137 | train_loss, tensors = decoder.get_forward_outputs() 138 | return train_loss, tensors 139 | -------------------------------------------------------------------------------- /uf/apps/chatbot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/chatbot/__init__.py 
-------------------------------------------------------------------------------- /uf/apps/chatbot/chatbot_mt.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .chatbot import Chatbot 4 | from ..transformer.transformer_mt import TransformerMT 5 | from .._base_._base_mt import MTModule 6 | 7 | 8 | class ChatbotMT(TransformerMT, MTModule): 9 | """ Chatbot. """ 10 | 11 | def _forward(self, is_training, placeholders, **kwargs): 12 | 13 | model = Chatbot( 14 | vocab_size=len(self.tokenizer.vocab), 15 | is_training=is_training, 16 | source_ids=placeholders["source_ids"], 17 | target_ids=placeholders["target_ids"], 18 | sos_id=self.tokenizer.convert_tokens_to_ids([""])[0], 19 | sample_weight=placeholders.get("sample_weight"), 20 | hidden_size=self._hidden_size, 21 | num_blocks=self._num_hidden_layers, 22 | num_attention_heads=self._num_attention_heads, 23 | **kwargs, 24 | ) 25 | self.transition_matrix = model.transition_matrix 26 | train_loss, tensors = model.get_forward_outputs() 27 | return train_loss, tensors 28 | -------------------------------------------------------------------------------- /uf/apps/crf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/crf/__init__.py -------------------------------------------------------------------------------- /uf/apps/dilated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/dilated/__init__.py -------------------------------------------------------------------------------- /uf/apps/electra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/electra/__init__.py -------------------------------------------------------------------------------- /uf/apps/electra/electra_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 2 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRABinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 9 | """ Multi-label classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | label_weight=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.label_weight = label_weight 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def _forward(self, is_training, placeholders, **kwargs): 46 | 47 | encoder = BERTEncoder( 48 | bert_config=self.bert_config, 49 | is_training=is_training, 50 | input_ids=placeholders["input_ids"], 51 | input_mask=placeholders["input_mask"], 52 | segment_ids=placeholders["segment_ids"], 53 | scope="electra", 54 | drop_pooler=True, 55 | **kwargs, 56 | ) 57 | encoder_output = encoder.get_pooled_output() 58 | decoder = BinaryClsDecoder( 59 | is_training=is_training, 60 | input_tensor=encoder_output, 61 | label_ids=placeholders["label_ids"], 62 | label_size=self.label_size, 63 | sample_weight=placeholders.get("sample_weight"), 64 | label_weight=self.label_weight, 65 | scope="cls/seq_relationship", 66 | **kwargs, 67 | ) 68 | train_loss, tensors = decoder.get_forward_outputs() 69 | return train_loss, tensors 70 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 2 | from ..bert.bert_classifier import BERTClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRAClassifier(BERTClassifier, ClassifierModule): 9 | """ Single-label classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.bert_config = BERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.bert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = BERTEncoder( 46 | bert_config=self.bert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | scope="electra", 52 | drop_pooler=True, 53 | **kwargs, 54 | ) 55 | encoder_output = encoder.get_pooled_output() 56 | decoder = ClsDecoder( 57 | is_training=is_training, 58 | input_tensor=encoder_output, 59 | label_ids=placeholders["label_ids"], 60 | label_size=self.label_size, 61 | sample_weight=placeholders.get("sample_weight"), 62 | scope="cls/seq_relationship", 63 | **kwargs, 64 | ) 65 | train_loss, tensors = decoder.get_forward_outputs() 66 | return train_loss, tensors 67 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_mrc.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_mrc import MRCDecoder, MRCModule 2 | from ..bert.bert_mrc import BERTMRC 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRAMRC(BERTMRC, MRCModule): 9 | """ Machine reading comprehension on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=256, 16 | init_checkpoint=None, 17 | output_dir=None, 18 | gpu_ids=None, 19 | do_lower_case=True, 20 | truncate_method="longer-FO", 21 | ): 22 | self.__init_args__ = locals() 23 | super(MRCModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 24 | 25 | self.max_seq_length = max_seq_length 26 | self.truncate_method = truncate_method 27 | self._do_lower_case = do_lower_case 28 | 29 | self.bert_config = BERTConfig.from_json_file(config_file) 30 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 31 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 32 | 33 | if "[CLS]" not in self.tokenizer.vocab: 34 | self.tokenizer.add("[CLS]") 35 | self.bert_config.vocab_size += 1 36 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 37 | if "[SEP]" not in self.tokenizer.vocab: 38 | self.tokenizer.add("[SEP]") 39 | self.bert_config.vocab_size += 1 40 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 41 | 42 | def _forward(self, is_training, placeholders, **kwargs): 43 | 44 | encoder = BERTEncoder( 45 | bert_config=self.bert_config, 46 | is_training=is_training, 47 | input_ids=placeholders["input_ids"], 48 | input_mask=placeholders["input_mask"], 49 | segment_ids=placeholders["segment_ids"], 50 | scope="electra", 51 | drop_pooler=True, 52 | **kwargs, 53 | ) 54 | encoder_output = encoder.get_sequence_output() 55 | decoder = MRCDecoder( 56 | is_training=is_training, 57 | input_tensor=encoder_output, 58 | label_ids=placeholders["label_ids"], 59 | sample_weight=placeholders.get("sample_weight"), 60 | scope="mrc", 61 | **kwargs, 62 | ) 63 | train_loss, tensors = decoder.get_forward_outputs() 64 | return train_loss, tensors 65 | -------------------------------------------------------------------------------- /uf/apps/electra/electra_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 2 | from ..bert.bert_seq_classifier import BERTSeqClassifier 3 | from ..bert.bert import BERTEncoder, BERTConfig, get_decay_power 4 | from ...token import WordPieceTokenizer 5 | from ...third import tf 6 | 7 | 8 | class ELECTRASeqClassifier(BERTSeqClassifier, SeqClassifierModule): 9 | """ Sequence labeling classifier on ELECTRA. 
""" 10 | 11 | def __init__( 12 | self, 13 | config_file, 14 | vocab_file, 15 | max_seq_length=128, 16 | label_size=None, 17 | init_checkpoint=None, 18 | output_dir=None, 19 | gpu_ids=None, 20 | do_lower_case=True, 21 | truncate_method="LIFO", 22 | ): 23 | self.__init_args__ = locals() 24 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 25 | 26 | self.max_seq_length = max_seq_length 27 | self.label_size = label_size 28 | self.truncate_method = truncate_method 29 | 30 | self.bert_config = BERTConfig.from_json_file(config_file) 31 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 32 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 33 | 34 | if "[CLS]" not in self.tokenizer.vocab: 35 | self.tokenizer.add("[CLS]") 36 | self.bert_config.vocab_size += 1 37 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 38 | if "[SEP]" not in self.tokenizer.vocab: 39 | self.tokenizer.add("[SEP]") 40 | self.bert_config.vocab_size += 1 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def _forward(self, is_training, placeholders, **kwargs): 44 | 45 | encoder = BERTEncoder( 46 | bert_config=self.bert_config, 47 | is_training=is_training, 48 | input_ids=placeholders["input_ids"], 49 | input_mask=placeholders["input_mask"], 50 | segment_ids=placeholders["segment_ids"], 51 | scope="electra", 52 | drop_pooler=True, 53 | **kwargs, 54 | ) 55 | encoder_output = encoder.get_sequence_output() 56 | decoder = SeqClsDecoder( 57 | is_training=is_training, 58 | input_tensor=encoder_output, 59 | input_mask=placeholders["input_mask"], 60 | label_ids=placeholders["label_ids"], 61 | label_size=self.label_size, 62 | sample_weight=placeholders.get("sample_weight"), 63 | scope="cls/sequence", 64 | **kwargs, 65 | ) 66 | train_loss, tensors = decoder.get_forward_outputs() 67 | return train_loss, tensors 68 | -------------------------------------------------------------------------------- /uf/apps/fastbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/fastbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/gpt2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/gpt2/__init__.py -------------------------------------------------------------------------------- /uf/apps/motian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/motian/__init__.py -------------------------------------------------------------------------------- /uf/apps/motian/motian_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .motian import MotianEncoder, BERTConfig, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class MotianClassifier(ClassifierModule): 11 | """ Single-label classifier on Motian. 
""" 12 | 13 | def __init__( 14 | self, 15 | config_file, 16 | vocab_file, 17 | max_seq_length=128, 18 | label_size=None, 19 | init_checkpoint=None, 20 | output_dir=None, 21 | gpu_ids=None, 22 | do_lower_case=True, 23 | truncate_method="LIFO", 24 | ): 25 | self.__init_args__ = locals() 26 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 27 | 28 | self.max_seq_length = max_seq_length 29 | self.label_size = label_size 30 | self.truncate_method = truncate_method 31 | 32 | self.bert_config = BERTConfig.from_json_file(config_file) 33 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 34 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 35 | 36 | if "[CLS]" not in self.tokenizer.vocab: 37 | self.tokenizer.add("[CLS]") 38 | self.bert_config.vocab_size += 1 39 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 40 | if "[SEP]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[SEP]") 42 | self.bert_config.vocab_size += 1 43 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 44 | 45 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 46 | self._assert_legal(X, y, sample_weight, X_tokenized) 47 | 48 | if is_training: 49 | assert y is not None, "`y` can't be None." 50 | if is_parallel: 51 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 52 | 53 | n_inputs = None 54 | data = {} 55 | 56 | # convert X 57 | if X is not None or X_tokenized is not None: 58 | tokenized = False if X is not None else X_tokenized 59 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 60 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 61 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 62 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | input_mask = [] 92 | segment_ids = [] 93 | for idx, segments in enumerate(segment_input_tokens): 94 | _input_tokens = ["[CLS]"] 95 | _input_ids = [] 96 | _input_mask = [1] 97 | _segment_ids = [0] 98 | 99 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 100 | for s_id, segment in enumerate(segments): 101 | _segment_id = min(s_id, 1) 102 | _input_tokens.extend(segment + ["[SEP]"]) 103 | _input_mask.extend([1] * (len(segment) + 1)) 104 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 105 | 106 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 107 | 108 | # padding 109 | for _ in range(self.max_seq_length - len(_input_ids)): 110 | _input_ids.append(0) 111 | _input_mask.append(0) 112 | _segment_ids.append(0) 113 | 114 | input_ids.append(_input_ids) 115 | input_mask.append(_input_mask) 116 | segment_ids.append(_segment_ids) 117 | 118 | return input_ids, input_mask, segment_ids 119 | 120 | def _set_placeholders(self, **kwargs): 121 | self.placeholders = { 122 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 123 | "input_mask": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_mask"), 124 | "segment_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "segment_ids"), 125 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 126 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 127 | } 128 | 129 | def _forward(self, is_training, placeholders, **kwargs): 130 | 131 | encoder = MotianEncoder( 132 | config=self.bert_config, 133 | is_training=is_training, 134 | input_ids=placeholders["input_ids"], 135 | input_mask=placeholders["input_mask"], 136 | token_type_ids=placeholders["segment_ids"], 137 | **kwargs, 138 | ) 139 | encoder_output = encoder.get_pooled_output() 140 | decoder = ClsDecoder( 141 | is_training=is_training, 142 | input_tensor=encoder_output, 143 | label_ids=placeholders["label_ids"], 144 | label_size=self.label_size, 145 | sample_weight=placeholders.get("sample_weight"), 146 | scope="cls/seq_relationship", 147 | **kwargs, 148 | ) 149 | train_loss, tensors = decoder.get_forward_outputs() 150 | return train_loss, tensors 151 | -------------------------------------------------------------------------------- /uf/apps/nasnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/nasnet/__init__.py -------------------------------------------------------------------------------- /uf/apps/nasnet/pnasnet_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | 4 | from .pnasnet import build_pnasnet_mobile, build_pnasnet_large, get_decay_power 5 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 6 | from ...third import tf 7 | 8 | 9 | class PNasNetClassifier(ClassifierModule): 10 | """ Single-label classifier on PNasNet. 
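Illustrative usage sketch; it assumes re-export via `uf/__init__.py` and the shared `fit`/`predict` interface. Inputs are raw image arrays: convert() below casts them to uint8 and resizes every channel to the resolution implied by `model_size` (224 for "mobile", 331 for "large"):

    import numpy as np
    import uf

    model = uf.PNasNetClassifier(label_size=10, model_size="mobile")
    X = [np.random.randint(0, 256, size=(300, 400, 3), dtype=np.uint8)]   # one HxWxC (NHWC) image per sample
    y = [3]
    model.fit(X, y)
    preds = model.predict(X)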
""" 11 | 12 | _INFER_ATTRIBUTES = { # params whose value cannot be None in order to infer without training 13 | "label_size": "An integer that defines number of possible labels of outputs", 14 | "init_checkpoint": "A string that directs to the checkpoint file used for initialization", 15 | } 16 | 17 | def __init__( 18 | self, 19 | label_size=None, 20 | init_checkpoint=None, 21 | output_dir=None, 22 | gpu_ids=None, 23 | model_size="large", 24 | data_format="NHWC", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.label_size = label_size 30 | self.model_size = model_size 31 | self.data_format = data_format 32 | 33 | assert model_size in ("mobile", "large"), (f"Invalid `model_size`: {model_size}. Pick one from \"mobile\" and \"large\".") 34 | assert data_format in ("NHWC", "NCHW"), (f"Unsupported `data_format`: {data_format}. Piack one from \"NHWC\" and \"NCHW\"") 35 | self._image_size = 224 if model_size == "mobile" else 331 36 | self.decay_power = get_decay_power(model_size) 37 | 38 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 39 | self._assert_legal(X, y, sample_weight, X_tokenized) 40 | 41 | assert not X_tokenized, "%s does not support text input." % self.__class__.__name__ 42 | if is_training: 43 | assert y is not None, "`y` can't be None." 44 | if is_parallel: 45 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 46 | 47 | n_inputs = None 48 | data = {} 49 | 50 | # convert X 51 | if X is not None: 52 | input_ids = self._convert_X(X) 53 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 54 | n_inputs = len(input_ids) 55 | 56 | if n_inputs < self.batch_size: 57 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 58 | 59 | # convert y 60 | if y is not None: 61 | label_ids = self._convert_y(y) 62 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 63 | 64 | # convert sample_weight 65 | if is_training or y is not None: 66 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 67 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 68 | 69 | return data 70 | 71 | def _convert_X(self, X): 72 | 73 | # convert to numpy array 74 | image_arrays = [] 75 | for idx, sample in enumerate(X): 76 | try: 77 | image_arrays.append(self._convert_x(sample)) 78 | except Exception as e: 79 | raise ValueError("Wrong input format (image %d): %s." 
% (idx, e)) 80 | 81 | return np.array(image_arrays) 82 | 83 | def _convert_x(self, x): 84 | 85 | # format 86 | x = np.array(x).astype(np.uint8) 87 | 88 | # interpolate 89 | if self.data_format == "NHWC": 90 | x = np.array([ 91 | np.asarray(Image.fromarray(x[:, :, k]).resize((self._image_size, self._image_size))) 92 | for k in range(3) 93 | ]) 94 | elif self.data_format == "NCHW": 95 | x = np.array([ 96 | np.asarray(Image.fromarray(x[k, :, :]).resize((self._image_size, self._image_size))) 97 | for k in range(3) 98 | ]) 99 | 100 | # transpose 101 | x = np.transpose(x, [1, 2, 0]) 102 | 103 | return x 104 | 105 | def _set_placeholders(self, **kwargs): 106 | self.placeholders = { 107 | "input_ids": tf.placeholder(tf.float32, [None, self._image_size, self._image_size, 3], "input_ids"), 108 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 109 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 110 | } 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | if self.model_size == "mobile": 115 | logits, _ = build_pnasnet_mobile( 116 | images=placeholders["input_ids"], num_classes=self.label_size, 117 | is_training=is_training, final_endpoint=None, 118 | ) 119 | elif self.model_size == "large": 120 | logits, _ = build_pnasnet_large( 121 | images=placeholders["input_ids"], num_classes=self.label_size, 122 | is_training=is_training, final_endpoint=None, 123 | ) 124 | decoder = ClsDecoder( 125 | is_training, 126 | input_tensor=logits, 127 | label_ids=placeholders["label_ids"], 128 | is_logits=True, 129 | label_size=self.label_size, 130 | sample_weight=placeholders.get("sample_weight"), 131 | ) 132 | train_loss, tensors = decoder.get_forward_outputs() 133 | return train_loss, tensors 134 | -------------------------------------------------------------------------------- /uf/apps/performer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/performer/__init__.py -------------------------------------------------------------------------------- /uf/apps/performer/performer_classifier.py: -------------------------------------------------------------------------------- 1 | from .performer import PerformerEncoder 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ..bert.bert import BERTConfig, get_decay_power 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | 8 | 9 | class PerformerClassifier(BERTClassifier, ClassifierModule): 10 | """ Single-label classifier on Performer. 
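Illustrative construction sketch; the BERT-format config/vocab paths are placeholders, and `fit`/`predict` are assumed to follow BERTClassifier, whose data pipeline this class inherits:

    import uf

    model = uf.PerformerClassifier(
        config_file="./ref/bert_config.json",
        vocab_file="./ref/vocab.txt",
        label_size=2,
        kernel_transformation="relu",   # passed straight through to PerformerEncoder
        nb_random_features=128,         # illustrative; the __init__ default below is 1
    )
    model.fit(["good", "bad"], [1, 0])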
""" 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | kernel_transformation="relu", 22 | nb_random_features=1, 23 | drop_pooler=False, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.truncate_method = truncate_method 33 | self._drop_pooler = drop_pooler 34 | self._kernel_transformation = kernel_transformation 35 | self._nb_random_features = nb_random_features 36 | 37 | self.bert_config = BERTConfig.from_json_file(config_file) 38 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 39 | self.decay_power = get_decay_power(self.bert_config.num_hidden_layers) 40 | 41 | if "[CLS]" not in self.tokenizer.vocab: 42 | self.tokenizer.add("[CLS]") 43 | self.bert_config.vocab_size += 1 44 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 45 | if "[SEP]" not in self.tokenizer.vocab: 46 | self.tokenizer.add("[SEP]") 47 | self.bert_config.vocab_size += 1 48 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 49 | 50 | def _forward(self, is_training, placeholders, **kwargs): 51 | 52 | encoder = PerformerEncoder( 53 | bert_config=self.bert_config, 54 | is_training=is_training, 55 | input_ids=placeholders["input_ids"], 56 | input_mask=placeholders["input_mask"], 57 | segment_ids=placeholders["segment_ids"], 58 | kernel_transformation=self._kernel_transformation, 59 | nb_random_features=self._nb_random_features, 60 | drop_pooler=self._drop_pooler, 61 | **kwargs, 62 | ) 63 | encoder_output = encoder.get_pooled_output() 64 | decoder = ClsDecoder( 65 | is_training=is_training, 66 | input_tensor=encoder_output, 67 | label_ids=placeholders["label_ids"], 68 | label_size=self.label_size, 69 | sample_weight=placeholders.get("sample_weight"), 70 | scope="cls/seq_relationship", 71 | **kwargs, 72 | ) 73 | train_loss, tensors = decoder.get_forward_outputs() 74 | return train_loss, tensors 75 | -------------------------------------------------------------------------------- /uf/apps/recbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/recbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/retroreader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/retroreader/__init__.py -------------------------------------------------------------------------------- /uf/apps/rnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/rnn/__init__.py -------------------------------------------------------------------------------- /uf/apps/rnn/bi_rnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import rnn 2 | from tensorflow.python.ops import rnn_cell 3 | 4 | from .._base_._base_ import BaseEncoder 5 | from .. 
import util 6 | from ...third import tf 7 | 8 | 9 | class BiRNNEncoder(BaseEncoder): 10 | def __init__( 11 | self, 12 | is_training, 13 | input_ids, 14 | seq_length, 15 | vocab_size, 16 | rnn_core="lstm", 17 | hidden_size=128, 18 | scope="rnn", 19 | trainable=True, 20 | **kwargs, 21 | ): 22 | dropout_rate = 0.0 23 | if is_training: 24 | dropout_rate = 0.1 25 | half_hidden_size = hidden_size // 2 26 | self.rnn_core = rnn_core 27 | 28 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 29 | batch_size = input_shape[0] 30 | max_seq_length = input_shape[1] 31 | 32 | 33 | with tf.variable_scope(scope): 34 | 35 | # embedding 36 | embedding_output, _ = util.embedding_lookup( 37 | input_ids=input_ids, 38 | vocab_size=vocab_size, 39 | batch_size=batch_size, 40 | max_seq_length=max_seq_length, 41 | embeddings=kwargs.get("tilda_embeddings"), 42 | embedding_size=hidden_size, 43 | word_embedding_name="word_embeddings", 44 | trainable=trainable, 45 | ) 46 | 47 | # rnn core 48 | if rnn_core == "rnn": 49 | cell_fw = rnn_cell.BasicRNNCell(num_units=half_hidden_size, trainable=trainable) 50 | cell_bw = rnn_cell.BasicRNNCell(num_units=half_hidden_size, trainable=trainable) 51 | elif rnn_core == "lstm": 52 | cell_fw = rnn_cell.LSTMCell(num_units=half_hidden_size, trainable=trainable) 53 | cell_bw = rnn_cell.LSTMCell(num_units=half_hidden_size, trainable=trainable) 54 | elif rnn_core == "gru": 55 | cell_fw = rnn_cell.GRUCell(num_units=half_hidden_size, trainable=trainable) 56 | cell_bw = rnn_cell.GRUCell(num_units=half_hidden_size, trainable=trainable) 57 | dropout_cell_fw = rnn_cell.DropoutWrapper(cell_fw, state_keep_prob=1 - dropout_rate) 58 | dropout_cell_bw = rnn_cell.DropoutWrapper(cell_bw, state_keep_prob=1 - dropout_rate) 59 | 60 | # inputs: [batch_size, max_seq_length, hidden_size] 61 | # outputs: ([batch_size, max_seq_length, half_hidden_size], [batch_size, max_seq_length, half_hidden_size]) 62 | outputs, self.last_states = rnn.bidirectional_dynamic_rnn( 63 | cell_fw=dropout_cell_fw, 64 | cell_bw=dropout_cell_bw, 65 | inputs=embedding_output, 66 | sequence_length=seq_length, 67 | dtype=tf.float32, 68 | ) 69 | self.outputs = tf.concat(outputs, axis=2) 70 | 71 | def get_pooled_output(self): 72 | return self.outputs[:, 0, :] 73 | 74 | def get_sequence_output(self): 75 | return self.outputs 76 | 77 | 78 | def get_decay_power(): 79 | return { 80 | "word_embeddings": 2, 81 | "/bidirectional_rnn/": 1, 82 | "cls/": 0, 83 | } -------------------------------------------------------------------------------- /uf/apps/rnn/bi_rnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .bi_rnn import BiRNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class BiRNNClassifier(ClassifierModule): 11 | """ Single-label classifier on bidirectional RNN/LSTM/GRU. 
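Illustrative usage sketch; it assumes re-export via `uf/__init__.py` and the shared `fit`/`predict` interface. No config file is needed because the encoder hyper-parameters are passed directly:

    import uf

    model = uf.BiRNNClassifier("./ref/vocab.txt", label_size=2, rnn_core="gru", hidden_size=256)
    X = ["not bad at all", "would not recommend"]
    y = [1, 0]
    model.fit(X, y)
    print(model.predict(X))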
""" 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | rnn_core="lstm", 22 | hidden_size=256, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._rnn_core = rnn_core 33 | self._hidden_size = hidden_size 34 | 35 | assert rnn_core in ("rnn", "lstm", "gru"), (f"Invalid `rnn_core`: {rnn_core}. Pick one from \"rnn\", \"lstm\" and \"gru\".") 36 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 37 | self.decay_power = get_decay_power() 38 | 39 | if "[CLS]" not in self.tokenizer.vocab: 40 | self.tokenizer.add("[CLS]") 41 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 42 | if "[SEP]" not in self.tokenizer.vocab: 43 | self.tokenizer.add("[SEP]") 44 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 45 | 46 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 47 | self._assert_legal(X, y, sample_weight, X_tokenized) 48 | 49 | if is_training: 50 | assert y is not None, "`y` can't be None." 51 | if is_parallel: 52 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 53 | 54 | n_inputs = None 55 | data = {} 56 | 57 | # convert X 58 | if X is not None or X_tokenized is not None: 59 | tokenized = False if X is not None else X_tokenized 60 | input_ids, seq_length = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 61 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 62 | data["seq_length"] = np.array(seq_length, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | seq_length = [] 92 | for idx, segments in enumerate(segment_input_tokens): 93 | _input_tokens = ["[CLS]"] 94 | _input_ids = [] 95 | _seq_length = 0 96 | 97 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 98 | for segment in segments: 99 | _input_tokens.extend(segment + ["[SEP]"]) 100 | 101 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 102 | _seq_length = len(_input_ids) 103 | 104 | # padding 105 | _input_ids += [0] * (self.max_seq_length - len(_input_ids)) 106 | 107 | input_ids.append(_input_ids) 108 | seq_length.append(_seq_length) 109 | 110 | return input_ids, seq_length 111 | 112 | def _set_placeholders(self, **kwargs): 113 | self.placeholders = { 114 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 115 | "seq_length": tf.placeholder(tf.int32, [None], "seq_length"), 116 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 117 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 118 | } 119 | 120 | def _forward(self, is_training, placeholders, **kwargs): 121 | 122 | encoder = BiRNNEncoder( 123 | is_training=is_training, 124 | input_ids=placeholders["input_ids"], 125 | seq_length=placeholders["seq_length"], 126 | vocab_size=len(self.tokenizer.vocab), 127 | rnn_core=self._rnn_core, 128 | hidden_size=self._hidden_size, 129 | scope=self._rnn_core, 130 | trainable=True, 131 | **kwargs, 132 | ) 133 | encoder_output = encoder.get_pooled_output() 134 | decoder = ClsDecoder( 135 | is_training=is_training, 136 | input_tensor=encoder_output, 137 | label_ids=placeholders["label_ids"], 138 | label_size=self.label_size, 139 | sample_weight=placeholders.get("sample_weight"), 140 | scope="cls/seq_relationship", 141 | **kwargs, 142 | ) 143 | train_loss, tensors = decoder.get_forward_outputs() 144 | return train_loss, tensors 145 | -------------------------------------------------------------------------------- /uf/apps/rnn/rnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.python.ops import rnn 2 | from tensorflow.python.ops import rnn_cell 3 | 4 | from .._base_._base_ import BaseEncoder 5 | from .. 
import util 6 | from ...third import tf 7 | 8 | 9 | class RNNEncoder(BaseEncoder): 10 | def __init__( 11 | self, 12 | is_training, 13 | input_ids, 14 | seq_length, 15 | vocab_size, 16 | rnn_core="lstm", 17 | hidden_size=128, 18 | scope="rnn", 19 | trainable=True, 20 | **kwargs, 21 | ): 22 | dropout_rate = 0.0 23 | if is_training: 24 | dropout_rate = 0.1 25 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 26 | batch_size = input_shape[0] 27 | max_seq_length = input_shape[1] 28 | 29 | self.rnn_core = rnn_core 30 | 31 | with tf.variable_scope(scope): 32 | 33 | # embedding 34 | embedding_output, _ = util.embedding_lookup( 35 | input_ids=input_ids, 36 | vocab_size=vocab_size, 37 | batch_size=batch_size, 38 | max_seq_length=max_seq_length, 39 | embeddings=kwargs.get("tilda_embeddings"), 40 | embedding_size=hidden_size, 41 | word_embedding_name="word_embeddings", 42 | trainable=trainable, 43 | ) 44 | 45 | # rnn core 46 | if rnn_core == "rnn": 47 | cell = rnn_cell.BasicRNNCell(num_units=hidden_size, trainable=trainable) 48 | elif rnn_core == "lstm": 49 | cell = rnn_cell.LSTMCell(num_units=hidden_size, trainable=trainable) 50 | elif rnn_core == "gru": 51 | cell = rnn_cell.GRUCell(num_units=hidden_size, trainable=trainable) 52 | dropout_cell = rnn_cell.DropoutWrapper(cell, state_keep_prob=1 - dropout_rate) 53 | 54 | # inputs: [batch_size, max_seq_length, hidden_size] 55 | # outputs: [batch_size, max_seq_length, hidden_size] 56 | self.outputs, self.last_states = rnn.dynamic_rnn( 57 | cell=dropout_cell, 58 | inputs=embedding_output, 59 | sequence_length=seq_length, 60 | dtype=tf.float32, 61 | ) 62 | 63 | def get_pooled_output(self): 64 | if self.rnn_core == "lstm": 65 | return self.last_states[-1] # ([batch_size, hidden_size], [batch_size, hidden_size]) 66 | return self.last_states # [batch_size, hidden_size] 67 | 68 | def get_sequence_output(self): 69 | return self.outputs 70 | 71 | 72 | def get_decay_power(): 73 | return { 74 | "word_embeddings": 2, 75 | "/rnn/": 1, 76 | "cls/": 0, 77 | } -------------------------------------------------------------------------------- /uf/apps/rnn/rnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .rnn import RNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... import com 8 | 9 | 10 | class RNNClassifier(ClassifierModule): 11 | """ Single-label classifier on RNN/LSTM/GRU. """ 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | rnn_core="lstm", 22 | hidden_size=128, 23 | do_lower_case=True, 24 | truncate_method="LIFO", 25 | ): 26 | self.__init_args__ = locals() 27 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 28 | 29 | self.max_seq_length = max_seq_length 30 | self.label_size = label_size 31 | self.truncate_method = truncate_method 32 | self._rnn_core = rnn_core 33 | self._hidden_size = hidden_size 34 | 35 | assert rnn_core in ("rnn", "lstm", "gru"), (f"Invalid `rnn_core`: {rnn_core}. 
Pick one from \"rnn\", \"lstm\" and \"gru\".") 36 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 37 | self.decay_power = get_decay_power() 38 | 39 | if "[SEP]" not in self.tokenizer.vocab: 40 | self.tokenizer.add("[SEP]") 41 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 42 | 43 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 44 | self._assert_legal(X, y, sample_weight, X_tokenized) 45 | 46 | if is_training: 47 | assert y is not None, "`y` can't be None." 48 | if is_parallel: 49 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 50 | 51 | n_inputs = None 52 | data = {} 53 | 54 | # convert X 55 | if X is not None or X_tokenized is not None: 56 | tokenized = False if X is not None else X_tokenized 57 | input_ids, seq_length = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 58 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 59 | data["seq_length"] = np.array(seq_length, dtype=np.int32) 60 | n_inputs = len(input_ids) 61 | 62 | if n_inputs < self.batch_size: 63 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 64 | 65 | # convert y 66 | if y is not None: 67 | label_ids = self._convert_y(y) 68 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 69 | 70 | # convert sample_weight 71 | if is_training or y is not None: 72 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 73 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 74 | 75 | return data 76 | 77 | def _convert_X(self, X_target, tokenized): 78 | 79 | # tokenize input texts 80 | segment_input_tokens = [] 81 | for idx, sample in enumerate(X_target): 82 | try: 83 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 84 | except Exception as e: 85 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 86 | 87 | input_ids = [] 88 | seq_length = [] 89 | for idx, segments in enumerate(segment_input_tokens): 90 | _input_tokens = [] 91 | _input_ids = [] 92 | _seq_length = 0 93 | 94 | com.truncate_segments(segments, self.max_seq_length - len(segments), truncate_method=self.truncate_method) 95 | for segment in segments: 96 | _input_tokens.extend(segment + ["[SEP]"]) 97 | 98 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 99 | _seq_length = len(_input_ids) 100 | 101 | # padding 102 | _input_ids += [0] * (self.max_seq_length - len(_input_ids)) 103 | 104 | input_ids.append(_input_ids) 105 | seq_length.append(_seq_length) 106 | 107 | return input_ids, seq_length 108 | 109 | def _set_placeholders(self, **kwargs): 110 | self.placeholders = { 111 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 112 | "seq_length": tf.placeholder(tf.int32, [None], "seq_length"), 113 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 114 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 115 | } 116 | 117 | def _forward(self, is_training, placeholders, **kwargs): 118 | 119 | encoder = RNNEncoder( 120 | is_training=is_training, 121 | input_ids=placeholders["input_ids"], 122 | seq_length=placeholders["seq_length"], 123 | vocab_size=len(self.tokenizer.vocab), 124 | rnn_core=self._rnn_core, 125 | hidden_size=self._hidden_size, 126 | scope=self._rnn_core, 127 | trainable=True, 128 | **kwargs, 129 | ) 130 | encoder_output = encoder.get_pooled_output() 131 | decoder = ClsDecoder( 132 | is_training=is_training, 133 | input_tensor=encoder_output, 134 | label_ids=placeholders["label_ids"], 135 | label_size=self.label_size, 136 | sample_weight=placeholders.get("sample_weight"), 137 | scope="cls/seq_relationship", 138 | **kwargs, 139 | ) 140 | train_loss, tensors = decoder.get_forward_outputs() 141 | return train_loss, tensors 142 | -------------------------------------------------------------------------------- /uf/apps/roberta/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/roberta/__init__.py -------------------------------------------------------------------------------- /uf/apps/roberta/roberta.py: -------------------------------------------------------------------------------- 1 | """ RoBERTa. """ 2 | 3 | 4 | def create_instances_from_document(all_documents, document_index, max_seq_length): 5 | document = all_documents[document_index] 6 | instances = [] 7 | 8 | current_chunk = [] 9 | current_length = 0 10 | i = 0 11 | while i < len(document): 12 | segment = document[i] 13 | current_chunk.extend(segment) 14 | current_length += len(segment) 15 | i += 1 16 | if current_length >= max_seq_length: 17 | instances.append([current_chunk]) 18 | current_chunk = [] 19 | current_length = 0 20 | if current_chunk: 21 | instances.append([current_chunk]) 22 | 23 | return instances 24 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_binary_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_binary_classifier import BinaryClassifierModule 2 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 3 | 4 | 5 | class RoBERTaBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 6 | """ Multi-label classifier on RoBERTa. 
""" 7 | pass -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_classifier import ClassifierModule 2 | from ..bert.bert_classifier import BERTClassifier 3 | 4 | 5 | class RoBERTaClassifier(BERTClassifier, ClassifierModule): 6 | """ Single-label classifier on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_mrc.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_mrc import MRCModule 2 | from ..bert.bert_mrc import BERTMRC 3 | 4 | 5 | class RoBERTaMRC(BERTMRC, MRCModule): 6 | """ Machine reading comprehension on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/roberta/roberta_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .._base_._base_seq_classifier import SeqClassifierModule 2 | from ..bert.bert_seq_classifier import BERTSeqClassifier 3 | 4 | 5 | class RoBERTaSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 6 | """ Sequence labeling classifier on RoBERTa. """ 7 | pass 8 | -------------------------------------------------------------------------------- /uf/apps/sanet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sanet/__init__.py -------------------------------------------------------------------------------- /uf/apps/sembert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sembert/__init__.py -------------------------------------------------------------------------------- /uf/apps/sembert/sembert.py: -------------------------------------------------------------------------------- 1 | """ SemBERT decoder. """ 2 | 3 | from time import perf_counter 4 | from ...third import tf 5 | from .._base_._base_ import BaseDecoder 6 | from ..bert.bert import BERTEncoder 7 | from .. 
import util 8 | 9 | 10 | class SemBERTDecoder(BaseDecoder): 11 | def __init__(self, 12 | bert_config, 13 | is_training, 14 | input_tensor, 15 | input_mask, 16 | sem_features, 17 | label_ids, 18 | max_seq_length, 19 | feature_size, 20 | label_size=2, 21 | sample_weight=None, 22 | scope="cls/seq_relationship", 23 | hidden_dropout_prob=0.1, 24 | initializer_range=0.02, 25 | trainable=True, 26 | **kwargs): 27 | super().__init__(**kwargs) 28 | 29 | if kwargs.get("return_hidden"): 30 | self.tensors["hidden"] = input_tensor 31 | 32 | input_shape = util.get_shape_list(input_tensor) 33 | batch_size = input_shape[0] 34 | hidden_size = input_shape[-1] 35 | with tf.variable_scope("sem"): 36 | feature_embeddings = tf.get_variable( 37 | name="feature_embeddings", 38 | shape=[feature_size + 3, hidden_size], # for [PAD], [CLS], [SEP] 39 | initializer=util.create_initializer(initializer_range), 40 | trainable=trainable) 41 | sem_output = tf.gather( 42 | feature_embeddings, sem_features) # [B, N, H] 43 | 44 | attention_heads = [] 45 | with tf.variable_scope("self"): 46 | attention_mask = BERTEncoder.create_attention_mask_from_input_mask( 47 | input_mask, batch_size, max_seq_length) 48 | (attention_head, _) = BERTEncoder.attention_layer( 49 | from_tensor=sem_output, 50 | to_tensor=sem_output, 51 | attention_mask=attention_mask, 52 | num_attention_heads=bert_config.num_attention_heads, 53 | size_per_head=(hidden_size // bert_config.num_attention_heads), 54 | attention_probs_dropout_prob=hidden_dropout_prob if is_training else 0.0, 55 | initializer_range=initializer_range, 56 | do_return_2d_tensor=False, 57 | batch_size=batch_size, 58 | from_max_seq_length=max_seq_length, 59 | to_max_seq_length=max_seq_length, 60 | trainable=trainable) 61 | attention_heads.append(attention_head) 62 | 63 | if len(attention_heads) == 1: 64 | attention_output = attention_heads[0] 65 | else: 66 | attention_output = tf.concat(attention_heads, axis=-1) 67 | 68 | attention_output = attention_output[:, 0, :] # [B, H] 69 | input_tensor = util.layer_norm( 70 | attention_output + input_tensor, 71 | trainable=trainable) 72 | 73 | with tf.variable_scope(scope): 74 | output_weights = tf.get_variable( 75 | "output_weights", 76 | shape=[label_size, hidden_size], 77 | initializer=util.create_initializer(initializer_range), 78 | trainable=trainable) 79 | output_bias = tf.get_variable( 80 | "output_bias", 81 | shape=[label_size], 82 | initializer=tf.zeros_initializer(), 83 | trainable=trainable) 84 | 85 | output_layer = util.dropout( 86 | input_tensor, hidden_dropout_prob if is_training else 0.0) 87 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 88 | logits = tf.nn.bias_add(logits, output_bias) 89 | 90 | self.tensors["preds"] = tf.argmax(logits, axis=-1) 91 | self.tensors["probs"] = tf.nn.softmax(logits, axis=-1, name="probs") 92 | 93 | per_example_loss = util.cross_entropy(logits, label_ids, label_size, **kwargs) 94 | if sample_weight is not None: 95 | per_example_loss *= sample_weight 96 | 97 | self.tensors["losses"] = per_example_loss 98 | self.train_loss = tf.reduce_mean(per_example_loss) 99 | 100 | 101 | def get_decay_power(num_hidden_layers): 102 | decay_power = { 103 | "/embeddings": num_hidden_layers + 2, 104 | "sem/": 2, 105 | "/pooler/": 1, 106 | "cls/": 0, 107 | } 108 | for layer_idx in range(num_hidden_layers): 109 | decay_power["/layer_%d/" % layer_idx] = num_hidden_layers - layer_idx + 1 110 | return decay_power 111 | -------------------------------------------------------------------------------- 
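Note on the decay-power maps: the `get_decay_power()` helpers above (and their counterparts in the other app modules, e.g. rnn.py and textcnn.py earlier in this listing) only build a mapping from variable-name patterns to integer powers; the map is presumably consumed by the optimizer setup elsewhere in the package (e.g. uf/opt.py or uf/core.py, neither shown in this excerpt). As a rough illustration only, assuming a multiplicative layer-wise learning-rate decay of the form lr * decay_rate ** power (an assumption for clarity, not the verbatim library logic), the lookup could work like the sketch below; the helper name `layerwise_lr` and the decay_rate value are hypothetical.

def layerwise_lr(base_lr, decay_rate, decay_power, var_name):
    """Pick a per-variable learning rate from a decay-power map (illustrative sketch only)."""
    for pattern, power in decay_power.items():
        if pattern in var_name:
            # deeper layers get a larger power, hence a smaller effective learning rate
            return base_lr * (decay_rate ** power)
    return base_lr  # unmatched variables keep the base learning rate

# With the SemBERT map above and a hypothetical decay_rate of 0.85:
#   variables under "cls/"  -> power 0                     -> full learning rate
#   variables under "sem/"  -> power 2                     -> base_lr * 0.85 ** 2
#   embedding variables     -> power num_hidden_layers + 2 -> smallest learning rate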
/uf/apps/spe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/spe/__init__.py -------------------------------------------------------------------------------- /uf/apps/sqp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/sqp/__init__.py -------------------------------------------------------------------------------- /uf/apps/stockbert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/stockbert/__init__.py -------------------------------------------------------------------------------- /uf/apps/stockbert/stockbert.py: -------------------------------------------------------------------------------- 1 | """ SemBERT decoder. """ 2 | 3 | import copy 4 | 5 | from ...third import tf 6 | from .._base_._base_ import BaseEncoder 7 | from ..bert.bert import BERTEncoder 8 | from .. import util 9 | 10 | 11 | class StockBERTEncoder(BERTEncoder, BaseEncoder): 12 | def __init__(self, 13 | bert_config, 14 | is_training, 15 | input_values, 16 | input_mask, 17 | scope="stock_bert", 18 | drop_pooler=False, 19 | trainable=True, 20 | **kwargs): 21 | 22 | bert_config = copy.deepcopy(bert_config) 23 | if not is_training: 24 | bert_config.hidden_dropout_prob = 0.0 25 | bert_config.attention_probs_dropout_prob = 0.0 26 | 27 | input_shape = util.get_shape_list(input_values, expected_rank=3) 28 | batch_size = input_shape[0] 29 | max_seq_length = input_shape[1] + 1 30 | 31 | with tf.variable_scope(scope): 32 | with tf.variable_scope("embeddings"): 33 | 34 | self.embedding_output = self.embedding_preprocessor( 35 | input_values=input_values, 36 | batch_size=batch_size, 37 | embedding_size=bert_config.hidden_size, 38 | initializer_range=bert_config.initializer_range, 39 | name="cls_embedding", 40 | trainable=trainable) 41 | 42 | # Add positional embeddings and token type embeddings 43 | # layer normalize and perform dropout. 
44 | self.embedding_output = self.embedding_postprocessor( 45 | input_tensor=self.embedding_output, 46 | batch_size=batch_size, 47 | max_seq_length=max_seq_length, 48 | hidden_size=bert_config.hidden_size, 49 | use_token_type=False, 50 | segment_ids=None, 51 | token_type_vocab_size=bert_config.type_vocab_size, 52 | token_type_embedding_name="token_type_embeddings", 53 | use_position_embeddings=True, 54 | position_embedding_name="position_embeddings", 55 | initializer_range=bert_config.initializer_range, 56 | max_position_embeddings=\ 57 | bert_config.max_position_embeddings, 58 | dropout_prob=bert_config.hidden_dropout_prob, 59 | trainable=trainable) 60 | 61 | with tf.variable_scope("encoder"): 62 | attention_mask = self.create_attention_mask_from_input_mask( 63 | input_mask, batch_size, max_seq_length) 64 | 65 | # stacked transformers 66 | self.all_encoder_layers = self.transformer_model( 67 | input_tensor=self.embedding_output, 68 | batch_size=batch_size, 69 | max_seq_length=max_seq_length, 70 | attention_mask=attention_mask, 71 | hidden_size=bert_config.hidden_size, 72 | num_hidden_layers=bert_config.num_hidden_layers, 73 | num_attention_heads=bert_config.num_attention_heads, 74 | intermediate_size=bert_config.intermediate_size, 75 | intermediate_act_fn=util.get_activation( 76 | bert_config.hidden_act), 77 | hidden_dropout_prob=bert_config.hidden_dropout_prob, 78 | attention_probs_dropout_prob=\ 79 | bert_config.attention_probs_dropout_prob, 80 | initializer_range=bert_config.initializer_range, 81 | trainable=trainable) 82 | 83 | self.sequence_output = self.all_encoder_layers[-1] 84 | with tf.variable_scope("pooler"): 85 | first_token_tensor = self.sequence_output[:, 0, :] 86 | 87 | # trick: ignore the fully connected layer 88 | if drop_pooler: 89 | self.pooled_output = first_token_tensor 90 | else: 91 | self.pooled_output = tf.layers.dense( 92 | first_token_tensor, 93 | bert_config.hidden_size, 94 | activation=tf.tanh, 95 | kernel_initializer=util.create_initializer( 96 | bert_config.initializer_range), 97 | trainable=trainable) 98 | 99 | def embedding_preprocessor(self, 100 | input_values, 101 | batch_size=None, 102 | embedding_size=128, 103 | initializer_range=0.02, 104 | name="cls_embedding", 105 | dtype=tf.float32, 106 | trainable=True): 107 | 108 | with tf.variable_scope(name): 109 | input_values = util.layer_norm( 110 | input_values, 111 | trainable=trainable) 112 | linear_output = tf.layers.dense( 113 | input_values, 114 | embedding_size, 115 | activation=None, 116 | name="dense", 117 | kernel_initializer=util.create_initializer(initializer_range), 118 | trainable=trainable) 119 | 120 | cls_embedding = tf.get_variable( 121 | name="cls", 122 | shape=[1, 1, embedding_size], 123 | initializer=util.create_initializer(initializer_range), 124 | dtype=dtype, 125 | trainable=trainable) 126 | cls_output = tf.tile(cls_embedding, [batch_size, 1, 1]) 127 | 128 | output = tf.concat([cls_output, linear_output], axis=1) 129 | return output 130 | -------------------------------------------------------------------------------- /uf/apps/textcnn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/textcnn/__init__.py -------------------------------------------------------------------------------- /uf/apps/textcnn/textcnn.py: -------------------------------------------------------------------------------- 1 | """ Convolutional neural network on texture analysis. 
""" 2 | 3 | from ...third import tf 4 | from .._base_._base_ import BaseEncoder 5 | from .. import util 6 | 7 | 8 | class TextCNNEncoder(BaseEncoder): 9 | def __init__(self, 10 | vocab_size, 11 | filter_sizes, 12 | num_channels, 13 | is_training, 14 | input_ids, 15 | scope="text_cnn", 16 | embedding_size=256, 17 | dropout_prob=0.1, 18 | trainable=True, 19 | **kwargs): 20 | 21 | input_shape = util.get_shape_list(input_ids, expected_rank=2) 22 | batch_size = input_shape[0] 23 | max_seq_length = input_shape[1] 24 | 25 | if isinstance(filter_sizes, str): 26 | filter_sizes = filter_sizes.split(",") 27 | assert isinstance(filter_sizes, list), ( 28 | "`filter_sizes` should be a list of integers or a string " 29 | "seperated with commas.") 30 | 31 | with tf.variable_scope(scope): 32 | with tf.variable_scope("embeddings"): 33 | 34 | embedding_table = kwargs.get("tilda_embeddings") 35 | if embedding_table is None: 36 | embedding_table = tf.get_variable( 37 | name="word_embeddings", 38 | shape=[vocab_size, embedding_size], 39 | initializer=util.create_initializer(0.02), 40 | dtype=tf.float32, 41 | trainable=trainable) 42 | 43 | flat_input_ids = tf.reshape(input_ids, [-1]) 44 | output = tf.gather( 45 | embedding_table, flat_input_ids, name="embedding_look_up") 46 | output = tf.reshape( 47 | output, [batch_size, max_seq_length, embedding_size]) 48 | 49 | output_expanded = tf.expand_dims(output, -1) 50 | 51 | # Create a convolution + maxpool layer for each filter size 52 | pooled_outputs = [] 53 | for i, filter_size in enumerate(filter_sizes): 54 | with tf.variable_scope("conv_%s" % filter_size): 55 | 56 | # Convolution Layer 57 | W = tf.get_variable( 58 | name="W", 59 | shape=[int(filter_size), embedding_size, 1, num_channels], 60 | initializer=tf.truncated_normal_initializer(0.1), 61 | dtype=tf.float32, 62 | trainable=trainable) 63 | b = tf.get_variable( 64 | name="b", 65 | shape=[num_channels], 66 | initializer=tf.constant_initializer(0.1), 67 | dtype=tf.float32, 68 | trainable=trainable) 69 | conv = tf.nn.conv2d( 70 | output_expanded, W, 71 | strides=[1, 1, 1, 1], 72 | padding="VALID", 73 | name="conv") 74 | 75 | # Apply nonlinearity 76 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 77 | 78 | # Maxpooling over the outputs 79 | pooled = tf.nn.max_pool( 80 | h, 81 | ksize=[1, max_seq_length - int(filter_size) + 1, 1, 1], 82 | strides=[1, 1, 1, 1], 83 | padding="VALID", 84 | name="pool") 85 | pooled_outputs.append(pooled) 86 | 87 | num_channels_total = num_channels * len(filter_sizes) 88 | h_pool = tf.concat(pooled_outputs, 3) 89 | h_pool_flat = tf.reshape(h_pool, [batch_size, num_channels_total]) 90 | 91 | with tf.name_scope("dropout"): 92 | self.pooled_output = util.dropout(h_pool_flat, dropout_prob) 93 | 94 | def get_pooled_output(self): 95 | """ Returns a tensor with shape [batch_size, hidden_size]. """ 96 | return self.pooled_output 97 | 98 | 99 | def get_decay_power(): 100 | decay_power = { 101 | "/embeddings": 2, 102 | "/conv_": 1, 103 | "cls/": 0, 104 | } 105 | return decay_power 106 | -------------------------------------------------------------------------------- /uf/apps/textcnn/textcnn_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .textcnn import TextCNNEncoder, get_decay_power 4 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 5 | from ...token import WordPieceTokenizer 6 | from ...third import tf 7 | from ... 
import com 8 | 9 | 10 | class TextCNNClassifier(ClassifierModule): 11 | """ Single-label classifier on TextCNN. """ 12 | 13 | def __init__( 14 | self, 15 | vocab_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | filter_sizes="2,4,6", 22 | num_channels=6, 23 | hidden_size=256, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.truncate_method = truncate_method 33 | self._filter_sizes = filter_sizes 34 | self._num_channels = num_channels 35 | self._hidden_size = hidden_size 36 | 37 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 38 | self.decay_power = get_decay_power() 39 | 40 | if "[CLS]" not in self.tokenizer.vocab: 41 | self.tokenizer.add("[CLS]") 42 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 43 | if "[SEP]" not in self.tokenizer.vocab: 44 | self.tokenizer.add("[SEP]") 45 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 46 | 47 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 48 | self._assert_legal(X, y, sample_weight, X_tokenized) 49 | 50 | if is_training: 51 | assert y is not None, "`y` can't be None." 52 | if is_parallel: 53 | assert self.label_size, "Can't parse data on multi-processing when `label_size` is None." 54 | 55 | n_inputs = None 56 | data = {} 57 | 58 | # convert X 59 | if X is not None or X_tokenized is not None: 60 | tokenized = False if X is not None else X_tokenized 61 | input_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 62 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 63 | n_inputs = len(input_ids) 64 | 65 | if n_inputs < self.batch_size: 66 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 67 | 68 | # convert y 69 | if y is not None: 70 | label_ids = self._convert_y(y) 71 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 72 | 73 | # convert sample_weight 74 | if is_training or y is not None: 75 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 76 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 77 | 78 | return data 79 | 80 | def _convert_X(self, X_target, tokenized): 81 | 82 | # tokenize input texts 83 | segment_input_tokens = [] 84 | for idx, sample in enumerate(X_target): 85 | try: 86 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 87 | except Exception as e: 88 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 89 | 90 | input_ids = [] 91 | for idx, segments in enumerate(segment_input_tokens): 92 | _input_tokens = ["[CLS]"] 93 | _input_ids = [] 94 | 95 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 96 | for s_id, segment in enumerate(segments): 97 | _input_tokens.extend(segment + ["[SEP]"]) 98 | 99 | _input_ids = self.tokenizer.convert_tokens_to_ids(_input_tokens) 100 | 101 | # padding 102 | for _ in range(self.max_seq_length - len(_input_ids)): 103 | _input_ids.append(0) 104 | 105 | input_ids.append(_input_ids) 106 | 107 | return input_ids 108 | 109 | def _set_placeholders(self, **kwargs): 110 | self.placeholders = { 111 | "input_ids": tf.placeholder(tf.int32, [None, self.max_seq_length], "input_ids"), 112 | "label_ids": tf.placeholder(tf.int32, [None], "label_ids"), 113 | "sample_weight": tf.placeholder(tf.float32, [None], "sample_weight"), 114 | } 115 | 116 | def _forward(self, is_training, placeholders, **kwargs): 117 | 118 | encoder = TextCNNEncoder( 119 | vocab_size=len(self.tokenizer.vocab), 120 | filter_sizes=self._filter_sizes, 121 | num_channels=self._num_channels, 122 | is_training=is_training, 123 | input_ids=placeholders["input_ids"], 124 | embedding_size=self._hidden_size, 125 | **kwargs, 126 | ) 127 | encoder_output = encoder.get_pooled_output() 128 | decoder = ClsDecoder( 129 | is_training=is_training, 130 | input_tensor=encoder_output, 131 | label_ids=placeholders["label_ids"], 132 | label_size=self.label_size, 133 | sample_weight=placeholders.get("sample_weight"), 134 | scope="cls/seq_relationship", 135 | **kwargs, 136 | ) 137 | train_loss, tensors = decoder.get_forward_outputs() 138 | return train_loss, tensors 139 | -------------------------------------------------------------------------------- /uf/apps/tinybert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/tinybert/__init__.py -------------------------------------------------------------------------------- /uf/apps/tinybert/tinybert_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | from .._base_._base_binary_classifier import BinaryClassifierModule 6 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 7 | from ..bert.bert import BERTConfig 8 | from .tinybert import TinyBERTBinaryClsDistillor 9 | from ...token import WordPieceTokenizer 10 | from ...third import tf 11 | 12 | 13 | class TinyBERTBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 14 | """ Multi-label classifier on TinyBERT, a distillation model. 
""" 15 | 16 | def __init__( 17 | self, 18 | config_file, 19 | vocab_file, 20 | max_seq_length=128, 21 | label_size=None, 22 | init_checkpoint=None, 23 | output_dir=None, 24 | gpu_ids=None, 25 | drop_pooler=False, 26 | hidden_size=384, 27 | num_hidden_layers=4, 28 | do_lower_case=True, 29 | truncate_method="LIFO", 30 | ): 31 | self.__init_args__ = locals() 32 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 33 | 34 | self.max_seq_length = max_seq_length 35 | self.label_size = label_size 36 | self.truncate_method = truncate_method 37 | self._drop_pooler = drop_pooler 38 | 39 | self.bert_config = BERTConfig.from_json_file(config_file) 40 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 41 | self.decay_power = "unsupported" 42 | 43 | self.student_config = copy.deepcopy(self.bert_config) 44 | self.student_config.hidden_size = hidden_size 45 | self.student_config.intermediate_size = 4 * hidden_size 46 | self.student_config.num_hidden_layers = num_hidden_layers 47 | 48 | assert label_size, ("`label_size` can't be None.") 49 | if "[CLS]" not in self.tokenizer.vocab: 50 | self.tokenizer.add("[CLS]") 51 | self.bert_config.vocab_size += 1 52 | self.student_config.vocab_size += 1 53 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 54 | if "[SEP]" not in self.tokenizer.vocab: 55 | self.tokenizer.add("[SEP]") 56 | self.bert_config.vocab_size += 1 57 | self.student_config.vocab_size += 1 58 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 59 | 60 | def to_bert(self, save_dir): 61 | """ Isolate student tiny_bert out of traing graph. """ 62 | if not self._session_built: 63 | raise ValueError("Init, fit, predict or score before saving checkpoint.") 64 | 65 | tf.gfile.MakeDirs(save_dir) 66 | 67 | tf.logging.info("Saving checkpoint into %s/bert_model.ckpt" % (save_dir)) 68 | self.init_checkpoint = save_dir + "/bert_model.ckpt" 69 | 70 | assignment_map = {} 71 | for var in self.global_variables: 72 | if var.name.startswith("tiny/"): 73 | assignment_map[var.name.replace("tiny/", "")[:-2]] = var 74 | saver = tf.train.Saver(assignment_map, max_to_keep=1000000) 75 | saver.save(self.sess, self.init_checkpoint) 76 | 77 | self.student_config.to_json_file(os.path.join(save_dir, "bert_config.json")) 78 | 79 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 80 | self._assert_legal(X, y, sample_weight, X_tokenized) 81 | 82 | if is_training: 83 | assert y is None, "Training of %s is unsupervised. `y` should be None." 
% self.__class__.__name__ 84 | 85 | n_inputs = None 86 | data = {} 87 | 88 | # convert X 89 | if X is not None or X_tokenized is not None: 90 | tokenized = False if X is not None else X_tokenized 91 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 92 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 93 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 94 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 95 | n_inputs = len(input_ids) 96 | 97 | if n_inputs < self.batch_size: 98 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 99 | 100 | # convert y 101 | if y is not None: 102 | label_ids = self._convert_y(y) 103 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 104 | 105 | # convert sample_weight 106 | if is_training or y is not None: 107 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 108 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 109 | 110 | return data 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | model = TinyBERTBinaryClsDistillor( 115 | student_config=self.student_config, 116 | bert_config=self.bert_config, 117 | is_training=is_training, 118 | input_ids=placeholders["input_ids"], 119 | input_mask=placeholders["input_mask"], 120 | segment_ids=placeholders["segment_ids"], 121 | label_ids=placeholders.get("label_ids"), 122 | sample_weight=placeholders.get("sample_weight"), 123 | drop_pooler=self._drop_pooler, 124 | label_size=self.label_size, 125 | **kwargs, 126 | ) 127 | train_loss, tensors = model.get_forward_outputs() 128 | return train_loss, tensors 129 | 130 | def _get_fit_ops(self, from_tfrecords=False): 131 | return [self.tensors["losses"]] 132 | 133 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 134 | 135 | # loss 136 | batch_losses = output_arrays[0] 137 | loss = np.mean(batch_losses) 138 | 139 | info = "" 140 | info += ", distill loss %.6f" % loss 141 | 142 | return info 143 | -------------------------------------------------------------------------------- /uf/apps/tinybert/tinybert_classifier.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import numpy as np 4 | 5 | from .tinybert import TinyBERTClsDistillor 6 | from .._base_._base_classifier import ClassifierModule 7 | from ..bert.bert_classifier import BERTClassifier 8 | from ..bert.bert import BERTConfig 9 | from ...token import WordPieceTokenizer 10 | from ...third import tf 11 | 12 | 13 | class TinyBERTClassifier(BERTClassifier, ClassifierModule): 14 | """ Single-label classifier on TinyBERT, a distillation model. 
""" 15 | 16 | def __init__( 17 | self, 18 | config_file, 19 | vocab_file, 20 | max_seq_length=128, 21 | label_size=None, 22 | init_checkpoint=None, 23 | output_dir=None, 24 | gpu_ids=None, 25 | drop_pooler=False, 26 | hidden_size=384, 27 | num_hidden_layers=4, 28 | do_lower_case=True, 29 | truncate_method="LIFO", 30 | ): 31 | self.__init_args__ = locals() 32 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 33 | 34 | self.max_seq_length = max_seq_length 35 | self.label_size = label_size 36 | self.truncate_method = truncate_method 37 | self._drop_pooler = drop_pooler 38 | 39 | self.bert_config = BERTConfig.from_json_file(config_file) 40 | self.tokenizer = WordPieceTokenizer(vocab_file, do_lower_case) 41 | self.decay_power = "unsupported" 42 | 43 | self.student_config = copy.deepcopy(self.bert_config) 44 | self.student_config.hidden_size = hidden_size 45 | self.student_config.intermediate_size = 4 * hidden_size 46 | self.student_config.num_hidden_layers = num_hidden_layers 47 | 48 | assert label_size, ("`label_size` can't be None.") 49 | if "[CLS]" not in self.tokenizer.vocab: 50 | self.tokenizer.add("[CLS]") 51 | self.bert_config.vocab_size += 1 52 | self.student_config.vocab_size += 1 53 | tf.logging.info("Add necessary token `[CLS]` into vocabulary.") 54 | if "[SEP]" not in self.tokenizer.vocab: 55 | self.tokenizer.add("[SEP]") 56 | self.bert_config.vocab_size += 1 57 | self.student_config.vocab_size += 1 58 | tf.logging.info("Add necessary token `[SEP]` into vocabulary.") 59 | 60 | def to_bert(self, save_dir): 61 | """ Isolate student tiny_bert out of traing graph. """ 62 | if not self._session_built: 63 | raise ValueError("Init, fit, predict or score before saving checkpoint.") 64 | 65 | tf.gfile.MakeDirs(save_dir) 66 | 67 | tf.logging.info("Saving checkpoint into %s/bert_model.ckpt" % save_dir) 68 | self.init_checkpoint = save_dir + "/bert_model.ckpt" 69 | 70 | assignment_map = {} 71 | for var in self.global_variables: 72 | if var.name.startswith("tiny/"): 73 | assignment_map[var.name.replace("tiny/", "")[:-2]] = var 74 | saver = tf.train.Saver(assignment_map, max_to_keep=1000000) 75 | saver.save(self.sess, self.init_checkpoint) 76 | 77 | self.student_config.to_json_file(os.path.join(save_dir, "bert_config.json")) 78 | 79 | def convert(self, X=None, y=None, sample_weight=None, X_tokenized=None, is_training=False, is_parallel=False): 80 | self._assert_legal(X, y, sample_weight, X_tokenized) 81 | 82 | if is_training: 83 | assert y is None, "Training of %s is unsupervised. `y` should be None." 
% self.__class__.__name__ 84 | 85 | n_inputs = None 86 | data = {} 87 | 88 | # convert X 89 | if X is not None or X_tokenized is not None: 90 | tokenized = False if X is not None else X_tokenized 91 | input_ids, input_mask, segment_ids = self._convert_X(X_tokenized if tokenized else X, tokenized=tokenized) 92 | data["input_ids"] = np.array(input_ids, dtype=np.int32) 93 | data["input_mask"] = np.array(input_mask, dtype=np.int32) 94 | data["segment_ids"] = np.array(segment_ids, dtype=np.int32) 95 | n_inputs = len(input_ids) 96 | 97 | if n_inputs < self.batch_size: 98 | self.batch_size = max(n_inputs, len(self._gpu_ids)) 99 | 100 | if y is not None: 101 | # convert y and sample_weight 102 | label_ids = self._convert_y(y) 103 | data["label_ids"] = np.array(label_ids, dtype=np.int32) 104 | 105 | # convert sample_weight 106 | if is_training or y is not None: 107 | sample_weight = self._convert_sample_weight(sample_weight, n_inputs) 108 | data["sample_weight"] = np.array(sample_weight, dtype=np.float32) 109 | 110 | return data 111 | 112 | def _forward(self, is_training, placeholders, **kwargs): 113 | 114 | model = TinyBERTClsDistillor( 115 | student_config=self.student_config, 116 | bert_config=self.bert_config, 117 | is_training=is_training, 118 | input_ids=placeholders["input_ids"], 119 | input_mask=placeholders["input_mask"], 120 | segment_ids=placeholders["segment_ids"], 121 | label_ids=placeholders.get("label_ids"), 122 | sample_weight=placeholders.get("sample_weight"), 123 | drop_pooler=self._drop_pooler, 124 | label_size=self.label_size, 125 | **kwargs, 126 | ) 127 | train_loss, tensors = model.get_forward_outputs() 128 | return train_loss, tensors 129 | 130 | def _get_fit_ops(self, from_tfrecords=False): 131 | return [self.tensors["losses"]] 132 | 133 | def _get_fit_info(self, output_arrays, feed_dict, from_tfrecords=False): 134 | 135 | # loss 136 | batch_losses = output_arrays[0] 137 | loss = np.mean(batch_losses) 138 | 139 | info = "" 140 | info += ", distill loss %.6f" % loss 141 | 142 | return info 143 | -------------------------------------------------------------------------------- /uf/apps/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/transformer/__init__.py -------------------------------------------------------------------------------- /uf/apps/uda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/uda/__init__.py -------------------------------------------------------------------------------- /uf/apps/unilm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/unilm/__init__.py -------------------------------------------------------------------------------- /uf/apps/vae/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/vae/__init__.py -------------------------------------------------------------------------------- /uf/apps/widedeep/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/widedeep/__init__.py -------------------------------------------------------------------------------- /uf/apps/xlnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyingli/unif/2d4a447acc540f29d4b04c117ca03583cd94a5e2/uf/apps/xlnet/__init__.py -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_binary_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 4 | from .._base_._base_binary_classifier import BinaryClsDecoder, BinaryClassifierModule 5 | from ..bert.bert_binary_classifier import BERTBinaryClassifier 6 | from ...token import SentencePieceTokenizer 7 | from ...third import tf 8 | from ... import com 9 | 10 | 11 | class XLNetBinaryClassifier(BERTBinaryClassifier, BinaryClassifierModule): 12 | """ Multi-label classifier on XLNet. """ 13 | 14 | def __init__( 15 | self, 16 | config_file, 17 | spm_file, 18 | max_seq_length=128, 19 | label_size=None, 20 | label_weight=None, 21 | init_checkpoint=None, 22 | output_dir=None, 23 | gpu_ids=None, 24 | do_lower_case=True, 25 | truncate_method="LIFO", 26 | ): 27 | self.__init_args__ = locals() 28 | super(BinaryClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 29 | 30 | self.max_seq_length = max_seq_length 31 | self.label_size = label_size 32 | self.label_weight = label_weight 33 | self.truncate_method = truncate_method 34 | 35 | self.xlnet_config = XLNetConfig(json_path=config_file) 36 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 37 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 38 | 39 | def _convert_X(self, X_target, tokenized): 40 | 41 | # tokenize input texts 42 | segment_input_tokens = [] 43 | for idx, sample in enumerate(X_target): 44 | try: 45 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 46 | except Exception as e: 47 | raise ValueError("Wrong input format (%s): %s." 
% (sample, e)) 48 | 49 | input_ids = [] 50 | input_mask = [] 51 | segment_ids = [] 52 | for idx, segments in enumerate(segment_input_tokens): 53 | _input_ids = [] 54 | _input_mask = [] 55 | _segment_ids = [] 56 | 57 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 58 | 59 | for s_id, segment in enumerate(segments): 60 | _segment_id = min(s_id, 1) 61 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 62 | _input_mask.extend([0] * (len(segment) + 1)) 63 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 64 | 65 | _input_ids.append(CLS_ID) 66 | _input_mask.append(0) 67 | _segment_ids.append(SEG_ID_CLS) 68 | 69 | # padding 70 | if len(_input_ids) < self.max_seq_length: 71 | delta_len = self.max_seq_length - len(_input_ids) 72 | _input_ids = [0] * delta_len + _input_ids 73 | _input_mask = [1] * delta_len + _input_mask 74 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 75 | 76 | input_ids.append(_input_ids) 77 | input_mask.append(_input_mask) 78 | segment_ids.append(_segment_ids) 79 | 80 | return input_ids, input_mask, segment_ids 81 | 82 | def _forward(self, is_training, placeholders, **kwargs): 83 | 84 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 85 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 86 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 87 | 88 | encoder = XLNetEncoder( 89 | xlnet_config=self.xlnet_config, 90 | is_training=is_training, 91 | input_ids=input_ids, 92 | seg_ids=segment_ids, 93 | input_mask=input_mask, 94 | **kwargs, 95 | ) 96 | encoder_output = encoder.get_pooled_output() 97 | decoder = BinaryClsDecoder( 98 | is_training=is_training, 99 | input_tensor=encoder_output, 100 | label_ids=placeholders["label_ids"], 101 | label_size=self.label_size, 102 | sample_weight=placeholders.get("sample_weight"), 103 | label_weight=self.label_weight, 104 | scope="cls/seq_relationship", 105 | **kwargs, 106 | ) 107 | train_loss, tensors = decoder.get_forward_outputs() 108 | return train_loss, tensors 109 | -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_classifier.py: -------------------------------------------------------------------------------- 1 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 2 | from .._base_._base_classifier import ClsDecoder, ClassifierModule 3 | from ..bert.bert_classifier import BERTClassifier 4 | from ...token import SentencePieceTokenizer 5 | from ...third import tf 6 | from ... import com 7 | 8 | 9 | class XLNetClassifier(BERTClassifier, ClassifierModule): 10 | """ Single-label classifier on XLNet. 
""" 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | spm_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(ClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | 31 | self.xlnet_config = XLNetConfig(json_path=config_file) 32 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 33 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 34 | 35 | def _convert_X(self, X_target, tokenized): 36 | 37 | # tokenize input texts 38 | segment_input_tokens = [] 39 | for idx, sample in enumerate(X_target): 40 | try: 41 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 42 | except Exception as e: 43 | raise ValueError("Wrong input format (%s): %s." % (sample, e)) 44 | 45 | input_ids = [] 46 | input_mask = [] 47 | segment_ids = [] 48 | for idx, segments in enumerate(segment_input_tokens): 49 | _input_ids = [] 50 | _input_mask = [] 51 | _segment_ids = [] 52 | 53 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 54 | 55 | for s_id, segment in enumerate(segments): 56 | _segment_id = min(s_id, 1) 57 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 58 | _input_mask.extend([0] * (len(segment) + 1)) 59 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 60 | 61 | _input_ids.append(CLS_ID) 62 | _input_mask.append(0) 63 | _segment_ids.append(SEG_ID_CLS) 64 | 65 | # padding 66 | if len(_input_ids) < self.max_seq_length: 67 | delta_len = self.max_seq_length - len(_input_ids) 68 | _input_ids = [0] * delta_len + _input_ids 69 | _input_mask = [1] * delta_len + _input_mask 70 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 71 | 72 | input_ids.append(_input_ids) 73 | input_mask.append(_input_mask) 74 | segment_ids.append(_segment_ids) 75 | 76 | return input_ids, input_mask, segment_ids 77 | 78 | def _forward(self, is_training, placeholders, **kwargs): 79 | 80 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 81 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 82 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 83 | 84 | encoder = XLNetEncoder( 85 | xlnet_config=self.xlnet_config, 86 | is_training=is_training, 87 | input_ids=input_ids, 88 | seg_ids=segment_ids, 89 | input_mask=input_mask, 90 | **kwargs, 91 | ) 92 | encoder_output = encoder.get_pooled_output() 93 | decoder = ClsDecoder( 94 | is_training=is_training, 95 | input_tensor=encoder_output, 96 | label_ids=placeholders["label_ids"], 97 | label_size=self.label_size, 98 | sample_weight=placeholders.get("sample_weight"), 99 | scope="cls/seq_relationship", 100 | **kwargs, 101 | ) 102 | train_loss, tensors = decoder.get_forward_outputs() 103 | return train_loss, tensors 104 | -------------------------------------------------------------------------------- /uf/apps/xlnet/xlnet_seq_classifier.py: -------------------------------------------------------------------------------- 1 | from .xlnet import XLNetEncoder, XLNetConfig, get_decay_power, SEG_ID_CLS, SEG_ID_PAD, CLS_ID, SEP_ID 2 | from .._base_._base_seq_classifier import SeqClsDecoder, SeqClassifierModule 3 | from ..bert.bert_seq_classifier import BERTSeqClassifier 4 | from ...token import 
SentencePieceTokenizer 5 | from ...third import tf 6 | from ... import com 7 | 8 | 9 | class XLNetSeqClassifier(BERTSeqClassifier, SeqClassifierModule): 10 | """ Sequence labeling classifier on XLNet. """ 11 | 12 | def __init__( 13 | self, 14 | config_file, 15 | spm_file, 16 | max_seq_length=128, 17 | label_size=None, 18 | init_checkpoint=None, 19 | output_dir=None, 20 | gpu_ids=None, 21 | do_lower_case=True, 22 | truncate_method="LIFO", 23 | ): 24 | self.__init_args__ = locals() 25 | super(SeqClassifierModule, self).__init__(init_checkpoint, output_dir, gpu_ids) 26 | 27 | self.max_seq_length = max_seq_length 28 | self.label_size = label_size 29 | self.truncate_method = truncate_method 30 | 31 | self.xlnet_config = XLNetConfig(json_path=config_file) 32 | self.tokenizer = SentencePieceTokenizer(spm_file, do_lower_case) 33 | self.decay_power = get_decay_power(self.xlnet_config.n_layer) 34 | 35 | def _convert_X(self, X_target, tokenized): 36 | 37 | # tokenize input texts 38 | segment_input_tokens = [] 39 | for idx, sample in enumerate(X_target): 40 | try: 41 | segment_input_tokens.append(self._convert_x(sample, tokenized)) 42 | except Exception as e: 43 | raise ValueError("Wrong input format (%s): %s." % (sample, e)) 44 | 45 | input_ids = [] 46 | input_mask = [] 47 | segment_ids = [] 48 | for idx, segments in enumerate(segment_input_tokens): 49 | _input_ids = [] 50 | _input_mask = [] 51 | _segment_ids = [] 52 | 53 | com.truncate_segments(segments, self.max_seq_length - len(segments) - 1, truncate_method=self.truncate_method) 54 | 55 | for s_id, segment in enumerate(segments): 56 | _segment_id = min(s_id, 1) 57 | _input_ids.extend(self.tokenizer.convert_tokens_to_ids(segment) + [SEP_ID]) 58 | _input_mask.extend([1] * (len(segment) + 1)) 59 | _segment_ids.extend([_segment_id] * (len(segment) + 1)) 60 | 61 | _input_ids.append(CLS_ID) 62 | _input_mask.append(1) 63 | _segment_ids.append(SEG_ID_CLS) 64 | 65 | # padding 66 | if len(_input_ids) < self.max_seq_length: 67 | delta_len = self.max_seq_length - len(_input_ids) 68 | _input_ids = [0] * delta_len + _input_ids 69 | _input_mask = [0] * delta_len + _input_mask # it's 1 in source code 70 | _segment_ids = [SEG_ID_PAD] * delta_len + _segment_ids 71 | 72 | input_ids.append(_input_ids) 73 | input_mask.append(_input_mask) 74 | segment_ids.append(_segment_ids) 75 | 76 | return input_ids, input_mask, segment_ids 77 | 78 | def _forward(self, is_training, placeholders, **kwargs): 79 | 80 | input_ids = tf.transpose(placeholders["input_ids"], [1, 0]) 81 | input_mask = tf.transpose(placeholders["input_mask"], [1, 0]) 82 | segment_ids = tf.transpose(placeholders["segment_ids"], [1, 0]) 83 | 84 | encoder = XLNetEncoder( 85 | xlnet_config=self.xlnet_config, 86 | is_training=is_training, 87 | input_ids=input_ids, 88 | seg_ids=segment_ids, 89 | input_mask=input_mask, 90 | **kwargs, 91 | ) 92 | encoder_output = encoder.get_sequence_output() 93 | decoder = SeqClsDecoder( 94 | is_training=is_training, 95 | input_tensor=encoder_output, 96 | input_mask=placeholders["input_mask"], 97 | label_ids=placeholders["label_ids"], 98 | label_size=self.label_size, 99 | sample_weight=placeholders.get("sample_weight"), 100 | scope="cls/sequence", 101 | **kwargs, 102 | ) 103 | train_loss, tensors = decoder.get_forward_outputs() 104 | return train_loss, tensors 105 | -------------------------------------------------------------------------------- /uf/com/__init__.py: -------------------------------------------------------------------------------- 1 | from .cache import * 2 | from 
.checkpoint import * 3 | from .graph import * 4 | from .parallel import * 5 | from .resource import * 6 | from .text import * 7 | from .tfrecords import * 8 | from .com import * 9 | -------------------------------------------------------------------------------- /uf/com/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import collections 5 | 6 | from .. import apps 7 | from ..third import tf 8 | 9 | 10 | def restore(key, from_file="./.unif", **kwargs): 11 | """ Load model from configurations saved in local file. 12 | 13 | Args: 14 | key: string. Unique name of configuration to load. 15 | from_file: string. The path of configuration file. 16 | """ 17 | tf.logging.info("Loading model `%s` from %s" % (key, from_file)) 18 | 19 | if not os.path.exists(from_file): 20 | raise ValueError("No file found with `%s`." % from_file) 21 | from_fp = open(from_file, encoding="utf-8") 22 | from_json = json.load(from_fp) 23 | from_fp.close() 24 | if key not in from_json.keys(): 25 | raise ValueError("No key `%s`." % key) 26 | _from_json = from_json[key] 27 | 28 | # restore configuration 29 | model_name = _from_json["model"] 30 | init_args = collections.OrderedDict() 31 | if "__init__" in _from_json: # unif >= beta v2.1.35 32 | zips = _from_json["__init__"].items() 33 | elif "keys" in _from_json: # unif < beta v2.1.35 34 | zips = zip(_from_json["keys"], _from_json["values"]) 35 | else: 36 | raise ValueError("Wrong format.") 37 | 38 | from_dir = os.path.dirname(from_file) 39 | if from_dir == "": 40 | from_dir = "." 41 | for arg, value in zips: 42 | 43 | # convert from relative path 44 | if arg == "init_checkpoint" or arg.endswith("_dir") or arg.endswith("_file"): 45 | if isinstance(value, str) and not value.startswith("/"): 46 | value = get_simplified_path(from_dir + "/" + value) 47 | 48 | if arg in kwargs: 49 | value = kwargs[arg] 50 | init_args[arg] = value 51 | model = apps.__dict__[model_name](**init_args) 52 | 53 | # restore attributes 54 | for arg, value in _from_json.get("__dict__", {}).items(): 55 | model.__dict__[arg] = value 56 | 57 | return model 58 | 59 | 60 | def load(key, cache_file="./.cache", **kwargs): 61 | """ Load model from configurations saved in cache file. 62 | 63 | NOTE: This function is deprecated and not upgraded, 64 | retained only for compatibility with older versions. 65 | Try `uf.restore()` instead. 66 | """ 67 | return restore(key, from_file=cache_file, **kwargs) 68 | 69 | 70 | def get_init_values(model): 71 | values = [] 72 | for arg in model.__class__.__init__.__code__.co_varnames[1:]: 73 | try: 74 | value = model.__getattribute__(arg) 75 | except Exception: 76 | value = model.__init_args__[arg] 77 | values.append(value) 78 | return values 79 | 80 | 81 | def get_relative_path(source, target): 82 | source = source.replace("\\", "/") 83 | target = target.replace("\\", "/") 84 | 85 | if source.startswith("/"): 86 | raise ValueError("Not a relative path: %s." % source) 87 | if target.startswith("/"): 88 | raise ValueError("Not a relative path: %s." 
% target) 89 | 90 | output = get_reverse_path(source) + "/" + target 91 | output = get_simplified_path(output) 92 | return output 93 | 94 | 95 | def get_simplified_path(path): 96 | path = path.replace("\\", "/") 97 | while True: 98 | res = re.findall("[^/]+/[.][.]/", path) 99 | res = [item for item in res if item != "../../" and item != "./../"] 100 | if res: 101 | path = path.replace(res[0], "") 102 | else: 103 | return path.replace("/./", "/") 104 | 105 | 106 | def get_reverse_path(path): 107 | path = path.replace("\\", "/") 108 | 109 | if path.startswith("/"): 110 | raise ValueError("Not a relative path.") 111 | 112 | output = "" 113 | 114 | if os.path.isdir(path): 115 | if path.endswith("/"): 116 | path = path[:-1] 117 | else: 118 | path = os.path.dirname(path) 119 | 120 | if path == "": 121 | return "." 122 | 123 | cwd = os.getcwd() 124 | for seg in path.split("/"): 125 | if seg == ".": 126 | pass 127 | elif seg == "..": 128 | output = "/" + cwd.split("/")[-1] + output 129 | cwd = os.path.dirname(cwd) 130 | else: 131 | output = "/.." + output 132 | cwd += "/" + seg 133 | 134 | output = output[1:] 135 | 136 | if output == "": 137 | return "." 138 | 139 | return output 140 | -------------------------------------------------------------------------------- /uf/com/checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | from ..third import tf 5 | 6 | 7 | def get_checkpoint_path(path): 8 | """ If detected no checkpoint file, return None. """ 9 | 10 | # get directory 11 | dir_name = path if os.path.isdir(path) else os.path.dirname(path) 12 | if not dir_name: 13 | dir_name = "." 14 | 15 | # get file 16 | if not os.path.isdir(path): 17 | prefix = path.strip("/").split("/")[-1] 18 | 19 | # find checkpoint 20 | if os.path.isfile(f"{dir_name}/{prefix}.index"): 21 | return f"{dir_name}/{prefix}" 22 | 23 | # stop to avoid error 24 | return None 25 | 26 | # get file from record file 27 | if os.path.exists(f"{dir_name}/checkpoint"): 28 | with open(f"{dir_name}/checkpoint") as f: 29 | line = f.readline() 30 | try: 31 | prefix = re.findall("model_checkpoint_path: \"(.+?)\"", line)[0] 32 | if os.path.exists(f"{dir_name}/{prefix}.index"): 33 | return f"{dir_name}/{prefix}" 34 | except IndexError: 35 | pass 36 | 37 | # find file with largest step 38 | files = [] 39 | for file in os.listdir(dir_name): 40 | if not file.endswith(".index"): 41 | continue 42 | prefix = file.replace(".index", "") 43 | step = 0 44 | try: 45 | step = int(prefix.split("-")[-1]) 46 | except: 47 | pass 48 | files.append((step, file)) 49 | if files: 50 | files.sort(key=lambda x: x[0], reverse=True) 51 | prefix = files[0][1].replace(".index", "") 52 | return f"{dir_name}/{prefix}" 53 | 54 | # find no checkpoint 55 | return None 56 | 57 | 58 | def get_assignment_map(checkpoint_file, variables, continual=False, show_matched=False): 59 | """ Carefully designed so as to fulfil any personalized needs. 
""" 60 | assignment_map = {} 61 | 62 | # read local variables 63 | name_to_variable = {} 64 | for var in variables: 65 | name = var.name 66 | res = re.match("^(.*):\\d+$", name) 67 | if res is not None: 68 | name = res.group(1) 69 | if not continual: 70 | if "global_step" in name \ 71 | or "/adam" in name \ 72 | or "/Adam" in name \ 73 | or "/lamb" in name: 74 | continue 75 | name_to_variable[name] = var 76 | 77 | # read checkpoint variables 78 | init_vars = tf.train.list_variables(checkpoint_file) 79 | inited_vars = {} 80 | for name_shape in init_vars: 81 | (from_name, from_shape) = (name_shape[0], name_shape[1]) 82 | 83 | to_name = from_name 84 | if to_name not in name_to_variable or \ 85 | name_to_variable[to_name].shape.as_list() != from_shape: 86 | if show_matched: 87 | tf.logging.info("checkpoint_file contains <%s>", from_name) 88 | continue 89 | if show_matched: 90 | tf.logging.info("checkpoint_file contains <%s>, matched", from_name) 91 | assignment_map[from_name] = name_to_variable[to_name] 92 | inited_vars[to_name] = 1 93 | 94 | # further feedback 95 | uninited_vars = {} 96 | for var in variables: 97 | if var.name[:-2] not in inited_vars: 98 | if var.name[:-2].endswith("_m") or var.name[:-2].endswith("_v"): 99 | continue 100 | if show_matched: 101 | tf.logging.info("unmatched parameter %s", var) 102 | uninited_vars[var.name[:-2]] = var 103 | return (assignment_map, uninited_vars) 104 | 105 | 106 | def list_variables(checkpoint): 107 | checkpoint_path = get_checkpoint_path(checkpoint) 108 | if not checkpoint_path: 109 | raise ValueError( 110 | "Checkpoint file \"%s\" does not exist. " 111 | "Make sure you pass correct value to " 112 | "`checkpoint`." % checkpoint 113 | ) 114 | return tf.train.list_variables(checkpoint_path) 115 | -------------------------------------------------------------------------------- /uf/com/com.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import numpy as np 4 | 5 | from ..third import tf 6 | 7 | PACK_DIR = os.path.dirname(__file__) 8 | 9 | 10 | class Null: 11 | """ A null class for keeping code compatible when hanging out. """ 12 | def __init__(self, *args, **kwargs): 13 | pass 14 | 15 | def __enter__(self, *args, **kwargs): 16 | pass 17 | 18 | def __exit__(self, *args, **kwargs): 19 | pass 20 | 21 | 22 | def unimported_module(name, message): 23 | """ Returns an invalid module where error occurs only when being called. """ 24 | 25 | class UnimportedModule: 26 | def __init__(self, *args, **kwargs): 27 | raise ImportError(message) 28 | return UnimportedModule 29 | 30 | 31 | def warning(func): 32 | """ A function wrapper to avoid application crash. """ 33 | def wrapper(*args, **kwargs): 34 | try: 35 | func(*args, **kwargs) 36 | except Exception as e: 37 | tf.logging.warning(e) 38 | return wrapper 39 | 40 | 41 | def set_verbosity(level=2): 42 | """ Set exposure level of detail information. """ 43 | if level == 2: 44 | tf.logging.set_verbosity(tf.logging.INFO) 45 | elif level == 1: 46 | tf.logging.set_verbosity(tf.logging.WARN) 47 | elif level == 0: 48 | tf.logging.set_verbosity(tf.logging.ERROR) 49 | else: 50 | raise ValueError( 51 | "Invalid value: %s. Pick from `0`, `1` and `2`. " 52 | "The larger the value, the more information will be printed." % level 53 | ) 54 | 55 | 56 | def set_log(log_file): 57 | """ Set logging file. 
""" 58 | log = logging.getLogger("tensorflow") 59 | log.setLevel(logging.INFO) 60 | fh = logging.FileHandler(log_file) 61 | fh.setLevel(logging.INFO) 62 | log.addHandler(fh) 63 | 64 | 65 | def truncate_segments(segments, max_seq_length, truncate_method="LIFO"): 66 | """ Truncate sequence segments to avoid the overall length exceeds the `max_seq_length`. """ 67 | total_seq_length = sum([len(segment) for segment in segments]) 68 | if total_seq_length <= max_seq_length: 69 | return 70 | if truncate_method not in ("longer-FO", "FIFO", "LIFO"): 71 | raise ValueError("Invalid value for `truncate_method`. Pick one from `FIFO`, `LIFO` and `longer-FO`.") 72 | 73 | n = 0 74 | if truncate_method == "FIFO": 75 | index = 0 76 | while n < total_seq_length - max_seq_length: 77 | if not segments[index]: 78 | index += 1 79 | continue 80 | segments[index].pop(0) 81 | n += 1 82 | elif truncate_method == "LIFO": 83 | index = len(segments) - 1 84 | while n < total_seq_length - max_seq_length: 85 | if not segments[index]: 86 | index -= 1 87 | continue 88 | segments[index].pop() 89 | n += 1 90 | else: 91 | while n < total_seq_length - max_seq_length: 92 | max(segments, key=lambda x: len(x)).pop() 93 | n += 1 94 | 95 | def transform(output_arrays, n_inputs=None): 96 | """ Transform raw outputs. """ 97 | 98 | # consolidate different batches 99 | if isinstance(output_arrays[0], np.ndarray): 100 | if len(output_arrays[0].shape) == 1: # 1D 101 | out = np.hstack(output_arrays) 102 | else: # 2D/3D/... 103 | out = np.vstack(output_arrays) 104 | return out[:n_inputs] if n_inputs else out 105 | 106 | # flatten 107 | elif isinstance(output_arrays[0], list): 108 | out = [item for output_array in output_arrays for item in output_array] 109 | return out[:n_inputs] if n_inputs else out 110 | 111 | else: 112 | return output_arrays 113 | -------------------------------------------------------------------------------- /uf/com/graph.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | 4 | from ..third import tf 5 | 6 | 7 | def get_grad_and_param(variables, grads, param_name): 8 | for (grad, param) in zip(grads, variables): 9 | if param_name in param.name: 10 | return (grad, param) 11 | return None, None 12 | 13 | 14 | def get_param(variables, param_name): 15 | for param in variables: 16 | if param_name in param.name: 17 | return param 18 | return None 19 | 20 | 21 | def get_param_name(param): 22 | res = re.match("^(.*):\\d+$", param.name) 23 | if res is not None: 24 | param_name = res.group(1) 25 | return param_name 26 | 27 | 28 | def count_params(global_variables, trainable_variables): 29 | def get_params(variable): 30 | _tuple = tuple(map(int, variable.shape)) 31 | if not _tuple: 32 | return 0 33 | return np.prod(_tuple) 34 | n_global = 0 35 | for variable in global_variables: 36 | n_global += get_params(variable) 37 | n_trainable = 0 38 | for variable in trainable_variables: 39 | n_trainable += get_params(variable) 40 | tf.logging.info( 41 | "Build graph with %s parameters (among which %s are trainable)" 42 | % (format(int(n_global), ","), format(int(n_trainable), ",")) 43 | ) 44 | 45 | 46 | def scale_grad(grad, scalar): 47 | if grad is None: 48 | return None 49 | 50 | if grad.__str__().startswith("IndexedSlices"): 51 | return tf.IndexedSlices(values=grad.values * scalar, indices=grad.indices, dense_shape=grad.dense_shape) 52 | else: 53 | return grad * scalar 54 | 55 | 56 | def add_n_grads(split_grads): 57 | split_grads = [grad for grad in split_grads if 
grad is not None] 58 | if len(split_grads) == 1: 59 | return split_grads[0] 60 | 61 | # Dealing with IndexedSlices for large-dimensional embedding 62 | # matrix. The gradient of an embedding matrix is not a tensor, 63 | # but a tuple-like object named `IndexedSlices`, for this one, 64 | # we need to take special processings. 65 | if split_grads[0].__str__().startswith("IndexedSlices"): 66 | 67 | values = tf.concat([grad.values for grad in split_grads], axis=0) 68 | indices = tf.concat([grad.indices for grad in split_grads], axis=0) 69 | dense_shape = split_grads[0].dense_shape 70 | 71 | return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape) 72 | 73 | return tf.add_n(split_grads) 74 | 75 | 76 | def average_n_grads(split_grads): 77 | split_grads = [grad for grad in split_grads if grad is not None] 78 | if not split_grads: 79 | return None 80 | if len(split_grads) == 1: 81 | return split_grads[0] 82 | 83 | # Dealing with IndexedSlices for large-dimensional embedding 84 | # matrix. The gradient of an embedding matrix is not a tensor, 85 | # but a tuple-like object named `IndexedSlices`, for this one, 86 | # we need to take special processings. 87 | if split_grads[0].__str__().startswith("IndexedSlices"): 88 | 89 | values = tf.divide(tf.concat([grad.values for grad in split_grads], axis=0), len(split_grads)) 90 | indices = tf.concat([grad.indices for grad in split_grads], axis=0) 91 | dense_shape = split_grads[0].dense_shape 92 | 93 | return tf.IndexedSlices(values=values, indices=indices, dense_shape=dense_shape) 94 | 95 | return tf.divide(tf.add_n(split_grads), len(split_grads)) 96 | 97 | 98 | def update_global_params(variables, global_step, optimizer, grads): 99 | assert len(grads) == len(variables) 100 | update_op = optimizer.apply_gradients(zip(grads, variables), global_step=global_step) 101 | return tf.group(update_op) 102 | -------------------------------------------------------------------------------- /uf/com/parallel.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | from ..third import tf 4 | 5 | 6 | class MultiProcessInstance(): 7 | def __init__(self): 8 | self.n = 1 9 | self.pool = None 10 | 11 | mp = MultiProcessInstance() 12 | 13 | 14 | class MultiProcess: 15 | def __init__(self, n_process="auto"): 16 | n_cpu = multiprocessing.cpu_count() 17 | if n_process != "auto": 18 | assert n_process <= n_cpu, ("Invalid value of `n_process`. It can not exceed the num of cpu cores in the device: %d." % n_cpu) 19 | else: 20 | n_process = n_cpu 21 | self.n = n_process 22 | 23 | def __enter__(self): 24 | if self.n > 1: 25 | mp.pool = multiprocessing.Pool(self.n) 26 | mp.n = self.n 27 | 28 | def __exit__(self, *args, **kwargs): 29 | if mp.pool is not None: 30 | mp.pool.close() 31 | mp.pool.join() 32 | mp.pool = None 33 | mp.n = 1 34 | 35 | 36 | def parallel_convert_single_process(args): 37 | bucket_id = args[0] 38 | app_class = args[1] 39 | mapping = args[2] 40 | data = args[3] 41 | is_training = args[4] 42 | 43 | # Verbosity of tensorflow in new process will be set to default, 44 | # for this reason we just have to silence the logging and don"t 45 | # have to care about the recovery. 
46 | tf.logging.set_verbosity(tf.logging.FATAL) 47 | model = app_class(*mapping) 48 | 49 | data = model.convert( 50 | data["X"], data["y"], data["sample_weight"], data["X_tokenized"], 51 | is_training, True, 52 | ) 53 | return (bucket_id, data) 54 | -------------------------------------------------------------------------------- /uf/com/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | 4 | 5 | def convert_tokens_to_text(tokens): 6 | words = [""] 7 | for _token in tokens: 8 | if _token.startswith("##"): 9 | words[-1] += _token[2:] 10 | else: 11 | words.append(_token) 12 | text = " ".join(words) 13 | 14 | # remove spaces 15 | if len(text) >= 3: 16 | i = 1 17 | while i < len(text) - 1: 18 | if is_whitespace(text[i]): 19 | _last = text[i - 1] 20 | _next = text[i + 1] 21 | 22 | # remove space between chars and punctuations 23 | if not is_english_char(_last) or not is_english_char(_next): 24 | text = text.replace("%s%s%s" % (_last, text[i], _next), "%s%s" % (_last, _next)) 25 | i += 1 26 | 27 | return text.strip() 28 | 29 | 30 | def align_tokens_with_text(tokens, text, lower_case): 31 | if lower_case: 32 | text = text.lower() 33 | 34 | i = 0 35 | j = 0 36 | max_j = len(text) 37 | mapping_start = [] 38 | mapping_end = [] 39 | while i < len(tokens): 40 | token = tokens[i] 41 | token = token.replace("##", "") 42 | if text[j:].startswith(token): 43 | mapping_start.append(j) 44 | mapping_end.append(j + len(token)) 45 | i += 1 46 | j += len(token) 47 | elif token not in text[j:]: # [CLS], [SEP], None, some Japanese signs 48 | mapping_start.append(j) 49 | if token in ("[CLS]", "[SEP]"): 50 | mapping_end.append(j) 51 | else: 52 | mapping_end.append(j + len(token)) 53 | i += 1 54 | else: 55 | j += 1 56 | if j >= max_j: 57 | break 58 | 59 | for _ in range(len(tokens) - len(mapping_start)): 60 | mapping_start.append(max_j + 1000) 61 | mapping_end.append(max_j + 1000) 62 | 63 | return mapping_start, mapping_end 64 | 65 | 66 | def find_boyer_moore(T, P, start=0): 67 | """ BM algorithm for string match. """ 68 | 69 | n, m = len(T), len(P) 70 | last = {} 71 | for k in range(m): 72 | last[P[k]] = k 73 | 74 | # align end of pattern at index m-1 of text 75 | i = start + m - 1 76 | k = m - 1 77 | while i < n: 78 | if T[i] == P[k]: 79 | if k == 0: 80 | return i 81 | i -= 1 82 | k -= 1 83 | else: 84 | j = last.get(T[i], -1) 85 | i += m - min(k, j + 1) 86 | k = m - 1 87 | return -1 88 | 89 | 90 | def find_all_boyer_moore(T, P): 91 | start_ids = [] 92 | start = 0 93 | while True: 94 | start_position = find_boyer_moore( 95 | T, P, start=start) 96 | if start_position == -1: 97 | break 98 | start_ids.append(start_position) 99 | start = start_position + len(P) 100 | return start_ids 101 | 102 | 103 | def is_english_char(char): 104 | if re.findall("[a-zA-Z]", char): 105 | return True 106 | return False 107 | 108 | 109 | def is_numeric_char(char): 110 | if re.findall(r"[\d]", char): 111 | return True 112 | return False 113 | 114 | 115 | def is_whitespace(char): 116 | """Checks whether `chars` is a whitespace character.""" 117 | 118 | # \t, \n, and \r are technically contorl characters but we treat them 119 | # as whitespace since they are generally considered as such. 
120 | if char in (" ", "\t", "\n", "\r"): 121 | return True 122 | cat = unicodedata.category(char) 123 | if cat == "Zs": 124 | return True 125 | return False 126 | 127 | 128 | def is_control(char): 129 | """Checks whether `chars` is a control character.""" 130 | 131 | # These are technically control characters but we count them as whitespace 132 | # characters. 133 | if char in ("\t", "\n", "\r"): 134 | return False 135 | cat = unicodedata.category(char) 136 | if cat in ("Cc", "Cf"): 137 | return True 138 | return False 139 | 140 | 141 | def is_punctuation(char): 142 | """Checks whether `chars` is a punctuation character.""" 143 | ord_id = ord(char) 144 | 145 | # We treat all non-letter/number ASCII as punctuation. 146 | # Characters such as "^", "$", and "`" are not in the Unicode 147 | # Punctuation class but we treat them as punctuation anyways, for 148 | # consistency. 149 | if (ord_id >= 33 and ord_id <= 47) or \ 150 | (ord_id >= 58 and ord_id <= 64) or \ 151 | (ord_id >= 91 and ord_id <= 96) or \ 152 | (ord_id >= 123 and ord_id <= 126): 153 | return True 154 | cat = unicodedata.category(char) 155 | if cat.startswith("P"): 156 | return True 157 | return False 158 | 159 | 160 | def is_chinese_char(ord_id): 161 | """Checks whether ord_id is the codepoint of a CJK character.""" 162 | # This defines a `Chinese character` as anything in the CJK 163 | # Unicode block: 164 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 165 | # 166 | # Note that the CJK Unicode block is NOT all Japanese and 167 | # Korean characters, despite its name. The modern Korean Hangul 168 | # alphabet is a different block, as is Japanese Hiragana and 169 | # Katakana. Those alphabets are used to write space-separated 170 | # words, so they are not treated specially and handled like the 171 | # all of the other languages. 172 | if (ord_id >= 0x4E00 and ord_id <= 0x9FFF) or \ 173 | (ord_id >= 0x3400 and ord_id <= 0x4DBF) or \ 174 | (ord_id >= 0x20000 and ord_id <= 0x2A6DF) or \ 175 | (ord_id >= 0x2A700 and ord_id <= 0x2B73F) or \ 176 | (ord_id >= 0x2B740 and ord_id <= 0x2B81F) or \ 177 | (ord_id >= 0x2B820 and ord_id <= 0x2CEAF) or \ 178 | (ord_id >= 0xF900 and ord_id <= 0xFAFF) or \ 179 | (ord_id >= 0x2F800 and ord_id <= 0x2FA1F): 180 | return True 181 | return False 182 | -------------------------------------------------------------------------------- /uf/com/tfrecords.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | from ..third import tf 4 | 5 | BACKUP_DATA = "ex:" # data with the prefix `ex:` will not be fed into Tensorflow graph 6 | 7 | 8 | def write_tfrecords(data, tfrecords_file): 9 | """ Write data into tfrecords file. 
""" 10 | 11 | writer = tf.python_io.TFRecordWriter(tfrecords_file) 12 | keys = [] 13 | values = [] 14 | for key, value in data.items(): 15 | if key.startswith(BACKUP_DATA): 16 | continue 17 | keys.append(key) 18 | values.append(value) 19 | examples = zip(*values) 20 | 21 | for example in examples: 22 | features = collections.OrderedDict() 23 | for i, value in enumerate(example): 24 | if isinstance(value, int): 25 | features[keys[i]] = create_int_feature([value]) 26 | elif isinstance(value, float): 27 | features[keys[i]] = create_float_feature([value]) 28 | elif value.dtype.name.startswith("int"): 29 | features[keys[i]] = create_int_feature(value.tolist()) 30 | elif value.dtype.name.startswith("float"): 31 | features[keys[i]] = create_float_feature(value.tolist()) 32 | else: 33 | raise ValueError("Invalid data type: %s." % type(value)) 34 | tf_example = tf.train.Example(features=tf.train.Features(feature=features)) 35 | writer.write(tf_example.SerializeToString()) 36 | 37 | 38 | def get_tfrecords_keys(tfrecords_file): 39 | """ Read keys from tfrecords file. """ 40 | iterator = tf.python_io.tf_record_iterator(tfrecords_file) 41 | record = next(iterator) 42 | example = tf.train.Example() 43 | example.ParseFromString(record) 44 | return list(example.features.feature.keys()) 45 | 46 | 47 | def get_tfrecords_length(tfrecords_files): 48 | """ Count number of data in tfrecords files. """ 49 | n = 0 50 | for tfrecords_file in tfrecords_files: 51 | for _ in tf.python_io.tf_record_iterator(tfrecords_file): 52 | n += 1 53 | return n 54 | 55 | 56 | def convert_placeholder_to_feature(placeholder): 57 | """ Convert `PlaceHolder` for feeding data in memory into `FixedLenFeature` for local TFRecords. """ 58 | if placeholder.dtype.name.startswith("int"): 59 | dtype = tf.int64 60 | elif placeholder.dtype.name.startswith("float"): 61 | dtype = tf.float32 62 | else: 63 | raise ValueError(f"Unsupported dtype: {placeholder.dtype}.") 64 | return tf.FixedLenFeature(list(placeholder.shape)[1:], dtype) 65 | 66 | 67 | def create_int_feature(values): 68 | """ Convert list of values into tf-serializable Int64. """ 69 | if not isinstance(values, list): 70 | values = [values] 71 | feature = tf.train.Feature(int64_list=tf.train.Int64List(value=values)) 72 | return feature 73 | 74 | 75 | def create_float_feature(values): 76 | """ Convert list of values into tf-serializable Float. """ 77 | if not isinstance(values, list): 78 | values = [values] 79 | feature = tf.train.Feature(float_list=tf.train.FloatList(value=values)) 80 | return feature 81 | -------------------------------------------------------------------------------- /uf/task/__init__.py: -------------------------------------------------------------------------------- 1 | from .init import Initialization 2 | from .train import Training 3 | from .train_adversarial import AdversarialTraining 4 | from .infer import Inference 5 | from .score import Scoring 6 | from .export import Exportation 7 | 8 | 9 | __all__ = [ 10 | "Training", 11 | "AdversarialTraining", 12 | "Initialization", 13 | "Inference", 14 | "Scoring", 15 | "Exportation", 16 | ] 17 | -------------------------------------------------------------------------------- /uf/task/_base_.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | from ..third import tf 6 | from .. import com 7 | 8 | 9 | class Task: 10 | """ Parent class of all tasks. 
11 | 12 | This is an internal class that does not provide interface for outside requests.""" 13 | 14 | def __init__(self, module): 15 | self.module = module 16 | 17 | @abstractmethod 18 | def run(self, *args, **kwargs): 19 | raise NotImplementedError() 20 | 21 | def _build_graph(self): 22 | """ Build computation graph. """ 23 | self.module._graph_mode = "infer" 24 | self.module._set_placeholders() 25 | _, self.module.tensors = self.module._parallel_forward(is_training=False) 26 | 27 | def _init_session(self, ignore_checkpoint=False): 28 | """ Initialize Tensorflow session. """ 29 | com.count_params(self.module.global_variables, self.module.trainable_variables) 30 | 31 | if self.module._gpu_ids: 32 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(self.module._gpu_ids) 33 | else: 34 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPUs 35 | config = tf.ConfigProto( 36 | allow_soft_placement=True, 37 | gpu_options=tf.GPUOptions(allow_growth=True, per_process_gpu_memory_fraction=1.0), 38 | ) 39 | self.module.sess = tf.Session(graph=self.module.graph, config=config) 40 | self._init_variables(self.module.global_variables, ignore_checkpoint=ignore_checkpoint) 41 | self.module._session_built = True 42 | 43 | def _init_variables(self, variables, ignore_checkpoint=False): 44 | """ Initialize variables in the session. """ 45 | 46 | # randomly initialize variables 47 | tf.logging.info("Running local_init_op") 48 | local_init_op = tf.variables_initializer(variables) 49 | self.module.sess.run(local_init_op) 50 | self.module._inited_vars |= set(variables) 51 | tf.logging.info("Done running local_init_op") 52 | 53 | # read from checkpoint file 54 | if not ignore_checkpoint and self.module.init_checkpoint: 55 | checkpoint_path = com.get_checkpoint_path(self.module.init_checkpoint) 56 | if not checkpoint_path: 57 | raise ValueError( 58 | "Checkpoint file \"%s\" does not exist. Make sure you pass correct value to " 59 | "`init_checkpoint`." 60 | % self.module.init_checkpoint 61 | ) 62 | self.module.init_checkpoint = checkpoint_path # rectified path replacement 63 | 64 | # `continual` means we tend to succeed the training step and momentums variables " 65 | # "stored in the checkpoint file 66 | continual = os.path.dirname(checkpoint_path) == self.module.output_dir 67 | if continual: 68 | self.module.step = int(checkpoint_path.split("-")[-1]) 69 | 70 | # build a bridge between the variables in checkpoint file and the variables in the graph 71 | (assignment_map, uninited_vars) = com.get_assignment_map(checkpoint_path, variables, continual=continual) 72 | self.module.assignment_map = assignment_map 73 | self.module.uninited_vars = uninited_vars 74 | 75 | if uninited_vars: 76 | tf.logging.info( 77 | "%d (out of %d) local variables failed to match up with the checkpoint file. " 78 | "Check more details through `.uninited_vars`." 79 | % (len(uninited_vars), len(assignment_map) + len(uninited_vars)) 80 | ) 81 | 82 | if not self.module.assignment_map: # no variables to restore 83 | return 84 | loader = tf.train.Saver(self.module.assignment_map) 85 | loader.restore(self.module.sess, checkpoint_path) 86 | 87 | if "_global_step" in self.module.__dict__: 88 | self.module.sess.run(tf.assign(self.module._global_step, self.module.step)) 89 | 90 | def _build_feed_dict(self): 91 | """ Build `feed dict` for the current batch of data. 
""" 92 | 93 | feed_dict = {} 94 | for key, data in self.module.data.items(): 95 | if key.startswith(com.BACKUP_DATA): # not to feed 96 | continue 97 | 98 | # move pointer and form the batch 99 | ptr = self._ptr 100 | batch = data[ptr: ptr + self.module.batch_size] 101 | ptr += self.module.batch_size 102 | 103 | # fill up the batch 104 | while len(batch) < self.module.batch_size: 105 | ptr = self.module.batch_size - len(batch) 106 | remainder = data[:ptr] 107 | concat_func = np.vstack if len(batch.shape) > 1 else np.hstack 108 | batch = concat_func((batch, remainder)) 109 | 110 | placeholder = self.module.placeholders[key] 111 | feed_dict[placeholder] = batch 112 | 113 | self._ptr = ptr 114 | return feed_dict 115 | -------------------------------------------------------------------------------- /uf/task/export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | from ..third import tf 6 | from ._base_ import Task 7 | 8 | 9 | class Exportation(Task): 10 | """ Export model into PB file. """ 11 | 12 | def run(self, export_dir, rename_inputs=None, rename_outputs=None, ignore_inputs=None, ignore_outputs=None): 13 | 14 | # build graph 15 | self._build_graph() 16 | 17 | # init session 18 | if not self.module._session_built: 19 | self._init_session() 20 | 21 | def set_input(key, value): 22 | inputs[key] = tf.saved_model.utils.build_tensor_info(value) 23 | tf.logging.info("Register Input: %s, %s, %s" % (key, value.shape.as_list(), value.dtype.name)) 24 | 25 | def set_output(key, value): 26 | outputs[key] = tf.saved_model.utils.build_tensor_info(value) 27 | tf.logging.info("Register Output: %s, %s, %s" % (key, value.shape.as_list(), value.dtype.name)) 28 | 29 | # define inputs 30 | inputs = {} 31 | if not ignore_inputs: 32 | ignore_inputs = [] 33 | for key, value in list(self.module.placeholders.items()): 34 | if key in ignore_inputs: 35 | continue 36 | if rename_inputs and key in rename_inputs: 37 | key = rename_inputs[key] 38 | set_input(key, value) 39 | 40 | # define outputs 41 | outputs = {} 42 | if not ignore_outputs: 43 | ignore_outputs = [] 44 | for key, value in self.module.tensors.items(): 45 | if key in ignore_outputs: 46 | continue 47 | if rename_outputs and key in rename_outputs: 48 | key = rename_outputs[key] 49 | set_output(key, value) 50 | 51 | # build signature 52 | signature = tf.saved_model.signature_def_utils.build_signature_def( 53 | inputs, outputs, tf.saved_model.signature_constants.PREDICT_METHOD_NAME, 54 | ) 55 | signature_def_map = {"predict": signature} 56 | tf.logging.info("Register Signature: predict") 57 | 58 | legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op") 59 | builder_path = os.path.join(export_dir, time.strftime("%Y%m%d%H%M%S")) 60 | 61 | # solve the path problem 62 | if sys.platform.startswith("win"): 63 | builder_path = builder_path.replace("/", "\\") 64 | 65 | # exportation 66 | try: 67 | builder = tf.saved_model.builder.SavedModelBuilder(builder_path) 68 | builder.add_meta_graph_and_variables( 69 | self.module.sess, 70 | [tf.saved_model.tag_constants.SERVING], 71 | signature_def_map=signature_def_map, 72 | legacy_init_op=legacy_init_op, 73 | ) 74 | except ValueError: 75 | raise ValueError( 76 | "Twice exportation is not allowed. Try `.save()` and " 77 | "`.reset()` method to save and reset the graph before " 78 | "next exportation." 
79 | ) 80 | builder.save() 81 | -------------------------------------------------------------------------------- /uf/task/infer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from ..third import tf 4 | from ._base_ import Task 5 | 6 | 7 | class Inference(Task): 8 | """ Inference, as its name. """ 9 | 10 | def run(self): 11 | 12 | # confirm inputs 13 | n_inputs = len(list(self.module.data.values())[0]) 14 | if not n_inputs: 15 | raise ValueError("0 input samples recognized.") 16 | 17 | # build graph 18 | if self.module._graph_mode != "infer" and not self.module._debug: 19 | self._build_graph() 20 | 21 | # init session 22 | if not self.module._session_built: 23 | self._init_session() 24 | 25 | tf.logging.info("Running inference on %d samples", n_inputs) 26 | 27 | # inference 28 | self._ptr = 0 29 | last_tic = time.time() 30 | last_step = 0 31 | batch_outputs = [] 32 | total_steps = (n_inputs - 1) // self.module.batch_size + 1 33 | for step in range(total_steps): 34 | last_tic, last_step = self._predict_one_batch( 35 | step + 1, last_tic, last_step, total_steps, batch_outputs, 36 | ) 37 | 38 | output_arrays = list(zip(*batch_outputs)) 39 | return self.module._get_predict_outputs(output_arrays, n_inputs) 40 | 41 | def _predict_one_batch(self, step, last_tic, last_step, total_steps, batch_outputs): 42 | feed_dict = self._build_feed_dict() 43 | predict_ops = self.module._get_predict_ops() 44 | output_arrays = self.module.sess.run(predict_ops, feed_dict=feed_dict) 45 | batch_outputs.append(output_arrays) 46 | 47 | # print 48 | diff_tic = time.time() - last_tic 49 | process = step / total_steps 50 | if (diff_tic > 10 and process >= 0.005) or step == total_steps: 51 | info = "process %.1f%%" % (process * 100) 52 | 53 | # print inference efficiency 54 | info += ", %.2f examples/sec" % ((step - last_step) / diff_tic * self.module.batch_size) 55 | 56 | tf.logging.info(info) 57 | last_tic = time.time() 58 | last_step = step 59 | 60 | return last_tic, last_step 61 | -------------------------------------------------------------------------------- /uf/task/init.py: -------------------------------------------------------------------------------- 1 | from ..third import tf 2 | from ._base_ import Task 3 | 4 | 5 | class Initialization(Task): 6 | """ Initialze the model, make it ready for inference. """ 7 | 8 | def run(self, reinit_all, ignore_checkpoint): 9 | 10 | # build graph 11 | if self.module._graph_mode is None: 12 | self._build_graph() 13 | 14 | # init session 15 | if reinit_all or not self.module._session_built: 16 | self._init_session(ignore_checkpoint=ignore_checkpoint) 17 | 18 | # init uninitialized variables 19 | else: 20 | variables = [] 21 | for var in self.module.global_variables: 22 | if var not in self.module._inited_vars: 23 | variables.append(var) 24 | if variables: 25 | self._init_variables(variables, ignore_checkpoint=ignore_checkpoint) 26 | else: 27 | tf.logging.info( 28 | "Global variables already initialized. To re-initialize all, " 29 | "pass `reinit_all` to True." 30 | ) 31 | -------------------------------------------------------------------------------- /uf/task/score.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from ..third import tf 4 | from ._base_ import Task 5 | 6 | 7 | class Scoring(Task): 8 | """ Infer the data and score the performance. 
""" 9 | 10 | def run(self): 11 | 12 | # confirm inputs 13 | n_inputs = len(list(self.module.data.values())[0]) 14 | if not n_inputs: 15 | raise ValueError("0 input samples recognized.") 16 | 17 | # build graph 18 | if self.module._graph_mode != "infer" and not self.module._debug: 19 | self._build_graph() 20 | 21 | # init session 22 | if not self.module._session_built: 23 | self._init_session() 24 | 25 | tf.logging.info("Running scoring on %d samples", n_inputs) 26 | 27 | # scoring 28 | self._ptr = 0 29 | last_tic = time.time() 30 | last_step = 0 31 | batch_outputs = [] 32 | total_steps = (n_inputs - 1) // self.module.batch_size + 1 33 | for step in range(total_steps): 34 | last_tic, last_step = self._score_one_batch( 35 | step + 1, last_tic, last_step, total_steps, batch_outputs, 36 | ) 37 | 38 | output_arrays = list(zip(*batch_outputs)) 39 | return self.module._get_score_outputs(output_arrays, n_inputs) 40 | 41 | def _score_one_batch(self, step, last_tic, last_step, total_steps, batch_outputs): 42 | feed_dict = self._build_feed_dict() 43 | score_ops = self.module._get_score_ops() 44 | output_arrays = self.module.sess.run(score_ops, feed_dict=feed_dict) 45 | batch_outputs.append(output_arrays) 46 | 47 | # print 48 | diff_tic = time.time() - last_tic 49 | process = step / total_steps 50 | if (diff_tic > 10 and process >= 0.005) or step == total_steps: 51 | info = "process %.1f%%" % (process * 100) 52 | 53 | # print scoring efficiency 54 | info += ", %.2f examples/sec" % ((step - last_step) / diff_tic * self.module.batch_size) 55 | 56 | tf.logging.info(info) 57 | last_tic = time.time() 58 | last_step = step 59 | 60 | return last_tic, last_step 61 | -------------------------------------------------------------------------------- /uf/third.py: -------------------------------------------------------------------------------- 1 | """ Version control of dependencies. """ 2 | 3 | import tensorflow as tf 4 | 5 | 6 | if tf.__version__.startswith("2"): 7 | import tensorflow.compat.v1 as tf 8 | tf.disable_eager_execution() 9 | -------------------------------------------------------------------------------- /uf/token/__init__.py: -------------------------------------------------------------------------------- 1 | from .wordpiece import WordPieceTokenizer 2 | try: 3 | from .sentencepiece import SentencePieceTokenizer 4 | except: 5 | pass 6 | 7 | __all__ = [ 8 | "WordPieceTokenizer", 9 | "SentencePieceTokenizer", 10 | ] 11 | -------------------------------------------------------------------------------- /uf/token/sentencepiece.py: -------------------------------------------------------------------------------- 1 | """ SentencePiece tokenizer class. 2 | Code revised from XLNet team's implementation of XLNet. 3 | See `https://github.com/zihangdai/xlnet`. 4 | """ 5 | 6 | import os 7 | import unicodedata 8 | from sentencepiece import SentencePieceProcessor 9 | 10 | 11 | class SentencePieceTokenizer: 12 | def __init__(self, spm_file, do_lower_case=True): 13 | if not os.path.exists(spm_file): 14 | raise ValueError( 15 | "Can't find spm_file \"%s\". " 16 | "Please pass the correct path of sentence-piece model file, " 17 | "e.g.`spiece.model`." 
% spm_file 18 | ) 19 | self.processor = SentencePieceProcessor() 20 | self.processor.Load(spm_file) 21 | self.do_lower_case = do_lower_case 22 | 23 | def tokenize(self, text): 24 | text = preprocess_text(text, lower=self.do_lower_case) 25 | pieces = encode_pieces(self.processor, text, sample=False) 26 | return pieces 27 | 28 | def convert_tokens_to_ids(self, tokens): 29 | return [self.processor.PieceToId(piece) for piece in tokens] 30 | 31 | def convert_ids_to_tokens(self, ids): 32 | pieces = [self.processor.IdToPiece(_id) for _id in ids] 33 | return pieces 34 | 35 | 36 | def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False): 37 | if remove_space: 38 | outputs = " ".join(inputs.strip().split()) 39 | else: 40 | outputs = inputs 41 | outputs = outputs.replace("``", '"').replace("''", '"') 42 | 43 | if not keep_accents: 44 | outputs = unicodedata.normalize("NFKD", outputs) 45 | outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) 46 | if lower: 47 | outputs = outputs.lower() 48 | 49 | return outputs 50 | 51 | 52 | def encode_pieces(sp_model, text, sample=False): 53 | 54 | if not sample: 55 | pieces = sp_model.EncodeAsPieces(text) 56 | else: 57 | pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) 58 | new_pieces = [] 59 | for piece in pieces: 60 | if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): 61 | cur_pieces = sp_model.EncodeAsPieces(piece[:-1].replace("▁", "")) 62 | if piece[0] != "▁" and cur_pieces[0][0] == "▁": 63 | if len(cur_pieces[0]) == 1: 64 | cur_pieces = cur_pieces[1:] 65 | else: 66 | cur_pieces[0] = cur_pieces[0][1:] 67 | cur_pieces.append(piece[-1]) 68 | new_pieces.extend(cur_pieces) 69 | else: 70 | new_pieces.append(piece) 71 | return new_pieces 72 | --------------------------------------------------------------------------------
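The tokenizer above is only importable when the `sentencepiece` package is installed (the guarded import in `uf/token/__init__.py` silently skips it otherwise). A minimal usage sketch, assuming the bundled `ref/spiece.model` file is used as the model:

from uf.token import SentencePieceTokenizer

# Load a SentencePiece model file; the repository ships one under ref/.
tokenizer = SentencePieceTokenizer("ref/spiece.model", do_lower_case=True)

# tokenize() first normalizes (and lowercases) the text via preprocess_text(),
# then segments it into sub-word pieces with encode_pieces().
pieces = tokenizer.tokenize("UNIF makes fine-tuning easier.")
ids = tokenizer.convert_tokens_to_ids(pieces)
tokens = tokenizer.convert_ids_to_tokens(ids)
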
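For reference, a usage sketch of the configuration-restore helper defined in `uf/com/cache.py` (the source itself points users at `uf.restore()`). The key `"demo_bert"`, the `./.unif` contents and the checkpoint path are hypothetical; the file is whatever a previously cached model wrote out.

import uf

# Rebuild a model from the configuration stored under a (hypothetical) key.
model = uf.restore("demo_bert", from_file="./.unif")

# Keyword arguments override the stored __init__ arguments, e.g. to point the
# restored model at a different checkpoint (hypothetical path).
model = uf.restore("demo_bert", from_file="./.unif", init_checkpoint="./another_ckpt")
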
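A sketch of how `get_checkpoint_path` in `uf/com/checkpoint.py` resolves a checkpoint prefix, assuming the wildcard re-exports in `uf/com/__init__.py`; the directory layout is made up.

from uf.com import get_checkpoint_path

# Suppose ./output_dir holds model.ckpt-8000.index / .data files and, optionally,
# a "checkpoint" record file written by tf.train.Saver.
get_checkpoint_path("./output_dir")                  # -> "./output_dir/model.ckpt-8000"
get_checkpoint_path("./output_dir/model.ckpt-8000")  # -> same prefix, found via the ".index" probe
get_checkpoint_path("./somewhere_empty")             # -> None when no checkpoint is detected
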
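And a sketch of the in-place truncation helper `truncate_segments` from `uf/com/com.py`, using made-up token-id lists (total length 8, truncated to a maximum of 6):

from uf.com import truncate_segments

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6)                               # default "LIFO": trim from the end of the last segment
# segments is now [[1, 2, 3, 4, 5], [6]]

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6, truncate_method="FIFO")       # trim from the front of the first segment
# segments is now [[3, 4, 5], [6, 7, 8]]

segments = [[1, 2, 3, 4, 5], [6, 7, 8]]
truncate_segments(segments, 6, truncate_method="longer-FO")  # always trim the currently longest segment
# segments is now [[1, 2, 3], [6, 7, 8]]
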