├── .gitignore ├── README.md ├── data └── data_creator.py ├── rnn_segment_completed.py ├── rnn_segment_exercise.ipynb ├── utils.py └── word2vec ├── word2vec.ipynb └── word2vec_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .idea* 92 | 93 | # data folders 94 | _*/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf-text-workshop 2 | "Deep Learning with TensorFlow Workshop 3 - Deep Learning for Text" 3 | -------------------------------------------------------------------------------- /data/data_creator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import glob 4 | from functools import * 5 | import itertools 6 | import tensorflow as tf 7 | import numpy as np 8 | import random 9 | import re 10 | from multiprocessing import Pool 11 | 12 | MARKS = {'<NE>': '', '</NE>': '', '<AB>': '', '</AB>': ''}  # BEST corpus NE/AB markup tags, stripped from the text before processing 13 | 14 | ALL_CHAR = [ 15 | '', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', 16 | ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', 17 | '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 18 | 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 19 | 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 20 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 21 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 22 | 'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 23 | 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 24 | 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 25 | 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 26 | 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ', 27 | 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓', 28 | '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff', 'other' 29 | ] 30 | CHARS_MAP = {v: k for
k, v in enumerate(ALL_CHAR)} 31 | IDX_MAP = dict(list(enumerate(ALL_CHAR))) 32 | OTHER_KEY = max(CHARS_MAP.values()) 33 | CLASS_MAP = {'B': 0, 'M': 1, 'E': 2, 'S': 3} 34 | 35 | 36 | def word_to_idx(word): 37 | w_size = len(word) 38 | w_idx = list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), word)) 39 | label = [] 40 | if w_size == 1: 41 | label += [3] 42 | else: 43 | label = [0] + list(np.repeat([1], w_size - 2)) + [2] 44 | 45 | return w_idx, label 46 | 47 | 48 | def get_feature(tokens, k): 49 | n_tokens = len(tokens) 50 | padded_tokens = tokens + [0] * k 51 | res = [] 52 | for i in range(n_tokens): 53 | res.append(padded_tokens[i:i + k + 1]) 54 | return res 55 | 56 | 57 | def make_example(seq_features, labels, key): 58 | # The object we return 59 | ex = tf.train.SequenceExample() 60 | # A non-sequential feature of our example 61 | sequence_length = len(seq_features) 62 | ex.context.feature["seq_length"].int64_list.value.append(sequence_length) 63 | ex.context.feature["key"].int64_list.value.append(key) 64 | # Feature lists for the two sequential features of our example 65 | fl_tokens = ex.feature_lists.feature_list["seq_feature"] 66 | fl_labels = ex.feature_lists.feature_list["label"] 67 | for feature, label in zip(seq_features, labels): 68 | fl_tokens.feature.add().int64_list.value.extend(feature) 69 | fl_labels.feature.add().int64_list.value.append(label) 70 | return ex 71 | 72 | 73 | def save_to_tfrecords(data_path, output_path, type, k): 74 | all_files = glob.glob(os.path.join(data_path, '*.txt')) 75 | random.shuffle(all_files) 76 | train_size = int(0.8 * len(all_files)) 77 | train = all_files[:train_size] 78 | test = all_files[train_size:] 79 | 80 | def write(files, prefix, type): 81 | if not os.path.isdir(os.path.join(os.getcwd(), output_path, prefix)): 82 | os.makedirs(os.path.join(output_path, prefix)) 83 | for file in files: 84 | words_all = [] 85 | print(file) 86 | lines = open(file, 'r', encoding='utf-8') 87 | for line in lines: 88 | line = reduce(lambda a, kv: a.replace(*kv), MARKS.items(), line) 89 | sentence = line.split(" ") 90 | words = [[word for word in s.split("|") if word not in ['', '\n']] for s in sentence] 91 | words = filter(lambda x: len(x) > 0, words) 92 | words_all.extend(list(words)) 93 | lines.close() 94 | word_idxs = list(map(lambda s: list(map(lambda w: word_to_idx(w), s)), words_all)) 95 | st_idx, label = map(list, zip( 96 | *list( 97 | map(lambda s: tuple(map(lambda x: list(itertools.chain.from_iterable(x)), list(zip(*s)))), 98 | word_idxs)))) 99 | input_feature = list(map(lambda x: get_feature(x, k), st_idx)) 100 | 101 | # Write all examples into a TFRecords file 102 | f_name = re.search('([0-9].*).txt', file).group(1) 103 | 104 | with open(os.path.join(output_path, prefix, type + '_' + f_name + '.tf'), 'w') as fp: 105 | writer = tf.python_io.TFRecordWriter(fp.name) 106 | for key, sequence in enumerate(zip(input_feature, label)): 107 | seq_input, label = sequence 108 | ex = make_example(seq_input, label, key) 109 | writer.write(ex.SerializeToString()) 110 | writer.close() 111 | 112 | write(train, "train", type) 113 | write(test, "test", type) 114 | 115 | 116 | def read_and_decode_single_example(filenames, shuffle=False, num_epochs=None): 117 | # first construct a queue containing a list of filenames. 
118 | # this lets a user split up there dataset in multiple files to keep size down 119 | # filename_queue = tf.train.string_input_producer([filename], num_epochs=10) 120 | filename_queue = tf.train.string_input_producer(filenames, 121 | shuffle=shuffle, num_epochs=num_epochs) 122 | 123 | reader = tf.TFRecordReader() 124 | # One can read a single serialized example from a filename 125 | # serialized_example is a Tensor of type string. 126 | _, serialized_ex = reader.read(filename_queue) 127 | context, sequences = tf.parse_single_sequence_example(serialized_ex, 128 | context_features={ 129 | "seq_length": tf.FixedLenFeature([], dtype=tf.int64) 130 | }, 131 | sequence_features={ 132 | "seq_feature": tf.VarLenFeature(dtype=tf.int64), 133 | "label": tf.VarLenFeature(dtype=tf.int64) 134 | }) 135 | return context, sequences 136 | 137 | 138 | def fn(cat): 139 | return save_to_tfrecords("_BEST/" + cat, "_tf_records_k2", cat, 2) 140 | 141 | 142 | if __name__ == "__main__": 143 | category = ['article', 'encyclopedia', 'news', 'novel'] 144 | p = Pool(4) 145 | p.map(fn, category) 146 | -------------------------------------------------------------------------------- /rnn_segment_completed.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | from tensorflow.contrib import metrics 4 | from tensorflow.contrib import layers 5 | from tensorflow.contrib import learn 6 | from tensorflow.contrib.learn import * 7 | from tensorflow.contrib import seq2seq 8 | from data.data_creator import * 9 | from utils import * 10 | 11 | tf.logging.set_verbosity(tf.logging.INFO) 12 | 13 | print(tf.__version__) 14 | 15 | 16 | def rnn_segment(features, targets, mode, params): 17 | seq_feature = features['seq_feature'] 18 | seq_length = features['seq_length'] 19 | with tf.variable_scope("emb"): 20 | embeddings = tf.get_variable("char_emb", shape=[params['num_char'], params['emb_size']]) 21 | seq_emb = tf.nn.embedding_lookup(embeddings, seq_feature) 22 | batch_size = tf.shape(seq_feature)[0] 23 | time_step = tf.shape(seq_feature)[1] 24 | flat_seq_emb = tf.reshape(seq_emb, shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']]) 25 | cell = rnn.LSTMCell(params['rnn_units']) 26 | if mode == ModeKeys.TRAIN: 27 | cell = rnn.DropoutWrapper(cell, params['input_keep_prob'], params['output_keep_prob']) 28 | projection_cell = rnn.OutputProjectionWrapper(cell, params['num_class']) 29 | logits, _ = tf.nn.dynamic_rnn(projection_cell, flat_seq_emb, sequence_length=seq_length, dtype=tf.float32) 30 | weight_mask = tf.to_float(tf.sequence_mask(seq_length)) 31 | loss = seq2seq.sequence_loss(logits, targets, weights=weight_mask) 32 | train_op = layers.optimize_loss( 33 | loss=loss, 34 | global_step=tf.contrib.framework.get_global_step(), 35 | learning_rate=params["learning_rate"], 36 | optimizer=tf.train.AdamOptimizer, 37 | clip_gradients=params['grad_clip'], 38 | summaries=[ 39 | "learning_rate", 40 | "loss", 41 | "gradients", 42 | "gradient_norm", 43 | ]) 44 | pred_classes = tf.to_int32(tf.argmax(input=logits, axis=2)) 45 | pred_words = tf.logical_or(tf.equal(pred_classes, 0), tf.equal(pred_classes, 3)) 46 | target_words = tf.logical_or(tf.equal(targets, 0), tf.equal(targets, 3)) 47 | precision = metrics.streaming_precision(pred_words, target_words, weights=weight_mask) 48 | recall = metrics.streaming_recall(pred_words, target_words, weights=weight_mask) 49 | predictions = { 50 | "classes": pred_classes 51 | } 52 | eval_metric_ops 
= { 53 | "precision": precision, 54 | "recall": recall 55 | } 56 | return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops) 57 | 58 | 59 | if __name__ == "__main__": 60 | model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256, 61 | input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2) 62 | rnn_model = learn.Estimator(model_fn=rnn_segment 63 | , params=model_params 64 | , model_dir="model/_rnn_model" 65 | , config=learn.RunConfig(save_checkpoints_secs=30, 66 | keep_checkpoint_max=2)) 67 | 68 | train_input_fn = data_provider("data/_tf_records/train", batch_size=128) 69 | test_input_fn = data_provider("data/_tf_records/test", batch_size=512) 70 | 71 | validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn, 72 | eval_steps=10, 73 | every_n_steps=500, 74 | name='validation') 75 | 76 | # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor]) 77 | rnn_model.evaluate(input_fn=test_input_fn, steps=10) 78 | 79 | text = """นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ""" 80 | tudkum(text, rnn_model, model_params['k']) 81 | -------------------------------------------------------------------------------- /rnn_segment_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "from tensorflow.contrib import rnn\n", 11 | "from tensorflow.contrib import metrics\n", 12 | "from tensorflow.contrib import layers\n", 13 | "from tensorflow.contrib import learn\n", 14 | "from tensorflow.contrib.learn import *\n", 15 | "from tensorflow.contrib import seq2seq\n", 16 | "\n", 17 | "tf.logging.set_verbosity(tf.logging.INFO)\n", 18 | "print(tf.__version__)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Import helper functions for processing data\n", 26 | "- [data_creator.py](/edit/data/data_creator.py)\n", 27 | "- [utils.py](/edit/utils.py)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from data.data_creator import *\n", 39 | "from utils import *" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Define RNN model function for word segmentation" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def rnn_segment(features, targets, mode, params):\n", 58 | " seq_feature = features['seq_feature']\n", 59 | " seq_length = features['seq_length']\n", 60 | " \n", 61 | " # Create a variable matrix in which each row represents an embedding vector for a given character.\n", 62 | " with tf.variable_scope(\"emb\"):\n", 63 | " embeddings = tf.get_variable(\"char_emb\", shape=[params[???], params[???]])\n", 64 | " \n", 65 | " # Convert sequence features to sequence embeddings \n", 66 | " seq_emb = tf.nn.embedding_lookup(params=???, ids=???)\n", 67 | " \n", 68 | " # Flatten each sequence embedding\n", 69 | " # [[1st char embedding], [2nd char embedding], [3rd char embedding]] => [flatten seq embedding]\n", 70 | " batch_size = tf.shape(seq_feature)[0]\n", 71 | " time_step = tf.shape(seq_feature)[1]\n", 72 | " flat_seq_emb = tf.reshape(seq_emb, 
shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']])\n", 73 | " \n", 74 | " cell = rnn.LSTMCell(num_units=???)\n", 75 | " if mode == ModeKeys.TRAIN:\n", 76 | " # Create a cell with added input and output dropout with given probabilities.\n", 77 | " cell = rnn.DropoutWrapper(cell, ???)\n", 78 | " \n", 79 | " # Project output from RNN cell into classes\n", 80 | " projection_cell = rnn.OutputProjectionWrapper(cell, ???)\n", 81 | " \n", 82 | " # Create a recurrent neural network from the projection_cell\n", 83 | " logits, _ = tf.nn.dynamic_rnn(???, dtype=tf.float32)\n", 84 | " \n", 85 | " # Training samples in each batch might have different lengths. \n", 86 | " # We put them in the same matrix by padding each sample to have the same length as the longest sample. \n", 87 | " # We need to create a mask tensor to avoid accounting for losses from the padding. \n", 88 | " weight_mask = tf.to_float(tf.sequence_mask(seq_length))\n", 89 | " loss = seq2seq.sequence_loss(???)\n", 90 | " \n", 91 | " train_op = layers.optimize_loss(\n", 92 | " loss=loss,\n", 93 | " global_step=tf.contrib.framework.get_global_step(),\n", 94 | " learning_rate=params[\"learning_rate\"],\n", 95 | " optimizer=tf.train.AdamOptimizer,\n", 96 | " clip_gradients=params['grad_clip'],\n", 97 | " summaries=[\n", 98 | " \"learning_rate\",\n", 99 | " \"loss\",\n", 100 | " \"gradients\",\n", 101 | " \"gradient_norm\",\n", 102 | " ])\n", 103 | " \n", 104 | " # Determine the predicted classes from logits\n", 105 | " pred_classes = tf.to_int32(tf.argmax(???))\n", 106 | " \n", 107 | " # Convert classes to whether each sequence is the beginning of a word (class = 0 or 3)\n", 108 | " pred_words = tf.logical_or(tf.equal(???), tf.equal(???))\n", 109 | " target_words = tf.logical_or(tf.equal(???), tf.equal(???))\n", 110 | " \n", 111 | " # Compute precisions and recall\n", 112 | " precision = metrics.streaming_precision(???)\n", 113 | " recall = metrics.streaming_recall(???)\n", 114 | " \n", 115 | " predictions = {\n", 116 | " \"classes\": pred_classes\n", 117 | " }\n", 118 | " eval_metric_ops = {\n", 119 | " \"precision\": precision,\n", 120 | " \"recall\": recall\n", 121 | " }\n", 122 | " \n", 123 | " return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "if __name__ == \"__main__\":\n", 135 | " model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256,\n", 136 | " input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2)\n", 137 | " \n", 138 | " rnn_model = learn.Estimator(model_fn=???\n", 139 | " , params=model_params\n", 140 | " , model_dir=\"model/_rnn_model\"\n", 141 | " , config=learn.RunConfig(save_checkpoints_secs=30,\n", 142 | " keep_checkpoint_max=2))\n", 143 | "\n", 144 | " train_input_fn = data_provider(\"data/_tf_records_k2/train\", batch_size=128)\n", 145 | " test_input_fn = data_provider(\"data/_tf_records_k2/test\", batch_size=512)\n", 146 | "\n", 147 | " validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn,\n", 148 | " eval_steps=10,\n", 149 | " every_n_steps=500,\n", 150 | " name='validation')\n", 151 | "\n", 152 | " # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor])\n", 153 | " # rnn_model.evaluate(input_fn=train_input_fn, steps=1)\n", 154 | "\n", 155 | " text = 
\"\"\"นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ\"\"\"\n", 156 | " tudkum(text, rnn_model, model_params['k'])" 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "anaconda-cloud": {}, 162 | "kernelspec": { 163 | "display_name": "Python [conda root]", 164 | "language": "python", 165 | "name": "conda-root-py" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.5.2" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 1 182 | } 183 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from data.data_creator import * 2 | 3 | 4 | def predict_input_fn(sentenses, seq_length, k): 5 | def input_fn(): 6 | max_length = max(seq_length) 7 | sentenses_idx = [list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), sentense)) for sentense in sentenses] 8 | pad_sentense = [s + [0] * (max_length - l) for s, l in zip(sentenses_idx, seq_length)] 9 | seq_feature = list(map(lambda x: get_feature(x, k), pad_sentense)) 10 | features = {"seq_feature": tf.convert_to_tensor(seq_feature), 'seq_length': tf.convert_to_tensor(seq_length)} 11 | 12 | return features 13 | 14 | return input_fn 15 | 16 | 17 | def insert_pipe(s, c, l): 18 | begin_index = np.where(c[:l] == 0) 19 | return ''.join(np.insert(np.array(list(s[:l])), begin_index[0], ['|'] * len(begin_index))) 20 | 21 | 22 | def tudkum(text, estimator, k): 23 | text = text.replace('\n', ' ') 24 | sentenses = text.split(" ") 25 | sentenses = list(filter(lambda x: len(x) > 0, sentenses)) 26 | seq_length = [len(sentense) for sentense in sentenses] 27 | classes = [x['classes'] for x in estimator.predict(input_fn=predict_input_fn(sentenses, seq_length, k))] 28 | sentenses = [insert_pipe(s, c, l) for s, c, l in zip(sentenses, classes, seq_length)] 29 | return ''.join(sentenses).split("|")[1:] 30 | 31 | def data_provider(data_path, batch_size): 32 | def input_fn(): 33 | filenames = glob.glob(os.path.join(data_path, '*.tf')) 34 | 35 | contexts, sequences = read_and_decode_single_example(filenames, shuffle=True) 36 | 37 | tensors = {**contexts, **sequences} 38 | 39 | batch = tf.train.batch( 40 | tensors=tensors, 41 | batch_size=batch_size, 42 | dynamic_pad=True, 43 | name="seq_batch" 44 | ) 45 | 46 | for key in sequences.keys(): 47 | batch[key] = tf.to_int32(tf.sparse_tensor_to_dense(batch[key])) 48 | 49 | label = tf.squeeze(batch.pop('label'), axis=2) 50 | 51 | return batch, label 52 | 53 | return input_fn -------------------------------------------------------------------------------- /word2vec/word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "from tensorflow.contrib import layers\n", 13 | "from tensorflow.contrib.learn import *\n", 14 | "from tensorflow.contrib import seq2seq\n", 15 | "from tensorflow.python.estimator.inputs import numpy_io\n", 16 | "import pickle\n", 17 | "import numpy as np\n", 18 | "import math\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "tf.logging.set_verbosity(tf.logging.INFO)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | 
"execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "def word2vec(features, labels, mode, params):\n", 33 | " target = features['target']\n", 34 | "\n", 35 | " with tf.variable_scope(\"emb\"):\n", 36 | " target_weight = tf.get_variable(\"target_w\",\n", 37 | " initializer=tf.random_uniform([params['num_words'], params['emb_size']], -1.0,\n", 38 | " 1.0))\n", 39 | " context_weight = tf.get_variable(\"context_w\",\n", 40 | " initializer=tf.truncated_normal([params['num_words'], params['emb_size']]))\n", 41 | " context_bias = tf.get_variable(\"context_b\", initializer=tf.zeros(params['num_words']))\n", 42 | "\n", 43 | " target_emb = tf.nn.embedding_lookup(target_weight, target)\n", 44 | " loss = tf.reduce_mean(\n", 45 | " tf.nn.sampled_softmax_loss(weights=context_weight,\n", 46 | " biases=context_bias,\n", 47 | " labels=tf.expand_dims(labels, 1),\n", 48 | " inputs=target_emb,\n", 49 | " num_sampled=params['num_negative'],\n", 50 | " num_classes=params['num_words'],\n", 51 | " remove_accidental_hits=True))\n", 52 | "\n", 53 | " for v in tf.trainable_variables():\n", 54 | " tf.summary.histogram(v.name.replace(\":\", ''), v)\n", 55 | "\n", 56 | " train_op = layers.optimize_loss(\n", 57 | " loss=loss,\n", 58 | " global_step=tf.contrib.framework.get_global_step(),\n", 59 | " learning_rate=params[\"learning_rate\"],\n", 60 | " optimizer=tf.train.AdagradOptimizer,\n", 61 | " summaries=[\n", 62 | " \"learning_rate\",\n", 63 | " \"loss\",\n", 64 | " \"gradients\",\n", 65 | " \"gradient_norm\",\n", 66 | " ])\n", 67 | " return ModelFnOps(mode=mode, predictions=None, train_op=train_op, loss=loss)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false, 75 | "scrolled": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "with open('_word2vec_data/target_list_novel', 'rb') as fp:\n", 80 | " target_list = pickle.load(fp)\n", 81 | "with open('_word2vec_data/context_list_novel', 'rb') as fp:\n", 82 | " context_list = pickle.load(fp)\n", 83 | "with open('_word2vec_data/indexer_novel', 'rb') as fp:\n", 84 | " indexer = pickle.load(fp)\n", 85 | "\n", 86 | "x = {'target': np.array(target_list)}\n", 87 | "y = np.array(context_list)\n", 88 | "\n", 89 | "model_params = dict(num_words=len(indexer), emb_size=64, num_negative=64, learning_rate=1.0)\n", 90 | "input_fn = numpy_io.numpy_input_fn(x, y, batch_size=512, shuffle=True, num_epochs=None)\n", 91 | "rnn_model = Estimator(model_fn=word2vec\n", 92 | " , params=model_params\n", 93 | " , model_dir=\"model/_word2vec\"\n", 94 | " , config=RunConfig(save_checkpoints_secs=30,\n", 95 | " keep_checkpoint_max=2))\n", 96 | "\n", 97 | "rnn_model.fit(input_fn=input_fn, steps=100000)" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.5.1" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /word2vec/word2vec_utils.py: -------------------------------------------------------------------------------- 1 | from functools import * 2 | from 
data.data_creator import * 3 | import itertools 4 | import glob 5 | import os 6 | 7 | 8 | def get_skip_gram(tokens, k1, k2): 9 | n_tokens = len(tokens) 10 | target_list = [] 11 | context_list = [] 12 | for i in range(n_tokens): 13 | if (i < k1) | ((i + k2 + 1) > len(tokens)): 14 | continue 15 | target = tokens[i] 16 | context = tokens[i - k1:i + k2 + 1] 17 | context.remove(target) 18 | for c in context: 19 | target_list.append(target) 20 | context_list.append(c) 21 | return target_list, context_list 22 | 23 | 24 | if __name__ == "__main__": 25 | 26 | files = glob.glob(os.path.join("data/_BEST/novel", '*.txt')) 27 | filtered_words = ['', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', 28 | ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_'] 29 | novels = [] 30 | for file in files: 31 | words = [] 32 | lines = open(file, 'r', encoding='utf-8') 33 | for line in lines: 34 | line = reduce(lambda a, kv: a.replace(*kv), list(MARKS.items()) + [('\n', ''), ('+', ''), (' ', '')], line) 35 | word = list(filter(lambda x: x not in filtered_words, line.split("|"))) 36 | words.extend(word) 37 | novels.append(words) 38 | 39 | import pickle 40 | from collections import Counter 41 | 42 | all_words = list(itertools.chain.from_iterable(novels)) 43 | word_count_dict = Counter(all_words) 44 | word_count = [(-c, w) for w, c in word_count_dict.items()] 45 | word_count.sort() 46 | all_words = [w for c, w in word_count if word_count_dict[w] > 5] 47 | 48 | indexer = {v: i for i, v in enumerate(all_words)} 49 | 50 | target_list = [] 51 | context_list = [] 52 | for n in novels: 53 | target, context = get_skip_gram([indexer[w] for w in n if word_count_dict[w] > 5], 4, 4) 54 | target_list.extend(target) 55 | context_list.extend(context) 56 | 57 | with open('target_list_novel', 'wb') as fp: 58 | pickle.dump(target_list, fp) 59 | with open('context_list_novel', 'wb') as fp: 60 | pickle.dump(context_list, fp) 61 | with open('indexer_novel', 'wb') as fp: 62 | pickle.dump(indexer, fp) 63 | --------------------------------------------------------------------------------
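A quick illustration of what get_skip_gram in word2vec/word2vec_utils.py produces: each in-range position is paired with every index in its surrounding window. This is a standalone sketch, not part of the repository; the token values and the import path are made up for the example.

    # Sketch only: toy run of get_skip_gram with a one-word window on each side (k1 = k2 = 1).
    # Assumes the repository root is on PYTHONPATH so the module can be imported (hypothetical path).
    from word2vec.word2vec_utils import get_skip_gram

    tokens = [10, 11, 12, 13]  # already-indexed words (toy values)
    targets, contexts = get_skip_gram(tokens, 1, 1)
    print(list(zip(targets, contexts)))
    # [(11, 10), (11, 12), (12, 11), (12, 13)]
    # Positions 0 and 3 yield no pairs because a full window does not fit around them.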