├── .gitignore ├── README.md ├── data └── data_creator.py ├── rnn_segment_completed.py ├── rnn_segment_exercise.ipynb ├── utils.py └── word2vec ├── word2vec.ipynb └── word2vec_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .idea* 92 | 93 | # data folders 94 | _*/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf-text-workshop 2 | "Deep Learning with TensorFlow Workshop 3 - Deep Learning for Text" 3 | -------------------------------------------------------------------------------- /data/data_creator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import glob 4 | from functools import * 5 | import itertools 6 | import tensorflow as tf 7 | import numpy as np 8 | import random 9 | import re 10 | from multiprocessing import Pool 11 | 12 | MARKS = {'<NE>': '', '</NE>': '', '<AB>': '', '</AB>': ''}  # BEST corpus NE/AB markup tags, stripped from the text before processing 13 | 14 | ALL_CHAR = [ 15 | '', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', 16 | ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', 17 | '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 18 | 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 19 | 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 20 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 21 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 22 | 'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 23 | 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท', 24 | 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ', 25 | 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า', 26 | 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ', 27 | 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓', 28 | '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff', 'other' 29 | ] 30 | CHARS_MAP = {v: k for
k, v in enumerate(ALL_CHAR)} 31 | IDX_MAP = dict(list(enumerate(ALL_CHAR))) 32 | OTHER_KEY = max(CHARS_MAP.values()) 33 | CLASS_MAP = {'B': 0, 'M': 1, 'E': 2, 'S': 3} 34 | 35 | 36 | def word_to_idx(word): 37 | w_size = len(word) 38 | w_idx = list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), word)) 39 | label = [] 40 | if w_size == 1: 41 | label += [3] 42 | else: 43 | label = [0] + list(np.repeat([1], w_size - 2)) + [2] 44 | 45 | return w_idx, label 46 | 47 | 48 | def get_feature(tokens, k): 49 | n_tokens = len(tokens) 50 | padded_tokens = tokens + [0] * k 51 | res = [] 52 | for i in range(n_tokens): 53 | res.append(padded_tokens[i:i + k + 1]) 54 | return res 55 | 56 | 57 | def make_example(seq_features, labels, key): 58 | # The object we return 59 | ex = tf.train.SequenceExample() 60 | # A non-sequential feature of our example 61 | sequence_length = len(seq_features) 62 | ex.context.feature["seq_length"].int64_list.value.append(sequence_length) 63 | ex.context.feature["key"].int64_list.value.append(key) 64 | # Feature lists for the two sequential features of our example 65 | fl_tokens = ex.feature_lists.feature_list["seq_feature"] 66 | fl_labels = ex.feature_lists.feature_list["label"] 67 | for feature, label in zip(seq_features, labels): 68 | fl_tokens.feature.add().int64_list.value.extend(feature) 69 | fl_labels.feature.add().int64_list.value.append(label) 70 | return ex 71 | 72 | 73 | def save_to_tfrecords(data_path, output_path, type, k): 74 | all_files = glob.glob(os.path.join(data_path, '*.txt')) 75 | random.shuffle(all_files) 76 | train_size = int(0.8 * len(all_files)) 77 | train = all_files[:train_size] 78 | test = all_files[train_size:] 79 | 80 | def write(files, prefix, type): 81 | if not os.path.isdir(os.path.join(os.getcwd(), output_path, prefix)): 82 | os.makedirs(os.path.join(output_path, prefix)) 83 | for file in files: 84 | words_all = [] 85 | print(file) 86 | lines = open(file, 'r', encoding='utf-8') 87 | for line in lines: 88 | line = reduce(lambda a, kv: a.replace(*kv), MARKS.items(), line) 89 | sentence = line.split(" ") 90 | words = [[word for word in s.split("|") if word not in ['', '\n']] for s in sentence] 91 | words = filter(lambda x: len(x) > 0, words) 92 | words_all.extend(list(words)) 93 | lines.close() 94 | word_idxs = list(map(lambda s: list(map(lambda w: word_to_idx(w), s)), words_all)) 95 | st_idx, label = map(list, zip( 96 | *list( 97 | map(lambda s: tuple(map(lambda x: list(itertools.chain.from_iterable(x)), list(zip(*s)))), 98 | word_idxs)))) 99 | input_feature = list(map(lambda x: get_feature(x, k), st_idx)) 100 | 101 | # Write all examples into a TFRecords file 102 | f_name = re.search('([0-9].*).txt', file).group(1) 103 | 104 | with open(os.path.join(output_path, prefix, type + '_' + f_name + '.tf'), 'w') as fp: 105 | writer = tf.python_io.TFRecordWriter(fp.name) 106 | for key, sequence in enumerate(zip(input_feature, label)): 107 | seq_input, label = sequence 108 | ex = make_example(seq_input, label, key) 109 | writer.write(ex.SerializeToString()) 110 | writer.close() 111 | 112 | write(train, "train", type) 113 | write(test, "test", type) 114 | 115 | 116 | def read_and_decode_single_example(filenames, shuffle=False, num_epochs=None): 117 | # first construct a queue containing a list of filenames. 
118 | # this lets a user split up there dataset in multiple files to keep size down 119 | # filename_queue = tf.train.string_input_producer([filename], num_epochs=10) 120 | filename_queue = tf.train.string_input_producer(filenames, 121 | shuffle=shuffle, num_epochs=num_epochs) 122 | 123 | reader = tf.TFRecordReader() 124 | # One can read a single serialized example from a filename 125 | # serialized_example is a Tensor of type string. 126 | _, serialized_ex = reader.read(filename_queue) 127 | context, sequences = tf.parse_single_sequence_example(serialized_ex, 128 | context_features={ 129 | "seq_length": tf.FixedLenFeature([], dtype=tf.int64) 130 | }, 131 | sequence_features={ 132 | "seq_feature": tf.VarLenFeature(dtype=tf.int64), 133 | "label": tf.VarLenFeature(dtype=tf.int64) 134 | }) 135 | return context, sequences 136 | 137 | 138 | def fn(cat): 139 | return save_to_tfrecords("_BEST/" + cat, "_tf_records_k2", cat, 2) 140 | 141 | 142 | if __name__ == "__main__": 143 | category = ['article', 'encyclopedia', 'news', 'novel'] 144 | p = Pool(4) 145 | p.map(fn, category) 146 | -------------------------------------------------------------------------------- /rnn_segment_completed.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib import rnn 3 | from tensorflow.contrib import metrics 4 | from tensorflow.contrib import layers 5 | from tensorflow.contrib import learn 6 | from tensorflow.contrib.learn import * 7 | from tensorflow.contrib import seq2seq 8 | from data.data_creator import * 9 | from utils import * 10 | 11 | tf.logging.set_verbosity(tf.logging.INFO) 12 | 13 | print(tf.__version__) 14 | 15 | 16 | def rnn_segment(features, targets, mode, params): 17 | seq_feature = features['seq_feature'] 18 | seq_length = features['seq_length'] 19 | with tf.variable_scope("emb"): 20 | embeddings = tf.get_variable("char_emb", shape=[params['num_char'], params['emb_size']]) 21 | seq_emb = tf.nn.embedding_lookup(embeddings, seq_feature) 22 | batch_size = tf.shape(seq_feature)[0] 23 | time_step = tf.shape(seq_feature)[1] 24 | flat_seq_emb = tf.reshape(seq_emb, shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']]) 25 | cell = rnn.LSTMCell(params['rnn_units']) 26 | if mode == ModeKeys.TRAIN: 27 | cell = rnn.DropoutWrapper(cell, params['input_keep_prob'], params['output_keep_prob']) 28 | projection_cell = rnn.OutputProjectionWrapper(cell, params['num_class']) 29 | logits, _ = tf.nn.dynamic_rnn(projection_cell, flat_seq_emb, sequence_length=seq_length, dtype=tf.float32) 30 | weight_mask = tf.to_float(tf.sequence_mask(seq_length)) 31 | loss = seq2seq.sequence_loss(logits, targets, weights=weight_mask) 32 | train_op = layers.optimize_loss( 33 | loss=loss, 34 | global_step=tf.contrib.framework.get_global_step(), 35 | learning_rate=params["learning_rate"], 36 | optimizer=tf.train.AdamOptimizer, 37 | clip_gradients=params['grad_clip'], 38 | summaries=[ 39 | "learning_rate", 40 | "loss", 41 | "gradients", 42 | "gradient_norm", 43 | ]) 44 | pred_classes = tf.to_int32(tf.argmax(input=logits, axis=2)) 45 | pred_words = tf.logical_or(tf.equal(pred_classes, 0), tf.equal(pred_classes, 3)) 46 | target_words = tf.logical_or(tf.equal(targets, 0), tf.equal(targets, 3)) 47 | precision = metrics.streaming_precision(pred_words, target_words, weights=weight_mask) 48 | recall = metrics.streaming_recall(pred_words, target_words, weights=weight_mask) 49 | predictions = { 50 | "classes": pred_classes 51 | } 52 | eval_metric_ops 
= { 53 | "precision": precision, 54 | "recall": recall 55 | } 56 | return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops) 57 | 58 | 59 | if __name__ == "__main__": 60 | model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256, 61 | input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2) 62 | rnn_model = learn.Estimator(model_fn=rnn_segment 63 | , params=model_params 64 | , model_dir="model/_rnn_model" 65 | , config=learn.RunConfig(save_checkpoints_secs=30, 66 | keep_checkpoint_max=2)) 67 | 68 | train_input_fn = data_provider("data/_tf_records/train", batch_size=128) 69 | test_input_fn = data_provider("data/_tf_records/test", batch_size=512) 70 | 71 | validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn, 72 | eval_steps=10, 73 | every_n_steps=500, 74 | name='validation') 75 | 76 | # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor]) 77 | rnn_model.evaluate(input_fn=test_input_fn, steps=10) 78 | 79 | text = """นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ""" 80 | tudkum(text, rnn_model, model_params['k']) 81 | -------------------------------------------------------------------------------- /rnn_segment_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tensorflow as tf\n", 10 | "from tensorflow.contrib import rnn\n", 11 | "from tensorflow.contrib import metrics\n", 12 | "from tensorflow.contrib import layers\n", 13 | "from tensorflow.contrib import learn\n", 14 | "from tensorflow.contrib.learn import *\n", 15 | "from tensorflow.contrib import seq2seq\n", 16 | "\n", 17 | "tf.logging.set_verbosity(tf.logging.INFO)\n", 18 | "print(tf.__version__)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Import helper functions for processing data\n", 26 | "- [data_creator.py](/edit/data/data_creator.py)\n", 27 | "- [utils.py](/edit/utils.py)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "from data.data_creator import *\n", 39 | "from utils import *" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Define RNN model function for word segmentation" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "def rnn_segment(features, targets, mode, params):\n", 58 | " seq_feature = features['seq_feature']\n", 59 | " seq_length = features['seq_length']\n", 60 | " \n", 61 | " # Create a variable matrix in which each row represents an embedding vector for a given character.\n", 62 | " with tf.variable_scope(\"emb\"):\n", 63 | " embeddings = tf.get_variable(\"char_emb\", shape=[params[???], params[???]])\n", 64 | " \n", 65 | " # Convert sequence features to sequence embeddings \n", 66 | " seq_emb = tf.nn.embedding_lookup(params=???, ids=???)\n", 67 | " \n", 68 | " # Flatten each sequence embedding\n", 69 | " # [[1st char embedding], [2nd char embedding], [3rd char embedding]] => [flatten seq embedding]\n", 70 | " batch_size = tf.shape(seq_feature)[0]\n", 71 | " time_step = tf.shape(seq_feature)[1]\n", 72 | " flat_seq_emb = tf.reshape(seq_emb, 
shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']])\n", 73 | " \n", 74 | " cell = rnn.LSTMCell(num_units=???)\n", 75 | " if mode == ModeKeys.TRAIN:\n", 76 | " # Create a cell with added input and output dropout with given probabilities.\n", 77 | " cell = rnn.DropoutWrapper(cell, ???)\n", 78 | " \n", 79 | " # Project output from RNN cell into classes\n", 80 | " projection_cell = rnn.OutputProjectionWrapper(cell, ???)\n", 81 | " \n", 82 | " # Create a recurrent neural network from the projection_cell\n", 83 | " logits, _ = tf.nn.dynamic_rnn(???, dtype=tf.float32)\n", 84 | " \n", 85 | " # Training samples in each batch might have different lengths. \n", 86 | " # We put them in the same matrix by padding each sample to have the same length as the longest sample. \n", 87 | " # We need to create a mask tensor to avoid accounting for losses from the padding. \n", 88 | " weight_mask = tf.to_float(tf.sequence_mask(seq_length))\n", 89 | " loss = seq2seq.sequence_loss(???)\n", 90 | " \n", 91 | " train_op = layers.optimize_loss(\n", 92 | " loss=loss,\n", 93 | " global_step=tf.contrib.framework.get_global_step(),\n", 94 | " learning_rate=params[\"learning_rate\"],\n", 95 | " optimizer=tf.train.AdamOptimizer,\n", 96 | " clip_gradients=params['grad_clip'],\n", 97 | " summaries=[\n", 98 | " \"learning_rate\",\n", 99 | " \"loss\",\n", 100 | " \"gradients\",\n", 101 | " \"gradient_norm\",\n", 102 | " ])\n", 103 | " \n", 104 | " # Determine the predicted classes from logits\n", 105 | " pred_classes = tf.to_int32(tf.argmax(???))\n", 106 | " \n", 107 | " # Convert classes to whether each sequence is the beginning of a word (class = 0 or 3)\n", 108 | " pred_words = tf.logical_or(tf.equal(???), tf.equal(???))\n", 109 | " target_words = tf.logical_or(tf.equal(???), tf.equal(???))\n", 110 | " \n", 111 | " # Compute precisions and recall\n", 112 | " precision = metrics.streaming_precision(???)\n", 113 | " recall = metrics.streaming_recall(???)\n", 114 | " \n", 115 | " predictions = {\n", 116 | " \"classes\": pred_classes\n", 117 | " }\n", 118 | " eval_metric_ops = {\n", 119 | " \"precision\": precision,\n", 120 | " \"recall\": recall\n", 121 | " }\n", 122 | " \n", 123 | " return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "if __name__ == \"__main__\":\n", 135 | " model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256,\n", 136 | " input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2)\n", 137 | " \n", 138 | " rnn_model = learn.Estimator(model_fn=???\n", 139 | " , params=model_params\n", 140 | " , model_dir=\"model/_rnn_model\"\n", 141 | " , config=learn.RunConfig(save_checkpoints_secs=30,\n", 142 | " keep_checkpoint_max=2))\n", 143 | "\n", 144 | " train_input_fn = data_provider(\"data/_tf_records_k2/train\", batch_size=128)\n", 145 | " test_input_fn = data_provider(\"data/_tf_records_k2/test\", batch_size=512)\n", 146 | "\n", 147 | " validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn,\n", 148 | " eval_steps=10,\n", 149 | " every_n_steps=500,\n", 150 | " name='validation')\n", 151 | "\n", 152 | " # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor])\n", 153 | " # rnn_model.evaluate(input_fn=train_input_fn, steps=1)\n", 154 | "\n", 155 | " text = 
\"\"\"นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ\"\"\"\n", 156 | " tudkum(text, rnn_model, model_params['k'])" 157 | ] 158 | } 159 | ], 160 | "metadata": { 161 | "anaconda-cloud": {}, 162 | "kernelspec": { 163 | "display_name": "Python [conda root]", 164 | "language": "python", 165 | "name": "conda-root-py" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 3 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython3", 177 | "version": "3.5.2" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 1 182 | } 183 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from data.data_creator import * 2 | 3 | 4 | def predict_input_fn(sentenses, seq_length, k): 5 | def input_fn(): 6 | max_length = max(seq_length) 7 | sentenses_idx = [list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), sentense)) for sentense in sentenses] 8 | pad_sentense = [s + [0] * (max_length - l) for s, l in zip(sentenses_idx, seq_length)] 9 | seq_feature = list(map(lambda x: get_feature(x, k), pad_sentense)) 10 | features = {"seq_feature": tf.convert_to_tensor(seq_feature), 'seq_length': tf.convert_to_tensor(seq_length)} 11 | 12 | return features 13 | 14 | return input_fn 15 | 16 | 17 | def insert_pipe(s, c, l): 18 | begin_index = np.where(c[:l] == 0) 19 | return ''.join(np.insert(np.array(list(s[:l])), begin_index[0], ['|'] * len(begin_index))) 20 | 21 | 22 | def tudkum(text, estimator, k): 23 | text = text.replace('\n', ' ') 24 | sentenses = text.split(" ") 25 | sentenses = list(filter(lambda x: len(x) > 0, sentenses)) 26 | seq_length = [len(sentense) for sentense in sentenses] 27 | classes = [x['classes'] for x in estimator.predict(input_fn=predict_input_fn(sentenses, seq_length, k))] 28 | sentenses = [insert_pipe(s, c, l) for s, c, l in zip(sentenses, classes, seq_length)] 29 | return ''.join(sentenses).split("|")[1:] 30 | 31 | def data_provider(data_path, batch_size): 32 | def input_fn(): 33 | filenames = glob.glob(os.path.join(data_path, '*.tf')) 34 | 35 | contexts, sequences = read_and_decode_single_example(filenames, shuffle=True) 36 | 37 | tensors = {**contexts, **sequences} 38 | 39 | batch = tf.train.batch( 40 | tensors=tensors, 41 | batch_size=batch_size, 42 | dynamic_pad=True, 43 | name="seq_batch" 44 | ) 45 | 46 | for key in sequences.keys(): 47 | batch[key] = tf.to_int32(tf.sparse_tensor_to_dense(batch[key])) 48 | 49 | label = tf.squeeze(batch.pop('label'), axis=2) 50 | 51 | return batch, label 52 | 53 | return input_fn -------------------------------------------------------------------------------- /word2vec/word2vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "from tensorflow.contrib import layers\n", 13 | "from tensorflow.contrib.learn import *\n", 14 | "from tensorflow.contrib import seq2seq\n", 15 | "from tensorflow.python.estimator.inputs import numpy_io\n", 16 | "import pickle\n", 17 | "import numpy as np\n", 18 | "import math\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "tf.logging.set_verbosity(tf.logging.INFO)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | 
"execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "def word2vec(features, labels, mode, params):\n", 33 | " target = features['target']\n", 34 | "\n", 35 | " with tf.variable_scope(\"emb\"):\n", 36 | " target_weight = tf.get_variable(\"target_w\",\n", 37 | " initializer=tf.random_uniform([params['num_words'], params['emb_size']], -1.0,\n", 38 | " 1.0))\n", 39 | " context_weight = tf.get_variable(\"context_w\",\n", 40 | " initializer=tf.truncated_normal([params['num_words'], params['emb_size']]))\n", 41 | " context_bias = tf.get_variable(\"context_b\", initializer=tf.zeros(params['num_words']))\n", 42 | "\n", 43 | " target_emb = tf.nn.embedding_lookup(target_weight, target)\n", 44 | " loss = tf.reduce_mean(\n", 45 | " tf.nn.sampled_softmax_loss(weights=context_weight,\n", 46 | " biases=context_bias,\n", 47 | " labels=tf.expand_dims(labels, 1),\n", 48 | " inputs=target_emb,\n", 49 | " num_sampled=params['num_negative'],\n", 50 | " num_classes=params['num_words'],\n", 51 | " remove_accidental_hits=True))\n", 52 | "\n", 53 | " for v in tf.trainable_variables():\n", 54 | " tf.summary.histogram(v.name.replace(\":\", ''), v)\n", 55 | "\n", 56 | " train_op = layers.optimize_loss(\n", 57 | " loss=loss,\n", 58 | " global_step=tf.contrib.framework.get_global_step(),\n", 59 | " learning_rate=params[\"learning_rate\"],\n", 60 | " optimizer=tf.train.AdagradOptimizer,\n", 61 | " summaries=[\n", 62 | " \"learning_rate\",\n", 63 | " \"loss\",\n", 64 | " \"gradients\",\n", 65 | " \"gradient_norm\",\n", 66 | " ])\n", 67 | " return ModelFnOps(mode=mode, predictions=None, train_op=train_op, loss=loss)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false, 75 | "scrolled": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "with open('_word2vec_data/target_list_novel', 'rb') as fp:\n", 80 | " target_list = pickle.load(fp)\n", 81 | "with open('_word2vec_data/context_list_novel', 'rb') as fp:\n", 82 | " context_list = pickle.load(fp)\n", 83 | "with open('_word2vec_data/indexer_novel', 'rb') as fp:\n", 84 | " indexer = pickle.load(fp)\n", 85 | "\n", 86 | "x = {'target': np.array(target_list)}\n", 87 | "y = np.array(context_list)\n", 88 | "\n", 89 | "model_params = dict(num_words=len(indexer), emb_size=64, num_negative=64, learning_rate=1.0)\n", 90 | "input_fn = numpy_io.numpy_input_fn(x, y, batch_size=512, shuffle=True, num_epochs=None)\n", 91 | "rnn_model = Estimator(model_fn=word2vec\n", 92 | " , params=model_params\n", 93 | " , model_dir=\"model/_word2vec\"\n", 94 | " , config=RunConfig(save_checkpoints_secs=30,\n", 95 | " keep_checkpoint_max=2))\n", 96 | "\n", 97 | "rnn_model.fit(input_fn=input_fn, steps=100000)" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.5.1" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /word2vec/word2vec_utils.py: -------------------------------------------------------------------------------- 1 | from functools import * 2 | from 
data.data_creator import * 3 | import itertools 4 | import glob 5 | import os 6 | 7 | 8 | def get_skip_gram(tokens, k1, k2): 9 | n_tokens = len(tokens) 10 | target_list = [] 11 | context_list = [] 12 | for i in range(n_tokens): 13 | if (i < k1) | ((i + k2 + 1) > len(tokens)): 14 | continue 15 | target = tokens[i] 16 | context = tokens[i - k1:i + k2 + 1] 17 | context.remove(target) 18 | for c in context: 19 | target_list.append(target) 20 | context_list.append(c) 21 | return target_list, context_list 22 | 23 | 24 | if __name__ == "__main__": 25 | 26 | files = glob.glob(os.path.join("data/_BEST/novel", '*.txt')) 27 | filtered_words = ['', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', 28 | ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_'] 29 | novels = [] 30 | for file in files: 31 | words = [] 32 | lines = open(file, 'r', encoding='utf-8') 33 | for line in lines: 34 | line = reduce(lambda a, kv: a.replace(*kv), list(MARKS.items()) + [('\n', ''), ('+', ''), (' ', '')], line) 35 | word = list(filter(lambda x: x not in filtered_words, line.split("|"))) 36 | words.extend(word) 37 | novels.append(words) 38 | 39 | import pickle 40 | from collections import Counter 41 | 42 | all_words = list(itertools.chain.from_iterable(novels)) 43 | word_count_dict = Counter(all_words) 44 | word_count = [(-c, w) for w, c in word_count_dict.items()] 45 | word_count.sort() 46 | all_words = [w for c, w in word_count if word_count_dict[w] > 5] 47 | 48 | indexer = {v: i for i, v in enumerate(all_words)} 49 | 50 | target_list = [] 51 | context_list = [] 52 | for n in novels: 53 | target, context = get_skip_gram([indexer[w] for w in n if word_count_dict[w] > 5], 4, 4) 54 | target_list.extend(target) 55 | context_list.extend(context) 56 | 57 | with open('target_list_novel', 'wb') as fp: 58 | pickle.dump(target_list, fp) 59 | with open('context_list_novel', 'wb') as fp: 60 | pickle.dump(context_list, fp) 61 | with open('indexer_novel', 'wb') as fp: 62 | pickle.dump(indexer, fp) 63 | --------------------------------------------------------------------------------
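A quick illustration of what get_skip_gram in word2vec/word2vec_utils.py produces: each in-range position is paired with every index in its surrounding window. This is a standalone sketch, not part of the repository; the token values and the import path are made up for the example.

    # Sketch only: toy run of get_skip_gram with a one-word window on each side (k1 = k2 = 1).
    # Assumes the repository root is on PYTHONPATH so the module can be imported (hypothetical path).
    from word2vec.word2vec_utils import get_skip_gram

    tokens = [10, 11, 12, 13]  # already-indexed words (toy values)
    targets, contexts = get_skip_gram(tokens, 1, 1)
    print(list(zip(targets, contexts)))
    # [(11, 10), (11, 12), (12, 11), (12, 13)]
    # Positions 0 and 3 yield no pairs because a full window does not fit around them.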