├── .gitignore
├── README.md
├── data
│   └── data_creator.py
├── rnn_segment_completed.py
├── rnn_segment_exercise.ipynb
├── utils.py
└── word2vec
    ├── word2vec.ipynb
    └── word2vec_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | .idea*
92 |
93 | # data folders
94 | _*/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tf-text-workshop
2 | "Deep Learning with TensorFlow Workshop 3 - Deep Learning for Text"
3 |
--------------------------------------------------------------------------------
/data/data_creator.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import os
3 | import glob
4 | from functools import *
5 | import itertools
6 | import tensorflow as tf
7 | import numpy as np
8 | import random
9 | import re
10 | from multiprocessing import Pool
11 |
12 | MARKS = {'<NE>': '', '</NE>': '', '<AB>': '', '</AB>': ''}  # BEST corpus annotation tags stripped from the text
13 |
14 | ALL_CHAR = [
15 | '', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
16 | ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
17 | '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
18 | 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
19 | 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
20 | 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
21 | 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
22 | 'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
23 | 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
24 | 'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
25 | 'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
26 | 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
27 | 'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
28 | '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff', 'other'
29 | ]
30 | CHARS_MAP = {v: k for k, v in enumerate(ALL_CHAR)}
31 | IDX_MAP = dict(list(enumerate(ALL_CHAR)))
32 | OTHER_KEY = max(CHARS_MAP.values())
33 | CLASS_MAP = {'B': 0, 'M': 1, 'E': 2, 'S': 3}
34 |
35 |
36 | def word_to_idx(word):
37 | w_size = len(word)
38 | w_idx = list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), word))
39 | label = []
40 | if w_size == 1:
41 | label += [3]
42 | else:
43 | label = [0] + list(np.repeat([1], w_size - 2)) + [2]
44 |
45 | return w_idx, label
46 |
47 |
48 | def get_feature(tokens, k):
49 | n_tokens = len(tokens)
50 | padded_tokens = tokens + [0] * k
51 | res = []
52 | for i in range(n_tokens):
53 | res.append(padded_tokens[i:i + k + 1])
54 | return res
55 |
56 |
57 | def make_example(seq_features, labels, key):
58 | # The object we return
59 | ex = tf.train.SequenceExample()
60 |     # Non-sequential (context) features of our example
61 | sequence_length = len(seq_features)
62 | ex.context.feature["seq_length"].int64_list.value.append(sequence_length)
63 | ex.context.feature["key"].int64_list.value.append(key)
64 | # Feature lists for the two sequential features of our example
65 | fl_tokens = ex.feature_lists.feature_list["seq_feature"]
66 | fl_labels = ex.feature_lists.feature_list["label"]
67 | for feature, label in zip(seq_features, labels):
68 | fl_tokens.feature.add().int64_list.value.extend(feature)
69 | fl_labels.feature.add().int64_list.value.append(label)
70 | return ex
71 |
72 |
73 | def save_to_tfrecords(data_path, output_path, type, k):
74 | all_files = glob.glob(os.path.join(data_path, '*.txt'))
75 | random.shuffle(all_files)
76 | train_size = int(0.8 * len(all_files))
77 | train = all_files[:train_size]
78 | test = all_files[train_size:]
79 |
80 | def write(files, prefix, type):
81 | if not os.path.isdir(os.path.join(os.getcwd(), output_path, prefix)):
82 | os.makedirs(os.path.join(output_path, prefix))
83 | for file in files:
84 | words_all = []
85 | print(file)
86 | lines = open(file, 'r', encoding='utf-8')
87 | for line in lines:
88 | line = reduce(lambda a, kv: a.replace(*kv), MARKS.items(), line)
89 | sentence = line.split(" ")
90 | words = [[word for word in s.split("|") if word not in ['', '\n']] for s in sentence]
91 | words = filter(lambda x: len(x) > 0, words)
92 | words_all.extend(list(words))
93 | lines.close()
94 | word_idxs = list(map(lambda s: list(map(lambda w: word_to_idx(w), s)), words_all))
95 | st_idx, label = map(list, zip(
96 | *list(
97 | map(lambda s: tuple(map(lambda x: list(itertools.chain.from_iterable(x)), list(zip(*s)))),
98 | word_idxs))))
99 | input_feature = list(map(lambda x: get_feature(x, k), st_idx))
100 |
101 | # Write all examples into a TFRecords file
102 |             f_name = re.search('([0-9].*).txt', file).group(1)
103 |
104 |             out_file = os.path.join(output_path, prefix, type + '_' + f_name + '.tf')
105 |             writer = tf.python_io.TFRecordWriter(out_file)
106 |             for key, sequence in enumerate(zip(input_feature, label)):
107 |                 seq_input, seq_label = sequence
108 |                 ex = make_example(seq_input, seq_label, key)
109 |                 writer.write(ex.SerializeToString())
110 |             writer.close()
111 |
112 | write(train, "train", type)
113 | write(test, "test", type)
114 |
115 |
116 | def read_and_decode_single_example(filenames, shuffle=False, num_epochs=None):
117 | # first construct a queue containing a list of filenames.
118 |     # this lets a user split up their dataset into multiple files to keep the size down
119 | # filename_queue = tf.train.string_input_producer([filename], num_epochs=10)
120 | filename_queue = tf.train.string_input_producer(filenames,
121 | shuffle=shuffle, num_epochs=num_epochs)
122 |
123 | reader = tf.TFRecordReader()
124 | # One can read a single serialized example from a filename
125 | # serialized_example is a Tensor of type string.
126 | _, serialized_ex = reader.read(filename_queue)
127 | context, sequences = tf.parse_single_sequence_example(serialized_ex,
128 | context_features={
129 | "seq_length": tf.FixedLenFeature([], dtype=tf.int64)
130 | },
131 | sequence_features={
132 | "seq_feature": tf.VarLenFeature(dtype=tf.int64),
133 | "label": tf.VarLenFeature(dtype=tf.int64)
134 | })
135 | return context, sequences
136 |
137 |
138 | def fn(cat):
139 | return save_to_tfrecords("_BEST/" + cat, "_tf_records_k2", cat, 2)
140 |
141 |
142 | if __name__ == "__main__":
143 | category = ['article', 'encyclopedia', 'news', 'novel']
144 | p = Pool(4)
145 | p.map(fn, category)
146 |
--------------------------------------------------------------------------------
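
A quick way to see what the helpers above produce is to run them on a short word. A minimal sketch, assuming the repository root is on sys.path; the exact index values depend on CHARS_MAP, the shapes and labels are the point:

    # Inspect the character indices, BMES labels, and k-character windows
    # produced by data_creator.py.
    from data.data_creator import word_to_idx, get_feature

    idx, label = word_to_idx('กา')       # a two-character word
    print(idx)                           # character indices from CHARS_MAP
    print(label)                         # [0, 2] -> Begin, End (a 1-char word would get [3] = Single)

    # get_feature pads the index list with k zeros and emits, for each position,
    # the current character plus the next k characters (a window of size k + 1).
    print(get_feature(idx, 2))           # [[i0, i1, 0], [i1, 0, 0]]
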
/rnn_segment_completed.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.contrib import rnn
3 | from tensorflow.contrib import metrics
4 | from tensorflow.contrib import layers
5 | from tensorflow.contrib import learn
6 | from tensorflow.contrib.learn import *
7 | from tensorflow.contrib import seq2seq
8 | from data.data_creator import *
9 | from utils import *
10 |
11 | tf.logging.set_verbosity(tf.logging.INFO)
12 |
13 | print(tf.__version__)
14 |
15 |
16 | def rnn_segment(features, targets, mode, params):
17 | seq_feature = features['seq_feature']
18 | seq_length = features['seq_length']
19 | with tf.variable_scope("emb"):
20 | embeddings = tf.get_variable("char_emb", shape=[params['num_char'], params['emb_size']])
21 | seq_emb = tf.nn.embedding_lookup(embeddings, seq_feature)
22 | batch_size = tf.shape(seq_feature)[0]
23 | time_step = tf.shape(seq_feature)[1]
24 | flat_seq_emb = tf.reshape(seq_emb, shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']])
25 | cell = rnn.LSTMCell(params['rnn_units'])
26 | if mode == ModeKeys.TRAIN:
27 | cell = rnn.DropoutWrapper(cell, params['input_keep_prob'], params['output_keep_prob'])
28 | projection_cell = rnn.OutputProjectionWrapper(cell, params['num_class'])
29 | logits, _ = tf.nn.dynamic_rnn(projection_cell, flat_seq_emb, sequence_length=seq_length, dtype=tf.float32)
30 | weight_mask = tf.to_float(tf.sequence_mask(seq_length))
31 | loss = seq2seq.sequence_loss(logits, targets, weights=weight_mask)
32 | train_op = layers.optimize_loss(
33 | loss=loss,
34 | global_step=tf.contrib.framework.get_global_step(),
35 | learning_rate=params["learning_rate"],
36 | optimizer=tf.train.AdamOptimizer,
37 | clip_gradients=params['grad_clip'],
38 | summaries=[
39 | "learning_rate",
40 | "loss",
41 | "gradients",
42 | "gradient_norm",
43 | ])
44 | pred_classes = tf.to_int32(tf.argmax(input=logits, axis=2))
45 | pred_words = tf.logical_or(tf.equal(pred_classes, 0), tf.equal(pred_classes, 3))
46 | target_words = tf.logical_or(tf.equal(targets, 0), tf.equal(targets, 3))
47 | precision = metrics.streaming_precision(pred_words, target_words, weights=weight_mask)
48 | recall = metrics.streaming_recall(pred_words, target_words, weights=weight_mask)
49 | predictions = {
50 | "classes": pred_classes
51 | }
52 | eval_metric_ops = {
53 | "precision": precision,
54 | "recall": recall
55 | }
56 | return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops)
57 |
58 |
59 | if __name__ == "__main__":
60 | model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256,
61 | input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2)
62 | rnn_model = learn.Estimator(model_fn=rnn_segment
63 | , params=model_params
64 | , model_dir="model/_rnn_model"
65 | , config=learn.RunConfig(save_checkpoints_secs=30,
66 | keep_checkpoint_max=2))
67 |
68 |     train_input_fn = data_provider("data/_tf_records_k2/train", batch_size=128)
69 |     test_input_fn = data_provider("data/_tf_records_k2/test", batch_size=512)
70 |
71 | validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn,
72 | eval_steps=10,
73 | every_n_steps=500,
74 | name='validation')
75 |
76 | # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor])
77 | rnn_model.evaluate(input_fn=test_input_fn, steps=10)
78 |
79 | text = """นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ"""
80 | tudkum(text, rnn_model, model_params['k'])
81 |
--------------------------------------------------------------------------------
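
The weight_mask above keeps padded time steps from contributing to the loss. A minimal numpy sketch of the same arithmetic (not the TensorFlow op itself, just the idea behind seq2seq.sequence_loss under its default averaging):

    import numpy as np

    def masked_sequence_loss(logits, targets, seq_length):
        # logits: [batch, time, num_class], targets: [batch, time]
        time = logits.shape[1]
        # per-position softmax cross-entropy
        exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
        log_probs = np.log(exp / exp.sum(axis=-1, keepdims=True))
        ce = -np.take_along_axis(log_probs, targets[..., None], axis=-1)[..., 0]
        # zero out positions beyond each sequence's true length
        mask = (np.arange(time)[None, :] < np.array(seq_length)[:, None]).astype(float)
        return (ce * mask).sum() / mask.sum()

    logits = np.random.randn(2, 5, 4)                     # batch of 2, padded to 5 steps, 4 classes
    targets = np.random.randint(0, 4, size=(2, 5))
    print(masked_sequence_loss(logits, targets, [5, 3]))  # the last 2 steps of sample 2 are ignored
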
/rnn_segment_exercise.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import tensorflow as tf\n",
10 | "from tensorflow.contrib import rnn\n",
11 | "from tensorflow.contrib import metrics\n",
12 | "from tensorflow.contrib import layers\n",
13 | "from tensorflow.contrib import learn\n",
14 | "from tensorflow.contrib.learn import *\n",
15 | "from tensorflow.contrib import seq2seq\n",
16 | "\n",
17 | "tf.logging.set_verbosity(tf.logging.INFO)\n",
18 | "print(tf.__version__)"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Import helper functions for processing data\n",
26 | "- [data_creator.py](/edit/data/data_creator.py)\n",
27 | "- [utils.py](/edit/utils.py)"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "metadata": {
34 | "collapsed": true
35 | },
36 | "outputs": [],
37 | "source": [
38 | "from data.data_creator import *\n",
39 | "from utils import *"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "## Define RNN model function for word segmentation"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "outputs": [],
56 | "source": [
57 | "def rnn_segment(features, targets, mode, params):\n",
58 | " seq_feature = features['seq_feature']\n",
59 | " seq_length = features['seq_length']\n",
60 | " \n",
61 | " # Create a variable matrix in which each row represents an embedding vector for a given character.\n",
62 | " with tf.variable_scope(\"emb\"):\n",
63 | " embeddings = tf.get_variable(\"char_emb\", shape=[params[???], params[???]])\n",
64 | " \n",
65 | " # Convert sequence features to sequence embeddings \n",
66 | " seq_emb = tf.nn.embedding_lookup(params=???, ids=???)\n",
67 | " \n",
68 | " # Flatten each sequence embedding\n",
69 | " # [[1st char embedding], [2nd char embedding], [3rd char embedding]] => [flatten seq embedding]\n",
70 | " batch_size = tf.shape(seq_feature)[0]\n",
71 | " time_step = tf.shape(seq_feature)[1]\n",
72 | " flat_seq_emb = tf.reshape(seq_emb, shape=[batch_size, time_step, (params['k'] + 1) * params['emb_size']])\n",
73 | " \n",
74 | " cell = rnn.LSTMCell(num_units=???)\n",
75 | " if mode == ModeKeys.TRAIN:\n",
76 | " # Create a cell with added input and output dropout with given probabilities.\n",
77 | " cell = rnn.DropoutWrapper(cell, ???)\n",
78 | " \n",
79 | " # Project output from RNN cell into classes\n",
80 | " projection_cell = rnn.OutputProjectionWrapper(cell, ???)\n",
81 | " \n",
82 | " # Create a recurrent neural network from the projection_cell\n",
83 | " logits, _ = tf.nn.dynamic_rnn(???, dtype=tf.float32)\n",
84 | " \n",
85 | " # Training samples in each batch might have different lengths. \n",
86 | " # We put them in the same matrix by padding each sample to have the same length as the longest sample. \n",
87 | " # We need to create a mask tensor to avoid accounting for losses from the padding. \n",
88 | " weight_mask = tf.to_float(tf.sequence_mask(seq_length))\n",
89 | " loss = seq2seq.sequence_loss(???)\n",
90 | " \n",
91 | " train_op = layers.optimize_loss(\n",
92 | " loss=loss,\n",
93 | " global_step=tf.contrib.framework.get_global_step(),\n",
94 | " learning_rate=params[\"learning_rate\"],\n",
95 | " optimizer=tf.train.AdamOptimizer,\n",
96 | " clip_gradients=params['grad_clip'],\n",
97 | " summaries=[\n",
98 | " \"learning_rate\",\n",
99 | " \"loss\",\n",
100 | " \"gradients\",\n",
101 | " \"gradient_norm\",\n",
102 | " ])\n",
103 | " \n",
104 | " # Determine the predicted classes from logits\n",
105 | " pred_classes = tf.to_int32(tf.argmax(???))\n",
106 | " \n",
107 | " # Convert classes to whether each sequence is the beginning of a word (class = 0 or 3)\n",
108 | " pred_words = tf.logical_or(tf.equal(???), tf.equal(???))\n",
109 | " target_words = tf.logical_or(tf.equal(???), tf.equal(???))\n",
110 | " \n",
111 | " # Compute precisions and recall\n",
112 | " precision = metrics.streaming_precision(???)\n",
113 | " recall = metrics.streaming_recall(???)\n",
114 | " \n",
115 | " predictions = {\n",
116 | " \"classes\": pred_classes\n",
117 | " }\n",
118 | " eval_metric_ops = {\n",
119 | " \"precision\": precision,\n",
120 | " \"recall\": recall\n",
121 | " }\n",
122 | " \n",
123 | " return learn.ModelFnOps(mode, predictions, loss, train_op, eval_metric_ops=eval_metric_ops)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {
130 | "collapsed": true
131 | },
132 | "outputs": [],
133 | "source": [
134 | "if __name__ == \"__main__\":\n",
135 | " model_params = dict(num_class=len(CLASS_MAP), num_char=len(CHARS_MAP), emb_size=128, rnn_units=256,\n",
136 | " input_keep_prob=0.85, output_keep_prob=0.85, learning_rate=10e-4, grad_clip=1.0, k=2)\n",
137 | " \n",
138 | " rnn_model = learn.Estimator(model_fn=???\n",
139 | " , params=model_params\n",
140 | " , model_dir=\"model/_rnn_model\"\n",
141 | " , config=learn.RunConfig(save_checkpoints_secs=30,\n",
142 | " keep_checkpoint_max=2))\n",
143 | "\n",
144 | " train_input_fn = data_provider(\"data/_tf_records_k2/train\", batch_size=128)\n",
145 | " test_input_fn = data_provider(\"data/_tf_records_k2/test\", batch_size=512)\n",
146 | "\n",
147 | " validation_monitor = monitors.ValidationMonitor(input_fn=test_input_fn,\n",
148 | " eval_steps=10,\n",
149 | " every_n_steps=500,\n",
150 | " name='validation')\n",
151 | "\n",
152 | " # rnn_model.fit(input_fn=train_input_fn, steps=1, monitors=[validation_monitor])\n",
153 | " # rnn_model.evaluate(input_fn=train_input_fn, steps=1)\n",
154 | "\n",
155 | " text = \"\"\"นางกุหลาบขายกุหลาบจำหน่ายไม้ดอกไม้ประดับ\"\"\"\n",
156 | " tudkum(text, rnn_model, model_params['k'])"
157 | ]
158 | }
159 | ],
160 | "metadata": {
161 | "anaconda-cloud": {},
162 | "kernelspec": {
163 | "display_name": "Python [conda root]",
164 | "language": "python",
165 | "name": "conda-root-py"
166 | },
167 | "language_info": {
168 | "codemirror_mode": {
169 | "name": "ipython",
170 | "version": 3
171 | },
172 | "file_extension": ".py",
173 | "mimetype": "text/x-python",
174 | "name": "python",
175 | "nbconvert_exporter": "python",
176 | "pygments_lexer": "ipython3",
177 | "version": "3.5.2"
178 | }
179 | },
180 | "nbformat": 4,
181 | "nbformat_minor": 1
182 | }
183 |
--------------------------------------------------------------------------------
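
The blanks above can be checked against rnn_segment_completed.py. Before filling them in, it can help to look at what data_provider actually feeds the model function. A small sketch, assuming the TFRecords have already been generated by data/data_creator.py and that the code runs from the repository root:

    import tensorflow as tf
    from utils import data_provider

    features, labels = data_provider("data/_tf_records_k2/train", batch_size=4)()
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        f, l = sess.run([features, labels])
        print(f['seq_feature'].shape)   # [batch, max_time, k + 1] character-window indices
        print(f['seq_length'])          # true (unpadded) length of each sequence
        print(l.shape)                  # [batch, max_time] integer BMES labels
        coord.request_stop()
        coord.join(threads)
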
/utils.py:
--------------------------------------------------------------------------------
1 | from data.data_creator import *
2 |
3 |
4 | def predict_input_fn(sentenses, seq_length, k):
5 | def input_fn():
6 | max_length = max(seq_length)
7 | sentenses_idx = [list(map(lambda x: CHARS_MAP.get(x, OTHER_KEY), sentense)) for sentense in sentenses]
8 | pad_sentense = [s + [0] * (max_length - l) for s, l in zip(sentenses_idx, seq_length)]
9 | seq_feature = list(map(lambda x: get_feature(x, k), pad_sentense))
10 | features = {"seq_feature": tf.convert_to_tensor(seq_feature), 'seq_length': tf.convert_to_tensor(seq_length)}
11 |
12 | return features
13 |
14 | return input_fn
15 |
16 |
17 | def insert_pipe(s, c, l):
18 | begin_index = np.where(c[:l] == 0)
19 |     return ''.join(np.insert(np.array(list(s[:l])), begin_index[0], ['|'] * len(begin_index[0])))
20 |
21 |
22 | def tudkum(text, estimator, k):
23 | text = text.replace('\n', ' ')
24 | sentenses = text.split(" ")
25 | sentenses = list(filter(lambda x: len(x) > 0, sentenses))
26 | seq_length = [len(sentense) for sentense in sentenses]
27 | classes = [x['classes'] for x in estimator.predict(input_fn=predict_input_fn(sentenses, seq_length, k))]
28 | sentenses = [insert_pipe(s, c, l) for s, c, l in zip(sentenses, classes, seq_length)]
29 | return ''.join(sentenses).split("|")[1:]
30 |
31 | def data_provider(data_path, batch_size):
32 | def input_fn():
33 | filenames = glob.glob(os.path.join(data_path, '*.tf'))
34 |
35 | contexts, sequences = read_and_decode_single_example(filenames, shuffle=True)
36 |
37 | tensors = {**contexts, **sequences}
38 |
39 | batch = tf.train.batch(
40 | tensors=tensors,
41 | batch_size=batch_size,
42 | dynamic_pad=True,
43 | name="seq_batch"
44 | )
45 |
46 | for key in sequences.keys():
47 | batch[key] = tf.to_int32(tf.sparse_tensor_to_dense(batch[key]))
48 |
49 | label = tf.squeeze(batch.pop('label'), axis=2)
50 |
51 | return batch, label
52 |
53 | return input_fn
--------------------------------------------------------------------------------
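
The post-processing in insert_pipe can be exercised on its own by feeding it a hand-made class vector; the array below is hypothetical model output, not a real prediction (assumes the repository root is on sys.path):

    import numpy as np
    from utils import insert_pipe

    sentence = 'ฉันกินข้าว'                              # "I eat rice", 10 characters
    classes = np.array([0, 1, 2, 0, 1, 2, 0, 1, 1, 2])  # 0 = word beginning (class B)
    print(insert_pipe(sentence, classes, len(sentence)))
    # -> '|ฉัน|กิน|ข้าว'  (a pipe inserted before each predicted word start)
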
/word2vec/word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "from tensorflow.contrib import layers\n",
13 | "from tensorflow.contrib.learn import *\n",
14 | "from tensorflow.contrib import seq2seq\n",
15 | "from tensorflow.python.estimator.inputs import numpy_io\n",
16 | "import pickle\n",
17 | "import numpy as np\n",
18 | "import math\n",
19 | "import pandas as pd\n",
20 | "\n",
21 | "tf.logging.set_verbosity(tf.logging.INFO)"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "def word2vec(features, labels, mode, params):\n",
33 | " target = features['target']\n",
34 | "\n",
35 | " with tf.variable_scope(\"emb\"):\n",
36 | " target_weight = tf.get_variable(\"target_w\",\n",
37 | " initializer=tf.random_uniform([params['num_words'], params['emb_size']], -1.0,\n",
38 | " 1.0))\n",
39 | " context_weight = tf.get_variable(\"context_w\",\n",
40 | " initializer=tf.truncated_normal([params['num_words'], params['emb_size']]))\n",
41 | " context_bias = tf.get_variable(\"context_b\", initializer=tf.zeros(params['num_words']))\n",
42 | "\n",
43 | " target_emb = tf.nn.embedding_lookup(target_weight, target)\n",
44 | " loss = tf.reduce_mean(\n",
45 | " tf.nn.sampled_softmax_loss(weights=context_weight,\n",
46 | " biases=context_bias,\n",
47 | " labels=tf.expand_dims(labels, 1),\n",
48 | " inputs=target_emb,\n",
49 | " num_sampled=params['num_negative'],\n",
50 | " num_classes=params['num_words'],\n",
51 | " remove_accidental_hits=True))\n",
52 | "\n",
53 | " for v in tf.trainable_variables():\n",
54 | " tf.summary.histogram(v.name.replace(\":\", ''), v)\n",
55 | "\n",
56 | " train_op = layers.optimize_loss(\n",
57 | " loss=loss,\n",
58 | " global_step=tf.contrib.framework.get_global_step(),\n",
59 | " learning_rate=params[\"learning_rate\"],\n",
60 | " optimizer=tf.train.AdagradOptimizer,\n",
61 | " summaries=[\n",
62 | " \"learning_rate\",\n",
63 | " \"loss\",\n",
64 | " \"gradients\",\n",
65 | " \"gradient_norm\",\n",
66 | " ])\n",
67 | " return ModelFnOps(mode=mode, predictions=None, train_op=train_op, loss=loss)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": false,
75 | "scrolled": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "with open('_word2vec_data/target_list_novel', 'rb') as fp:\n",
80 | " target_list = pickle.load(fp)\n",
81 | "with open('_word2vec_data/context_list_novel', 'rb') as fp:\n",
82 | " context_list = pickle.load(fp)\n",
83 | "with open('_word2vec_data/indexer_novel', 'rb') as fp:\n",
84 | " indexer = pickle.load(fp)\n",
85 | "\n",
86 | "x = {'target': np.array(target_list)}\n",
87 | "y = np.array(context_list)\n",
88 | "\n",
89 | "model_params = dict(num_words=len(indexer), emb_size=64, num_negative=64, learning_rate=1.0)\n",
90 | "input_fn = numpy_io.numpy_input_fn(x, y, batch_size=512, shuffle=True, num_epochs=None)\n",
91 | "rnn_model = Estimator(model_fn=word2vec\n",
92 | " , params=model_params\n",
93 | " , model_dir=\"model/_word2vec\"\n",
94 | " , config=RunConfig(save_checkpoints_secs=30,\n",
95 | " keep_checkpoint_max=2))\n",
96 | "\n",
97 | "rnn_model.fit(input_fn=input_fn, steps=100000)"
98 | ]
99 | }
100 | ],
101 | "metadata": {
102 | "kernelspec": {
103 | "display_name": "Python 3",
104 | "language": "python",
105 | "name": "python3"
106 | },
107 | "language_info": {
108 | "codemirror_mode": {
109 | "name": "ipython",
110 | "version": 3
111 | },
112 | "file_extension": ".py",
113 | "mimetype": "text/x-python",
114 | "name": "python",
115 | "nbconvert_exporter": "python",
116 | "pygments_lexer": "ipython3",
117 | "version": "3.5.1"
118 | }
119 | },
120 | "nbformat": 4,
121 | "nbformat_minor": 2
122 | }
123 |
--------------------------------------------------------------------------------
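
Once fit has written checkpoints, the learned embedding matrix can be read back out of the Estimator and queried for nearest neighbours. A hedged sketch: it assumes the variable name 'emb/target_w' from the variable scope above, and that the query word is present in indexer.

    import numpy as np

    emb = rnn_model.get_variable_value('emb/target_w')      # [num_words, emb_size]
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)  # unit-normalise rows
    inv_indexer = {i: w for w, i in indexer.items()}

    def nearest(word, topn=5):
        sims = emb @ emb[indexer[word]]                     # cosine similarity to every word
        best = np.argsort(-sims)[1:topn + 1]                # skip the word itself
        return [(inv_indexer[i], float(sims[i])) for i in best]

    print(nearest('รัก'))   # hypothetical query word; any word in indexer works
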
/word2vec/word2vec_utils.py:
--------------------------------------------------------------------------------
1 | from functools import *
2 | from data.data_creator import *
3 | import itertools
4 | import glob
5 | import os
6 |
7 |
8 | def get_skip_gram(tokens, k1, k2):
9 | n_tokens = len(tokens)
10 | target_list = []
11 | context_list = []
12 | for i in range(n_tokens):
13 | if (i < k1) | ((i + k2 + 1) > len(tokens)):
14 | continue
15 | target = tokens[i]
16 | context = tokens[i - k1:i + k2 + 1]
17 | context.remove(target)
18 | for c in context:
19 | target_list.append(target)
20 | context_list.append(c)
21 | return target_list, context_list
22 |
23 |
24 | if __name__ == "__main__":
25 |
26 | files = glob.glob(os.path.join("data/_BEST/novel", '*.txt'))
27 | filtered_words = ['', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
28 | ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_']
29 | novels = []
30 | for file in files:
31 | words = []
32 | lines = open(file, 'r', encoding='utf-8')
33 | for line in lines:
34 | line = reduce(lambda a, kv: a.replace(*kv), list(MARKS.items()) + [('\n', ''), ('+', ''), (' ', '')], line)
35 | word = list(filter(lambda x: x not in filtered_words, line.split("|")))
36 | words.extend(word)
37 | novels.append(words)
38 |
39 | import pickle
40 | from collections import Counter
41 |
42 | all_words = list(itertools.chain.from_iterable(novels))
43 | word_count_dict = Counter(all_words)
44 | word_count = [(-c, w) for w, c in word_count_dict.items()]
45 | word_count.sort()
46 | all_words = [w for c, w in word_count if word_count_dict[w] > 5]
47 |
48 | indexer = {v: i for i, v in enumerate(all_words)}
49 |
50 | target_list = []
51 | context_list = []
52 | for n in novels:
53 | target, context = get_skip_gram([indexer[w] for w in n if word_count_dict[w] > 5], 4, 4)
54 | target_list.extend(target)
55 | context_list.extend(context)
56 |
57 | with open('target_list_novel', 'wb') as fp:
58 | pickle.dump(target_list, fp)
59 | with open('context_list_novel', 'wb') as fp:
60 | pickle.dump(context_list, fp)
61 | with open('indexer_novel', 'wb') as fp:
62 | pickle.dump(indexer, fp)
63 |
--------------------------------------------------------------------------------
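
A toy run of get_skip_gram makes the windowing explicit: with k1 = k2 = 2, only positions with two tokens on each side become targets, and each one is paired with every token in its window. The import assumes the repository root is on sys.path.

    from word2vec.word2vec_utils import get_skip_gram

    tokens = [10, 11, 12, 13, 14]
    targets, contexts = get_skip_gram(tokens, 2, 2)
    print(list(zip(targets, contexts)))
    # -> [(12, 10), (12, 11), (12, 13), (12, 14)]
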