├── README.md
├── .gitignore
├── config.yml
├── text_fast.py
├── text_dnn.py
├── text_rnn.py
├── text_cnn.py
├── text_birnn.py
├── train_word2vec.py
├── text_rcnn.py
├── eval_pred.py
├── data_helpers.py
├── LICENSE
├── train.py
└── text_han.py


/README.md:
--------------------------------------------------------------------------------
1 | # text-classification
2 | Text classifiers including FastText, TextCNN, TextRNN, TextBiRNN, TextRCNN, HAN, etc.
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
 1 | word_embeddings:
 2 |   # Two word embedding algorithms (word2vec and glove) are supported.
 3 |   # Set `default` to an empty string to disable the word embeddings.
 4 |   default: word2vec
 5 |   word2vec:
 6 |     path: data/embeddings/GoogleNews-vectors-negative300.bin
 7 |     # path: data/embeddings/news/w2v.bin
 8 |     dimension: 300
 9 |     binary: True
10 |   glove:
11 |     path: data/embeddings/glove.6B/glove.6B.300d.txt
12 |     dimension: 300
13 |     length: 400000
14 | 
15 | datasets:
16 |   # Five datasets are currently supported (default: mrpolarity):
17 |   # Classification task: mrpolarity, 20newsgroup, localdata, financenews
18 |   # Regression task: scoringdocuments
19 |   default: mrpolarity
20 |   mrpolarity:
21 |     # The dataset includes following positive/negative movie reviews:
22 |     positive_data_file:
23 |       path: "data/rt-polaritydata/rt-polarity.pos"
24 |       info: "Data source for the positive data"
25 |     negative_data_file:
26 |       path: "data/rt-polaritydata/rt-polarity.neg"
27 |       info: "Data source for the negative data"
28 |   20newsgroup:
29 |     # The dataset includes following 20 newsgroups:
30 |     # alt.atheism, comp.windows.x, rec.sport.hockey, soc.religion.christian
31 |     # comp.graphics, misc.forsale, sci.crypt, talk.politics.guns
32 |     # comp.os.ms-windows.misc, rec.autos, sci.electronics, talk.politics.mideast
33 |     # comp.sys.ibm.pc.hardware, rec.motorcycles, sci.med, talk.politics.misc
34 |     # comp.sys.mac.hardware, rec.sport.baseball, sci.space, talk.religion.misc
35 |     categories:
36 |       - alt.atheism
37 |       - comp.graphics
38 |       - sci.med
39 |       - soc.religion.christian
40 |     shuffle: True
41 |     random_state: 42
42 |   localdata:
43 |     # Load text files with categories as subfolder names.
44 |     # Individual samples are assumed to be files stored in
45 |     # a two-level folder structure such as the following:
46 |     # container_folder/
47 |     #   category_1_folder/
48 |     #     file_1.txt file_2.txt ... file_42.txt
49 |     #   category_2_folder/
50 |     #     file_43.txt file_44.txt ...
51 |     #
52 |     # As an example, a SentenceCorpus dataset from
53 |     # https://archive.ics.uci.edu/ml/datasets/Sentence+Classification
54 |     # has been used. The dataset includes following 3 domains:
55 |     # arxiv, jdm and plos
56 |     container_path: data/SentenceCorpus
57 |     categories:
58 |     shuffle: True
59 |     random_state: 42
60 |   financenews:
61 |   # The dataset is labelled with 5 sentiments.
62 |     path: "data/financenews/financeNews12000_abstract.csv"
63 |     test_path: "data/financenews/news.csv"
64 |     info: "Data source for the finance news data"
65 |   scoringdocuments:
66 |   # The dataset contains documents together with their scores.
67 |     path: "data/scoringdocuments/scoringdocuments.csv"
68 |     test_path: "data/scoringdocuments/scoringdocuments.csv"
69 |     info: "Data source for the scored documents data"
70 | 


--------------------------------------------------------------------------------
/text_fast.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | 
 5 | class TextFast(object):
 6 |     """
 7 |     A FastText for text classification/regression.
 8 |     Uses an embedding layer, followed by a average, fully-connected (and softmax) layer.
 9 |     """
10 |     def __init__(
11 |       self, model_type, sequence_length, num_classes, vocab_size,
12 |       embedding_size, l2_reg_lambda=0.0):
13 | 
14 |         # Placeholders for input, output and dropout
15 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
16 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
17 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
18 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
19 | 
20 |         # Keeping track of l2 regularization loss (optional)
21 |         l2_loss = tf.constant(0.0)
22 | 
23 |         # Embedding layer
24 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
26 |             self.W = tf.Variable(
27 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
28 |                 name="W", trainable=True)
29 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) # [None, sequence_length, embedding_size]
30 | 
31 |         # Create a average layer (avg pooling)
32 |         with tf.name_scope("avg-pool"):
33 |             self.output = tf.reduce_mean(self.embedded_chars, axis=1)
34 | 
35 |         # Add dropout
36 |         with tf.name_scope("dropout"):
37 |             self.h_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
38 | 
39 |         # Final (unnormalized) scores and predictions
40 |         with tf.name_scope("output"):
41 |             W = tf.get_variable(
42 |                 "W",
43 |                 shape=[embedding_size, num_classes],
44 |                 initializer=tf.contrib.layers.xavier_initializer())
45 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
46 |             l2_loss += tf.nn.l2_loss(W)
47 |             l2_loss += tf.nn.l2_loss(b)
48 |             self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
49 |             if model_type == 'clf':
50 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
51 |             elif model_type == 'reg':
52 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
53 |                 self.predictions = tf.expand_dims(self.predictions, -1)
54 | 
55 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
56 |         with tf.name_scope("loss"):
57 |             if model_type == 'clf':
58 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
59 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
60 |             elif model_type == 'reg':
61 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
62 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
63 | 
64 |         # Accuracy
65 |         with tf.name_scope("accuracy"):
66 |             if model_type == 'clf':
67 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
68 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
69 |             elif model_type == 'reg':
70 |                 self.accuracy = tf.constant(0.0, name="accuracy")
71 | 


--------------------------------------------------------------------------------
/text_dnn.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | 
 5 | class TextDNN(object):
 6 |     """
 7 |     A deep neural network for text classification/regression.
 8 |     Uses an embedding layer, followed by several fully-connected (and softmax) layer.
 9 |     """
10 |     def __init__(
11 |       self, model_type, sequence_length, num_classes, vocab_size,
12 |       embedding_size, hidden_layers, hidden_size, l2_reg_lambda=0.0):
13 | 
14 |         # Placeholders for input, output and dropout
15 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
16 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
17 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
18 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
19 | 
20 |         # Keeping track of l2 regularization loss (optional)
21 |         l2_loss = tf.constant(0.0)
22 | 
23 |         # Embedding layer
24 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
26 |             self.W = tf.Variable(
27 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
28 |                 name="W", trainable=True)
29 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) # [None, sequence_length, embedding_size]
30 |             # concatenate the [None, sequence_length * embedding_size] as features
31 |             feature_size = sequence_length * embedding_size
32 |             x = tf.reshape(self.embedded_chars, [-1, feature_size])
33 | 
34 |         # Create fully-connected layers
35 |         with tf.name_scope("fully-connected"):
36 |             def fc(x, num_hidden_units, name, dtype=tf.float32):
37 |                 with tf.variable_scope(name):
38 |                     in_dim = x.get_shape().as_list()[-1]
39 |                     d = 1.0 / np.sqrt(in_dim)
40 |                     w = tf.get_variable('W', shape=[in_dim, num_hidden_units], dtype=dtype,
41 |                                         initializer=tf.random_uniform_initializer(-d, d))
42 |                     b = tf.get_variable('b', shape=[num_hidden_units], dtype=dtype,
43 |                                         initializer=tf.random_uniform_initializer(-d, d))
44 |                     output = tf.matmul(x, w) + b
45 |                     return output
46 | 
47 |             for i in range(hidden_layers):
48 |                 x = tf.nn.elu(fc(x, hidden_size, "l{}".format(i + 1)))
49 |             self.output = x
50 | 
51 |         # Add dropout
52 |         with tf.name_scope("dropout"):
53 |             self.h_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
54 | 
55 |         # Final (unnormalized) scores and predictions
56 |         with tf.name_scope("output"):
57 |             W = tf.get_variable(
58 |                 "W",
59 |                 shape=[hidden_size, num_classes],
60 |                 initializer=tf.contrib.layers.xavier_initializer())
61 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
62 |             l2_loss += tf.nn.l2_loss(W)
63 |             l2_loss += tf.nn.l2_loss(b)
64 |             self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
65 |             if model_type == 'clf':
66 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
67 |             elif model_type == 'reg':
68 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
69 |                 self.predictions = tf.expand_dims(self.predictions, -1)
70 | 
71 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
72 |         with tf.name_scope("loss"):
73 |             if model_type == 'clf':
74 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
75 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
76 |             elif model_type == 'reg':
77 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
78 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
79 | 
80 |         # Accuracy
81 |         with tf.name_scope("accuracy"):
82 |             if model_type == 'clf':
83 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
84 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
85 |             elif model_type == 'reg':
86 |                 self.accuracy = tf.constant(0.0, name="accuracy")
87 | 


--------------------------------------------------------------------------------
/text_rnn.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | 
 5 | class TextRNN(object):
 6 |     """
 7 |     A RNN for text classification/regression.
 8 |     Uses an embedding layer, followed by a recurrent, fully-connected (and softmax) layer.
 9 |     """
10 |     def __init__(
11 |       self, model_type, sequence_length, num_classes, vocab_size,
12 |       embedding_size, rnn_size, num_layers, l2_reg_lambda=0.5, model='lstm'):  # batch_size, 
13 | 
14 |         # Placeholders for input, output and dropout
15 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
16 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
17 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
18 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
19 | 
20 |         # Keeping track of l2 regularization loss (optional)
21 |         l2_loss = tf.constant(0.0)
22 | 
23 |         # Embedding layer
24 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
26 |             self.W = tf.Variable(
27 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
28 |                 name="W", trainable=True)
29 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)  # [None, sequence_length, embedding_size]
30 | 
31 |         # Create a recurrent layer for each rnn layer
32 |         with tf.name_scope(model):
33 |             if model == 'rnn':
34 |                 cell_fun = tf.nn.rnn_cell.BasicRNNCell
35 |             elif model == 'gru':
36 |                 cell_fun = tf.nn.rnn_cell.GRUCell
37 |             elif model == 'lstm':
38 |                 cell_fun = tf.nn.rnn_cell.BasicLSTMCell
39 |             
40 |             def get_a_cell():
41 |                 cell_tmp = cell_fun(rnn_size, state_is_tuple=True)
42 |                 # cell_tmp = tf.contrib.rnn.DropoutWrapper(cell_tmp, output_keep_prob=self.dropout_keep_prob)
43 |                 return cell_tmp
44 |             
45 |             # Stacking multi-layers
46 |             cell = tf.nn.rnn_cell.MultiRNNCell([get_a_cell() for _ in range(num_layers)])
47 |             # initial_state = cell.zero_state(None, tf.float32)
48 |             outputs, last_state = tf.nn.dynamic_rnn(cell, self.embedded_chars, dtype=tf.float32)  # , initial_state=initial_state
49 |             # --'outputs' is a tensor of shape [batch_size, max_time, cell_state_size]
50 |             # --'last_state' is a tensor of shape [batch_size, cell_state_size]
51 |             # self.output = outputs[:, -1, :]
52 |             self.output = tf.reduce_mean(outputs, axis=1)
53 |             # self.output = tf.reshape(outputs, [batch_size, -1])
54 | 
55 |         # Add dropout
56 |         with tf.name_scope("dropout"):
57 |             self.rnn_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
58 | 
59 |         # Final (unnormalized) scores and predictions
60 |         with tf.name_scope("output"):
61 |             W = tf.get_variable(
62 |                 "W",
63 |                 shape=[rnn_size, num_classes],  # sequence_length * 
64 |                 initializer=tf.contrib.layers.xavier_initializer())
65 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
66 |             l2_loss += tf.nn.l2_loss(W)
67 |             l2_loss += tf.nn.l2_loss(b)
68 |             self.scores = tf.nn.xw_plus_b(self.rnn_drop, W, b, name="scores")
69 |             if model_type == 'clf':
70 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
71 |             elif model_type == 'reg':
72 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
73 |                 self.predictions = tf.expand_dims(self.predictions, -1)
74 | 
75 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
76 |         with tf.name_scope("loss"):
77 |             if model_type == 'clf':
78 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
79 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
80 |             elif model_type == 'reg':
81 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
82 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
83 | 
84 |         # Accuracy
85 |         with tf.name_scope("accuracy"):
86 |             if model_type == 'clf':
87 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
88 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
89 |             elif model_type == 'reg':
90 |                 self.accuracy = tf.constant(0.0, name="accuracy")


--------------------------------------------------------------------------------
/text_cnn.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | 
 5 | class TextCNN(object):
 6 |     """
 7 |     A CNN for text classification/regression.
 8 |     Uses an embedding layer, followed by a convolutional, max-pooling, fully-connected (and softmax) layer.
 9 |     """
10 |     def __init__(
11 |       self, model_type, sequence_length, num_classes, vocab_size,
12 |       embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
13 | 
14 |         # Placeholders for input, output and dropout
15 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
16 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
17 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
18 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
19 | 
20 |         # Keeping track of l2 regularization loss (optional)
21 |         l2_loss = tf.constant(0.0)
22 | 
23 |         # Embedding layer
24 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
26 |             self.W = tf.Variable(
27 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
28 |                 name="W", trainable=True)
29 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) # [None, sequence_length, embedding_size]
30 |             self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1) # [None, sequence_length, embedding_size, 1]
31 | 
32 |         # Create a convolution + maxpool layer for each filter size
33 |         pooled_outputs = []
34 |         for i, filter_size in enumerate(filter_sizes):
35 |             with tf.name_scope("conv-maxpool-%s" % filter_size):
36 |                 # Convolution Layer
37 |                 filter_shape = [filter_size, embedding_size, 1, num_filters]
38 |                 W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
39 |                 b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
40 |                 conv = tf.nn.conv2d(
41 |                     self.embedded_chars_expanded,
42 |                     W,
43 |                     strides=[1, 1, 1, 1],
44 |                     padding="VALID",
45 |                     name="conv")
46 |                 # Apply nonlinearity
47 |                 h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
48 |                 # Maxpooling over the outputs
49 |                 pooled = tf.nn.max_pool(
50 |                     h,
51 |                     ksize=[1, sequence_length - filter_size + 1, 1, 1],
52 |                     strides=[1, 1, 1, 1],
53 |                     padding='VALID',
54 |                     name="pool")
55 |                 pooled_outputs.append(pooled)
56 | 
57 |         # Combine all the pooled features
58 |         num_filters_total = num_filters * len(filter_sizes)
59 |         self.h_pool = tf.concat(pooled_outputs, 3)
60 |         self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
61 | 
62 |         # Add dropout
63 |         with tf.name_scope("dropout"):
64 |             self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
65 | 
66 |         # Final (unnormalized) scores and predictions
67 |         with tf.name_scope("output"):
68 |             W = tf.get_variable(
69 |                 "W",
70 |                 shape=[num_filters_total, num_classes],
71 |                 initializer=tf.contrib.layers.xavier_initializer())
72 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
73 |             l2_loss += tf.nn.l2_loss(W)
74 |             l2_loss += tf.nn.l2_loss(b)
75 |             self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
76 |             if model_type == 'clf':
77 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
78 |             elif model_type == 'reg':
79 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
80 |                 self.predictions = tf.expand_dims(self.predictions, -1)
81 | 
82 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
83 |         with tf.name_scope("loss"):
84 |             if model_type == 'clf':
85 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
86 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
87 |             elif model_type == 'reg':
88 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
89 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
90 | 
91 |         # Accuracy
92 |         with tf.name_scope("accuracy"):
93 |             if model_type == 'clf':
94 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
95 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
96 |             elif model_type == 'reg':
97 |                 self.accuracy = tf.constant(0.0, name="accuracy")
98 | 


--------------------------------------------------------------------------------
/text_birnn.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | 
 4 | 
 5 | class TextBiRNN(object):
 6 |     """
 7 |     A Bi-directional RNN for text classification/regression.
 8 |     Uses an embedding layer, followed by a bi-directional recurrent, fully-connected (and softmax) layer.
 9 |     """
10 |     def __init__(
11 |       self, model_type, sequence_length, num_classes, vocab_size,
12 |       embedding_size, rnn_size, num_layers, l2_reg_lambda=0.5, model='lstm'):  # batch_size, 
13 | 
14 |         # Placeholders for input, output and dropout
15 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
16 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
17 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
18 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
19 | 
20 |         # Keeping track of l2 regularization loss (optional)
21 |         l2_loss = tf.constant(0.0)
22 | 
23 |         # Embedding layer
24 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
25 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
26 |             self.W = tf.Variable(
27 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
28 |                 name="W", trainable=True)
29 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)  # [None, sequence_length, embedding_size]
30 | 
31 |         # Create a bi-directional recurrent layer for each rnn layer
32 |         with tf.name_scope('bi'+model):
33 |             if model == 'rnn':
34 |                 cell_fun = tf.nn.rnn_cell.BasicRNNCell
35 |             elif model == 'gru':
36 |                 cell_fun = tf.nn.rnn_cell.GRUCell
37 |             elif model == 'lstm':
38 |                 cell_fun = tf.nn.rnn_cell.BasicLSTMCell
39 |             
40 |             def get_bi_cell():
41 |                 fw_cell = cell_fun(rnn_size, state_is_tuple=True) #forward direction cell
42 |                 bw_cell = cell_fun(rnn_size, state_is_tuple=True) #backward direction cell
43 |                 # fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=self.dropout_keep_prob)
44 |                 # bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=self.dropout_keep_prob)
45 |                 return fw_cell, bw_cell
46 |             
47 |             # Stacking multi-layers
48 |             # cell = tf.nn.rnn_cell.MultiRNNCell([get_bi_cell() for _ in range(num_layers)])
49 |             # initial_state = cell.zero_state(None, tf.float32)
50 | 
51 |             # Bi-lstm layer
52 |             fw_cell, bw_cell = get_bi_cell()
53 |             outputs, last_state = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, self.embedded_chars, dtype=tf.float32)
54 |             # outputs, last_state = tf.nn.dynamic_rnn(cell, self.embedded_chars, dtype=tf.float32)  # , initial_state=initial_state
55 |             # --'outputs' is a tensor of shape [batch_size, max_time, cell_state_size], [batch_size, max_time, cell_state_size]
56 |             # --'last_state' is a tensor of shape [batch_size, cell_state_size], [batch_size, cell_state_size]
57 |             outputs = tf.concat(outputs, axis=2)
58 |             # self.output = outputs[:, -1, :]
59 |             self.output = tf.reduce_mean(outputs, axis=1)
60 |             # self.output = tf.reshape(outputs, [batch_size, -1])
61 | 
62 |         # Add dropout
63 |         with tf.name_scope("dropout"):
64 |             self.rnn_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
65 | 
66 |         # Final (unnormalized) scores and predictions
67 |         with tf.name_scope("output"):
68 |             W = tf.get_variable(
69 |                 "W",
70 |                 shape=[rnn_size*2, num_classes],
71 |                 initializer=tf.contrib.layers.xavier_initializer())
72 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
73 |             l2_loss += tf.nn.l2_loss(W)
74 |             l2_loss += tf.nn.l2_loss(b)
75 |             self.scores = tf.nn.xw_plus_b(self.rnn_drop, W, b, name="scores")
76 |             if model_type == 'clf':
77 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
78 |             elif model_type == 'reg':
79 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
80 |                 self.predictions = tf.expand_dims(self.predictions, -1)
81 | 
82 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
83 |         with tf.name_scope("loss"):
84 |             if model_type == 'clf':
85 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
86 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
87 |             elif model_type == 'reg':
88 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
89 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
90 | 
91 |         # Accuracy
92 |         with tf.name_scope("accuracy"):
93 |             if model_type == 'clf':
94 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
95 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
96 |             elif model_type == 'reg':
97 |                 self.accuracy = tf.constant(0.0, name="accuracy")
98 | 


--------------------------------------------------------------------------------
/train_word2vec.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | modified and improved from https://github.com/zake7749/word2vec-tutorial
  6 | 1. Download the Chinese or English Wikipedia corpus
  7 | $ wget https://dumps.wikimedia.org/zhwiki/
  8 | 2. Extract the articles from wiki xml file
  9 | $ python3 wiki_to_txt.py zhwiki-20160820-pages-articles.xml.bz2
 10 | 3. Using OpenCC for transforming the text from Simplified Chinese to Traditional Chinese (Taiwan standard)
 11 | $ opencc -i wiki_texts.txt -o wiki_zh_tw.txt -c s2tw.json
 12 | 4. Using jieba for segmenting the texts and removing the stop words
 13 | $ python3 segment.py
 14 | 5. Using gensim's word2vec model for training
 15 | $ python3 train.py
 16 | 6. Testing the trained model
 17 | $ python3 demo.py
 18 | """
 19 | 
 20 | import logging
 21 | import os
 22 | import sys
 23 | import multiprocessing
 24 | 
 25 | import pandas as pd
 26 | import jieba
 27 | import jieba.posseg as pseg
 28 | from gensim.models import Word2Vec
 29 | from gensim.models.word2vec import LineSentence
 30 | from gensim.corpora import WikiCorpus
 31 | 
# Named logger for this module. Note that the functions below emit their
# progress messages through the root ``logging`` module, not through it.
logger = logging.getLogger("Word2Vec")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Directory holding every embedding artifact; resolved against the current
# working directory when the module is imported.
base_path = os.getcwd() + "/data/embeddings/news/"
# File locations shared by the pipeline steps (to_text -> segment -> train -> demo).
config = {
    # Wikipedia dump; only referenced by the commented-out code in to_text().
    "wiki_raw": "zhwiki-20160820-pages-articles.xml.bz2",
    # Raw corpus written by to_text() and read by segment().
    "input_raw": base_path+"raw_text.txt",
    # Segmented corpus written by segment() and read by train().
    "input_seg": base_path+"seg_text.txt",
    # gensim model written by train() and loaded by demo().
    "model_file": base_path+"w2v.model",
    # Binary word2vec-format vectors written by train().
    "word_vector": base_path+"w2v.bin",
}
 43 | 
 44 | 
 45 | def to_text():
 46 |     # wiki_corpus = WikiCorpus(config['wiki_raw'], dictionary={})
 47 |     # texts_num = 0
 48 |     # with open(config['input_raw'], 'w', encoding='utf-8') as output:
 49 |     #     for text in wiki_corpus.get_texts():
 50 |     #         output.write(' '.join(text) + '\n')
 51 |     #         texts_num += 1
 52 |     #         if texts_num % 10000 == 0:
 53 |     #             logging.info("Parsed %d th articles" % texts_num)
 54 | 
 55 |     df = pd.read_csv(os.getcwd() + '/data/financenews/news.csv')
 56 |     title = list(df['Title'].values)
 57 |     content = list(df['NewsContent'].values)
 58 |     raw_text = title + content
 59 | 
 60 |     texts_num = 0
 61 |     with open(config['input_raw'], 'w', encoding='utf-8') as output:
 62 |         for text in raw_text:
 63 |             text = str(text)
 64 |             output.write(text.strip() + '\n')
 65 |             texts_num += 1
 66 |             if texts_num % 10000 == 0:
 67 |                 logging.info("Parsed %d th articles" % texts_num)
 68 | 
 69 | 
 70 | def segment():
 71 |     # jieba custom setting.
 72 |     DATA_DIR = os.getcwd() + '/data/user_dict'
 73 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Company.txt'))
 74 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Concept.txt'))
 75 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Consumer.txt'))
 76 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Holder.txt'))
 77 |     jieba.load_userdict(os.path.join(DATA_DIR, 'HoldingCompany.txt'))
 78 |     jieba.load_userdict(os.path.join(DATA_DIR, 'MainComposition.txt'))
 79 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Manager.txt'))
 80 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Material.txt'))
 81 |     jieba.load_userdict(os.path.join(DATA_DIR, 'OtherCompetitor.txt'))
 82 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Supplier.txt'))
 83 |     jieba.load_userdict(os.path.join(DATA_DIR, 'Finance.txt'))
 84 | 
 85 |     # load stopwords set
 86 |     stopword_set = set()
 87 |     with open(os.getcwd()+'/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
 88 |         for stopword in stopwords:
 89 |             stopword_set.add(stopword.strip('\n'))
 90 | 
 91 |     output = open(config['input_seg'], 'w', encoding='utf-8')
 92 |     with open(config['input_raw'], 'r', encoding='utf-8') as content :
 93 |         for texts_num, line in enumerate(content):
 94 |             line = line.strip('\n')
 95 |             words = jieba.cut(line, cut_all=False)
 96 |             for word in words:
 97 |                 if word not in stopword_set:
 98 |                     output.write(word + ' ')
 99 |             output.write('\n')
100 | 
101 |             if (texts_num + 1) % 10000 == 0:
102 |                 logging.info("Segmented %d th articles" % (texts_num + 1))
103 |     output.close()
104 | 
105 | 
def train():
    """Train Word2Vec on the segmented corpus and persist model and vectors.

    The model is saved to ``config['model_file']`` and its vectors are
    additionally exported in binary word2vec format to
    ``config['word_vector']``.
    """
    corpus = LineSentence(config['input_seg'])
    # One worker per CPU reported by multiprocessing.
    worker_count = multiprocessing.cpu_count()
    w2v = Word2Vec(corpus, size=300, window=5, min_count=5, workers=worker_count)

    w2v.save(config['model_file'])
    w2v.wv.save_word2vec_format(config['word_vector'], binary=True)
    logging.info("Training process done")
114 | 
115 | 
def demo():
    """Run an interactive query loop against the trained model.

    Loads the model from ``config['model_file']`` and reads queries from
    standard input until end-of-file:

    * one word    -> its 10 most similar words
    * two words   -> their cosine similarity
    * three words -> the 10 best analogy candidates (more words are accepted,
      only the first three are used)

    Blank lines are ignored. Any error raised while answering a query (for
    example a word missing from the vocabulary) is printed and the loop
    continues.
    """
    model = Word2Vec.load(config['model_file'])
    # Query through the keyed vectors, the same attribute train() exports from.
    vectors = model.wv

    print("Provide three testing modes\n")
    print("Input a word, return 10 most similar words")
    print("Input two words, return their cosine similarity")
    print("Input three words, return the inference word")

    while True:
        try:
            query = input()
        except EOFError:
            # stdin was closed: without this the generic handler below would
            # print the error and spin forever.
            break

        q_list = query.split()
        if not q_list:
            continue

        try:
            if len(q_list) == 1:
                print("The 10 most similar words:")
                res = vectors.most_similar(q_list[0], topn=10)
                for item in res:
                    print(item[0]+","+str(item[1]))

            elif len(q_list) == 2:
                print("Cosine similarity:")
                res = vectors.similarity(q_list[0], q_list[1])
                print(res)

            else:
                print("%s to %s, is like %s to " % (q_list[0],q_list[2],q_list[1]))
                res = vectors.most_similar([q_list[0], q_list[1]], [q_list[2]], topn=10)
                for item in res:
                    print(item[0]+","+str(item[1]))
            print("----------------------------")
        except Exception as e:
            # Interactive boundary: report the failure and keep the session alive.
            print(repr(e))
148 | 
149 | 
150 | 
# Script entry point: run the whole pipeline in order (corpus dump,
# segmentation, training) and finish in the interactive demo loop.
if __name__ == '__main__':
    to_text()
    segment()
    train()
    demo()
156 | 


--------------------------------------------------------------------------------
/text_rcnn.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np
  3 | import copy
  4 | 
  5 | 
  6 | class TextRCNN(object):
  7 |     """
  8 |     A RNN-CNN for text classification/regression.
  9 |     Uses an embedding layer, followed by a recurrent, convolutional, fully-connected (and softmax) layer.
 10 |     """
 11 |     def __init__(
 12 |       self, model_type, sequence_length, num_classes, vocab_size,
 13 |       embedding_size, batch_size, l2_reg_lambda=0.5):  # batch_size, 
 14 | 
 15 |         # Placeholders for input, output and dropout
 16 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
 17 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
 18 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 19 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
 20 | 
 21 |         # Keeping track of l2 regularization loss (optional)
 22 |         l2_loss = tf.constant(0.0)
 23 | 
 24 |         # Embedding layer
 25 |         with tf.device('/cpu:0'), tf.name_scope("embedding"):
 26 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
 27 |             self.W = tf.Variable(
 28 |                 tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
 29 |                 name="W", trainable=True)
 30 |             self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)  # [None, sequence_length, embedding_size]
 31 | 
 32 |         # Create a recurrent-convolutional layer for each rnn layer
 33 |         with tf.name_scope('rcnn'):
 34 |             # define weights here
 35 |             self.initializer = tf.random_normal_initializer(stddev=0.1)
 36 |             self.left_side_first_word = tf.get_variable("left_side_first_word", shape=[batch_size, embedding_size], initializer=self.initializer)
 37 |             self.right_side_last_word = tf.get_variable("right_side_last_word", shape=[batch_size, embedding_size], initializer=self.initializer)
 38 |             self.W_l = tf.get_variable("W_l", shape=[embedding_size, embedding_size], initializer=self.initializer)
 39 |             self.W_r = tf.get_variable("W_r", shape=[embedding_size, embedding_size], initializer=self.initializer)
 40 |             self.W_sl = tf.get_variable("W_sl", shape=[embedding_size, embedding_size], initializer=self.initializer)
 41 |             self.W_sr = tf.get_variable("W_sr", shape=[embedding_size, embedding_size], initializer=self.initializer)
 42 |             
 43 |             # rnn-cnn layer
 44 |             def get_context_left(context_left, embedding_previous):
 45 |                 left_c = tf.matmul(context_left, self.W_l)  #context_left:[batch_size,embed_size]; W_l:[embed_size,embed_size]
 46 |                 left_e = tf.matmul(embedding_previous, self.W_sl)  #embedding_previous; [batch_size,embed_size]
 47 |                 left_h = left_c + left_e
 48 |                 context_left = tf.nn.relu(left_h, name="relu") # [None,embed_size]
 49 |                 return context_left
 50 |             def get_context_right(context_right, embedding_afterward):
 51 |                 right_c = tf.matmul(context_right, self.W_r)
 52 |                 right_e = tf.matmul(embedding_afterward, self.W_sr)
 53 |                 right_h = right_c + right_e
 54 |                 context_right = tf.nn.relu(right_h, name="relu")
 55 |                 return context_right
 56 | 
 57 |             #1. get splitted list of word embeddings
 58 |             #2. get list of context left
 59 |             embedded_words_split = tf.split(self.embedded_chars, sequence_length, axis=1) #sentence_length * [None,1,embed_size]
 60 |             embedded_words_squeezed = [tf.squeeze(x, axis=1) for x in embedded_words_split] #sentence_length * [None,embed_size]
 61 |             embedding_previous = self.left_side_first_word
 62 |             context_left_previous = tf.zeros((batch_size, embedding_size))
 63 |             context_left_list=[]
 64 |             for i, current_embedding_word in enumerate(embedded_words_squeezed): #sentence_length * [None,embed_size]
 65 |                 context_left = get_context_left(context_left_previous, embedding_previous) #[None,embed_size]
 66 |                 context_left_list.append(context_left) #append result to list
 67 |                 embedding_previous = current_embedding_word #assign embedding_previous
 68 |                 context_left_previous = context_left #assign context_left_previous
 69 |             #3. get context right
 70 |             embedded_words_squeezed2 = copy.copy(embedded_words_squeezed)
 71 |             embedded_words_squeezed2.reverse()
 72 |             embedding_afterward = self.right_side_last_word
 73 |             context_right_afterward = tf.zeros((batch_size, embedding_size))
 74 |             context_right_list=[]
 75 |             for j, current_embedding_word in enumerate(embedded_words_squeezed2):
 76 |                 context_right = get_context_right(context_right_afterward, embedding_afterward)
 77 |                 context_right_list.append(context_right)
 78 |                 embedding_afterward = current_embedding_word
 79 |                 context_right_afterward = context_right
 80 |             #4.ensemble left, embedding, right to output
 81 |             output_list=[]
 82 |             for index, current_embedding_word in enumerate(embedded_words_squeezed):
 83 |                 representation = tf.concat([context_left_list[index], current_embedding_word, context_right_list[index]], axis=1)
 84 |                 output_list.append(representation) #shape:sentence_length * [None,embed_size*3]
 85 |             #5. stack list to a tensor
 86 |             outputs = tf.stack(output_list, axis=1) #shape:[None,sentence_length,embed_size*3]
 87 |             #6. max pooling
 88 |             self.output = tf.reduce_max(outputs, axis=1) #shape:[None,embed_size*3]
 89 | 
 90 |         # Add dropout
 91 |         with tf.name_scope("dropout"):
 92 |             self.rnn_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
 93 | 
 94 |         # Final (unnormalized) scores and predictions
 95 |         with tf.name_scope("output"):
 96 |             W = tf.get_variable(
 97 |                 "W",
 98 |                 shape=[embedding_size*3, num_classes],
 99 |                 initializer=tf.contrib.layers.xavier_initializer())
100 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
101 |             l2_loss += tf.nn.l2_loss(W)
102 |             l2_loss += tf.nn.l2_loss(b)
103 |             self.scores = tf.nn.xw_plus_b(self.rnn_drop, W, b, name="scores")
104 |             if model_type == 'clf':
105 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
106 |             elif model_type == 'reg':
107 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
108 |                 self.predictions = tf.expand_dims(self.predictions, -1)
109 | 
110 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
111 |         with tf.name_scope("loss"):
112 |             if model_type == 'clf':
113 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
114 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
115 |             elif model_type == 'reg':
116 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
117 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
118 | 
119 |         # Accuracy
120 |         with tf.name_scope("accuracy"):
121 |             if model_type == 'clf':
122 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
123 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
124 |             elif model_type == 'reg':
125 |                 self.accuracy = tf.constant(0.0, name="accuracy")


--------------------------------------------------------------------------------
/eval_pred.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: UTF-8 -*-
  3 | 
  4 | """
  5 | $ ~/anaconda3/bin/python eval_pred.py --evaluate --checkpoint_dir="./runs/1523240176/checkpoints/"
  6 | $ ~/anaconda3/bin/python eval_pred.py --predict --checkpoint_dir="./runs/1523240176/checkpoints/"
  7 | """
  8 | 
  9 | import numpy as np
 10 | import pandas as pd
 11 | import os
 12 | import time
 13 | import csv
 14 | import yaml
 15 | import datetime
 16 | 
 17 | import tensorflow as tf
 18 | from tensorflow.contrib import learn
 19 | from sklearn import metrics
 20 | import jieba
 21 | import jieba.posseg as pseg
 22 | 
 23 | import data_helpers
 24 | 
 25 | 
 26 | def zh_tokenizer(iterator):
 27 |     for value in iterator:
 28 |         yield list(jieba.cut(value, cut_all=False))
 29 | 
 30 | def softmax(x):
 31 |     """Compute softmax values for each sets of scores in x."""
 32 |     if x.ndim == 1:
 33 |         x = x.reshape((1, -1))
 34 |     max_x = np.max(x, axis=1).reshape((-1, 1))
 35 |     exp_x = np.exp(x - max_x)
 36 |     return exp_x / np.sum(exp_x, axis=1).reshape((-1, 1))
 37 | 
 38 | with open("config.yml", 'r') as ymlfile:
 39 |     cfg = yaml.load(ymlfile)
 40 | 
# Parameters
# ==================================================
# Command-line flags for this script; see the module docstring for example
# invocations.

# Data Parameters
# Must match the model_type the checkpoint was trained with: it decides how
# labels are loaded and how scores are post-processed below.
tf.flags.DEFINE_string("model_type", "clf", "The type of model, classification or regression (default: clf)")  # clf/reg

# Evaluating Parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_string("checkpoint_dir", "", "Checkpoint directory from training run")
# --evaluate takes precedence over --predict when both are given (if/elif below).
tf.flags.DEFINE_boolean("evaluate", False, "Evaluate on all training data")
tf.flags.DEFINE_boolean("predict", False, "Predict on test dataset")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() and __flags are private members of the flags
# object -- presumably tied to an older TensorFlow release; confirm before upgrading.
FLAGS._parse_flags()
# Echo the effective flag values, sorted by name.
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")
 64 | 
 65 | 
 66 | # CHANGE THIS: Load data. Load your own evaluating set or testing set here
 67 | datasets = None
 68 | dataset_name = cfg["datasets"]["default"]
 69 | if FLAGS.evaluate:
 70 |     if dataset_name == "mrpolarity":
 71 |         datasets = data_helpers.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"],
 72 |                                              cfg["datasets"][dataset_name]["negative_data_file"]["path"])
 73 |     elif dataset_name == "20newsgroup":
 74 |         datasets = data_helpers.get_datasets_20newsgroup(subset="test",
 75 |                                               categories=cfg["datasets"][dataset_name]["categories"],
 76 |                                               shuffle=cfg["datasets"][dataset_name]["shuffle"],
 77 |                                               random_state=cfg["datasets"][dataset_name]["random_state"])
 78 |     elif dataset_name == "financenews":
 79 |         datasets = data_helpers.get_datasets_financenews(cfg["datasets"][dataset_name]["path"])
 80 |     elif dataset_name == "scoringdocuments":
 81 |         datasets = data_helpers.get_datasets_scoringdocuments(cfg["datasets"][dataset_name]["path"])
 82 | 
 83 |     if FLAGS.model_type == 'clf':
 84 |         x_raw, y_test = data_helpers.load_data_labels(datasets)
 85 |         y_test = np.argmax(y_test, axis=1)
 86 |     elif FLAGS.model_type == 'reg':
 87 |         x_raw, y_test = data_helpers.load_data_label(datasets)
 88 | 
 89 | elif FLAGS.predict:
 90 |     if dataset_name == "mrpolarity":
 91 |         datasets = {"target_names": ['positive_examples', 'negative_examples']}
 92 |         x_raw = ["a masterpiece four years in the making", "everything is off."]
 93 |         y_test = None
 94 |     elif dataset_name == "20newsgroup":
 95 |         datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
 96 |         x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
 97 |                  "I am in the market for a 24-bit graphics card for a PC"]
 98 |         y_test = None
 99 |     elif dataset_name == "financenews":
100 |         datasets = {"target_names": ['strong_neg_examples', 'weak_neg_examples', 'neutral_examples', 'weak_pos_examples', 'strong_pos_examples']}        
101 |         datasets = data_helpers.get_datasets_financenews_test(cfg["datasets"][dataset_name]["test_path"])
102 |         x_raw = data_helpers.load_data(datasets)
103 |         y_test = None
104 |     elif dataset_name == "scoringdocuments":
105 |         datasets = {"target_names": ['document_score']}
106 |         datasets = data_helpers.get_datasets_scoringdocuments_test(cfg["datasets"][dataset_name]["test_path"])
107 |         x_raw = data_helpers.load_data(datasets)
108 |         y_test = None
109 | 
# Map data into vocabulary
# The vocabulary is restored from the run directory, i.e. the parent of
# checkpoint_dir, and used to turn the raw texts into id rows.
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nPredicting...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        # input_y = graph.get_operation_by_name("input_y").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        # These op names must match the names given in the model's "output" scope.
        scores = graph.get_operation_by_name("output/scores").outputs[0]
 
        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch
        # batch_iter only yields full batches, so up to batch_size - 1
        # trailing examples receive no prediction.
        batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)

        # Collect the predictions here
        # all_predictions starts as an empty list and becomes a numpy array
        # on the first concatenate; all_probabilities stays None until the
        # first batch has been processed.
        all_predictions = []
        all_probabilities = None

        for index, x_test_batch in enumerate(batches):
            # Dropout is disabled (keep probability 1.0) for inference.
            batch_predictions_scores = sess.run([predictions, scores], {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions_scores[0]])
            if FLAGS.model_type == 'clf':
                # Classification: turn raw scores into per-class probabilities.
                probabilities = softmax(batch_predictions_scores[1])
            elif FLAGS.model_type == 'reg':
                # Regression: keep the raw scores.
                probabilities = batch_predictions_scores[1]
            if all_probabilities is not None:
                all_probabilities = np.concatenate([all_probabilities, probabilities])
            else:
                all_probabilities = probabilities
            time_str = datetime.datetime.now().isoformat()
            # The "step" printed here is the number of examples processed so far.
            print("{}: step {}".format(time_str, (index+1)*FLAGS.batch_size))
162 | 
# Print accuracy if y_test is defined
if y_test is not None and FLAGS.model_type == 'clf':
    # Only full batches were predicted, so keep exactly as many labels as
    # there are predictions.
    y_test = y_test[:len(all_predictions)]
    if len(y_test) == 0:
        # Fewer examples than one batch: nothing was predicted, and the
        # accuracy below would divide by zero.
        print("Total number of test examples: 0")
    else:
        correct_predictions = float(np.sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))
        print(metrics.classification_report(y_test, all_predictions, target_names=datasets['target_names']))
        print(metrics.confusion_matrix(y_test, all_predictions))
171 | 
# Save the evaluation result or testing result to a csv
# Drop the trailing texts that did not fill a whole batch so the rows line
# up with the predictions collected above.
x_raw = x_raw[:len(x_raw)-len(x_raw)%FLAGS.batch_size]
if FLAGS.model_type == 'clf':
    # column_stack coerces all three columns into one string array.
    # Label is the predicted class index plus one -- presumably to report
    # 1-based labels; confirm against the dataset's label scheme.
    predictions_human_readable = np.column_stack((np.array(x_raw),
                                                  [int(prediction)+1 for prediction in all_predictions],
                                                  ["{}".format(probability) for probability in all_probabilities]))
    predict_results = pd.DataFrame(predictions_human_readable, columns=['Content','Label','Probabilities'])
elif FLAGS.model_type == 'reg':
    # Value is the prediction tensor's output, Score the first raw score of each row.
    predictions_human_readable = np.column_stack((np.array(x_raw),
                                                  ["{}".format(prediction) for prediction in all_predictions],
                                                  [probability[0] for probability in all_probabilities]))
    predict_results = pd.DataFrame(predictions_human_readable, columns=['Content','Value','Score'])

# The CSV is written next to the checkpoints directory (its parent), under a
# name that reflects the mode.
if FLAGS.evaluate:
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "evaluation.csv")
elif FLAGS.predict:
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")    
print("Saving evaluation to {0}".format(out_path))
predict_results.to_csv(out_path, index=False)
191 | 


--------------------------------------------------------------------------------
/data_helpers.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import re
  4 | import itertools
  5 | from collections import Counter
  6 | from sklearn.datasets import fetch_20newsgroups
  7 | from sklearn.datasets import load_files
  8 | 
  9 | def clean_str(string):
 10 |     """
 11 |     Tokenization/string cleaning for all datasets except for SST.
 12 |     Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
 13 |     """
 14 |     string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
 15 |     string = re.sub(r"\'s", " \'s", string)
 16 |     string = re.sub(r"\'ve", " \'ve", string)
 17 |     string = re.sub(r"n\'t", " n\'t", string)
 18 |     string = re.sub(r"\'re", " \'re", string)
 19 |     string = re.sub(r"\'d", " \'d", string)
 20 |     string = re.sub(r"\'ll", " \'ll", string)
 21 |     string = re.sub(r",", " , ", string)
 22 |     string = re.sub(r"!", " ! ", string)
 23 |     string = re.sub(r"\(", " \( ", string)
 24 |     string = re.sub(r"\)", " \) ", string)
 25 |     string = re.sub(r"\?", " \? ", string)
 26 |     string = re.sub(r"\s{2,}", " ", string)
 27 |     return string.strip().lower()
 28 | 
 29 | 
 30 | def batch_iter(data, batch_size, num_epochs, shuffle=True):
 31 |     """
 32 |     Generates a batch iterator for a dataset.
 33 |     """
 34 |     data = np.array(data)
 35 |     data_size = len(data)
 36 |     # num_batches_per_epoch = int((len(data)-1)/batch_size) + 1
 37 |     num_batches_per_epoch = data_size // batch_size
 38 |     for epoch in range(num_epochs):
 39 |         # Shuffle the data at each epoch
 40 |         if shuffle:
 41 |             shuffle_indices = np.random.permutation(np.arange(data_size))
 42 |             shuffled_data = data[shuffle_indices]
 43 |         else:
 44 |             shuffled_data = data
 45 |         for batch_num in range(num_batches_per_epoch):
 46 |             start_index = batch_num * batch_size
 47 |             end_index = min((batch_num + 1) * batch_size, data_size)
 48 |             # if end_index - start_index != batch_size:
 49 |                 # yield shuffled_data[end_index-batch_size:end_index]
 50 |             yield shuffled_data[start_index:end_index]
 51 | 
 52 | 
 53 | def get_datasets_20newsgroup(subset='train', categories=None, shuffle=True, random_state=42):
 54 |     """
 55 |     Retrieve data from 20 newsgroups
 56 |     :param subset: train, test or all
 57 |     :param categories: List of newsgroup name
 58 |     :param shuffle: shuffle the list or not
 59 |     :param random_state: seed integer to shuffle the dataset
 60 |     :return: data and labels of the newsgroup
 61 |     """
 62 |     datasets = fetch_20newsgroups(subset=subset, categories=categories, shuffle=shuffle, random_state=random_state)
 63 |     return datasets
 64 | 
 65 | 
 66 | def get_datasets_mrpolarity(positive_data_file, negative_data_file):
 67 |     """
 68 |     Loads MR polarity data from files, splits the data into words and generates labels.
 69 |     Returns split sentences and labels.
 70 |     """
 71 |     # Load data from files
 72 |     positive_examples = list(open(positive_data_file, "r").readlines())
 73 |     positive_examples = [s.strip() for s in positive_examples]
 74 |     negative_examples = list(open(negative_data_file, "r").readlines())
 75 |     negative_examples = [s.strip() for s in negative_examples]
 76 | 
 77 |     datasets = dict()
 78 |     datasets['data'] = positive_examples + negative_examples
 79 |     target = [0 for x in positive_examples] + [1 for x in negative_examples]
 80 |     datasets['target'] = target
 81 |     datasets['target_names'] = ['positive_examples', 'negative_examples']
 82 |     return datasets
 83 | 
 84 | 
 85 | def get_datasets_localdata(container_path=None, categories=None, load_content=True,
 86 |                        encoding='utf-8', shuffle=True, random_state=42):
 87 |     """
 88 |     Load text files with categories as subfolder names.
 89 |     Individual samples are assumed to be files stored a two levels folder structure.
 90 |     :param container_path: The path of the container
 91 |     :param categories: List of classes to choose, all classes are chosen by default (if empty or omitted)
 92 |     :param shuffle: shuffle the list or not
 93 |     :param random_state: seed integer to shuffle the dataset
 94 |     :return: data and labels of the dataset
 95 |     """
 96 |     datasets = load_files(container_path=container_path, categories=categories,
 97 |                           load_content=load_content, shuffle=shuffle, encoding=encoding,
 98 |                           random_state=random_state)
 99 |     return datasets
100 | 
101 | 
def get_datasets_financenews(data_file):
    """
    Loads finance news data from files, splits the data into sentences and generates labels.
    Returns split sentences and labels.

    Rows are grouped by their ``score`` column (1 to 5, in that order); a row
    with score ``s`` gets target ``s - 1``. Rows with any other score are
    ignored.
    """
    frame = pd.read_csv(data_file)

    text_column = 'Title'  # Abstract
    class_names = ['strong_neg_examples', 'weak_neg_examples', 'neutral_examples',
                   'weak_pos_examples', 'strong_pos_examples']

    texts = []
    target = []
    for class_index in range(len(class_names)):
        # Class k collects the rows whose score equals k + 1.
        class_rows = frame[frame['score'] == class_index + 1][text_column].values
        class_texts = [str(value).strip() for value in class_rows]
        texts = texts + class_texts
        target = target + [class_index for _ in class_texts]

    datasets = dict()
    datasets['data'] = texts
    datasets['target'] = target
    datasets['target_names'] = class_names
    return datasets
128 | 
129 | 
def get_datasets_financenews_test(data_file):
    """
    Loads finance news data from files, splits the data into sentences.
    Returns split sentences.

    The result is a dict whose only key, 'data', holds the stripped string
    form of every value in the ``Title`` column.
    """
    frame = pd.read_csv(data_file)
    text_column = 'Title'  # Abstract
    titles = [str(value).strip() for value in frame[text_column].values]
    return {'data': titles}
144 | 
145 | 
def get_datasets_scoringdocuments(data_file):
    """
    Loads scored documents data from files, splits the data into sentences and generates labels.
    Returns split sentences and score label.

    'data' holds the stripped string form of the ``Abstract`` column and
    'target' the values of the ``Score`` column, in file order.
    """
    frame = pd.read_csv(data_file)
    text_column = 'Abstract'
    abstracts = [str(value).strip() for value in frame[text_column].values]
    scores = list(frame['Score'].values)
    return {
        'data': abstracts,
        'target': scores,
        'target_names': ['document_score'],
    }
162 | 
163 | 
def get_datasets_scoringdocuments_test(data_file):
    """
    Loads document data from files, splits the data into sentences.
    Returns split sentences.

    The result is a dict whose only key, 'data', holds the stripped string
    form of every value in the ``Abstract`` column.
    """
    frame = pd.read_csv(data_file)
    text_column = 'Abstract'
    abstracts = [str(value).strip() for value in frame[text_column].values]
    return {'data': abstracts}
178 | 
179 | 
def load_data_label(datasets):
    """
    Pair the texts with their regression targets.
    :param datasets: mapping with a 'data' sequence and a 'target' sequence
    :return: [texts, y] where texts is a new list copied from 'data' and y is
             a numpy array holding one single-element row per text
    """
    sentences = list(datasets['data'])
    # x_text = [clean_str(sent) for sent in x_text]
    # One [score] row per sentence; 'target' is indexed by sentence position.
    y = np.array([[datasets['target'][row]] for row, _ in enumerate(sentences)])
    return [sentences, y]
197 | 
198 | 
def load_data_labels(datasets):
    """
    Pair the texts with one-hot encoded class labels.
    :param datasets: mapping with 'data', 'target' (class index per text) and
                     'target_names' (one entry per class)
    :return: [texts, y] where texts is a new list copied from 'data' and y is
             a numpy array with one one-hot row per text
    """
    sentences = list(datasets['data'])
    # x_text = [clean_str(sent) for sent in x_text]

    def one_hot(position):
        # Row of zeros, one slot per class, with the target class set to 1.
        row = [0] * len(datasets['target_names'])
        row[datasets['target'][position]] = 1
        return row

    y = np.array([one_hot(position) for position in range(len(sentences))])
    return [sentences, y]
217 | 
218 | 
def load_data(datasets):
    """
    Return the texts of an unlabeled dataset.
    :param datasets: mapping with a 'data' iterable of texts
    :return: a new list containing the items of 'data'
    """
    # x_text = [clean_str(sent) for sent in x_text]
    return list(datasets['data'])
230 | 
231 | 
def load_embedding_vectors_word2vec(vocabulary, filename, binary):
    """
    Build an embedding matrix for ``vocabulary`` from a word2vec file.

    The file starts with a "<vocab_size> <vector_size>" header line followed by
    one entry per word, either in the binary or in the text word2vec format.

    :param vocabulary: object supporting ``len()`` and ``get(word)``; ``get``
                       returns the row index of a word. Index 0 and ``None``
                       are both treated as "word not in the vocabulary".
    :param filename: path of the word2vec file
    :param binary: True for the binary format, False for the text format
    :return: array of shape (len(vocabulary), vector_size); rows of words found
             in the file hold their pre-trained vector, all other rows keep a
             random uniform initialisation in [-0.25, 0.25)
    :raises EOFError: if the file ends before ``vocab_size`` entries were read
    :raises ValueError: if a text line does not have ``vector_size`` components
    """
    encoding = 'utf-8'
    with open(filename, "rb") as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        # initial matrix with random uniform
        embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
        if binary:
            binary_len = np.dtype('float32').itemsize * vector_size
            for _ in range(vocab_size):
                # The word is the byte run up to the next space; a newline left
                # over from the previous entry is dropped.
                word_bytes = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':
                        word_bytes.append(ch)
                word = b''.join(word_bytes).decode(encoding)
                idx = vocabulary.get(word)
                # `None` must be rejected explicitly: indexing with None would
                # broadcast the vector over the whole matrix.
                if idx is not None and idx != 0:
                    raw = f.read(binary_len)
                    if len(raw) != binary_len:
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    embedding_vectors[idx] = np.frombuffer(raw, dtype='float32')
                else:
                    # Unknown word: skip its vector without decoding it.
                    f.seek(binary_len, 1)
        else:
            for line_no in range(vocab_size):
                line = f.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = line.rstrip().decode(encoding).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                idx = vocabulary.get(parts[0])
                if idx is not None and idx != 0:
                    embedding_vectors[idx] = np.asarray(parts[1:], dtype='float32')
        return embedding_vectors
272 | 
273 | 
def load_embedding_vectors_glove(vocabulary, filename, vector_size):
    """
    Build an embedding matrix for ``vocabulary`` from a GloVe text file.

    Each non-blank line of the file is "<word> <v1> <v2> ... <vN>".

    :param vocabulary: object supporting ``len()`` and ``get(word)``; ``get``
                       returns the row index of a word. Index 0 and ``None``
                       are both treated as "word not in the vocabulary".
    :param filename: path of the GloVe file (read as UTF-8)
    :param vector_size: dimensionality of the vectors stored in the file
    :return: array of shape (len(vocabulary), vector_size); rows of words found
             in the file hold their pre-trained vector, all other rows keep a
             random uniform initialisation in [-0.25, 0.25)
    """
    # initial matrix with random uniform
    embedding_vectors = np.random.uniform(-0.25, 0.25, (len(vocabulary), vector_size))
    # Explicit encoding so the result does not depend on the platform default;
    # the context manager closes the file even if a line fails to parse.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            idx = vocabulary.get(values[0])
            # `None` must be rejected explicitly: indexing with None would
            # broadcast the vector over the whole matrix.
            if idx is not None and idx != 0:
                embedding_vectors[idx] = np.asarray(values[1:], dtype="float32")
    return embedding_vectors


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # -*- coding: UTF-8 -*-
  3 | 
  4 | """
  5 | $ ~/anaconda3/bin/python train.py
  6 | $ tensorboard --host localhost --port 6006 --logdir summaries/
  7 | """
  8 | 
  9 | import numpy as np
 10 | import pandas as pd
 11 | import os
 12 | import time
 13 | import math
 14 | import yaml
 15 | import datetime
 16 | import jieba
 17 | import jieba.posseg as pseg
 18 | import tensorflow as tf
 19 | from tensorflow.contrib import learn
 20 | from sklearn.model_selection import KFold
 21 | 
 22 | import data_helpers
 23 | from text_fast import TextFast
 24 | from text_dnn import TextDNN
 25 | from text_cnn import TextCNN
 26 | from text_rnn import TextRNN
 27 | from text_birnn import TextBiRNN
 28 | from text_rcnn import TextRCNN
 29 | from text_han import TextHAN
 30 | 
 31 | 
 32 | # Parameters
 33 | # ==================================================
 34 | 
 35 | # Data loading params
 36 | tf.flags.DEFINE_string("model_type", "clf", "The type of model, classification or regression (default: clf)")  # clf/reg
 37 | tf.flags.DEFINE_string("using_nn_type", "textcnn", "The type of neural network type (default: textcnn)")  # fasttext/textdnn/textcnn/textrnn/textbirnn/textrcnn/texthan
 38 | tf.flags.DEFINE_string("language_type", "en", "Text language type (default: en)")  # en/zh
 39 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
 40 | tf.flags.DEFINE_float("cross_val_folds", 10, "Split the training data to validation with k folds")
 41 | 
 42 | # Model Hyperparameters
 43 | tf.flags.DEFINE_boolean("enable_word_embeddings", True, "Enable/disable the word embedding (default: True)")
 44 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
 45 | tf.flags.DEFINE_integer("hidden_size", 128, "Number of hidden layer units (default: 128)")
 46 | tf.flags.DEFINE_integer("hidden_layers", 2, "Number of hidden layers (default: 2)")
 47 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
 48 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
 49 | tf.flags.DEFINE_integer("rnn_size", 300, "Number of units rnn_size (default: 300)")
 50 | tf.flags.DEFINE_integer("num_rnn_layers", 3, "Number of rnn layers (default: 3)")
 51 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 52 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")
 53 | 
 54 | # Training parameters
 55 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
 56 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)")
 57 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
 58 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
 59 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
 60 | tf.flags.DEFINE_float("grad_clip", 5, "grad clip to prevent gradient explode")
 61 | tf.flags.DEFINE_float("decay_coefficient", 2.5, "Decay coefficient (default: 2.5)")
 62 | 
 63 | # Misc Parameters
 64 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
 65 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
 66 | 
 67 | FLAGS = tf.flags.FLAGS
 68 | FLAGS._parse_flags()
 69 | print("\nParameters:")
 70 | for attr, value in sorted(FLAGS.__flags.items()):
 71 |     print("{}={}".format(attr.upper(), value))
 72 | print("")
 73 | 
 74 | 
 75 | with open("config.yml", 'r') as ymlfile:
 76 |     cfg = yaml.load(ymlfile)
 77 | 
 78 | dataset_name = cfg["datasets"]["default"]
 79 | if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
 80 |     embedding_name = cfg['word_embeddings']['default']
 81 |     embedding_dimension = cfg['word_embeddings'][embedding_name]['dimension']
 82 | else:
 83 |     embedding_dimension = FLAGS.embedding_dim
 84 | 
 85 | 
 86 | # Data Preparation
 87 | # ==================================================
 88 | 
 89 | # CHANGE THIS: Load data. Load your own training set here
 90 | print("Loading data...")
 91 | datasets = None
 92 | if dataset_name == "mrpolarity":
 93 |     datasets = data_helpers.get_datasets_mrpolarity(cfg["datasets"][dataset_name]["positive_data_file"]["path"],
 94 |                                                     cfg["datasets"][dataset_name]["negative_data_file"]["path"])
 95 | elif dataset_name == "20newsgroup":
 96 |     datasets = data_helpers.get_datasets_20newsgroup(subset="train",
 97 |                                                      categories=cfg["datasets"][dataset_name]["categories"],
 98 |                                                      shuffle=cfg["datasets"][dataset_name]["shuffle"],
 99 |                                                      random_state=cfg["datasets"][dataset_name]["random_state"])
100 | elif dataset_name == "localdata":
101 |     datasets = data_helpers.get_datasets_localdata(container_path=cfg["datasets"][dataset_name]["container_path"],
102 |                                                      categories=cfg["datasets"][dataset_name]["categories"],
103 |                                                      shuffle=cfg["datasets"][dataset_name]["shuffle"],
104 |                                                      random_state=cfg["datasets"][dataset_name]["random_state"])
105 | elif dataset_name == "financenews":
106 |     datasets = data_helpers.get_datasets_financenews(cfg["datasets"][dataset_name]["path"])
107 | elif dataset_name == "scoringdocuments":
108 |     datasets = data_helpers.get_datasets_scoringdocuments(cfg["datasets"][dataset_name]["path"])
109 | 
110 | if FLAGS.model_type == 'clf':
111 |     x_text, y = data_helpers.load_data_labels(datasets)
112 | elif FLAGS.model_type == 'reg':
113 |     x_text, y = data_helpers.load_data_label(datasets)
114 | 
# Build vocabulary
# The longest document (in tokens) fixes the length every document is
# padded/truncated to by the vocabulary processor.
if FLAGS.language_type == 'en':
    max_document_length = max(len(x.split(" ")) for x in x_text) + 1
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
elif FLAGS.language_type == 'zh':
    def zh_tokenizer(iterator):
        """Yield each document as a list of jieba tokens (accurate mode)."""
        for value in iterator:
            yield list(jieba.cut(value, cut_all=False))
    max_document_length = max(len(list(jieba.cut(x, cut_all=False))) for x in x_text)
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, tokenizer_fn=zh_tokenizer)
else:
    # Without this, max_document_length would be undefined and fail below
    # with a NameError.
    raise ValueError("Unsupported language_type: {!r} (expected 'en' or 'zh')".format(FLAGS.language_type))
print("Max document length: {:d}".format(max_document_length))
# Map every document to a fixed-length vector of word ids.
x = np.array(list(vocab_processor.fit_transform(x_text)))
127 | 
# Randomly shuffle data
# Fixed seed so the train/dev split is reproducible between runs.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set
# TODO: This is crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
# Slice with a non-negative position: when the dev sample count rounds down to
# zero, slicing with `-0` would leave the training set empty and put every
# sample in the dev set.
split_index = len(y) + dev_sample_index
x_train, x_dev = x_shuffled[:split_index], x_shuffled[split_index:]
y_train, y_dev = y_shuffled[:split_index], y_shuffled[split_index:]
# kfold = KFold(n_splits=FLAGS.cross_val_folds, shuffle=True, random_state=10)
# for train_index, dev_index in kfold.split(x_shuffled, y_shuffled):
#     x_train, x_dev = x_shuffled[train_index], x_shuffled[dev_index]
#     y_train, y_dev = y_shuffled[train_index], y_shuffled[dev_index]

# Release the unsplit arrays; only the train/dev views are used from here on.
del x, y, x_shuffled, y_shuffled

print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
148 | 
149 | 
150 | # Training
151 | # ==================================================
152 | 
153 | with tf.Graph().as_default():
154 |     session_conf = tf.ConfigProto(
155 |       allow_soft_placement=FLAGS.allow_soft_placement,
156 |       log_device_placement=FLAGS.log_device_placement)
157 |     sess = tf.Session(config=session_conf)
158 |     with sess.as_default():
159 |         if FLAGS.using_nn_type == 'fasttext':
160 |             nn = TextFast(
161 |                 model_type=FLAGS.model_type,
162 |                 sequence_length=x_train.shape[1],
163 |                 num_classes=y_train.shape[1],
164 |                 vocab_size=len(vocab_processor.vocabulary_),
165 |                 embedding_size=embedding_dimension,
166 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
167 |         elif FLAGS.using_nn_type == 'textdnn':
168 |             nn = TextDNN(
169 |                 model_type=FLAGS.model_type,
170 |                 sequence_length=x_train.shape[1],
171 |                 num_classes=y_train.shape[1],
172 |                 vocab_size=len(vocab_processor.vocabulary_),
173 |                 embedding_size=embedding_dimension,
174 |                 hidden_layers=FLAGS.hidden_layers,
175 |                 hidden_size=FLAGS.hidden_size,
176 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
177 |         elif FLAGS.using_nn_type == 'textcnn':
178 |             nn = TextCNN(
179 |                 model_type=FLAGS.model_type,
180 |                 sequence_length=x_train.shape[1],
181 |                 num_classes=y_train.shape[1],
182 |                 vocab_size=len(vocab_processor.vocabulary_),
183 |                 embedding_size=embedding_dimension,
184 |                 filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
185 |                 num_filters=FLAGS.num_filters,
186 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
187 |         elif FLAGS.using_nn_type == 'textrnn':
188 |             nn = TextRNN(
189 |                 model_type=FLAGS.model_type,
190 |                 sequence_length=x_train.shape[1],
191 |                 num_classes=y_train.shape[1],
192 |                 vocab_size=len(vocab_processor.vocabulary_),
193 |                 embedding_size=embedding_dimension,
194 |                 rnn_size=FLAGS.rnn_size,
195 |                 num_layers=FLAGS.num_rnn_layers,
196 |                 # batch_size=FLAGS.batch_size,
197 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
198 |         elif FLAGS.using_nn_type == 'textbirnn':
199 |             nn = TextBiRNN(
200 |                 model_type=FLAGS.model_type,
201 |                 sequence_length=x_train.shape[1],
202 |                 num_classes=y_train.shape[1],
203 |                 vocab_size=len(vocab_processor.vocabulary_),
204 |                 embedding_size=embedding_dimension,
205 |                 rnn_size=FLAGS.rnn_size,
206 |                 num_layers=FLAGS.num_rnn_layers,
207 |                 # batch_size=FLAGS.batch_size,
208 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
209 |         elif FLAGS.using_nn_type == 'textrcnn':
210 |             nn = TextRCNN(
211 |                 model_type=FLAGS.model_type,
212 |                 sequence_length=x_train.shape[1],
213 |                 num_classes=y_train.shape[1],
214 |                 vocab_size=len(vocab_processor.vocabulary_),
215 |                 embedding_size=embedding_dimension,
216 |                 batch_size=FLAGS.batch_size,
217 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
218 |         elif FLAGS.using_nn_type == 'texthan':
219 |             nn = TextHAN(
220 |                 model_type=FLAGS.model_type,
221 |                 sequence_length=x_train.shape[1],
222 |                 num_sentences=3,
223 |                 num_classes=y_train.shape[1],
224 |                 vocab_size=len(vocab_processor.vocabulary_),
225 |                 embedding_size=embedding_dimension,
226 |                 hidden_size=FLAGS.rnn_size,
227 |                 batch_size=FLAGS.batch_size,
228 |                 l2_reg_lambda=FLAGS.l2_reg_lambda)
229 | 
230 |         # Define Training procedure
231 |         global_step = tf.Variable(0, name="global_step", trainable=False)
232 |         optimizer = tf.train.AdamOptimizer(nn.learning_rate)
233 |         # Clip the gradient to avoid larger ones
234 |         tvars = tf.trainable_variables()
235 |         grads, _ = tf.clip_by_global_norm(tf.gradients(nn.loss, tvars), FLAGS.grad_clip)
236 |         # grads_and_vars = optimizer.compute_gradients(nn.loss)
237 |         grads_and_vars = tuple(zip(grads, tvars))
238 |         train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
239 |         
240 |         # Keep track of gradient values and sparsity (optional)
241 |         grad_summaries = []
242 |         for g, v in grads_and_vars:
243 |             if g is not None:
244 |                 grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
245 |                 sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
246 |                 grad_summaries.append(grad_hist_summary)
247 |                 grad_summaries.append(sparsity_summary)
248 |         grad_summaries_merged = tf.summary.merge(grad_summaries)
249 | 
250 |         # Output directory for models and summaries
251 |         timestamp = str(int(time.time()))
252 |         out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
253 |         print("Writing to {}\n".format(out_dir))
254 | 
255 |         # Summaries for loss and accuracy
256 |         loss_summary = tf.summary.scalar("loss", nn.loss)
257 |         acc_summary = tf.summary.scalar("accuracy", nn.accuracy)
258 | 
259 |         # Train Summaries
260 |         train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
261 |         train_summary_dir = os.path.join(out_dir, "summaries", "train")
262 |         train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
263 | 
264 |         # Dev summaries
265 |         dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
266 |         dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
267 |         dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
268 | 
269 |         # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
270 |         checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
271 |         checkpoint_prefix = os.path.join(checkpoint_dir, "model")
272 |         if not os.path.exists(checkpoint_dir):
273 |             os.makedirs(checkpoint_dir)
274 |         saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
275 | 
276 |         # Write vocabulary
277 |         vocab_processor.save(os.path.join(out_dir, "vocab"))
278 | 
279 |         # Initialize all variables
280 |         sess.run(tf.global_variables_initializer())
281 | 
282 |         # Initialize the word embedding vectors
283 |         if FLAGS.enable_word_embeddings and cfg['word_embeddings']['default'] is not None:
284 |             vocabulary = vocab_processor.vocabulary_
285 |             initW = None
286 |             if embedding_name == 'word2vec':
287 |                 # load embedding vectors from the word2vec
288 |                 print("Load word2vec file {}".format(cfg['word_embeddings']['word2vec']['path']))
289 |                 initW = data_helpers.load_embedding_vectors_word2vec(vocabulary,
290 |                                                                      cfg['word_embeddings']['word2vec']['path'],
291 |                                                                      cfg['word_embeddings']['word2vec']['binary'])
292 |                 print("word2vec file has been loaded")
293 |             elif embedding_name == 'glove':
294 |                 # load embedding vectors from the glove
295 |                 print("Load glove file {}".format(cfg['word_embeddings']['glove']['path']))
296 |                 initW = data_helpers.load_embedding_vectors_glove(vocabulary,
297 |                                                                   cfg['word_embeddings']['glove']['path'],
298 |                                                                   embedding_dimension)
299 |                 print("glove file has been loaded\n")
300 |             sess.run(nn.W.assign(initW))
301 | 
302 |         def train_step(x_batch, y_batch, learning_rate):
303 |             """
304 |             A single training step
305 |             """
306 |             feed_dict = {
307 |                 nn.input_x: x_batch,
308 |                 nn.input_y: y_batch,
309 |                 nn.dropout_keep_prob: FLAGS.dropout_keep_prob,
310 |                 nn.learning_rate: learning_rate
311 |             }
312 |             _, step, summaries, loss, accuracy = sess.run(
313 |                 [train_op, global_step, train_summary_op, nn.loss, nn.accuracy],
314 |                 feed_dict)
315 |             time_str = datetime.datetime.now().isoformat()
316 |             print("{}: step {}, loss {:g}, acc {:g}, lr {:g}".format(time_str, step, loss, accuracy, learning_rate))
317 |             train_summary_writer.add_summary(summaries, step)
318 | 
319 |         def dev_step(x_batch, y_batch, writer=None):
320 |             """
321 |             Evaluates model on a dev set
322 |             """
323 |             if FLAGS.using_nn_type in ['fasttext', 'textdnn', 'textcnn', 'textrnn', 'textbirnn']:
324 |                 feed_dict = {
325 |                     nn.input_x: x_batch,
326 |                     nn.input_y: y_batch,
327 |                     nn.dropout_keep_prob: 1.0
328 |                 }
329 |                 step, summaries, loss, accuracy = sess.run(
330 |                     [global_step, dev_summary_op, nn.loss, nn.accuracy],
331 |                     feed_dict)
332 |                 if writer:
333 |                     writer.add_summary(summaries, step)
334 |             elif FLAGS.using_nn_type in ['textrcnn', 'texthan']:
335 |                 loss_sum = 0
336 |                 accuracy_sum = 0
337 |                 summaries = None
338 |                 step = None
339 |                 batches_in_dev = len(y_batch) // FLAGS.batch_size
340 |                 for batch in range(batches_in_dev):
341 |                     start_index = batch * FLAGS.batch_size
342 |                     end_index = (batch + 1) * FLAGS.batch_size
343 |                     feed_dict = {
344 |                         nn.input_x: x_batch[start_index:end_index],
345 |                         nn.input_y: y_batch[start_index:end_index],
346 |                         nn.dropout_keep_prob: 1.0
347 |                     }
348 |                     step, summaries, loss, accuracy = sess.run(
349 |                         [global_step, dev_summary_op, nn.loss, nn.accuracy],
350 |                         feed_dict)
351 |                     loss_sum += loss
352 |                     accuracy_sum += accuracy
353 |                     if writer:
354 |                         writer.add_summary(summaries, step)
355 |                 loss = loss_sum / batches_in_dev
356 |                 accuracy = accuracy_sum / batches_in_dev
357 |             time_str = datetime.datetime.now().isoformat()
358 |             print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
359 | 
360 |         # Generate batches
361 |         batches = data_helpers.batch_iter(
362 |             list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
363 |         # It uses dynamic learning rate with a high value at the beginning to speed up the training
364 |         max_learning_rate = 0.005
365 |         min_learning_rate = 0.0001
366 |         decay_speed = FLAGS.decay_coefficient*len(y_train)/FLAGS.batch_size
367 |         # Training loop. For each batch...
368 |         counter = 0
369 |         for batch in batches:
370 |             learning_rate = min_learning_rate + (max_learning_rate - min_learning_rate) * math.exp(-counter/decay_speed)
371 |             counter += 1
372 |             x_batch, y_batch = zip(*batch)
373 |             train_step(x_batch, y_batch, learning_rate)
374 |             current_step = tf.train.global_step(sess, global_step)
375 |             if current_step % FLAGS.evaluate_every == 0:
376 |                 print("\nEvaluation:")
377 |                 dev_step(x_dev, y_dev, writer=dev_summary_writer)
378 |                 print("")
379 |             if current_step % FLAGS.checkpoint_every == 0:
380 |                 path = saver.save(sess, checkpoint_prefix, global_step=current_step)
381 |                 print("Saved model checkpoint to {}\n".format(path))
382 | 
383 |         # Save config to csv
384 |         attrs = []
385 |         values = []
386 |         for attr, value in sorted(FLAGS.__flags.items()):
387 |             attrs += [attr]
388 |             values += [value]
389 |         info = pd.DataFrame()
390 |         info['attr'] = attrs
391 |         info['value'] = values
392 |         info.to_csv(out_dir + '/config.csv', index=False)
393 | 


--------------------------------------------------------------------------------
/text_han.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import numpy as np
  3 | 
  4 | 
  5 | class TextHAN(object):
  6 |     """
  7 |     A Hierarchical Attention Network for text classification/regression.
    Uses an embedding layer, followed by a Word Encoder, Word Attention, Sentence Encoder, Sentence Attention, and a fully-connected (and softmax) layer.
  9 |     """
 10 |     def __init__(
 11 |       self, model_type, sequence_length, num_sentences, num_classes, vocab_size,
 12 |       embedding_size, hidden_size, batch_size, l2_reg_lambda=0.5):  # batch_size, 
 13 | 
 14 |         # parameters
 15 |         self.sequence_length = sequence_length
 16 |         self.num_sentences = num_sentences
 17 |         self.vocab_size = vocab_size
 18 |         self.embed_size = embedding_size
 19 |         self.hidden_size = hidden_size
 20 |         self.batch_size = batch_size
 21 |         self.initializer = tf.random_normal_initializer(stddev=0.1)
 22 | 
 23 |         # Placeholders for input, output and dropout
 24 |         self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
 25 |         self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
 26 |         self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
 27 |         self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
 28 |         self.sequence_length = int(self.sequence_length / self.num_sentences) # TODO
 29 | 
 30 |         # Keeping track of l2 regularization loss (optional)
 31 |         l2_loss = tf.constant(0.0)
 32 | 
 33 |         # Initialize weights
 34 |         self.instantiate_weights()
 35 |         
 36 |         # Create a han
 37 |         with tf.name_scope('han'):
 38 |             # 1.Word Encoder
 39 |             # 1.1 embedding of words
 40 |             input_x = tf.split(self.input_x, self.num_sentences, axis=1)  # a list. length:num_sentences.each element is:[None,self.sequence_length/num_sentences]
 41 |             input_x = tf.stack(input_x, axis=1)  # shape:[None,self.num_sentences,self.sequence_length/num_sentences]
 42 |             self.embedded_words = tf.nn.embedding_lookup(self.W, input_x)  # [None,num_sentences,sentence_length,embed_size]
 43 |             embedded_words_reshaped = tf.reshape(self.embedded_words, shape=[-1, self.sequence_length, self.embed_size])  # [batch_size*num_sentences,sentence_length,embed_size]
 44 |             # 1.2 forward gru
 45 |             hidden_state_forward_list = self.gru_forward_word_level(embedded_words_reshaped)  # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
 46 |             # 1.3 backward gru
 47 |             hidden_state_backward_list = self.gru_backward_word_level(embedded_words_reshaped)  # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
 48 |             # 1.4 concat forward hidden state and backward hidden state. hidden_state: a list.len:sentence_length,element:[batch_size*num_sentences,hidden_size*2]
 49 |             self.hidden_state = [tf.concat([h_forward, h_backward], axis=1) for h_forward, h_backward in
 50 |                                 zip(hidden_state_forward_list, hidden_state_backward_list)]  # hidden_state:list,len:sentence_length,element:[batch_size*num_sentences,hidden_size*2]
 51 | 
 52 |             # 2.Word Attention
 53 |             # for each sentence.
 54 |             sentence_representation = self.attention_word_level(self.hidden_state)  # output:[batch_size*num_sentences,hidden_size*2]
 55 |             sentence_representation = tf.reshape(sentence_representation, shape=[-1, self.num_sentences, self.hidden_size * 2])  # shape:[batch_size,num_sentences,hidden_size*2]
 56 |             #with tf.name_scope("dropout"):#TODO
 57 |             #    sentence_representation = tf.nn.dropout(sentence_representation,keep_prob=self.dropout_keep_prob)  # shape:[None,hidden_size*4]
 58 | 
 59 |             # 3.Sentence Encoder
 60 |             # 3.1) forward gru for sentence
 61 |             hidden_state_forward_sentences = self.gru_forward_sentence_level(sentence_representation)  # a list.length is sentence_length, each element is [None,hidden_size]
 62 |             # 3.2) backward gru for sentence
 63 |             hidden_state_backward_sentences = self.gru_backward_sentence_level(sentence_representation)  # a list,length is sentence_length, each element is [None,hidden_size]
 64 |             # 3.3) concat forward hidden state and backward hidden state
 65 |             # below hidden_state_sentence is a list,len:sentence_length,element:[None,hidden_size*2]
 66 |             self.hidden_state_sentence = [tf.concat([h_forward, h_backward], axis=1) for h_forward, h_backward in zip(hidden_state_forward_sentences, hidden_state_backward_sentences)]
 67 | 
 68 |             # 4.Sentence Attention
 69 |             document_representation = self.attention_sentence_level(self.hidden_state_sentence)  # shape:[None,hidden_size*4]
 70 |             self.output = document_representation
 71 | 
 72 |         # Add dropout
 73 |         with tf.name_scope("dropout"):
 74 |             self.rnn_drop = tf.nn.dropout(self.output, self.dropout_keep_prob)
 75 | 
 76 |         # Final (unnormalized) scores and predictions
 77 |         with tf.name_scope("output"):
 78 |             W = tf.get_variable(
 79 |                 "W",
 80 |                 shape=[hidden_size*4, num_classes],
 81 |                 initializer=tf.contrib.layers.xavier_initializer())
 82 |             b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
 83 |             l2_loss += tf.nn.l2_loss(W)
 84 |             l2_loss += tf.nn.l2_loss(b)
 85 |             self.scores = tf.nn.xw_plus_b(self.rnn_drop, W, b, name="scores")
 86 |             if model_type == 'clf':
 87 |                 self.predictions = tf.argmax(self.scores, 1, name="predictions")
 88 |             elif model_type == 'reg':
 89 |                 self.predictions = tf.reduce_max(self.scores, 1, name="predictions")
 90 |                 self.predictions = tf.expand_dims(self.predictions, -1)
 91 | 
 92 |         # Calculate mean cross-entropy loss, or root-mean-square error loss
 93 |         with tf.name_scope("loss"):
 94 |             if model_type == 'clf':
 95 |                 losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
 96 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
 97 |             elif model_type == 'reg':
 98 |                 losses = tf.sqrt(tf.losses.mean_squared_error(predictions=self.predictions, labels=self.input_y))
 99 |                 self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
100 | 
101 |         # Accuracy
102 |         with tf.name_scope("accuracy"):
103 |             if model_type == 'clf':
104 |                 correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
105 |                 self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
106 |             elif model_type == 'reg':
107 |                 self.accuracy = tf.constant(0.0, name="accuracy")
108 | 
109 |     
110 |     def instantiate_weights(self):
111 |         """define all weights here"""
112 |         # Embedding layer
113 |         with tf.name_scope("embedding"):
114 |             # When trainable parameter equals True the embedding vector is non-static, otherwise is static
115 |             self.W = tf.Variable(
116 |                 tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0),
117 |                 name="W", trainable=True)
118 | 
119 |         with tf.name_scope("gru_weights_word_level"):
120 |             self.W_z = tf.get_variable("W_z", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
121 |             self.U_z = tf.get_variable("U_z", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
122 |             self.b_z = tf.get_variable("b_z", shape=[self.hidden_size])
123 |             # GRU parameters:reset gate related
124 |             self.W_r = tf.get_variable("W_r", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
125 |             self.U_r = tf.get_variable("U_r", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
126 |             self.b_r = tf.get_variable("b_r", shape=[self.hidden_size])
127 | 
128 |             self.W_h = tf.get_variable("W_h", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
129 |             self.U_h = tf.get_variable("U_h", shape=[self.embed_size, self.hidden_size], initializer=self.initializer)
130 |             self.b_h = tf.get_variable("b_h", shape=[self.hidden_size])
131 | 
132 |         with tf.name_scope("gru_weights_sentence_level"):
133 |             self.W_z_sentence = tf.get_variable("W_z_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
134 |                                                 initializer=self.initializer)
135 |             self.U_z_sentence = tf.get_variable("U_z_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
136 |                                                 initializer=self.initializer)
137 |             self.b_z_sentence = tf.get_variable("b_z_sentence", shape=[self.hidden_size * 2])
138 |             # GRU parameters:reset gate related
139 |             self.W_r_sentence = tf.get_variable("W_r_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
140 |                                                 initializer=self.initializer)
141 |             self.U_r_sentence = tf.get_variable("U_r_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
142 |                                                 initializer=self.initializer)
143 |             self.b_r_sentence = tf.get_variable("b_r_sentence", shape=[self.hidden_size * 2])
144 | 
145 |             self.W_h_sentence = tf.get_variable("W_h_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
146 |                                                 initializer=self.initializer)
147 |             self.U_h_sentence = tf.get_variable("U_h_sentence", shape=[self.hidden_size * 2, self.hidden_size * 2],
148 |                                                 initializer=self.initializer)
149 |             self.b_h_sentence = tf.get_variable("b_h_sentence", shape=[self.hidden_size * 2])
150 | 
151 |         with tf.name_scope("attention"):
152 |             self.W_w_attention_word = tf.get_variable("W_w_attention_word",
153 |                                                       shape=[self.hidden_size * 2, self.hidden_size * 2],
154 |                                                       initializer=self.initializer)
155 |             self.W_b_attention_word = tf.get_variable("W_b_attention_word", shape=[self.hidden_size * 2])
156 | 
157 |             self.W_w_attention_sentence = tf.get_variable("W_w_attention_sentence",
158 |                                                           shape=[self.hidden_size * 4, self.hidden_size * 2],
159 |                                                           initializer=self.initializer)
160 |             self.W_b_attention_sentence = tf.get_variable("W_b_attention_sentence", shape=[self.hidden_size * 2])
161 |             self.context_vecotor_word = tf.get_variable("what_is_the_informative_word", shape=[self.hidden_size * 2],
162 |                                                         initializer=self.initializer)  # TODO o.k to use batch_size in first demension?
163 |             self.context_vecotor_sentence = tf.get_variable("what_is_the_informative_sentence",
164 |                                                             shape=[self.hidden_size * 2], initializer=self.initializer)
165 | 
166 |     
167 |     def gru_single_step_word_level(self, Xt, h_t_minus_1):
168 |         """
169 |         single step of gru for word level
170 |         :param Xt: Xt:[batch_size*num_sentences,embed_size]
171 |         :param h_t_minus_1:[batch_size*num_sentences,embed_size]
172 |         :return:
173 |         """
174 |         # update gate: decides how much past information is kept and how much new information is added.
175 |         z_t = tf.nn.sigmoid(tf.matmul(Xt, self.W_z) + tf.matmul(h_t_minus_1,
176 |                                                                 self.U_z) + self.b_z)  # z_t:[batch_size*num_sentences,self.hidden_size]
177 |         # reset gate: controls how much the past state contributes to the candidate state.
178 |         r_t = tf.nn.sigmoid(tf.matmul(Xt, self.W_r) + tf.matmul(h_t_minus_1,
179 |                                                                 self.U_r) + self.b_r)  # r_t:[batch_size*num_sentences,self.hidden_size]
180 |         # candiate state h_t~
181 |         h_t_candiate = tf.nn.tanh(tf.matmul(Xt, self.W_h) +r_t * (tf.matmul(h_t_minus_1, self.U_h)) + self.b_h)  # h_t_candiate:[batch_size*num_sentences,self.hidden_size]
182 |         # new state: a linear combine of pervious hidden state and the current new state h_t~
183 |         h_t = (1 - z_t) * h_t_minus_1 + z_t * h_t_candiate  # h_t:[batch_size*num_sentences,hidden_size]
184 |         return h_t
185 | 
186 |     def gru_single_step_sentence_level(self, Xt,
187 |                                        h_t_minus_1):  # Xt:[batch_size, hidden_size*2]; h_t:[batch_size, hidden_size*2]
188 |         """
189 |         single step of gru for sentence level
190 |         :param Xt:[batch_size, hidden_size*2]
191 |         :param h_t_minus_1:[batch_size, hidden_size*2]
192 |         :return:h_t:[batch_size,hidden_size]
193 |         """
194 |         # update gate: decides how much past information is kept and how much new information is added.
195 |         z_t = tf.nn.sigmoid(tf.matmul(Xt, self.W_z_sentence) + tf.matmul(h_t_minus_1,
196 |                                                                          self.U_z_sentence) + self.b_z_sentence)  # z_t:[batch_size,self.hidden_size]
197 |         # reset gate: controls how much the past state contributes to the candidate state.
198 |         r_t = tf.nn.sigmoid(tf.matmul(Xt, self.W_r_sentence) + tf.matmul(h_t_minus_1,
199 |                                                                          self.U_r_sentence) + self.b_r_sentence)  # r_t:[batch_size,self.hidden_size]
200 |         # candiate state h_t~
201 |         h_t_candiate = tf.nn.tanh(tf.matmul(Xt, self.W_h_sentence) + r_t * (
202 |         tf.matmul(h_t_minus_1, self.U_h_sentence)) + self.b_h_sentence)  # h_t_candiate:[batch_size,self.hidden_size]
203 |         # new state: a linear combine of pervious hidden state and the current new state h_t~
204 |         h_t = (1 - z_t) * h_t_minus_1 + z_t * h_t_candiate
205 |         return h_t
206 | 
207 |     # forward gru for first level: word levels
208 |     def gru_forward_word_level(self, embedded_words):
209 |         """
210 |         :param embedded_words:[batch_size*num_sentences,sentence_length,embed_size]
211 |         :return:forward hidden state: a list.length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
212 |         """
213 |         # split embedded_words
214 |         embedded_words_splitted = tf.split(embedded_words, self.sequence_length,
215 |                                            axis=1)  # it is a list,length is sentence_length, each element is [batch_size*num_sentences,1,embed_size]
216 |         embedded_words_squeeze = [tf.squeeze(x, axis=1) for x in
217 |                                   embedded_words_splitted]  # it is a list,length is sentence_length, each element is [batch_size*num_sentences,embed_size]
218 |         # demension_1=embedded_words_squeeze[0].get_shape().dims[0]
219 |         h_t = tf.ones((self.batch_size * self.num_sentences,
220 |                        self.hidden_size))  #TODO self.hidden_size h_t =int(tf.get_shape(embedded_words_squeeze[0])[0]) # tf.ones([self.batch_size*self.num_sentences, self.hidden_size]) # [batch_size*num_sentences,embed_size]
221 |         h_t_forward_list = []
222 |         for time_step, Xt in enumerate(embedded_words_squeeze):  # Xt: [batch_size*num_sentences,embed_size]
223 |             h_t = self.gru_single_step_word_level(Xt,h_t)  # [batch_size*num_sentences,embed_size]<------Xt:[batch_size*num_sentences,embed_size];h_t:[batch_size*num_sentences,embed_size]
224 |             h_t_forward_list.append(h_t)
225 |         return h_t_forward_list  # a list,length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
226 | 
227 |     # backward gru for first level: word level
228 |     def gru_backward_word_level(self, embedded_words):
229 |         """
230 |         :param   embedded_words:[batch_size*num_sentences,sentence_length,embed_size]
231 |         :return: backward hidden state:a list.length is sentence_length, each element is [batch_size*num_sentences,hidden_size]
232 |         """
233 |         # split embedded_words
234 |         embedded_words_splitted = tf.split(embedded_words, self.sequence_length,
235 |                                            axis=1)  # it is a list,length is sentence_length, each element is [batch_size*num_sentences,1,embed_size]
236 |         embedded_words_squeeze = [tf.squeeze(x, axis=1) for x in
237 |                                   embedded_words_splitted]  # it is a list,length is sentence_length, each element is [batch_size*num_sentences,embed_size]
238 |         embedded_words_squeeze.reverse()  # it is a list,length is sentence_length, each element is [batch_size*num_sentences,embed_size]
239 |         # demension_1=int(tf.get_shape(embedded_words_squeeze[0])[0]) #h_t = tf.ones([self.batch_size*self.num_sentences, self.hidden_size])
240 |         h_t = tf.ones((self.batch_size * self.num_sentences, self.hidden_size))
241 |         h_t_backward_list = []
242 |         for time_step, Xt in enumerate(embedded_words_squeeze):
243 |             h_t = self.gru_single_step_word_level(Xt, h_t)
244 |             h_t_backward_list.append(h_t)
245 |         h_t_backward_list.reverse() #ADD 2017.06.14
246 |         return h_t_backward_list
247 | 
248 |     # forward gru for second level: sentence level
249 |     def gru_forward_sentence_level(self, sentence_representation):
250 |         """
251 |         :param sentence_representation: [batch_size,num_sentences,hidden_size*2]
252 |         :return:forward hidden state: a list,length is num_sentences, each element is [batch_size,hidden_size]
253 |         """
254 |         # split embedded_words
255 |         sentence_representation_splitted = tf.split(sentence_representation, self.num_sentences,
256 |                                                     axis=1)  # it is a list.length is num_sentences,each element is [batch_size,1,hidden_size*2]
257 |         sentence_representation_squeeze = [tf.squeeze(x, axis=1) for x in
258 |                                            sentence_representation_splitted]  # it is a list.length is num_sentences,each element is [batch_size, hidden_size*2]
259 |         # demension_1 = int(tf.get_shape(sentence_representation_squeeze[0])[0]) #scalar: batch_size
260 |         h_t = tf.ones((self.batch_size, self.hidden_size * 2))  # TODO
261 |         h_t_forward_list = []
262 |         for time_step, Xt in enumerate(sentence_representation_squeeze):  # Xt:[batch_size, hidden_size*2]
263 |             h_t = self.gru_single_step_sentence_level(Xt,
264 |                                                       h_t)  # h_t:[batch_size,hidden_size]<---------Xt:[batch_size, hidden_size*2]; h_t:[batch_size, hidden_size*2]
265 |             h_t_forward_list.append(h_t)
266 |         return h_t_forward_list  # a list,length is num_sentences, each element is [batch_size,hidden_size]
267 | 
268 |     # backward gru for second level: sentence level
269 |     def gru_backward_sentence_level(self, sentence_representation):
270 |         """
271 |         :param sentence_representation: [batch_size,num_sentences,hidden_size*2]
272 |         :return:forward hidden state: a list,length is num_sentences, each element is [batch_size,hidden_size]
273 |         """
274 |         # split embedded_words
275 |         sentence_representation_splitted = tf.split(sentence_representation, self.num_sentences,
276 |                                                     axis=1)  # it is a list.length is num_sentences,each element is [batch_size,1,hidden_size*2]
277 |         sentence_representation_squeeze = [tf.squeeze(x, axis=1) for x in
278 |                                            sentence_representation_splitted]  # it is a list.length is num_sentences,each element is [batch_size, hidden_size*2]
279 |         sentence_representation_squeeze.reverse()
280 |         # demension_1 = int(tf.get_shape(sentence_representation_squeeze[0])[0])  # scalar: batch_size
281 |         h_t = tf.ones((self.batch_size, self.hidden_size * 2))
282 |         h_t_forward_list = []
283 |         for time_step, Xt in enumerate(sentence_representation_squeeze):  # Xt:[batch_size, hidden_size*2]
284 |             h_t = self.gru_single_step_sentence_level(Xt,h_t)  # h_t:[batch_size,hidden_size]<---------Xt:[batch_size, hidden_size*2]; h_t:[batch_size, hidden_size*2]
285 |             h_t_forward_list.append(h_t)
286 |         h_t_forward_list.reverse() #ADD 2017.06.14
287 |         return h_t_forward_list  # a list,length is num_sentences, each element is [batch_size,hidden_size]
288 | 
289 |     def attention_word_level(self, hidden_state):
290 |         """
291 |         input1:self.hidden_state: hidden_state:list,len:sentence_length,element:[batch_size*num_sentences,hidden_size*2]
292 |         input2:sentence level context vector:[batch_size*num_sentences,hidden_size*2]
293 |         :return:representation.shape:[batch_size*num_sentences,hidden_size*2]
294 |         """
295 |         hidden_state_ = tf.stack(hidden_state, axis=1)  # shape:[batch_size*num_sentences,sequence_length,hidden_size*2]
296 |         # 0) one layer of feed forward network
297 |         hidden_state_2 = tf.reshape(hidden_state_, shape=[-1,
298 |                                                           self.hidden_size * 2])  # shape:[batch_size*num_sentences*sequence_length,hidden_size*2]
299 |         # hidden_state_:[batch_size*num_sentences*sequence_length,hidden_size*2];W_w_attention_sentence:[,hidden_size*2,,hidden_size*2]
300 |         hidden_representation = tf.nn.tanh(tf.matmul(hidden_state_2,
301 |                                                      self.W_w_attention_word) + self.W_b_attention_word)  # shape:[batch_size*num_sentences*sequence_length,hidden_size*2]
302 |         hidden_representation = tf.reshape(hidden_representation, shape=[-1, self.sequence_length,
303 |                                                                          self.hidden_size * 2])  # shape:[batch_size*num_sentences,sequence_length,hidden_size*2]
304 |         # attention process:1.get logits for each word in the sentence. 2.get possibility distribution for each word in the sentence. 3.get weighted sum for the sentence as sentence representation.
305 |         # 1) get logits for each word in the sentence.
306 |         hidden_state_context_similiarity = tf.multiply(hidden_representation,
307 |                                                        self.context_vecotor_word)  # shape:[batch_size*num_sentences,sequence_length,hidden_size*2]
308 |         attention_logits = tf.reduce_sum(hidden_state_context_similiarity,
309 |                                          axis=2)  # shape:[batch_size*num_sentences,sequence_length]
310 |         # subtract max for numerical stability (softmax is shift invariant). tf.reduce_max:Computes the maximum of elements across dimensions of a tensor.
311 |         attention_logits_max = tf.reduce_max(attention_logits, axis=1,
312 |                                              keep_dims=True)  # shape:[batch_size*num_sentences,1]
313 |         # 2) get possibility distribution for each word in the sentence.
314 |         p_attention = tf.nn.softmax(
315 |             attention_logits - attention_logits_max)  # shape:[batch_size*num_sentences,sequence_length]
316 |         # 3) get weighted hidden state by attention vector
317 |         p_attention_expanded = tf.expand_dims(p_attention, axis=2)  # shape:[batch_size*num_sentences,sequence_length,1]
318 |         # below sentence_representation'shape:[batch_size*num_sentences,sequence_length,hidden_size*2]<----p_attention_expanded:[batch_size*num_sentences,sequence_length,1];hidden_state_:[batch_size*num_sentences,sequence_length,hidden_size*2]
319 |         sentence_representation = tf.multiply(p_attention_expanded,
320 |                                               hidden_state_)  # shape:[batch_size*num_sentences,sequence_length,hidden_size*2]
321 |         sentence_representation = tf.reduce_sum(sentence_representation,
322 |                                                 axis=1)  # shape:[batch_size*num_sentences,hidden_size*2]
323 |         return sentence_representation  # shape:[batch_size*num_sentences,hidden_size*2]
324 | 
325 |     def attention_sentence_level(self, hidden_state_sentence):
326 |         """
327 |         input1: hidden_state_sentence: a list,len:num_sentence,element:[None,hidden_size*4]
328 |         input2: sentence level context vector:[self.hidden_size*2]
329 |         :return:representation.shape:[None,hidden_size*4]
330 |         """
331 |         hidden_state_ = tf.stack(hidden_state_sentence, axis=1)  # shape:[None,num_sentence,hidden_size*4]
332 | 
333 |         # 0) one layer of feed forward
334 |         hidden_state_2 = tf.reshape(hidden_state_,
335 |                                     shape=[-1, self.hidden_size * 4])  # [None*num_sentence,hidden_size*4]
336 |         hidden_representation = tf.nn.tanh(tf.matmul(hidden_state_2,
337 |                                                      self.W_w_attention_sentence) + self.W_b_attention_sentence)  # shape:[None*num_sentence,hidden_size*2]
338 |         hidden_representation = tf.reshape(hidden_representation, shape=[-1, self.num_sentences,
339 |                                                                          self.hidden_size * 2])  # [None,num_sentence,hidden_size*2]
340 |         # attention process:1.get logits for each sentence in the doc.2.get possibility distribution for each sentence in the doc.3.get weighted sum for the sentences as doc representation.
341 |         # 1) get logits for each word in the sentence.
342 |         hidden_state_context_similiarity = tf.multiply(hidden_representation,
343 |                                                        self.context_vecotor_sentence)  # shape:[None,num_sentence,hidden_size*2]
344 |         attention_logits = tf.reduce_sum(hidden_state_context_similiarity,
345 |                                          axis=2)  # shape:[None,num_sentence]. that is get logit for each num_sentence.
346 |         # subtract max for numerical stability (softmax is shift invariant). tf.reduce_max:computes the maximum of elements across dimensions of a tensor.
347 |         attention_logits_max = tf.reduce_max(attention_logits, axis=1, keep_dims=True)  # shape:[None,1]
348 |         # 2) get possibility distribution for each word in the sentence.
349 |         p_attention = tf.nn.softmax(attention_logits - attention_logits_max)  # shape:[None,num_sentence]
350 |         # 3) get weighted hidden state by attention vector(sentence level)
351 |         p_attention_expanded = tf.expand_dims(p_attention, axis=2)  # shape:[None,num_sentence,1]
352 |         sentence_representation = tf.multiply(p_attention_expanded,
353 |                                               hidden_state_)  # shape:[None,num_sentence,hidden_size*2]<---p_attention_expanded:[None,num_sentence,1];hidden_state_:[None,num_sentence,hidden_size*2]
354 |         sentence_representation = tf.reduce_sum(sentence_representation, axis=1)  # shape:[None,hidden_size*2]
355 |         return sentence_representation  # shape:[None,hidden_size*2]
356 | 


--------------------------------------------------------------------------------