├── .gitignore
├── README.md
├── data
│   └── .gitkeep
├── model.py
├── reader.py
├── resources
│   └── tagspace-model.png
└── train.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints
data/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TagSpace-tensorflow

![model image of TagSpace](https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/master/resources/tagspace-model.png)

TensorFlow implementation of Facebook's **#TagSpace**

You can read more about #TagSpace [here](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/).

Special thanks to the Facebook research team's [StarSpace](https://github.com/facebookresearch/Starspace) project, which was a really good reference.

## Key Concept

Instead of sampling 1000 random negative tags (presumably done in the paper for performance reasons), I use the worst-scoring positive tag and the best-scoring negative tag when computing the WARP loss. This is slower, but since we have far fewer tags (labels) than Facebook does, it works well enough. (A minimal numeric sketch of this loss follows this README section below.)

## Usage

Download the [AG News dataset](https://github.com/mhjabreel/CharCNN/tree/36791268d7eec96dc3330cf7eedbfb427524b604/data/ag_news_csv) so that it looks like this:

```
$ tree ./data
./data
└── ag_news_csv
    ├── classes.txt
    ├── readme.txt
    ├── test.csv
    ├── train.csv
    └── train_mini.csv
```

and then

```
$ python train.py
```

## Result

Accuracy 0.89 on the AG News test data (compared to 0.91 from StarSpace under the same conditions: 5 epochs, 10 dimensions).

## To-do list

- support multiple datasets
- improve performance
- adopt WARP sampling (currently only the WARP loss is implemented)
- add TensorBoard metrics
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/0d404fdbdd67ebfcdb0bfb1f407ea98113ed0f8a/data/.gitkeep
--------------------------------------------------------------------------------
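Before reading `model.py`, here is a minimal NumPy sketch (not part of the repository) of the margin loss described in the README's Key Concept: take the worst (minimum) score among the positive tags, the best (maximum) score among the negative tags, and hinge them with margin `m`, mirroring what `f_loss` in `model.py` computes. The scores, mask, and margin values below are made up for illustration.

```python
import numpy as np

def warp_hinge_loss(scores, tag_mask, m=0.05):
  """Mean over the batch of max(0, m - worst_positive + best_negative)."""
  # scores:   (batch, num_tags) document/tag similarity scores
  # tag_mask: (batch, num_tags) 1.0 for positive tags, 0.0 for negatives
  pos = np.where(tag_mask == 1.0, scores, np.inf).min(axis=1)   # worst positive
  neg = np.where(tag_mask == 0.0, scores, -np.inf).max(axis=1)  # best negative
  return np.maximum(0.0, m - pos + neg).mean()

# Toy batch of 2 documents and 4 tags (values are illustrative only).
scores = np.array([[0.9, 0.1, 0.2, 0.3],
                   [0.2, 0.6, 0.7, 0.1]])
mask = np.array([[1.0, 0.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0, 0.0]])
print(warp_hinge_loss(scores, mask))  # 0.075: doc 1 satisfies the margin, doc 2 violates it
```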
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tflearn

class TagSpace(object):
  def __init__(self):
    pass

  def create_model(self, l, tN, N=100000, d=10, K=5, H=1000, m=0.05, reuse=False):
    '''
    l:  document length (word pad length)
    tN: number of tags (labels)
    N:  vocabulary size (the paper uses 1,000,000; 100,000 here)
    d:  embedding dimension
    K:  convolution window size
    H:  number of convolution filters
    m:  margin for the hinge loss
    '''
    with tf.variable_scope('TagSpace', reuse=reuse):
      lr = tf.placeholder('float32', shape=[1], name='lr')
      doc = tf.placeholder('float32', shape=[None, l], name='doc')
      tag_flag = tf.placeholder('float32', shape=[None, tN], name='tag_flag')

      # Word and tag embeddings live in the same d-dimensional space.
      doc_embed = tflearn.embedding(doc, input_dim=N, output_dim=d)
      self.lt_embed = lt_embed = tf.Variable(tf.random_normal([tN, d], stddev=0.1))

      # Convolution + max-pooling over the word sequence, projected down to d dims.
      net = tflearn.conv_1d(doc_embed, H, K, activation='tanh')
      net = tflearn.max_pool_1d(net, K)
      net = tflearn.tanh(net)
      self.logit = logit = tflearn.fully_connected(net, d, activation=None)

      zero_vector = tf.zeros(shape=(1, 1), dtype=tf.float32)

      # Tile the document representation so it can be scored against every tag.
      logit = tf.expand_dims(logit, 1)
      logit_set = tf.concat([logit for i in range(tN)], axis=1)

      tag_flag_ex = tf.expand_dims(tag_flag, 2)
      tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)

      # Dot product between the document representation and every tag embedding.
      self.tag_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tf.ones_like(tg), lt_embed)), axis=2)

      # Worst (minimum) score among the positive tags.
      self.positive_logit = positive_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tg, lt_embed)), axis=2)
      self.f_positive = f_positive = tf.map_fn(lambda x: (tf.boolean_mask(x[0], x[1]), True), (positive_logit, tf.not_equal(positive_logit, zero_vector)))
      positive = tf.reduce_min(f_positive[0], axis=1)
      self.positive = positive

      # Best (maximum) score among the negative tags.
      tag_flag_ex = tf.expand_dims(1 - tag_flag, 2)
      tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)
      negative_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tg, lt_embed)), axis=2)

      self.f_negative = f_negative = tf.map_fn(lambda x: (tf.boolean_mask(x[0], x[1]), True), (negative_logit, tf.not_equal(negative_logit, zero_vector)))
      self.negative = negative = tf.reduce_max(f_negative[0], axis=1)

      # Hinge loss max(0, m - positive + negative), clamped from above for stability.
      self.f_loss = f_loss = tf.reduce_mean(tf.reduce_max([tf.reduce_min([tf.expand_dims(m - positive + negative, 1), tf.expand_dims(tf.fill([tf.shape(doc)[0]], 10e7), 1)], axis=0), tf.zeros([tf.shape(doc)[0], 1])], axis=0))
      params = tf.trainable_variables()

      # Adam with global-norm gradient clipping; the learning rate is fed at run time.
      opt = tf.train.AdamOptimizer(learning_rate=lr[0])
      gradients = tf.gradients(f_loss, params)
      clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
      self.op = opt.apply_gradients(zip(clipped_gradients, params))

  def train_opts(self):
    return [self.op, self.f_loss, self.logit, self.lt_embed, self.f_positive[0][0], self.f_negative[0][0]]

  def test_opts(self):
    return [self.tag_logit]
--------------------------------------------------------------------------------
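As a quick check of how the model is queried at inference time, here is a sketch (not part of the repository) that scores every tag for a batch of documents via `tag_logit`. The zero-filled `doc_batch` stands in for real padded word ids, and in real use you would restore trained weights instead of initializing fresh ones.

```python
import numpy as np
import tensorflow as tf
from model import TagSpace

word_pad_length, tag_size = 60, 5  # matches the defaults in train.py
model = TagSpace()

with tf.Session() as sess:
  model.create_model(word_pad_length, tag_size)
  sess.run(tf.global_variables_initializer())  # real use: restore a trained checkpoint

  doc_batch = np.zeros((1, word_pad_length))   # placeholder padded word ids
  scores = sess.run(model.tag_logit, feed_dict={
      'TagSpace/doc:0': doc_batch,
      'TagSpace/tag_flag:0': np.ones((1, tag_size)),  # score against every tag
  })
  print(np.argsort(-scores[0]))  # tag indices ranked best-first
```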
--------------------------------------------------------------------------------
/reader.py:
--------------------------------------------------------------------------------
import csv
import numpy as np

class VocabDict(object):
  '''Grows a word -> index mapping; index 0 is reserved for the empty/unknown token.'''
  def __init__(self):
    self.dict = {'': 0}

  def fit(self, word):
    if word not in self.dict:
      self.dict[word] = len(self.dict)

  def size(self):
    return len(self.dict)

  def transform(self, word):
    if word in self.dict:
      return self.dict[word]
    return 0

  def fit_and_transform(self, word):
    self.fit(word)
    return self.transform(word)

def to_categorical(y, target_dict, mode_transform=False):
  '''One-hot encode a list of label lists. With mode_transform=False the
  target_dict is grown as labels are seen; with True it is used read-only.'''
  result = []
  if mode_transform == False:
    l = len(np.unique(y)) + 1
  else:
    l = target_dict.size()

  for i, d in enumerate(y):
    tmp = [0.] * l
    for _i, _d in enumerate(d):
      if mode_transform == False:
        tmp[target_dict.fit_and_transform(_d)] = 1.
      else:
        tmp[target_dict.transform(_d)] = 1.
    result.append(tmp)
  return result

def load_csv(filepath, target_columns=[-1], columns_to_ignore=None,
        has_header=True, n_classes=None, target_dict=None, mode_transform=False):
  '''Split a CSV into data columns and one-hot targets. target_columns must be
  a list of column indices (the old default of a bare -1 was not iterable and
  would crash the membership tests below).'''
  if isinstance(target_columns, list) and len(target_columns) < 1:
    raise ValueError('target_columns must be a list with at least one value')

  from tensorflow.python.platform import gfile
  with gfile.Open(filepath) as csv_file:
    data_file = csv.reader(csv_file)
    if not columns_to_ignore:
      columns_to_ignore = []
    if has_header:
      header = next(data_file)

    data, target = [], []
    for i, d in enumerate(data_file):
      data.append([_d for _i, _d in enumerate(d) if _i not in target_columns and _i not in columns_to_ignore])
      target.append([_d for _i, _d in enumerate(d) if _i in target_columns])

    if target_dict is None:
      target_dict = VocabDict()
    target = to_categorical(target, target_dict=target_dict, mode_transform=mode_transform)
    return data, target
--------------------------------------------------------------------------------
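A minimal sketch of how `load_csv` and `VocabDict` cooperate, on a made-up two-row CSV with the same column layout as AG News (label, title, description). The file path and contents below are purely illustrative.

```python
from reader import load_csv, VocabDict

# Contents of ./data/toy.csv (illustrative):
#   "1","Some title","First document text"
#   "2","Other title","Second document text"
label_dict = VocabDict()
data, target = load_csv('./data/toy.csv', target_columns=[0],
                        columns_to_ignore=[1], has_header=False,
                        target_dict=label_dict)
print(data)    # [['First document text'], ['Second document text']]
print(target)  # one-hot rows: [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
```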
--------------------------------------------------------------------------------
/resources/tagspace-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/0d404fdbdd67ebfcdb0bfb1f407ea98113ed0f8a/resources/tagspace-model.png
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tflearn
import numpy as np
import re
from model import TagSpace
from sklearn.utils import shuffle
from reader import load_csv, VocabDict

'''
flags
'''

tf.app.flags.DEFINE_integer('num_epochs', 5, 'number of epochs to train')
tf.app.flags.DEFINE_integer('batch_size', 20, 'batch size to train in one step')
tf.app.flags.DEFINE_integer('labels', 5, 'number of label classes (4 AG News classes + index 0 reserved by VocabDict)')
tf.app.flags.DEFINE_integer('word_pad_length', 60, 'word pad length for training')
tf.app.flags.DEFINE_float('learn_rate', 1e-2, 'learning rate for training optimization')
tf.app.flags.DEFINE_boolean('shuffle', True, 'whether to shuffle the training data')

FLAGS = tf.app.flags.FLAGS

num_epochs = FLAGS.num_epochs
batch_size = FLAGS.batch_size
tag_size = FLAGS.labels
word_pad_length = FLAGS.word_pad_length
lr = FLAGS.learn_rate
# Decay the learning rate linearly to ~0 over the course of training.
lr_decr = (lr - (1e-9)) / num_epochs

TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)
def token_parse(iterator):
  # Each row from load_csv is a one-element list of column values, so this
  # returns the tokens of its first (and only) text column.
  for value in iterator:
    return TOKENIZER_RE.findall(value)

tokenizer = tflearn.data_utils.VocabularyProcessor(word_pad_length, tokenizer_fn=lambda tokens: [token_parse(x) for x in tokens])
label_dict = VocabDict()

def string_parser(arr, fit):
  if fit == False:
    return list(tokenizer.transform(arr))
  else:
    return list(tokenizer.fit_transform(arr))

model = TagSpace()
with tf.Session() as sess:
  # with tf.device('/cpu:0'):
  model.create_model(word_pad_length, tag_size)
  train_opts = model.train_opts()
  test_opts = model.test_opts()

  sess.run(tf.global_variables_initializer())

  words, tags = load_csv('./data/ag_news_csv/train.csv', target_columns=[0], columns_to_ignore=[1], target_dict=label_dict)
  if FLAGS.shuffle:
    words, tags = shuffle(words, tags)

  words = string_parser(words, fit=True)
  word_input = tflearn.data_utils.pad_sequences(words, maxlen=word_pad_length)
  total = len(word_input)
  step_print = int((total / batch_size) / 13)  # log roughly 13 times per epoch
  global_step = 0

  print('start training')
  for epoch_num in range(num_epochs):
    epoch_loss = 0
    step_loss = 0
    for i in range(int(total / batch_size)):
      batch_input, batch_tags = (word_input[i*batch_size:(i+1)*batch_size], tags[i*batch_size:(i+1)*batch_size])
      result = sess.run(train_opts, feed_dict={'TagSpace/doc:0': batch_input, 'TagSpace/tag_flag:0': batch_tags, 'TagSpace/lr:0': [lr]})
      step_loss += result[1]
      epoch_loss += result[1]
      if i % step_print == 0:
        print(f'step_log: (epoch: {epoch_num}, step: {i}, global_step: {global_step}), Loss: {step_loss/step_print}, Positive: {result[4]}, Negative: {result[5]}')
        step_loss = 0
      global_step += 1
    print(f'epoch_log: (epoch: {epoch_num}, global_step: {global_step}), Loss: {epoch_loss/(total/batch_size)}')
    lr -= lr_decr

  # Evaluation: reuse the vocabularies built during training. fit=False and
  # mode_transform=True keep them read-only (the original also fitted on test data).
  words, tags = load_csv('./data/ag_news_csv/test.csv', target_columns=[0], columns_to_ignore=[1], target_dict=label_dict, mode_transform=True)
  words = string_parser(words, fit=False)
  word_input = tflearn.data_utils.pad_sequences(words, maxlen=word_pad_length)
  total = len(word_input)
  rs = 0.

  for i in range(int(total / batch_size)):
    batch_input, batch_tags = (word_input[i*batch_size:(i+1)*batch_size], tags[i*batch_size:(i+1)*batch_size])
    # Feed an all-ones tag_flag so every tag is scored, then compare argmaxes.
    result = sess.run(test_opts, feed_dict={'TagSpace/doc:0': batch_input, 'TagSpace/tag_flag:0': np.ones_like(batch_tags)})
    arr = result[0]
    for j in range(len(batch_tags)):
      rs += np.sum(np.argmax(arr[j]) == np.argmax(batch_tags[j]))
  print(f'Test accuracy: {rs/total}')
--------------------------------------------------------------------------------
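train.py only reports aggregate accuracy. To inspect individual predictions, one could invert `label_dict` after training. The helper below is a hypothetical addition, not part of the repository, and assumes it runs right after the test loop above, so that `arr` and `label_dict` are in scope.

```python
# Hypothetical helper (not in the repository): invert VocabDict so the
# argmax indices from the test loop map back to the original label strings.
def index_to_label(vocab_dict):
  return {idx: word for word, idx in vocab_dict.dict.items()}

inv = index_to_label(label_dict)    # e.g. {0: '', 1: '3', 2: '4', ...} for AG News
predicted = int(np.argmax(arr[0]))  # `arr` as in the last test batch above
print(f'predicted label: {inv.get(predicted, "<unknown>")}')
```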