├── .gitignore
├── README.md
├── data
│   └── .gitkeep
├── model.py
├── reader.py
├── resources
│   └── tagspace-model.png
└── train.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints
data/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TagSpace-tensorflow

![model image of TagSpace](https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/master/resources/tagspace-model.png)

TensorFlow implementation of Facebook's **#TagSpace**

You can read more about #TagSpace [here](https://research.fb.com/publications/tagspace-semantic-embeddings-from-hashtags/).

Special thanks to the Facebook research team's [StarSpace](https://github.com/facebookresearch/Starspace) project, which was a really good reference.

## Key Concept

Instead of sampling 1000 random negative tags (presumably done in the paper for performance reasons), I use the worst-scoring positive tag and the best-scoring negative tag when computing the WARP loss. This is slower, but since we have far fewer tags (labels) than Facebook does, it works well enough. (A minimal numeric sketch of this loss follows this README section below.)

## Usage

Download the [AG News dataset](https://github.com/mhjabreel/CharCNN/tree/36791268d7eec96dc3330cf7eedbfb427524b604/data/ag_news_csv) so that it looks like this:

```
$ tree ./data
./data
└── ag_news_csv
    ├── classes.txt
    ├── readme.txt
    ├── test.csv
    ├── train.csv
    └── train_mini.csv
```

and then

```
$ python train.py
```

## Result

Accuracy 0.89 on the AG News test data (compared to 0.91 from StarSpace under the same conditions: 5 epochs, 10 dimensions).

## To-do list

- support multiple datasets
- improve performance
- adopt WARP sampling (currently only the WARP loss is implemented)
- add TensorBoard metrics
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/0d404fdbdd67ebfcdb0bfb1f407ea98113ed0f8a/data/.gitkeep
--------------------------------------------------------------------------------
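Before reading `model.py`, here is a minimal NumPy sketch (not part of the repository) of the margin loss described in the README's Key Concept: take the worst (minimum) score among the positive tags, the best (maximum) score among the negative tags, and hinge them with margin `m`, mirroring what `f_loss` in `model.py` computes. The scores, mask, and margin values below are made up for illustration.

```python
import numpy as np

def warp_hinge_loss(scores, tag_mask, m=0.05):
  """Mean over the batch of max(0, m - worst_positive + best_negative)."""
  # scores:   (batch, num_tags) document/tag similarity scores
  # tag_mask: (batch, num_tags) 1.0 for positive tags, 0.0 for negatives
  pos = np.where(tag_mask == 1.0, scores, np.inf).min(axis=1)   # worst positive
  neg = np.where(tag_mask == 0.0, scores, -np.inf).max(axis=1)  # best negative
  return np.maximum(0.0, m - pos + neg).mean()

# Toy batch of 2 documents and 4 tags (values are illustrative only).
scores = np.array([[0.9, 0.1, 0.2, 0.3],
                   [0.2, 0.6, 0.7, 0.1]])
mask = np.array([[1.0, 0.0, 0.0, 0.0],
                 [0.0, 1.0, 0.0, 0.0]])
print(warp_hinge_loss(scores, mask))  # 0.075: doc 1 satisfies the margin, doc 2 violates it
```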
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tflearn

class TagSpace(object):
  def __init__(self):
    pass

  def create_model(self, l, tN, N=100000, d=10, K=5, H=1000, m=0.05, reuse=False):
    '''
    l:  document length (word pad length)
    tN: number of tags (labels)
    N:  vocabulary size (the paper uses 1,000,000; 100,000 here)
    d:  embedding dimension
    K:  convolution window size
    H:  number of convolution filters
    m:  margin for the hinge loss
    '''
    with tf.variable_scope('TagSpace', reuse=reuse):
      lr = tf.placeholder('float32', shape=[1], name='lr')
      doc = tf.placeholder('float32', shape=[None, l], name='doc')
      tag_flag = tf.placeholder('float32', shape=[None, tN], name='tag_flag')

      # Word and tag embeddings live in the same d-dimensional space.
      doc_embed = tflearn.embedding(doc, input_dim=N, output_dim=d)
      self.lt_embed = lt_embed = tf.Variable(tf.random_normal([tN, d], stddev=0.1))

      # Convolution + max-pooling over the word sequence, projected down to d dims.
      net = tflearn.conv_1d(doc_embed, H, K, activation='tanh')
      net = tflearn.max_pool_1d(net, K)
      net = tflearn.tanh(net)
      self.logit = logit = tflearn.fully_connected(net, d, activation=None)

      zero_vector = tf.zeros(shape=(1, 1), dtype=tf.float32)

      # Tile the document representation so it can be scored against every tag.
      logit = tf.expand_dims(logit, 1)
      logit_set = tf.concat([logit for i in range(tN)], axis=1)

      tag_flag_ex = tf.expand_dims(tag_flag, 2)
      tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)

      # Dot product between the document representation and every tag embedding.
      self.tag_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tf.ones_like(tg), lt_embed)), axis=2)

      # Worst (minimum) score among the positive tags.
      self.positive_logit = positive_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tg, lt_embed)), axis=2)
      self.f_positive = f_positive = tf.map_fn(lambda x: (tf.boolean_mask(x[0], x[1]), True), (positive_logit, tf.not_equal(positive_logit, zero_vector)))
      positive = tf.reduce_min(f_positive[0], axis=1)
      self.positive = positive

      # Best (maximum) score among the negative tags.
      tag_flag_ex = tf.expand_dims(1 - tag_flag, 2)
      tg = tf.concat([tag_flag_ex for i in range(d)], axis=2)
      negative_logit = tf.reduce_sum(tf.multiply(logit_set, tf.multiply(tg, lt_embed)), axis=2)

      self.f_negative = f_negative = tf.map_fn(lambda x: (tf.boolean_mask(x[0], x[1]), True), (negative_logit, tf.not_equal(negative_logit, zero_vector)))
      self.negative = negative = tf.reduce_max(f_negative[0], axis=1)

      # Hinge loss max(0, m - positive + negative), clamped from above for stability.
      self.f_loss = f_loss = tf.reduce_mean(tf.reduce_max([tf.reduce_min([tf.expand_dims(m - positive + negative, 1), tf.expand_dims(tf.fill([tf.shape(doc)[0]], 10e7), 1)], axis=0), tf.zeros([tf.shape(doc)[0], 1])], axis=0))
      params = tf.trainable_variables()

      # Adam with global-norm gradient clipping; the learning rate is fed at run time.
      opt = tf.train.AdamOptimizer(learning_rate=lr[0])
      gradients = tf.gradients(f_loss, params)
      clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
      self.op = opt.apply_gradients(zip(clipped_gradients, params))

  def train_opts(self):
    return [self.op, self.f_loss, self.logit, self.lt_embed, self.f_positive[0][0], self.f_negative[0][0]]

  def test_opts(self):
    return [self.tag_logit]
--------------------------------------------------------------------------------
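As a quick check of how the model is queried at inference time, here is a sketch (not part of the repository) that scores every tag for a batch of documents via `tag_logit`. The zero-filled `doc_batch` stands in for real padded word ids, and in real use you would restore trained weights instead of initializing fresh ones.

```python
import numpy as np
import tensorflow as tf
from model import TagSpace

word_pad_length, tag_size = 60, 5  # matches the defaults in train.py
model = TagSpace()

with tf.Session() as sess:
  model.create_model(word_pad_length, tag_size)
  sess.run(tf.global_variables_initializer())  # real use: restore a trained checkpoint

  doc_batch = np.zeros((1, word_pad_length))   # placeholder padded word ids
  scores = sess.run(model.tag_logit, feed_dict={
      'TagSpace/doc:0': doc_batch,
      'TagSpace/tag_flag:0': np.ones((1, tag_size)),  # score against every tag
  })
  print(np.argsort(-scores[0]))  # tag indices ranked best-first
```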
--------------------------------------------------------------------------------
/reader.py:
--------------------------------------------------------------------------------
import csv
import numpy as np

class VocabDict(object):
  '''Grows a word -> index mapping; index 0 is reserved for the empty/unknown token.'''
  def __init__(self):
    self.dict = {'': 0}

  def fit(self, word):
    if word not in self.dict:
      self.dict[word] = len(self.dict)

  def size(self):
    return len(self.dict)

  def transform(self, word):
    if word in self.dict:
      return self.dict[word]
    return 0

  def fit_and_transform(self, word):
    self.fit(word)
    return self.transform(word)

def to_categorical(y, target_dict, mode_transform=False):
  '''One-hot encode a list of label lists. With mode_transform=False the
  target_dict is grown as labels are seen; with True it is used read-only.'''
  result = []
  if mode_transform == False:
    l = len(np.unique(y)) + 1
  else:
    l = target_dict.size()

  for i, d in enumerate(y):
    tmp = [0.] * l
    for _i, _d in enumerate(d):
      if mode_transform == False:
        tmp[target_dict.fit_and_transform(_d)] = 1.
      else:
        tmp[target_dict.transform(_d)] = 1.
    result.append(tmp)
  return result

def load_csv(filepath, target_columns=[-1], columns_to_ignore=None,
        has_header=True, n_classes=None, target_dict=None, mode_transform=False):
  '''Split a CSV into data columns and one-hot targets. target_columns must be
  a list of column indices (the old default of a bare -1 was not iterable and
  would crash the membership tests below).'''
  if isinstance(target_columns, list) and len(target_columns) < 1:
    raise ValueError('target_columns must be a list with at least one value')

  from tensorflow.python.platform import gfile
  with gfile.Open(filepath) as csv_file:
    data_file = csv.reader(csv_file)
    if not columns_to_ignore:
      columns_to_ignore = []
    if has_header:
      header = next(data_file)

    data, target = [], []
    for i, d in enumerate(data_file):
      data.append([_d for _i, _d in enumerate(d) if _i not in target_columns and _i not in columns_to_ignore])
      target.append([_d for _i, _d in enumerate(d) if _i in target_columns])

    if target_dict is None:
      target_dict = VocabDict()
    target = to_categorical(target, target_dict=target_dict, mode_transform=mode_transform)
    return data, target
--------------------------------------------------------------------------------
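A minimal sketch of how `load_csv` and `VocabDict` cooperate, on a made-up two-row CSV with the same column layout as AG News (label, title, description). The file path and contents below are purely illustrative.

```python
from reader import load_csv, VocabDict

# Contents of ./data/toy.csv (illustrative):
#   "1","Some title","First document text"
#   "2","Other title","Second document text"
label_dict = VocabDict()
data, target = load_csv('./data/toy.csv', target_columns=[0],
                        columns_to_ignore=[1], has_header=False,
                        target_dict=label_dict)
print(data)    # [['First document text'], ['Second document text']]
print(target)  # one-hot rows: [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]
```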
--------------------------------------------------------------------------------
/resources/tagspace-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flrngel/TagSpace-tensorflow/0d404fdbdd67ebfcdb0bfb1f407ea98113ed0f8a/resources/tagspace-model.png
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import tflearn
import numpy as np
import re
from model import TagSpace
from sklearn.utils import shuffle
from reader import load_csv, VocabDict

'''
flags
'''

tf.app.flags.DEFINE_integer('num_epochs', 5, 'number of epochs to train')
tf.app.flags.DEFINE_integer('batch_size', 20, 'batch size to train in one step')
tf.app.flags.DEFINE_integer('labels', 5, 'number of label classes (4 AG News classes + index 0 reserved by VocabDict)')
tf.app.flags.DEFINE_integer('word_pad_length', 60, 'word pad length for training')
tf.app.flags.DEFINE_float('learn_rate', 1e-2, 'learning rate for training optimization')
tf.app.flags.DEFINE_boolean('shuffle', True, 'whether to shuffle the training data')

FLAGS = tf.app.flags.FLAGS

num_epochs = FLAGS.num_epochs
batch_size = FLAGS.batch_size
tag_size = FLAGS.labels
word_pad_length = FLAGS.word_pad_length
lr = FLAGS.learn_rate
# Decay the learning rate linearly to ~0 over the course of training.
lr_decr = (lr - (1e-9)) / num_epochs

TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)
def token_parse(iterator):
  # Each row from load_csv is a one-element list of column values, so this
  # returns the tokens of its first (and only) text column.
  for value in iterator:
    return TOKENIZER_RE.findall(value)

tokenizer = tflearn.data_utils.VocabularyProcessor(word_pad_length, tokenizer_fn=lambda tokens: [token_parse(x) for x in tokens])
label_dict = VocabDict()

def string_parser(arr, fit):
  if fit == False:
    return list(tokenizer.transform(arr))
  else:
    return list(tokenizer.fit_transform(arr))

model = TagSpace()
with tf.Session() as sess:
  # with tf.device('/cpu:0'):
  model.create_model(word_pad_length, tag_size)
  train_opts = model.train_opts()
  test_opts = model.test_opts()

  sess.run(tf.global_variables_initializer())

  words, tags = load_csv('./data/ag_news_csv/train.csv', target_columns=[0], columns_to_ignore=[1], target_dict=label_dict)
  if FLAGS.shuffle:
    words, tags = shuffle(words, tags)

  words = string_parser(words, fit=True)
  word_input = tflearn.data_utils.pad_sequences(words, maxlen=word_pad_length)
  total = len(word_input)
  step_print = int((total / batch_size) / 13)  # log roughly 13 times per epoch
  global_step = 0

  print('start training')
  for epoch_num in range(num_epochs):
    epoch_loss = 0
    step_loss = 0
    for i in range(int(total / batch_size)):
      batch_input, batch_tags = (word_input[i*batch_size:(i+1)*batch_size], tags[i*batch_size:(i+1)*batch_size])
      result = sess.run(train_opts, feed_dict={'TagSpace/doc:0': batch_input, 'TagSpace/tag_flag:0': batch_tags, 'TagSpace/lr:0': [lr]})
      step_loss += result[1]
      epoch_loss += result[1]
      if i % step_print == 0:
        print(f'step_log: (epoch: {epoch_num}, step: {i}, global_step: {global_step}), Loss: {step_loss/step_print}, Positive: {result[4]}, Negative: {result[5]}')
        step_loss = 0
      global_step += 1
    print(f'epoch_log: (epoch: {epoch_num}, global_step: {global_step}), Loss: {epoch_loss/(total/batch_size)}')
    lr -= lr_decr

  # Evaluation: reuse the vocabularies built during training. fit=False and
  # mode_transform=True keep them read-only (the original also fitted on test data).
  words, tags = load_csv('./data/ag_news_csv/test.csv', target_columns=[0], columns_to_ignore=[1], target_dict=label_dict, mode_transform=True)
  words = string_parser(words, fit=False)
  word_input = tflearn.data_utils.pad_sequences(words, maxlen=word_pad_length)
  total = len(word_input)
  rs = 0.

  for i in range(int(total / batch_size)):
    batch_input, batch_tags = (word_input[i*batch_size:(i+1)*batch_size], tags[i*batch_size:(i+1)*batch_size])
    # Feed an all-ones tag_flag so every tag is scored, then compare argmaxes.
    result = sess.run(test_opts, feed_dict={'TagSpace/doc:0': batch_input, 'TagSpace/tag_flag:0': np.ones_like(batch_tags)})
    arr = result[0]
    for j in range(len(batch_tags)):
      rs += np.sum(np.argmax(arr[j]) == np.argmax(batch_tags[j]))
  print(f'Test accuracy: {rs/total}')
--------------------------------------------------------------------------------
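train.py only reports aggregate accuracy. To inspect individual predictions, one could invert `label_dict` after training. The helper below is a hypothetical addition, not part of the repository, and assumes it runs right after the test loop above, so that `arr` and `label_dict` are in scope.

```python
# Hypothetical helper (not in the repository): invert VocabDict so the
# argmax indices from the test loop map back to the original label strings.
def index_to_label(vocab_dict):
  return {idx: word for word, idx in vocab_dict.dict.items()}

inv = index_to_label(label_dict)    # e.g. {0: '', 1: '3', 2: '4', ...} for AG News
predicted = int(np.argmax(arr[0]))  # `arr` as in the last test batch above
print(f'predicted label: {inv.get(predicted, "<unknown>")}')
```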