├── .gitignore ├── README.md ├── build_question_graph.ipynb ├── build_question_graph.py ├── combined.py ├── combined_model.ipynb ├── combined_model_experiment.py ├── data_helpers.py ├── deepwalk.ipynb ├── deepwalk.py ├── embedding_labels.ipynb ├── eval_helpers.py ├── evaluate.py ├── extract_embedding_labels.py ├── fastxml_experiment.py ├── get_tensor_from_checkpoint.py ├── k_max_pooling.ipynb ├── kim_cnn.py ├── kim_cnn_experiment.py ├── print_data_set_property.py ├── process_posts.ipynb ├── process_posts.py ├── process_train_dev_test.py ├── project-slides.pdf ├── requirements.txt ├── sample_random_walks.py ├── scripts └── preprocessing_pipeline.sh ├── split_train_dev_test.py ├── test_data_helpers.py ├── test_eval_helpers.py ├── text_cnn.py ├── tf_gather.ipynb ├── tf_helpers.py ├── train.py └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | *~ 91 | *.prof 92 | *.lprof 93 | outputs/* 94 | data/* 95 | figs/* 96 | bad_examples/* 97 | 98 | MNIST_data 99 | 100 | eval_results/ 101 | runs -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # graph embedding + deep learning for multi-label text classification 2 | 3 | this project attempts to combine: 4 | 5 | - **graph embedding** 6 | - **ConvNet** 7 | 8 | for the purpose of **multi-label text classification**. 9 | 10 | I compared three methods on StackExchange datasets, where the goal is to predict the tags of posts. 
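The experiments report precision at k (p@1, p@3 and p@5 in the training logs) as the comparison metric. For intuition, here is a minimal NumPy sketch of that metric; it is a standalone illustration with made-up names, not the TensorFlow implementation in `eval_helpers.py`:

```python
import numpy as np

def precision_at_k(scores, true_label_lists, k):
    """Average fraction of the k highest-scoring labels that are correct."""
    per_doc = []
    for doc_scores, labels in zip(scores, true_label_lists):
        top_k = np.argsort(doc_scores)[::-1][:k]  # indices of the k best scores
        per_doc.append(len(set(top_k) & set(labels)) / k)
    return float(np.mean(per_doc))

# toy check: 2 documents, 4 candidate tags
scores = np.array([[0.9, 0.1, 0.7, 0.2],
                   [0.2, 0.8, 0.1, 0.6]])
true_label_lists = [[0, 2], [1]]
print(precision_at_k(scores, true_label_lists, 1))  # 1.0
print(precision_at_k(scores, true_label_lists, 3))  # 0.5
```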
11 | 12 | If you want to know more, here are [some slides](https://github.com/xiaohan2012/network_embedding/blob/master/project-slides.pdf) 13 | 14 | # utility scripts 15 | 16 | - `scripts/preprocessing_pipeline.sh`: all the preprocessing, data splitting, feature extraction, etc. 17 | - `sample_random_walks.py`: sample random walks on a graph 18 | - `extract_embedding_labels.py`: extract labels for embedding visualization 19 | 20 | # main scripts 21 | 22 | - `fastxml_experiment.py`: experiment for FastXML 23 | - `kim_cnn_experiment.py`: experiment for the CNN 24 | - `combined_model_experiment.py`: experiment for CNN + DeepWalk 25 | -------------------------------------------------------------------------------- /build_question_graph.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 36, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "fraciton of nodes in largest cc: 0.8441345365053322\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# coding: utf-8\n", 20 | "\n", 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import itertools\n", 24 | "import pickle as pkl\n", 25 | "\n", 26 | "from graph_tool import Graph, GraphView\n", 27 | "from graph_tool.topology import label_largest_component\n", 28 | "from collections import defaultdict\n", 29 | "from scipy import sparse as sp\n", 30 | "\n", 31 | "\n", 32 | "QUESTION = 1\n", 33 | "\n", 34 | "data_dir = 'data/stackexchange/datascience'\n", 35 | "df = pd.read_csv('{}/posts.csv'.format(data_dir), sep=',')\n", 36 | "\n", 37 | "# create a graph\n", 38 | "# each node is a question,\n", 39 | "# a question is associated with a list of users, including the author of both the question and answers\n", 40 | "\n", 41 | "# question to users mapping\n", 42 | "q2us = defaultdict(set)\n", 43 | "\n", 44 | "for i, r in df.iterrows():\n", 45 | "    pid = None\n", 46 | "    if r['PostTypeId'] == QUESTION:\n", 47 | "        pid = int(r['Id'])\n", 48 | "    else:\n", 49 | "        parend_id = r['ParentId']\n", 50 | "        if parend_id > 0:\n", 51 | "            pid = int(parend_id)\n", 52 | "\n", 53 | "    if pid:\n", 54 | "        uname, uid = r['OwnerDisplayName'], r['OwnerUserId']\n", 55 | "        if not np.isnan(uid):\n", 56 | "            q2us[pid].add(int(uid))\n", 57 | "        elif isinstance(uname, str):\n", 58 | "            q2us[pid].add(uname)\n", 59 | "\n", 60 | "\n", 61 | "id2q_map = dict(enumerate(q2us))\n", 62 | "q2id_map = dict(zip(id2q_map.values(), id2q_map.keys()))\n", 63 | "\n", 64 | "\n", 65 | "all_users = set(itertools.chain(*q2us.values()))\n", 66 | "id2u_map = dict(enumerate(all_users))\n", 67 | "u2id_map = dict(zip(id2u_map.values(), id2u_map.keys()))\n", 68 | "\n", 69 | "\n", 70 | "# create a bi-partite adjacency matrix, row->question, column->user\n", 71 | "n_entries = sum(map(len, q2us.values()))\n", 72 | "data = np.ones(n_entries)\n", 73 | "row_idx = []\n", 74 | "col_idx = []\n", 75 | "for q, us in q2us.items():\n", 76 | "    row_idx += [q2id_map[q]]*len(us)\n", 77 | "    col_idx += [u2id_map[u] for u in us]\n", 78 | "assert len(data) == len(row_idx) == len(col_idx)\n", 79 | "m = sp.csr_matrix((data, (row_idx, col_idx)), shape=(len(q2id_map), len(u2id_map)))\n", 80 | "\n", 81 | "\n", 82 | "qm = m * m.T # question adj matrix via unipartite projection\n", 83 | "\n", 84 | "g = Graph()\n", 85 | "edges = zip(*qm.nonzero())\n", 86 | "g.add_edge_list(edges)\n", 87 | "\n", 88 | "vfilt = label_largest_component(g)\n", 89 | 
"f = np.sum(vfilt.a) / len(vfilt.a)\n", 90 | "print('fraciton of nodes in largest cc: {}'.format(f))\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 37, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# node to question id\n", 102 | "prop_question_id = g.new_vertex_property('int')\n", 103 | "prop_question_id.a = np.array(list(id2q_map.values()))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 38, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "g.set_vertex_filter(vfilt)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 39, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "n2i = {n: i for i, n in enumerate(g.vertices())}\n", 126 | "i2n = dict(zip(n2i.values(), n2i.keys()))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 40, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "new_g = Graph()\n", 138 | "new_g.add_edge_list([(n2i[e.source()], n2i[e.target()]) for e in g.edges()])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 51, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "# update the question ids\n", 150 | "new_prop_question_id = new_g.new_vertex_property('int')\n", 151 | "new_prop_question_id.a = [prop_question_id[i2n[i]] for i in range(new_g.num_vertices())]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 44, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "new_g.vertex_properties['question_id'] = new_prop_question_id" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 45, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "saving largest CC in graph\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# extract the largest component\n", 182 | "\n", 183 | "print('saving largest CC in graph')\n", 184 | "new_g.save('{}/question_graph.gt'.format(data_dir))\n", 185 | "\n" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.5.2" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 2 210 | } 211 | -------------------------------------------------------------------------------- /build_question_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import tensorflow as tf 4 | import pandas as pd 5 | import numpy as np 6 | import pickle as pkl 7 | import itertools 8 | 9 | from graph_tool import Graph 10 | from graph_tool.topology import label_largest_component 11 | from collections import defaultdict 12 | from scipy import sparse as sp 13 | 14 | 15 | QUESTION = 1 16 | 17 | tf.flags.DEFINE_string('data_dir', '', 'directory of dataset') 18 | 19 | FLAGS = tf.flags.FLAGS 20 | FLAGS._parse_flags() 21 | 22 | data_dir = FLAGS.data_dir 23 | 
df = pd.read_csv('{}/posts.csv'.format(data_dir), sep=',') 24 | 25 | # create a graph 26 | # each node is a question, 27 | # a question is associated with a list of users, including the author of both the question and answers 28 | 29 | # question to users mapping 30 | q2us = defaultdict(set) 31 | 32 | for i, r in df.iterrows(): 33 | pid = None 34 | if r['PostTypeId'] == QUESTION: 35 | pid = int(r['Id']) 36 | else: 37 | parend_id = r['ParentId'] 38 | if parend_id > 0: 39 | pid = int(parend_id) 40 | 41 | if pid: 42 | uname, uid = r['OwnerDisplayName'], r['OwnerUserId'] 43 | if not np.isnan(uid): 44 | q2us[pid].add(int(uid)) 45 | elif isinstance(uname, str): 46 | q2us[pid].add(uname) 47 | 48 | 49 | id2q_map = dict(enumerate(q2us)) 50 | q2id_map = dict(zip(id2q_map.values(), id2q_map.keys())) 51 | 52 | 53 | all_users = set(itertools.chain(*q2us.values())) 54 | id2u_map = dict(enumerate(all_users)) 55 | u2id_map = dict(zip(id2u_map.values(), id2u_map.keys())) 56 | 57 | 58 | # create a bi-partite adjacency matrix, row->question, column->user 59 | n_entries = sum(map(len, q2us.values())) 60 | data = np.ones(n_entries) 61 | row_idx = [] 62 | col_idx = [] 63 | for q, us in q2us.items(): 64 | row_idx += [q2id_map[q]]*len(us) 65 | col_idx += [u2id_map[u] for u in us] 66 | assert len(data) == len(row_idx) == len(col_idx) 67 | m = sp.csr_matrix((data, (row_idx, col_idx)), shape=(len(q2id_map), len(u2id_map))) 68 | 69 | 70 | qm = m * m.T # question adj matrix via unipartite projection 71 | 72 | g = Graph() 73 | edges = zip(*qm.nonzero()) 74 | g.add_edge_list(edges) 75 | 76 | vfilt = label_largest_component(g) 77 | f = np.sum(vfilt.a) / len(vfilt.a) 78 | print('fraciton of nodes in largest cc: {}'.format(f)) 79 | 80 | 81 | prop_question_id = g.new_vertex_property('int') 82 | prop_question_id.a = np.array(list(id2q_map.values())) 83 | 84 | # focus on largest CC 85 | g.set_vertex_filter(vfilt) 86 | 87 | # re-index the graph 88 | # SO qustion: https://stackoverflow.com/questions/46264296/graph-tool-re-index-vertex-ids-to-be-consecutive-integers 89 | n2i = {n: i for i, n in enumerate(g.vertices())} 90 | i2n = dict(zip(n2i.values(), n2i.keys())) 91 | 92 | new_g = Graph() 93 | new_g.add_edge_list([(n2i[e.source()], n2i[e.target()]) for e in g.edges()]) 94 | 95 | 96 | # update question ids 97 | new_prop_question_id = new_g.new_vertex_property('int') 98 | new_prop_question_id.a = [prop_question_id[i2n[i]] for i in range(new_g.num_vertices())] 99 | new_g.vertex_properties['question_id'] = new_prop_question_id 100 | 101 | 102 | print('saving largest CC in graph') 103 | new_g.save('{}/question_graph.gt'.format(data_dir)) 104 | 105 | 106 | print('saving connected_question_ids') 107 | pkl.dump(list(new_prop_question_id.a), 108 | open('{}/connected_question_ids.pkl'.format(data_dir), 'wb')) 109 | -------------------------------------------------------------------------------- /combined.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from eval_helpers import tf_precision_at_k 3 | 4 | 5 | class Combined(): 6 | def __init__(self, cnn_model, dw_model): 7 | self.cnn, self.dw = cnn_model, dw_model 8 | print('cnn:', type(self.cnn)) 9 | print('dw:', type(self.dw)) 10 | 11 | # input document ids 12 | # to retrieve node embedding 13 | self.node_ids = tf.placeholder(dtype=tf.int32, shape=None, name="input_node_ids") 14 | self.l2_loss = tf.constant(0.0) 15 | 16 | self.add_output() 17 | self.add_losses() 18 | self.add_performance() 19 | 20 | def add_output(self): 
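        """Build the joint output layer.

        The CNN's dropped-out feature vector (cnn.num_filters_total dims) is
        concatenated with the looked-up DeepWalk node embedding
        (dw.embedding_size dims) and passed through a single fully-connected
        layer, producing `self.scores` of shape [batch_size, num_classes] and
        the argmax `self.predictions`.
        """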
21 | # redefine output 22 | # concatenate the filters and node embedding for classification 23 | with tf.name_scope("output"): 24 | input_length = self.cnn.num_filters_total + self.dw.embedding_size 25 | W = tf.get_variable( 26 | "W", 27 | shape=[input_length, 28 | self.cnn.num_classes], 29 | initializer=tf.contrib.layers.xavier_initializer()) 30 | b = tf.Variable(tf.constant(0.1, shape=[self.cnn.num_classes]), name="b") 31 | self.l2_loss += tf.nn.l2_loss(W) 32 | self.l2_loss += tf.nn.l2_loss(b) 33 | 34 | # look up the node embeddings 35 | node_embeddings = tf.nn.embedding_lookup(self.dw.normalized_embeddings, self.node_ids, 36 | name='node_embeddings') 37 | input_tensor = tf.concat([self.cnn.h_drop, node_embeddings], 38 | 1, name="input_concat") 39 | self.scores = tf.nn.xw_plus_b(input_tensor, W, b, name="scores") 40 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 41 | 42 | def add_losses(self): 43 | # CalculateMean cross-entropy loss 44 | with tf.name_scope("loss"): 45 | if self.cnn.loss_function == 'sigmoid': 46 | print('use sigmoid xentropy') 47 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.scores, 48 | labels=self.cnn.input_y_binary) 49 | elif self.cnn.loss_function == 'softmax': 50 | print('use softmax xentropy') 51 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, 52 | labels=self.cnn.input_y_binary) 53 | else: 54 | raise ValueError('invalid loss function') 55 | 56 | # two losses 57 | self.label_loss = tf.reduce_mean(losses) + self.cnn.l2_reg_lambda * self.l2_loss 58 | self.graph_loss = self.dw.loss 59 | 60 | def add_performance(self): 61 | # Accuracy 62 | with tf.name_scope("performance"): 63 | self.p1 = tf_precision_at_k(self.scores, self.cnn.input_y_labels, k=1, name='p1') 64 | self.p3 = tf_precision_at_k(self.scores, self.cnn.input_y_labels, k=3, name='p3') 65 | self.p5 = tf_precision_at_k(self.scores, self.cnn.input_y_labels, k=5, name='p5') 66 | -------------------------------------------------------------------------------- /combined_model_experiment.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import pickle as pkl 5 | import numpy as np 6 | import tensorflow as tf 7 | import datetime 8 | import pandas as pd 9 | import itertools 10 | 11 | from sklearn.cross_validation import train_test_split 12 | from tensorflow.contrib import learn 13 | 14 | from kim_cnn import KimCNN 15 | from word2vec import Word2Vec 16 | from combined import Combined 17 | from eval_helpers import label_lists_to_sparse_tuple 18 | from data_helpers import batch_iter, RWBatchGenerator, label_ids_to_binary_matrix, load_pickle 19 | from tf_helpers import get_variable_value_from_checkpoint 20 | 21 | from tensorflow.python import debug as tf_debug 22 | from tf_helpers import save_embedding_for_viz 23 | 24 | 25 | # In[2]: 26 | 27 | 28 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 29 | tf.flags.DEFINE_integer('tag_freq_threshold', 5, 'minimum frequency of a tag') 30 | 31 | tf.flags.DEFINE_float("max_document_length", 2000, "Maximum length of document, exceeding part is truncated") 32 | 33 | # Architecutural parameters for KimCNN 34 | 35 | tf.flags.DEFINE_string("loss_function", 'sigmoid', "loss function: (softmax|sigmoid) (Default: sigmoid)") 36 | 37 | # Model Hyperparameters 38 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 39 | tf.flags.DEFINE_string("filter_sizes", 
"3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 40 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 41 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 42 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 43 | 44 | # Training parameters 45 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 46 | 47 | 48 | tf.flags.DEFINE_integer("dw_batch_size", 128, "Batch Size for deep walk model (default: 128)") 49 | tf.flags.DEFINE_integer("dw_skip_window", 3, "How many words to consider left and right. (default: 3)") 50 | tf.flags.DEFINE_integer("dw_num_skips", 4, "How many times to reuse an input to generate a label. (default: 4)") 51 | tf.flags.DEFINE_integer("dw_embedding_size", 128, "Dimensionality of node embedding. (default: 128)") 52 | tf.flags.DEFINE_integer("dw_num_negative_samples", 64, "Number of negative examples to sample. (default: 64)") 53 | 54 | 55 | # global training parameter 56 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") 57 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 58 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") 59 | tf.flags.DEFINE_integer("num_checkpoints", 1, "Number of checkpoints to store (default: 1)") # disk quota is low 60 | 61 | tf.flags.DEFINE_string("pretrained_embedding_checkpoint_dir", "", 62 | "directory of checkpoint where pretrained embedding lives") 63 | tf.flags.DEFINE_string("pretrained_embedding_name", 64 | "embedding/table", 65 | "variable name of the pretrained emebdding (defualt: embedding/table)") 66 | tf.flags.DEFINE_string("pretrained_nce_W_name", 67 | "nce/Variable", 68 | "variable name of the nce W parameter (defualt: nce/Variable)") 69 | tf.flags.DEFINE_string("pretrained_nce_b_name", 70 | "nce/Variable_1", 71 | "variable name of the nce W parameter (defualt: nce/Variable_1)") 72 | 73 | # Misc Parameters 74 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 75 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 76 | 77 | 78 | # In[5]: 79 | 80 | 81 | FLAGS = tf.flags.FLAGS 82 | FLAGS._parse_flags() 83 | print("\nParameters:") 84 | for attr, value in sorted(FLAGS.__flags.items()): 85 | print("{}={}".format(attr.upper(), value)) 86 | print("") 87 | 88 | data_dir = FLAGS.data_dir 89 | 90 | 91 | # load data 92 | # =============================================== 93 | train_text, dev_text, _ = load_pickle( 94 | os.path.join(data_dir, "text_split.pkl")) 95 | y_id_train, y_id_dev, _ = load_pickle( 96 | os.path.join(data_dir, "labels_id_split.pkl")) 97 | y_binary_train, y_binary_dev, _ = load_pickle( 98 | os.path.join(data_dir, "labels_binary_split.pkl")) 99 | node_ids_train, node_ids_dev, _ = load_pickle( 100 | os.path.join(data_dir, "node_ids_split.pkl")) 101 | 102 | # preprocessing text documents 103 | # =============================================== 104 | vocab_processor = learn.preprocessing.VocabularyProcessor(FLAGS.max_document_length) 105 | x_train = np.array(list(vocab_processor.fit_transform(train_text))) 106 | x_dev = np.array(list(vocab_processor.transform(dev_text))) 107 | 108 | print("Train/Dev split: {:d}/{:d}".format(len(x_train), len(x_dev))) 109 | 110 | num_classes = y_binary_train.shape[1] 111 | print("num of classes: 
{:d}".format(num_classes)) 112 | 113 | 114 | # load node embedding data 115 | walks = RWBatchGenerator.read_walks("{}/random_walks.txt".format(data_dir)) 116 | 117 | vocabulary_size = len(set(itertools.chain(*walks))) 118 | 119 | dw_data_generator = RWBatchGenerator( 120 | walks, FLAGS.dw_batch_size, FLAGS.dw_num_skips, FLAGS.dw_skip_window) 121 | 122 | # Training 123 | # ================================================== 124 | 125 | 126 | with tf.Graph().as_default(): 127 | session_conf = tf.ConfigProto( 128 | allow_soft_placement=FLAGS.allow_soft_placement, 129 | log_device_placement=FLAGS.log_device_placement) 130 | sess = tf.Session(config=session_conf) 131 | 132 | # DEBUG 133 | # sess = tf_debug.LocalCLIDebugWrapperSession(sess) 134 | 135 | with sess.as_default(): 136 | with tf.name_scope('kim_cnn'): 137 | cnn = KimCNN( 138 | sequence_length=x_train.shape[1], 139 | num_classes=num_classes, 140 | vocab_size=len(vocab_processor.vocabulary_), 141 | embedding_size=FLAGS.embedding_dim, 142 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 143 | num_filters=FLAGS.num_filters, 144 | l2_reg_lambda=FLAGS.l2_reg_lambda, 145 | loss_function=FLAGS.loss_function, 146 | redefine_output_layer=True) 147 | 148 | if FLAGS.pretrained_embedding_checkpoint_dir: 149 | print('use pretrained embedding from {}'.format( 150 | FLAGS.pretrained_embedding_checkpoint_dir)) 151 | 152 | embedding_value, nce_W_value, nce_b_value = get_variable_value_from_checkpoint( 153 | FLAGS.pretrained_embedding_checkpoint_dir, 154 | [FLAGS.pretrained_embedding_name, 155 | FLAGS.pretrained_nce_W_name, 156 | FLAGS.pretrained_nce_b_name]) 157 | else: 158 | embedding_value, nce_W_value, nce_b_value = None, None, None 159 | 160 | with tf.name_scope('dw'): 161 | dw = Word2Vec(FLAGS.dw_num_negative_samples, 162 | vocabulary_size, 163 | FLAGS.dw_embedding_size, 164 | embedding_value=embedding_value, 165 | nce_W_value=nce_W_value, 166 | nce_b_value=nce_b_value) 167 | 168 | with tf.name_scope('combined'): 169 | model = Combined(cnn, dw) 170 | 171 | global_step = tf.Variable(0, name="global_step", trainable=False) 172 | 173 | label_train_op = tf.train.AdamOptimizer(1e-3).minimize( 174 | model.label_loss, 175 | global_step=global_step) 176 | graph_train_op = tf.train.GradientDescentOptimizer(1e-2).minimize(model.graph_loss) 177 | 178 | # Output directory for models and summaries 179 | dataset_id = list(filter(None, data_dir.split('/')))[-1] 180 | print('dataset_id:', dataset_id) 181 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", 182 | dataset_id, 'combined')) 183 | print("Writing to {}\n".format(out_dir)) 184 | 185 | if tf.gfile.Exists(out_dir): 186 | print('cleaning ', out_dir) 187 | tf.gfile.DeleteRecursively(out_dir) 188 | tf.gfile.MakeDirs(out_dir) 189 | 190 | # Summaries for loss and precision 191 | label_loss_summary = tf.summary.scalar("label_loss", model.label_loss) 192 | graph_loss_summary = tf.summary.scalar("graph_loss", model.graph_loss) 193 | 194 | p1 = tf.summary.scalar("p1", model.p1) 195 | p3 = tf.summary.scalar("p3", model.p3) 196 | p5 = tf.summary.scalar("p5", model.p5) 197 | 198 | # Train Summaries 199 | train_summary_op = tf.summary.merge([label_loss_summary, graph_loss_summary, 200 | p1, p3, p5]) 201 | 202 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 203 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 204 | 205 | # Dev summaries 206 | dev_summary_op = tf.summary.merge([label_loss_summary, graph_loss_summary, 207 | p1, p3, p5]) 208 | 
dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 209 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 210 | 211 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it 212 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 213 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 214 | if not os.path.exists(checkpoint_dir): 215 | os.makedirs(checkpoint_dir) 216 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 217 | 218 | # Write vocabulary 219 | vocab_processor.save(os.path.join(data_dir, "vocab")) 220 | 221 | sess.run(tf.global_variables_initializer()) 222 | 223 | #### DEBUG 224 | sess.graph.finalize() 225 | 226 | def train_label_step(x_batch, y_batch_binary, y_batch_labels, node_ids, writer): 227 | """ 228 | one training step for the label part 229 | """ 230 | feed_dict = { 231 | model.cnn.input_x: x_batch, 232 | model.cnn.input_y_binary: y_batch_binary, 233 | model.cnn.input_y_labels: label_lists_to_sparse_tuple( 234 | y_batch_labels, num_classes), # needs some conversion 235 | model.node_ids: node_ids, # node ids 236 | model.cnn.dropout_keep_prob: FLAGS.dropout_keep_prob, 237 | 238 | # the following is in vain 239 | # tf requires all placeholder to be provided some value 240 | 241 | model.dw.train_inputs: [0], 242 | model.dw.train_labels: [[0]], 243 | } 244 | _, step, summaries, label_loss, p1, p3, p5 = sess.run( 245 | [label_train_op, global_step, train_summary_op, model.label_loss, 246 | model.p1, model.p3, model.p5], 247 | feed_dict) 248 | time_str = datetime.datetime.now().isoformat() 249 | print("{}: step {}, label loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format( 250 | time_str, step, label_loss, p1, p3, p5)) 251 | train_summary_writer.add_summary(summaries, step) 252 | 253 | def train_graph_step(x_batch, batch_labels, writer): 254 | """ 255 | one training step for the graph part 256 | """ 257 | feed_dict = { 258 | model.dw.train_inputs: x_batch, 259 | model.dw.train_labels: np.expand_dims(np.array(batch_labels), -1), 260 | 261 | # the following is in vain 262 | # tf requires all placeholder to be provided some value 263 | model.cnn.input_x: list(vocab_processor.transform(["asdfkjahdkfhakslfh"])), # non-sense stuff 264 | model.cnn.input_y_binary: [[0] * num_classes], # with no label 265 | model.cnn.input_y_labels: label_lists_to_sparse_tuple( 266 | [[0]], num_classes), # needs some conversion 267 | model.node_ids: [0], # node ids 268 | model.cnn.dropout_keep_prob: FLAGS.dropout_keep_prob, 269 | 270 | } 271 | _, step, summaries, graph_loss = sess.run( 272 | [graph_train_op, global_step, train_summary_op, model.graph_loss], 273 | feed_dict) 274 | time_str = datetime.datetime.now().isoformat() 275 | print("{}: step {}, graph loss {:g}".format( 276 | time_str, step, graph_loss)) 277 | writer.add_summary(summaries, step) 278 | 279 | def dev_step(x_batch, y_batch_binary, y_batch_labels, node_ids, writer): 280 | """ 281 | Evaluates model on a dev set 282 | """ 283 | feed_dict = { 284 | model.cnn.input_x: x_batch, 285 | model.cnn.input_y_binary: y_batch_binary, 286 | model.cnn.input_y_labels: label_lists_to_sparse_tuple( 287 | y_batch_labels, num_classes), # needs some conversion 288 | model.node_ids: node_ids, # node ids 289 | model.cnn.dropout_keep_prob: 1.0, 290 | 291 | # in vain 292 | model.dw.train_inputs: [0], 293 | model.dw.train_labels: [[0]], 294 | } 295 | step, summaries, label_loss, p1, p3, p5 = sess.run( 296 | [global_step, dev_summary_op, 
model.label_loss, model.p1, model.p3, model.p5], 297 | feed_dict) 298 | time_str = datetime.datetime.now().isoformat() 299 | print("{}: step {}, label loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format( 300 | time_str, step, label_loss, p1, p3, p5)) 301 | 302 | writer.add_summary(summaries, step) 303 | 304 | batches = batch_iter( 305 | list(zip(x_train, y_binary_train, y_id_train, node_ids_train)), 306 | FLAGS.batch_size, FLAGS.num_epochs) 307 | 308 | for batch in batches: 309 | # train label part 310 | x_batch, y_batch_binary, y_train_labels, x_node_ids = zip(*batch) 311 | train_label_step(x_batch, y_batch_binary, y_train_labels, x_node_ids, train_summary_writer) 312 | current_step = tf.train.global_step(sess, global_step) # one step for label training 313 | 314 | # train graph part 315 | batch_inputs, batch_labels = dw_data_generator.next_batch() 316 | train_graph_step(batch_inputs, batch_labels, train_summary_writer) 317 | 318 | if current_step % FLAGS.evaluate_every == 0: 319 | print("\nEvaluation:") 320 | dev_step(x_dev, y_binary_dev, y_id_dev, node_ids_dev, dev_summary_writer) 321 | print("") 322 | 323 | if current_step % FLAGS.checkpoint_every == 0: 324 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 325 | print("Saved model checkpoint to {}\n".format(path)) 326 | -------------------------------------------------------------------------------- /data_helpers.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pickle as pkl 3 | import itertools 4 | import collections 5 | import numpy as np 6 | import random 7 | from html.parser import HTMLParser 8 | from scipy.sparse import csr_matrix 9 | 10 | 11 | def clean_str(string): 12 | """ 13 | Tokenization/string cleaning for all datasets except for SST. 14 | Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py 15 | """ 16 | string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) 17 | string = re.sub(r"\'s", " \'s", string) 18 | string = re.sub(r"\'ve", " \'ve", string) 19 | string = re.sub(r"n\'t", " n\'t", string) 20 | string = re.sub(r"\'re", " \'re", string) 21 | string = re.sub(r"\'d", " \'d", string) 22 | string = re.sub(r"\'ll", " \'ll", string) 23 | string = re.sub(r",", " , ", string) 24 | string = re.sub(r"!", " ! ", string) 25 | string = re.sub(r"\(", " \( ", string) 26 | string = re.sub(r"\)", " \) ", string) 27 | string = re.sub(r"\?", " \? ", string) 28 | string = re.sub(r"\s{2,}", " ", string) 29 | return string.strip().lower() 30 | 31 | 32 | class MLStripper(HTMLParser): 33 | def __init__(self): 34 | self.reset() 35 | self.strict = False 36 | self.convert_charrefs = True 37 | self.fed = [] 38 | 39 | def handle_data(self, d): 40 | self.fed.append(d) 41 | 42 | def get_data(self): 43 | return ''.join(self.fed) 44 | 45 | 46 | def strip_tags(html): 47 | s = MLStripper() 48 | s.feed(html) 49 | return s.get_data() 50 | 51 | 52 | def batch_iter(data, batch_size, num_epochs, shuffle=True): 53 | """ 54 | Generates a batch iterator for a dataset. 
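    Shuffles the data once per epoch (when `shuffle=True`) and yields
    consecutive slices of at most `batch_size` elements; the final batch of
    an epoch may be smaller than `batch_size`.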
55 | """ 56 | data = np.array(data) 57 | data_size = len(data) 58 | num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 59 | for epoch in range(num_epochs): 60 | # Shuffle the data at each epoch 61 | if shuffle: 62 | shuffle_indices = np.random.permutation(np.arange(data_size)) 63 | shuffled_data = data[shuffle_indices] 64 | else: 65 | shuffled_data = data 66 | for batch_num in range(num_batches_per_epoch): 67 | start_index = batch_num * batch_size 68 | end_index = min((batch_num + 1) * batch_size, data_size) 69 | yield shuffled_data[start_index: end_index] 70 | 71 | 72 | class MultiLabelIntegerEncoder: 73 | """transform """ 74 | def fit(self, labels): 75 | self.id2label_ = dict(enumerate(set(itertools.chain(*labels)))) 76 | self.label2id_ = dict(zip(self.id2label_.values(), self.id2label_.keys())) 77 | self.UNK = len(self.label2id_) 78 | 79 | def transform(self, labels): 80 | return [[self.label2id_.get(l, self.UNK) for l in ls] # if label not there, it's UNK 81 | for ls in labels] 82 | 83 | def fit_transform(self, labels): 84 | self.fit(labels) 85 | return self.transform(labels) 86 | 87 | 88 | def label_ids_to_binary_matrix(labels_list, shape): 89 | """ 90 | list of label ids to binary indicator matrix 91 | 92 | args: 93 | 94 | list of list of int 95 | 96 | return: 97 | 98 | csr_matrix 99 | """ 100 | size = sum(len(ls) for ls in labels_list) 101 | row_indx = list(itertools.chain( 102 | *[list(itertools.repeat(i, len(ls))) 103 | for i, ls in enumerate(labels_list)])) 104 | col_indx = list(itertools.chain(*labels_list)) 105 | return csr_matrix((np.ones(size), (row_indx, col_indx)), 106 | shape=shape).toarray() 107 | 108 | 109 | class RWBatchGenerator(): 110 | """Random walk batch generator 111 | """ 112 | def __init__(self, walks, batch_size, num_skips, skip_window): 113 | """ 114 | Args: 115 | 116 | walks: list of integer list 117 | batch_size: int 118 | num_skips: int, within each window, number of examples 119 | skip_window: int, sliding window size 120 | """ 121 | self.walks = walks 122 | self.batch_size = batch_size 123 | self.num_skips = num_skips 124 | self.skip_window = skip_window 125 | 126 | self.current_walk = 0 127 | self.data = self.walks[0] 128 | 129 | self.span = 2 * self.skip_window + 1 # [ self.skip_window target self.skip_window ] 130 | self.data_index = 0 131 | 132 | @classmethod 133 | def read_walks(cls, path): 134 | walks = [] 135 | with open(path, 'r') as f: 136 | for l in f: 137 | walks.append(list(map(int, l.strip().split()))) 138 | return walks 139 | 140 | def next_batch(self): 141 | batch = np.ndarray(shape=(self.batch_size), dtype=np.int32) 142 | labels = np.ndarray(shape=(self.batch_size), dtype=np.int32) 143 | buffer = collections.deque(maxlen=self.span) 144 | 145 | if self.data_index + self.span > len(self.data): 146 | self.data_index = 0 147 | 148 | buffer.extend(self.data[self.data_index:self.data_index + self.span]) 149 | self.data_index += self.span 150 | for i in range(self.batch_size // self.num_skips): 151 | target = self.skip_window # target label at the center of the buffer 152 | targets_to_avoid = [self.skip_window] 153 | for j in range(self.num_skips): 154 | # sample unique targets 155 | while target in targets_to_avoid: 156 | target = random.randint(0, self.span-1) 157 | targets_to_avoid.append(target) 158 | batch[i * self.num_skips + j] = buffer[self.skip_window] 159 | labels[i * self.num_skips + j] = buffer[target] 160 | if self.data_index == len(self.data): 161 | self.current_walk += 1 162 | 163 | if self.current_walk == len(self.walks): # 
used all walks 164 | self.current_walk = 0 165 | 166 | self.data = self.walks[self.current_walk] 167 | 168 | # equivalent to: buffer[:] = self.data[:self.span] 169 | buffer.clear() 170 | buffer.extend(self.data[:self.span]) 171 | self.data_index = self.span 172 | else: 173 | buffer.append(self.data[self.data_index]) 174 | self.data_index += 1 175 | # Backtrack a little bit to avoid skipping words in the end of a batch 176 | self.data_index = (self.data_index + len(self.data) - self.span) % len(self.data) 177 | 178 | return (batch, labels) 179 | 180 | 181 | def load_pickle(path, mode='rb'): 182 | return pkl.load(open(path, mode)) 183 | -------------------------------------------------------------------------------- /deepwalk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import tensorflow as tf\n", 14 | "import os\n", 15 | "import time\n", 16 | "import itertools\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "from data_helpers import RWBatchGenerator\n", 20 | "from tf_helpers import save_embedding_for_viz" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": false, 28 | "deletable": true, 29 | "editable": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset')\n", 34 | "tf.flags.DEFINE_integer(\"checkpoint_every\", 5000, \"Save model after this many steps (default: 5000)\")\n", 35 | "tf.flags.DEFINE_integer(\"num_checkpoints\", 5, \"Number of checkpoints to store (default: 5)\")" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "collapsed": false, 43 | "deletable": true, 44 | "editable": true 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "\n", 52 | "Parameters:\n", 53 | "CHECKPOINT_EVERY=5000\n", 54 | "DATA_DIR=data/stackexchange/datascience/\n", 55 | "NUM_CHECKPOINTS=5\n", 56 | "\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "FLAGS = tf.flags.FLAGS\n", 62 | "FLAGS._parse_flags()\n", 63 | "print(\"\\nParameters:\")\n", 64 | "for attr, value in sorted(FLAGS.__flags.items()):\n", 65 | " print(\"{}={}\".format(attr.upper(), value))\n", 66 | "print(\"\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": { 73 | "collapsed": false, 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "data_dir = FLAGS.data_dir\n", 80 | "metadata_path = '/home/cloud-user/code/network_embedding/{}/labels_for_visualization.tsv'.format(data_dir)\n", 81 | "\n", 82 | "batch_size = 128\n", 83 | "embedding_size = 128 # Dimension of the embedding vector.\n", 84 | "skip_window = 3 # How many words to consider left and right.\n", 85 | "num_skips = 4 # How many times to reuse an input to generate a label.\n", 86 | "\n", 87 | "walks = RWBatchGenerator.read_walks(\"{}/random_walks.txt\".format(data_dir))\n", 88 | "\n", 89 | "vocabulary_size = len(set(itertools.chain(*walks)))\n", 90 | "\n", 91 | "generator = RWBatchGenerator(walks, batch_size, num_skips, skip_window)\n", 92 | "\n", 93 | "# We pick a random validation set to sample nearest neighbors. 
Here we limit the\n", 94 | "# validation samples to the words that have a low numeric ID, which by\n", 95 | "# construction are also the most frequent.\n", 96 | "num_sampled = 64 # Number of negative examples to sample.\n", 97 | "\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": { 104 | "collapsed": false, 105 | "deletable": true, 106 | "editable": true 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "from word2vec import Word2Vec\n", 111 | "graph = tf.Graph()\n", 112 | "with graph.as_default():\n", 113 | " model = Word2Vec(num_sampled,\n", 114 | " vocabulary_size=vocabulary_size, embedding_size=embedding_size)\n", 115 | "\n", 116 | " # Construct the SGD optimizer using a learning rate of 1.0.\n", 117 | " optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(model.loss)\n", 118 | "\n", 119 | " # Add variable initializer.\n", 120 | " init = tf.global_variables_initializer()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": { 127 | "collapsed": false, 128 | "deletable": true, 129 | "editable": true 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Writing to /home/cloud-user/code/network_embedding/runs/deepwalk\n", 137 | "\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "timestamp = str(int(time.time()))\n", 143 | "out_dir = os.path.abspath(os.path.join(os.path.curdir, \"runs\", \"deepwalk\"))\n", 144 | "# \"deepwalk-{}\".format(timestamp)))\n", 145 | "if tf.gfile.Exists(out_dir):\n", 146 | " tf.gfile.DeleteRecursively(out_dir)\n", 147 | "tf.gfile.MakeDirs(out_dir)\n", 148 | "\n", 149 | "print(\"Writing to {}\\n\".format(out_dir))\n", 150 | "\n", 151 | "# summary config\n", 152 | "loss_summary = tf.summary.scalar(\"loss\", model.loss)\n", 153 | "train_summary_op = tf.summary.merge([loss_summary])\n", 154 | "train_summary_dir = os.path.join(out_dir, \"summaries\", \"train\")\n", 155 | "\n", 156 | "# checkpoint config\n", 157 | "checkpoint_dir = os.path.abspath(os.path.join(out_dir, \"checkpoints\"))\n", 158 | "checkpoint_prefix = os.path.join(checkpoint_dir, \"model\")\n", 159 | "if not os.path.exists(checkpoint_dir):\n", 160 | " os.makedirs(checkpoint_dir)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": false, 168 | "deletable": true, 169 | "editable": true, 170 | "scrolled": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "Initialized\n", 178 | "Average loss at step 0 : 197.034515381\n", 179 | "Average loss at step 2000 : 33.3749833015\n", 180 | "Average loss at step 4000 : 4.71690068173\n", 181 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-5000\n", 182 | "\n", 183 | "embedding for visualization saved\n", 184 | "Average loss at step 6000 : 4.17503132045\n", 185 | "Average loss at step 8000 : 3.89572834098\n", 186 | "Average loss at step 10000 : 3.62797629082\n", 187 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-10000\n", 188 | "\n", 189 | "embedding for visualization saved\n", 190 | "Average loss at step 12000 : 3.33840755868\n", 191 | "Average loss at step 14000 : 3.07739143467\n", 192 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-15000\n", 193 | "\n", 194 | "embedding for visualization saved\n", 195 | 
"Average loss at step 16000 : 2.81435450733\n", 196 | "Average loss at step 18000 : 2.67681139916\n", 197 | "Average loss at step 20000 : 2.58420013291\n", 198 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-20000\n", 199 | "\n", 200 | "embedding for visualization saved\n", 201 | "Average loss at step 22000 : 2.50351923865\n", 202 | "Average loss at step 24000 : 2.4430124557\n", 203 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-25000\n", 204 | "\n", 205 | "embedding for visualization saved\n", 206 | "Average loss at step 26000 : 2.37914546031\n", 207 | "Average loss at step 28000 : 2.32246963161\n", 208 | "Average loss at step 30000 : 2.29185111243\n", 209 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-30000\n", 210 | "\n", 211 | "embedding for visualization saved\n", 212 | "Average loss at step 32000 : 2.25492233241\n", 213 | "Average loss at step 34000 : 2.23561770236\n", 214 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-35000\n", 215 | "\n", 216 | "embedding for visualization saved\n", 217 | "Average loss at step 36000 : 2.21683451974\n", 218 | "Average loss at step 38000 : 2.20001812011\n", 219 | "Average loss at step 40000 : 2.16008253932\n", 220 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-40000\n", 221 | "\n", 222 | "embedding for visualization saved\n", 223 | "Average loss at step 42000 : 2.14749721628\n", 224 | "Average loss at step 44000 : 2.13005481583\n", 225 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-45000\n", 226 | "\n", 227 | "embedding for visualization saved\n", 228 | "Average loss at step 46000 : 2.11473830765\n", 229 | "Average loss at step 48000 : 2.11871240437\n", 230 | "Average loss at step 50000 : 2.11759582251\n", 231 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-50000\n", 232 | "\n", 233 | "embedding for visualization saved\n", 234 | "Average loss at step 52000 : 2.09510678524\n", 235 | "Average loss at step 54000 : 2.06431380367\n", 236 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-55000\n", 237 | "\n", 238 | "embedding for visualization saved\n", 239 | "Average loss at step 56000 : 2.0584917441\n", 240 | "Average loss at step 58000 : 2.05934475851\n", 241 | "Average loss at step 60000 : 2.05926792186\n", 242 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-60000\n", 243 | "\n", 244 | "embedding for visualization saved\n", 245 | "Average loss at step 62000 : 2.05446812618\n", 246 | "Average loss at step 64000 : 2.03652630895\n", 247 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-65000\n", 248 | "\n", 249 | "embedding for visualization saved\n", 250 | "Average loss at step 66000 : 2.02149266583\n", 251 | "Average loss at step 68000 : 2.01090430015\n", 252 | "Average loss at step 70000 : 2.01638064069\n", 253 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-70000\n", 254 | "\n", 255 | "embedding for visualization saved\n", 256 | "Average loss at step 72000 : 2.00558471739\n", 257 | "Average loss at step 74000 : 2.00571966678\n", 258 | "Saved model checkpoint to 
/home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-75000\n", 259 | "\n", 260 | "embedding for visualization saved\n", 261 | "Average loss at step 76000 : 1.99162644798\n", 262 | "Average loss at step 78000 : 1.98593419254\n", 263 | "Average loss at step 80000 : 1.96601568502\n", 264 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-80000\n", 265 | "\n", 266 | "embedding for visualization saved\n", 267 | "Average loss at step 82000 : 1.97888466513\n", 268 | "Average loss at step 84000 : 1.98404323322\n", 269 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-85000\n", 270 | "\n", 271 | "embedding for visualization saved\n", 272 | "Average loss at step 86000 : 1.97378859723\n", 273 | "Average loss at step 88000 : 1.96952367836\n", 274 | "Average loss at step 90000 : 1.95138473696\n", 275 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-90000\n", 276 | "\n", 277 | "embedding for visualization saved\n", 278 | "Average loss at step 92000 : 1.94730134505\n", 279 | "Average loss at step 94000 : 1.93546281201\n", 280 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-95000\n", 281 | "\n", 282 | "embedding for visualization saved\n", 283 | "Average loss at step 96000 : 1.94397306079\n", 284 | "Average loss at step 98000 : 1.9456551421\n", 285 | "Average loss at step 100000 : 1.9326204499\n", 286 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-100000\n", 287 | "\n", 288 | "embedding for visualization saved\n", 289 | "Average loss at step 102000 : 1.94285756439\n", 290 | "Average loss at step 104000 : 1.92044538388\n", 291 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-105000\n", 292 | "\n", 293 | "embedding for visualization saved\n", 294 | "Average loss at step 106000 : 1.9341014365\n", 295 | "Average loss at step 108000 : 1.9297939941\n", 296 | "Average loss at step 110000 : 1.92686362255\n", 297 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-110000\n", 298 | "\n", 299 | "embedding for visualization saved\n", 300 | "Average loss at step 112000 : 1.90848006827\n", 301 | "Average loss at step 114000 : 1.91830887705\n", 302 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-115000\n", 303 | "\n", 304 | "embedding for visualization saved\n", 305 | "Average loss at step 116000 : 1.91322599018\n", 306 | "Average loss at step 118000 : 1.90238298392\n", 307 | "Average loss at step 120000 : 1.90629216832\n", 308 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-120000\n", 309 | "\n", 310 | "embedding for visualization saved\n", 311 | "Average loss at step 122000 : 1.90681960332\n", 312 | "Average loss at step 124000 : 1.91353502142\n", 313 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-125000\n", 314 | "\n", 315 | "embedding for visualization saved\n", 316 | "Average loss at step 126000 : 1.90787102365\n", 317 | "Average loss at step 128000 : 1.91034146512\n", 318 | "Average loss at step 130000 : 1.88908876747\n", 319 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-130000\n", 320 | "\n", 321 | "embedding for visualization 
saved\n", 322 | "Average loss at step 132000 : 1.88341609645\n", 323 | "Average loss at step 134000 : 1.88996016645\n", 324 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-135000\n", 325 | "\n", 326 | "embedding for visualization saved\n", 327 | "Average loss at step 136000 : 1.89304238236\n", 328 | "Average loss at step 138000 : 1.89212996116\n", 329 | "Average loss at step 140000 : 1.88799269575\n", 330 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-140000\n", 331 | "\n", 332 | "embedding for visualization saved\n", 333 | "Average loss at step 142000 : 1.88314257389\n", 334 | "Average loss at step 144000 : 1.87319713652\n", 335 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-145000\n", 336 | "\n", 337 | "embedding for visualization saved\n", 338 | "Average loss at step 146000 : 1.86209085044\n", 339 | "Average loss at step 148000 : 1.87274713555\n", 340 | "Average loss at step 150000 : 1.86942203593\n", 341 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-150000\n", 342 | "\n", 343 | "embedding for visualization saved\n", 344 | "Average loss at step 152000 : 1.87797070491\n", 345 | "Average loss at step 154000 : 1.86619022033\n", 346 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-155000\n", 347 | "\n", 348 | "embedding for visualization saved\n", 349 | "Average loss at step 156000 : 1.87383587724\n", 350 | "Average loss at step 158000 : 1.85104113367\n", 351 | "Average loss at step 160000 : 1.86882864055\n", 352 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-160000\n", 353 | "\n", 354 | "embedding for visualization saved\n", 355 | "Average loss at step 162000 : 1.86997445494\n", 356 | "Average loss at step 164000 : 1.86812087798\n", 357 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-165000\n", 358 | "\n", 359 | "embedding for visualization saved\n", 360 | "Average loss at step 166000 : 1.86582698256\n", 361 | "Average loss at step 168000 : 1.85421365184\n", 362 | "Average loss at step 170000 : 1.85189017618\n", 363 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-170000\n", 364 | "\n", 365 | "embedding for visualization saved\n", 366 | "Average loss at step 172000 : 1.85086567563\n", 367 | "Average loss at step 174000 : 1.85538542438\n", 368 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-175000\n", 369 | "\n", 370 | "embedding for visualization saved\n", 371 | "Average loss at step 176000 : 1.84999121726\n", 372 | "Average loss at step 178000 : 1.84666664407\n", 373 | "Average loss at step 180000 : 1.84024577102\n", 374 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-180000\n", 375 | "\n", 376 | "embedding for visualization saved\n", 377 | "Average loss at step 182000 : 1.82997762367\n", 378 | "Average loss at step 184000 : 1.83894396123\n", 379 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-185000\n", 380 | "\n", 381 | "embedding for visualization saved\n", 382 | "Average loss at step 186000 : 1.84503561381\n", 383 | "Average loss at step 188000 : 1.85496722293\n", 384 | "Average loss at step 190000 : 
1.83791814354\n", 385 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-190000\n", 386 | "\n", 387 | "embedding for visualization saved\n", 388 | "Average loss at step 192000 : 1.84750790322\n", 389 | "Average loss at step 194000 : 1.83688094378\n", 390 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-195000\n", 391 | "\n", 392 | "embedding for visualization saved\n", 393 | "Average loss at step 196000 : 1.82232741687\n", 394 | "Average loss at step 198000 : 1.83022581661\n", 395 | "Average loss at step 200000 : 1.8315110831\n", 396 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-200000\n", 397 | "\n", 398 | "embedding for visualization saved\n", 399 | "Average loss at step 202000 : 1.84495187533\n", 400 | "Average loss at step 204000 : 1.83805379978\n", 401 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-205000\n", 402 | "\n", 403 | "embedding for visualization saved\n", 404 | "Average loss at step 206000 : 1.82688765806\n", 405 | "Average loss at step 208000 : 1.82386591777\n", 406 | "Average loss at step 210000 : 1.83106274354\n", 407 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-210000\n", 408 | "\n", 409 | "embedding for visualization saved\n", 410 | "Average loss at step 212000 : 1.82801567906\n", 411 | "Average loss at step 214000 : 1.82362141362\n", 412 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-215000\n", 413 | "\n", 414 | "embedding for visualization saved\n", 415 | "Average loss at step 216000 : 1.82936387384\n", 416 | "Average loss at step 218000 : 1.82635319006\n", 417 | "Average loss at step 220000 : 1.82470695007\n", 418 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-220000\n", 419 | "\n", 420 | "embedding for visualization saved\n", 421 | "Average loss at step 222000 : 1.81072063574\n", 422 | "Average loss at step 224000 : 1.82379893202\n", 423 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-225000\n", 424 | "\n", 425 | "embedding for visualization saved\n", 426 | "Average loss at step 226000 : 1.81551453474\n", 427 | "Average loss at step 228000 : 1.82078930718\n", 428 | "Average loss at step 230000 : 1.82155351609\n", 429 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-230000\n", 430 | "\n", 431 | "embedding for visualization saved\n", 432 | "Average loss at step 232000 : 1.80945664012\n", 433 | "Average loss at step 234000 : 1.81383246195\n", 434 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-235000\n", 435 | "\n", 436 | "embedding for visualization saved\n", 437 | "Average loss at step 236000 : 1.80408573169\n", 438 | "Average loss at step 238000 : 1.82634266451\n", 439 | "Average loss at step 240000 : 1.81491587222\n", 440 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-240000\n", 441 | "\n", 442 | "embedding for visualization saved\n", 443 | "Average loss at step 242000 : 1.82181355149\n", 444 | "Average loss at step 244000 : 1.80325176316\n", 445 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-245000\n", 446 | "\n", 447 | "embedding for 
visualization saved\n", 448 | "Average loss at step 246000 : 1.80015126523\n", 449 | "Average loss at step 248000 : 1.80795535001\n", 450 | "Average loss at step 250000 : 1.80928635907\n", 451 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-250000\n", 452 | "\n", 453 | "embedding for visualization saved\n", 454 | "Average loss at step 252000 : 1.80418120205\n", 455 | "Average loss at step 254000 : 1.80356847513\n", 456 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-255000\n", 457 | "\n", 458 | "embedding for visualization saved\n", 459 | "Average loss at step 256000 : 1.79944626078\n", 460 | "Average loss at step 258000 : 1.80082440692\n", 461 | "Average loss at step 260000 : 1.79525485715\n", 462 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-260000\n", 463 | "\n", 464 | "embedding for visualization saved\n", 465 | "Average loss at step 262000 : 1.8031452558\n", 466 | "Average loss at step 264000 : 1.7941118238\n", 467 | "Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/deepwalk/checkpoints/model-265000\n", 468 | "\n", 469 | "embedding for visualization saved\n", 470 | "Average loss at step 266000 : 1.80211679664\n", 471 | "Average loss at step 268000 : 1.80098356003\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "# Step 5: Begin training.\n", 477 | "num_steps = 9999999\n", 478 | "\n", 479 | "with tf.Session(graph=graph) as session:\n", 480 | " train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)\n", 481 | " saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) \n", 482 | " \n", 483 | " \n", 484 | " # We must initialize all variables before we use them.\n", 485 | " init.run()\n", 486 | " print('Initialized')\n", 487 | "\n", 488 | " average_loss = 0\n", 489 | " for step in range(num_steps):\n", 490 | " batch_inputs, batch_labels = generator.next_batch()\n", 491 | " \n", 492 | " feed_dict = {model.train_inputs: batch_inputs,\n", 493 | " model.train_labels: np.expand_dims(np.array(batch_labels), -1)}\n", 494 | "\n", 495 | " # We perform one update step by evaluating the optimizer op (including it\n", 496 | " # in the list of returned values for session.run()\n", 497 | " _, loss_val, summaries = session.run([optimizer, model.loss, train_summary_op], feed_dict=feed_dict)\n", 498 | " average_loss += loss_val\n", 499 | "\n", 500 | " if step % 2000 == 0:\n", 501 | " if step > 0:\n", 502 | " average_loss /= 2000\n", 503 | " # The average loss is an estimate of the loss over the last 2000 batches.\n", 504 | " print('Average loss at step ', step, ': ', average_loss)\n", 505 | " average_loss = 0\n", 506 | "\n", 507 | " if step % FLAGS.checkpoint_every == 0 and step > 0:\n", 508 | " path = saver.save(session, checkpoint_prefix, global_step=step)\n", 509 | " \n", 510 | " print(\"Saved model checkpoint to {}\\n\".format(path)) \n", 511 | " \n", 512 | " save_embedding_for_viz(model.embeddings, session, metadata_path, checkpoint_dir)\n", 513 | " \n", 514 | " train_summary_writer.add_summary(summaries, step)\n", 515 | " \n" 516 | ] 517 | } 518 | ], 519 | "metadata": { 520 | "kernelspec": { 521 | "display_name": "Python 3", 522 | "language": "python", 523 | "name": "python3" 524 | }, 525 | "language_info": { 526 | "codemirror_mode": { 527 | "name": "ipython", 528 | "version": 3 529 | }, 530 | "file_extension": ".py", 531 | "mimetype": "text/x-python", 532 | "name": 
"python", 533 | "nbconvert_exporter": "python", 534 | "pygments_lexer": "ipython3", 535 | "version": "3.5.2" 536 | } 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 2 540 | } 541 | -------------------------------------------------------------------------------- /deepwalk.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import tensorflow as tf 4 | import os 5 | import itertools 6 | import numpy as np 7 | 8 | from word2vec import Word2Vec 9 | from data_helpers import RWBatchGenerator 10 | from tf_helpers import save_embedding_for_viz 11 | 12 | 13 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 14 | tf.flags.DEFINE_integer("checkpoint_every", 5000, "Save model after this many steps (default: 5000)") 15 | tf.flags.DEFINE_integer("num_checkpoints", 1, "Number of checkpoints to store (default: 1)") 16 | tf.flags.DEFINE_boolean("save_viz_embedding", False, 17 | "save embeddding for visualization or not (default: False)") 18 | 19 | tf.flags.DEFINE_integer("num_steps", 100000, 20 | "number of steps") 21 | 22 | FLAGS = tf.flags.FLAGS 23 | FLAGS._parse_flags() 24 | print("\nParameters:") 25 | for attr, value in sorted(FLAGS.__flags.items()): 26 | print("{}={}".format(attr.upper(), value)) 27 | print("") 28 | 29 | 30 | data_dir = FLAGS.data_dir 31 | metadata_path = '/home/cloud-user/code/network_embedding/{}/labels_for_visualization.tsv'.format(data_dir) 32 | 33 | batch_size = 128 34 | embedding_size = 128 # Dimension of the embedding vector. 35 | skip_window = 3 # How many words to consider left and right. 36 | num_skips = 4 # How many times to reuse an input to generate a label. 37 | 38 | walks = RWBatchGenerator.read_walks("{}/random_walks.txt".format(data_dir)) 39 | 40 | vocabulary_size = len(set(itertools.chain(*walks))) 41 | 42 | generator = RWBatchGenerator(walks, batch_size, num_skips, skip_window) 43 | 44 | # We pick a random validation set to sample nearest neighbors. Here we limit the 45 | # validation samples to the words that have a low numeric ID, which by 46 | # construction are also the most frequent. 47 | num_sampled = 64 # Number of negative examples to sample. 48 | 49 | 50 | graph = tf.Graph() 51 | with graph.as_default(): 52 | model = Word2Vec(num_sampled, 53 | vocabulary_size=vocabulary_size, embedding_size=embedding_size) 54 | 55 | # Construct the SGD optimizer using a learning rate of 1.0. 56 | optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(model.loss) 57 | 58 | # Add variable initializer. 59 | init = tf.global_variables_initializer() 60 | 61 | 62 | dataset_id = list(filter(None, data_dir.split('/')))[-1] 63 | print('dataset_id:', dataset_id) 64 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", dataset_id, "deepwalk")) 65 | 66 | if tf.gfile.Exists(out_dir): 67 | tf.gfile.DeleteRecursively(out_dir) 68 | tf.gfile.MakeDirs(out_dir) 69 | 70 | print("Writing to {}\n".format(out_dir)) 71 | 72 | # summary config 73 | loss_summary = tf.summary.scalar("loss", model.loss) 74 | train_summary_op = tf.summary.merge([loss_summary]) 75 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 76 | 77 | # checkpoint config 78 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 79 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 80 | if not os.path.exists(checkpoint_dir): 81 | os.makedirs(checkpoint_dir) 82 | 83 | 84 | # In[ ]: 85 | 86 | # Step 5: Begin training. 
87 | num_steps = FLAGS.num_steps 88 | 89 | with tf.Session(graph=graph) as session: 90 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph) 91 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 92 | 93 | # We must initialize all variables before we use them. 94 | init.run() 95 | print('Initialized') 96 | 97 | average_loss = 0 98 | for step in range(num_steps): 99 | batch_inputs, batch_labels = generator.next_batch() 100 | 101 | feed_dict = {model.train_inputs: batch_inputs, 102 | model.train_labels: np.expand_dims(np.array(batch_labels), -1)} 103 | 104 | # We perform one update step by evaluating the optimizer op (including it 105 | # in the list of returned values for session.run() 106 | _, loss_val, summaries = session.run([optimizer, model.loss, train_summary_op], feed_dict=feed_dict) 107 | average_loss += loss_val 108 | 109 | if step % 2000 == 0: 110 | if step > 0: 111 | average_loss /= 2000 112 | # The average loss is an estimate of the loss over the last 2000 batches. 113 | print('Average loss at step ', step, ': ', average_loss) 114 | average_loss = 0 115 | 116 | if step % FLAGS.checkpoint_every == 0 and step > 0: 117 | path = saver.save(session, checkpoint_prefix, global_step=step) 118 | 119 | print("Saved model checkpoint to {}\n".format(path)) 120 | 121 | if FLAGS.save_viz_embedding: 122 | 123 | print('save embedding for viz') 124 | save_embedding_for_viz(model.embeddings, session, metadata_path, checkpoint_dir) 125 | 126 | train_summary_writer.add_summary(summaries, step) 127 | -------------------------------------------------------------------------------- /embedding_labels.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 50, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pickle as pkl\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "from collections import Counter\n", 15 | "from itertools import chain" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 51, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "data_dir='data/stackexchange/datascience/'" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 52, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "labels = pd.read_csv('{}/labels.csv'.format(data_dir), header=None, index_col=0)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 53, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "qids = pkl.load(open('{}/connected_question_ids.pkl'.format(data_dir), 'rb'))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 54, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "all_occurence = chain(*map(lambda s: s.split(','), labels[1].tolist()))\n", 60 | "label_freq = Counter(all_occurence)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 58, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "labels_to_show = []\n", 72 | "for i, r in labels.loc[qids].iterrows():\n", 73 | " best_label = max(r[1].split(','), key=label_freq.__getitem__)\n", 74 | " labels_to_show.append(best_label)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 61, 80 | "metadata": { 81 | 
"collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "with open('{}/labels_for_visualization.tsv'.format(data_dir), 'w') as f: \n", 86 | " for l in labels_to_show:\n", 87 | " f.write(l + '\\n')" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.5.2" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 2 112 | } 113 | -------------------------------------------------------------------------------- /eval_helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import itertools 4 | 5 | from scipy.sparse import issparse 6 | 7 | 8 | def precision(p, t): 9 | """ 10 | p, t: two sets of labels/integers 11 | >>> precision({1, 2, 3, 4}, {1}) 12 | 0.25 13 | """ 14 | return len(t.intersection(p)) / len(p) 15 | 16 | 17 | def precision_at_ks(Y_pred_scores, Y_test, ks=[1, 3, 5, 10]): 18 | """ 19 | Y_pred_scores: nd.array of dtype float, entry ij is the score of label j for instance i 20 | Y_test: list of label ids 21 | """ 22 | result = [] 23 | for k in [1, 3, 5, 10]: 24 | Y_pred = [] 25 | for i in np.arange(Y_pred_scores.shape[0]): 26 | if issparse(Y_pred_scores): 27 | idx = np.argsort(Y_pred_scores[i].data)[::-1] 28 | Y_pred.append(set(Y_pred_scores[i].indices[idx[:k]])) 29 | else: # is ndarray 30 | idx = np.argsort(Y_pred_scores[i, :])[::-1] 31 | Y_pred.append(set(idx[:k])) 32 | 33 | result.append(np.mean([precision(yp, set(yt)) for yt, yp in zip(Y_test, Y_pred)])) 34 | return result 35 | 36 | 37 | def tf_precision_at_k(pred_values, correct_labels, k, name=None): 38 | """ 39 | pred_values: Tensor of label scores 40 | correct_labels: SparseTensor, label list (not label indicator matrix) 41 | """ 42 | _, pred_labels = tf.nn.top_k(pred_values, k=k, sorted=True) 43 | 44 | num_intersections = tf.sets.set_size(tf.sets.set_intersection(pred_labels, correct_labels)) 45 | 46 | return tf.reduce_mean(tf.divide(num_intersections, k), name=name) 47 | 48 | 49 | def label_lists_to_sparse_tuple(label_lists, n_classes): 50 | """given label lists and number of a 51 | return the sparse representation (indices, values, shape) 52 | 53 | example: 54 | 55 | >> label_lists = [[0, 1, 2], [1, 2], [0, 2]] 56 | >> sparse_tensor_tuple = label_lists_to_sparse_tuple(label_lists, 3) 57 | >>> print(tf.sparse_to_dense(sparse_tensor_tuple[0], 58 | sparse_tensor_tuple[2], 59 | sparse_tensor_tuple[1]).eval()) 60 | [[0 1 2] 61 | [1 2 0] 62 | [0 2 0]] 63 | """ 64 | indices = [[i, j] 65 | for i, row in enumerate(label_lists) 66 | for j in range(len(row))] 67 | values = list(itertools.chain(*label_lists)) 68 | shape = (len(label_lists), n_classes) 69 | return (indices, values, shape) 70 | -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from tensorflow.contrib import learn 8 | from tqdm import tqdm 9 | 10 | from data_helpers import load_pickle, batch_iter 11 | from eval_helpers import precision_at_ks 12 | 13 | 14 | 
tf.flags.DEFINE_string('data_dir', '', 'directory of dataset') 15 | tf.flags.DEFINE_boolean('use_node_embedding', False, 'use node embedding or not') 16 | 17 | tf.flags.DEFINE_string("checkpoint_dir", "", 18 | "Checkpoint directory from training run") 19 | tf.flags.DEFINE_integer("batch_size", 64, "") 20 | 21 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 22 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 23 | 24 | FLAGS = tf.flags.FLAGS 25 | FLAGS._parse_flags() 26 | print("\nParameters:") 27 | for attr, value in sorted(FLAGS.__flags.items()): 28 | print("{}={}".format(attr.upper(), value)) 29 | print("") 30 | 31 | data_dir = FLAGS.data_dir 32 | 33 | _, _, test_text = load_pickle( 34 | os.path.join(data_dir, "text_split.pkl")) 35 | _, _, y_id_test = load_pickle( 36 | os.path.join(data_dir, "labels_id_split.pkl")) 37 | 38 | _, _, node_ids_test = load_pickle( 39 | os.path.join(data_dir, "node_ids_split.pkl")) 40 | 41 | 42 | vocab_path = os.path.join(data_dir, "vocab") 43 | vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path) 44 | 45 | X = vocab_processor.transform(test_text) 46 | 47 | checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 48 | 49 | graph = tf.Graph() 50 | with graph.as_default(): 51 | session_conf = tf.ConfigProto( 52 | allow_soft_placement=FLAGS.allow_soft_placement, 53 | log_device_placement=FLAGS.log_device_placement 54 | ) 55 | sess = tf.Session(config=session_conf) 56 | with sess.as_default(): 57 | # Load the saved meta graph and restore variables 58 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 59 | saver.restore(sess, checkpoint_file) 60 | 61 | if FLAGS.use_node_embedding: 62 | input_x = graph.get_operation_by_name("kim_cnn/input_x").outputs[0] 63 | dropout_keep_prob = graph.get_operation_by_name("kim_cnn/dropout_keep_prob").outputs[0] 64 | input_node_ids = graph.get_operation_by_name("combined/input_node_ids").outputs[0] 65 | 66 | label_scores = graph.get_operation_by_name("combined/output/scores").outputs[0] 67 | else: 68 | input_x = graph.get_operation_by_name("input_x").outputs[0] 69 | dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 70 | 71 | label_scores = graph.get_operation_by_name("output/scores").outputs[0] 72 | 73 | # Generate batches for one epoch 74 | if FLAGS.use_node_embedding: 75 | input_data = list(zip(list(X), list(node_ids_test))) 76 | else: 77 | input_data = list(X) 78 | batches = batch_iter(input_data, FLAGS.batch_size, 1, shuffle=False) 79 | 80 | # Collect the predictions here 81 | all_label_scores = None 82 | for x_test_batch in tqdm(batches): 83 | if FLAGS.use_node_embedding: 84 | text_batch, node_ids_batch = zip(*x_test_batch) 85 | label_score_values = sess.run( 86 | label_scores, 87 | {input_x: text_batch, input_node_ids: node_ids_batch, dropout_keep_prob: 1.0}) 88 | else: 89 | label_score_values = sess.run( 90 | label_scores, {input_x: x_test_batch, dropout_keep_prob: 1.0}) 91 | 92 | if all_label_scores is not None: 93 | all_label_scores = np.concatenate([all_label_scores, label_score_values]) 94 | else: 95 | all_label_scores = label_score_values 96 | 97 | precisions = precision_at_ks(all_label_scores, y_id_test, ks=[1, 3, 5]) 98 | 99 | for k, p in zip([1, 3, 5], precisions): 100 | print('p@{}: {:.5f}'.format(k, p)) 101 | -------------------------------------------------------------------------------- /extract_embedding_labels.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pickle as pkl 4 | import pandas as pd 5 | import tensorflow as tf 6 | 7 | from collections import Counter 8 | from itertools import chain 9 | 10 | 11 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 12 | 13 | FLAGS = tf.flags.FLAGS 14 | FLAGS._parse_flags() 15 | print("\nParameters:") 16 | for attr, value in sorted(FLAGS.__flags.items()): 17 | print("{}={}".format(attr.upper(), value)) 18 | print("") 19 | 20 | data_dir = FLAGS.data_dir 21 | 22 | labels = pd.read_csv('{}/labels.csv'.format(data_dir), header=None, index_col=0) 23 | 24 | 25 | qids = pkl.load(open('{}/connected_question_ids.pkl'.format(data_dir), 'rb')) 26 | 27 | 28 | all_occurence = chain(*map(lambda s: s.split(','), labels[1].tolist())) 29 | label_freq = Counter(all_occurence) 30 | 31 | 32 | labels_to_show = [] 33 | for i, r in labels.loc[qids].iterrows(): 34 | best_label = max(r[1].split(','), key=label_freq.__getitem__) 35 | labels_to_show.append(best_label) 36 | 37 | output_path = '{}/labels_for_visualization.tsv'.format(data_dir) 38 | 39 | print('save to {}'.format(output_path)) 40 | 41 | label2color = {} 42 | with open(output_path, 'w') as f: 43 | f.write("name\tlabel\tcolor\n") 44 | for qid, l in zip(qids, labels_to_show): 45 | if l not in label2color: 46 | label2color[l] = len(label2color) 47 | 48 | color = label2color[l] 49 | f.write("{}\t{}\t{}\n".format(qid, l, color)) 50 | -------------------------------------------------------------------------------- /fastxml_experiment.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # supprese warning 4 | import pickle as pkl 5 | import numpy as np 6 | import tensorflow as tf 7 | import os 8 | import warnings 9 | 10 | 11 | def warn(*args, **kwargs): 12 | pass 13 | 14 | warnings.warn = warn 15 | 16 | from fastxml import Trainer, Inferencer 17 | 18 | from eval_helpers import precision_at_ks 19 | 20 | 21 | tf.flags.DEFINE_string('data_dir', 'data/datascience/', 'directory of dataset') 22 | tf.flags.DEFINE_integer('n_trees', 32, 'number of forests') 23 | tf.flags.DEFINE_boolean('eval', False, "whether evaluate on test or not") 24 | 25 | FLAGS = tf.flags.FLAGS 26 | FLAGS._parse_flags() 27 | 28 | data_dir = FLAGS.data_dir 29 | 30 | # load train/test data 31 | x_train, x_dev, x_test = pkl.load(open(os.path.join(data_dir, "tfidf_split.pkl"), 'rb')) 32 | y_train, y_dev, y_test = pkl.load(open(os.path.join(data_dir, "labels_id_split.pkl"), 'rb')) 33 | 34 | # convert dtype to be compatible with fastxml 35 | x_train.data = np.asarray(x_train.data, dtype=np.float32) 36 | x_dev.data = np.asarray(x_dev.data, dtype=np.float32) 37 | x_test.data = np.asarray(x_test.data, dtype=np.float32) 38 | 39 | # fastxml 40 | model_path = os.path.join(data_dir, 'fastxml.model') 41 | 42 | if not FLAGS.eval: 43 | print("training...") 44 | trainer = Trainer(n_trees=FLAGS.n_trees, n_jobs=-1) 45 | trainer.fit(list(x_train), y_train) 46 | trainer.save(model_path) 47 | 48 | clf = Inferencer(model_path) 49 | 50 | 51 | ks = [1, 3, 5] 52 | print("fastxml:") 53 | 54 | if not FLAGS.eval: 55 | print('validating...') 56 | pred = clf.predict(x_dev) 57 | precs = precision_at_ks(pred, y_dev, ks=ks) 58 | else: 59 | print('testing...') 60 | pred = clf.predict(x_test) 61 | precs = precision_at_ks(pred, y_test, ks=ks) 62 | 63 | print("{} result".format("Test" if FLAGS.eval else "Dev")) 64 | for p, k in 
zip(precs, ks): 65 | print("p@{}: {:.2f}".format(k, p)) 66 | -------------------------------------------------------------------------------- /get_tensor_from_checkpoint.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | import tensorflow as tf 5 | 6 | 7 | checkpoint_file = 'runs/deepwalk/checkpoints/model-25000' 8 | 9 | sess = tf.InteractiveSession() 10 | 11 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 12 | saver.restore(sess, checkpoint_file) 13 | 14 | embedding_table = sess.graph.get_operation_by_name('embedding/table') 15 | 16 | val = embedding_table.outputs[0].eval() 17 | 18 | print(val) 19 | 20 | -------------------------------------------------------------------------------- /k_max_pooling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 80, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "import numpy as np\n", 13 | "from tf_helpers import flatten" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 81, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "def mimic_filter_maps(x):\n", 25 | " \"\"\"\n", 26 | " expand dimension to mimic batches of filter maps of shape (batch, height, width, channel)\n", 27 | " in this case, width=1, channel=1 \n", 28 | " \"\"\"\n", 29 | " expanded = np.expand_dims(x, -1)\n", 30 | " expanded = np.expand_dims(expanded, -1)\n", 31 | " return expanded\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 82, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "# input to the algorithm\n", 43 | "k = 2\n", 44 | "x = mimic_filter_maps(np.array([[1, 2, 3, 4], [1, 2, 0, 0], [0, 3, 1, 2]]))\n", 45 | "y = mimic_filter_maps(np.array([[3, 4], [1, 2], [3, 2]]))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 86, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "(3, 4, 1, 1)" 57 | ] 58 | }, 59 | "execution_count": 86, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "x.shape" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 83, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "sess = tf.InteractiveSession()\n", 77 | "tf_x = tf.constant(x)\n", 78 | "tf_y = tf.constant(y)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 84, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "top_k_values\n", 91 | " [[3 4]\n", 92 | " [1 2]\n", 93 | " [2 3]]\n", 94 | "top_k_indices\n", 95 | " [[2 3]\n", 96 | " [0 1]\n", 97 | " [3 1]]\n", 98 | "sorting_indices\n", 99 | " [[0 1]\n", 100 | " [0 1]\n", 101 | " [1 0]]\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "tf_x_squeezed = tf.squeeze(tf_x)\n", 107 | "top_k_values, top_k_indices = tf.nn.top_k(tf_x_squeezed, k=2, sorted=False)\n", 108 | "sorting_indices = tf.reverse(tf.nn.top_k(top_k_indices, k=top_k_indices.shape[1], sorted=True).indices, axis=[-1])\n", 109 | "print('top_k_values\\n', top_k_values.eval())\n", 110 | "print('top_k_indices\\n', top_k_indices.eval())\n", 111 | "print('sorting_indices\\n', sorting_indices.eval())\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": 
"code", 116 | "execution_count": 85, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "[[3 4]\n", 124 | " [1 2]\n", 125 | " [3 2]]\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "flattened_indices = flatten(sorting_indices)\n", 131 | "offset_short = tf.range(0, flattened_indices.shape[0], k)\n", 132 | "offset_col = tf.expand_dims(offset_short, -1) # to column\n", 133 | "offset_flattened = tf.tile(offset_col, [1, k]) # repeat on each row\n", 134 | "offset = flatten(offset_flattened) # flatten it\n", 135 | "offset.eval()\n", 136 | "indices = offset + flattened_indices\n", 137 | "\n", 138 | "flattened_values = flatten(top_k_values)\n", 139 | "reindexed_values_flattened = tf.gather(flattened_values, indices)\n", 140 | "reindexed_values = tf.reshape(reindexed_values_flattened, [-1, k])\n", 141 | "print(reindexed_values.eval())" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": true 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "tf.reshape(sorting_indices" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 12, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "ename": "NameError", 162 | "evalue": "name 'top_k_values_t' is not defined", 163 | "output_type": "error", 164 | "traceback": [ 165 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 166 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 167 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtop_k_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtop_k_indices\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msess\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtop_k_values_t\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtop_k_indices_t\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtop_k_values\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtop_k_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 168 | "\u001b[0;31mNameError\u001b[0m: name 'top_k_values_t' is not defined" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "top_k_values, top_k_indices = sess.run([top_k_values, top_k_indices])\n", 174 | "print(top_k_values)\n", 175 | "print(top_k_indices)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 13, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "tf.gather?" 
187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 20, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "array([[[2, 3],\n", 198 | " [1, 2]],\n", 199 | "\n", 200 | " [[3, 4],\n", 201 | " [2, 3]],\n", 202 | "\n", 203 | " [[5, 6],\n", 204 | " [4, 5]]], dtype=int32)" 205 | ] 206 | }, 207 | "execution_count": 20, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "tf.gather([[1,2,3], [2,3,4], [4,5,6]], [[1, 2], [0, 1]], axis=-1).eval()" 214 | ] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Python 3", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.5.2" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /kim_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | adapted from: 3 | 4 | - https://raw.githubusercontent.com/xiaohan2012/cnn-text-classification-tf/master/text_cnn.py 5 | - Jingzhou, Liu, etc, Deep Learning for Extreme Multi-label Text Classification, SIGIR 2017 6 | 7 | """ 8 | import tensorflow as tf 9 | from eval_helpers import tf_precision_at_k 10 | 11 | 12 | class KimCNN(): 13 | """ 14 | A CNN for text classification. 15 | Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. 16 | """ 17 | def __init__( 18 | self, sequence_length, num_classes, vocab_size, 19 | embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0, 20 | loss_function='softmax', 21 | redefine_output_layer=False): 22 | 23 | self.sequence_length = sequence_length 24 | self.num_classes = num_classes 25 | self.vocab_size = vocab_size 26 | self.embedding_size = embedding_size 27 | self.filter_sizes = filter_sizes 28 | self.num_filters = num_filters 29 | self.l2_reg_lambda = l2_reg_lambda 30 | self.loss_function = loss_function 31 | 32 | # Placeholders for input, output and dropout 33 | self.input_x = tf.placeholder( 34 | tf.int32, [None, self.sequence_length], name="input_x") 35 | 36 | # label indicator matrix 37 | self.input_y_binary = tf.placeholder( 38 | tf.float32, [None, self.num_classes], name="input_y_binary") 39 | 40 | # label list, a SparseTensor because label list length varies 41 | self.input_y_labels = tf.sparse_placeholder( 42 | tf.int32, shape=[None, self.num_classes], 43 | name='input_y_labels') 44 | 45 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") 46 | 47 | # Keeping track of l2 regularization loss (optional) 48 | self.l2_loss = tf.constant(0.0) 49 | 50 | # adding the layers 51 | self.add_embedding_layer() 52 | self.add_convolution_layer() 53 | self.add_drop_out() 54 | 55 | if not redefine_output_layer: 56 | self.add_output() 57 | self.add_loss() 58 | self.add_performance() 59 | 60 | def add_embedding_layer(self): 61 | # Embedding layer 62 | with tf.device('/cpu:0'), tf.name_scope("embedding"): 63 | self.W = tf.Variable( 64 | tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0), 65 | name="W") 66 | self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x) 67 | self.embedded_chars_expanded = 
tf.expand_dims(self.embedded_chars, -1) 68 | 69 | def add_convolution_layer(self): 70 | # Create a convolution + maxpool layer for each filter size 71 | pooled_outputs = [] 72 | for i, filter_size in enumerate(self.filter_sizes): 73 | with tf.name_scope("conv-maxpool-%s" % filter_size): 74 | # Convolution Layer 75 | filter_shape = [filter_size, self.embedding_size, 1, self.num_filters] 76 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") 77 | b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name="b") 78 | conv = tf.nn.conv2d( 79 | self.embedded_chars_expanded, 80 | W, 81 | strides=[1, 1, 1, 1], 82 | padding="VALID", 83 | name="conv") 84 | # Apply nonlinearity 85 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu") 86 | # Maxpooling over the outputs 87 | # dim still 4d 88 | pooled = tf.nn.max_pool( 89 | h, 90 | ksize=[1, self.sequence_length - filter_size + 1, 1, 1], 91 | strides=[1, 1, 1, 1], 92 | padding='VALID', 93 | name="pool") 94 | pooled_outputs.append(pooled) 95 | 96 | # Combine all the pooled features 97 | self.num_filters_total = self.num_filters * len(self.filter_sizes) 98 | self.h_pool = tf.concat(pooled_outputs, 3) # concat by last dim 99 | self.h_pool_flat = tf.reshape(self.h_pool, [-1, self.num_filters_total]) 100 | 101 | def add_drop_out(self): 102 | # Add dropout 103 | with tf.name_scope("dropout"): 104 | self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob) 105 | 106 | def add_output(self): 107 | # Final (unnormalized) scores and predictions 108 | with tf.name_scope("output"): 109 | W = tf.get_variable( 110 | "W", 111 | shape=[self.num_filters_total, self.num_classes], 112 | initializer=tf.contrib.layers.xavier_initializer()) 113 | b = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name="b") 114 | self.l2_loss += tf.nn.l2_loss(W) 115 | self.l2_loss += tf.nn.l2_loss(b) 116 | self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores") 117 | self.predictions = tf.argmax(self.scores, 1, name="predictions") 118 | 119 | def add_loss(self): 120 | # CalculateMean cross-entropy loss 121 | with tf.name_scope("loss"): 122 | if self.loss_function == 'sigmoid': 123 | print('use sigmoid xentropy') 124 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.scores, 125 | labels=self.input_y_binary) 126 | elif self.loss_function == 'softmax': 127 | print('use softmax xentropy') 128 | losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, 129 | labels=self.input_y_binary) 130 | else: 131 | raise ValueError('invalid loss function') 132 | self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss 133 | 134 | def add_performance(self): 135 | # Accuracy 136 | with tf.name_scope("performance"): 137 | self.p1 = tf_precision_at_k(self.scores, self.input_y_labels, k=1, name='p1') 138 | self.p3 = tf_precision_at_k(self.scores, self.input_y_labels, k=3, name='p3') 139 | self.p5 = tf_precision_at_k(self.scores, self.input_y_labels, k=5, name='p5') 140 | -------------------------------------------------------------------------------- /kim_cnn_experiment.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import numpy as np 5 | import tensorflow as tf 6 | import datetime 7 | 8 | from tensorflow.contrib import learn 9 | 10 | from kim_cnn import KimCNN 11 | from eval_helpers import label_lists_to_sparse_tuple 12 | from data_helpers import batch_iter, load_pickle 13 | 14 | 15 | tf.flags.DEFINE_string('data_dir', '', 'directory of 
dataset') 16 | tf.flags.DEFINE_integer('tag_freq_threshold', 5, 'minimum frequency of a tag') 17 | 18 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 19 | tf.flags.DEFINE_float("max_document_length", 2000, 20 | "Maximum length of document, exceeding part is truncated") 21 | 22 | # Architecutural parameters 23 | 24 | tf.flags.DEFINE_string("loss_function", 'sigmoid', "loss function: (softmax|sigmoid) (Default: sigmoid)") 25 | 26 | # Model Hyperparameters 27 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 28 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 29 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 30 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 31 | tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)") 32 | 33 | # Training parameters 34 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 35 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") 36 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 37 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") 38 | tf.flags.DEFINE_integer("num_checkpoints", 1, "Number of checkpoints to store (default: 1)") # our storage quota is low 39 | 40 | # Misc Parameters 41 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 42 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 43 | 44 | 45 | FLAGS = tf.flags.FLAGS 46 | FLAGS._parse_flags() 47 | print("\nParameters:") 48 | for attr, value in sorted(FLAGS.__flags.items()): 49 | print("{}={}".format(attr.upper(), value)) 50 | print("") 51 | 52 | data_dir = FLAGS.data_dir 53 | dataset_id = list(filter(None, data_dir.split('/')))[-1] 54 | print('dataset_id:', dataset_id) 55 | 56 | # load data 57 | # =============================================== 58 | train_text, dev_text, _ = load_pickle( 59 | os.path.join(data_dir, "text_split.pkl")) 60 | y_id_train, y_id_dev, _ = load_pickle( 61 | os.path.join(data_dir, "labels_id_split.pkl")) 62 | y_binary_train, y_binary_dev, _ = load_pickle( 63 | os.path.join(data_dir, "labels_binary_split.pkl")) 64 | 65 | # preprocessing text documents 66 | # =============================================== 67 | vocab_processor = learn.preprocessing.VocabularyProcessor(FLAGS.max_document_length) 68 | x_train = np.array(list(vocab_processor.fit_transform(train_text))) 69 | x_dev = np.array(list(vocab_processor.transform(dev_text))) 70 | 71 | print("Train/Dev split: {:d}/{:d}".format(len(x_train), len(x_dev))) 72 | 73 | num_classes = y_binary_train.shape[1] 74 | print("num of classes: {:d}".format(num_classes)) 75 | 76 | 77 | # Training 78 | # ================================================== 79 | 80 | with tf.Graph().as_default(): 81 | session_conf = tf.ConfigProto( 82 | allow_soft_placement=FLAGS.allow_soft_placement, 83 | log_device_placement=FLAGS.log_device_placement) 84 | sess = tf.Session(config=session_conf) 85 | with sess.as_default(): 86 | cnn = KimCNN( 87 | sequence_length=x_train.shape[1], 88 | num_classes=num_classes, 89 | vocab_size=len(vocab_processor.vocabulary_), 90 | embedding_size=FLAGS.embedding_dim, 91 | 
filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))), 92 | num_filters=FLAGS.num_filters, 93 | l2_reg_lambda=FLAGS.l2_reg_lambda, 94 | loss_function=FLAGS.loss_function) 95 | 96 | # Define Training procedure 97 | global_step = tf.Variable(0, name="global_step", trainable=False) 98 | optimizer = tf.train.AdamOptimizer(1e-3) 99 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 100 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 101 | 102 | # Keep track of gradient values and sparsity (optional) 103 | grad_summaries = [] 104 | for g, v in grads_and_vars: 105 | if g is not None: 106 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g) 107 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), 108 | tf.nn.zero_fraction(g)) 109 | grad_summaries.append(grad_hist_summary) 110 | grad_summaries.append(sparsity_summary) 111 | grad_summaries_merged = tf.summary.merge(grad_summaries) 112 | 113 | # Output directory for models and summaries 114 | out_dir = os.path.abspath(os.path.join(os.path.curdir, 115 | 'runs', dataset_id, 116 | 'kim_cnn')) 117 | 118 | print("Writing to {}\n".format(out_dir)) 119 | 120 | if tf.gfile.Exists(out_dir): 121 | print('cleaning ', out_dir) 122 | tf.gfile.DeleteRecursively(out_dir) 123 | tf.gfile.MakeDirs(out_dir) 124 | 125 | # Summaries for loss and precision 126 | loss_summary = tf.summary.scalar("loss", cnn.loss) 127 | p1 = tf.summary.scalar("p1", cnn.p1) 128 | p3 = tf.summary.scalar("p3", cnn.p3) 129 | p5 = tf.summary.scalar("p5", cnn.p5) 130 | 131 | # Train Summaries 132 | train_summary_op = tf.summary.merge([loss_summary, p1, p3, p5, grad_summaries_merged]) 133 | train_summary_dir = os.path.join(out_dir, "summaries", "train") 134 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 135 | 136 | # Dev summaries 137 | dev_summary_op = tf.summary.merge([loss_summary, p1, p3, p5]) 138 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev") 139 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 140 | 141 | # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it 142 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 143 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 144 | if not os.path.exists(checkpoint_dir): 145 | os.makedirs(checkpoint_dir) 146 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 147 | 148 | # Write vocabulary 149 | vocab_processor.save(os.path.join(data_dir, "vocab")) 150 | 151 | # Initialize all variables 152 | sess.run(tf.global_variables_initializer()) 153 | 154 | def train_step(x_batch, y_binary_batch, y_batch_labels): 155 | """ 156 | A single training step 157 | """ 158 | feed_dict = { 159 | cnn.input_x: x_batch, 160 | cnn.input_y_binary: y_binary_batch, 161 | cnn.input_y_labels: label_lists_to_sparse_tuple( 162 | y_batch_labels, num_classes), # needs some conversion 163 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob 164 | } 165 | _, step, summaries, loss, p1, p3, p5 = sess.run( 166 | [train_op, global_step, train_summary_op, cnn.loss, cnn.p1, cnn.p3, cnn.p5], 167 | feed_dict) 168 | time_str = datetime.datetime.now().isoformat() 169 | print("{}: step {}, loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format( 170 | time_str, step, loss, p1, p3, p5)) 171 | train_summary_writer.add_summary(summaries, step) 172 | 173 | def dev_step(x_batch, y_binary_batch, y_batch_labels, writer=None): 174 | """ 175 | Evaluates model on a dev set 176 | """ 177 | feed_dict = { 178 | cnn.input_x: x_batch, 179 | cnn.input_y_binary: y_binary_batch, 180 | cnn.input_y_labels: label_lists_to_sparse_tuple( 181 | y_batch_labels, num_classes), # needs some conversion 182 | cnn.dropout_keep_prob: 1.0 183 | } 184 | step, summaries, loss, p1, p3, p5 = sess.run( 185 | [global_step, dev_summary_op, cnn.loss, cnn.p1, cnn.p3, cnn.p5], 186 | feed_dict) 187 | time_str = datetime.datetime.now().isoformat() 188 | print("[DEV] {}: step {}, loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format( 189 | time_str, step, loss, p1, p3, p5)) 190 | if writer: 191 | writer.add_summary(summaries, step) 192 | 193 | # Generate batches 194 | batches = batch_iter( 195 | list(zip(x_train, y_binary_train, y_id_train)), FLAGS.batch_size, FLAGS.num_epochs) 196 | # Training loop. For each batch... 
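# Note on the two label encodings fed below: the same labels go in twice, as a
# dense 0/1 indicator matrix for the cross-entropy loss (cnn.input_y_binary) and
# as integer label lists converted to a SparseTensor for the precision@k metrics
# (cnn.input_y_labels). Worked example with illustrative values, num_classes = 5:
#
#   y_id_batch     = [[0, 3], [2]]          # label ids per instance
#   y_binary_batch = [[1, 0, 0, 1, 0],      # same labels, indicator form
#                     [0, 0, 1, 0, 0]]
#   label_lists_to_sparse_tuple(y_id_batch, 5)
#   # -> (indices=[[0, 0], [0, 1], [1, 0]], values=[0, 3, 2], shape=(2, 5))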
197 | for batch in batches: 198 | x_batch, y_binary_batch, y_id_batch = zip(*batch) 199 | train_step(x_batch, y_binary_batch, y_id_batch) 200 | current_step = tf.train.global_step(sess, global_step) 201 | 202 | if current_step % FLAGS.evaluate_every == 0: 203 | print("\nEvaluation:") 204 | dev_step(x_dev, y_binary_dev, y_id_dev, writer=dev_summary_writer) 205 | print("") 206 | 207 | if current_step % FLAGS.checkpoint_every == 0: 208 | path = saver.save(sess, checkpoint_prefix, global_step=current_step) 209 | print("Saved model checkpoint to {}\n".format(path)) 210 | 211 | -------------------------------------------------------------------------------- /print_data_set_property.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import itertools 4 | import os 5 | import tensorflow as tf 6 | from data_helpers import load_pickle 7 | 8 | tf.flags.DEFINE_string('data_dir', 'data/datascience', 'directory of dataset') 9 | 10 | 11 | FLAGS = tf.flags.FLAGS 12 | FLAGS._parse_flags() 13 | print("\nParameters:") 14 | for attr, value in sorted(FLAGS.__flags.items()): 15 | print("{}={}".format(attr.upper(), value)) 16 | print("") 17 | 18 | data_dir = FLAGS.data_dir 19 | train, dev, test = load_pickle(os.path.join(data_dir, 20 | "labels_id_split.pkl")) 21 | 22 | 23 | data = train + dev + test 24 | 25 | print('#instances: ', len(data)) 26 | print('# unique labels: ', len(set(itertools.chain(*data)))) 27 | print('avg labels per instance: ', sum(map(len, data)) / len(data)) 28 | -------------------------------------------------------------------------------- /process_posts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "name": "stdout", 14 | "output_type": "stream", 15 | "text": [ 16 | "dataset containing 13934 records\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# coding: utf-8\n", 22 | "\n", 23 | "import os\n", 24 | "import re\n", 25 | "import pandas as pd\n", 26 | "import pickle as pkl\n", 27 | "import itertools\n", 28 | "import tensorflow as tf\n", 29 | "from data_helpers import strip_tags, clean_str\n", 30 | "\n", 31 | "\n", 32 | "tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience', 'directory of dataset')\n", 33 | "tf.flags.DEFINE_integer('tag_freq_threshold', 0, 'minimum frequency of a tag')\n", 34 | "\n", 35 | "FLAGS = tf.flags.FLAGS\n", 36 | "FLAGS._parse_flags()\n", 37 | "\n", 38 | "data_dir = FLAGS.data_dir\n", 39 | "tag_freq_threshold = FLAGS.tag_freq_threshold\n", 40 | "\n", 41 | "\n", 42 | "label_path = os.path.join(data_dir, \"labels.csv\")\n", 43 | "text_path = os.path.join(data_dir, \"input_text.csv\")\n", 44 | "df = pd.read_csv('{}/posts.csv'.format(data_dir), sep=',')\n", 45 | "\n", 46 | "\n", 47 | "print(\"dataset containing {} records\".format(df.shape[0]))\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": false, 55 | "deletable": true, 56 | "editable": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "target_question_ids = set(pkl.load(open('{}/connected_question_ids.pkl'.format(data_dir), 'rb')))\n", 61 | "id_target = df['Id'].apply(target_question_ids.__contains__)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": { 68 | "collapsed": false, 69 | 
"deletable": true, 70 | "editable": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "contains 5145 questions\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "qs = df[id_target & (df['PostTypeId'] == 1)] # we only consider questions here\n", 83 | "\n", 84 | "print(\"contains {} questions\".format(qs.shape[0]))\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "collapsed": false, 92 | "deletable": true, 93 | "editable": true 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "number of unique labels (frequency>0): 328\n", 101 | "num. questions with at least one valid labels: 5145\n", 102 | "saving labels to data/stackexchange/datascience/labels.csv\n", 103 | "saving input text to data/stackexchange/datascience/input_text.csv\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# extract tags\n", 109 | "regexp = re.compile(\"<(.+?)>\")\n", 110 | "\n", 111 | "\n", 112 | "def extract_tags(s):\n", 113 | " return regexp.findall(s)\n", 114 | "\n", 115 | "tags = qs[\"Tags\"].apply(extract_tags).tolist()\n", 116 | "\n", 117 | "# filter out infrequent tags\n", 118 | "tag_freq = pd.Series(list(itertools.chain(*tags))).value_counts()\n", 119 | "valid_tags = tag_freq.index[tag_freq > tag_freq_threshold]\n", 120 | "tag_set = set(valid_tags)\n", 121 | "\n", 122 | "print('number of unique labels (frequency>{}): {}'.format(\n", 123 | " tag_freq_threshold, len(tag_set)))\n", 124 | "\n", 125 | "\n", 126 | "normalized_tags = [[t for t in ts if t in tag_set] for ts in tags]\n", 127 | "\n", 128 | "\n", 129 | "# save labels to file\n", 130 | "y = pd.Series(list(map(lambda l: \",\".join(l), normalized_tags)), index=qs['Id'])\n", 131 | "\n", 132 | "mask = (y.apply(len) > 0).as_matrix()\n", 133 | "\n", 134 | "qs = qs[mask]\n", 135 | "\n", 136 | "assert y.shape[0] == qs.shape[0]\n", 137 | "\n", 138 | "print('num. 
questions with at least one valid labels: {}'.format(qs.shape[0]))\n", 139 | "\n", 140 | "print('saving labels to {}'.format(label_path))\n", 141 | "y.to_csv(label_path)\n", 142 | "\n", 143 | "\n", 144 | "body = qs['Body'].apply(strip_tags).apply(clean_str)\n", 145 | "title = qs['Title'].apply(strip_tags).apply(clean_str)\n", 146 | "\n", 147 | "# concatenate the texts\n", 148 | "input_text = pd.Series([' '.join(l) for l in list(zip(title, body))], index=qs['Id'])\n", 149 | "\n", 150 | "\n", 151 | "print(\"saving input text to {}\".format(text_path))\n", 152 | "input_text.to_csv(text_path)" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.5.2" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /process_posts.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import re 5 | import pandas as pd 6 | import pickle as pkl 7 | import itertools 8 | import tensorflow as tf 9 | from data_helpers import strip_tags, clean_str 10 | 11 | 12 | tf.flags.DEFINE_string('data_dir', '', 'directory of dataset') 13 | tf.flags.DEFINE_integer('tag_freq_threshold', 0, 'minimum frequency of a tag') 14 | 15 | FLAGS = tf.flags.FLAGS 16 | FLAGS._parse_flags() 17 | 18 | data_dir = FLAGS.data_dir 19 | tag_freq_threshold = FLAGS.tag_freq_threshold 20 | 21 | 22 | label_path = os.path.join(data_dir, "labels.csv") 23 | text_path = os.path.join(data_dir, "input_text.csv") 24 | df = pd.read_csv('{}/posts.csv'.format(data_dir), sep=',') 25 | 26 | 27 | print("dataset containing {} records".format(df.shape[0])) 28 | 29 | 30 | target_question_ids = set(pkl.load(open('{}/connected_question_ids.pkl'.format(data_dir), 'rb'))) 31 | id_target = df['Id'].apply(target_question_ids.__contains__) 32 | 33 | 34 | qs = df[id_target & (df['PostTypeId'] == 1)] # we only consider questions here 35 | 36 | print("contains {} questions".format(qs.shape[0])) 37 | 38 | 39 | # extract tags 40 | regexp = re.compile("<(.+?)>") 41 | 42 | 43 | def extract_tags(s): 44 | return regexp.findall(s) 45 | 46 | tags = qs["Tags"].apply(extract_tags).tolist() 47 | 48 | # filter out infrequent tags 49 | tag_freq = pd.Series(list(itertools.chain(*tags))).value_counts() 50 | valid_tags = tag_freq.index[tag_freq > tag_freq_threshold] 51 | tag_set = set(valid_tags) 52 | 53 | print('number of unique labels (frequency>{}): {}'.format( 54 | tag_freq_threshold, len(tag_set))) 55 | 56 | 57 | normalized_tags = [[t for t in ts if t in tag_set] for ts in tags] 58 | 59 | 60 | # save labels to file 61 | y = pd.Series(list(map(lambda l: ",".join(l), normalized_tags)), index=qs['Id']) 62 | 63 | mask = (y.apply(len) > 0).as_matrix() 64 | 65 | qs = qs[mask] 66 | 67 | assert y.shape[0] == qs.shape[0] 68 | 69 | print('num. 
questions with at least one valid labels: {}'.format(qs.shape[0])) 70 | 71 | print('saving labels to {}'.format(label_path)) 72 | y.to_csv(label_path) 73 | 74 | 75 | body = qs['Body'].apply(strip_tags).apply(clean_str) 76 | title = qs['Title'].apply(strip_tags).apply(clean_str) 77 | 78 | # concatenate the texts 79 | input_text = pd.Series([' '.join(l) for l in list(zip(title, body))], index=qs['Id']) 80 | 81 | 82 | print("saving input text to {}".format(text_path)) 83 | input_text.to_csv(text_path) 84 | -------------------------------------------------------------------------------- /process_train_dev_test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | encode: 4 | 1. the text using tf-idf 5 | 2. labels into binary vector 6 | """ 7 | 8 | import os 9 | import pickle as pkl 10 | import itertools 11 | import tensorflow as tf 12 | 13 | from sklearn.feature_extraction.text import TfidfVectorizer 14 | 15 | from data_helpers import MultiLabelIntegerEncoder, label_ids_to_binary_matrix 16 | 17 | 18 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 19 | tf.flags.DEFINE_integer('tag_freq_threshold', 0, 'minimum frequency of a tag') 20 | 21 | FLAGS = tf.flags.FLAGS 22 | FLAGS._parse_flags() 23 | 24 | data_dir = FLAGS.data_dir 25 | 26 | dump_path = '{}/train_dev_test.pkl'.format(data_dir) 27 | (x_text_train, x_text_dev, x_text_test, 28 | y_labels_train, y_labels_dev, y_labels_test, 29 | node_ids_train, node_ids_dev, node_ids_test) = \ 30 | pkl.load(open(dump_path, 'rb')) 31 | 32 | 33 | vectorizer = TfidfVectorizer() 34 | x_tfidf_train = vectorizer.fit_transform(x_text_train) 35 | x_tfidf_dev, x_tfidf_test = vectorizer.transform(x_text_dev), vectorizer.transform(x_text_test) 36 | 37 | 38 | label_encoder = MultiLabelIntegerEncoder() 39 | 40 | 41 | def labels_to_str_list(y_labels): 42 | return list(map(lambda s: s.split(','), y_labels)) 43 | 44 | 45 | 46 | 47 | y_ints_train = label_encoder.fit_transform(labels_to_str_list(y_labels_train)) 48 | y_ints_dev, y_ints_test = label_encoder.transform(labels_to_str_list(y_labels_dev)), \ 49 | label_encoder.transform(labels_to_str_list(y_labels_test)) 50 | 51 | print('y_ints_dev[0]', y_ints_dev[0]) 52 | 53 | n_cols = len(set(itertools.chain(*y_ints_train))) 54 | n_cols += 1 # for UNK labels 55 | 56 | y_binary_train = label_ids_to_binary_matrix(y_ints_train, (len(y_ints_train), n_cols)) 57 | y_binary_dev = label_ids_to_binary_matrix(y_ints_dev, (len(y_ints_dev), n_cols)) 58 | print('y_binary_dev', y_binary_dev) 59 | y_binary_test = label_ids_to_binary_matrix(y_ints_test, (len(y_ints_test), n_cols)) 60 | 61 | text_path = os.path.join(data_dir, 'text_split.pkl') 62 | tfidf_path = os.path.join(data_dir, 'tfidf_split.pkl') 63 | labels_path = os.path.join(data_dir, 'labels_split.pkl') 64 | labels_id_path = os.path.join(data_dir, 'labels_id_split.pkl') 65 | labels_binary_path = os.path.join(data_dir, 'labels_binary_split.pkl') 66 | node_ids_path = os.path.join(data_dir, 'node_ids_split.pkl') 67 | 68 | 69 | label_encoder_path = os.path.join(data_dir, 'label_encoder.pkl') 70 | tfidf_vectorizer_path = os.path.join(data_dir, 'text_vectorizer.pkl') 71 | 72 | 73 | def dump_data(variable, path): 74 | pkl.dump(variable, open(path, 'wb')) 75 | 76 | 77 | dump_info = [ 78 | ((x_text_train, x_text_dev, x_text_test), text_path), 79 | ((x_tfidf_train, x_tfidf_dev, x_tfidf_test), tfidf_path), 80 | ((y_labels_train, y_labels_dev, y_labels_test), labels_path), 81 | 
((y_ints_train, y_ints_dev, y_ints_test), labels_id_path), 82 | ((y_binary_train, y_binary_dev, y_binary_test), labels_binary_path), 83 | ((node_ids_train, node_ids_dev, node_ids_test), node_ids_path) 84 | ] 85 | 86 | 87 | for var, path in dump_info: 88 | print('dumping to', path) 89 | dump_data(var, path) 90 | -------------------------------------------------------------------------------- /project-slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaohan2012/multi-label-text-classification/e27d0eb259fcf8b529330f4b7496dac5d3919340/project-slides.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | arrow==0.10.0 3 | autopep8==1.2.4 4 | backports.weakref==1.0rc1 5 | beautifulsoup4==4.6.0 6 | bleach==1.5.0 7 | blinker==1.3 8 | bs4==0.0.1 9 | Cartopy==0.14.2 10 | chardet==2.3.0 11 | click==6.2 12 | cloud-init==0.7.9 13 | colorama==0.3.7 14 | command-not-found==0.3 15 | configobj==5.0.6 16 | cryptography==1.2.3 17 | cycler==0.10.0 18 | Cython==0.25.2 19 | decorator==4.1.2 20 | entrypoints==0.2.3 21 | fastxml==2.0.0 22 | flake8==2.6.2 23 | future==0.16.0 24 | html5lib==0.9999999 25 | idna==2.0 26 | importmagic==0.1.7 27 | ipykernel==4.6.1 28 | ipython==6.2.0 29 | ipython-genutils==0.2.0 30 | ipywidgets==7.0.1 31 | jedi==0.11.0 32 | Jinja2==2.9.6 33 | joblib==0.10.3 34 | jsonpatch==1.10 35 | jsonpointer==1.9 36 | jsonschema==2.6.0 37 | jupyter==1.0.0 38 | jupyter-client==5.1.0 39 | jupyter-console==5.2.0 40 | jupyter-core==4.3.0 41 | language-selector==0.1 42 | line-profiler==2.0 43 | Markdown==2.6.9 44 | MarkupSafe==1.0 45 | matplotlib==2.0.2 46 | mccabe==0.5.0 47 | memory-profiler==0.43 48 | mistune==0.7.4 49 | mpld3==0.3 50 | nbconvert==5.3.1 51 | nbformat==4.4.0 52 | network==0.1 53 | networkx==1.11 54 | nose==1.3.7 55 | notebook==5.1.0 56 | numpy==1.13.1 57 | oauthlib==1.0.3 58 | olefile==0.44 59 | packaging==16.8 60 | pandas==0.20.3 61 | pandocfilters==1.4.2 62 | parso==0.1.0 63 | pep8==1.7.0 64 | pexpect==4.2.1 65 | pickleshare==0.7.4 66 | Pillow==4.2.1 67 | pkg-resources==0.0.0 68 | prettytable==0.7.2 69 | prompt-toolkit==1.0.15 70 | protobuf==3.4.0 71 | psutil==5.2.0 72 | ptyprocess==0.5.2 73 | py==1.4.32 74 | pyasn1==0.1.9 75 | pycodestyle==2.0.0 76 | pycurl==7.43.0 77 | pyflakes==1.2.3 78 | Pygments==2.2.0 79 | pygobject==3.20.0 80 | pygraphviz==1.3.1 81 | PyJWT==1.3.0 82 | pyparsing==2.2.0 83 | pyserial==3.0.1 84 | pyshp==1.2.11 85 | pytest==3.0.6 86 | python-apt==1.1.0b1 87 | python-dateutil==2.6.1 88 | python-debian==0.1.27 89 | python-systemd==231 90 | pytz==2017.2 91 | PyYAML==3.11 92 | pyzmq==16.0.2 93 | qtconsole==4.3.1 94 | requests==2.9.1 95 | rope==0.10.3 96 | scikit-learn==0.19.0 97 | scipy==0.19.1 98 | seaborn==0.8.1 99 | Shapely==1.5.17.post1 100 | simplegeneric==0.8.1 101 | six==1.11.0 102 | sklearn==0.0 103 | slack-cleaner==0.3.0 104 | slacker==0.9.30 105 | ssh-import-id==5.5 106 | tensorflow==1.3.0 107 | tensorflow-tensorboard==0.1.6 108 | terminado==0.6 109 | testpath==0.3.1 110 | tflearn==0.3.2 111 | tornado==4.5.2 112 | tqdm==4.10.0 113 | traitlets==4.3.2 114 | ufw==0.35 115 | unattended-upgrades==0.1 116 | urllib3==1.13.1 117 | virtualenv==15.0.1 118 | wcwidth==0.1.7 119 | webencodings==0.5.1 120 | Werkzeug==0.12.2 121 | widgetsnbextension==3.0.3 122 | yapf==0.11.0 123 | 
-------------------------------------------------------------------------------- /sample_random_walks.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from tqdm import tqdm 6 | from graph_tool import load_graph 7 | 8 | 9 | def random_walk(g, start_node, walk_length, alpha=0.05): 10 | """ 11 | random walk on unweighted, undirected graph 12 | 13 | Args: 14 | alpha: proba of restart 15 | 16 | Returns: 17 | a list of integer list 18 | """ 19 | walk = [start_node] 20 | c = g.vertex(start_node) 21 | for i in range(walk_length): 22 | if np.random.random() <= alpha: 23 | n = g.vertex(start_node) 24 | else: 25 | n = np.random.choice(list(c.out_neighbours())) 26 | c = n 27 | walk.append(int(c)) 28 | return walk 29 | 30 | 31 | def yield_n_random_walks(n, g, walk_length, alpha): 32 | nodes = list(map(int, g.vertices())) 33 | while n > 0: 34 | if n >= g.num_vertices(): 35 | nodes_to_start = nodes 36 | else: 37 | nodes_to_start = np.random.choice(nodes, n, replace=False) 38 | 39 | for v in nodes_to_start: 40 | yield random_walk(g, v, walk_length, alpha) 41 | 42 | n -= len(nodes_to_start) 43 | 44 | 45 | if __name__ == '__main__': 46 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 47 | 48 | FLAGS = tf.flags.FLAGS 49 | FLAGS._parse_flags() 50 | print("\nParameters:") 51 | for attr, value in sorted(FLAGS.__flags.items()): 52 | print("{}={}".format(attr.upper(), value)) 53 | print("") 54 | 55 | data_dir = FLAGS.data_dir 56 | 57 | # params on random walk 58 | num_walks_per_node = 80 59 | window_size = 10 60 | walk_length = 6 61 | alpha = 0.05 62 | 63 | # output 64 | walks = [] # list of list of nodes 65 | 66 | g = load_graph('{}/question_graph.gt'.format(data_dir)) 67 | total = num_walks_per_node * g.num_vertices() 68 | with open('{}/random_walks.txt'.format(data_dir), 'w') as f: 69 | for walk in tqdm(yield_n_random_walks(total, g, walk_length, alpha), total=total): 70 | f.write(' '.join(map(str, walk)) + '\n') 71 | print('written to ', '{}/random_walks.txt'.format(data_dir)) 72 | -------------------------------------------------------------------------------- /scripts/preprocessing_pipeline.sh: -------------------------------------------------------------------------------- 1 | #! /bin/zsh 2 | 3 | echo "build question graph..." 4 | python build_question_graph.py --data_dir $1 5 | 6 | echo "process posts..." 7 | python process_posts.py --data_dir $1 8 | 9 | echo "splitting data into train/test/dev..." 10 | python split_train_dev_test.py --data_dir $1 11 | 12 | echo "process train dev test..." 
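# this final step (see process_train_dev_test.py) encodes the text with tf-idf and the
# tags as integer ids / binary indicator vectors, writing text_split.pkl,
# tfidf_split.pkl, labels_split.pkl, labels_id_split.pkl, labels_binary_split.pkl
# and node_ids_split.pkl into the data directory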
13 | python process_train_dev_test.py --data_dir $1 14 | -------------------------------------------------------------------------------- /split_train_dev_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle as pkl 3 | import numpy as np 4 | import pandas as pd 5 | import tensorflow as tf 6 | from sklearn.cross_validation import train_test_split 7 | 8 | 9 | tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset') 10 | tf.flags.DEFINE_float('test_sample_percentage', 0.1, 'precentage for test samples') 11 | tf.flags.DEFINE_float('dev_sample_percentage', 0.1, 'precentage for dev samples') 12 | 13 | 14 | FLAGS = tf.flags.FLAGS 15 | FLAGS._parse_flags() 16 | print("\nParameters:") 17 | for attr, value in sorted(FLAGS.__flags.items()): 18 | print("{}={}".format(attr.upper(), value)) 19 | print("") 20 | 21 | data_dir = FLAGS.data_dir 22 | 23 | text_path = os.path.join(data_dir, "input_text.csv") 24 | tdf = pd.read_csv(text_path, header=None) 25 | x_text = tdf[1] 26 | 27 | label_path = os.path.join(data_dir, "labels.csv") 28 | ldf = pd.read_csv(label_path, header=None) 29 | y_labels = ldf[1] 30 | 31 | node_ids = np.arange(len(x_text)) 32 | 33 | # train+dev and test 34 | (x_text_train, x_text_test, 35 | y_labels_train, y_labels_test, 36 | node_ids_train, node_ids_test) = \ 37 | train_test_split( 38 | x_text, y_labels, node_ids, 39 | train_size=1 - FLAGS.test_sample_percentage, 40 | random_state=123456) 41 | 42 | # re-scale 43 | train_percentage = 1 - FLAGS.dev_sample_percentage - FLAGS.test_sample_percentage 44 | new_train_percentage = train_percentage / (train_percentage + FLAGS.dev_sample_percentage) 45 | 46 | # train and dev 47 | (x_text_train, x_text_dev, 48 | y_labels_train, y_labels_dev, 49 | node_ids_train, node_ids_dev) = \ 50 | train_test_split( 51 | x_text_train, y_labels_train, node_ids_train, 52 | train_size=new_train_percentage, 53 | random_state=42) 54 | 55 | print("Train/Dev/Test split: {:d}/{:d}/{:d}".format( 56 | len(x_text_train), len(x_text_dev), len(x_text_test))) 57 | 58 | dump_path = '{}/train_dev_test.pkl'.format(data_dir) 59 | 60 | print('dumping to ', dump_path) 61 | pkl.dump((x_text_train.tolist(), x_text_dev.tolist(), x_text_test.tolist(), 62 | y_labels_train.tolist(), y_labels_dev.tolist(), y_labels_test.tolist(), 63 | node_ids_train, node_ids_dev, node_ids_test), 64 | open(dump_path, 'wb')) 65 | -------------------------------------------------------------------------------- /test_data_helpers.py: -------------------------------------------------------------------------------- 1 | from data_helpers import RWBatchGenerator 2 | 3 | 4 | def test_dw_batch_generator(): 5 | walks = [[1, 2, 3, 4], [4, 5, 6]] 6 | 7 | g = RWBatchGenerator(walks, 2, 2, 1) # 2 batches 8 | 9 | # iterate 3 rounds 10 | expected_batches = [{(2, 1), (2, 3)}, {(3, 2), (3, 4)}, {(5, 4), (5, 6)}] * 100 11 | for exp in expected_batches: 12 | batches, labels = g.next_batch() 13 | assert set(zip(batches, labels)) == exp 14 | 15 | g = RWBatchGenerator(walks, 8, 2, 1) # 8 batches 16 | batches, labels = g.next_batch() 17 | assert set(zip(batches, labels)) == {(2, 1), (2, 3), (3, 2), (3, 4), (5, 4), (5, 6)} 18 | assert set(list(zip(batches, labels))[-2:]) == {(2, 1), (2, 3)} # the last two loops back 19 | -------------------------------------------------------------------------------- /test_eval_helpers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 
import numpy as np 3 | import tensorflow as tf 4 | from scipy.sparse import csr_matrix 5 | 6 | from eval_helpers import tf_precision_at_k, label_lists_to_sparse_tuple, \ 7 | precision_at_ks 8 | 9 | 10 | @pytest.fixture 11 | def pred_value_1(): 12 | # [0, 1], [2, 1], [0, 2] 13 | return np.array([[0.9, 0.8, 0.7], [0.7, 0.8, 0.9], [0.9, 0.7, 0.8]], dtype=np.float32) 14 | 15 | 16 | @pytest.fixture 17 | def pred_value_2(): 18 | # [2, 1], [0, 1], [2, 1] 19 | return np.array([[0.7, 0.8, 0.9], [0.9, 0.8, 0.7], [0.7, 0.8, 0.9]], dtype=np.float32) 20 | 21 | 22 | @pytest.fixture 23 | def correct_values(): 24 | return [[0, 1, 2], [1, 2], [0, 2]] 25 | 26 | 27 | def test_tf_precision_at_k(pred_value_1, pred_value_2, correct_values): 28 | n_classes = 3 29 | 30 | sparse_tensor_tuple = label_lists_to_sparse_tuple(correct_values, n_classes) 31 | 32 | with tf.Session() as sess: 33 | print(tf.sparse_to_dense(sparse_tensor_tuple[0], 34 | sparse_tensor_tuple[2], 35 | sparse_tensor_tuple[1]).eval()) 36 | 37 | pred = tf.placeholder(tf.float32, shape=[None, None], name='pred') 38 | correct_labels = tf.sparse_placeholder(tf.int32, shape=[None, n_classes], name='correct_labels') 39 | precision = tf_precision_at_k(pred, correct_labels, k=2) 40 | 41 | p1 = sess.run(precision, 42 | feed_dict={ 43 | pred: pred_value_1, 44 | correct_labels: sparse_tensor_tuple 45 | }) 46 | 47 | assert np.isclose(p1, 1.0) 48 | 49 | p2 = sess.run(precision, 50 | feed_dict={ 51 | pred: pred_value_2, 52 | correct_labels: sparse_tensor_tuple 53 | }) 54 | assert np.isclose(p2, np.mean([1, 0.5, 0.5])) 55 | 56 | 57 | def test_precision_at_k_dense(pred_value_1, pred_value_2, correct_values): 58 | p1 = precision_at_ks(pred_value_1, correct_values, ks=[2])[0] 59 | assert np.isclose(p1, 1.0) 60 | 61 | p2 = precision_at_ks(pred_value_2, correct_values, ks=[2])[0] 62 | assert np.isclose(p2, np.mean([1, 0.5, 0.5])) 63 | 64 | 65 | def test_precision_at_k_sparse(pred_value_1, pred_value_2, correct_values): 66 | p1 = precision_at_ks(csr_matrix(pred_value_1), correct_values, ks=[2])[0] 67 | assert np.isclose(p1, 1.0) 68 | 69 | p2 = precision_at_ks(csr_matrix(pred_value_2), correct_values, ks=[2])[0] 70 | assert np.isclose(p2, np.mean([1, 0.5, 0.5])) 71 | -------------------------------------------------------------------------------- /text_cnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | class TextCNN: 5 | def __init__(self, sentence_length, num_classes, 6 | embedding_dim, 7 | vocab_size, 8 | filter_sizes, 9 | num_filters, ): 10 | # placeholders 11 | self.input_x = tf.placeholder( 12 | tf.int32, [None, sentence_length], name='input_x') 13 | self.input_y = tf.placeholder( 14 | tf.float32, [None, num_classes], name='input_y') 15 | self.dropout_keep_prob = tf.placeholder( 16 | tf.float32, name='dropout_keep_prob') 17 | 18 | # variables 19 | with tf.device('/cpu:0'), tf.name_scope('embedding'): 20 | # like name space for the operation inside this context 21 | W = tf.Variable(tf.random_uniform([vocab_size, embedding_dim], -0.1, 1.0)) 22 | embedded_x_3d = tf.nn.embedding_lookup(W, self.input_x) 23 | 24 | # make it into 4 dim 25 | # equivalent but more verbose way: 26 | # tf.reshape(embedded_x_3d, [-1, sentence_length, embedding_dim, 1]) 27 | self.embedded_x = tf.expand_dims(embedded_x_3d, -1) 28 | 29 | pooled_outputs = [] # to be concatenated 30 | for i, filter_size in enumerate(filter_sizes): 31 | 32 | with tf.name_scope('conv-maxpool-{}'.format(filter_size)): 33 | shape = 
[filter_size, embedding_dim, 1, num_filters] 34 | init_W = tf.truncated_normal(shape, stddev=0.1) 35 | W = tf.Variable(init_W, name='W') 36 | 37 | init_b = tf.constant(0.1, shape=[num_filters]) 38 | b = tf.Variable(init_b, name='b') 39 | conv = tf.nn.conv2d(self.embedded_x, W, 40 | padding="VALID", strides=[1, 1, 1, 1], 41 | name='conv') 42 | relu_out = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu') 43 | maxpool_out = tf.nn.max_pool( 44 | relu_out, 45 | ksize=[1, (sentence_length - filter_size + 1), 1, 1], 46 | strides=[1, 1, 1, 1], 47 | padding='VALID', 48 | name='pool') 49 | pooled_outputs.append(maxpool_out) 50 | 51 | num_filters_total = num_filters * len(filter_sizes) 52 | 53 | # 4d: [batch, 1, 1, filter_sizes] 54 | # "3" here is tricky 55 | self.h_pool = tf.concat(pooled_outputs, 3) 56 | 57 | # flatten into [batch, num_filters_total] 58 | self.h_pool_flat = tf.reshape(self.h_pool, 59 | [-1, num_filters_total], 60 | name="pooled_outputs") 61 | 62 | with tf.name_scope('dropout'): 63 | self.h_dropout = tf.nn.dropout( 64 | self.h_pool_flat, self.dropout_keep_prob) 65 | 66 | with tf.name_scope('output'): 67 | W = tf.Variable( 68 | tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), 69 | name='W') 70 | b = tf.Variable( 71 | tf.constant(0.1, shape=[num_classes]), 72 | name='b' 73 | ) 74 | 75 | self.scores = tf.nn.xw_plus_b(self.h_dropout, W, b, name="scores") 76 | 77 | probas = tf.sigmoid(self.scores) 78 | self.predictions = tf.round(probas, name="predictions") 79 | 80 | with tf.name_scope('loss'): 81 | losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.scores, labels=self.input_y) 82 | self.loss = tf.reduce_mean(losses) 83 | 84 | with tf.name_scope('performance'): 85 | self.precision = tf.metrics.precision(self.input_y, self.predictions, name="precision-micro")[1] 86 | self.recall = tf.metrics.recall(self.input_y, self.predictions, name="recall-micro")[1] 87 | -------------------------------------------------------------------------------- /tf_gather.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# demo on how to slice a tensor in tensorflow" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import tensorflow as tf\n", 23 | "import numpy as np" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "a = np.arange(10).reshape(5, 2)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([[0, 1],\n", 48 | " [2, 3],\n", 49 | " [4, 5],\n", 50 | " [6, 7],\n", 51 | " [8, 9]])" 52 | ] 53 | }, 54 | "execution_count": 4, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "a" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "v = tf.constant(a)\n", 72 | "indices = tf.placeholder(dtype=tf.int32)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 6, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | 
"source": [ 83 | "sub_v = tf.gather(v, indices)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 7, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "sess = tf.InteractiveSession()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 8, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "output = sess.run(sub_v, feed_dict={indices: [2,3,1]})" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 9, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "array([[4, 5],\n", 119 | " [6, 7],\n", 120 | " [2, 3]])" 121 | ] 122 | }, 123 | "execution_count": 9, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "output" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "array([[4, 5],\n", 143 | " [6, 7],\n", 144 | " [2, 3]])" 145 | ] 146 | }, 147 | "execution_count": 10, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "a[[2,3,1]]" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "assert (a[[2,3,1]] == output).all()" 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.5.2" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 2 189 | } 190 | -------------------------------------------------------------------------------- /tf_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | from tensorflow.contrib.tensorboard.plugins import projector 4 | 5 | 6 | def flatten(t): 7 | """flatten tensor of any dimension to 1d""" 8 | return tf.reshape(t, [tf.reduce_prod(t.shape)]) 9 | 10 | 11 | def dynamic_max_k_pool(t, k): 12 | """ 13 | perform dynamic max-k pooling on t, 14 | 15 | note that: 16 | 17 | 1. the 18 | 2. only supports 1d data for now 19 | 20 | Param: 21 | ----------- 22 | t: Tensor, 2d (batches, repr) 23 | k: int 24 | 25 | Return: 26 | ----------- 27 | Tensor, 2d (batches, k) 28 | """ 29 | 30 | 31 | def save_embedding_for_viz(embeddings, session, metadata_path, checkpoint_dir): 32 | embeddings_val = embeddings.eval() 33 | 34 | embedding_var = tf.Variable(embeddings_val, name='node_embedding') 35 | session.run(embedding_var.initializer) 36 | 37 | # Format: tensorflow/tensorboard/plugins/projector/projector_config.proto 38 | config = projector.ProjectorConfig() 39 | 40 | # You can add multiple embeddings. Here we add only one. 41 | embedding = config.embeddings.add() 42 | embedding.tensor_name = embedding_var.name 43 | 44 | # Link this tensor to its metadata file (e.g. labels). 45 | embedding.metadata_path = metadata_path 46 | 47 | # Use the same LOG_DIR where you stored your checkpoint. 
48 | summary_writer = tf.summary.FileWriter(checkpoint_dir) 49 | 50 | # The next line writes a projector_config.pbtxt in the LOG_DIR. TensorBoard will 51 | # read this file during startup. 52 | projector.visualize_embeddings(summary_writer, config) 53 | 54 | saver = tf.train.Saver([embedding_var]) 55 | saver.save(session, os.path.join(checkpoint_dir, 'model2.ckpt'), 1) 56 | print('embedding for visualization saved') 57 | 58 | 59 | def get_variable_value_from_checkpoint(checkpoint_file, variable_names=[]): 60 | """load from checkpoint_file and read the values of the variables of `variable_name` 61 | 62 | return 63 | 64 | list of variable values 65 | """ 66 | sess = tf.Session() 67 | with sess.as_default(): 68 | saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) 69 | saver.restore(sess, checkpoint_file) 70 | 71 | vals = [] 72 | for variable_name in variable_names: 73 | embedding_table = sess.graph.get_operation_by_name(variable_name) 74 | vals.append(embedding_table.outputs[0].eval()) 75 | 76 | return vals 77 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import time 5 | import pickle as pkl 6 | import tensorflow as tf 7 | 8 | from tensorflow.contrib import learn 9 | from sklearn.preprocessing import MultiLabelBinarizer 10 | 11 | from data_helpers import batch_iter 12 | from text_cnn import TextCNN 13 | 14 | 15 | tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation") 16 | 17 | # model parameters 18 | tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)") 19 | tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')") 20 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") 21 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)") 22 | 23 | 24 | # Training parameters 25 | tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)") 26 | tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs (default: 200)") 27 | tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)") 28 | tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)") 29 | tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)") 30 | 31 | # Misc Parameters 32 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") 33 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") 34 | 35 | 36 | FLAGS = tf.flags.FLAGS 37 | FLAGS._parse_flags() 38 | print("\nParameters:") 39 | for attr, value in sorted(FLAGS.__flags.items()): 40 | print("{}={}".format(attr.upper(), value)) 41 | print("") 42 | 43 | 44 | data_dir = 'data/stackexchange/datascience/' 45 | 46 | input_text = pd.read_csv(os.path.join(data_dir, 'input_text.csv'), header=None)[1].tolist() 47 | labels = pd.read_csv(os.path.join(data_dir, 'labels.csv'), header=None)[0].tolist() 48 | labels = list(map(lambda s: s.split(','), labels)) 49 | 50 | 51 | # In[34]: 52 | 53 | 54 | # filter out too long documents 55 | max_doc_len = 1000 56 | tpls = [(t, l) for t, l in zip(input_text, labels) if len(t.split()) <= max_doc_len] 57 | input_text, 
labels = list(zip(*tpls)) 58 | 59 | text_processor = learn.preprocessing.VocabularyProcessor(max_doc_len) 60 | x = np.array(list(text_processor.fit_transform(input_text))) 61 | 62 | 63 | mb = MultiLabelBinarizer() 64 | y = mb.fit_transform(labels) 65 | 66 | # split it 67 | np.random.seed(12345) 68 | 69 | ind = np.random.permutation(x.shape[0]) 70 | shuffled_x = x[ind, :] 71 | shuffled_y = y[ind, :] 72 | 73 | idx = int(FLAGS.dev_sample_percentage * x.shape[0]) 74 | train_x, train_y = shuffled_x[idx:, :], shuffled_y[idx:, :] 75 | dev_x, dev_y = shuffled_x[:idx, :], shuffled_y[:idx, :] 76 | 77 | 78 | with tf.Session() as sess: 79 | 80 | cnn = TextCNN(max_doc_len, len(mb.classes_), FLAGS.embedding_dim, len(text_processor.vocabulary_), 81 | list(map(int, FLAGS.filter_sizes.split(','))), 82 | FLAGS.num_filters) 83 | # train operation 84 | global_step = tf.Variable(0, name="global_step", trainable=False) 85 | optimizer = tf.train.AdamOptimizer(1e-5) 86 | grads_and_vars = optimizer.compute_gradients(cnn.loss) 87 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step) 88 | 89 | # IO direction stuff 90 | timestamp = str(int(time.time())) 91 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp)) 92 | 93 | 94 | # summary writer 95 | train_summary_dir = os.path.join(out_dir, "summary/train") 96 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph) 97 | 98 | dev_summary_dir = os.path.join(out_dir, "summary/dev") 99 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph) 100 | 101 | # checkpoint writer 102 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints")) 103 | if not os.path.exists(checkpoint_dir): 104 | os.makedirs(checkpoint_dir) 105 | checkpoint_prefix = os.path.join(checkpoint_dir, "model") 106 | saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints) 107 | 108 | # summary operation 109 | grad_summaries = [] 110 | for grad, v in grads_and_vars: 111 | if grad is not None: 112 | hist = tf.summary.histogram("{}/grad/hist".format(v.name), grad) 113 | sparsity = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(grad)) 114 | grad_summaries.append(hist) 115 | grad_summaries.append(sparsity) 116 | grad_summary = tf.summary.merge(grad_summaries) 117 | 118 | prec_summary = tf.summary.scalar("precision", cnn.precision) 119 | rec_summary = tf.summary.scalar("recall", cnn.recall) 120 | loss_summary = tf.summary.scalar("loss", cnn.loss) 121 | 122 | train_summary_op = tf.summary.merge([grad_summary, prec_summary, rec_summary, loss_summary]) 123 | dev_summary_op = tf.summary.merge([prec_summary, rec_summary, loss_summary]) 124 | 125 | # real code 126 | sess.run(tf.global_variables_initializer()) 127 | sess.run(tf.local_variables_initializer()) 128 | 129 | # dump vectorizer 130 | text_processor.save(os.path.join(out_dir, 'text_processor')) 131 | pkl.dump(mb, open(os.path.join(out_dir, 'label_encoder.pkl'), 'wb')) 132 | 133 | data = list(zip(train_x, train_y)) 134 | batches = batch_iter(data, FLAGS.batch_size, FLAGS.num_epochs) 135 | for batch in batches: 136 | batch_x, batch_y = zip(*batch) 137 | feed_dict = {cnn.input_x: batch_x, 138 | cnn.input_y: batch_y, 139 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob} 140 | 141 | current_step = tf.train.global_step(sess, global_step) 142 | _, current_step, summaries, loss, prec, rec = sess.run( 143 | [train_op, global_step, train_summary_op, cnn.loss, cnn.precision, cnn.recall], 144 | feed_dict=feed_dict) 145 | 
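        # note: cnn.precision and cnn.recall are the update ops of tf.metrics.precision /
        # tf.metrics.recall, i.e. streaming metrics, so the values printed here accumulate
        # over all sess.run calls (including the dev evaluations below) rather than being per-batch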
print("(TRAIN) at step {}: loss={:.2f}, precision={:4f}, recall={:4f}".format(current_step, loss, prec, rec)) 146 | train_summary_writer.add_summary(summaries, current_step) 147 | 148 | if current_step % FLAGS.evaluate_every == 0: 149 | loss, summaries, prec, rec = sess.run( 150 | [cnn.loss, dev_summary_op, cnn.precision, cnn.recall], 151 | feed_dict={cnn.input_x: dev_x, cnn.input_y: dev_y, cnn.dropout_keep_prob: 1}) 152 | print("(DEV) at step {}: loss={:2f}, precision={:4f}, recall={:4f}".format( 153 | current_step, loss, prec, rec)) 154 | dev_summary_writer.add_summary(summaries, current_step) 155 | 156 | if current_step % FLAGS.checkpoint_every == 0: 157 | saver.save(sess, checkpoint_prefix, global_step=global_step) 158 | 159 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | import math 2 | import tensorflow as tf 3 | 4 | 5 | class Word2Vec(): 6 | """ 7 | model for word2vec 8 | can be used network embedding as well 9 | 10 | Args: 11 | 12 | num_sampled: int, number of negative examples to sample 13 | vocabulary_size: int 14 | embedding_size: int 15 | """ 16 | 17 | def __init__(self, 18 | num_sampled, 19 | vocabulary_size, 20 | embedding_size, 21 | embedding_value=None, 22 | nce_W_value=None, 23 | nce_b_value=None): 24 | 25 | self.vocabulary_size, self.embedding_size = (vocabulary_size, 26 | embedding_size) 27 | assert self.vocabulary_size > 0 28 | assert self.embedding_size > 0 29 | 30 | # Input data. 31 | self.train_inputs = tf.placeholder(tf.int32, shape=None, name='input_x') 32 | self.train_labels = tf.placeholder(tf.int32, shape=[None, 1], name='input_y') 33 | 34 | # Ops and variables pinned to the CPU because of missing GPU implementation 35 | with tf.device('/cpu:0'): 36 | # Look up self.embeddings for inputs. 37 | with tf.name_scope('embedding'): 38 | if embedding_value is None: 39 | embedding_value = tf.random_uniform( 40 | [self.vocabulary_size, self.embedding_size], -1.0, 1.0) 41 | else: 42 | assert (self.vocabulary_size, self.embedding_size) == \ 43 | embedding_value.shape, 'shape does not match' 44 | 45 | self.embeddings = tf.Variable( 46 | embedding_value, 47 | name='table') 48 | embed = tf.nn.embedding_lookup(self.embeddings, self.train_inputs, name='looked-up-value') 49 | 50 | with tf.name_scope('nce'): 51 | # Construct the variables for the NCE loss 52 | if nce_W_value is None: 53 | nce_W_value = tf.truncated_normal([self.vocabulary_size, self.embedding_size], 54 | stddev=1.0 / math.sqrt(self.embedding_size)) 55 | self.nce_weights = tf.Variable(nce_W_value) 56 | 57 | if nce_b_value is None: 58 | nce_b_value = tf.zeros([self.vocabulary_size]) 59 | self.nce_biases = tf.Variable(nce_b_value) 60 | 61 | # Compute the average NCE loss for the batch. 62 | # tf.nce_loss automatically draws a new sample of the negative labels each 63 | # time we evaluate the loss. 64 | with tf.name_scope('loss'): 65 | self.loss = tf.reduce_mean( 66 | tf.nn.nce_loss(weights=self.nce_weights, 67 | biases=self.nce_biases, 68 | labels=self.train_labels, 69 | inputs=embed, 70 | num_sampled=num_sampled, 71 | num_classes=self.vocabulary_size)) 72 | 73 | norm = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keep_dims=True)) 74 | self.normalized_embeddings = self.embeddings / norm 75 | --------------------------------------------------------------------------------