├── data
│   ├── w2v
│   │   └── .gitignore
│   ├── preprocess_data.sh
│   ├── logicnn_features.py
│   ├── preprocess_stsa.py
│   └── raw
│       └── stsa.binary.dev
├── run.sh
├── .gitignore
├── README.md
├── fol.py
├── logicnn_classes.py
└── logicnn_sentiment.py

/data/w2v/.gitignore:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python logicnn_sentiment.py -nonstatic -word2vec
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# data
data/*.p

# vi
tags
*.tags
*.swp
*.swo

# backup
*_bak
--------------------------------------------------------------------------------
/data/preprocess_data.sh:
--------------------------------------------------------------------------------
# preprocess raw data
python preprocess_stsa.py ./raw/ ./w2v/GoogleNews-vectors-negative300.bin
# extract rule features
python logicnn_features.py ./stsa.binary.p
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
An implementation of the following paper, applied to sentiment classification:

[Harnessing Deep Neural Networks with Logic Rules](https://arxiv.org/abs/1603.06318)
Zhiting Hu, Xuezhe Ma, Zhengzhong Liu, Eduard Hovy, Eric P. Xing, ACL 2016

* The code is adapted from [previous work](https://github.com/yoonkim/CNN_sentence) and is written in Python (2.7) and Theano (0.9)

* The [SST2](http://nlp.stanford.edu/sentiment/treebank.html) dataset is used

* Contact: zhitinghu@gmail.com

## Data Preparation ##

* Download and uncompress the pre-trained [word2vec](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) vectors to `data/w2v/`

* Run the script `preprocess_data.sh` from within the `data/` directory

## Running ##

* Run `run.sh`

* Average performance (accuracy) is around `q: 0.893, p: 0.887`, where `q` is the rule-regularized teacher network and `p` is the student network (randomness comes from GPU parallelization)
--------------------------------------------------------------------------------
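A note on the `q` / `p` figures in the README: `p` is the base CNN (the student) and `q` is the rule-regularized teacher that the paper constructs from it at each training step. As a hedged reminder of that construction (notation and the hyperparameters `C`, `λ_l` follow the paper, not this repository):

$$
q(y \mid x) \;\propto\; p_\theta(y \mid x)\,\exp\Big\{-\sum_{l} C\,\lambda_l\,\big(1 - r_l(x, y)\big)\Big\}
$$

The `log_distribution` methods in `fol.py` below return terms of the form `-w (1 - r(x, y))` that feed this exponent for the BUT rule.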
/data/logicnn_features.py:
--------------------------------------------------------------------------------
"""
BUT-rule feature extractor

"""
import cPickle
import numpy as np
from collections import defaultdict, OrderedDict
import theano
import theano.tensor as T
import re
import warnings
import sys
import time

warnings.filterwarnings("ignore")

def text_after_first(text, part):
    if part in text:
        return ''.join(text.split(part)[1:])
    else:
        return ''

def extract_but(revs):
    but_fea = []
    but_ind = []
    but_fea_cnt = 0
    for rev in revs:
        text = rev["text"]
        if ' but ' in text:
            but_ind.append(1)
            # use the text after 'but' as the feature
            fea = text.split('but')[1:]
            fea = ''.join(fea)
            fea = fea.strip().replace('  ', ' ')
            but_fea_cnt += 1
        else:
            but_ind.append(0)
            fea = ''
        but_fea.append(fea)
    print '#but %d' % but_fea_cnt
    return {'but_text': but_fea, 'but_ind': but_ind}

if __name__=="__main__":
    data_file = sys.argv[1]
    print "loading data..."
    x = cPickle.load(open(data_file,"rb"))
    revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]
    print "data loaded!"
    but_fea = extract_but(revs)
    cPickle.dump(but_fea, open("%s.fea.p" % data_file, "wb"))
    print "feature dumped!"
--------------------------------------------------------------------------------
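As a quick illustration of what `extract_but` above produces, here is a small hedged example (the two toy review dicts are invented for this sketch and assume the definitions above are importable under Python 2.7):

```python
# Toy check of extract_but on two hand-made reviews.
toy_revs = [
    {"text": "the acting is solid but the plot goes nowhere"},
    {"text": "a warm and funny little film"},
]
fea = extract_but(toy_revs)       # prints "#but 1"
# fea["but_ind"]  == [1, 0]       -> only the first review grounds the BUT rule
# fea["but_text"] == ["the plot goes nowhere", ""]
```

The `but_ind` flags correspond to the `fea[0]` indicator described in the `FOL_But` docstring in `fol.py` below, and the clause in `but_text` plays the role of `x_2`, whose predicted probabilities become `fea[1:]`.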
/fol.py:
--------------------------------------------------------------------------------
"""

First Order Logic (FOL) rules

"""

import warnings
import numpy
import theano.tensor.shared_randomstreams
import theano
import theano.tensor as T
from theano.ifelse import ifelse
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from theano import printing


class FOL(object):
    """ First Order Logic (FOL) rules """

    def __init__(self, K, input, fea):
        """ Initialize

        :type K: int
        :param K: the number of classes
        """
        self.input = input
        self.fea = fea
        # Record the data relevance (binary)
        self.conds = self.conditions(self.input, self.fea)
        self.K = K

    def conditions(self, X, F):
        results,_ = theano.scan(lambda x,f: self.condition_single(x,f), sequences=[X,F])
        return results


    def distribution_helper_helper(self, x, f):
        results,_ = theano.scan(lambda k: self.value_single(x,k,f), sequences=T.arange(self.K))
        return results


    def distribution_helper(self, w, X, F, conds):
        nx = X.shape[0]
        distr = T.alloc(1.0, nx, self.K)
        distr,_ = theano.scan(
            lambda c,x,f,d: ifelse(T.eq(c,1.), self.distribution_helper_helper(x,f), d),
            sequences=[conds, X, F, distr])
        distr,_ = theano.scan(
            lambda d: -w*(T.min(d,keepdims=True)-d),  # relative value w.r.t the minimum
            sequences=distr)
        return distr


    """
    Interface function of logic constraints

    The interface is general---only need to overload condition_single(.) and
    value_single(.) below to implement a logic rule---but can be slow

    See the overloaded log_distribution(.) of the BUT-rule for an efficient
    version specific to the BUT-rule
    """
    def log_distribution(self, w, X=None, F=None, config={}):
        """ Return an nxK matrix with the (i,c)-th term
            = - w * (1 - r(X_i, y_i=c))
              if X_i is a grounding of the rule
            = 0 otherwise
        """
        if F == None:
            X, F, conds = self.input, self.fea, self.conds
        else:
            conds = self.conditions(X,F)
        log_distr = self.distribution_helper(w,X,F,conds)
        return log_distr


    """
    Rule-specific functions to be overloaded

    """
    def condition_single(self, x, f):
        """ True if x satisfies the condition """
        return T.cast(0, dtype=theano.config.floatX)


    def value_single(self, x, y, f):
        """ value = r(x,y) """
        return T.cast(1, dtype=theano.config.floatX)


#----------------------------------------------------
# BUT rule
#----------------------------------------------------

class FOL_But(FOL):
    """ x=x1_but_x2 => { y => pred(x2) AND pred(x2) => y } """
    def __init__(self, K, input, fea):
        """ Initialize

        :type K: int
        :param K: the number of classes

        :type fea: theano.tensor.dtensor4
        :param fea: symbolic feature tensor with 3 entries per example
            fea[0]  : 1 if x=x1_but_x2, 0 otherwise
            fea[1:] : classifier.predict_p(x_2)
        """
        assert K == 2
        super(FOL_But, self).__init__(K, input, fea)

    """
    Rule-specific functions

    """
    def condition_single(self, x, f):
        return T.cast(T.eq(f[0],1.), dtype=theano.config.floatX)


    def value_single(self, x, y, f):
        ret = T.mean([T.min([1.-y+f[2],1.]), T.min([1.-f[2]+y,1.])])
        ret = T.cast(ret, dtype=theano.config.floatX)
        return T.cast(ifelse(T.eq(self.condition_single(x,f),1.), ret, 1.),
                      dtype=theano.config.floatX)

    """
    Efficient version specific to the BUT-rule

    """
    def log_distribution(self, w, X=None, F=None):
        if F == None:
            X, F = self.input, self.fea
        F_mask = F[:,0]
        F_fea = F[:,1:]
        # y = 0
        distr_y0 = w*F_mask*F_fea[:,0]
        # y = 1
        distr_y1 = w*F_mask*F_fea[:,1]
        distr_y0 = distr_y0.reshape([distr_y0.shape[0],1])
        distr_y1 = distr_y1.reshape([distr_y1.shape[0],1])
        distr = T.concatenate([distr_y0, distr_y1], axis=1)
        return distr
--------------------------------------------------------------------------------
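To make the efficient `FOL_But.log_distribution` above concrete, here is a minimal NumPy restatement of the same arithmetic (illustrative only; the feature layout follows the `fea` docstring and the numbers are made up):

```python
import numpy as np

w = 1.0                                   # the rule weight argument `w`
# Each row of F: [but_indicator, p(y=0 | x2), p(y=1 | x2)]
F = np.array([[1.0, 0.2, 0.8],            # an "x1 but x2" sentence; x2 looks positive
              [0.0, 0.5, 0.5]])           # no " but ": the rule is not grounded
F_mask, F_fea = F[:, 0], F[:, 1:]
distr = np.stack([w * F_mask * F_fea[:, 0],    # column for y = 0
                  w * F_mask * F_fea[:, 1]],   # column for y = 1
                 axis=1)
# distr == [[0.2, 0.8],
#           [0.0, 0.0]]
```

Row 1 puts more mass on `y = 1` because the clause after "but" is predicted positive; row 2 is all zeros, so sentences that do not ground the rule are left unconstrained.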
/data/preprocess_stsa.py:
--------------------------------------------------------------------------------
import numpy as np
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd
from random import randint
np.random.seed(7294258)

def build_data(data_folder, clean_string=True):
    """
    Loads data
    """
    revs = []
    [train_file,dev_file,test_file] = data_folder
    vocab = defaultdict(float)
    with open(train_file, "rb") as f:
        for line in f:
            line = line.strip()
            y = int(line[0])
            rev = []
            rev.append(line[2:].strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y":y,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": 0}  # 0-train, 1-dev, 2-test
            revs.append(datum)
    with open(dev_file, "rb") as f:
        for line in f:
            line = line.strip()
            y = int(line[0])
            rev = []
            rev.append(line[2:].strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y":y,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": 1}
            revs.append(datum)
    with open(test_file, "rb") as f:
        for line in f:
            line = line.strip()
            y = int(line[0])
            rev = []
            rev.append(line[2:].strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y":y,
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": 2}
            revs.append(datum)
    return revs, vocab

def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size+1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map

def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,k)

def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip() if TREC else string.strip().lower()

def clean_str_sst(string):
    """
    Tokenization/string cleaning for the SST dataset
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

if __name__=="__main__":
    stsa_path = sys.argv[1]
    w2v_file = sys.argv[2]
    train_data_file = "%s/stsa.binary.phrases.train" % stsa_path
    dev_data_file = "%s/stsa.binary.dev" % stsa_path
    test_data_file = "%s/stsa.binary.test" % stsa_path
    data_folder = [train_data_file, dev_data_file, test_data_file]
    print "loading data...",
    revs, vocab = build_data(data_folder, clean_string=True)
    max_l = np.max(pd.DataFrame(revs)["num_words"])
    print "data loaded!"
    print "number of sentences: " + str(len(revs))
    print "vocab size: " + str(len(vocab))
    print "max sentence length: " + str(max_l)
    print "loading word2vec vectors...",
    w2v = load_bin_vec(w2v_file, vocab)
    print "word2vec loaded!"
    print "num words already in word2vec: " + str(len(w2v))
    add_unknown_words(w2v, vocab)
    W, word_idx_map = get_W(w2v)
    rand_vecs = {}
    add_unknown_words(rand_vecs, vocab)
    W2, _ = get_W(rand_vecs)
    cPickle.dump([revs, W, W2, word_idx_map, vocab], open("./stsa.binary.p", "wb"))
    print "dataset created!"
--------------------------------------------------------------------------------
/logicnn_classes.py:
--------------------------------------------------------------------------------
import warnings
import numpy
import theano.tensor.shared_randomstreams
import theano
import theano.tensor as T
from theano.ifelse import ifelse
from theano.tensor.signal import downsample
from theano.tensor.nnet import conv
from theano import printing
import time

def ReLU(x):
    y = T.maximum(0.0, x)
    return(y)
def Sigmoid(x):
    y = T.nnet.sigmoid(x)
    return(y)
def Tanh(x):
    y = T.tanh(x)
    return(y)
def Iden(x):
    y = x
    return(y)

class HiddenLayer(object):
    """
    Class for HiddenLayer
    """
    def __init__(self, rng, input, n_in, n_out, activation, W=None, b=None):

        self.input = input
        self.activation = activation

        if W is None:
            if activation.func_name == "ReLU":
                W_values = numpy.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)), dtype=theano.config.floatX)
            else:
                W_values = numpy.asarray(rng.uniform(low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)),
                                                     size=(n_in, n_out)), dtype=theano.config.floatX)
            W = theano.shared(value=W_values, name='W')
        if b is None:
            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
            b = theano.shared(value=b_values, name='b')

        self.W = W
        self.b = b

        lin_output = T.dot(input, self.W) + self.b

        self.output = (lin_output if activation is None else activation(lin_output))

        # parameters of the model
        self.params = [self.W, self.b]

def _dropout_from_layer(rng, layer, p):
    """p is the probability of dropping a unit
    """
    srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))
    # p=1-p because 1's indicate keep and p is prob of dropping
    mask = srng.binomial(n=1, p=1-p, size=layer.shape)
    # The cast is important because
    # int * float32 = float64 which pulls things off the gpu
    output = layer * T.cast(mask, theano.config.floatX)
    return output

class DropoutHiddenLayer(HiddenLayer):
    def __init__(self, rng, input, n_in, n_out,
                 activation, dropout_rate, W=None, b=None):
        super(DropoutHiddenLayer, self).__init__(
            rng=rng, input=input, n_in=n_in, n_out=n_out, W=W, b=b,
            activation=activation)

        self.output = _dropout_from_layer(rng, self.output, p=dropout_rate)

class MLPDropout(object):
    """A multilayer perceptron with dropout"""
    def __init__(self,rng,input,layer_sizes,dropout_rates,activations):

        #rectified_linear_activation = lambda x: T.maximum(0.0, x)

        # Set up all the hidden layers
        self.weight_matrix_sizes = zip(layer_sizes, layer_sizes[1:])
        self.layers = []
        self.dropout_layers = []
        self.activations = activations
        next_layer_input = input
        #first_layer = True
        # dropout the input
        next_dropout_layer_input = _dropout_from_layer(rng, input, p=dropout_rates[0])
        layer_counter = 0
        for n_in, n_out in self.weight_matrix_sizes[:-1]:
            next_dropout_layer = DropoutHiddenLayer(rng=rng,
                                                    input=next_dropout_layer_input,
                                                    activation=activations[layer_counter],
                                                    n_in=n_in, n_out=n_out,
                                                    dropout_rate=dropout_rates[layer_counter])
            self.dropout_layers.append(next_dropout_layer)
            next_dropout_layer_input = next_dropout_layer.output

            # Reuse the parameters from the dropout layer here, in a different
            # path through the graph.
            next_layer = HiddenLayer(rng=rng,
                                     input=next_layer_input,
                                     activation=activations[layer_counter],
                                     # scale the weight matrix W with (1-p)
                                     W=next_dropout_layer.W * (1 - dropout_rates[layer_counter]),
                                     b=next_dropout_layer.b,
                                     n_in=n_in, n_out=n_out)
            self.layers.append(next_layer)
            next_layer_input = next_layer.output
            #first_layer = False
            layer_counter += 1

        # Set up the output layer
        n_in, n_out = self.weight_matrix_sizes[-1]
        dropout_output_layer = LogisticRegression(
            input=next_dropout_layer_input,
            n_in=n_in, n_out=n_out)
        self.dropout_layers.append(dropout_output_layer)

        # Again, reuse parameters in the dropout output.
        output_layer = LogisticRegression(
            input=next_layer_input,
            # scale the weight matrix W with (1-p)
            W=dropout_output_layer.W * (1 - dropout_rates[-1]),
            b=dropout_output_layer.b,
            n_in=n_in, n_out=n_out)
        self.layers.append(output_layer)

        # Use the negative log likelihood of the logistic regression layer as
        # the objective.
        self.dropout_negative_log_likelihood = self.dropout_layers[-1].negative_log_likelihood
        self.dropout_errors = self.dropout_layers[-1].errors

        self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
        self.errors = self.layers[-1].errors

        # the output of the logistic regression layer
        self.dropout_p_y_given_x = self.dropout_layers[-1].p_y_given_x
        self.p_y_given_x = self.layers[-1].p_y_given_x
        # the negative log likelihood with soft ground truth
        self.soft_dropout_negative_log_likelihood = self.dropout_layers[-1].soft_negative_log_likelihood
        self.soft_negative_log_likelihood = self.layers[-1].soft_negative_log_likelihood

        # Grab all the parameters together.
        self.params = [ param for layer in self.dropout_layers for param in layer.params ]


    def predict(self, new_data):
        next_layer_input = new_data
        for i,layer in enumerate(self.layers):
            if i
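The listing above is cut off midway through `MLPDropout.predict`. As a closing aside on the `W * (1 - p)` scaling that appears twice in `MLPDropout.__init__`: it keeps the deterministic path consistent, in expectation, with the dropout path. A minimal standalone NumPy sketch of that equivalence (assumed for illustration, not taken from the repository):

```python
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(4, 8)          # a small batch of inputs
W = rng.randn(8, 3)          # one weight matrix
p = 0.5                      # dropout rate

# Dropout path: units survive with probability 1-p, so E[mask * x] = (1 - p) * x.
masks = rng.binomial(1, 1 - p, size=(20000,) + x.shape)
train_avg = np.mean([(x * m).dot(W) for m in masks], axis=0)

# Deterministic path: the same effect, obtained by scaling the weights once.
test_out = x.dot(W * (1 - p))

print(np.abs(train_avg - test_out).max())   # small -> the two paths agree on average
```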