├── README.md ├── code ├── linmod.R └── parseyelp.py ├── debug.ipynb ├── paths.ipynb ├── tex ├── acl.bst ├── acl2015.sty ├── acl2015.tex ├── deepir.bbl ├── deepir.bib ├── deepir.pdf ├── deepir.tex └── graphs │ ├── bht.dot │ ├── bht.pdf │ ├── bht.png │ ├── bystarshort.pdf │ ├── coarseprob.pdf │ ├── coarseprob_bystar.pdf │ ├── fineprob.pdf │ ├── nnpprob.pdf │ ├── posneg.png │ └── yelp_logistic.png └── w2v-inversion.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # deep inverse regression 2 | 3 | ### or: Document Classification by Inversion of Distributed Language Representations [(ACL 2015)](http://arxiv.org/pdf/1504.07295v3.pdf) 4 | 5 | Using unsupervised deep learning within sub-groups as the input for Bayesian discrimination. 6 | 7 | Everything in here is built around the [gensim](https://radimrehurek.com/gensim/) library for python. See the demo at [deepir.ipynb](https://github.com/TaddyLab/gensim/blob/deepir/docs/notebooks/deepir.ipynb). 8 | -------------------------------------------------------------------------------- /code/linmod.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(textir)) 2 | suppressMessages(library(data.table)) 3 | 4 | ## get results from w2v 5 | w2vprob <- fread("data/yelpw2vprobs.csv", header=TRUE, verbose=FALSE) 6 | 7 | ## read the aggregated w2v vectors 8 | aggvec <- read.table("data/yelp_vectors.txt", sep="|") 9 | 10 | ## read in the text 11 | revs <- read.table("data/yelp_phrases.txt", 12 | sep="|",quote=NULL, comment="", 13 | col.names=c("id","phrase","stars","sample")) 14 | 15 | x <- sparseMatrix( 16 | i=revs[,"id"]+1, j=as.numeric(revs[,"phrase"]), x=rep(1,nrow(revs)), 17 | dimnames=list(NULL, levels(revs[,"phrase"])), 18 | dims=c(nrow(aggvec), nlevels(revs[,"phrase"])) ) 19 | emptyrev <- which(rowSums(x)==0) 20 | 21 | x <- x[-emptyrev,colSums(x>0)>5] 22 | w2vprob <- as.matrix(w2vprob[-emptyrev,]) 23 | aggvec <- as.matrix(aggvec[-emptyrev,]) 24 | 25 | print(n <- nrow(x)) 26 | 27 | stars <- tapply(revs$stars, revs$id, mean) 28 | samp <- tapply( revs$sample=="test", revs$id, mean) 29 | test <- which(samp==1) 30 | 31 | ## read d2v 32 | dv0train <- fread("data/yelpD2Vtrain0.csv", verbose=FALSE) 33 | dv0test <- fread("data/yelpD2Vtest0.csv", verbose=FALSE) 34 | dv1train <- fread("data/yelpD2Vtrain1.csv", verbose=FALSE) 35 | dv1test <- fread("data/yelpD2Vtest1.csv", verbose=FALSE) 36 | # all(dv0test[,id]==dv1test[,id]) 37 | # all(dv0test[,stars]==dv1test[,stars]) 38 | vecvar <- paste("x",1:100,sep="") 39 | dv0x <- rbind(as.matrix(dv0train[,vecvar,with=FALSE]), 40 | as.matrix(dv0test[,vecvar,with=FALSE])) 41 | dv1x <- rbind(as.matrix(dv1train[,vecvar,with=FALSE]), 42 | as.matrix(dv1test[,vecvar,with=FALSE])) 43 | dvx <- cbind(dv0x,dv1x) 44 | dvstars <- c(dv0train[,stars], dv0test[,stars]) 45 | dvtest <- nrow(dv0train)+1:nrow(dv0test) 46 | 47 | library(parallel) 48 | cl <- makeCluster(6, type="FORK") 49 | 50 | geterr <- function(phat, y, PY=FALSE){ 51 | if(ncol(phat)==1) phat <- cbind(1-phat,phat) 52 | y <- factor(y) 53 | yhat <- factor(levels(y)[apply(phat,1,which.max)]) 54 | cat("mcr ") 55 | for(l in levels(y)) 56 | cat(l, ":", round( 57 | mean(yhat[y==l] != y[y==l]),3), ", ", sep="") 58 | overall <- mean(yhat !=y) 59 | diff <- mean( abs(as.numeric(yhat) - as.numeric(y)) ) 60 | py <- phat[cbind(1:nrow(phat),y)] 61 | lp <- log(py) 62 | lp[lp < (-50)] <- -50 63 | dev <- mean(-2*lp) 64 | cat("\noverall:", round(overall,3), "diff:", round(diff,3), 
"deviance:", dev, "\n") 65 | if(PY) return(py) 66 | invisible() 67 | } 68 | 69 | getpy <- function(fit, xx, y, testset, PY=FALSE){ 70 | if(inherits(fit,"randomForest")) 71 | phat <- as.matrix(predict(fit, xx[testset,], type="prob")) 72 | else 73 | phat <- predict(fit, xx[testset,], type="response") 74 | py <- geterr(phat, y[testset], PY=PY) 75 | if(PY) return(py) 76 | invisible() 77 | } 78 | 79 | ## define y 80 | ycoarse <- as.numeric(stars>2) 81 | ynnp <- cut(stars, c(0,2,3,5)) 82 | yfine <- factor(stars) 83 | dvycoarse <- as.numeric(dvstars>2) 84 | dvynnp <- cut(dvstars, c(0,2,3,5)) 85 | dvyfine <- factor(dvstars) 86 | 87 | ### W2V inversion 88 | cat("\n**** W2V INVERSION ****\n") 89 | nullprob <- as.numeric(table(stars[-test])/length(stars[-test])) 90 | 91 | cat("** COARSE **\n") 92 | w2vpcoarse <- cbind(rowSums(w2vprob[,1:2]),rowSums(w2vprob[,3:5])) 93 | geterr(w2vpcoarse[test,], ycoarse[test]) 94 | 95 | cat("** NNP **\n") 96 | w2vpnnp <- cbind(rowSums(w2vprob[,1:2]), 97 | rowSums(w2vprob[,3,drop=FALSE]), 98 | rowSums(w2vprob[,4:5,drop=FALSE])) 99 | geterr(w2vpnnp[test,], ynnp[test]) 100 | 101 | cat("** FINE **\n") 102 | geterr(w2vprob[test,], yfine[test]) 103 | 104 | ### logit word-count prediction 105 | cat("\n*** COUNTREG ***\n") 106 | 107 | cat("** COARSE **\n") 108 | logitcoarse <- gamlr(x[-test,], ycoarse[-test], 109 | family="binomial", lmr=1e-3) 110 | pycoarse <- getpy(logitcoarse, x, ycoarse, test, PY=TRUE) 111 | 112 | png(file="paper/graphs/yelp_logistic.png", width=12,height=6, units="in", res=180) 113 | plot(logitcoarse) 114 | invisible(dev.off()) 115 | 116 | cat("** NNP **\n") 117 | logitnnp <- dmr(cl=cl, x[-test,], ynnp[-test], lmr=1e-3) 118 | pynnp <- getpy(logitnnp, x, ynnp, test, PY=TRUE) 119 | 120 | cat("** FINE **\n") 121 | logitfine <- dmr(cl=cl, x[-test,], yfine[-test], lmr=1e-3) 122 | pyfine <- getpy(logitfine, x, yfine, test, PY=TRUE) 123 | 124 | cat("\n*** W2V and COUNTREG NNP ***\n") 125 | wx <- cBind(w2vprob,x) 126 | combof <- dmr(cl,wx[-test,], ynnp[-test]) 127 | getpy(combof, wx, ynnp, test) 128 | 129 | ## D2V stuff 130 | ## all run at zero lambda; AICc selects most complex model anyways 131 | cat("\n*** D2V ***\n") 132 | 133 | cat("** COARSE\n") 134 | cat("dm0 **\n") 135 | dv0coarse <- gamlr(dv0x[-dvtest,], dvycoarse[-dvtest], 136 | family="binomial", lmr=1e-4) 137 | getpy(dv0coarse, dv0x, dvycoarse, dvtest) 138 | cat("dm1 **\n") 139 | dv1coarse <- gamlr(dv1x[-dvtest,], dvycoarse[-dvtest], 140 | family="binomial", lmr=1e-4) 141 | getpy(dv1coarse, dv1x, dvycoarse, dvtest) 142 | cat("dm both **\n") 143 | dvcoarse <- gamlr(dvx[-dvtest,], dvycoarse[-dvtest], 144 | family="binomial", lmr=1e-4) 145 | pydvcoarse <- getpy(dvcoarse, dvx, dvycoarse, dvtest, PY=TRUE) 146 | 147 | cat("** NNP\n") 148 | cat("dm0 **\n") 149 | dv0nnp <- dmr(cl, dv0x[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 150 | getpy(dv0nnp, dv0x, dvynnp, dvtest) 151 | cat("dm1 **\n") 152 | dv1nnp <- dmr(cl, dv1x[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 153 | getpy(dv1nnp, dv1x, dvynnp, dvtest) 154 | cat("dm both **\n") 155 | dvnnp <- dmr(cl, dvx[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 156 | pydvnnp <- getpy(dvnnp, dvx, dvynnp, dvtest, PY=TRUE) 157 | 158 | cat("** FINE\n") 159 | cat("dm0 **\n") 160 | dv0fine <- dmr(cl, dv0x[-dvtest,], dvyfine[-dvtest], lmr=1e-4) 161 | getpy(dv0fine, dv0x, dvyfine, dvtest) 162 | cat("dm1 **\n") 163 | dv1fine <- dmr(cl, dv1x[-dvtest,], dvyfine[-dvtest], lmr=1e-4) 164 | getpy(dv1fine, dv1x, dvyfine, dvtest) 165 | cat("dm both **\n") 166 | dvfine <- dmr(cl, dvx[-dvtest,], 
dvyfine[-dvtest], lmr=1e-4) 167 | pydvfine <- getpy(dvfine, dvx, dvyfine, dvtest, PY=TRUE) 168 | 169 | # mnir 170 | cat("\n*** MNIR ***\n") 171 | vmat <- sparse.model.matrix(~stars + yfine-1) 172 | mnir <- mnlm(cl=cl, vmat[-test,], x[-test,], verb=1, bins=5) 173 | zir <- srproj(mnir, x, select=100) 174 | 175 | cat("** COARSE **\n") 176 | fwdcoarse <- gamlr(zir[-test,], ycoarse[-test], lmr=1e-4, family="binomial") 177 | pymnircoarse <- getpy(fwdcoarse, zir, ycoarse, test, PY=TRUE) 178 | 179 | cat("** NNP **\n") 180 | fwdnnp <- dmr(cl, zir[-test,], ynnp[-test], lmr=1e-4) 181 | pymnirnnp <- getpy(fwdnnp, zir, ynnp, test, PY=TRUE) 182 | 183 | cat("** FINE **\n") 184 | fwdfine <- dmr(cl, zir[-test,], yfine[-test], lmr=1e-4) 185 | pymnirfine <- getpy(fwdfine, zir, yfine, test, PY=TRUE) 186 | 187 | ### Aggregate vector prediction 188 | cat("\n*** W2V AGGREGATION ***\n") 189 | 190 | cat("** COARSE **\n") 191 | avc <- gamlr(aggvec[-test,], ycoarse[-test], 192 | family="binomial", lambda.min.ratio=1e-3) 193 | getpy(avc, aggvec, ycoarse, test) 194 | 195 | cat("** NNP **\n") 196 | avnnp <- dmr(cl=cl, aggvec[-test,], ynnp[-test], lmr=1e-3) 197 | getpy(avnnp, aggvec, ynnp, test) 198 | 199 | cat("** FINE **\n") 200 | avfine <- dmr(cl=cl, aggvec[-test,], yfine[-test], lmr=1e-3) 201 | getpy(avfine, aggvec, yfine, test) 202 | 203 | save.image("linmod.rda", compress=FALSE) 204 | 205 | ### some plots 206 | w2vpc <- w2vpcoarse[test,2] 207 | pdf("paper/graphs/coarseprob.pdf", width=9, height=2.75) 208 | par(mfrow=c(1,3),mai=c(.45,.45,.3,.2),omi=c(.15,.15,0,0)) 209 | hist(w2vpc[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 210 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="word2vec inversion") 211 | hist(w2vpc[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 212 | 213 | hist(pycoarse[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 214 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="phrase regression") 215 | hist(pycoarse[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 216 | 217 | hist(pydvcoarse[dvycoarse[dvtest]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 218 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="doc2vec regression") 219 | hist(pydvcoarse[dvycoarse[dvtest]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 220 | 221 | # hist(pymnircoarse[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 222 | # xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="mnir") 223 | # hist(pymnircoarse[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 224 | 225 | mtext(side=2, "density", outer=TRUE,cex=.9, font=3) 226 | mtext(side=1, "probability positive", outer=TRUE, cex=.9, font=3) 227 | dev.off() 228 | 229 | 230 | pdf("paper/graphs/coarseprob_bystar.pdf", width=9, height=2.5) 231 | par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 232 | boxplot( w2vpc ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="word2vec inversion") 233 | boxplot( pycoarse ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="phrase regression") 234 | boxplot( pydvcoarse ~ dvyfine[dvtest], col=heat.colors(5), varwidth=TRUE, main="doc2vec regression") 235 | boxplot( pymnircoarse ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="mnir") 236 | mtext(side=1, "stars", outer=TRUE,cex=1, font=3) 237 | mtext(side=2, "probability positive", outer=TRUE,cex=1, font=3) 238 | dev.off() 239 | 240 | w2vpnnpy <- w2vpnnp[cbind(1:n,ynnp)] 241 | pdf("paper/graphs/nnpprob.pdf", width=9, height=2.5) 242 | 
par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 243 | boxplot( w2vpnnpy[test] ~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="word2vec inversion") 244 | boxplot( pynnp~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="phrase regression") 245 | boxplot( pydvnnp~ dvynnp[dvtest], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="doc2vec regression") 246 | boxplot( pymnirnnp~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="mnir") 247 | mtext(side=1, "stars", outer=TRUE,cex=.9, font=3) 248 | mtext(side=2, "probability of true category", outer=TRUE,cex=.9, font=3) 249 | dev.off() 250 | 251 | w2vpy <- w2vprob[cbind(1:n,stars)] 252 | pdf("paper/graphs/fineprob.pdf", width=9, height=2.5) 253 | par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 254 | boxplot( w2vpy[test] ~ yfine[test], col=heat.colors(5), varwidth=TRUE, ylim=c(0,1), main="word2vec inversion") 255 | boxplot( pyfine~ yfine[test], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="phrase regression") 256 | boxplot( pydvfine~ dvyfine[dvtest], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="doc2vec regression") 257 | boxplot( pymnirfine~ yfine[test], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="mnir") 258 | mtext(side=1, "stars", outer=TRUE,cex=.9, font=3) 259 | mtext(side=2, "probability of true stars", outer=TRUE,cex=.9, font=3) 260 | dev.off() 261 | -------------------------------------------------------------------------------- /code/parseyelp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ## python map for word counts 3 | 4 | # Import Modules 5 | import sys 6 | import re 7 | import json 8 | 9 | # all non alphanumeric 10 | contractions = re.compile(r"'|-") 11 | symbols = re.compile(r'(\W+)', re.U) 12 | numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I) 13 | swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I) 14 | suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)') 15 | seps = re.compile(r'\s+') 16 | 17 | # cleaner (order matters) 18 | def clean(text): 19 | text = u' ' + text.lower() + u' ' 20 | text = contractions.sub('', text) 21 | text = symbols.sub(r' \1 ', text) 22 | text = numeric.sub('000', text) 23 | text = swrd.sub(' ', text) 24 | #text = suffix.sub('', text) 25 | text = seps.sub(' ', text) 26 | return text 27 | 28 | 29 | fout = [ open("data/yelptrain%dstar.txt" % y, 'w') for y in range(1,6) ] 30 | fin = open("data/yelp_training_set/yelp_training_set_review.json", 'r') 31 | i = 0 32 | 33 | for line in fin: 34 | d = json.loads(line) 35 | i += 1 36 | try: 37 | txt = clean(d['text']) 38 | fout[d['stars']-1].write(txt+'\n') 39 | print(i, end=" ") 40 | 41 | except: 42 | e = sys.exc_info()[0] 43 | sys.stderr.write("review reader error: %s\n"%str(e)) 44 | 45 | fin.close() 46 | for f in fout: 47 | f.close() 48 | 49 | 50 | fout = [ open("data/yelptest%dstar.txt" % y, 'w') for y in range(1,6) ] 51 | fin = open("data/yelp_test_set/yelp_test_set_review.json", 'r') 52 | i = 0 53 | 54 | for line in fin: 55 | d = json.loads(line) 56 | i += 1 57 | try: 58 | txt = clean(d['text']) 59 | fout[d['stars']-1].write(txt+'\n') 60 | print(i, end=" ") 61 | 62 | except: 63 | e = sys.exc_info()[0] 64 | sys.stderr.write("review reader error: %s\n"%str(e)) 65 | 66 | fin.close() 67 | for f in fout: 68 | f.close() 69 | -------------------------------------------------------------------------------- /debug.ipynb: 
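debug.ipynb (next) is a sanity check of gensim's Word2Vec.score() on a toy corpus; that call is the workhorse of the inversion step described in the README. As a minimal sketch (not the authors' pipeline; see deepir.ipynb for the real demo), the step amounts to: fit one Word2Vec per star class on the per-star files written by parseyelp.py, score a document's sentences under each class model, and apply Bayes' rule with the class priors (the nullprob used in linmod.R) to obtain the per-class probabilities that linmod.R reads from data/yelpw2vprobs.csv. File paths and training parameters below are illustrative assumptions.

# Sketch only; scoring settings follow the hs=1, negative=0, sample=0, sg=1 shown in debug.ipynb's log.
import numpy as np
from gensim.models import Word2Vec

def read_reviews(path):
    # one cleaned review per line, as written by parseyelp.py; tokens are whitespace-separated
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if tokens:
                yield tokens

# one model per star class (file names assumed from parseyelp.py)
models = {star: Word2Vec(list(read_reviews("data/yelptrain%dstar.txt" % star)),
                         sg=1, hs=1, negative=0, sample=0, min_count=5, workers=4)
          for star in range(1, 6)}

def star_probs(doc_sentences, priors):
    """Posterior P(star | doc) from summed per-sentence log scores plus log priors."""
    stars = sorted(models)
    loglik = np.array([models[s].score(doc_sentences, len(doc_sentences)).sum()
                       for s in stars])
    logpost = loglik + np.log([priors[s] for s in stars])
    logpost -= logpost.max()          # stabilize before exponentiating
    p = np.exp(logpost)
    return dict(zip(stars, p / p.sum()))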
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy\n", 12 | "\n", 13 | "from gensim import utils, matutils\n", 14 | "from gensim.models import word2vec\n", 15 | "\n", 16 | "sentences = [\n", 17 | " ['human', 'interface', 'computer'],\n", 18 | " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", 19 | " ['eps', 'user', 'interface', 'system'],\n", 20 | " ['system', 'human', 'system', 'eps'],\n", 21 | " ['user', 'response', 'time'],\n", 22 | " ['trees'],\n", 23 | " ['graph', 'trees'],\n", 24 | " ['graph', 'minors', 'trees'],\n", 25 | " ['graph', 'minors', 'survey']\n", 26 | "]\n", 27 | "\n", 28 | "model = word2vec.Word2Vec(sentences, min_count=1, workers=5)\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stderr", 40 | "output_type": "stream", 41 | "text": [ 42 | "DEBUG:root:test\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import logging\n", 48 | "logger = logging.getLogger()\n", 49 | "logger.setLevel(logging.DEBUG)\n", 50 | "logging.debug(\"test\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | "INFO:gensim.models.word2vec:scoring sentences with 2 workers on 12 vocabulary and 100 features, using sg=1 hs=1 sample=0 and negative=0\n", 65 | "DEBUG:gensim.models.word2vec:putting job #0 in the queue\n", 66 | "DEBUG:gensim.models.word2vec:putting job #1 in the queue\n", 67 | "DEBUG:gensim.models.word2vec:putting job #2 in the queue\n", 68 | "DEBUG:gensim.models.word2vec:putting job #3 in the queue\n", 69 | "DEBUG:gensim.models.word2vec:putting job #4 in the queue\n", 70 | "INFO:gensim.models.word2vec:reached end of input; waiting to finish 1 outstanding jobs\n", 71 | "INFO:gensim.models.word2vec:scoring 9 sentences took 0.0s, 1105 sentences/s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "scores = model.score(sentences, 9, chunksize=2)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "array([-16.96789169, -77.79333496, -29.70446968, -29.69228935,\n", 90 | " -15.54527378, 0. 
, -4.23740578, -14.12265587, -15.54527378], dtype=float32)" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "scores" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "INFO:gensim.models.word2vec:scoring sentences with 2 workers on 12 vocabulary and 100 features, using sg=1 hs=1 sample=0 and negative=0\n", 114 | "DEBUG:gensim.models.word2vec:putting job #0 in the queue\n", 115 | "DEBUG:gensim.models.word2vec:putting job #1 in the queue\n", 116 | "DEBUG:gensim.models.word2vec:putting job #2 in the queue\n", 117 | "WARNING:gensim.models.word2vec:terminating after 2 sentences (set higher total_sentences if you want more).\n", 118 | "INFO:gensim.models.word2vec:reached end of input; waiting to finish 1 outstanding jobs\n", 119 | "INFO:gensim.models.word2vec:scoring 2 sentences took 0.0s, 774 sentences/s\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "scores = model.score(sentences, 2, chunksize=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "array([-16.96789169, -77.79333496], dtype=float32)" 138 | ] 139 | }, 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "scores" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "2" 160 | ] 161 | }, 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "len(scores)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 2", 193 | "language": "python", 194 | "name": "python2" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 2 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython2", 206 | "version": "2.7.10" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 0 211 | } 212 | -------------------------------------------------------------------------------- /paths.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re\n", 12 | "contractions = re.compile(r\"'s*|-|\\\"\")\n", 13 | "# all non alphanumeric\n", 14 | "symbols = re.compile(r'(\\W+)', re.U)\n", 15 | "# separators (any whitespace)\n", 16 | "seps = re.compile(r'\\s+')\n", 17 | "# some stops to remove\n", 18 | "stops = re.compile(r'(\\s[,:\\)\\(]\\s)')\n", 19 | "# for sentence splitter\n", 20 | "alteos = 
re.compile(r'([!\\?])')\n", 21 | "\n", 22 | "\n", 23 | "# cleaner (order matters)\n", 24 | "def clean(text): \n", 25 | " text = text.lower()\n", 26 | " text = contractions.sub('', text)\n", 27 | " text = symbols.sub(r' \\1 ', text)\n", 28 | " text = stops.sub(' ', text)\n", 29 | " text = seps.sub(' ', text)\n", 30 | " text = alteos.sub(r' \\1 .', text)\n", 31 | " return text\n", 32 | "\n", 33 | "from zipfile import ZipFile\n", 34 | "import json\n", 35 | "\n", 36 | "def YelpSentences(label, stars=[1,2,3,4,5]):\n", 37 | " with ZipFile(\"yelp_%s_set.zip\"%label, 'r') as zf:\n", 38 | " with zf.open(\"yelp_%s_set/yelp_%s_set_review.json\"%(label,label)) as f:\n", 39 | " for line in f:\n", 40 | " rev = json.loads(line)\n", 41 | " if rev['stars'] in stars:\n", 42 | " text = rev['text'].clean()\n", 43 | " for s in text.split(\".\"):\n", 44 | " yield s.split()\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "## just bring them into memory\n", 56 | "sentences = list(YelpSentences(\"training\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from gensim.models import Word2Vec\n", 68 | "import logging \n", 69 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", 70 | "\n", 71 | "## create a w2v learner \n", 72 | "w2v = Word2Vec(sentences, workers=8, iter=3) " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "## function to save vector path to file \n", 84 | "# Each row is a word, sentence reads from top.\n", 85 | "# (throws a key error if words are not in the vocab.)\n", 86 | "import numpy as np\n", 87 | "def savepath(words):\n", 88 | " print(words)\n", 89 | " np.savetxt(\"_\".join(words)+\".txt\", w2v[words], fmt=\"%.6f\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "[u'my', u'wife', u'took', u'me', u'here', u'on', u'my', u'birthday', u'for', u'breakfast', u'and', u'it', u'was', u'excellent']\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "savepath(sentences[0])" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "[u'the', u'horchata', u'is', u'handmade', u'and', u'delicious']\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "savepath(sentences[97])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "badsentences = list(YelpSentences(\"test\", [1]))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 16, 144 | "metadata": { 145 | "collapsed": false, 146 | "scrolled": true 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "[u'we', u'thought', u'this', u'was', u'a', u'little', u'strange', u'since', u'every', u'single', u'other', u'kennel', u'weve', u'ever', u'been', u'to', u'was', u'willing', u'and', u'wanted', 
u'to', u'give', u'us', u'a', u'tour']\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "savepath(badsentences[100])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "[u'thanks', u'for', u'lying', u'to', u'my', u'face', u'dude']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "savepath(badsentences[800])" 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 2", 184 | "language": "python", 185 | "name": "python2" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 2 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython2", 197 | "version": "2.7.9" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 0 202 | } 203 | -------------------------------------------------------------------------------- /tex/acl.bst: -------------------------------------------------------------------------------- 1 | 2 | % BibTeX `acl' style file for BibTeX version 0.99c, LaTeX version 2.09 3 | % This version was made by modifying `aaai-named' format based on the master 4 | % file by Oren Patashnik (PATASHNIK@SCORE.STANFORD.EDU) 5 | 6 | % Copyright (C) 1985, all rights reserved. 7 | % Modifications Copyright 1988, Peter F. Patel-Schneider 8 | % Further modifictions by Stuart Shieber, 1991, and Fernando Pereira, 1992. 9 | % Copying of this file is authorized only if either 10 | % (1) you make absolutely no changes to your copy, including name, or 11 | % (2) if you do make changes, you name it something other than 12 | % btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. 13 | % This restriction helps ensure that all standard styles are identical. 14 | 15 | % There are undoubtably bugs in this style. If you make bug fixes, 16 | % improvements, etc. please let me know. My e-mail address is: 17 | % pfps@spar.slb.com 18 | 19 | % Citation format: [author-last-name, year] 20 | % [author-last-name and author-last-name, year] 21 | % [author-last-name {\em et al.}, year] 22 | % 23 | % Reference list ordering: alphabetical by author or whatever passes 24 | % for author in the absence of one. 25 | % 26 | % This BibTeX style has support for short (year only) citations. This 27 | % is done by having the citations actually look like 28 | % \citename{name-info, }year 29 | % The LaTeX style has to have the following 30 | % \let\@internalcite\cite 31 | % \def\cite{\def\citename##1{##1}\@internalcite} 32 | % \def\shortcite{\def\citename##1{}\@internalcite} 33 | % \def\@biblabel#1{\def\citename##1{##1}[#1]\hfill} 34 | % which makes \shortcite the macro for short citations. 35 | 36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 37 | % Changes made by SMS for thesis style 38 | % no emphasis on "et al." 39 | % "Ph.D." 
includes periods (not "PhD") 40 | % moved year to immediately after author's name 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | ENTRY 43 | { address 44 | author 45 | booktitle 46 | chapter 47 | edition 48 | editor 49 | howpublished 50 | institution 51 | journal 52 | key 53 | month 54 | note 55 | number 56 | organization 57 | pages 58 | publisher 59 | school 60 | series 61 | title 62 | type 63 | volume 64 | year 65 | } 66 | {} 67 | { label extra.label sort.label } 68 | 69 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 70 | 71 | FUNCTION {init.state.consts} 72 | { #0 'before.all := 73 | #1 'mid.sentence := 74 | #2 'after.sentence := 75 | #3 'after.block := 76 | } 77 | 78 | STRINGS { s t } 79 | 80 | FUNCTION {output.nonnull} 81 | { 's := 82 | output.state mid.sentence = 83 | { ", " * write$ } 84 | { output.state after.block = 85 | { add.period$ write$ 86 | newline$ 87 | "\newblock " write$ 88 | } 89 | { output.state before.all = 90 | 'write$ 91 | { add.period$ " " * write$ } 92 | if$ 93 | } 94 | if$ 95 | mid.sentence 'output.state := 96 | } 97 | if$ 98 | s 99 | } 100 | 101 | FUNCTION {output} 102 | { duplicate$ empty$ 103 | 'pop$ 104 | 'output.nonnull 105 | if$ 106 | } 107 | 108 | FUNCTION {output.check} 109 | { 't := 110 | duplicate$ empty$ 111 | { pop$ "empty " t * " in " * cite$ * warning$ } 112 | 'output.nonnull 113 | if$ 114 | } 115 | 116 | FUNCTION {output.bibitem} 117 | { newline$ 118 | 119 | "\bibitem[" write$ 120 | label write$ 121 | "]{" write$ 122 | 123 | cite$ write$ 124 | "}" write$ 125 | newline$ 126 | "" 127 | before.all 'output.state := 128 | } 129 | 130 | FUNCTION {fin.entry} 131 | { add.period$ 132 | write$ 133 | newline$ 134 | } 135 | 136 | FUNCTION {new.block} 137 | { output.state before.all = 138 | 'skip$ 139 | { after.block 'output.state := } 140 | if$ 141 | } 142 | 143 | FUNCTION {new.sentence} 144 | { output.state after.block = 145 | 'skip$ 146 | { output.state before.all = 147 | 'skip$ 148 | { after.sentence 'output.state := } 149 | if$ 150 | } 151 | if$ 152 | } 153 | 154 | FUNCTION {not} 155 | { { #0 } 156 | { #1 } 157 | if$ 158 | } 159 | 160 | FUNCTION {and} 161 | { 'skip$ 162 | { pop$ #0 } 163 | if$ 164 | } 165 | 166 | FUNCTION {or} 167 | { { pop$ #1 } 168 | 'skip$ 169 | if$ 170 | } 171 | 172 | FUNCTION {new.block.checka} 173 | { empty$ 174 | 'skip$ 175 | 'new.block 176 | if$ 177 | } 178 | 179 | FUNCTION {new.block.checkb} 180 | { empty$ 181 | swap$ empty$ 182 | and 183 | 'skip$ 184 | 'new.block 185 | if$ 186 | } 187 | 188 | FUNCTION {new.sentence.checka} 189 | { empty$ 190 | 'skip$ 191 | 'new.sentence 192 | if$ 193 | } 194 | 195 | FUNCTION {new.sentence.checkb} 196 | { empty$ 197 | swap$ empty$ 198 | and 199 | 'skip$ 200 | 'new.sentence 201 | if$ 202 | } 203 | 204 | FUNCTION {field.or.null} 205 | { duplicate$ empty$ 206 | { pop$ "" } 207 | 'skip$ 208 | if$ 209 | } 210 | 211 | FUNCTION {emphasize} 212 | { duplicate$ empty$ 213 | { pop$ "" } 214 | { "{\em " swap$ * "}" * } 215 | if$ 216 | } 217 | 218 | INTEGERS { nameptr namesleft numnames } 219 | 220 | FUNCTION {format.names} 221 | { 's := 222 | #1 'nameptr := 223 | s num.names$ 'numnames := 224 | numnames 'namesleft := 225 | { namesleft #0 > } 226 | 227 | { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := 228 | 229 | nameptr #1 > 230 | { namesleft #1 > 231 | { ", " * t * } 232 | { numnames #2 > 233 | { "," * } 234 | 'skip$ 235 | if$ 236 | t "others" = 237 | { " et~al." 
* } 238 | { " and " * t * } 239 | if$ 240 | } 241 | if$ 242 | } 243 | 't 244 | if$ 245 | nameptr #1 + 'nameptr := 246 | namesleft #1 - 'namesleft := 247 | } 248 | while$ 249 | } 250 | 251 | FUNCTION {format.authors} 252 | { author empty$ 253 | { "" } 254 | { author format.names } 255 | if$ 256 | } 257 | 258 | FUNCTION {format.editors} 259 | { editor empty$ 260 | { "" } 261 | { editor format.names 262 | editor num.names$ #1 > 263 | { ", editors" * } 264 | { ", editor" * } 265 | if$ 266 | } 267 | if$ 268 | } 269 | 270 | FUNCTION {format.title} 271 | { title empty$ 272 | { "" } 273 | 274 | { title "t" change.case$ } 275 | 276 | if$ 277 | } 278 | 279 | FUNCTION {n.dashify} 280 | { 't := 281 | "" 282 | { t empty$ not } 283 | { t #1 #1 substring$ "-" = 284 | { t #1 #2 substring$ "--" = not 285 | { "--" * 286 | t #2 global.max$ substring$ 't := 287 | } 288 | { { t #1 #1 substring$ "-" = } 289 | { "-" * 290 | t #2 global.max$ substring$ 't := 291 | } 292 | while$ 293 | } 294 | if$ 295 | } 296 | { t #1 #1 substring$ * 297 | t #2 global.max$ substring$ 't := 298 | } 299 | if$ 300 | } 301 | while$ 302 | } 303 | 304 | FUNCTION {format.date} 305 | { year empty$ 306 | { month empty$ 307 | { "" } 308 | { "there's a month but no year in " cite$ * warning$ 309 | month 310 | } 311 | if$ 312 | } 313 | { month empty$ 314 | { "" } 315 | { month } 316 | if$ 317 | } 318 | if$ 319 | } 320 | 321 | FUNCTION {format.btitle} 322 | { title emphasize 323 | } 324 | 325 | FUNCTION {tie.or.space.connect} 326 | { duplicate$ text.length$ #3 < 327 | { "~" } 328 | { " " } 329 | if$ 330 | swap$ * * 331 | } 332 | 333 | FUNCTION {either.or.check} 334 | { empty$ 335 | 'pop$ 336 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 337 | if$ 338 | } 339 | 340 | FUNCTION {format.bvolume} 341 | { volume empty$ 342 | { "" } 343 | { "volume" volume tie.or.space.connect 344 | series empty$ 345 | 'skip$ 346 | { " of " * series emphasize * } 347 | if$ 348 | "volume and number" number either.or.check 349 | } 350 | if$ 351 | } 352 | 353 | FUNCTION {format.number.series} 354 | { volume empty$ 355 | { number empty$ 356 | { series field.or.null } 357 | { output.state mid.sentence = 358 | { "number" } 359 | { "Number" } 360 | if$ 361 | number tie.or.space.connect 362 | series empty$ 363 | { "there's a number but no series in " cite$ * warning$ } 364 | { " in " * series * } 365 | if$ 366 | } 367 | if$ 368 | } 369 | { "" } 370 | if$ 371 | } 372 | 373 | FUNCTION {format.edition} 374 | { edition empty$ 375 | { "" } 376 | { output.state mid.sentence = 377 | { edition "l" change.case$ " edition" * } 378 | { edition "t" change.case$ " edition" * } 379 | if$ 380 | } 381 | if$ 382 | } 383 | 384 | INTEGERS { multiresult } 385 | 386 | FUNCTION {multi.page.check} 387 | { 't := 388 | #0 'multiresult := 389 | { multiresult not 390 | t empty$ not 391 | and 392 | } 393 | { t #1 #1 substring$ 394 | duplicate$ "-" = 395 | swap$ duplicate$ "," = 396 | swap$ "+" = 397 | or or 398 | { #1 'multiresult := } 399 | { t #2 global.max$ substring$ 't := } 400 | if$ 401 | } 402 | while$ 403 | multiresult 404 | } 405 | 406 | FUNCTION {format.pages} 407 | { pages empty$ 408 | { "" } 409 | { pages multi.page.check 410 | { "pages" pages n.dashify tie.or.space.connect } 411 | { "page" pages tie.or.space.connect } 412 | if$ 413 | } 414 | if$ 415 | } 416 | 417 | FUNCTION {format.year.label} 418 | { year extra.label * 419 | } 420 | 421 | FUNCTION {format.vol.num.pages} 422 | { volume field.or.null 423 | number empty$ 424 | 'skip$ 425 | { "(" number * ")" * * 426 | 
volume empty$ 427 | { "there's a number but no volume in " cite$ * warning$ } 428 | 'skip$ 429 | if$ 430 | } 431 | if$ 432 | pages empty$ 433 | 'skip$ 434 | { duplicate$ empty$ 435 | { pop$ format.pages } 436 | { ":" * pages n.dashify * } 437 | if$ 438 | } 439 | if$ 440 | } 441 | 442 | FUNCTION {format.chapter.pages} 443 | { chapter empty$ 444 | 'format.pages 445 | { type empty$ 446 | { "chapter" } 447 | { type "l" change.case$ } 448 | if$ 449 | chapter tie.or.space.connect 450 | pages empty$ 451 | 'skip$ 452 | { ", " * format.pages * } 453 | if$ 454 | } 455 | if$ 456 | } 457 | 458 | FUNCTION {format.in.ed.booktitle} 459 | { booktitle empty$ 460 | { "" } 461 | { editor empty$ 462 | { "In " booktitle emphasize * } 463 | { "In " format.editors * ", " * booktitle emphasize * } 464 | if$ 465 | } 466 | if$ 467 | } 468 | 469 | FUNCTION {empty.misc.check} 470 | { author empty$ title empty$ howpublished empty$ 471 | month empty$ year empty$ note empty$ 472 | and and and and and 473 | 474 | key empty$ not and 475 | 476 | { "all relevant fields are empty in " cite$ * warning$ } 477 | 'skip$ 478 | if$ 479 | } 480 | 481 | FUNCTION {format.thesis.type} 482 | { type empty$ 483 | 'skip$ 484 | { pop$ 485 | type "t" change.case$ 486 | } 487 | if$ 488 | } 489 | 490 | FUNCTION {format.tr.number} 491 | { type empty$ 492 | { "Technical Report" } 493 | 'type 494 | if$ 495 | number empty$ 496 | { "t" change.case$ } 497 | { number tie.or.space.connect } 498 | if$ 499 | } 500 | 501 | FUNCTION {format.article.crossref} 502 | { key empty$ 503 | { journal empty$ 504 | { "need key or journal for " cite$ * " to crossref " * crossref * 505 | warning$ 506 | "" 507 | } 508 | { "In {\em " journal * "\/}" * } 509 | if$ 510 | } 511 | { "In " key * } 512 | if$ 513 | " \cite{" * crossref * "}" * 514 | } 515 | 516 | FUNCTION {format.crossref.editor} 517 | { editor #1 "{vv~}{ll}" format.name$ 518 | editor num.names$ duplicate$ 519 | #2 > 520 | { pop$ " et~al." * } 521 | { #2 < 522 | 'skip$ 523 | { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = 524 | { " et~al." 
* } 525 | { " and " * editor #2 "{vv~}{ll}" format.name$ * } 526 | if$ 527 | } 528 | if$ 529 | } 530 | if$ 531 | } 532 | 533 | FUNCTION {format.book.crossref} 534 | { volume empty$ 535 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ 536 | "In " 537 | } 538 | { "Volume" volume tie.or.space.connect 539 | " of " * 540 | } 541 | if$ 542 | editor empty$ 543 | editor field.or.null author field.or.null = 544 | or 545 | { key empty$ 546 | { series empty$ 547 | { "need editor, key, or series for " cite$ * " to crossref " * 548 | crossref * warning$ 549 | "" * 550 | } 551 | { "{\em " * series * "\/}" * } 552 | if$ 553 | } 554 | { key * } 555 | if$ 556 | } 557 | { format.crossref.editor * } 558 | if$ 559 | " \cite{" * crossref * "}" * 560 | } 561 | 562 | FUNCTION {format.incoll.inproc.crossref} 563 | { editor empty$ 564 | editor field.or.null author field.or.null = 565 | or 566 | { key empty$ 567 | { booktitle empty$ 568 | { "need editor, key, or booktitle for " cite$ * " to crossref " * 569 | crossref * warning$ 570 | "" 571 | } 572 | { "In {\em " booktitle * "\/}" * } 573 | if$ 574 | } 575 | { "In " key * } 576 | if$ 577 | } 578 | { "In " format.crossref.editor * } 579 | if$ 580 | " \cite{" * crossref * "}" * 581 | } 582 | 583 | FUNCTION {article} 584 | { output.bibitem 585 | format.authors "author" output.check 586 | new.block 587 | format.year.label "year" output.check 588 | new.block 589 | format.title "title" output.check 590 | new.block 591 | crossref missing$ 592 | { journal emphasize "journal" output.check 593 | format.vol.num.pages output 594 | format.date output 595 | } 596 | { format.article.crossref output.nonnull 597 | format.pages output 598 | } 599 | if$ 600 | new.block 601 | note output 602 | fin.entry 603 | } 604 | 605 | FUNCTION {book} 606 | { output.bibitem 607 | author empty$ 608 | { format.editors "author and editor" output.check } 609 | { format.authors output.nonnull 610 | crossref missing$ 611 | { "author and editor" editor either.or.check } 612 | 'skip$ 613 | if$ 614 | } 615 | if$ 616 | new.block 617 | format.year.label "year" output.check 618 | new.block 619 | format.btitle "title" output.check 620 | crossref missing$ 621 | { format.bvolume output 622 | new.block 623 | format.number.series output 624 | new.sentence 625 | publisher "publisher" output.check 626 | address output 627 | } 628 | { new.block 629 | format.book.crossref output.nonnull 630 | } 631 | if$ 632 | format.edition output 633 | format.date output 634 | new.block 635 | note output 636 | fin.entry 637 | } 638 | 639 | FUNCTION {booklet} 640 | { output.bibitem 641 | format.authors output 642 | new.block 643 | format.year.label "year" output.check 644 | new.block 645 | format.title "title" output.check 646 | howpublished address new.block.checkb 647 | howpublished output 648 | address output 649 | format.date output 650 | new.block 651 | note output 652 | fin.entry 653 | } 654 | 655 | FUNCTION {inbook} 656 | { output.bibitem 657 | author empty$ 658 | { format.editors "author and editor" output.check } 659 | { format.authors output.nonnull 660 | crossref missing$ 661 | { "author and editor" editor either.or.check } 662 | 'skip$ 663 | if$ 664 | } 665 | if$ 666 | format.year.label "year" output.check 667 | new.block 668 | new.block 669 | format.btitle "title" output.check 670 | crossref missing$ 671 | { format.bvolume output 672 | format.chapter.pages "chapter and pages" output.check 673 | new.block 674 | format.number.series output 675 | new.sentence 676 | publisher "publisher" output.check 
677 | address output 678 | } 679 | { format.chapter.pages "chapter and pages" output.check 680 | new.block 681 | format.book.crossref output.nonnull 682 | } 683 | if$ 684 | format.edition output 685 | format.date output 686 | new.block 687 | note output 688 | fin.entry 689 | } 690 | 691 | FUNCTION {incollection} 692 | { output.bibitem 693 | format.authors "author" output.check 694 | new.block 695 | format.year.label "year" output.check 696 | new.block 697 | format.title "title" output.check 698 | new.block 699 | crossref missing$ 700 | { format.in.ed.booktitle "booktitle" output.check 701 | format.bvolume output 702 | format.number.series output 703 | format.chapter.pages output 704 | new.sentence 705 | publisher "publisher" output.check 706 | address output 707 | format.edition output 708 | format.date output 709 | } 710 | { format.incoll.inproc.crossref output.nonnull 711 | format.chapter.pages output 712 | } 713 | if$ 714 | new.block 715 | note output 716 | fin.entry 717 | } 718 | 719 | FUNCTION {inproceedings} 720 | { output.bibitem 721 | format.authors "author" output.check 722 | new.block 723 | format.year.label "year" output.check 724 | new.block 725 | format.title "title" output.check 726 | new.block 727 | crossref missing$ 728 | { format.in.ed.booktitle "booktitle" output.check 729 | format.bvolume output 730 | format.number.series output 731 | format.pages output 732 | address empty$ 733 | { organization publisher new.sentence.checkb 734 | organization output 735 | publisher output 736 | format.date output 737 | } 738 | { address output.nonnull 739 | format.date output 740 | new.sentence 741 | organization output 742 | publisher output 743 | } 744 | if$ 745 | } 746 | { format.incoll.inproc.crossref output.nonnull 747 | format.pages output 748 | } 749 | if$ 750 | new.block 751 | note output 752 | fin.entry 753 | } 754 | 755 | FUNCTION {conference} { inproceedings } 756 | 757 | FUNCTION {manual} 758 | { output.bibitem 759 | author empty$ 760 | { organization empty$ 761 | 'skip$ 762 | { organization output.nonnull 763 | address output 764 | } 765 | if$ 766 | } 767 | { format.authors output.nonnull } 768 | if$ 769 | format.year.label "year" output.check 770 | new.block 771 | new.block 772 | format.btitle "title" output.check 773 | author empty$ 774 | { organization empty$ 775 | { address new.block.checka 776 | address output 777 | } 778 | 'skip$ 779 | if$ 780 | } 781 | { organization address new.block.checkb 782 | organization output 783 | address output 784 | } 785 | if$ 786 | format.edition output 787 | format.date output 788 | new.block 789 | note output 790 | fin.entry 791 | } 792 | 793 | FUNCTION {mastersthesis} 794 | { output.bibitem 795 | format.authors "author" output.check 796 | new.block 797 | format.year.label "year" output.check 798 | new.block 799 | format.title "title" output.check 800 | new.block 801 | "Master's thesis" format.thesis.type output.nonnull 802 | school "school" output.check 803 | address output 804 | format.date output 805 | new.block 806 | note output 807 | fin.entry 808 | } 809 | 810 | FUNCTION {misc} 811 | { output.bibitem 812 | format.authors output 813 | new.block 814 | format.year.label output 815 | new.block 816 | title howpublished new.block.checkb 817 | format.title output 818 | howpublished new.block.checka 819 | howpublished output 820 | format.date output 821 | new.block 822 | note output 823 | fin.entry 824 | empty.misc.check 825 | } 826 | 827 | FUNCTION {phdthesis} 828 | { output.bibitem 829 | format.authors "author" output.check 830 | 
new.block 831 | format.year.label "year" output.check 832 | new.block 833 | format.btitle "title" output.check 834 | new.block 835 | "{Ph.D.} thesis" format.thesis.type output.nonnull 836 | school "school" output.check 837 | address output 838 | format.date output 839 | new.block 840 | note output 841 | fin.entry 842 | } 843 | 844 | FUNCTION {proceedings} 845 | { output.bibitem 846 | editor empty$ 847 | { organization output } 848 | { format.editors output.nonnull } 849 | if$ 850 | new.block 851 | format.year.label "year" output.check 852 | new.block 853 | format.btitle "title" output.check 854 | format.bvolume output 855 | format.number.series output 856 | address empty$ 857 | { editor empty$ 858 | { publisher new.sentence.checka } 859 | { organization publisher new.sentence.checkb 860 | organization output 861 | } 862 | if$ 863 | publisher output 864 | format.date output 865 | } 866 | { address output.nonnull 867 | format.date output 868 | new.sentence 869 | editor empty$ 870 | 'skip$ 871 | { organization output } 872 | if$ 873 | publisher output 874 | } 875 | if$ 876 | new.block 877 | note output 878 | fin.entry 879 | } 880 | 881 | FUNCTION {techreport} 882 | { output.bibitem 883 | format.authors "author" output.check 884 | new.block 885 | format.year.label "year" output.check 886 | new.block 887 | format.title "title" output.check 888 | new.block 889 | format.tr.number output.nonnull 890 | institution "institution" output.check 891 | address output 892 | format.date output 893 | new.block 894 | note output 895 | fin.entry 896 | } 897 | 898 | FUNCTION {unpublished} 899 | { output.bibitem 900 | format.authors "author" output.check 901 | new.block 902 | format.year.label "year" output.check 903 | new.block 904 | format.title "title" output.check 905 | new.block 906 | note "note" output.check 907 | format.date output 908 | fin.entry 909 | } 910 | 911 | FUNCTION {default.type} { misc } 912 | 913 | MACRO {jan} {"January"} 914 | 915 | MACRO {feb} {"February"} 916 | 917 | MACRO {mar} {"March"} 918 | 919 | MACRO {apr} {"April"} 920 | 921 | MACRO {may} {"May"} 922 | 923 | MACRO {jun} {"June"} 924 | 925 | MACRO {jul} {"July"} 926 | 927 | MACRO {aug} {"August"} 928 | 929 | MACRO {sep} {"September"} 930 | 931 | MACRO {oct} {"October"} 932 | 933 | MACRO {nov} {"November"} 934 | 935 | MACRO {dec} {"December"} 936 | 937 | MACRO {acmcs} {"ACM Computing Surveys"} 938 | 939 | MACRO {acta} {"Acta Informatica"} 940 | 941 | MACRO {cacm} {"Communications of the ACM"} 942 | 943 | MACRO {ibmjrd} {"IBM Journal of Research and Development"} 944 | 945 | MACRO {ibmsj} {"IBM Systems Journal"} 946 | 947 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"} 948 | 949 | MACRO {ieeetc} {"IEEE Transactions on Computers"} 950 | 951 | MACRO {ieeetcad} 952 | {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} 953 | 954 | MACRO {ipl} {"Information Processing Letters"} 955 | 956 | MACRO {jacm} {"Journal of the ACM"} 957 | 958 | MACRO {jcss} {"Journal of Computer and System Sciences"} 959 | 960 | MACRO {scp} {"Science of Computer Programming"} 961 | 962 | MACRO {sicomp} {"SIAM Journal on Computing"} 963 | 964 | MACRO {tocs} {"ACM Transactions on Computer Systems"} 965 | 966 | MACRO {tods} {"ACM Transactions on Database Systems"} 967 | 968 | MACRO {tog} {"ACM Transactions on Graphics"} 969 | 970 | MACRO {toms} {"ACM Transactions on Mathematical Software"} 971 | 972 | MACRO {toois} {"ACM Transactions on Office Information Systems"} 973 | 974 | MACRO {toplas} {"ACM Transactions on Programming Languages 
and Systems"} 975 | 976 | MACRO {tcs} {"Theoretical Computer Science"} 977 | 978 | READ 979 | 980 | FUNCTION {sortify} 981 | { purify$ 982 | "l" change.case$ 983 | } 984 | 985 | INTEGERS { len } 986 | 987 | FUNCTION {chop.word} 988 | { 's := 989 | 'len := 990 | s #1 len substring$ = 991 | { s len #1 + global.max$ substring$ } 992 | 's 993 | if$ 994 | } 995 | 996 | INTEGERS { et.al.char.used } 997 | 998 | FUNCTION {initialize.et.al.char.used} 999 | { #0 'et.al.char.used := 1000 | } 1001 | 1002 | EXECUTE {initialize.et.al.char.used} 1003 | 1004 | FUNCTION {format.lab.names} 1005 | { 's := 1006 | s num.names$ 'numnames := 1007 | 1008 | numnames #1 = 1009 | { s #1 "{vv }{ll}" format.name$ } 1010 | { numnames #2 = 1011 | { s #1 "{vv }{ll }and " format.name$ s #2 "{vv }{ll}" format.name$ * 1012 | } 1013 | { s #1 "{vv }{ll }\bgroup et al.\egroup " format.name$ } 1014 | if$ 1015 | } 1016 | if$ 1017 | 1018 | } 1019 | 1020 | FUNCTION {author.key.label} 1021 | { author empty$ 1022 | { key empty$ 1023 | 1024 | { cite$ #1 #3 substring$ } 1025 | 1026 | { key #3 text.prefix$ } 1027 | if$ 1028 | } 1029 | { author format.lab.names } 1030 | if$ 1031 | } 1032 | 1033 | FUNCTION {author.editor.key.label} 1034 | { author empty$ 1035 | { editor empty$ 1036 | { key empty$ 1037 | 1038 | { cite$ #1 #3 substring$ } 1039 | 1040 | { key #3 text.prefix$ } 1041 | if$ 1042 | } 1043 | { editor format.lab.names } 1044 | if$ 1045 | } 1046 | { author format.lab.names } 1047 | if$ 1048 | } 1049 | 1050 | FUNCTION {author.key.organization.label} 1051 | { author empty$ 1052 | { key empty$ 1053 | { organization empty$ 1054 | 1055 | { cite$ #1 #3 substring$ } 1056 | 1057 | { "The " #4 organization chop.word #3 text.prefix$ } 1058 | if$ 1059 | } 1060 | { key #3 text.prefix$ } 1061 | if$ 1062 | } 1063 | { author format.lab.names } 1064 | if$ 1065 | } 1066 | 1067 | FUNCTION {editor.key.organization.label} 1068 | { editor empty$ 1069 | { key empty$ 1070 | { organization empty$ 1071 | 1072 | { cite$ #1 #3 substring$ } 1073 | 1074 | { "The " #4 organization chop.word #3 text.prefix$ } 1075 | if$ 1076 | } 1077 | { key #3 text.prefix$ } 1078 | if$ 1079 | } 1080 | { editor format.lab.names } 1081 | if$ 1082 | } 1083 | 1084 | FUNCTION {calc.label} 1085 | { type$ "book" = 1086 | type$ "inbook" = 1087 | or 1088 | 'author.editor.key.label 1089 | { type$ "proceedings" = 1090 | 'editor.key.organization.label 1091 | { type$ "manual" = 1092 | 'author.key.organization.label 1093 | 'author.key.label 1094 | if$ 1095 | } 1096 | if$ 1097 | } 1098 | if$ 1099 | duplicate$ 1100 | 1101 | "\protect\citename{" swap$ * "}" * 1102 | year field.or.null purify$ * 1103 | 'label := 1104 | year field.or.null purify$ * 1105 | 1106 | sortify 'sort.label := 1107 | } 1108 | 1109 | FUNCTION {sort.format.names} 1110 | { 's := 1111 | #1 'nameptr := 1112 | "" 1113 | s num.names$ 'numnames := 1114 | numnames 'namesleft := 1115 | { namesleft #0 > } 1116 | { nameptr #1 > 1117 | { " " * } 1118 | 'skip$ 1119 | if$ 1120 | 1121 | s nameptr "{vv{ } }{ll{ }}{ ff{ }}{ jj{ }}" format.name$ 't := 1122 | 1123 | nameptr numnames = t "others" = and 1124 | { "et al" * } 1125 | { t sortify * } 1126 | if$ 1127 | nameptr #1 + 'nameptr := 1128 | namesleft #1 - 'namesleft := 1129 | } 1130 | while$ 1131 | } 1132 | 1133 | FUNCTION {sort.format.title} 1134 | { 't := 1135 | "A " #2 1136 | "An " #3 1137 | "The " #4 t chop.word 1138 | chop.word 1139 | chop.word 1140 | sortify 1141 | #1 global.max$ substring$ 1142 | } 1143 | 1144 | FUNCTION {author.sort} 1145 | { author empty$ 1146 | { key empty$ 
1147 | { "to sort, need author or key in " cite$ * warning$ 1148 | "" 1149 | } 1150 | { key sortify } 1151 | if$ 1152 | } 1153 | { author sort.format.names } 1154 | if$ 1155 | } 1156 | 1157 | FUNCTION {author.editor.sort} 1158 | { author empty$ 1159 | { editor empty$ 1160 | { key empty$ 1161 | { "to sort, need author, editor, or key in " cite$ * warning$ 1162 | "" 1163 | } 1164 | { key sortify } 1165 | if$ 1166 | } 1167 | { editor sort.format.names } 1168 | if$ 1169 | } 1170 | { author sort.format.names } 1171 | if$ 1172 | } 1173 | 1174 | FUNCTION {author.organization.sort} 1175 | { author empty$ 1176 | { organization empty$ 1177 | { key empty$ 1178 | { "to sort, need author, organization, or key in " cite$ * warning$ 1179 | "" 1180 | } 1181 | { key sortify } 1182 | if$ 1183 | } 1184 | { "The " #4 organization chop.word sortify } 1185 | if$ 1186 | } 1187 | { author sort.format.names } 1188 | if$ 1189 | } 1190 | 1191 | FUNCTION {editor.organization.sort} 1192 | { editor empty$ 1193 | { organization empty$ 1194 | { key empty$ 1195 | { "to sort, need editor, organization, or key in " cite$ * warning$ 1196 | "" 1197 | } 1198 | { key sortify } 1199 | if$ 1200 | } 1201 | { "The " #4 organization chop.word sortify } 1202 | if$ 1203 | } 1204 | { editor sort.format.names } 1205 | if$ 1206 | } 1207 | 1208 | FUNCTION {presort} 1209 | 1210 | { calc.label 1211 | sort.label 1212 | " " 1213 | * 1214 | type$ "book" = 1215 | 1216 | type$ "inbook" = 1217 | or 1218 | 'author.editor.sort 1219 | { type$ "proceedings" = 1220 | 'editor.organization.sort 1221 | { type$ "manual" = 1222 | 'author.organization.sort 1223 | 'author.sort 1224 | if$ 1225 | } 1226 | if$ 1227 | } 1228 | if$ 1229 | 1230 | * 1231 | 1232 | " " 1233 | * 1234 | year field.or.null sortify 1235 | * 1236 | " " 1237 | * 1238 | title field.or.null 1239 | sort.format.title 1240 | * 1241 | #1 entry.max$ substring$ 1242 | 'sort.key$ := 1243 | } 1244 | 1245 | ITERATE {presort} 1246 | 1247 | SORT 1248 | 1249 | STRINGS { longest.label last.sort.label next.extra } 1250 | 1251 | INTEGERS { longest.label.width last.extra.num } 1252 | 1253 | FUNCTION {initialize.longest.label} 1254 | { "" 'longest.label := 1255 | #0 int.to.chr$ 'last.sort.label := 1256 | "" 'next.extra := 1257 | #0 'longest.label.width := 1258 | #0 'last.extra.num := 1259 | } 1260 | 1261 | FUNCTION {forward.pass} 1262 | { last.sort.label sort.label = 1263 | { last.extra.num #1 + 'last.extra.num := 1264 | last.extra.num int.to.chr$ 'extra.label := 1265 | } 1266 | { "a" chr.to.int$ 'last.extra.num := 1267 | "" 'extra.label := 1268 | sort.label 'last.sort.label := 1269 | } 1270 | if$ 1271 | } 1272 | 1273 | FUNCTION {reverse.pass} 1274 | { next.extra "b" = 1275 | { "a" 'extra.label := } 1276 | 'skip$ 1277 | if$ 1278 | label extra.label * 'label := 1279 | label width$ longest.label.width > 1280 | { label 'longest.label := 1281 | label width$ 'longest.label.width := 1282 | } 1283 | 'skip$ 1284 | if$ 1285 | extra.label 'next.extra := 1286 | } 1287 | 1288 | EXECUTE {initialize.longest.label} 1289 | 1290 | ITERATE {forward.pass} 1291 | 1292 | REVERSE {reverse.pass} 1293 | 1294 | FUNCTION {begin.bib} 1295 | 1296 | { et.al.char.used 1297 | { "\newcommand{\etalchar}[1]{$^{#1}$}" write$ newline$ } 1298 | 'skip$ 1299 | if$ 1300 | preamble$ empty$ 1301 | 1302 | 'skip$ 1303 | { preamble$ write$ newline$ } 1304 | if$ 1305 | 1306 | "\begin{thebibliography}{" "}" * write$ newline$ 1307 | 1308 | } 1309 | 1310 | EXECUTE {begin.bib} 1311 | 1312 | EXECUTE {init.state.consts} 1313 | 1314 | ITERATE {call.type$} 1315 
| 1316 | FUNCTION {end.bib} 1317 | { newline$ 1318 | "\end{thebibliography}" write$ newline$ 1319 | } 1320 | 1321 | EXECUTE {end.bib} 1322 | 1323 | -------------------------------------------------------------------------------- /tex/acl2015.sty: -------------------------------------------------------------------------------- 1 | % File acl2015.sty 2 | % December 2014 3 | 4 | % This is the LaTeX style file for ACL 2015. It is nearly identical to 5 | % the style files for ACL 2014, EACL 2006, ACL2005, ACL 2002, ACL 6 | % 2001, ACL 2000, EACL 95 and EACL 99. 7 | % 8 | % Changes made include: adapt layout to A4 and centimeters, widen abstract 9 | 10 | % This is the LaTeX style file for ACL 2000. It is nearly identical to the 11 | % style files for EACL 95 and EACL 99. Minor changes include editing the 12 | % instructions to reflect use of \documentclass rather than \documentstyle 13 | % and removing the white space before the title on the first page 14 | % -- John Chen, June 29, 2000 15 | 16 | % To convert from submissions prepared using the style file aclsub.sty 17 | % prepared for the ACL 2000 conference, proceed as follows: 18 | % 1) Remove submission-specific information: \whichsession, \id, 19 | % \wordcount, \otherconferences, \area, \keywords 20 | % 2) \summary should be removed. The summary material should come 21 | % after \maketitle and should be in the ``abstract'' environment 22 | % 3) Check all citations. This style should handle citations correctly 23 | % and also allows multiple citations separated by semicolons. 24 | % 4) Check figures and examples. Because the final format is double- 25 | % column, some adjustments may have to be made to fit text in the column 26 | % or to choose full-width (\figure*} figures. 27 | % 5) Change the style reference from aclsub to acl2000, and be sure 28 | % this style file is in your TeX search path 29 | 30 | 31 | % This is the LaTeX style file for EACL-95. It is identical to the 32 | % style file for ANLP '94 except that the margins are adjusted for A4 33 | % paper. -- abney 13 Dec 94 34 | 35 | % The ANLP '94 style file is a slightly modified 36 | % version of the style used for AAAI and IJCAI, using some changes 37 | % prepared by Fernando Pereira and others and some minor changes 38 | % by Paul Jacobs. 39 | 40 | % Papers prepared using the aclsub.sty file and acl.bst bibtex style 41 | % should be easily converted to final format using this style. 42 | % (1) Submission information (\wordcount, \subject, and \makeidpage) 43 | % should be removed. 44 | % (2) \summary should be removed. The summary material should come 45 | % after \maketitle and should be in the ``abstract'' environment 46 | % (between \begin{abstract} and \end{abstract}). 47 | % (3) Check all citations. This style should handle citations correctly 48 | % and also allows multiple citations separated by semicolons. 49 | % (4) Check figures and examples. Because the final format is double- 50 | % column, some adjustments may have to be made to fit text in the column 51 | % or to choose full-width (\figure*} figures. 52 | 53 | % Place this in a file called aclap.sty in the TeX search path. 54 | % (Placing it in the same directory as the paper should also work.) 55 | 56 | % Prepared by Peter F. Patel-Schneider, liberally using the ideas of 57 | % other style hackers, including Barbara Beeton. 58 | % This style is NOT guaranteed to work. It is provided in the hope 59 | % that it will make the preparation of papers easier. 60 | % 61 | % There are undoubtably bugs in this style. 
If you make bug fixes, 62 | % improvements, etc. please let me know. My e-mail address is: 63 | % pfps@research.att.com 64 | 65 | % Papers are to be prepared using the ``acl'' bibliography style, 66 | % as follows: 67 | % \documentclass[11pt]{article} 68 | % \usepackage{acl2000} 69 | % \title{Title} 70 | % \author{Author 1 \and Author 2 \\ Address line \\ Address line \And 71 | % Author 3 \\ Address line \\ Address line} 72 | % \begin{document} 73 | % ... 74 | % \bibliography{bibliography-file} 75 | % \bibliographystyle{acl} 76 | % \end{document} 77 | 78 | % Author information can be set in various styles: 79 | % For several authors from the same institution: 80 | % \author{Author 1 \and ... \and Author n \\ 81 | % Address line \\ ... \\ Address line} 82 | % if the names do not fit well on one line use 83 | % Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\ 84 | % For authors from different institutions: 85 | % \author{Author 1 \\ Address line \\ ... \\ Address line 86 | % \And ... \And 87 | % Author n \\ Address line \\ ... \\ Address line} 88 | % To start a seperate ``row'' of authors use \AND, as in 89 | % \author{Author 1 \\ Address line \\ ... \\ Address line 90 | % \AND 91 | % Author 2 \\ Address line \\ ... \\ Address line \And 92 | % Author 3 \\ Address line \\ ... \\ Address line} 93 | 94 | % If the title and author information does not fit in the area allocated, 95 | % place \setlength\titlebox{} right after 96 | % \usepackage{acl2015} 97 | % where can be something larger than 5cm 98 | 99 | \typeout{Conference Style for ACL 2015 -- released December 7, 2014} 100 | 101 | % NOTE: Some laser printers have a serious problem printing TeX output. 102 | % These printing devices, commonly known as ``write-white'' laser 103 | % printers, tend to make characters too light. To get around this 104 | % problem, a darker set of fonts must be created for these devices. 105 | % 106 | 107 | 108 | 109 | % A4 modified by Eneko; again modified by Alexander for 5cm titlebox 110 | \setlength{\paperwidth}{21cm} % A4 111 | \setlength{\paperheight}{29.7cm}% A4 112 | \setlength\topmargin{-0.5cm} 113 | \setlength\oddsidemargin{0cm} 114 | \setlength\textheight{24.7cm} 115 | \setlength\textwidth{16.0cm} 116 | \setlength\columnsep{0.6cm} 117 | \newlength\titlebox 118 | \setlength\titlebox{5cm} 119 | \setlength\headheight{5pt} 120 | \setlength\headsep{0pt} 121 | \thispagestyle{empty} 122 | \pagestyle{empty} 123 | 124 | 125 | \flushbottom \twocolumn \sloppy 126 | 127 | % We're never going to need a table of contents, so just flush it to 128 | % save space --- suggested by drstrip@sandia-2 129 | \def\addcontentsline#1#2#3{} 130 | 131 | % Title stuff, taken from deproc. 
132 | \def\maketitle{\par 133 | \begingroup 134 | \def\thefootnote{\fnsymbol{footnote}} 135 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} 136 | \twocolumn[\@maketitle] \@thanks 137 | \endgroup 138 | \setcounter{footnote}{0} 139 | \let\maketitle\relax \let\@maketitle\relax 140 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} 141 | \def\@maketitle{\vbox to \titlebox{\hsize\textwidth 142 | \linewidth\hsize \vskip 0.125in minus 0.125in \centering 143 | {\Large\bf \@title \par} \vskip 0.2in plus 1fil minus 0.1in 144 | {\def\and{\unskip\enspace{\rm and}\enspace}% 145 | \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil 146 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf}% 147 | \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup 148 | \vskip 0.25in plus 1fil minus 0.125in 149 | \hbox to \linewidth\bgroup\large \hfil\hfil 150 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf} 151 | \hbox to \linewidth\bgroup\large \hfil\hfil 152 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf\@author 153 | \end{tabular}\hss\egroup 154 | \hfil\hfil\egroup} 155 | \vskip 0.3in plus 2fil minus 0.1in 156 | }} 157 | 158 | % margins for abstract 159 | \renewenvironment{abstract}% 160 | {\centerline{\large\bf Abstract}% 161 | \begin{list}{}% 162 | {\setlength{\rightmargin}{0.6cm}% 163 | \setlength{\leftmargin}{0.6cm}}% 164 | \item[]\ignorespaces}% 165 | {\unskip\end{list}} 166 | 167 | %\renewenvironment{abstract}{\centerline{\large\bf 168 | % Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} 169 | 170 | 171 | % bibliography 172 | 173 | \def\thebibliography#1{\section*{References} 174 | \global\def\@listi{\leftmargin\leftmargini 175 | \labelwidth\leftmargini \advance\labelwidth-\labelsep 176 | \topsep 1pt plus 2pt minus 1pt 177 | \parsep 0.25ex plus 1pt \itemsep 0.25ex plus 1pt} 178 | \list {[\arabic{enumi}]}{\settowidth\labelwidth{[#1]}\leftmargin\labelwidth 179 | \advance\leftmargin\labelsep\usecounter{enumi}} 180 | \def\newblock{\hskip .11em plus .33em minus -.07em} 181 | \sloppy 182 | \sfcode`\.=1000\relax} 183 | 184 | \def\@up#1{\raise.2ex\hbox{#1}} 185 | 186 | % most of cite format is from aclsub.sty by SMS 187 | 188 | % don't box citations, separate with ; and a space 189 | % also, make the penalty between citations negative: a good place to break 190 | % changed comma back to semicolon pj 2/1/90 191 | % \def\@citex[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 192 | % \def\@citea{}\@cite{\@for\@citeb:=#2\do 193 | % {\@citea\def\@citea{;\penalty\@citeseppen\ }\@ifundefined 194 | % {b@\@citeb}{{\bf ?}\@warning 195 | % {Citation `\@citeb' on page \thepage \space undefined}}% 196 | % {\csname b@\@citeb\endcsname}}}{#1}} 197 | 198 | % don't box citations, separate with ; and a space 199 | % Replaced for multiple citations (pj) 200 | % don't box citations and also add space, semicolon between multiple citations 201 | \def\@citex[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 202 | \def\@citea{}\@cite{\@for\@citeb:=#2\do 203 | {\@citea\def\@citea{; }\@ifundefined 204 | {b@\@citeb}{{\bf ?}\@warning 205 | {Citation `\@citeb' on page \thepage \space undefined}}% 206 | {\csname b@\@citeb\endcsname}}}{#1}} 207 | 208 | % Allow short (name-less) citations, when used in 209 | % conjunction with a bibliography style that creates labels like 210 | % \citename{, } 211 | % 212 | \let\@internalcite\cite 213 | \def\cite{\def\citename##1{##1, }\@internalcite} 214 | \def\shortcite{\def\citename##1{}\@internalcite} 215 | 
\def\newcite{\def\citename##1{{\frenchspacing##1} (}\@internalciteb} 216 | 217 | % Macros for \newcite, which leaves name in running text, and is 218 | % otherwise like \shortcite. 219 | \def\@citexb[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 220 | \def\@citea{}\@newcite{\@for\@citeb:=#2\do 221 | {\@citea\def\@citea{;\penalty\@m\ }\@ifundefined 222 | {b@\@citeb}{{\bf ?}\@warning 223 | {Citation `\@citeb' on page \thepage \space undefined}}% 224 | {\csname b@\@citeb\endcsname}}}{#1}} 225 | \def\@internalciteb{\@ifnextchar [{\@tempswatrue\@citexb}{\@tempswafalse\@citexb[]}} 226 | 227 | \def\@newcite#1#2{{#1\if@tempswa, #2\fi)}} 228 | 229 | \def\@biblabel#1{\def\citename##1{##1}[#1]\hfill} 230 | 231 | %%% More changes made by SMS (originals in latex.tex) 232 | % Use parentheses instead of square brackets in the text. 233 | \def\@cite#1#2{({#1\if@tempswa , #2\fi})} 234 | 235 | % Don't put a label in the bibliography at all. Just use the unlabeled format 236 | % instead. 237 | \def\thebibliography#1{\vskip\parskip% 238 | \vskip\baselineskip% 239 | \def\baselinestretch{1}% 240 | \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi% 241 | \vskip-\parskip% 242 | \vskip-\baselineskip% 243 | \section*{References\@mkboth 244 | {References}{References}}\list 245 | {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent} 246 | \setlength{\itemindent}{-\parindent}} 247 | \def\newblock{\hskip .11em plus .33em minus -.07em} 248 | \sloppy\clubpenalty4000\widowpenalty4000 249 | \sfcode`\.=1000\relax} 250 | \let\endthebibliography=\endlist 251 | 252 | % Allow for a bibliography of sources of attested examples 253 | \def\thesourcebibliography#1{\vskip\parskip% 254 | \vskip\baselineskip% 255 | \def\baselinestretch{1}% 256 | \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi% 257 | \vskip-\parskip% 258 | \vskip-\baselineskip% 259 | \section*{Sources of Attested Examples\@mkboth 260 | {Sources of Attested Examples}{Sources of Attested Examples}}\list 261 | {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent} 262 | \setlength{\itemindent}{-\parindent}} 263 | \def\newblock{\hskip .11em plus .33em minus -.07em} 264 | \sloppy\clubpenalty4000\widowpenalty4000 265 | \sfcode`\.=1000\relax} 266 | \let\endthesourcebibliography=\endlist 267 | 268 | \def\@lbibitem[#1]#2{\item[]\if@filesw 269 | { \def\protect##1{\string ##1\space}\immediate 270 | \write\@auxout{\string\bibcite{#2}{#1}}\fi\ignorespaces}} 271 | 272 | \def\@bibitem#1{\item\if@filesw \immediate\write\@auxout 273 | {\string\bibcite{#1}{\the\c@enumi}}\fi\ignorespaces} 274 | 275 | % sections with less space 276 | \def\section{\@startsection {section}{1}{\z@}{-2.0ex plus 277 | -0.5ex minus -.2ex}{1.5ex plus 0.3ex minus .2ex}{\large\bf\raggedright}} 278 | \def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus 279 | -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} 280 | %% changed by KO to - values to get teh initial parindent right 281 | \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex plus 282 | -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalsize\bf\raggedright}} 283 | \def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus 284 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 285 | \def\subparagraph{\@startsection{subparagraph}{5}{\parindent}{1.5ex plus 286 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 287 | 288 | % Footnotes 289 | \footnotesep 6.65pt % 290 | \skip\footins 9pt plus 4pt minus 2pt 291 | \def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt } 292 | 
\setcounter{footnote}{0} 293 | 294 | % Lists and paragraphs 295 | \parindent 1em 296 | \topsep 4pt plus 1pt minus 2pt 297 | \partopsep 1pt plus 0.5pt minus 0.5pt 298 | \itemsep 2pt plus 1pt minus 0.5pt 299 | \parsep 2pt plus 1pt minus 0.5pt 300 | 301 | \leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em 302 | \leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em \leftmarginvi .5em 303 | \labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt 304 | 305 | \def\@listi{\leftmargin\leftmargini} 306 | \def\@listii{\leftmargin\leftmarginii 307 | \labelwidth\leftmarginii\advance\labelwidth-\labelsep 308 | \topsep 2pt plus 1pt minus 0.5pt 309 | \parsep 1pt plus 0.5pt minus 0.5pt 310 | \itemsep \parsep} 311 | \def\@listiii{\leftmargin\leftmarginiii 312 | \labelwidth\leftmarginiii\advance\labelwidth-\labelsep 313 | \topsep 1pt plus 0.5pt minus 0.5pt 314 | \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt 315 | \itemsep \topsep} 316 | \def\@listiv{\leftmargin\leftmarginiv 317 | \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} 318 | \def\@listv{\leftmargin\leftmarginv 319 | \labelwidth\leftmarginv\advance\labelwidth-\labelsep} 320 | \def\@listvi{\leftmargin\leftmarginvi 321 | \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} 322 | 323 | \abovedisplayskip 7pt plus2pt minus5pt% 324 | \belowdisplayskip \abovedisplayskip 325 | \abovedisplayshortskip 0pt plus3pt% 326 | \belowdisplayshortskip 4pt plus3pt minus3pt% 327 | 328 | % Less leading in most fonts (due to the narrow columns) 329 | % The choices were between 1-pt and 1.5-pt leading 330 | \def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} 331 | \def\small{\@setsize\small{10pt}\ixpt\@ixpt} 332 | \def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} 333 | \def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} 334 | \def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} 335 | \def\large{\@setsize\large{14pt}\xiipt\@xiipt} 336 | \def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} 337 | \def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} 338 | \def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} 339 | \def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} 340 | -------------------------------------------------------------------------------- /tex/acl2015.tex: -------------------------------------------------------------------------------- 1 | % 2 | % File acl2015.tex 3 | % 4 | % Contact: car@ir.hit.edu.cn, gdzhou@suda.edu.cn 5 | %% 6 | %% Based on the style files for ACL-2014, which were, in turn, 7 | %% Based on the style files for ACL-2013, which were, in turn, 8 | %% Based on the style files for ACL-2012, which were, in turn, 9 | %% based on the style files for ACL-2011, which were, in turn, 10 | %% based on the style files for ACL-2010, which were, in turn, 11 | %% based on the style files for ACL-IJCNLP-2009, which were, in turn, 12 | %% based on the style files for EACL-2009 and IJCNLP-2008... 13 | 14 | %% Based on the style files for EACL 2006 by 15 | %%e.agirre@ehu.es or Sergi.Balari@uab.es 16 | %% and that of ACL 08 by Joakim Nivre and Noah Smith 17 | 18 | \documentclass[11pt]{article} 19 | \usepackage{acl2015} 20 | \usepackage{times} 21 | \usepackage{url} 22 | \usepackage{latexsym} 23 | 24 | %\setlength\titlebox{5cm} 25 | 26 | % You can expand the titlebox if you need extra space 27 | % to show all the authors. Please do not make the titlebox 28 | % smaller than 5cm (the original size); we will check this 29 | % in the camera-ready version and ask you to change it back. 
30 | 31 | 32 | \title{Instructions for ACL-2015 Proceedings} 33 | 34 | \author{First Author \\ 35 | Affiliation / Address line 1 \\ 36 | Affiliation / Address line 2 \\ 37 | Affiliation / Address line 3 \\ 38 | {\tt email@domain} \\\And 39 | Second Author \\ 40 | Affiliation / Address line 1 \\ 41 | Affiliation / Address line 2 \\ 42 | Affiliation / Address line 3 \\ 43 | {\tt email@domain} \\} 44 | 45 | \date{} 46 | 47 | \begin{document} 48 | \maketitle 49 | \begin{abstract} 50 | This document contains the instructions for preparing a camera-ready 51 | manuscript for the proceedings of ACL-2015. The document itself 52 | conforms to its own specifications, and is therefore an example of 53 | what your manuscript should look like. These instructions should be 54 | used for both papers submitted for review and for final versions of 55 | accepted papers. Authors are asked to conform to all the directions 56 | reported in this document. 57 | \end{abstract} 58 | 59 | \section{Credits} 60 | 61 | This document has been adapted from the instructions for earlier ACL 62 | proceedings, including those for ACL-2012 by Maggie Li and Michael 63 | White, those from ACL-2010 by Jing-Shing Chang and Philipp Koehn, 64 | those for ACL-2008 by Johanna D. Moore, Simone Teufel, James Allan, 65 | and Sadaoki Furui, those for ACL-2005 by Hwee Tou Ng and Kemal 66 | Oflazer, those for ACL-2002 by Eugene Charniak and Dekang Lin, and 67 | earlier ACL and EACL formats. Those versions were written by several 68 | people, including John Chen, Henry S. Thompson and Donald 69 | Walker. Additional elements were taken from the formatting 70 | instructions of the {\em International Joint Conference on Artificial 71 | Intelligence}. 72 | 73 | \section{Introduction} 74 | 75 | The following instructions are directed to authors of papers submitted 76 | to ACL-2015 or accepted for publication in its proceedings. All 77 | authors are required to adhere to these specifications. Authors are 78 | required to provide a Portable Document Format (PDF) version of their 79 | papers. \textbf{The proceedings are designed for printing on A4 80 | paper.} 81 | 82 | We will make more detailed instructions available at 83 | \url{http://acl2015.org/publication.html}. Please check this website 84 | regularly. 85 | 86 | 87 | \section{General Instructions} 88 | 89 | Manuscripts must be in two-column format. Exceptions to the 90 | two-column format include the title, authors' names and complete 91 | addresses, which must be centered at the top of the first page, and 92 | any full-width figures or tables (see the guidelines in 93 | Subsection~\ref{ssec:first}). {\bf Type single-spaced.} Start all 94 | pages directly under the top margin. See the guidelines later 95 | regarding formatting the first page. The manuscript should be 96 | printed single-sided and its length 97 | should not exceed the maximum page limit described in Section~\ref{sec:length}. 98 | Do not number the pages. 99 | 100 | 101 | \subsection{Electronically-available resources} 102 | 103 | We strongly prefer that you prepare your PDF files using \LaTeX\ with 104 | the official ACL 2015 style file (acl2015.sty) and bibliography style 105 | (acl.bst). These files are available at 106 | \url{http://acl2015.org}. You will also find the document 107 | you are currently reading (acl2015.pdf) and its \LaTeX\ source code 108 | (acl2015.tex) on this website. 109 | 110 | You can alternatively use Microsoft Word to produce your PDF file. 
In 111 | this case, we strongly recommend the use of the Word template file 112 | (acl2015.dot) on the ACL 2015 website (\url{http://acl2015.org}). 113 | If you have an option, we recommend that you use the \LaTeX2e version. 114 | If you will be using the Microsoft Word template, we suggest that you 115 | anonymize your source file so that the pdf produced does not retain your 116 | identity. This can be done by removing any personal information 117 | from your source document properties. 118 | 119 | 120 | 121 | \subsection{Format of Electronic Manuscript} 122 | \label{sect:pdf} 123 | 124 | For the production of the electronic manuscript you must use Adobe's 125 | Portable Document Format (PDF). PDF files are usually produced from 126 | \LaTeX\ using the \textit{pdflatex} command. If your version of 127 | \LaTeX\ produces Postscript files, you can convert these into PDF 128 | using \textit{ps2pdf} or \textit{dvipdf}. On Windows, you can also use 129 | Adobe Distiller to generate PDF. 130 | 131 | Please make sure that your PDF file includes all the necessary fonts 132 | (especially tree diagrams, symbols, and fonts with Asian 133 | characters). When you print or create the PDF file, there is usually 134 | an option in your printer setup to include none, all or just 135 | non-standard fonts. Please make sure that you select the option of 136 | including ALL the fonts. \textbf{Before sending it, test your PDF by 137 | printing it from a computer different from the one where it was 138 | created.} Moreover, some word processors may generate very large PDF 139 | files, where each page is rendered as an image. Such images may 140 | reproduce poorly. In this case, try alternative ways to obtain the 141 | PDF. One way on some systems is to install a driver for a postscript 142 | printer, send your document to the printer specifying ``Output to a 143 | file'', then convert the file to PDF. 144 | 145 | It is of utmost importance to specify the \textbf{A4 format} (21 cm 146 | x 29.7 cm) when formatting the paper. When working with 147 | {\tt dvips}, for instance, one should specify {\tt -t a4}. 148 | Or using the command \verb|\special{papersize=210mm,297mm}| in the latex 149 | preamble (directly below the \verb|\usepackage| commands). Then using 150 | {\tt dvipdf} and/or {\tt pdflatex} which would make it easier for some. 151 | 152 | 153 | Print-outs of the PDF file on A4 paper should be identical to the 154 | hardcopy version. If you cannot meet the above requirements about the 155 | production of your electronic submission, please contact the 156 | publication chairs as soon as possible. 157 | 158 | 159 | \subsection{Layout} 160 | \label{ssec:layout} 161 | 162 | Format manuscripts two columns to a page, in the manner these 163 | instructions are formatted. The exact dimensions for a page on A4 164 | paper are: 165 | 166 | \begin{itemize} 167 | \item Left and right margins: 2.5 cm 168 | \item Top margin: 2.5 cm 169 | \item Bottom margin: 2.5 cm 170 | \item Column width: 7.7 cm 171 | \item Column height: 24.7 cm 172 | \item Gap between columns: 0.6 cm 173 | \end{itemize} 174 | 175 | \noindent Papers should not be submitted on any other paper size. 176 | If you cannot meet the above requirements about the production of 177 | your electronic submission, please contact the publication chairs 178 | above as soon as possible. 179 | 180 | 181 | \subsection{Fonts} 182 | 183 | For reasons of uniformity, Adobe's {\bf Times Roman} font should be 184 | used. 
In \LaTeX2e{} this is accomplished by putting 185 | 186 | \begin{quote} 187 | \begin{verbatim} 188 | \usepackage{times} 189 | \usepackage{latexsym} 190 | \end{verbatim} 191 | \end{quote} 192 | in the preamble. If Times Roman is unavailable, use {\bf Computer 193 | Modern Roman} (\LaTeX2e{}'s default). Note that the latter is about 194 | 10\% less dense than Adobe's Times Roman font. 195 | 196 | 197 | \begin{table}[h] 198 | \begin{center} 199 | \begin{tabular}{|l|rl|} 200 | \hline \bf Type of Text & \bf Font Size & \bf Style \\ \hline 201 | paper title & 15 pt & bold \\ 202 | author names & 12 pt & bold \\ 203 | author affiliation & 12 pt & \\ 204 | the word ``Abstract'' & 12 pt & bold \\ 205 | section titles & 12 pt & bold \\ 206 | document text & 11 pt &\\ 207 | captions & 11 pt & \\ 208 | abstract text & 10 pt & \\ 209 | bibliography & 10 pt & \\ 210 | footnotes & 9 pt & \\ 211 | \hline 212 | \end{tabular} 213 | \end{center} 214 | \caption{\label{font-table} Font guide. } 215 | \end{table} 216 | 217 | \subsection{The First Page} 218 | \label{ssec:first} 219 | 220 | Center the title, author's name(s) and affiliation(s) across both 221 | columns. Do not use footnotes for affiliations. Do not include the 222 | paper ID number assigned during the submission process. Use the 223 | two-column format only when you begin the abstract. 224 | 225 | {\bf Title}: Place the title centered at the top of the first page, in 226 | a 15-point bold font. (For a complete guide to font sizes and styles, 227 | see Table~\ref{font-table}) Long titles should be typed on two lines 228 | without a blank line intervening. Approximately, put the title at 2.5 229 | cm from the top of the page, followed by a blank line, then the 230 | author's names(s), and the affiliation on the following line. Do not 231 | use only initials for given names (middle initials are allowed). Do 232 | not format surnames in all capitals (e.g., use ``Schlangen'' not 233 | ``SCHLANGEN''). Do not format title and section headings in all 234 | capitals as well except for proper names (such as ``BLEU'') that are 235 | conventionally in all capitals. The affiliation should contain the 236 | author's complete address, and if possible, an electronic mail 237 | address. Start the body of the first page 7.5 cm from the top of the 238 | page. 239 | 240 | The title, author names and addresses should be completely identical 241 | to those entered to the electronical paper submission website in order 242 | to maintain the consistency of author information among all 243 | publications of the conference. If they are different, the publication 244 | chairs may resolve the difference without consulting with you; so it 245 | is in your own interest to double-check that the information is 246 | consistent. 247 | 248 | {\bf Abstract}: Type the abstract at the beginning of the first 249 | column. The width of the abstract text should be smaller than the 250 | width of the columns for the text in the body of the paper by about 251 | 0.6 cm on each side. Center the word {\bf Abstract} in a 12 point bold 252 | font above the body of the abstract. The abstract should be a concise 253 | summary of the general thesis and conclusions of the paper. It should 254 | be no longer than 200 words. The abstract text should be in 10 point font. 255 | 256 | {\bf Text}: Begin typing the main body of the text immediately after 257 | the abstract, observing the two-column format as shown in 258 | the present document. Do not include page numbers. 
259 | 260 | {\bf Indent} when starting a new paragraph. Use 11 points for text and 261 | subsection headings, 12 points for section headings and 15 points for 262 | the title. 263 | 264 | \subsection{Sections} 265 | 266 | {\bf Headings}: Type and label section and subsection headings in the 267 | style shown on the present document. Use numbered sections (Arabic 268 | numerals) in order to facilitate cross references. Number subsections 269 | with the section number and the subsection number separated by a dot, 270 | in Arabic numerals. Do not number subsubsections. 271 | 272 | {\bf Citations}: Citations within the text appear in parentheses 273 | as~\cite{Gusfield:97} or, if the author's name appears in the text 274 | itself, as Gusfield~\shortcite{Gusfield:97}. Append lowercase letters 275 | to the year in cases of ambiguity. Treat double authors as 276 | in~\cite{Aho:72}, but write as in~\cite{Chandra:81} when more than two 277 | authors are involved. Collapse multiple citations as 278 | in~\cite{Gusfield:97,Aho:72}. Also refrain from using full citations 279 | as sentence constituents. We suggest that instead of 280 | \begin{quote} 281 | ``\cite{Gusfield:97} showed that ...'' 282 | \end{quote} 283 | you use 284 | \begin{quote} 285 | ``Gusfield \shortcite{Gusfield:97} showed that ...'' 286 | \end{quote} 287 | 288 | If you are using the provided \LaTeX{} and Bib\TeX{} style files, you 289 | can use the command \verb|\newcite| to get ``author (year)'' citations. 290 | 291 | As reviewing will be double-blind, the submitted version of the papers 292 | should not include the authors' names and affiliations. Furthermore, 293 | self-references that reveal the author's identity, e.g., 294 | \begin{quote} 295 | ``We previously showed \cite{Gusfield:97} ...'' 296 | \end{quote} 297 | should be avoided. Instead, use citations such as 298 | \begin{quote} 299 | ``Gusfield \shortcite{Gusfield:97} 300 | previously showed ... '' 301 | \end{quote} 302 | 303 | \textbf{Please do not use anonymous citations} and do not include 304 | acknowledgements when submitting your papers. Papers that do not 305 | conform to these requirements may be rejected without review. 306 | 307 | \textbf{References}: Gather the full set of references together under 308 | the heading {\bf References}; place the section before any Appendices, 309 | unless they contain references. Arrange the references alphabetically 310 | by first author, rather than by order of occurrence in the text. 311 | Provide as complete a citation as possible, using a consistent format, 312 | such as the one for {\em Computational Linguistics\/} or the one in the 313 | {\em Publication Manual of the American 314 | Psychological Association\/}~\cite{APA:83}. Use of full names for 315 | authors rather than initials is preferred. A list of abbreviations 316 | for common computer science journals can be found in the ACM 317 | {\em Computing Reviews\/}~\cite{ACM:83}. 318 | 319 | The \LaTeX{} and Bib\TeX{} style files provided roughly fit the 320 | American Psychological Association format, allowing regular citations, 321 | short citations and multiple citations as described above. 322 | 323 | {\bf Appendices}: Appendices, if any, directly follow the text and the 324 | references (but see above). Letter them in sequence and provide an 325 | informative title: {\bf Appendix A. Title of Appendix}. 326 | 327 | \subsection{Footnotes} 328 | 329 | {\bf Footnotes}: Put footnotes at the bottom of the page and use 9 330 | points text. 
They may be numbered or referred to by asterisks or other 331 | symbols.\footnote{This is how a footnote should appear.} Footnotes 332 | should be separated from the text by a line.\footnote{Note the line 333 | separating the footnotes from the text.} 334 | 335 | \subsection{Graphics} 336 | 337 | {\bf Illustrations}: Place figures, tables, and photographs in the 338 | paper near where they are first discussed, rather than at the end, if 339 | possible. Wide illustrations may run across both columns. Color 340 | illustrations are discouraged, unless you have verified that 341 | they will be understandable when printed in black ink. 342 | 343 | {\bf Captions}: Provide a caption for every illustration; number each one 344 | sequentially in the form: ``Figure 1. Caption of the Figure.'' ``Table 1. 345 | Caption of the Table.'' Type the captions of the figures and 346 | tables below the body, using 11 point text. 347 | 348 | 349 | \section{XML conversion and supported \LaTeX\ packages} 350 | 351 | Following ACL 2014 we will also we will attempt to automatically convert 352 | your \LaTeX\ source files to publish papers in machine-readable 353 | XML with semantic markup in the ACL Anthology, in addition to the 354 | traditional PDF format. This will allow us to create, over the next 355 | few years, a growing corpus of scientific text for our own future research, 356 | and picks up on recent initiatives on converting ACL papers from earlier 357 | years to XML. 358 | 359 | We encourage you to submit a ZIP file of your \LaTeX\ sources along 360 | with the camera-ready version of your paper. We will then convert them 361 | to XML automatically, using the LaTeXML tool 362 | (\url{http://dlmf.nist.gov/LaTeXML}). LaTeXML has \emph{bindings} for 363 | a number of \LaTeX\ packages, including the ACL 2015 stylefile. These 364 | bindings allow LaTeXML to render the commands from these packages 365 | correctly in XML. For best results, we encourage you to use the 366 | packages that are officially supported by LaTeXML, listed at 367 | \url{http://dlmf.nist.gov/LaTeXML/manual/included.bindings} 368 | 369 | 370 | 371 | 372 | 373 | \section{Translation of non-English Terms} 374 | 375 | It is also advised to supplement non-English characters and terms 376 | with appropriate transliterations and/or translations 377 | since not all readers understand all such characters and terms. 378 | Inline transliteration or translation can be represented in 379 | the order of: original-form transliteration ``translation''. 380 | 381 | \section{Length of Submission} 382 | \label{sec:length} 383 | 384 | Long papers may consist of up to 8 pages of content, plus two extra 385 | pages for references. Short papers may consist of up to 4 pages of 386 | content, plus two extra pages for references. Papers that do not 387 | conform to the specified length and formatting requirements may be 388 | rejected without review. 389 | 390 | 391 | 392 | \section*{Acknowledgments} 393 | 394 | The acknowledgments should go immediately before the references. Do 395 | not number the acknowledgments section. Do not include this section 396 | when submitting your paper for review. 397 | 398 | % include your own bib file like this: 399 | %\bibliographystyle{acl} 400 | %\bibliography{acl2015} 401 | 402 | \begin{thebibliography}{} 403 | 404 | \bibitem[\protect\citename{Aho and Ullman}1972]{Aho:72} 405 | Alfred~V. Aho and Jeffrey~D. Ullman. 406 | \newblock 1972. 407 | \newblock {\em The Theory of Parsing, Translation and Compiling}, volume~1. 
408 | \newblock Prentice-{Hall}, Englewood Cliffs, NJ. 409 | 410 | \bibitem[\protect\citename{{American Psychological Association}}1983]{APA:83} 411 | {American Psychological Association}. 412 | \newblock 1983. 413 | \newblock {\em Publications Manual}. 414 | \newblock American Psychological Association, Washington, DC. 415 | 416 | \bibitem[\protect\citename{{Association for Computing Machinery}}1983]{ACM:83} 417 | {Association for Computing Machinery}. 418 | \newblock 1983. 419 | \newblock {\em Computing Reviews}, 24(11):503--512. 420 | 421 | \bibitem[\protect\citename{Chandra \bgroup et al.\egroup }1981]{Chandra:81} 422 | Ashok~K. Chandra, Dexter~C. Kozen, and Larry~J. Stockmeyer. 423 | \newblock 1981. 424 | \newblock Alternation. 425 | \newblock {\em Journal of the Association for Computing Machinery}, 426 | 28(1):114--133. 427 | 428 | \bibitem[\protect\citename{Gusfield}1997]{Gusfield:97} 429 | Dan Gusfield. 430 | \newblock 1997. 431 | \newblock {\em Algorithms on Strings, Trees and Sequences}. 432 | \newblock Cambridge University Press, Cambridge, UK. 433 | 434 | \end{thebibliography} 435 | 436 | \end{document} 437 | -------------------------------------------------------------------------------- /tex/deepir.bbl: -------------------------------------------------------------------------------- 1 | \begin{thebibliography}{} 2 | 3 | \bibitem[\protect\citename{Besag}1974]{besag_spatial_1974} 4 | Julian Besag. 5 | \newblock 1974. 6 | \newblock Spatial interaction and the statistical analysis of lattice systems. 7 | \newblock {\em Journal of the Royal Statistical Society, Series B}. 8 | 9 | \bibitem[\protect\citename{Besag}1975]{besag1975statistical} 10 | Julian Besag. 11 | \newblock 1975. 12 | \newblock Statistical analysis of non-lattice data. 13 | \newblock {\em The Statistician}, pages 179--195. 14 | 15 | \bibitem[\protect\citename{Flynn \bgroup et al.\egroup 16 | }2013]{flynn_efficiency_2013} 17 | Cheryl Flynn, Clifford Hurvich, and Jefferey Simonoff. 18 | \newblock 2013. 19 | \newblock Efficiency for {Regularization} {Parameter} {Selection} in 20 | {Penalized} {Likelihood} {Estimation} of {Misspecified} {Models}. 21 | \newblock {\em Journal of the American Statistical Association}, 22 | 108:1031--1043. 23 | 24 | \bibitem[\protect\citename{Jernite \bgroup et al.\egroup }2015]{jernite2015mrf} 25 | Yacine Jernite, Alexander Rush, and David Sontag. 26 | \newblock 2015. 27 | \newblock A fast variational approach for learning {M}arkov random field 28 | language models. 29 | \newblock In {\em Proceedings of the 32nd International Conference on Machine 30 | Learning (ICML 2015)}. 31 | 32 | \bibitem[\protect\citename{Le and Mikolov}2014]{le_distributed_2014} 33 | Quoc~V. Le and Tomas Mikolov. 34 | \newblock 2014. 35 | \newblock Distributed representations of sentences and documents. 36 | \newblock In {\em Proceedings of the 31 st {International} {Conference} on 37 | {Machine} {Learning}}. 38 | 39 | \bibitem[\protect\citename{Mikolov \bgroup et al.\egroup 40 | }2013a]{mikolov2013efficient} 41 | Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 42 | \newblock 2013a. 43 | \newblock Efficient estimation of word representations in vector space. 44 | \newblock {\em arXiv preprint arXiv:1301.3781}. 45 | 46 | \bibitem[\protect\citename{Mikolov \bgroup et al.\egroup 47 | }2013b]{mikolov_distributed_2013} 48 | Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg~S. Corrado, and Jeff Dean. 49 | \newblock 2013b. 
50 | \newblock Distributed representations of words and phrases and their 51 | compositionality. 52 | \newblock In {\em Advances in {Neural} {Information} {Processing} {Systems}}, 53 | pages 3111--3119. 54 | 55 | \bibitem[\protect\citename{Molenberghs and Verbeke}2006]{molenberghs2006models} 56 | Geert Molenberghs and Geert Verbeke. 57 | \newblock 2006. 58 | \newblock {\em Models for discrete longitudinal data}. 59 | \newblock Springer Science \& Business Media. 60 | 61 | \bibitem[\protect\citename{Morin and Bengio}2005]{morin_hierarchical_2005} 62 | Frederic Morin and Yoshua Bengio. 63 | \newblock 2005. 64 | \newblock Hierarchical probabilistic neural network language model. 65 | \newblock In {\em Proceedings of the {International} {Workshop} on {Artificial} 66 | {Intelligence} and {Statistics}}, pages 246--252. 67 | 68 | \bibitem[\protect\citename{Ng and Jordan}2002]{ng_discriminative_2002} 69 | Andrew~Y. Ng and Michael~I. Jordan. 70 | \newblock 2002. 71 | \newblock On {Discriminative} vs {Generative} {Classifiers}: {A} {Comparison} 72 | of {Logistic} {Regression} and naive {Bayes}. 73 | \newblock In {\em Advances in {Neural} {Information} {Processing} {Systems} 74 | ({NIPS})}. 75 | 76 | \bibitem[\protect\citename{Pennington \bgroup et al.\egroup 77 | }2014]{pennington_glove:_2014} 78 | Jeffrey Pennington, Richard Socher, and Christopher~D. Manning. 79 | \newblock 2014. 80 | \newblock Glove: {Global} vectors for word representation. 81 | \newblock {\em Proceedings of the Empiricial Methods in Natural Language 82 | Processing (EMNLP 2014)}, 12. 83 | 84 | \bibitem[\protect\citename{Rehurek and Sojka}2010]{rehurek_software_2010} 85 | Radim Rehurek and Petr Sojka. 86 | \newblock 2010. 87 | \newblock Software {Framework} for {Topic} {Modelling} with {Large} {Corpora}. 88 | \newblock In {\em Proceedings of the {LREC} 2010 {Workshop} on {New} 89 | {Challenges} for {NLP} {Frameworks}}, pages 45--50. 90 | 91 | \bibitem[\protect\citename{Rumelhart \bgroup et al.\egroup 92 | }1986]{rumelhart_learning_1986} 93 | David Rumelhart, Geoffrey Hinton, and Ronald Williams. 94 | \newblock 1986. 95 | \newblock Learning representations by back-propagating errors. 96 | \newblock {\em Nature}, 323:533--536. 97 | 98 | \bibitem[\protect\citename{Socher \bgroup et al.\egroup 99 | }2011]{socher_parsing_2011} 100 | Richard Socher, Cliff~C. Lin, Chris Manning, and Andrew~Y. Ng. 101 | \newblock 2011. 102 | \newblock Parsing natural scenes and natural language with recursive neural 103 | networks. 104 | \newblock In {\em Proceedings of the 28th international conference on machine 105 | learning ({ICML}-11)}, pages 129--136. 106 | 107 | \bibitem[\protect\citename{Socher \bgroup et al.\egroup 108 | }2013]{socher_recursive_2013} 109 | Richard Socher, Alex Perelygin, Jean~Y. Wu, Jason Chuang, Christopher~D. 110 | Manning, Andrew~Y. Ng, and Christopher Potts. 111 | \newblock 2013. 112 | \newblock Recursive deep models for semantic compositionality over a sentiment 113 | treebank. 114 | \newblock In {\em Proceedings of the conference on empirical methods in natural 115 | language processing ({EMNLP})}, volume 1631, page 1642. 116 | 117 | \bibitem[\protect\citename{Taddy}2013a]{taddy_measuring_2013} 118 | Matt Taddy. 119 | \newblock 2013a. 120 | \newblock Measuring {Political} {Sentiment} on {Twitter}: {Factor} {Optimal} 121 | {Design} for {Multinomial} {Inverse} {Regression}. 122 | \newblock {\em Technometrics}, 55(4):415--425, November. 
123 | 124 | \bibitem[\protect\citename{Taddy}2013b]{taddy_multinomial_2013} 125 | Matt Taddy. 126 | \newblock 2013b. 127 | \newblock Multinomial {Inverse} {Regression} for {Text} {Analysis}. 128 | \newblock {\em Journal of the American Statistical Association}, 108:755--770. 129 | 130 | \bibitem[\protect\citename{Taddy}2013c]{taddy_rejoinder:_2013} 131 | Matt Taddy. 132 | \newblock 2013c. 133 | \newblock Rejoinder: {Efficiency} and structure in {MNIR}. 134 | \newblock {\em Journal of the American Statistical Association}, 108:772--774. 135 | 136 | \bibitem[\protect\citename{Taddy}2014]{taddy_one-step_2014} 137 | Matt Taddy. 138 | \newblock 2014. 139 | \newblock One-step estimator paths for concave regularization. 140 | \newblock arXiv:1308.5623. 141 | 142 | \bibitem[\protect\citename{Taddy}2015]{taddy_distributed_2015} 143 | Matt Taddy. 144 | \newblock 2015. 145 | \newblock Distributed {Multinomial} {Regression}. 146 | \newblock {\em Annals of Applied Statistics}, To appear. 147 | 148 | \bibitem[\protect\citename{Varin \bgroup et al.\egroup 149 | }2011]{varin2011overview} 150 | Cristiano Varin, Nancy Reid, and David Firth. 151 | \newblock 2011. 152 | \newblock An overview of composite likelihood methods. 153 | \newblock {\em Statistica Sinica}, 21(1):5--42. 154 | 155 | \end{thebibliography} 156 | -------------------------------------------------------------------------------- /tex/deepir.bib: -------------------------------------------------------------------------------- 1 | @book{molenberghs2006models, 2 | title={Models for discrete longitudinal data}, 3 | author={Molenberghs, Geert and Verbeke, Geert}, 4 | year={2006}, 5 | publisher={Springer Science \& Business Media} 6 | } 7 | 8 | @article{mikolov2013efficient, 9 | title={Efficient estimation of word representations in vector space}, 10 | author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 11 | journal={arXiv preprint arXiv:1301.3781}, 12 | year={2013} 13 | } 14 | 15 | @article{besag_spatial_1974, 16 | title = {Spatial Interaction and the Statistical Analysis of Lattice Systems}, 17 | journal = {Journal of the Royal Statistical Society, Series B}, 18 | author = {Besag, Julian}, 19 | year = {1974} 20 | } 21 | 22 | @article{besag1975statistical, 23 | title={Statistical analysis of non-lattice data}, 24 | author={Besag, Julian}, 25 | journal={The Statistician}, 26 | pages={179--195}, 27 | year={1975} 28 | } 29 | 30 | @inproceedings{jernite2015mrf, 31 | title = {A Fast Variational Approach for Learning {M}arkov Random Field Language Models}, 32 | author = {Yacine Jernite and Alexander Rush and David Sontag}, 33 | booktitle={Proceedings of the 32nd International Conference on Machine Learning (ICML 2015)}, 34 | year={2015} 35 | } 36 | 37 | @article{cox2004note, 38 | title={A note on pseudolikelihood constructed from marginal densities}, 39 | author={Cox, David R and Reid, Nancy}, 40 | journal={Biometrika}, 41 | volume={91}, 42 | number={3}, 43 | pages={729--737}, 44 | year={2004}, 45 | publisher={Biometrika Trust} 46 | } 47 | 48 | @book{verbeke2009linear, 49 | title={Linear mixed models for longitudinal data}, 50 | author={Verbeke, Geert and Molenberghs, Geert}, 51 | year={2009}, 52 | publisher={Springer Science \& Business Media} 53 | } 54 | 55 | @article{varin2011overview, 56 | title={An overview of composite likelihood methods}, 57 | author={Varin, Cristiano and Reid, Nancy and Firth, David}, 58 | journal={Statistica Sinica}, 59 | volume={21}, 60 | number={1}, 61 | pages={5--42}, 62 | year={2011} 63 | } 64 | 65 | 
@article{pang_opinion_2008, 66 | title = {Opinion {Mining} and {Sentiment} {Analysis}}, 67 | volume = {1-2}, 68 | journal = {Foundations and Trends in Information Retrieval}, 69 | author = {Pang, Bo and Lee, Lillian}, 70 | year = {2008}, 71 | pages = {1--135} 72 | } 73 | 74 | @article{efron_efficiency_1975, 75 | title = {The efficiency of logistic regression compared to normal discriminant analysis}, 76 | number = {70}, 77 | journal = {Journal of the American Statistical Association}, 78 | author = {Efron, Bradley}, 79 | year = {1975}, 80 | pages = {892--898} 81 | } 82 | 83 | @inproceedings{taddy_estimation_2012, 84 | title = {On {Estimation} and {Selection} for {Topic} {Models}}, 85 | booktitle = {Proceedings of the 15th {International} {Conference} on {Artificial} {Intelligence} and {Statistics} ({AISTATS} 2012)}, 86 | author = {Taddy, Matt}, 87 | year = {2012} 88 | } 89 | 90 | @inproceedings{mikolov_distributed_2013, 91 | title = {Distributed representations of words and phrases and their compositionality}, 92 | url = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality}, 93 | urldate = {2014-10-28}, 94 | booktitle = {Advances in {Neural} {Information} {Processing} {Systems}}, 95 | author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeff}, 96 | year = {2013}, 97 | pages = {3111--3119} 98 | } 99 | 100 | @article{taddy_rejoinder:_2013, 101 | title = {Rejoinder: {Efficiency} and structure in {MNIR}}, 102 | volume = {108}, 103 | journal = {Journal of the American Statistical Association}, 104 | author = {Taddy, Matt}, 105 | year = {2013}, 106 | pages = {772--774} 107 | } 108 | 109 | @article{taddy_multinomial_2013, 110 | title = {Multinomial {Inverse} {Regression} for {Text} {Analysis}}, 111 | volume = {108}, 112 | journal = {Journal of the American Statistical Association}, 113 | author = {Taddy, Matt}, 114 | year = {2013}, 115 | pages = {755--770} 116 | } 117 | 118 | @inproceedings{socher_recursive_2013, 119 | title = {Recursive deep models for semantic compositionality over a sentiment treebank}, 120 | volume = {1631}, 121 | url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.383.1327&rep=rep1&type=pdf}, 122 | urldate = {2015-04-24}, 123 | booktitle = {Proceedings of the conference on empirical methods in natural language processing ({EMNLP})}, 124 | author = {Socher, Richard and Perelygin, Alex and Wu, Jean Y. and Chuang, Jason and Manning, Christopher D. and Ng, Andrew Y. 
and Potts, Christopher}, 125 | year = {2013}, 126 | pages = {1642} 127 | } 128 | 129 | @inproceedings{pang_thumbs_2002, 130 | title = {Thumbs up?: sentiment classification using machine learning techniques}, 131 | shorttitle = {Thumbs up?}, 132 | url = {http://dl.acm.org/citation.cfm?id=1118704}, 133 | urldate = {2014-10-28}, 134 | booktitle = {Proceedings of the {ACL}-02 conference on {Empirical} methods in natural language processing-{Volume} 10}, 135 | publisher = {Association for Computational Linguistics}, 136 | author = {Pang, Bo and Lee, Lillian and Vaithyanathan, Shivakumar}, 137 | year = {2002}, 138 | pages = {79--86} 139 | } 140 | 141 | @article{pennington_glove:_2014, 142 | title = {Glove: {Global} vectors for word representation}, 143 | volume = {12}, 144 | shorttitle = {Glove}, 145 | url = {http://nlp.stanford.edu/projects/glove/glove.pdf}, 146 | urldate = {2015-04-24}, 147 | journal = {Proceedings of the Empiricial Methods in Natural Language Processing (EMNLP 2014)}, 148 | author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.}, 149 | year = {2014} 150 | } 151 | 152 | @article{mosteller_inference_1963, 153 | title = {Inference in an {Authorship} {Problem}}, 154 | volume = {58}, 155 | journal = {Journal of the American Statistical Association}, 156 | author = {Mosteller, Frederick and Wallace, David L.}, 157 | year = {1963}, 158 | pages = {275--309} 159 | } 160 | 161 | @article{taddy_one-step_2014, 162 | title = {One-step estimator paths for concave regularization}, 163 | author = {Taddy, Matt}, 164 | year = {2014}, 165 | note = {arXiv:1308.5623} 166 | } 167 | 168 | @article{sebastiani_machine_2002, 169 | title = {Machine {Learning} in {Automated} {Test} {Categorization}}, 170 | volume = {34}, 171 | journal = {ACM Computing Surveys}, 172 | author = {Sebastiani, Fabrizio}, 173 | year = {2002}, 174 | pages = {1--47} 175 | } 176 | 177 | @article{taddy_distributed_2015, 178 | title = {Distributed {Multinomial} {Regression}}, 179 | volume = {To appear}, 180 | journal = {Annals of Applied Statistics}, 181 | author = {Taddy, Matt}, 182 | year = {2015} 183 | } 184 | 185 | @inproceedings{joshi_movie_2010, 186 | title = {Movie reviews and revenues: {An} experiment in text regression}, 187 | shorttitle = {Movie reviews and revenues}, 188 | url = {http://dl.acm.org/citation.cfm?id=1858037}, 189 | urldate = {2014-10-28}, 190 | booktitle = {Human {Language} {Technologies}: {The} 2010 {Annual} {Conference} of the {North} {American} {Chapter} of the {Association} for {Computational} {Linguistics}}, 191 | publisher = {Association for Computational Linguistics}, 192 | author = {Joshi, Mahesh and Das, Dipanjan and Gimpel, Kevin and Smith, Noah A.}, 193 | year = {2010}, 194 | pages = {293--296} 195 | } 196 | 197 | @inproceedings{rehurek_software_2010, 198 | title = {Software {Framework} for {Topic} {Modelling} with {Large} {Corpora}}, 199 | booktitle = {Proceedings of the {LREC} 2010 {Workshop} on {New} {Challenges} for {NLP} {Frameworks}}, 200 | author = {Rehurek, Radim and Sojka, Petr}, 201 | year = {2010}, 202 | pages = {45--50} 203 | } 204 | 205 | @inproceedings{ng_discriminative_2002, 206 | title = {On {Discriminative} vs {Generative} {Classifiers}: {A} {Comparison} of {Logistic} {Regression} and naive {Bayes}}, 207 | booktitle = {Advances in {Neural} {Information} {Processing} {Systems} ({NIPS})}, 208 | author = {Ng, Andrew Y. 
and Jordan, Michael I.}, 209 | year = {2002} 210 | } 211 | 212 | @inproceedings{socher_parsing_2011, 213 | title = {Parsing natural scenes and natural language with recursive neural networks}, 214 | url = {http://machinelearning.wustl.edu/mlpapers/paper_files/ICML2011Socher_125.pdf}, 215 | urldate = {2015-04-25}, 216 | booktitle = {Proceedings of the 28th international conference on machine learning ({ICML}-11)}, 217 | author = {Socher, Richard and Lin, Cliff C. and Manning, Chris and Ng, Andrew Y.}, 218 | year = {2011}, 219 | pages = {129--136} 220 | } 221 | 222 | @article{flynn_efficiency_2013, 223 | title = {Efficiency for {Regularization} {Parameter} {Selection} in {Penalized} {Likelihood} {Estimation} of {Misspecified} {Models}}, 224 | volume = {108}, 225 | journal = {Journal of the American Statistical Association}, 226 | author = {Flynn, Cheryl and Hurvich, Clifford and Simonoff, Jefferey}, 227 | year = {2013}, 228 | pages = {1031--1043} 229 | } 230 | 231 | @article{rumelhart_learning_1986, 232 | title = {Learning representations by back-propagating errors}, 233 | volume = {323}, 234 | journal = {Nature}, 235 | author = {Rumelhart, David and Hinton, Geoffrey and Williams, Ronald}, 236 | year = {1986}, 237 | pages = {533--536} 238 | } 239 | 240 | @article{blei_latent_2003, 241 | title = {Latent {Dirichlet} {Allocation}}, 242 | volume = {3}, 243 | url = {http://dl.acm.org/citation.cfm?id=944937}, 244 | urldate = {2013-11-01}, 245 | journal = {the Journal of machine Learning research}, 246 | author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.}, 247 | year = {2003}, 248 | pages = {993--1022} 249 | } 250 | 251 | @inproceedings{morin_hierarchical_2005, 252 | title = {Hierarchical probabilistic neural network language model}, 253 | url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.8829&rep=rep1&type=pdf#page=255}, 254 | urldate = {2015-04-24}, 255 | booktitle = {Proceedings of the {International} {Workshop} on {Artificial} {Intelligence} and {Statistics}}, 256 | author = {Morin, Frederic and Bengio, Yoshua}, 257 | year = {2005}, 258 | pages = {246--252} 259 | } 260 | 261 | @inproceedings{le_distributed_2014, 262 | title = {Distributed representations of sentences and documents}, 263 | url = {http://arxiv.org/abs/1405.4053}, 264 | urldate = {2015-04-24}, 265 | booktitle = {Proceedings of the 31 st {International} {Conference} on {Machine} {Learning}}, 266 | author = {Le, Quoc V. 
and Mikolov, Tomas}, 267 | year = {2014} 268 | } 269 | 270 | @article{taddy_measuring_2013, 271 | title = {Measuring {Political} {Sentiment} on {Twitter}: {Factor} {Optimal} {Design} for {Multinomial} {Inverse} {Regression}}, 272 | volume = {55}, 273 | issn = {0040-1706, 1537-2723}, 274 | shorttitle = {Measuring {Political} {Sentiment} on {Twitter}}, 275 | url = {http://www.tandfonline.com/doi/abs/10.1080/00401706.2013.778791}, 276 | doi = {10.1080/00401706.2013.778791}, 277 | language = {en}, 278 | number = {4}, 279 | urldate = {2014-10-28}, 280 | journal = {Technometrics}, 281 | author = {Taddy, Matt}, 282 | month = nov, 283 | year = {2013}, 284 | pages = {415--425} 285 | } -------------------------------------------------------------------------------- /tex/deepir.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/deepir.pdf -------------------------------------------------------------------------------- /tex/deepir.tex: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | \usepackage{acl2015} 4 | \usepackage{times} 5 | \usepackage{url} 6 | \usepackage{dsfont} 7 | \usepackage{latexsym} 8 | \usepackage{graphicx} 9 | 10 | 11 | \title{Document Classification by Inversion of \\Distributed Language Representations} 12 | 13 | \author{Matt Taddy \\ 14 | University of Chicago Booth School of Business \\ 15 | {\tt taddy@chicagobooth.edu} \\} 16 | 17 | \date{} 18 | 19 | \begin{document} 20 | \maketitle 21 | \begin{abstract} 22 | There have been many recent advances in the structure and measurement of {\it distributed} language models: those that map from words to a vector-space that is rich in information about word choice and composition. This vector-space is the distributed language representation. 23 | 24 | 25 | The goal of this note is to point out that any distributed representation can be turned into a classifier through inversion via Bayes rule. 26 | The approach is simple and modular, in that it will work with any language representation whose training can be formulated as optimizing a probability model. In our application to 2 million sentences from Yelp reviews, we also find that it performs as well as or better than complex purpose-built algorithms. \end{abstract} 27 | 28 | \section{Introduction} 29 | 30 | Distributed, or vector-space, language representations $\mathcal{V}$ consist 31 | of a location, or embedding, for every vocabulary {\it word} in $\mathds{R}^K$, where 32 | $K$ is the dimension of the latent representation space. These locations 33 | are learned to optimize, perhaps approximately, an objective function 34 | defined on the original text such as a likelihood for word occurrences. 35 | 36 | A popular example is the Word2Vec machinery of 37 | Mikolov et al.~\shortcite{mikolov_distributed_2013}. This trains the distributed 38 | representation to be useful as an input layer for prediction of words from 39 | their neighbors in a Skip-gram likelihood. 
That is, to maximize 40 | \begin{equation}\label{eq:skipgram} 41 | \sum_{j\neq t,~j=t-b}^{t+b} \log\mathrm{p}_{\mathcal{V}}(w_{sj}\mid w_{st}) 42 | \end{equation} 43 | summed across all words $w_{st}$ in all sentences $\mathbf{w}_s$, where $b$ is 44 | the skip-gram window (truncated by the ends of the 45 | sentence) and $\mathrm{p}_{\mathcal{V}}(w_{sj}| w_{st})$ is a neural network 46 | classifier that takes vector representations for $w_{st}$ and $w_{sj}$ 47 | as input (see Section \ref{sec:w2v}). 48 | 49 | Distributed language representations have been studied since the early work on 50 | neural networks \cite{rumelhart_learning_1986} and have long been applied in 51 | natural language processing \cite{morin_hierarchical_2005}. The models are 52 | generating much recent interest due to the large performance gains from the 53 | newer systems, including Word2Vec and the Glove model of Pennington et 54 | al.~\shortcite{pennington_glove:_2014}, observed in, e.g., word 55 | prediction, word analogy identification, and named entity recognition. 56 | 57 | Given the success of these new models, researchers have begun searching for 58 | ways to adapt the representations for use in document classification tasks 59 | such as sentiment prediction or author identification. One naive approach is 60 | to use aggregated word vectors across a document (e.g., a document's average 61 | word-vector location) as input to a standard classifier (e.g., 62 | logistic regression). However, a document is actually an {\it ordered} path 63 | of locations through $\mathds{R}^K$, and simple averaging destroys much of the available 64 | information. 65 | 66 | More sophisticated aggregation is proposed in Socher et al. 67 | \shortcite{socher_parsing_2011,socher_recursive_2013}, where recursive neural 68 | networks are used to combine the word vectors through the estimated parse tree 69 | for each sentence. Alternatively, Le and Mikolov's Doc2Vec 70 | \shortcite{le_distributed_2014} adds document labels to the conditioning set 71 | in (\ref{eq:skipgram}) and has them influence the skip-gram likelihood through 72 | a latent input vector location in $\mathcal{V}$. In each case, the end product 73 | is a distributed representation for every sentence (or document for Doc2Vec) 74 | that can be used as input to a generic classifier. 75 | 76 | \subsection{Bayesian Inversion} 77 | 78 | These approaches all add considerable model and estimation complexity to the 79 | original underlying distributed representation. We are proposing a 80 | simple alternative that turns fitted distributed language representations into 81 | document classifiers without any additional modeling or estimation. 82 | 83 | A typical language model is trained to maximize the likelihoods of single words and their neighbors. For example, the skip-gram 84 | in (\ref{eq:skipgram}) represents conditional probability for a 85 | word's context (surrounding words), while the alternative CBOW Word2Vec 86 | specification \cite{mikolov2013efficient} targets the conditional probability 87 | for each word given its context. Although these objectives do not correspond to a full document likelihood model, they can be interpreted as components in a \textit{composite likelihood}\footnote{Composite likelihoods are a common tool in analysis of spatial data and data on graphs. 
They were popularized in statistics by Besag's \shortcite{besag_spatial_1974,besag1975statistical} work on the pseudolikelihood -- $\mathrm{p}(\mathbf{w}) \approx \prod_j \mathrm{p}(w_j |\mathbf{w}_{-j})$ -- for analysis of Markov random fields. See Varin et al. \shortcite{varin2011overview} for a detailed review.} approximation. 88 | 89 | Use $\mathbf{w} = [w_1\dots w_T]'$ to denote a sentence: an ordered vector of words. 90 | The skip-gram in 91 | (\ref{eq:skipgram}) yields the pairwise composite log likelihood\footnote{See Molenberghs and Verbeke \shortcite{molenberghs2006models} for similar pairwise compositions in analysis of longitudinal data.} 92 | \begin{equation}\label{eq:sentencelhd} \log\mathrm{p}_{ \mathcal{V}}(\mathbf{w}) = 93 | \sum_{j=1}^T\sum_{k=1}^T \mathds{1}_{\left[1\leq |k-j| \leq b\right]} \log\mathrm{p}_{ \mathcal{V}}(w_{k}| 94 | w_{j} ). \end{equation} 95 | In another example, Jernite et al.~\shortcite{jernite2015mrf} show that CBOW Word2Vec corresponds to the pseudolikelihood for a Markov random field sentence model. 96 | 97 | Finally, given a sentence likelihood as in (\ref{eq:sentencelhd}), document $d = 98 | \{\mathbf{w}_1, ... \mathbf{w}_S\}$ has log likelihood 99 | \begin{equation}\label{eq:fulllhd} \log\mathrm{p}_{ \mathcal{V}}(d) = 100 | \sum_{s} \log\mathrm{p}_{ \mathcal{V}}(\mathbf{w}_s). \end{equation} 101 | 102 | 103 | Now suppose that your training documents are grouped by class label, $y \in 104 | \{1 \dots C\}$. We can train {\it separate} distributed language representations 105 | for each set of documents as partitioned by $y$; for example, fit Word2Vec independently on each sub-corpus $D_c = \{ d_i : y_i =c \}$ and obtain the labeled distributed representation map $\mathcal{V}_c$. A new document $d$ has probability 106 | $\mathrm{p}_{ \mathcal{V}_c}(d)$ if we treat it as a member of class $c$, and Bayes rule implies 107 | \begin{equation}\label{eq:bayesrule} 108 | \mathrm{p}( y | d) = \frac{\mathrm{p}_{ \mathcal{V}_y}(d)\pi_y } 109 | {\sum_c \mathrm{p}_{ \mathcal{V}_c}(d)\pi_c } 110 | \end{equation} 111 | where $\pi_c$ is our prior probability on class label $c$. 112 | 113 | Thus distributed language representations trained separately for each class label 114 | yield directly a document classification rule via (\ref{eq:bayesrule}). This 115 | approach has a number of attractive qualities. 116 | 117 | \vskip .1cm 118 | \noindent \textbf{Simplicity:} The inversion strategy works for any model of 119 | language that can (or its training can) be interpreted as a probabilistic 120 | model. This makes for easy implementation in systems that are already 121 | engineered to fit such language representations, leading to faster deployment and lower development costs. 122 | The strategy is also interpretable: whatever intuition one has about the 123 | distributed language model can be applied directly to the 124 | inversion-based classification rule. Inversion adds a 125 | plausible model for reader understanding on top of any given language 126 | representation. 127 | 128 | \vskip .1cm 129 | \noindent \textbf{Scalability:} when working with 130 | massive corpora it is often useful to split the data into blocks as part of 131 | distributed computing strategies. Our model of classification via inversion 132 | provides a convenient top-level partitioning of the data. 
An efficient system 133 | could fit separate by-class language representations, which 134 | will provide for document classification as in this article as well as 135 | class-specific answers for NLP tasks such as word prediction or analogy. When 136 | one wishes to treat a document as unlabeled, NLP tasks can be answered through 137 | ensemble aggregation of the class-specific answers. 138 | 139 | \vskip .1cm 140 | \noindent \textbf{Performance:} We find that, in our examples, inversion of 141 | Word2Vec yields lower misclassification rates than both Doc2Vec-based 142 | classification and the multinomial inverse regression (MNIR) of Taddy 143 | \shortcite{taddy_multinomial_2013}. We did not anticipate such an outright 144 | performance gain. Moreover, we expect that with calibration (i.e., through 145 | cross-validation) of the many tuning parameters available when 146 | fitting both Word and Doc 2Vec, the performance results will change. Indeed, 147 | we find that all methods are often outperformed by phrase-count logistic 148 | regression with rare-feature up-weighting and carefully chosen regularization. 149 | However, the out-of-the-box performance of Word2Vec inversion 150 | argues for its consideration as a simple default in document classification. 151 | 152 | \vskip .2cm 153 | In the remainder, we outline classification through inversion of a specific 154 | Word2Vec model and illustrate the ideas in classification of Yelp reviews. 155 | The implementation requires only a small extension of the popular 156 | \texttt{gensim} python library \cite{rehurek_software_2010}; the extended 157 | library as well as code to reproduce all of the results in this paper are 158 | available on \texttt{github}. In addition, the Yelp data are publicly available 159 | as part of the corresponding data mining contest at 160 | \texttt{kaggle.com}. 161 | See \texttt{github.com/taddylab/deepir} for details. 162 | 163 | 164 | \section{Implementation} 165 | \label{sec:w2v} 166 | 167 | Word2Vec trains $\mathcal{V}$ to maximize the skip-gram likelihood based on (\ref{eq:skipgram}). We work with the Huffman softmax specification \cite{mikolov_distributed_2013}, which includes a pre-processing step to encode each vocabulary word in its representation via a binary Huffman tree (see Figure \ref{bht}). 168 | 169 | \begin{figure}[b] 170 | ~\includegraphics[width=0.47\textwidth]{graphs/bht} 171 | \caption{\label{bht} Binary Huffman encoding of a 4-word vocabulary, based upon 18 total utterances. 172 | At each step, proceeding from left to right, the two nodes with the lowest counts are 173 | combined into a parent node. Binary encodings are then read back off the splits, 174 | moving from right to left. } 175 | \end{figure} 176 | 177 | Each individual probability is then 178 | \begin{equation} \label{eq:neuralnet} 179 | \mathrm{p}_{\mathcal{V}}(w | w_t) =\!\!\! 180 | \prod_{j=1}^{L(w)-1} \!\!\!\sigma\!\left( \mathrm{ch}\left[\eta(w,j+1)\right] \mathbf{u}_{\eta(w,j)}^\top \mathbf{v}_{w_t} \right) 181 | \end{equation} 182 | where $\eta(w,i)$ is the $i^{th}$ node in the Huffman tree path, of length $L(w)$, for word $w$; $\sigma(x) = 1/(1 + \exp[-x])$; and $\mathrm{ch}(\eta) 183 | \in \{-1,+1\}$ indicates whether $\eta$ is a left or right 184 | child. Every word thus has both input and output vector coordinates, 185 | $\mathbf{v}_w$ and $[\mathbf{u}_{\eta(w,1)} \cdots \mathbf{u}_{\eta(w,L(w))}]$.
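For intuition, the probability in (\ref{eq:neuralnet}) amounts to a short walk down the Huffman tree: at each node on the target word's path we take the sigmoid of a signed inner product. The sketch below is a minimal numerical illustration only (not the \texttt{gensim} implementation); the arrays \texttt{path\_vecs} and \texttt{code} are hypothetical stand-ins for the output vectors $\mathbf{u}_{\eta(w,j)}$ and the child signs $\mathrm{ch}[\eta(w,j+1)]$ along the path for word $w$.

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def word_prob(v_wt, path_vecs, code):
    # v_wt:      length-K input vector for the conditioning word w_t
    # path_vecs: (L(w)-1) x K output vectors u along w's Huffman path
    # code:      length-(L(w)-1) array of +/-1 child indicators
    # returns p_V(w | w_t): the product of sigmoids along the path
    return np.prod(sigmoid(code * path_vecs.dot(v_wt)))
\end{verbatim}

\noindent Summing the logs of these word probabilities over all within-window pairs of a sentence gives the composite log likelihood in (\ref{eq:sentencelhd}).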
186 | Typically, only the input space $\mathbf{V} = [\mathbf{v}_{w_1} \cdots \mathbf{v}_{w_p}]$, 187 | for a $p$-word vocabulary, is reported as the language 188 | representation -- these vectors are used as input for NLP tasks. However, 189 | the full representation $\mathcal{V}$ includes mapping from each word to both 190 | $\mathbf{V}$ and $\mathbf{U}$. 191 | 192 | We apply the 193 | \texttt{gensim} python implementation of Word2Vec, which fits the model via stochastic gradient descent (SGD), under default specification. This includes a vector space of dimension $K=100$ and a skip-gram window of size $b=5$. 194 | 195 | \subsection{Word2Vec Inversion} 196 | 197 | 198 | \begin{figure*} 199 | %\includegraphics[width=\textwidth]{graphs/coarseprob} 200 | 201 | % \vskip .5cm 202 | % \begin{center} 203 | \includegraphics[width=1\textwidth]{graphs/coarseprob_bystar} 204 | % \end{center} 205 | \vskip -.25cm 206 | \caption{\label{pic:coarseprob} Out-of-Sample fitted probabilities of a review being \emph{positive} (having greater than 2 stars) as a function of the true number of review stars. Box widths are proportional to number of observations in each class; roughly 10\% of reviews have each of 1-3 stars, while 30\% have 4 stars and 40\% have 5 stars. 207 | } 208 | \end{figure*} 209 | 210 | % \begin{figure*} 211 | % \begin{center} 212 | % \includegraphics[width=.98\textwidth]{graphs/nnpprob} 213 | 214 | % \vskip .25cm 215 | 216 | % \includegraphics[width=.98\textwidth]{graphs/fineprob} 217 | % \end{center} 218 | % \vskip -.25cm 219 | % \caption{\label{pic:fineprob} Out-of-Sample fitted probabilities for observed truth. In the top plot, we are predicting Negative ($\leq 2$), Neutral ($3$), or Positive ($\geq 4$). In the bottom, we are predicting each of the separate 5 star ratings.} 220 | % \end{figure*} 221 | 222 | Given Word2Vec trained on each of $C$ class-specific corpora $D_1 \ldots D_C$, 223 | leading to $C$ distinct language representations $\mathcal{V}_1 \dots 224 | \mathcal{V}_C$, classification for new documents is straightforward. Consider 225 | the $S$-sentence document $d$: each sentence $\mathbf{w}_s$ is given a 226 | probability under each representation $\mathcal{V}_c$ by applying the 227 | calculations in (\ref{eq:skipgram}) and (\ref{eq:neuralnet}). This leads to 228 | the $S \times C$ matrix of sentence probabilities, 229 | $\mathrm{p}_{\mathcal{V}_c}(\mathbf{w}_s)$, and document probabilities are 230 | obtained %as the column means 231 | \begin{equation} 232 | \mathrm{p}_{\mathcal{V}_c}(d) = \frac{1}{S}\sum_s \mathrm{p}_{\mathcal{V}_c}(\mathbf{w}_s). 233 | \end{equation} 234 | Finally, class probabilities are calculated via Bayes rule as in (\ref{eq:bayesrule}). We use priors $\pi_c = 1/C$, so that classification proceeds by assigning the class 235 | \begin{equation}\label{eq:class} 236 | \hat y = \mathrm{argmax}_c ~~\mathrm{p}_{\mathcal{V}_c}(d). 237 | \end{equation} 238 | 239 | 240 | 241 | \section{Illustration} 242 | 243 | We consider a corpus of reviews provided by Yelp for a contest on {\tt 244 | kaggle.com}. The text is tokenized simply by converting to lowercase before splitting on punctuation and white-space. The training data are 230,000 reviews containing more than 2 245 | million sentences. Each review is marked by a number of {\it stars}, from 1 246 | to 5, and we fit separate Word2Vec representations $\mathcal{V}_1 \ldots 247 | \mathcal{V}_5$ for the documents at each star rating. 
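To make the full procedure concrete, the following sketch fits one representation per star rating and scores a new tokenized document by Bayes rule with uniform priors, as in Section \ref{sec:w2v}. It is an illustrative simplification rather than the exact code in our repository: \texttt{sentences\_by\_star} is a hypothetical dictionary mapping each star rating to its list of tokenized training sentences, and we assume a \texttt{gensim} release whose \texttt{Word2Vec} exposes the \texttt{score} method for hierarchical-softmax skip-gram models (argument names such as \texttt{size} versus \texttt{vector\_size} differ across releases).

\begin{verbatim}
import numpy as np
from gensim.models import Word2Vec

# one skip-gram / hierarchical-softmax model per star rating
models = {}
for c, sents in sentences_by_star.items():
    models[c] = Word2Vec(sents, sg=1, hs=1, negative=0,
                         size=100, window=5, min_count=5)

def classify(doc_sentences, models, priors=None):
    """Return class probabilities p(c|d) for one tokenized document."""
    classes = sorted(models)
    if priors is None:
        priors = {c: 1.0 / len(classes) for c in classes}
    S = len(doc_sentences)
    # S x C matrix of sentence log likelihoods under each class model
    llhd = np.column_stack(
        [models[c].score(doc_sentences, total_sentences=S)
         for c in classes])
    # document log likelihood per class: log of the average
    # sentence probability, computed stably on the log scale
    m = llhd.max(axis=0)
    logpdoc = m + np.log(np.exp(llhd - m).sum(axis=0)) - np.log(S)
    # Bayes rule: weight by the priors and normalize across classes
    logpost = logpdoc + np.log([priors[c] for c in classes])
    post = np.exp(logpost - logpost.max())
    return dict(zip(classes, post / post.sum()))
\end{verbatim}

\noindent The class with the highest returned probability is the predicted label, as in (\ref{eq:class}).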
The validation data 248 | consist of 23,000 reviews, and we apply the inversion technique of Section 249 | \ref{sec:w2v} to score each validation document $d$ with class probabilities 250 | $\mathbf{q} = [q_1 \cdots q_5]$, where $q_c = \mathrm{p}(c|d)$. 251 | 252 | The probabilities will be used in three different classification tasks, labeling reviews as 253 | 254 | \vskip .1cm 255 | $a.$ negative at 1-2 stars, or positive at 3-5 stars; 256 | 257 | \vskip .1cm 258 | $b.$ negative 1-2, neutral 3, or positive 4-5 stars; 259 | 260 | \vskip .1cm 261 | $c.$ corresponding to each of 1 to 5 stars. 262 | 263 | \vskip .1cm 264 | In each case, classification proceeds by summing across the relevant 265 | sub-class probabilities. For example, in task $a$, 266 | $\mathrm{p}(\texttt{positive}) = q_3+q_4+q_5$. Note that the same five fitted 267 | Word2Vec representations are used for each task. 268 | 269 | We consider a set of related comparator techniques. In each case, some 270 | document representation (e.g., phrase counts or Doc2Vec vectors) is used as 271 | input to logistic regression prediction of the associated review rating. 272 | The logistic regressions are fit under $L_1$ regularization with the 273 | penalties weighted by feature standard deviation (which, e.g., up-weights rare 274 | phrases) and selected according to the corrected AIC (AICc) criterion 275 | \cite{flynn_efficiency_2013} via the \texttt{gamlr} R package of Taddy 276 | \shortcite{taddy_one-step_2014}. For multi-class tasks $b$-$c$, we use 277 | distributed multinomial regression (DMR; Taddy 278 | 2015)\nocite{taddy_distributed_2015} via the \texttt{distrom} R package. DMR 279 | fits multinomial logistic regression in a factorized representation wherein 280 | one estimates independent Poisson linear models for each response category. 281 | Document representations and logistic regressions are 282 | always trained using only the training corpus. 283 | 284 | 285 | \vskip .1cm 286 | \noindent \textit{Doc2Vec} is also fit via \texttt{gensim}, using the same 287 | latent space specification as for Word2Vec: $K=100$ and $b=5$. 288 | As recommended in the documentation, we apply repeated SGD over 20 re-orderings of each 289 | corpus (for comparability, this was also done when fitting Word2Vec). 290 | Le and Mikolov provide two alternative Doc2Vec specifications: distributed 291 | memory (DM) and distributed bag-of-words (DBOW). We fit both. Vector representations for validation documents are trained without 292 | updating the word-vector elements, leading to 100-dimensional vectors for 293 | each document under each of DM and DBOW. We input each, as well as the combined 200-dimensional 294 | DM+DBOW representation, to logistic regression. 295 | 296 | 297 | \vskip .1cm 298 | \noindent \textit{Phrase regression} applies logistic regression of 299 | response classes directly onto counts for short 1-2 word `phrases'. The phrases are 300 | obtained using \texttt{gensim}'s phrase builder, which simply combines highly 301 | probable pairings; e.g., \texttt{first\_date} and 302 | \texttt{chicken\_wing} are two pairings in this corpus. 303 | 304 | \vskip .1cm 305 | \noindent \textit{MNIR}, the multinomial inverse regression of Taddy 306 | \shortcite{taddy_measuring_2013,taddy_multinomial_2013,taddy_distributed_2015}, 307 | is applied as implemented in the \texttt{textir} package for R. MNIR maps 308 | from text to the class-space of interest through a multinomial logistic 309 | regression of phrase counts onto variables relevant to the class-space.
We 310 | apply MNIR to the same set of 1-2 word phrases used in phrase regression. 311 | Here, we regress phrase counts onto stars expressed numerically and as a 312 | 5-dimensional indicator vector, leading to a 6-feature multinomial logistic 313 | regression. The MNIR procedure then uses the $6\times p$ matrix of 314 | feature-phrase regression coefficients to map from phrase-count to feature space, 315 | resulting in 6-dimensional `sufficient reduction' statistics for each 316 | document. These are input to logistic 317 | regression. 318 | 319 | \vskip .1cm 320 | \noindent \textit{Word2Vec aggregation} averages the fitted word 321 | representations from a single Word2Vec trained on all sentences to obtain a 322 | fixed-length feature vector for each review ($K=100$, as for inversion). This 323 | vector is then input to logistic regression. 324 | 325 | % \vskip .1cm 326 | % \noindent \textit{Topic regression} fits the Latent Dirichlet 327 | % Allocation of Blei et al.~\shortcite{blei_latent_2003} using the posterior 328 | % maximization and Bayes factor selection strategy of Taddy 329 | % \shortcite{taddy_estimation_2012} as implemented in \texttt{maptpx} for R. 330 | % Estimated topic weights for each document are then used as inputs to logistic 331 | % regression. Due to high computational costs, we limit to words occurring in 332 | % at least 200 documents. 333 | 334 | \begin{table} 335 | \hspace{-.25cm} 336 | { 337 | \begin{tabular}{r|c c c} 338 | & $a$ (NP) & $b$ (NNP) & $c$ (1-5) 339 | \\ \cline{2-4}\rule{0pt}{3ex} 340 | W2V inversion & .099 & \textbf{.189} & .435 \\ 341 | Phrase regression & \textbf{.084} & .200 & \textbf{.410} \\ 342 | D2V DBOW & .144 & .282 & .496 \\ 343 | D2V DM & .179 & .306 & .549 \\ 344 | D2V combined & .148 & .284 & .500 \\ 345 | MNIR & .095 & .254 & .480 \\ 346 | W2V aggregation & .118 & .248 & .461 347 | \end{tabular}} 348 | \caption{Out-of-sample misclassification rates.} 349 | \end{table} 350 | 351 | \subsection{Results} 352 | 353 | Misclassification rates for each task on the validation set are reported in 354 | Table 1. Simple phrase-count regression is consistently the 355 | strongest performer, bested only by Word2Vec inversion on task $b$. This is 356 | partially due to the relative strengths of discriminative (e.g., logistic 357 | regression) vs.\ generative (e.g., all others here) classifiers: given a large amount of 358 | training text, the asymptotic efficiency of logistic regression will start to work 359 | in its favor over the finite-sample advantages of a generative classifier 360 | \cite{ng_discriminative_2002,taddy_rejoinder:_2013}. 361 | However, the comparison is also unfair to Word2Vec and Doc2Vec: both 362 | phrase regression and MNIR are optimized exactly under an 363 | AICc-selected penalty, while Word and Doc 2Vec have only been approximately 364 | optimized under a single specification. The 365 | distributed representations should improve with some careful engineering. 366 | 367 | Word2Vec inversion outperforms the other document representation-based 368 | alternatives (except, by a narrow margin, MNIR in task $a$). Doc2Vec under 369 | the DBOW specification and MNIR both do worse, but not by a large margin. In 370 | contrast to Le and Mikolov, we find here that the Doc2Vec DM model does much 371 | worse than DBOW. Regression onto simple within-document aggregations of 372 | Word2Vec performs slightly better than any Doc2Vec option (but not as well as 373 | the Word2Vec inversion).
This again contrasts with the results of Le and Mikolov, 374 | and we suspect that the more complex 375 | Doc2Vec model would benefit from careful tuning of the SGD optimization 376 | routine.\footnote{Note also that the unsupervised document representations -- Doc2Vec or the single Word2Vec used in Word2Vec aggregation -- could be trained on larger unlabeled corpora. A similar option is available for Word2Vec inversion: one could take a single Word2Vec model trained on a large unlabeled corpus as a shared baseline (prior) and update separate models with additional training on each labeled sub-corpus. The representations will all be shrunk towards a baseline language model, but will differ according to distinctions between the language in each labeled sub-corpus.} 377 | 378 | 379 | Looking at the fitted probabilities in detail, we see that Word2Vec inversion 380 | provides a more useful document {\it ranking} than any comparator (including 381 | phrase regression). For example, Figure \ref{pic:coarseprob} shows the 382 | probabilities of a review being `positive' in task $a$ as a function of the 383 | true star rating for each validation review. Although phrase regression does 384 | slightly better in terms of misclassification rate, it does so at the cost of 385 | classifying many terrible (1-star) reviews as positive. This occurs because 1-2 star reviews are rarer than 3-5 star reviews and because words of emphasis (e.g., \texttt{very}, \texttt{completely}, and \texttt{!!!}) are used both in very bad and in very good reviews. Word2Vec inversion is 386 | the {\it only} method that yields positive-document probabilities that are 387 | clearly increasing in distribution with the true star rating. It is not 388 | difficult to envision a misclassification cost structure that favors such 389 | nicely ordered probabilities. 390 | 391 | 392 | \section{Discussion} 393 | 394 | The goal of this note is to point out inversion as an option for turning distributed language representations into classification rules. We are not arguing for the supremacy of Word2Vec inversion in particular, and the approach should work well with alternative representations (e.g., Glove). Moreover, we are not even arguing that it will always outperform purpose-built classification tools. However, it is a simple, scalable, interpretable, and effective option for classification whenever you are working with such distributed representations.
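As a closing illustration of the warm-start idea raised in the footnote to the results discussion above, one might fit a single baseline representation to a large unlabeled corpus and then continue training a copy on each labeled sub-corpus. The sketch below is illustrative only: \texttt{all\_sentences} and \texttt{sentences\_by\_star} are hypothetical containers of tokenized sentences, and the exact \texttt{gensim} training arguments (e.g., whether \texttt{train} requires \texttt{total\_examples} and \texttt{epochs}) depend on the release.

\begin{verbatim}
import copy
from gensim.models import Word2Vec

# shared baseline fit to unlabeled text
base = Word2Vec(all_sentences, sg=1, hs=1, negative=0,
                size=100, window=5, min_count=5)

# warm-start one model per class, then keep training
# on that class's labeled sub-corpus
models = {}
for c, sents in sentences_by_star.items():
    m = copy.deepcopy(base)
    m.train(sents)  # newer gensim releases also need
                    # total_examples=len(sents) and epochs
    models[c] = m
\end{verbatim}

\noindent The resulting models are all shrunk towards the shared baseline but differ through the extra passes over their own sub-corpora, and they plug directly into the same inversion scoring rule used throughout this note.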
395 | 396 | \bibliographystyle{acl} 397 | \bibliography{deepir} 398 | 399 | 400 | \end{document} 401 | -------------------------------------------------------------------------------- /tex/graphs/bht.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | rankdir="RL"; 3 | edge [arrowhead=none]; 4 | node [shape=box]; 5 | 0 [label="18"] ; 6 | 0 -> 1 [label = "0"]; 7 | 0 -> 2 [label = "1"]; 8 | 1 [label="11"] ; 9 | 2 [label="7"] ; 10 | 1 -> 3 ; 11 | 2 -> 4 [label = "0"]; 12 | 2 -> 5 [label = "1"]; 13 | 3 [label="7"] ; 14 | 4 [label="6"] ; 15 | 5 [label="5"] ; 16 | 3 -> 8 [label = "0"]; 17 | 3 -> 9 [label = "1"]; 18 | 4 -> 6 ; 19 | 5 -> 7 ; 20 | 6 [label="10 Thanks 6"] ; 21 | 7 [label="11 Buddy 5"] ; 22 | 8 [label="00 Hello 4"] ; 23 | 9 [label="01 No 3"] ; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /tex/graphs/bht.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bht.pdf -------------------------------------------------------------------------------- /tex/graphs/bht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bht.png -------------------------------------------------------------------------------- /tex/graphs/bystarshort.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bystarshort.pdf -------------------------------------------------------------------------------- /tex/graphs/coarseprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/coarseprob.pdf -------------------------------------------------------------------------------- /tex/graphs/coarseprob_bystar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/coarseprob_bystar.pdf -------------------------------------------------------------------------------- /tex/graphs/fineprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/fineprob.pdf -------------------------------------------------------------------------------- /tex/graphs/nnpprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/nnpprob.pdf -------------------------------------------------------------------------------- /tex/graphs/posneg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/posneg.png -------------------------------------------------------------------------------- /tex/graphs/yelp_logistic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/yelp_logistic.png --------------------------------------------------------------------------------