├── README.md ├── code ├── linmod.R └── parseyelp.py ├── debug.ipynb ├── paths.ipynb ├── tex ├── acl.bst ├── acl2015.sty ├── acl2015.tex ├── deepir.bbl ├── deepir.bib ├── deepir.pdf ├── deepir.tex └── graphs │ ├── bht.dot │ ├── bht.pdf │ ├── bht.png │ ├── bystarshort.pdf │ ├── coarseprob.pdf │ ├── coarseprob_bystar.pdf │ ├── fineprob.pdf │ ├── nnpprob.pdf │ ├── posneg.png │ └── yelp_logistic.png └── w2v-inversion.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # deep inverse regression 2 | 3 | ### or: Document Classification by Inversion of Distributed Language Representations [(ACL 2015)](http://arxiv.org/pdf/1504.07295v3.pdf) 4 | 5 | Using unsupervised deep learning within sub-groups as the input for Bayesian discrimination. 6 | 7 | Everything in here is built around the [gensim](https://radimrehurek.com/gensim/) library for python. See the demo at [deepir.ipynb](https://github.com/TaddyLab/gensim/blob/deepir/docs/notebooks/deepir.ipynb). 8 | -------------------------------------------------------------------------------- /code/linmod.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(textir)) 2 | suppressMessages(library(data.table)) 3 | 4 | ## get results from w2v 5 | w2vprob <- fread("data/yelpw2vprobs.csv", header=TRUE, verbose=FALSE) 6 | 7 | ## read the aggregated w2v vectors 8 | aggvec <- read.table("data/yelp_vectors.txt", sep="|") 9 | 10 | ## read in the text 11 | revs <- read.table("data/yelp_phrases.txt", 12 | sep="|",quote=NULL, comment="", 13 | col.names=c("id","phrase","stars","sample")) 14 | 15 | x <- sparseMatrix( 16 | i=revs[,"id"]+1, j=as.numeric(revs[,"phrase"]), x=rep(1,nrow(revs)), 17 | dimnames=list(NULL, levels(revs[,"phrase"])), 18 | dims=c(nrow(aggvec), nlevels(revs[,"phrase"])) ) 19 | emptyrev <- which(rowSums(x)==0) 20 | 21 | x <- x[-emptyrev,colSums(x>0)>5] 22 | w2vprob <- as.matrix(w2vprob[-emptyrev,]) 23 | aggvec <- as.matrix(aggvec[-emptyrev,]) 24 | 25 | print(n <- nrow(x)) 26 | 27 | stars <- tapply(revs$stars, revs$id, mean) 28 | samp <- tapply( revs$sample=="test", revs$id, mean) 29 | test <- which(samp==1) 30 | 31 | ## read d2v 32 | dv0train <- fread("data/yelpD2Vtrain0.csv", verbose=FALSE) 33 | dv0test <- fread("data/yelpD2Vtest0.csv", verbose=FALSE) 34 | dv1train <- fread("data/yelpD2Vtrain1.csv", verbose=FALSE) 35 | dv1test <- fread("data/yelpD2Vtest1.csv", verbose=FALSE) 36 | # all(dv0test[,id]==dv1test[,id]) 37 | # all(dv0test[,stars]==dv1test[,stars]) 38 | vecvar <- paste("x",1:100,sep="") 39 | dv0x <- rbind(as.matrix(dv0train[,vecvar,with=FALSE]), 40 | as.matrix(dv0test[,vecvar,with=FALSE])) 41 | dv1x <- rbind(as.matrix(dv1train[,vecvar,with=FALSE]), 42 | as.matrix(dv1test[,vecvar,with=FALSE])) 43 | dvx <- cbind(dv0x,dv1x) 44 | dvstars <- c(dv0train[,stars], dv0test[,stars]) 45 | dvtest <- nrow(dv0train)+1:nrow(dv0test) 46 | 47 | library(parallel) 48 | cl <- makeCluster(6, type="FORK") 49 | 50 | geterr <- function(phat, y, PY=FALSE){ 51 | if(ncol(phat)==1) phat <- cbind(1-phat,phat) 52 | y <- factor(y) 53 | yhat <- factor(levels(y)[apply(phat,1,which.max)]) 54 | cat("mcr ") 55 | for(l in levels(y)) 56 | cat(l, ":", round( 57 | mean(yhat[y==l] != y[y==l]),3), ", ", sep="") 58 | overall <- mean(yhat !=y) 59 | diff <- mean( abs(as.numeric(yhat) - as.numeric(y)) ) 60 | py <- phat[cbind(1:nrow(phat),y)] 61 | lp <- log(py) 62 | lp[lp < (-50)] <- -50 63 | dev <- mean(-2*lp) 64 | cat("\noverall:", round(overall,3), "diff:", round(diff,3), 
"deviance:", dev, "\n") 65 | if(PY) return(py) 66 | invisible() 67 | } 68 | 69 | getpy <- function(fit, xx, y, testset, PY=FALSE){ 70 | if(inherits(fit,"randomForest")) 71 | phat <- as.matrix(predict(fit, xx[testset,], type="prob")) 72 | else 73 | phat <- predict(fit, xx[testset,], type="response") 74 | py <- geterr(phat, y[testset], PY=PY) 75 | if(PY) return(py) 76 | invisible() 77 | } 78 | 79 | ## define y 80 | ycoarse <- as.numeric(stars>2) 81 | ynnp <- cut(stars, c(0,2,3,5)) 82 | yfine <- factor(stars) 83 | dvycoarse <- as.numeric(dvstars>2) 84 | dvynnp <- cut(dvstars, c(0,2,3,5)) 85 | dvyfine <- factor(dvstars) 86 | 87 | ### W2V inversion 88 | cat("\n**** W2V INVERSION ****\n") 89 | nullprob <- as.numeric(table(stars[-test])/length(stars[-test])) 90 | 91 | cat("** COARSE **\n") 92 | w2vpcoarse <- cbind(rowSums(w2vprob[,1:2]),rowSums(w2vprob[,3:5])) 93 | geterr(w2vpcoarse[test,], ycoarse[test]) 94 | 95 | cat("** NNP **\n") 96 | w2vpnnp <- cbind(rowSums(w2vprob[,1:2]), 97 | rowSums(w2vprob[,3,drop=FALSE]), 98 | rowSums(w2vprob[,4:5,drop=FALSE])) 99 | geterr(w2vpnnp[test,], ynnp[test]) 100 | 101 | cat("** FINE **\n") 102 | geterr(w2vprob[test,], yfine[test]) 103 | 104 | ### logit word-count prediction 105 | cat("\n*** COUNTREG ***\n") 106 | 107 | cat("** COARSE **\n") 108 | logitcoarse <- gamlr(x[-test,], ycoarse[-test], 109 | family="binomial", lmr=1e-3) 110 | pycoarse <- getpy(logitcoarse, x, ycoarse, test, PY=TRUE) 111 | 112 | png(file="paper/graphs/yelp_logistic.png", width=12,height=6, units="in", res=180) 113 | plot(logitcoarse) 114 | invisible(dev.off()) 115 | 116 | cat("** NNP **\n") 117 | logitnnp <- dmr(cl=cl, x[-test,], ynnp[-test], lmr=1e-3) 118 | pynnp <- getpy(logitnnp, x, ynnp, test, PY=TRUE) 119 | 120 | cat("** FINE **\n") 121 | logitfine <- dmr(cl=cl, x[-test,], yfine[-test], lmr=1e-3) 122 | pyfine <- getpy(logitfine, x, yfine, test, PY=TRUE) 123 | 124 | cat("\n*** W2V and COUNTREG NNP ***\n") 125 | wx <- cBind(w2vprob,x) 126 | combof <- dmr(cl,wx[-test,], ynnp[-test]) 127 | getpy(combof, wx, ynnp, test) 128 | 129 | ## D2V stuff 130 | ## all run at zero lambda; AICc selects most complex model anyways 131 | cat("\n*** D2V ***\n") 132 | 133 | cat("** COARSE\n") 134 | cat("dm0 **\n") 135 | dv0coarse <- gamlr(dv0x[-dvtest,], dvycoarse[-dvtest], 136 | family="binomial", lmr=1e-4) 137 | getpy(dv0coarse, dv0x, dvycoarse, dvtest) 138 | cat("dm1 **\n") 139 | dv1coarse <- gamlr(dv1x[-dvtest,], dvycoarse[-dvtest], 140 | family="binomial", lmr=1e-4) 141 | getpy(dv1coarse, dv1x, dvycoarse, dvtest) 142 | cat("dm both **\n") 143 | dvcoarse <- gamlr(dvx[-dvtest,], dvycoarse[-dvtest], 144 | family="binomial", lmr=1e-4) 145 | pydvcoarse <- getpy(dvcoarse, dvx, dvycoarse, dvtest, PY=TRUE) 146 | 147 | cat("** NNP\n") 148 | cat("dm0 **\n") 149 | dv0nnp <- dmr(cl, dv0x[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 150 | getpy(dv0nnp, dv0x, dvynnp, dvtest) 151 | cat("dm1 **\n") 152 | dv1nnp <- dmr(cl, dv1x[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 153 | getpy(dv1nnp, dv1x, dvynnp, dvtest) 154 | cat("dm both **\n") 155 | dvnnp <- dmr(cl, dvx[-dvtest,], dvynnp[-dvtest], lmr=1e-4) 156 | pydvnnp <- getpy(dvnnp, dvx, dvynnp, dvtest, PY=TRUE) 157 | 158 | cat("** FINE\n") 159 | cat("dm0 **\n") 160 | dv0fine <- dmr(cl, dv0x[-dvtest,], dvyfine[-dvtest], lmr=1e-4) 161 | getpy(dv0fine, dv0x, dvyfine, dvtest) 162 | cat("dm1 **\n") 163 | dv1fine <- dmr(cl, dv1x[-dvtest,], dvyfine[-dvtest], lmr=1e-4) 164 | getpy(dv1fine, dv1x, dvyfine, dvtest) 165 | cat("dm both **\n") 166 | dvfine <- dmr(cl, dvx[-dvtest,], 
dvyfine[-dvtest], lmr=1e-4) 167 | pydvfine <- getpy(dvfine, dvx, dvyfine, dvtest, PY=TRUE) 168 | 169 | # mnir 170 | cat("\n*** MNIR ***\n") 171 | vmat <- sparse.model.matrix(~stars + yfine-1) 172 | mnir <- mnlm(cl=cl, vmat[-test,], x[-test,], verb=1, bins=5) 173 | zir <- srproj(mnir, x, select=100) 174 | 175 | cat("** COARSE **\n") 176 | fwdcoarse <- gamlr(zir[-test,], ycoarse[-test], lmr=1e-4, family="binomial") 177 | pymnircoarse <- getpy(fwdcoarse, zir, ycoarse, test, PY=TRUE) 178 | 179 | cat("** NNP **\n") 180 | fwdnnp <- dmr(cl, zir[-test,], ynnp[-test], lmr=1e-4) 181 | pymnirnnp <- getpy(fwdnnp, zir, ynnp, test, PY=TRUE) 182 | 183 | cat("** FINE **\n") 184 | fwdfine <- dmr(cl, zir[-test,], yfine[-test], lmr=1e-4) 185 | pymnirfine <- getpy(fwdfine, zir, yfine, test, PY=TRUE) 186 | 187 | ### Aggregate vector prediction 188 | cat("\n*** W2V AGGREGATION ***\n") 189 | 190 | cat("** COARSE **\n") 191 | avc <- gamlr(aggvec[-test,], ycoarse[-test], 192 | family="binomial", lambda.min.ratio=1e-3) 193 | getpy(avc, aggvec, ycoarse, test) 194 | 195 | cat("** NNP **\n") 196 | avnnp <- dmr(cl=cl, aggvec[-test,], ynnp[-test], lmr=1e-3) 197 | getpy(avnnp, aggvec, ynnp, test) 198 | 199 | cat("** FINE **\n") 200 | avfine <- dmr(cl=cl, aggvec[-test,], yfine[-test], lmr=1e-3) 201 | getpy(avfine, aggvec, yfine, test) 202 | 203 | save.image("linmod.rda", compress=FALSE) 204 | 205 | ### some plots 206 | w2vpc <- w2vpcoarse[test,2] 207 | pdf("paper/graphs/coarseprob.pdf", width=9, height=2.75) 208 | par(mfrow=c(1,3),mai=c(.45,.45,.3,.2),omi=c(.15,.15,0,0)) 209 | hist(w2vpc[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 210 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="word2vec inversion") 211 | hist(w2vpc[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 212 | 213 | hist(pycoarse[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 214 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="phrase regression") 215 | hist(pycoarse[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 216 | 217 | hist(pydvcoarse[dvycoarse[dvtest]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 218 | xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="doc2vec regression") 219 | hist(pydvcoarse[dvycoarse[dvtest]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 220 | 221 | # hist(pymnircoarse[ycoarse[test]==0], col=rgb(1,0,0,1), breaks=10, freq=FALSE, 222 | # xlab="", ylab="", xlim=c(0,1), ylim=c(0,8), main="mnir") 223 | # hist(pymnircoarse[ycoarse[test]==1], col=rgb(1,1,0,.7), breaks=10, freq=FALSE, add=TRUE) 224 | 225 | mtext(side=2, "density", outer=TRUE,cex=.9, font=3) 226 | mtext(side=1, "probability positive", outer=TRUE, cex=.9, font=3) 227 | dev.off() 228 | 229 | 230 | pdf("paper/graphs/coarseprob_bystar.pdf", width=9, height=2.5) 231 | par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 232 | boxplot( w2vpc ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="word2vec inversion") 233 | boxplot( pycoarse ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="phrase regression") 234 | boxplot( pydvcoarse ~ dvyfine[dvtest], col=heat.colors(5), varwidth=TRUE, main="doc2vec regression") 235 | boxplot( pymnircoarse ~ yfine[test], col=heat.colors(5), varwidth=TRUE, main="mnir") 236 | mtext(side=1, "stars", outer=TRUE,cex=1, font=3) 237 | mtext(side=2, "probability positive", outer=TRUE,cex=1, font=3) 238 | dev.off() 239 | 240 | w2vpnnpy <- w2vpnnp[cbind(1:n,ynnp)] 241 | pdf("paper/graphs/nnpprob.pdf", width=9, height=2.5) 242 | 
par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 243 | boxplot( w2vpnnpy[test] ~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="word2vec inversion") 244 | boxplot( pynnp~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="phrase regression") 245 | boxplot( pydvnnp~ dvynnp[dvtest], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="doc2vec regression") 246 | boxplot( pymnirnnp~ ynnp[test], col=c("red","grey","yellow"), varwidth=TRUE, ylim=c(0,1), main="mnir") 247 | mtext(side=1, "stars", outer=TRUE,cex=.9, font=3) 248 | mtext(side=2, "probability of true category", outer=TRUE,cex=.9, font=3) 249 | dev.off() 250 | 251 | w2vpy <- w2vprob[cbind(1:n,stars)] 252 | pdf("paper/graphs/fineprob.pdf", width=9, height=2.5) 253 | par(mfrow=c(1,4),mai=c(.4,.4,.3,.2),omi=c(.2,.2,0,0)) 254 | boxplot( w2vpy[test] ~ yfine[test], col=heat.colors(5), varwidth=TRUE, ylim=c(0,1), main="word2vec inversion") 255 | boxplot( pyfine~ yfine[test], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="phrase regression") 256 | boxplot( pydvfine~ dvyfine[dvtest], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="doc2vec regression") 257 | boxplot( pymnirfine~ yfine[test], col=heat.colors(5), ylim=c(0,1), varwidth=TRUE, main="mnir") 258 | mtext(side=1, "stars", outer=TRUE,cex=.9, font=3) 259 | mtext(side=2, "probability of true stars", outer=TRUE,cex=.9, font=3) 260 | dev.off() 261 | -------------------------------------------------------------------------------- /code/parseyelp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ## python map for word counts 3 | 4 | # Import Modules 5 | import sys 6 | import re 7 | import json 8 | 9 | # all non alphanumeric 10 | contractions = re.compile(r"'|-") 11 | symbols = re.compile(r'(\W+)', re.U) 12 | numeric = re.compile(r'(?<=\s)(\d+|\w\d+|\d+\w)(?=\s)', re.I) 13 | swrd = re.compile(r'(?<=\s)(,|"|\(|\)|to|a|as|the|an|and|or|for|are|is)(?=\s)', re.I) 14 | suffix = re.compile(r'(?<=\w)(s|ings*|ly|(?<=e)[sd]+)(?=\s)') 15 | seps = re.compile(r'\s+') 16 | 17 | # cleaner (order matters) 18 | def clean(text): 19 | text = u' ' + text.lower() + u' ' 20 | text = contractions.sub('', text) 21 | text = symbols.sub(r' \1 ', text) 22 | text = numeric.sub('000', text) 23 | text = swrd.sub(' ', text) 24 | #text = suffix.sub('', text) 25 | text = seps.sub(' ', text) 26 | return text 27 | 28 | 29 | fout = [ open("data/yelptrain%dstar.txt" % y, 'w') for y in range(1,6) ] 30 | fin = open("data/yelp_training_set/yelp_training_set_review.json", 'r') 31 | i = 0 32 | 33 | for line in fin: 34 | d = json.loads(line) 35 | i += 1 36 | try: 37 | txt = clean(d['text']) 38 | fout[d['stars']-1].write(txt+'\n') 39 | print(i, end=" ") 40 | 41 | except: 42 | e = sys.exc_info()[0] 43 | sys.stderr.write("review reader error: %s\n"%str(e)) 44 | 45 | fin.close() 46 | for f in fout: 47 | f.close() 48 | 49 | 50 | fout = [ open("data/yelptest%dstar.txt" % y, 'w') for y in range(1,6) ] 51 | fin = open("data/yelp_test_set/yelp_test_set_review.json", 'r') 52 | i = 0 53 | 54 | for line in fin: 55 | d = json.loads(line) 56 | i += 1 57 | try: 58 | txt = clean(d['text']) 59 | fout[d['stars']-1].write(txt+'\n') 60 | print(i, end=" ") 61 | 62 | except: 63 | e = sys.exc_info()[0] 64 | sys.stderr.write("review reader error: %s\n"%str(e)) 65 | 66 | fin.close() 67 | for f in fout: 68 | f.close() 69 | -------------------------------------------------------------------------------- /debug.ipynb: 
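debug.ipynb (next) is a sanity check of gensim's Word2Vec.score() on a toy corpus; that call is the workhorse of the inversion step described in the README. As a minimal sketch (not the authors' pipeline; see deepir.ipynb for the real demo), the step amounts to: fit one Word2Vec per star class on the per-star files written by parseyelp.py, score a document's sentences under each class model, and apply Bayes' rule with the class priors (the nullprob used in linmod.R) to obtain the per-class probabilities that linmod.R reads from data/yelpw2vprobs.csv. File paths and training parameters below are illustrative assumptions.

# Sketch only; scoring settings follow the hs=1, negative=0, sample=0, sg=1 shown in debug.ipynb's log.
import numpy as np
from gensim.models import Word2Vec

def read_reviews(path):
    # one cleaned review per line, as written by parseyelp.py; tokens are whitespace-separated
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if tokens:
                yield tokens

# one model per star class (file names assumed from parseyelp.py)
models = {star: Word2Vec(list(read_reviews("data/yelptrain%dstar.txt" % star)),
                         sg=1, hs=1, negative=0, sample=0, min_count=5, workers=4)
          for star in range(1, 6)}

def star_probs(doc_sentences, priors):
    """Posterior P(star | doc) from summed per-sentence log scores plus log priors."""
    stars = sorted(models)
    loglik = np.array([models[s].score(doc_sentences, len(doc_sentences)).sum()
                       for s in stars])
    logpost = loglik + np.log([priors[s] for s in stars])
    logpost -= logpost.max()          # stabilize before exponentiating
    p = np.exp(logpost)
    return dict(zip(stars, p / p.sum()))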
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy\n", 12 | "\n", 13 | "from gensim import utils, matutils\n", 14 | "from gensim.models import word2vec\n", 15 | "\n", 16 | "sentences = [\n", 17 | " ['human', 'interface', 'computer'],\n", 18 | " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", 19 | " ['eps', 'user', 'interface', 'system'],\n", 20 | " ['system', 'human', 'system', 'eps'],\n", 21 | " ['user', 'response', 'time'],\n", 22 | " ['trees'],\n", 23 | " ['graph', 'trees'],\n", 24 | " ['graph', 'minors', 'trees'],\n", 25 | " ['graph', 'minors', 'survey']\n", 26 | "]\n", 27 | "\n", 28 | "model = word2vec.Word2Vec(sentences, min_count=1, workers=5)\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stderr", 40 | "output_type": "stream", 41 | "text": [ 42 | "DEBUG:root:test\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "import logging\n", 48 | "logger = logging.getLogger()\n", 49 | "logger.setLevel(logging.DEBUG)\n", 50 | "logging.debug(\"test\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "name": "stderr", 62 | "output_type": "stream", 63 | "text": [ 64 | "INFO:gensim.models.word2vec:scoring sentences with 2 workers on 12 vocabulary and 100 features, using sg=1 hs=1 sample=0 and negative=0\n", 65 | "DEBUG:gensim.models.word2vec:putting job #0 in the queue\n", 66 | "DEBUG:gensim.models.word2vec:putting job #1 in the queue\n", 67 | "DEBUG:gensim.models.word2vec:putting job #2 in the queue\n", 68 | "DEBUG:gensim.models.word2vec:putting job #3 in the queue\n", 69 | "DEBUG:gensim.models.word2vec:putting job #4 in the queue\n", 70 | "INFO:gensim.models.word2vec:reached end of input; waiting to finish 1 outstanding jobs\n", 71 | "INFO:gensim.models.word2vec:scoring 9 sentences took 0.0s, 1105 sentences/s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "scores = model.score(sentences, 9, chunksize=2)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "array([-16.96789169, -77.79333496, -29.70446968, -29.69228935,\n", 90 | " -15.54527378, 0. 
, -4.23740578, -14.12265587, -15.54527378], dtype=float32)" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "scores" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stderr", 111 | "output_type": "stream", 112 | "text": [ 113 | "INFO:gensim.models.word2vec:scoring sentences with 2 workers on 12 vocabulary and 100 features, using sg=1 hs=1 sample=0 and negative=0\n", 114 | "DEBUG:gensim.models.word2vec:putting job #0 in the queue\n", 115 | "DEBUG:gensim.models.word2vec:putting job #1 in the queue\n", 116 | "DEBUG:gensim.models.word2vec:putting job #2 in the queue\n", 117 | "WARNING:gensim.models.word2vec:terminating after 2 sentences (set higher total_sentences if you want more).\n", 118 | "INFO:gensim.models.word2vec:reached end of input; waiting to finish 1 outstanding jobs\n", 119 | "INFO:gensim.models.word2vec:scoring 2 sentences took 0.0s, 774 sentences/s\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "scores = model.score(sentences, 2, chunksize=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "array([-16.96789169, -77.79333496], dtype=float32)" 138 | ] 139 | }, 140 | "execution_count": 6, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "scores" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "2" 160 | ] 161 | }, 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "len(scores)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 2", 193 | "language": "python", 194 | "name": "python2" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 2 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython2", 206 | "version": "2.7.10" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 0 211 | } 212 | -------------------------------------------------------------------------------- /paths.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re\n", 12 | "contractions = re.compile(r\"'s*|-|\\\"\")\n", 13 | "# all non alphanumeric\n", 14 | "symbols = re.compile(r'(\\W+)', re.U)\n", 15 | "# separators (any whitespace)\n", 16 | "seps = re.compile(r'\\s+')\n", 17 | "# some stops to remove\n", 18 | "stops = re.compile(r'(\\s[,:\\)\\(]\\s)')\n", 19 | "# for sentence splitter\n", 20 | "alteos = 
re.compile(r'([!\\?])')\n", 21 | "\n", 22 | "\n", 23 | "# cleaner (order matters)\n", 24 | "def clean(text): \n", 25 | " text = text.lower()\n", 26 | " text = contractions.sub('', text)\n", 27 | " text = symbols.sub(r' \\1 ', text)\n", 28 | " text = stops.sub(' ', text)\n", 29 | " text = seps.sub(' ', text)\n", 30 | " text = alteos.sub(r' \\1 .', text)\n", 31 | " return text\n", 32 | "\n", 33 | "from zipfile import ZipFile\n", 34 | "import json\n", 35 | "\n", 36 | "def YelpSentences(label, stars=[1,2,3,4,5]):\n", 37 | " with ZipFile(\"yelp_%s_set.zip\"%label, 'r') as zf:\n", 38 | " with zf.open(\"yelp_%s_set/yelp_%s_set_review.json\"%(label,label)) as f:\n", 39 | " for line in f:\n", 40 | " rev = json.loads(line)\n", 41 | " if rev['stars'] in stars:\n", 42 | " text = rev['text'].clean()\n", 43 | " for s in text.split(\".\"):\n", 44 | " yield s.split()\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "## just bring them into memory\n", 56 | "sentences = list(YelpSentences(\"training\"))" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from gensim.models import Word2Vec\n", 68 | "import logging \n", 69 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n", 70 | "\n", 71 | "## create a w2v learner \n", 72 | "w2v = Word2Vec(sentences, workers=8, iter=3) " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "## function to save vector path to file \n", 84 | "# Each row is a word, sentence reads from top.\n", 85 | "# (throws a key error if words are not in the vocab.)\n", 86 | "import numpy as np\n", 87 | "def savepath(words):\n", 88 | " print(words)\n", 89 | " np.savetxt(\"_\".join(words)+\".txt\", w2v[words], fmt=\"%.6f\")" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "[u'my', u'wife', u'took', u'me', u'here', u'on', u'my', u'birthday', u'for', u'breakfast', u'and', u'it', u'was', u'excellent']\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "savepath(sentences[0])" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "[u'the', u'horchata', u'is', u'handmade', u'and', u'delicious']\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "savepath(sentences[97])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "badsentences = list(YelpSentences(\"test\", [1]))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 16, 144 | "metadata": { 145 | "collapsed": false, 146 | "scrolled": true 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "[u'we', u'thought', u'this', u'was', u'a', u'little', u'strange', u'since', u'every', u'single', u'other', u'kennel', u'weve', u'ever', u'been', u'to', u'was', u'willing', u'and', u'wanted', 
u'to', u'give', u'us', u'a', u'tour']\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "savepath(badsentences[100])" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "[u'thanks', u'for', u'lying', u'to', u'my', u'face', u'dude']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "savepath(badsentences[800])" 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 2", 184 | "language": "python", 185 | "name": "python2" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 2 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython2", 197 | "version": "2.7.9" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 0 202 | } 203 | -------------------------------------------------------------------------------- /tex/acl.bst: -------------------------------------------------------------------------------- 1 | 2 | % BibTeX `acl' style file for BibTeX version 0.99c, LaTeX version 2.09 3 | % This version was made by modifying `aaai-named' format based on the master 4 | % file by Oren Patashnik (PATASHNIK@SCORE.STANFORD.EDU) 5 | 6 | % Copyright (C) 1985, all rights reserved. 7 | % Modifications Copyright 1988, Peter F. Patel-Schneider 8 | % Further modifictions by Stuart Shieber, 1991, and Fernando Pereira, 1992. 9 | % Copying of this file is authorized only if either 10 | % (1) you make absolutely no changes to your copy, including name, or 11 | % (2) if you do make changes, you name it something other than 12 | % btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. 13 | % This restriction helps ensure that all standard styles are identical. 14 | 15 | % There are undoubtably bugs in this style. If you make bug fixes, 16 | % improvements, etc. please let me know. My e-mail address is: 17 | % pfps@spar.slb.com 18 | 19 | % Citation format: [author-last-name, year] 20 | % [author-last-name and author-last-name, year] 21 | % [author-last-name {\em et al.}, year] 22 | % 23 | % Reference list ordering: alphabetical by author or whatever passes 24 | % for author in the absence of one. 25 | % 26 | % This BibTeX style has support for short (year only) citations. This 27 | % is done by having the citations actually look like 28 | % \citename{name-info, }year 29 | % The LaTeX style has to have the following 30 | % \let\@internalcite\cite 31 | % \def\cite{\def\citename##1{##1}\@internalcite} 32 | % \def\shortcite{\def\citename##1{}\@internalcite} 33 | % \def\@biblabel#1{\def\citename##1{##1}[#1]\hfill} 34 | % which makes \shortcite the macro for short citations. 35 | 36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 37 | % Changes made by SMS for thesis style 38 | % no emphasis on "et al." 39 | % "Ph.D." 
includes periods (not "PhD") 40 | % moved year to immediately after author's name 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | ENTRY 43 | { address 44 | author 45 | booktitle 46 | chapter 47 | edition 48 | editor 49 | howpublished 50 | institution 51 | journal 52 | key 53 | month 54 | note 55 | number 56 | organization 57 | pages 58 | publisher 59 | school 60 | series 61 | title 62 | type 63 | volume 64 | year 65 | } 66 | {} 67 | { label extra.label sort.label } 68 | 69 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 70 | 71 | FUNCTION {init.state.consts} 72 | { #0 'before.all := 73 | #1 'mid.sentence := 74 | #2 'after.sentence := 75 | #3 'after.block := 76 | } 77 | 78 | STRINGS { s t } 79 | 80 | FUNCTION {output.nonnull} 81 | { 's := 82 | output.state mid.sentence = 83 | { ", " * write$ } 84 | { output.state after.block = 85 | { add.period$ write$ 86 | newline$ 87 | "\newblock " write$ 88 | } 89 | { output.state before.all = 90 | 'write$ 91 | { add.period$ " " * write$ } 92 | if$ 93 | } 94 | if$ 95 | mid.sentence 'output.state := 96 | } 97 | if$ 98 | s 99 | } 100 | 101 | FUNCTION {output} 102 | { duplicate$ empty$ 103 | 'pop$ 104 | 'output.nonnull 105 | if$ 106 | } 107 | 108 | FUNCTION {output.check} 109 | { 't := 110 | duplicate$ empty$ 111 | { pop$ "empty " t * " in " * cite$ * warning$ } 112 | 'output.nonnull 113 | if$ 114 | } 115 | 116 | FUNCTION {output.bibitem} 117 | { newline$ 118 | 119 | "\bibitem[" write$ 120 | label write$ 121 | "]{" write$ 122 | 123 | cite$ write$ 124 | "}" write$ 125 | newline$ 126 | "" 127 | before.all 'output.state := 128 | } 129 | 130 | FUNCTION {fin.entry} 131 | { add.period$ 132 | write$ 133 | newline$ 134 | } 135 | 136 | FUNCTION {new.block} 137 | { output.state before.all = 138 | 'skip$ 139 | { after.block 'output.state := } 140 | if$ 141 | } 142 | 143 | FUNCTION {new.sentence} 144 | { output.state after.block = 145 | 'skip$ 146 | { output.state before.all = 147 | 'skip$ 148 | { after.sentence 'output.state := } 149 | if$ 150 | } 151 | if$ 152 | } 153 | 154 | FUNCTION {not} 155 | { { #0 } 156 | { #1 } 157 | if$ 158 | } 159 | 160 | FUNCTION {and} 161 | { 'skip$ 162 | { pop$ #0 } 163 | if$ 164 | } 165 | 166 | FUNCTION {or} 167 | { { pop$ #1 } 168 | 'skip$ 169 | if$ 170 | } 171 | 172 | FUNCTION {new.block.checka} 173 | { empty$ 174 | 'skip$ 175 | 'new.block 176 | if$ 177 | } 178 | 179 | FUNCTION {new.block.checkb} 180 | { empty$ 181 | swap$ empty$ 182 | and 183 | 'skip$ 184 | 'new.block 185 | if$ 186 | } 187 | 188 | FUNCTION {new.sentence.checka} 189 | { empty$ 190 | 'skip$ 191 | 'new.sentence 192 | if$ 193 | } 194 | 195 | FUNCTION {new.sentence.checkb} 196 | { empty$ 197 | swap$ empty$ 198 | and 199 | 'skip$ 200 | 'new.sentence 201 | if$ 202 | } 203 | 204 | FUNCTION {field.or.null} 205 | { duplicate$ empty$ 206 | { pop$ "" } 207 | 'skip$ 208 | if$ 209 | } 210 | 211 | FUNCTION {emphasize} 212 | { duplicate$ empty$ 213 | { pop$ "" } 214 | { "{\em " swap$ * "}" * } 215 | if$ 216 | } 217 | 218 | INTEGERS { nameptr namesleft numnames } 219 | 220 | FUNCTION {format.names} 221 | { 's := 222 | #1 'nameptr := 223 | s num.names$ 'numnames := 224 | numnames 'namesleft := 225 | { namesleft #0 > } 226 | 227 | { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't := 228 | 229 | nameptr #1 > 230 | { namesleft #1 > 231 | { ", " * t * } 232 | { numnames #2 > 233 | { "," * } 234 | 'skip$ 235 | if$ 236 | t "others" = 237 | { " et~al." 
* } 238 | { " and " * t * } 239 | if$ 240 | } 241 | if$ 242 | } 243 | 't 244 | if$ 245 | nameptr #1 + 'nameptr := 246 | namesleft #1 - 'namesleft := 247 | } 248 | while$ 249 | } 250 | 251 | FUNCTION {format.authors} 252 | { author empty$ 253 | { "" } 254 | { author format.names } 255 | if$ 256 | } 257 | 258 | FUNCTION {format.editors} 259 | { editor empty$ 260 | { "" } 261 | { editor format.names 262 | editor num.names$ #1 > 263 | { ", editors" * } 264 | { ", editor" * } 265 | if$ 266 | } 267 | if$ 268 | } 269 | 270 | FUNCTION {format.title} 271 | { title empty$ 272 | { "" } 273 | 274 | { title "t" change.case$ } 275 | 276 | if$ 277 | } 278 | 279 | FUNCTION {n.dashify} 280 | { 't := 281 | "" 282 | { t empty$ not } 283 | { t #1 #1 substring$ "-" = 284 | { t #1 #2 substring$ "--" = not 285 | { "--" * 286 | t #2 global.max$ substring$ 't := 287 | } 288 | { { t #1 #1 substring$ "-" = } 289 | { "-" * 290 | t #2 global.max$ substring$ 't := 291 | } 292 | while$ 293 | } 294 | if$ 295 | } 296 | { t #1 #1 substring$ * 297 | t #2 global.max$ substring$ 't := 298 | } 299 | if$ 300 | } 301 | while$ 302 | } 303 | 304 | FUNCTION {format.date} 305 | { year empty$ 306 | { month empty$ 307 | { "" } 308 | { "there's a month but no year in " cite$ * warning$ 309 | month 310 | } 311 | if$ 312 | } 313 | { month empty$ 314 | { "" } 315 | { month } 316 | if$ 317 | } 318 | if$ 319 | } 320 | 321 | FUNCTION {format.btitle} 322 | { title emphasize 323 | } 324 | 325 | FUNCTION {tie.or.space.connect} 326 | { duplicate$ text.length$ #3 < 327 | { "~" } 328 | { " " } 329 | if$ 330 | swap$ * * 331 | } 332 | 333 | FUNCTION {either.or.check} 334 | { empty$ 335 | 'pop$ 336 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 337 | if$ 338 | } 339 | 340 | FUNCTION {format.bvolume} 341 | { volume empty$ 342 | { "" } 343 | { "volume" volume tie.or.space.connect 344 | series empty$ 345 | 'skip$ 346 | { " of " * series emphasize * } 347 | if$ 348 | "volume and number" number either.or.check 349 | } 350 | if$ 351 | } 352 | 353 | FUNCTION {format.number.series} 354 | { volume empty$ 355 | { number empty$ 356 | { series field.or.null } 357 | { output.state mid.sentence = 358 | { "number" } 359 | { "Number" } 360 | if$ 361 | number tie.or.space.connect 362 | series empty$ 363 | { "there's a number but no series in " cite$ * warning$ } 364 | { " in " * series * } 365 | if$ 366 | } 367 | if$ 368 | } 369 | { "" } 370 | if$ 371 | } 372 | 373 | FUNCTION {format.edition} 374 | { edition empty$ 375 | { "" } 376 | { output.state mid.sentence = 377 | { edition "l" change.case$ " edition" * } 378 | { edition "t" change.case$ " edition" * } 379 | if$ 380 | } 381 | if$ 382 | } 383 | 384 | INTEGERS { multiresult } 385 | 386 | FUNCTION {multi.page.check} 387 | { 't := 388 | #0 'multiresult := 389 | { multiresult not 390 | t empty$ not 391 | and 392 | } 393 | { t #1 #1 substring$ 394 | duplicate$ "-" = 395 | swap$ duplicate$ "," = 396 | swap$ "+" = 397 | or or 398 | { #1 'multiresult := } 399 | { t #2 global.max$ substring$ 't := } 400 | if$ 401 | } 402 | while$ 403 | multiresult 404 | } 405 | 406 | FUNCTION {format.pages} 407 | { pages empty$ 408 | { "" } 409 | { pages multi.page.check 410 | { "pages" pages n.dashify tie.or.space.connect } 411 | { "page" pages tie.or.space.connect } 412 | if$ 413 | } 414 | if$ 415 | } 416 | 417 | FUNCTION {format.year.label} 418 | { year extra.label * 419 | } 420 | 421 | FUNCTION {format.vol.num.pages} 422 | { volume field.or.null 423 | number empty$ 424 | 'skip$ 425 | { "(" number * ")" * * 426 | 
volume empty$ 427 | { "there's a number but no volume in " cite$ * warning$ } 428 | 'skip$ 429 | if$ 430 | } 431 | if$ 432 | pages empty$ 433 | 'skip$ 434 | { duplicate$ empty$ 435 | { pop$ format.pages } 436 | { ":" * pages n.dashify * } 437 | if$ 438 | } 439 | if$ 440 | } 441 | 442 | FUNCTION {format.chapter.pages} 443 | { chapter empty$ 444 | 'format.pages 445 | { type empty$ 446 | { "chapter" } 447 | { type "l" change.case$ } 448 | if$ 449 | chapter tie.or.space.connect 450 | pages empty$ 451 | 'skip$ 452 | { ", " * format.pages * } 453 | if$ 454 | } 455 | if$ 456 | } 457 | 458 | FUNCTION {format.in.ed.booktitle} 459 | { booktitle empty$ 460 | { "" } 461 | { editor empty$ 462 | { "In " booktitle emphasize * } 463 | { "In " format.editors * ", " * booktitle emphasize * } 464 | if$ 465 | } 466 | if$ 467 | } 468 | 469 | FUNCTION {empty.misc.check} 470 | { author empty$ title empty$ howpublished empty$ 471 | month empty$ year empty$ note empty$ 472 | and and and and and 473 | 474 | key empty$ not and 475 | 476 | { "all relevant fields are empty in " cite$ * warning$ } 477 | 'skip$ 478 | if$ 479 | } 480 | 481 | FUNCTION {format.thesis.type} 482 | { type empty$ 483 | 'skip$ 484 | { pop$ 485 | type "t" change.case$ 486 | } 487 | if$ 488 | } 489 | 490 | FUNCTION {format.tr.number} 491 | { type empty$ 492 | { "Technical Report" } 493 | 'type 494 | if$ 495 | number empty$ 496 | { "t" change.case$ } 497 | { number tie.or.space.connect } 498 | if$ 499 | } 500 | 501 | FUNCTION {format.article.crossref} 502 | { key empty$ 503 | { journal empty$ 504 | { "need key or journal for " cite$ * " to crossref " * crossref * 505 | warning$ 506 | "" 507 | } 508 | { "In {\em " journal * "\/}" * } 509 | if$ 510 | } 511 | { "In " key * } 512 | if$ 513 | " \cite{" * crossref * "}" * 514 | } 515 | 516 | FUNCTION {format.crossref.editor} 517 | { editor #1 "{vv~}{ll}" format.name$ 518 | editor num.names$ duplicate$ 519 | #2 > 520 | { pop$ " et~al." * } 521 | { #2 < 522 | 'skip$ 523 | { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = 524 | { " et~al." 
* } 525 | { " and " * editor #2 "{vv~}{ll}" format.name$ * } 526 | if$ 527 | } 528 | if$ 529 | } 530 | if$ 531 | } 532 | 533 | FUNCTION {format.book.crossref} 534 | { volume empty$ 535 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ 536 | "In " 537 | } 538 | { "Volume" volume tie.or.space.connect 539 | " of " * 540 | } 541 | if$ 542 | editor empty$ 543 | editor field.or.null author field.or.null = 544 | or 545 | { key empty$ 546 | { series empty$ 547 | { "need editor, key, or series for " cite$ * " to crossref " * 548 | crossref * warning$ 549 | "" * 550 | } 551 | { "{\em " * series * "\/}" * } 552 | if$ 553 | } 554 | { key * } 555 | if$ 556 | } 557 | { format.crossref.editor * } 558 | if$ 559 | " \cite{" * crossref * "}" * 560 | } 561 | 562 | FUNCTION {format.incoll.inproc.crossref} 563 | { editor empty$ 564 | editor field.or.null author field.or.null = 565 | or 566 | { key empty$ 567 | { booktitle empty$ 568 | { "need editor, key, or booktitle for " cite$ * " to crossref " * 569 | crossref * warning$ 570 | "" 571 | } 572 | { "In {\em " booktitle * "\/}" * } 573 | if$ 574 | } 575 | { "In " key * } 576 | if$ 577 | } 578 | { "In " format.crossref.editor * } 579 | if$ 580 | " \cite{" * crossref * "}" * 581 | } 582 | 583 | FUNCTION {article} 584 | { output.bibitem 585 | format.authors "author" output.check 586 | new.block 587 | format.year.label "year" output.check 588 | new.block 589 | format.title "title" output.check 590 | new.block 591 | crossref missing$ 592 | { journal emphasize "journal" output.check 593 | format.vol.num.pages output 594 | format.date output 595 | } 596 | { format.article.crossref output.nonnull 597 | format.pages output 598 | } 599 | if$ 600 | new.block 601 | note output 602 | fin.entry 603 | } 604 | 605 | FUNCTION {book} 606 | { output.bibitem 607 | author empty$ 608 | { format.editors "author and editor" output.check } 609 | { format.authors output.nonnull 610 | crossref missing$ 611 | { "author and editor" editor either.or.check } 612 | 'skip$ 613 | if$ 614 | } 615 | if$ 616 | new.block 617 | format.year.label "year" output.check 618 | new.block 619 | format.btitle "title" output.check 620 | crossref missing$ 621 | { format.bvolume output 622 | new.block 623 | format.number.series output 624 | new.sentence 625 | publisher "publisher" output.check 626 | address output 627 | } 628 | { new.block 629 | format.book.crossref output.nonnull 630 | } 631 | if$ 632 | format.edition output 633 | format.date output 634 | new.block 635 | note output 636 | fin.entry 637 | } 638 | 639 | FUNCTION {booklet} 640 | { output.bibitem 641 | format.authors output 642 | new.block 643 | format.year.label "year" output.check 644 | new.block 645 | format.title "title" output.check 646 | howpublished address new.block.checkb 647 | howpublished output 648 | address output 649 | format.date output 650 | new.block 651 | note output 652 | fin.entry 653 | } 654 | 655 | FUNCTION {inbook} 656 | { output.bibitem 657 | author empty$ 658 | { format.editors "author and editor" output.check } 659 | { format.authors output.nonnull 660 | crossref missing$ 661 | { "author and editor" editor either.or.check } 662 | 'skip$ 663 | if$ 664 | } 665 | if$ 666 | format.year.label "year" output.check 667 | new.block 668 | new.block 669 | format.btitle "title" output.check 670 | crossref missing$ 671 | { format.bvolume output 672 | format.chapter.pages "chapter and pages" output.check 673 | new.block 674 | format.number.series output 675 | new.sentence 676 | publisher "publisher" output.check 
677 | address output 678 | } 679 | { format.chapter.pages "chapter and pages" output.check 680 | new.block 681 | format.book.crossref output.nonnull 682 | } 683 | if$ 684 | format.edition output 685 | format.date output 686 | new.block 687 | note output 688 | fin.entry 689 | } 690 | 691 | FUNCTION {incollection} 692 | { output.bibitem 693 | format.authors "author" output.check 694 | new.block 695 | format.year.label "year" output.check 696 | new.block 697 | format.title "title" output.check 698 | new.block 699 | crossref missing$ 700 | { format.in.ed.booktitle "booktitle" output.check 701 | format.bvolume output 702 | format.number.series output 703 | format.chapter.pages output 704 | new.sentence 705 | publisher "publisher" output.check 706 | address output 707 | format.edition output 708 | format.date output 709 | } 710 | { format.incoll.inproc.crossref output.nonnull 711 | format.chapter.pages output 712 | } 713 | if$ 714 | new.block 715 | note output 716 | fin.entry 717 | } 718 | 719 | FUNCTION {inproceedings} 720 | { output.bibitem 721 | format.authors "author" output.check 722 | new.block 723 | format.year.label "year" output.check 724 | new.block 725 | format.title "title" output.check 726 | new.block 727 | crossref missing$ 728 | { format.in.ed.booktitle "booktitle" output.check 729 | format.bvolume output 730 | format.number.series output 731 | format.pages output 732 | address empty$ 733 | { organization publisher new.sentence.checkb 734 | organization output 735 | publisher output 736 | format.date output 737 | } 738 | { address output.nonnull 739 | format.date output 740 | new.sentence 741 | organization output 742 | publisher output 743 | } 744 | if$ 745 | } 746 | { format.incoll.inproc.crossref output.nonnull 747 | format.pages output 748 | } 749 | if$ 750 | new.block 751 | note output 752 | fin.entry 753 | } 754 | 755 | FUNCTION {conference} { inproceedings } 756 | 757 | FUNCTION {manual} 758 | { output.bibitem 759 | author empty$ 760 | { organization empty$ 761 | 'skip$ 762 | { organization output.nonnull 763 | address output 764 | } 765 | if$ 766 | } 767 | { format.authors output.nonnull } 768 | if$ 769 | format.year.label "year" output.check 770 | new.block 771 | new.block 772 | format.btitle "title" output.check 773 | author empty$ 774 | { organization empty$ 775 | { address new.block.checka 776 | address output 777 | } 778 | 'skip$ 779 | if$ 780 | } 781 | { organization address new.block.checkb 782 | organization output 783 | address output 784 | } 785 | if$ 786 | format.edition output 787 | format.date output 788 | new.block 789 | note output 790 | fin.entry 791 | } 792 | 793 | FUNCTION {mastersthesis} 794 | { output.bibitem 795 | format.authors "author" output.check 796 | new.block 797 | format.year.label "year" output.check 798 | new.block 799 | format.title "title" output.check 800 | new.block 801 | "Master's thesis" format.thesis.type output.nonnull 802 | school "school" output.check 803 | address output 804 | format.date output 805 | new.block 806 | note output 807 | fin.entry 808 | } 809 | 810 | FUNCTION {misc} 811 | { output.bibitem 812 | format.authors output 813 | new.block 814 | format.year.label output 815 | new.block 816 | title howpublished new.block.checkb 817 | format.title output 818 | howpublished new.block.checka 819 | howpublished output 820 | format.date output 821 | new.block 822 | note output 823 | fin.entry 824 | empty.misc.check 825 | } 826 | 827 | FUNCTION {phdthesis} 828 | { output.bibitem 829 | format.authors "author" output.check 830 | 
new.block 831 | format.year.label "year" output.check 832 | new.block 833 | format.btitle "title" output.check 834 | new.block 835 | "{Ph.D.} thesis" format.thesis.type output.nonnull 836 | school "school" output.check 837 | address output 838 | format.date output 839 | new.block 840 | note output 841 | fin.entry 842 | } 843 | 844 | FUNCTION {proceedings} 845 | { output.bibitem 846 | editor empty$ 847 | { organization output } 848 | { format.editors output.nonnull } 849 | if$ 850 | new.block 851 | format.year.label "year" output.check 852 | new.block 853 | format.btitle "title" output.check 854 | format.bvolume output 855 | format.number.series output 856 | address empty$ 857 | { editor empty$ 858 | { publisher new.sentence.checka } 859 | { organization publisher new.sentence.checkb 860 | organization output 861 | } 862 | if$ 863 | publisher output 864 | format.date output 865 | } 866 | { address output.nonnull 867 | format.date output 868 | new.sentence 869 | editor empty$ 870 | 'skip$ 871 | { organization output } 872 | if$ 873 | publisher output 874 | } 875 | if$ 876 | new.block 877 | note output 878 | fin.entry 879 | } 880 | 881 | FUNCTION {techreport} 882 | { output.bibitem 883 | format.authors "author" output.check 884 | new.block 885 | format.year.label "year" output.check 886 | new.block 887 | format.title "title" output.check 888 | new.block 889 | format.tr.number output.nonnull 890 | institution "institution" output.check 891 | address output 892 | format.date output 893 | new.block 894 | note output 895 | fin.entry 896 | } 897 | 898 | FUNCTION {unpublished} 899 | { output.bibitem 900 | format.authors "author" output.check 901 | new.block 902 | format.year.label "year" output.check 903 | new.block 904 | format.title "title" output.check 905 | new.block 906 | note "note" output.check 907 | format.date output 908 | fin.entry 909 | } 910 | 911 | FUNCTION {default.type} { misc } 912 | 913 | MACRO {jan} {"January"} 914 | 915 | MACRO {feb} {"February"} 916 | 917 | MACRO {mar} {"March"} 918 | 919 | MACRO {apr} {"April"} 920 | 921 | MACRO {may} {"May"} 922 | 923 | MACRO {jun} {"June"} 924 | 925 | MACRO {jul} {"July"} 926 | 927 | MACRO {aug} {"August"} 928 | 929 | MACRO {sep} {"September"} 930 | 931 | MACRO {oct} {"October"} 932 | 933 | MACRO {nov} {"November"} 934 | 935 | MACRO {dec} {"December"} 936 | 937 | MACRO {acmcs} {"ACM Computing Surveys"} 938 | 939 | MACRO {acta} {"Acta Informatica"} 940 | 941 | MACRO {cacm} {"Communications of the ACM"} 942 | 943 | MACRO {ibmjrd} {"IBM Journal of Research and Development"} 944 | 945 | MACRO {ibmsj} {"IBM Systems Journal"} 946 | 947 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"} 948 | 949 | MACRO {ieeetc} {"IEEE Transactions on Computers"} 950 | 951 | MACRO {ieeetcad} 952 | {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} 953 | 954 | MACRO {ipl} {"Information Processing Letters"} 955 | 956 | MACRO {jacm} {"Journal of the ACM"} 957 | 958 | MACRO {jcss} {"Journal of Computer and System Sciences"} 959 | 960 | MACRO {scp} {"Science of Computer Programming"} 961 | 962 | MACRO {sicomp} {"SIAM Journal on Computing"} 963 | 964 | MACRO {tocs} {"ACM Transactions on Computer Systems"} 965 | 966 | MACRO {tods} {"ACM Transactions on Database Systems"} 967 | 968 | MACRO {tog} {"ACM Transactions on Graphics"} 969 | 970 | MACRO {toms} {"ACM Transactions on Mathematical Software"} 971 | 972 | MACRO {toois} {"ACM Transactions on Office Information Systems"} 973 | 974 | MACRO {toplas} {"ACM Transactions on Programming Languages 
and Systems"} 975 | 976 | MACRO {tcs} {"Theoretical Computer Science"} 977 | 978 | READ 979 | 980 | FUNCTION {sortify} 981 | { purify$ 982 | "l" change.case$ 983 | } 984 | 985 | INTEGERS { len } 986 | 987 | FUNCTION {chop.word} 988 | { 's := 989 | 'len := 990 | s #1 len substring$ = 991 | { s len #1 + global.max$ substring$ } 992 | 's 993 | if$ 994 | } 995 | 996 | INTEGERS { et.al.char.used } 997 | 998 | FUNCTION {initialize.et.al.char.used} 999 | { #0 'et.al.char.used := 1000 | } 1001 | 1002 | EXECUTE {initialize.et.al.char.used} 1003 | 1004 | FUNCTION {format.lab.names} 1005 | { 's := 1006 | s num.names$ 'numnames := 1007 | 1008 | numnames #1 = 1009 | { s #1 "{vv }{ll}" format.name$ } 1010 | { numnames #2 = 1011 | { s #1 "{vv }{ll }and " format.name$ s #2 "{vv }{ll}" format.name$ * 1012 | } 1013 | { s #1 "{vv }{ll }\bgroup et al.\egroup " format.name$ } 1014 | if$ 1015 | } 1016 | if$ 1017 | 1018 | } 1019 | 1020 | FUNCTION {author.key.label} 1021 | { author empty$ 1022 | { key empty$ 1023 | 1024 | { cite$ #1 #3 substring$ } 1025 | 1026 | { key #3 text.prefix$ } 1027 | if$ 1028 | } 1029 | { author format.lab.names } 1030 | if$ 1031 | } 1032 | 1033 | FUNCTION {author.editor.key.label} 1034 | { author empty$ 1035 | { editor empty$ 1036 | { key empty$ 1037 | 1038 | { cite$ #1 #3 substring$ } 1039 | 1040 | { key #3 text.prefix$ } 1041 | if$ 1042 | } 1043 | { editor format.lab.names } 1044 | if$ 1045 | } 1046 | { author format.lab.names } 1047 | if$ 1048 | } 1049 | 1050 | FUNCTION {author.key.organization.label} 1051 | { author empty$ 1052 | { key empty$ 1053 | { organization empty$ 1054 | 1055 | { cite$ #1 #3 substring$ } 1056 | 1057 | { "The " #4 organization chop.word #3 text.prefix$ } 1058 | if$ 1059 | } 1060 | { key #3 text.prefix$ } 1061 | if$ 1062 | } 1063 | { author format.lab.names } 1064 | if$ 1065 | } 1066 | 1067 | FUNCTION {editor.key.organization.label} 1068 | { editor empty$ 1069 | { key empty$ 1070 | { organization empty$ 1071 | 1072 | { cite$ #1 #3 substring$ } 1073 | 1074 | { "The " #4 organization chop.word #3 text.prefix$ } 1075 | if$ 1076 | } 1077 | { key #3 text.prefix$ } 1078 | if$ 1079 | } 1080 | { editor format.lab.names } 1081 | if$ 1082 | } 1083 | 1084 | FUNCTION {calc.label} 1085 | { type$ "book" = 1086 | type$ "inbook" = 1087 | or 1088 | 'author.editor.key.label 1089 | { type$ "proceedings" = 1090 | 'editor.key.organization.label 1091 | { type$ "manual" = 1092 | 'author.key.organization.label 1093 | 'author.key.label 1094 | if$ 1095 | } 1096 | if$ 1097 | } 1098 | if$ 1099 | duplicate$ 1100 | 1101 | "\protect\citename{" swap$ * "}" * 1102 | year field.or.null purify$ * 1103 | 'label := 1104 | year field.or.null purify$ * 1105 | 1106 | sortify 'sort.label := 1107 | } 1108 | 1109 | FUNCTION {sort.format.names} 1110 | { 's := 1111 | #1 'nameptr := 1112 | "" 1113 | s num.names$ 'numnames := 1114 | numnames 'namesleft := 1115 | { namesleft #0 > } 1116 | { nameptr #1 > 1117 | { " " * } 1118 | 'skip$ 1119 | if$ 1120 | 1121 | s nameptr "{vv{ } }{ll{ }}{ ff{ }}{ jj{ }}" format.name$ 't := 1122 | 1123 | nameptr numnames = t "others" = and 1124 | { "et al" * } 1125 | { t sortify * } 1126 | if$ 1127 | nameptr #1 + 'nameptr := 1128 | namesleft #1 - 'namesleft := 1129 | } 1130 | while$ 1131 | } 1132 | 1133 | FUNCTION {sort.format.title} 1134 | { 't := 1135 | "A " #2 1136 | "An " #3 1137 | "The " #4 t chop.word 1138 | chop.word 1139 | chop.word 1140 | sortify 1141 | #1 global.max$ substring$ 1142 | } 1143 | 1144 | FUNCTION {author.sort} 1145 | { author empty$ 1146 | { key empty$ 
1147 | { "to sort, need author or key in " cite$ * warning$ 1148 | "" 1149 | } 1150 | { key sortify } 1151 | if$ 1152 | } 1153 | { author sort.format.names } 1154 | if$ 1155 | } 1156 | 1157 | FUNCTION {author.editor.sort} 1158 | { author empty$ 1159 | { editor empty$ 1160 | { key empty$ 1161 | { "to sort, need author, editor, or key in " cite$ * warning$ 1162 | "" 1163 | } 1164 | { key sortify } 1165 | if$ 1166 | } 1167 | { editor sort.format.names } 1168 | if$ 1169 | } 1170 | { author sort.format.names } 1171 | if$ 1172 | } 1173 | 1174 | FUNCTION {author.organization.sort} 1175 | { author empty$ 1176 | { organization empty$ 1177 | { key empty$ 1178 | { "to sort, need author, organization, or key in " cite$ * warning$ 1179 | "" 1180 | } 1181 | { key sortify } 1182 | if$ 1183 | } 1184 | { "The " #4 organization chop.word sortify } 1185 | if$ 1186 | } 1187 | { author sort.format.names } 1188 | if$ 1189 | } 1190 | 1191 | FUNCTION {editor.organization.sort} 1192 | { editor empty$ 1193 | { organization empty$ 1194 | { key empty$ 1195 | { "to sort, need editor, organization, or key in " cite$ * warning$ 1196 | "" 1197 | } 1198 | { key sortify } 1199 | if$ 1200 | } 1201 | { "The " #4 organization chop.word sortify } 1202 | if$ 1203 | } 1204 | { editor sort.format.names } 1205 | if$ 1206 | } 1207 | 1208 | FUNCTION {presort} 1209 | 1210 | { calc.label 1211 | sort.label 1212 | " " 1213 | * 1214 | type$ "book" = 1215 | 1216 | type$ "inbook" = 1217 | or 1218 | 'author.editor.sort 1219 | { type$ "proceedings" = 1220 | 'editor.organization.sort 1221 | { type$ "manual" = 1222 | 'author.organization.sort 1223 | 'author.sort 1224 | if$ 1225 | } 1226 | if$ 1227 | } 1228 | if$ 1229 | 1230 | * 1231 | 1232 | " " 1233 | * 1234 | year field.or.null sortify 1235 | * 1236 | " " 1237 | * 1238 | title field.or.null 1239 | sort.format.title 1240 | * 1241 | #1 entry.max$ substring$ 1242 | 'sort.key$ := 1243 | } 1244 | 1245 | ITERATE {presort} 1246 | 1247 | SORT 1248 | 1249 | STRINGS { longest.label last.sort.label next.extra } 1250 | 1251 | INTEGERS { longest.label.width last.extra.num } 1252 | 1253 | FUNCTION {initialize.longest.label} 1254 | { "" 'longest.label := 1255 | #0 int.to.chr$ 'last.sort.label := 1256 | "" 'next.extra := 1257 | #0 'longest.label.width := 1258 | #0 'last.extra.num := 1259 | } 1260 | 1261 | FUNCTION {forward.pass} 1262 | { last.sort.label sort.label = 1263 | { last.extra.num #1 + 'last.extra.num := 1264 | last.extra.num int.to.chr$ 'extra.label := 1265 | } 1266 | { "a" chr.to.int$ 'last.extra.num := 1267 | "" 'extra.label := 1268 | sort.label 'last.sort.label := 1269 | } 1270 | if$ 1271 | } 1272 | 1273 | FUNCTION {reverse.pass} 1274 | { next.extra "b" = 1275 | { "a" 'extra.label := } 1276 | 'skip$ 1277 | if$ 1278 | label extra.label * 'label := 1279 | label width$ longest.label.width > 1280 | { label 'longest.label := 1281 | label width$ 'longest.label.width := 1282 | } 1283 | 'skip$ 1284 | if$ 1285 | extra.label 'next.extra := 1286 | } 1287 | 1288 | EXECUTE {initialize.longest.label} 1289 | 1290 | ITERATE {forward.pass} 1291 | 1292 | REVERSE {reverse.pass} 1293 | 1294 | FUNCTION {begin.bib} 1295 | 1296 | { et.al.char.used 1297 | { "\newcommand{\etalchar}[1]{$^{#1}$}" write$ newline$ } 1298 | 'skip$ 1299 | if$ 1300 | preamble$ empty$ 1301 | 1302 | 'skip$ 1303 | { preamble$ write$ newline$ } 1304 | if$ 1305 | 1306 | "\begin{thebibliography}{" "}" * write$ newline$ 1307 | 1308 | } 1309 | 1310 | EXECUTE {begin.bib} 1311 | 1312 | EXECUTE {init.state.consts} 1313 | 1314 | ITERATE {call.type$} 1315 
| 1316 | FUNCTION {end.bib} 1317 | { newline$ 1318 | "\end{thebibliography}" write$ newline$ 1319 | } 1320 | 1321 | EXECUTE {end.bib} 1322 | 1323 | -------------------------------------------------------------------------------- /tex/acl2015.sty: -------------------------------------------------------------------------------- 1 | % File acl2015.sty 2 | % December 2014 3 | 4 | % This is the LaTeX style file for ACL 2015. It is nearly identical to 5 | % the style files for ACL 2014, EACL 2006, ACL2005, ACL 2002, ACL 6 | % 2001, ACL 2000, EACL 95 and EACL 99. 7 | % 8 | % Changes made include: adapt layout to A4 and centimeters, widen abstract 9 | 10 | % This is the LaTeX style file for ACL 2000. It is nearly identical to the 11 | % style files for EACL 95 and EACL 99. Minor changes include editing the 12 | % instructions to reflect use of \documentclass rather than \documentstyle 13 | % and removing the white space before the title on the first page 14 | % -- John Chen, June 29, 2000 15 | 16 | % To convert from submissions prepared using the style file aclsub.sty 17 | % prepared for the ACL 2000 conference, proceed as follows: 18 | % 1) Remove submission-specific information: \whichsession, \id, 19 | % \wordcount, \otherconferences, \area, \keywords 20 | % 2) \summary should be removed. The summary material should come 21 | % after \maketitle and should be in the ``abstract'' environment 22 | % 3) Check all citations. This style should handle citations correctly 23 | % and also allows multiple citations separated by semicolons. 24 | % 4) Check figures and examples. Because the final format is double- 25 | % column, some adjustments may have to be made to fit text in the column 26 | % or to choose full-width (\figure*} figures. 27 | % 5) Change the style reference from aclsub to acl2000, and be sure 28 | % this style file is in your TeX search path 29 | 30 | 31 | % This is the LaTeX style file for EACL-95. It is identical to the 32 | % style file for ANLP '94 except that the margins are adjusted for A4 33 | % paper. -- abney 13 Dec 94 34 | 35 | % The ANLP '94 style file is a slightly modified 36 | % version of the style used for AAAI and IJCAI, using some changes 37 | % prepared by Fernando Pereira and others and some minor changes 38 | % by Paul Jacobs. 39 | 40 | % Papers prepared using the aclsub.sty file and acl.bst bibtex style 41 | % should be easily converted to final format using this style. 42 | % (1) Submission information (\wordcount, \subject, and \makeidpage) 43 | % should be removed. 44 | % (2) \summary should be removed. The summary material should come 45 | % after \maketitle and should be in the ``abstract'' environment 46 | % (between \begin{abstract} and \end{abstract}). 47 | % (3) Check all citations. This style should handle citations correctly 48 | % and also allows multiple citations separated by semicolons. 49 | % (4) Check figures and examples. Because the final format is double- 50 | % column, some adjustments may have to be made to fit text in the column 51 | % or to choose full-width (\figure*} figures. 52 | 53 | % Place this in a file called aclap.sty in the TeX search path. 54 | % (Placing it in the same directory as the paper should also work.) 55 | 56 | % Prepared by Peter F. Patel-Schneider, liberally using the ideas of 57 | % other style hackers, including Barbara Beeton. 58 | % This style is NOT guaranteed to work. It is provided in the hope 59 | % that it will make the preparation of papers easier. 60 | % 61 | % There are undoubtably bugs in this style. 
If you make bug fixes, 62 | % improvements, etc. please let me know. My e-mail address is: 63 | % pfps@research.att.com 64 | 65 | % Papers are to be prepared using the ``acl'' bibliography style, 66 | % as follows: 67 | % \documentclass[11pt]{article} 68 | % \usepackage{acl2000} 69 | % \title{Title} 70 | % \author{Author 1 \and Author 2 \\ Address line \\ Address line \And 71 | % Author 3 \\ Address line \\ Address line} 72 | % \begin{document} 73 | % ... 74 | % \bibliography{bibliography-file} 75 | % \bibliographystyle{acl} 76 | % \end{document} 77 | 78 | % Author information can be set in various styles: 79 | % For several authors from the same institution: 80 | % \author{Author 1 \and ... \and Author n \\ 81 | % Address line \\ ... \\ Address line} 82 | % if the names do not fit well on one line use 83 | % Author 1 \\ {\bf Author 2} \\ ... \\ {\bf Author n} \\ 84 | % For authors from different institutions: 85 | % \author{Author 1 \\ Address line \\ ... \\ Address line 86 | % \And ... \And 87 | % Author n \\ Address line \\ ... \\ Address line} 88 | % To start a seperate ``row'' of authors use \AND, as in 89 | % \author{Author 1 \\ Address line \\ ... \\ Address line 90 | % \AND 91 | % Author 2 \\ Address line \\ ... \\ Address line \And 92 | % Author 3 \\ Address line \\ ... \\ Address line} 93 | 94 | % If the title and author information does not fit in the area allocated, 95 | % place \setlength\titlebox{} right after 96 | % \usepackage{acl2015} 97 | % where can be something larger than 5cm 98 | 99 | \typeout{Conference Style for ACL 2015 -- released December 7, 2014} 100 | 101 | % NOTE: Some laser printers have a serious problem printing TeX output. 102 | % These printing devices, commonly known as ``write-white'' laser 103 | % printers, tend to make characters too light. To get around this 104 | % problem, a darker set of fonts must be created for these devices. 105 | % 106 | 107 | 108 | 109 | % A4 modified by Eneko; again modified by Alexander for 5cm titlebox 110 | \setlength{\paperwidth}{21cm} % A4 111 | \setlength{\paperheight}{29.7cm}% A4 112 | \setlength\topmargin{-0.5cm} 113 | \setlength\oddsidemargin{0cm} 114 | \setlength\textheight{24.7cm} 115 | \setlength\textwidth{16.0cm} 116 | \setlength\columnsep{0.6cm} 117 | \newlength\titlebox 118 | \setlength\titlebox{5cm} 119 | \setlength\headheight{5pt} 120 | \setlength\headsep{0pt} 121 | \thispagestyle{empty} 122 | \pagestyle{empty} 123 | 124 | 125 | \flushbottom \twocolumn \sloppy 126 | 127 | % We're never going to need a table of contents, so just flush it to 128 | % save space --- suggested by drstrip@sandia-2 129 | \def\addcontentsline#1#2#3{} 130 | 131 | % Title stuff, taken from deproc. 
132 | \def\maketitle{\par 133 | \begingroup 134 | \def\thefootnote{\fnsymbol{footnote}} 135 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} 136 | \twocolumn[\@maketitle] \@thanks 137 | \endgroup 138 | \setcounter{footnote}{0} 139 | \let\maketitle\relax \let\@maketitle\relax 140 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax} 141 | \def\@maketitle{\vbox to \titlebox{\hsize\textwidth 142 | \linewidth\hsize \vskip 0.125in minus 0.125in \centering 143 | {\Large\bf \@title \par} \vskip 0.2in plus 1fil minus 0.1in 144 | {\def\and{\unskip\enspace{\rm and}\enspace}% 145 | \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil 146 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf}% 147 | \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup 148 | \vskip 0.25in plus 1fil minus 0.125in 149 | \hbox to \linewidth\bgroup\large \hfil\hfil 150 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf} 151 | \hbox to \linewidth\bgroup\large \hfil\hfil 152 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\bf\@author 153 | \end{tabular}\hss\egroup 154 | \hfil\hfil\egroup} 155 | \vskip 0.3in plus 2fil minus 0.1in 156 | }} 157 | 158 | % margins for abstract 159 | \renewenvironment{abstract}% 160 | {\centerline{\large\bf Abstract}% 161 | \begin{list}{}% 162 | {\setlength{\rightmargin}{0.6cm}% 163 | \setlength{\leftmargin}{0.6cm}}% 164 | \item[]\ignorespaces}% 165 | {\unskip\end{list}} 166 | 167 | %\renewenvironment{abstract}{\centerline{\large\bf 168 | % Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex} 169 | 170 | 171 | % bibliography 172 | 173 | \def\thebibliography#1{\section*{References} 174 | \global\def\@listi{\leftmargin\leftmargini 175 | \labelwidth\leftmargini \advance\labelwidth-\labelsep 176 | \topsep 1pt plus 2pt minus 1pt 177 | \parsep 0.25ex plus 1pt \itemsep 0.25ex plus 1pt} 178 | \list {[\arabic{enumi}]}{\settowidth\labelwidth{[#1]}\leftmargin\labelwidth 179 | \advance\leftmargin\labelsep\usecounter{enumi}} 180 | \def\newblock{\hskip .11em plus .33em minus -.07em} 181 | \sloppy 182 | \sfcode`\.=1000\relax} 183 | 184 | \def\@up#1{\raise.2ex\hbox{#1}} 185 | 186 | % most of cite format is from aclsub.sty by SMS 187 | 188 | % don't box citations, separate with ; and a space 189 | % also, make the penalty between citations negative: a good place to break 190 | % changed comma back to semicolon pj 2/1/90 191 | % \def\@citex[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 192 | % \def\@citea{}\@cite{\@for\@citeb:=#2\do 193 | % {\@citea\def\@citea{;\penalty\@citeseppen\ }\@ifundefined 194 | % {b@\@citeb}{{\bf ?}\@warning 195 | % {Citation `\@citeb' on page \thepage \space undefined}}% 196 | % {\csname b@\@citeb\endcsname}}}{#1}} 197 | 198 | % don't box citations, separate with ; and a space 199 | % Replaced for multiple citations (pj) 200 | % don't box citations and also add space, semicolon between multiple citations 201 | \def\@citex[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 202 | \def\@citea{}\@cite{\@for\@citeb:=#2\do 203 | {\@citea\def\@citea{; }\@ifundefined 204 | {b@\@citeb}{{\bf ?}\@warning 205 | {Citation `\@citeb' on page \thepage \space undefined}}% 206 | {\csname b@\@citeb\endcsname}}}{#1}} 207 | 208 | % Allow short (name-less) citations, when used in 209 | % conjunction with a bibliography style that creates labels like 210 | % \citename{, } 211 | % 212 | \let\@internalcite\cite 213 | \def\cite{\def\citename##1{##1, }\@internalcite} 214 | \def\shortcite{\def\citename##1{}\@internalcite} 215 | 
\def\newcite{\def\citename##1{{\frenchspacing##1} (}\@internalciteb} 216 | 217 | % Macros for \newcite, which leaves name in running text, and is 218 | % otherwise like \shortcite. 219 | \def\@citexb[#1]#2{\if@filesw\immediate\write\@auxout{\string\citation{#2}}\fi 220 | \def\@citea{}\@newcite{\@for\@citeb:=#2\do 221 | {\@citea\def\@citea{;\penalty\@m\ }\@ifundefined 222 | {b@\@citeb}{{\bf ?}\@warning 223 | {Citation `\@citeb' on page \thepage \space undefined}}% 224 | {\csname b@\@citeb\endcsname}}}{#1}} 225 | \def\@internalciteb{\@ifnextchar [{\@tempswatrue\@citexb}{\@tempswafalse\@citexb[]}} 226 | 227 | \def\@newcite#1#2{{#1\if@tempswa, #2\fi)}} 228 | 229 | \def\@biblabel#1{\def\citename##1{##1}[#1]\hfill} 230 | 231 | %%% More changes made by SMS (originals in latex.tex) 232 | % Use parentheses instead of square brackets in the text. 233 | \def\@cite#1#2{({#1\if@tempswa , #2\fi})} 234 | 235 | % Don't put a label in the bibliography at all. Just use the unlabeled format 236 | % instead. 237 | \def\thebibliography#1{\vskip\parskip% 238 | \vskip\baselineskip% 239 | \def\baselinestretch{1}% 240 | \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi% 241 | \vskip-\parskip% 242 | \vskip-\baselineskip% 243 | \section*{References\@mkboth 244 | {References}{References}}\list 245 | {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent} 246 | \setlength{\itemindent}{-\parindent}} 247 | \def\newblock{\hskip .11em plus .33em minus -.07em} 248 | \sloppy\clubpenalty4000\widowpenalty4000 249 | \sfcode`\.=1000\relax} 250 | \let\endthebibliography=\endlist 251 | 252 | % Allow for a bibliography of sources of attested examples 253 | \def\thesourcebibliography#1{\vskip\parskip% 254 | \vskip\baselineskip% 255 | \def\baselinestretch{1}% 256 | \ifx\@currsize\normalsize\@normalsize\else\@currsize\fi% 257 | \vskip-\parskip% 258 | \vskip-\baselineskip% 259 | \section*{Sources of Attested Examples\@mkboth 260 | {Sources of Attested Examples}{Sources of Attested Examples}}\list 261 | {}{\setlength{\labelwidth}{0pt}\setlength{\leftmargin}{\parindent} 262 | \setlength{\itemindent}{-\parindent}} 263 | \def\newblock{\hskip .11em plus .33em minus -.07em} 264 | \sloppy\clubpenalty4000\widowpenalty4000 265 | \sfcode`\.=1000\relax} 266 | \let\endthesourcebibliography=\endlist 267 | 268 | \def\@lbibitem[#1]#2{\item[]\if@filesw 269 | { \def\protect##1{\string ##1\space}\immediate 270 | \write\@auxout{\string\bibcite{#2}{#1}}\fi\ignorespaces}} 271 | 272 | \def\@bibitem#1{\item\if@filesw \immediate\write\@auxout 273 | {\string\bibcite{#1}{\the\c@enumi}}\fi\ignorespaces} 274 | 275 | % sections with less space 276 | \def\section{\@startsection {section}{1}{\z@}{-2.0ex plus 277 | -0.5ex minus -.2ex}{1.5ex plus 0.3ex minus .2ex}{\large\bf\raggedright}} 278 | \def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus 279 | -0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright}} 280 | %% changed by KO to - values to get teh initial parindent right 281 | \def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex plus 282 | -0.5ex minus -.2ex}{0.5ex plus .2ex}{\normalsize\bf\raggedright}} 283 | \def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus 284 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 285 | \def\subparagraph{\@startsection{subparagraph}{5}{\parindent}{1.5ex plus 286 | 0.5ex minus .2ex}{-1em}{\normalsize\bf}} 287 | 288 | % Footnotes 289 | \footnotesep 6.65pt % 290 | \skip\footins 9pt plus 4pt minus 2pt 291 | \def\footnoterule{\kern-3pt \hrule width 5pc \kern 2.6pt } 292 | 
\setcounter{footnote}{0} 293 | 294 | % Lists and paragraphs 295 | \parindent 1em 296 | \topsep 4pt plus 1pt minus 2pt 297 | \partopsep 1pt plus 0.5pt minus 0.5pt 298 | \itemsep 2pt plus 1pt minus 0.5pt 299 | \parsep 2pt plus 1pt minus 0.5pt 300 | 301 | \leftmargin 2em \leftmargini\leftmargin \leftmarginii 2em 302 | \leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em \leftmarginvi .5em 303 | \labelwidth\leftmargini\advance\labelwidth-\labelsep \labelsep 5pt 304 | 305 | \def\@listi{\leftmargin\leftmargini} 306 | \def\@listii{\leftmargin\leftmarginii 307 | \labelwidth\leftmarginii\advance\labelwidth-\labelsep 308 | \topsep 2pt plus 1pt minus 0.5pt 309 | \parsep 1pt plus 0.5pt minus 0.5pt 310 | \itemsep \parsep} 311 | \def\@listiii{\leftmargin\leftmarginiii 312 | \labelwidth\leftmarginiii\advance\labelwidth-\labelsep 313 | \topsep 1pt plus 0.5pt minus 0.5pt 314 | \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt 315 | \itemsep \topsep} 316 | \def\@listiv{\leftmargin\leftmarginiv 317 | \labelwidth\leftmarginiv\advance\labelwidth-\labelsep} 318 | \def\@listv{\leftmargin\leftmarginv 319 | \labelwidth\leftmarginv\advance\labelwidth-\labelsep} 320 | \def\@listvi{\leftmargin\leftmarginvi 321 | \labelwidth\leftmarginvi\advance\labelwidth-\labelsep} 322 | 323 | \abovedisplayskip 7pt plus2pt minus5pt% 324 | \belowdisplayskip \abovedisplayskip 325 | \abovedisplayshortskip 0pt plus3pt% 326 | \belowdisplayshortskip 4pt plus3pt minus3pt% 327 | 328 | % Less leading in most fonts (due to the narrow columns) 329 | % The choices were between 1-pt and 1.5-pt leading 330 | \def\@normalsize{\@setsize\normalsize{11pt}\xpt\@xpt} 331 | \def\small{\@setsize\small{10pt}\ixpt\@ixpt} 332 | \def\footnotesize{\@setsize\footnotesize{10pt}\ixpt\@ixpt} 333 | \def\scriptsize{\@setsize\scriptsize{8pt}\viipt\@viipt} 334 | \def\tiny{\@setsize\tiny{7pt}\vipt\@vipt} 335 | \def\large{\@setsize\large{14pt}\xiipt\@xiipt} 336 | \def\Large{\@setsize\Large{16pt}\xivpt\@xivpt} 337 | \def\LARGE{\@setsize\LARGE{20pt}\xviipt\@xviipt} 338 | \def\huge{\@setsize\huge{23pt}\xxpt\@xxpt} 339 | \def\Huge{\@setsize\Huge{28pt}\xxvpt\@xxvpt} 340 | -------------------------------------------------------------------------------- /tex/acl2015.tex: -------------------------------------------------------------------------------- 1 | % 2 | % File acl2015.tex 3 | % 4 | % Contact: car@ir.hit.edu.cn, gdzhou@suda.edu.cn 5 | %% 6 | %% Based on the style files for ACL-2014, which were, in turn, 7 | %% Based on the style files for ACL-2013, which were, in turn, 8 | %% Based on the style files for ACL-2012, which were, in turn, 9 | %% based on the style files for ACL-2011, which were, in turn, 10 | %% based on the style files for ACL-2010, which were, in turn, 11 | %% based on the style files for ACL-IJCNLP-2009, which were, in turn, 12 | %% based on the style files for EACL-2009 and IJCNLP-2008... 13 | 14 | %% Based on the style files for EACL 2006 by 15 | %%e.agirre@ehu.es or Sergi.Balari@uab.es 16 | %% and that of ACL 08 by Joakim Nivre and Noah Smith 17 | 18 | \documentclass[11pt]{article} 19 | \usepackage{acl2015} 20 | \usepackage{times} 21 | \usepackage{url} 22 | \usepackage{latexsym} 23 | 24 | %\setlength\titlebox{5cm} 25 | 26 | % You can expand the titlebox if you need extra space 27 | % to show all the authors. Please do not make the titlebox 28 | % smaller than 5cm (the original size); we will check this 29 | % in the camera-ready version and ask you to change it back. 
30 | 31 | 32 | \title{Instructions for ACL-2015 Proceedings} 33 | 34 | \author{First Author \\ 35 | Affiliation / Address line 1 \\ 36 | Affiliation / Address line 2 \\ 37 | Affiliation / Address line 3 \\ 38 | {\tt email@domain} \\\And 39 | Second Author \\ 40 | Affiliation / Address line 1 \\ 41 | Affiliation / Address line 2 \\ 42 | Affiliation / Address line 3 \\ 43 | {\tt email@domain} \\} 44 | 45 | \date{} 46 | 47 | \begin{document} 48 | \maketitle 49 | \begin{abstract} 50 | This document contains the instructions for preparing a camera-ready 51 | manuscript for the proceedings of ACL-2015. The document itself 52 | conforms to its own specifications, and is therefore an example of 53 | what your manuscript should look like. These instructions should be 54 | used for both papers submitted for review and for final versions of 55 | accepted papers. Authors are asked to conform to all the directions 56 | reported in this document. 57 | \end{abstract} 58 | 59 | \section{Credits} 60 | 61 | This document has been adapted from the instructions for earlier ACL 62 | proceedings, including those for ACL-2012 by Maggie Li and Michael 63 | White, those from ACL-2010 by Jing-Shing Chang and Philipp Koehn, 64 | those for ACL-2008 by Johanna D. Moore, Simone Teufel, James Allan, 65 | and Sadaoki Furui, those for ACL-2005 by Hwee Tou Ng and Kemal 66 | Oflazer, those for ACL-2002 by Eugene Charniak and Dekang Lin, and 67 | earlier ACL and EACL formats. Those versions were written by several 68 | people, including John Chen, Henry S. Thompson and Donald 69 | Walker. Additional elements were taken from the formatting 70 | instructions of the {\em International Joint Conference on Artificial 71 | Intelligence}. 72 | 73 | \section{Introduction} 74 | 75 | The following instructions are directed to authors of papers submitted 76 | to ACL-2015 or accepted for publication in its proceedings. All 77 | authors are required to adhere to these specifications. Authors are 78 | required to provide a Portable Document Format (PDF) version of their 79 | papers. \textbf{The proceedings are designed for printing on A4 80 | paper.} 81 | 82 | We will make more detailed instructions available at 83 | \url{http://acl2015.org/publication.html}. Please check this website 84 | regularly. 85 | 86 | 87 | \section{General Instructions} 88 | 89 | Manuscripts must be in two-column format. Exceptions to the 90 | two-column format include the title, authors' names and complete 91 | addresses, which must be centered at the top of the first page, and 92 | any full-width figures or tables (see the guidelines in 93 | Subsection~\ref{ssec:first}). {\bf Type single-spaced.} Start all 94 | pages directly under the top margin. See the guidelines later 95 | regarding formatting the first page. The manuscript should be 96 | printed single-sided and its length 97 | should not exceed the maximum page limit described in Section~\ref{sec:length}. 98 | Do not number the pages. 99 | 100 | 101 | \subsection{Electronically-available resources} 102 | 103 | We strongly prefer that you prepare your PDF files using \LaTeX\ with 104 | the official ACL 2015 style file (acl2015.sty) and bibliography style 105 | (acl.bst). These files are available at 106 | \url{http://acl2015.org}. You will also find the document 107 | you are currently reading (acl2015.pdf) and its \LaTeX\ source code 108 | (acl2015.tex) on this website. 109 | 110 | You can alternatively use Microsoft Word to produce your PDF file. 
In 111 | this case, we strongly recommend the use of the Word template file 112 | (acl2015.dot) on the ACL 2015 website (\url{http://acl2015.org}). 113 | If you have an option, we recommend that you use the \LaTeX2e version. 114 | If you will be using the Microsoft Word template, we suggest that you 115 | anonymize your source file so that the pdf produced does not retain your 116 | identity. This can be done by removing any personal information 117 | from your source document properties. 118 | 119 | 120 | 121 | \subsection{Format of Electronic Manuscript} 122 | \label{sect:pdf} 123 | 124 | For the production of the electronic manuscript you must use Adobe's 125 | Portable Document Format (PDF). PDF files are usually produced from 126 | \LaTeX\ using the \textit{pdflatex} command. If your version of 127 | \LaTeX\ produces Postscript files, you can convert these into PDF 128 | using \textit{ps2pdf} or \textit{dvipdf}. On Windows, you can also use 129 | Adobe Distiller to generate PDF. 130 | 131 | Please make sure that your PDF file includes all the necessary fonts 132 | (especially tree diagrams, symbols, and fonts with Asian 133 | characters). When you print or create the PDF file, there is usually 134 | an option in your printer setup to include none, all or just 135 | non-standard fonts. Please make sure that you select the option of 136 | including ALL the fonts. \textbf{Before sending it, test your PDF by 137 | printing it from a computer different from the one where it was 138 | created.} Moreover, some word processors may generate very large PDF 139 | files, where each page is rendered as an image. Such images may 140 | reproduce poorly. In this case, try alternative ways to obtain the 141 | PDF. One way on some systems is to install a driver for a postscript 142 | printer, send your document to the printer specifying ``Output to a 143 | file'', then convert the file to PDF. 144 | 145 | It is of utmost importance to specify the \textbf{A4 format} (21 cm 146 | x 29.7 cm) when formatting the paper. When working with 147 | {\tt dvips}, for instance, one should specify {\tt -t a4}. 148 | Or using the command \verb|\special{papersize=210mm,297mm}| in the latex 149 | preamble (directly below the \verb|\usepackage| commands). Then using 150 | {\tt dvipdf} and/or {\tt pdflatex} which would make it easier for some. 151 | 152 | 153 | Print-outs of the PDF file on A4 paper should be identical to the 154 | hardcopy version. If you cannot meet the above requirements about the 155 | production of your electronic submission, please contact the 156 | publication chairs as soon as possible. 157 | 158 | 159 | \subsection{Layout} 160 | \label{ssec:layout} 161 | 162 | Format manuscripts two columns to a page, in the manner these 163 | instructions are formatted. The exact dimensions for a page on A4 164 | paper are: 165 | 166 | \begin{itemize} 167 | \item Left and right margins: 2.5 cm 168 | \item Top margin: 2.5 cm 169 | \item Bottom margin: 2.5 cm 170 | \item Column width: 7.7 cm 171 | \item Column height: 24.7 cm 172 | \item Gap between columns: 0.6 cm 173 | \end{itemize} 174 | 175 | \noindent Papers should not be submitted on any other paper size. 176 | If you cannot meet the above requirements about the production of 177 | your electronic submission, please contact the publication chairs 178 | above as soon as possible. 179 | 180 | 181 | \subsection{Fonts} 182 | 183 | For reasons of uniformity, Adobe's {\bf Times Roman} font should be 184 | used. 
In \LaTeX2e{} this is accomplished by putting 185 | 186 | \begin{quote} 187 | \begin{verbatim} 188 | \usepackage{times} 189 | \usepackage{latexsym} 190 | \end{verbatim} 191 | \end{quote} 192 | in the preamble. If Times Roman is unavailable, use {\bf Computer 193 | Modern Roman} (\LaTeX2e{}'s default). Note that the latter is about 194 | 10\% less dense than Adobe's Times Roman font. 195 | 196 | 197 | \begin{table}[h] 198 | \begin{center} 199 | \begin{tabular}{|l|rl|} 200 | \hline \bf Type of Text & \bf Font Size & \bf Style \\ \hline 201 | paper title & 15 pt & bold \\ 202 | author names & 12 pt & bold \\ 203 | author affiliation & 12 pt & \\ 204 | the word ``Abstract'' & 12 pt & bold \\ 205 | section titles & 12 pt & bold \\ 206 | document text & 11 pt &\\ 207 | captions & 11 pt & \\ 208 | abstract text & 10 pt & \\ 209 | bibliography & 10 pt & \\ 210 | footnotes & 9 pt & \\ 211 | \hline 212 | \end{tabular} 213 | \end{center} 214 | \caption{\label{font-table} Font guide. } 215 | \end{table} 216 | 217 | \subsection{The First Page} 218 | \label{ssec:first} 219 | 220 | Center the title, author's name(s) and affiliation(s) across both 221 | columns. Do not use footnotes for affiliations. Do not include the 222 | paper ID number assigned during the submission process. Use the 223 | two-column format only when you begin the abstract. 224 | 225 | {\bf Title}: Place the title centered at the top of the first page, in 226 | a 15-point bold font. (For a complete guide to font sizes and styles, 227 | see Table~\ref{font-table}) Long titles should be typed on two lines 228 | without a blank line intervening. Approximately, put the title at 2.5 229 | cm from the top of the page, followed by a blank line, then the 230 | author's names(s), and the affiliation on the following line. Do not 231 | use only initials for given names (middle initials are allowed). Do 232 | not format surnames in all capitals (e.g., use ``Schlangen'' not 233 | ``SCHLANGEN''). Do not format title and section headings in all 234 | capitals as well except for proper names (such as ``BLEU'') that are 235 | conventionally in all capitals. The affiliation should contain the 236 | author's complete address, and if possible, an electronic mail 237 | address. Start the body of the first page 7.5 cm from the top of the 238 | page. 239 | 240 | The title, author names and addresses should be completely identical 241 | to those entered to the electronical paper submission website in order 242 | to maintain the consistency of author information among all 243 | publications of the conference. If they are different, the publication 244 | chairs may resolve the difference without consulting with you; so it 245 | is in your own interest to double-check that the information is 246 | consistent. 247 | 248 | {\bf Abstract}: Type the abstract at the beginning of the first 249 | column. The width of the abstract text should be smaller than the 250 | width of the columns for the text in the body of the paper by about 251 | 0.6 cm on each side. Center the word {\bf Abstract} in a 12 point bold 252 | font above the body of the abstract. The abstract should be a concise 253 | summary of the general thesis and conclusions of the paper. It should 254 | be no longer than 200 words. The abstract text should be in 10 point font. 255 | 256 | {\bf Text}: Begin typing the main body of the text immediately after 257 | the abstract, observing the two-column format as shown in 258 | the present document. Do not include page numbers. 
259 | 260 | {\bf Indent} when starting a new paragraph. Use 11 points for text and 261 | subsection headings, 12 points for section headings and 15 points for 262 | the title. 263 | 264 | \subsection{Sections} 265 | 266 | {\bf Headings}: Type and label section and subsection headings in the 267 | style shown on the present document. Use numbered sections (Arabic 268 | numerals) in order to facilitate cross references. Number subsections 269 | with the section number and the subsection number separated by a dot, 270 | in Arabic numerals. Do not number subsubsections. 271 | 272 | {\bf Citations}: Citations within the text appear in parentheses 273 | as~\cite{Gusfield:97} or, if the author's name appears in the text 274 | itself, as Gusfield~\shortcite{Gusfield:97}. Append lowercase letters 275 | to the year in cases of ambiguity. Treat double authors as 276 | in~\cite{Aho:72}, but write as in~\cite{Chandra:81} when more than two 277 | authors are involved. Collapse multiple citations as 278 | in~\cite{Gusfield:97,Aho:72}. Also refrain from using full citations 279 | as sentence constituents. We suggest that instead of 280 | \begin{quote} 281 | ``\cite{Gusfield:97} showed that ...'' 282 | \end{quote} 283 | you use 284 | \begin{quote} 285 | ``Gusfield \shortcite{Gusfield:97} showed that ...'' 286 | \end{quote} 287 | 288 | If you are using the provided \LaTeX{} and Bib\TeX{} style files, you 289 | can use the command \verb|\newcite| to get ``author (year)'' citations. 290 | 291 | As reviewing will be double-blind, the submitted version of the papers 292 | should not include the authors' names and affiliations. Furthermore, 293 | self-references that reveal the author's identity, e.g., 294 | \begin{quote} 295 | ``We previously showed \cite{Gusfield:97} ...'' 296 | \end{quote} 297 | should be avoided. Instead, use citations such as 298 | \begin{quote} 299 | ``Gusfield \shortcite{Gusfield:97} 300 | previously showed ... '' 301 | \end{quote} 302 | 303 | \textbf{Please do not use anonymous citations} and do not include 304 | acknowledgements when submitting your papers. Papers that do not 305 | conform to these requirements may be rejected without review. 306 | 307 | \textbf{References}: Gather the full set of references together under 308 | the heading {\bf References}; place the section before any Appendices, 309 | unless they contain references. Arrange the references alphabetically 310 | by first author, rather than by order of occurrence in the text. 311 | Provide as complete a citation as possible, using a consistent format, 312 | such as the one for {\em Computational Linguistics\/} or the one in the 313 | {\em Publication Manual of the American 314 | Psychological Association\/}~\cite{APA:83}. Use of full names for 315 | authors rather than initials is preferred. A list of abbreviations 316 | for common computer science journals can be found in the ACM 317 | {\em Computing Reviews\/}~\cite{ACM:83}. 318 | 319 | The \LaTeX{} and Bib\TeX{} style files provided roughly fit the 320 | American Psychological Association format, allowing regular citations, 321 | short citations and multiple citations as described above. 322 | 323 | {\bf Appendices}: Appendices, if any, directly follow the text and the 324 | references (but see above). Letter them in sequence and provide an 325 | informative title: {\bf Appendix A. Title of Appendix}. 326 | 327 | \subsection{Footnotes} 328 | 329 | {\bf Footnotes}: Put footnotes at the bottom of the page and use 9 330 | points text. 
They may be numbered or referred to by asterisks or other 331 | symbols.\footnote{This is how a footnote should appear.} Footnotes 332 | should be separated from the text by a line.\footnote{Note the line 333 | separating the footnotes from the text.} 334 | 335 | \subsection{Graphics} 336 | 337 | {\bf Illustrations}: Place figures, tables, and photographs in the 338 | paper near where they are first discussed, rather than at the end, if 339 | possible. Wide illustrations may run across both columns. Color 340 | illustrations are discouraged, unless you have verified that 341 | they will be understandable when printed in black ink. 342 | 343 | {\bf Captions}: Provide a caption for every illustration; number each one 344 | sequentially in the form: ``Figure 1. Caption of the Figure.'' ``Table 1. 345 | Caption of the Table.'' Type the captions of the figures and 346 | tables below the body, using 11 point text. 347 | 348 | 349 | \section{XML conversion and supported \LaTeX\ packages} 350 | 351 | Following ACL 2014 we will also we will attempt to automatically convert 352 | your \LaTeX\ source files to publish papers in machine-readable 353 | XML with semantic markup in the ACL Anthology, in addition to the 354 | traditional PDF format. This will allow us to create, over the next 355 | few years, a growing corpus of scientific text for our own future research, 356 | and picks up on recent initiatives on converting ACL papers from earlier 357 | years to XML. 358 | 359 | We encourage you to submit a ZIP file of your \LaTeX\ sources along 360 | with the camera-ready version of your paper. We will then convert them 361 | to XML automatically, using the LaTeXML tool 362 | (\url{http://dlmf.nist.gov/LaTeXML}). LaTeXML has \emph{bindings} for 363 | a number of \LaTeX\ packages, including the ACL 2015 stylefile. These 364 | bindings allow LaTeXML to render the commands from these packages 365 | correctly in XML. For best results, we encourage you to use the 366 | packages that are officially supported by LaTeXML, listed at 367 | \url{http://dlmf.nist.gov/LaTeXML/manual/included.bindings} 368 | 369 | 370 | 371 | 372 | 373 | \section{Translation of non-English Terms} 374 | 375 | It is also advised to supplement non-English characters and terms 376 | with appropriate transliterations and/or translations 377 | since not all readers understand all such characters and terms. 378 | Inline transliteration or translation can be represented in 379 | the order of: original-form transliteration ``translation''. 380 | 381 | \section{Length of Submission} 382 | \label{sec:length} 383 | 384 | Long papers may consist of up to 8 pages of content, plus two extra 385 | pages for references. Short papers may consist of up to 4 pages of 386 | content, plus two extra pages for references. Papers that do not 387 | conform to the specified length and formatting requirements may be 388 | rejected without review. 389 | 390 | 391 | 392 | \section*{Acknowledgments} 393 | 394 | The acknowledgments should go immediately before the references. Do 395 | not number the acknowledgments section. Do not include this section 396 | when submitting your paper for review. 397 | 398 | % include your own bib file like this: 399 | %\bibliographystyle{acl} 400 | %\bibliography{acl2015} 401 | 402 | \begin{thebibliography}{} 403 | 404 | \bibitem[\protect\citename{Aho and Ullman}1972]{Aho:72} 405 | Alfred~V. Aho and Jeffrey~D. Ullman. 406 | \newblock 1972. 407 | \newblock {\em The Theory of Parsing, Translation and Compiling}, volume~1. 
408 | \newblock Prentice-{Hall}, Englewood Cliffs, NJ. 409 | 410 | \bibitem[\protect\citename{{American Psychological Association}}1983]{APA:83} 411 | {American Psychological Association}. 412 | \newblock 1983. 413 | \newblock {\em Publications Manual}. 414 | \newblock American Psychological Association, Washington, DC. 415 | 416 | \bibitem[\protect\citename{{Association for Computing Machinery}}1983]{ACM:83} 417 | {Association for Computing Machinery}. 418 | \newblock 1983. 419 | \newblock {\em Computing Reviews}, 24(11):503--512. 420 | 421 | \bibitem[\protect\citename{Chandra \bgroup et al.\egroup }1981]{Chandra:81} 422 | Ashok~K. Chandra, Dexter~C. Kozen, and Larry~J. Stockmeyer. 423 | \newblock 1981. 424 | \newblock Alternation. 425 | \newblock {\em Journal of the Association for Computing Machinery}, 426 | 28(1):114--133. 427 | 428 | \bibitem[\protect\citename{Gusfield}1997]{Gusfield:97} 429 | Dan Gusfield. 430 | \newblock 1997. 431 | \newblock {\em Algorithms on Strings, Trees and Sequences}. 432 | \newblock Cambridge University Press, Cambridge, UK. 433 | 434 | \end{thebibliography} 435 | 436 | \end{document} 437 | -------------------------------------------------------------------------------- /tex/deepir.bbl: -------------------------------------------------------------------------------- 1 | \begin{thebibliography}{} 2 | 3 | \bibitem[\protect\citename{Besag}1974]{besag_spatial_1974} 4 | Julian Besag. 5 | \newblock 1974. 6 | \newblock Spatial interaction and the statistical analysis of lattice systems. 7 | \newblock {\em Journal of the Royal Statistical Society, Series B}. 8 | 9 | \bibitem[\protect\citename{Besag}1975]{besag1975statistical} 10 | Julian Besag. 11 | \newblock 1975. 12 | \newblock Statistical analysis of non-lattice data. 13 | \newblock {\em The Statistician}, pages 179--195. 14 | 15 | \bibitem[\protect\citename{Flynn \bgroup et al.\egroup 16 | }2013]{flynn_efficiency_2013} 17 | Cheryl Flynn, Clifford Hurvich, and Jefferey Simonoff. 18 | \newblock 2013. 19 | \newblock Efficiency for {Regularization} {Parameter} {Selection} in 20 | {Penalized} {Likelihood} {Estimation} of {Misspecified} {Models}. 21 | \newblock {\em Journal of the American Statistical Association}, 22 | 108:1031--1043. 23 | 24 | \bibitem[\protect\citename{Jernite \bgroup et al.\egroup }2015]{jernite2015mrf} 25 | Yacine Jernite, Alexander Rush, and David Sontag. 26 | \newblock 2015. 27 | \newblock A fast variational approach for learning {M}arkov random field 28 | language models. 29 | \newblock In {\em Proceedings of the 32nd International Conference on Machine 30 | Learning (ICML 2015)}. 31 | 32 | \bibitem[\protect\citename{Le and Mikolov}2014]{le_distributed_2014} 33 | Quoc~V. Le and Tomas Mikolov. 34 | \newblock 2014. 35 | \newblock Distributed representations of sentences and documents. 36 | \newblock In {\em Proceedings of the 31 st {International} {Conference} on 37 | {Machine} {Learning}}. 38 | 39 | \bibitem[\protect\citename{Mikolov \bgroup et al.\egroup 40 | }2013a]{mikolov2013efficient} 41 | Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 42 | \newblock 2013a. 43 | \newblock Efficient estimation of word representations in vector space. 44 | \newblock {\em arXiv preprint arXiv:1301.3781}. 45 | 46 | \bibitem[\protect\citename{Mikolov \bgroup et al.\egroup 47 | }2013b]{mikolov_distributed_2013} 48 | Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg~S. Corrado, and Jeff Dean. 49 | \newblock 2013b. 
50 | \newblock Distributed representations of words and phrases and their 51 | compositionality. 52 | \newblock In {\em Advances in {Neural} {Information} {Processing} {Systems}}, 53 | pages 3111--3119. 54 | 55 | \bibitem[\protect\citename{Molenberghs and Verbeke}2006]{molenberghs2006models} 56 | Geert Molenberghs and Geert Verbeke. 57 | \newblock 2006. 58 | \newblock {\em Models for discrete longitudinal data}. 59 | \newblock Springer Science \& Business Media. 60 | 61 | \bibitem[\protect\citename{Morin and Bengio}2005]{morin_hierarchical_2005} 62 | Frederic Morin and Yoshua Bengio. 63 | \newblock 2005. 64 | \newblock Hierarchical probabilistic neural network language model. 65 | \newblock In {\em Proceedings of the {International} {Workshop} on {Artificial} 66 | {Intelligence} and {Statistics}}, pages 246--252. 67 | 68 | \bibitem[\protect\citename{Ng and Jordan}2002]{ng_discriminative_2002} 69 | Andrew~Y. Ng and Michael~I. Jordan. 70 | \newblock 2002. 71 | \newblock On {Discriminative} vs {Generative} {Classifiers}: {A} {Comparison} 72 | of {Logistic} {Regression} and naive {Bayes}. 73 | \newblock In {\em Advances in {Neural} {Information} {Processing} {Systems} 74 | ({NIPS})}. 75 | 76 | \bibitem[\protect\citename{Pennington \bgroup et al.\egroup 77 | }2014]{pennington_glove:_2014} 78 | Jeffrey Pennington, Richard Socher, and Christopher~D. Manning. 79 | \newblock 2014. 80 | \newblock Glove: {Global} vectors for word representation. 81 | \newblock {\em Proceedings of the Empiricial Methods in Natural Language 82 | Processing (EMNLP 2014)}, 12. 83 | 84 | \bibitem[\protect\citename{Rehurek and Sojka}2010]{rehurek_software_2010} 85 | Radim Rehurek and Petr Sojka. 86 | \newblock 2010. 87 | \newblock Software {Framework} for {Topic} {Modelling} with {Large} {Corpora}. 88 | \newblock In {\em Proceedings of the {LREC} 2010 {Workshop} on {New} 89 | {Challenges} for {NLP} {Frameworks}}, pages 45--50. 90 | 91 | \bibitem[\protect\citename{Rumelhart \bgroup et al.\egroup 92 | }1986]{rumelhart_learning_1986} 93 | David Rumelhart, Geoffrey Hinton, and Ronald Williams. 94 | \newblock 1986. 95 | \newblock Learning representations by back-propagating errors. 96 | \newblock {\em Nature}, 323:533--536. 97 | 98 | \bibitem[\protect\citename{Socher \bgroup et al.\egroup 99 | }2011]{socher_parsing_2011} 100 | Richard Socher, Cliff~C. Lin, Chris Manning, and Andrew~Y. Ng. 101 | \newblock 2011. 102 | \newblock Parsing natural scenes and natural language with recursive neural 103 | networks. 104 | \newblock In {\em Proceedings of the 28th international conference on machine 105 | learning ({ICML}-11)}, pages 129--136. 106 | 107 | \bibitem[\protect\citename{Socher \bgroup et al.\egroup 108 | }2013]{socher_recursive_2013} 109 | Richard Socher, Alex Perelygin, Jean~Y. Wu, Jason Chuang, Christopher~D. 110 | Manning, Andrew~Y. Ng, and Christopher Potts. 111 | \newblock 2013. 112 | \newblock Recursive deep models for semantic compositionality over a sentiment 113 | treebank. 114 | \newblock In {\em Proceedings of the conference on empirical methods in natural 115 | language processing ({EMNLP})}, volume 1631, page 1642. 116 | 117 | \bibitem[\protect\citename{Taddy}2013a]{taddy_measuring_2013} 118 | Matt Taddy. 119 | \newblock 2013a. 120 | \newblock Measuring {Political} {Sentiment} on {Twitter}: {Factor} {Optimal} 121 | {Design} for {Multinomial} {Inverse} {Regression}. 122 | \newblock {\em Technometrics}, 55(4):415--425, November. 
123 | 124 | \bibitem[\protect\citename{Taddy}2013b]{taddy_multinomial_2013} 125 | Matt Taddy. 126 | \newblock 2013b. 127 | \newblock Multinomial {Inverse} {Regression} for {Text} {Analysis}. 128 | \newblock {\em Journal of the American Statistical Association}, 108:755--770. 129 | 130 | \bibitem[\protect\citename{Taddy}2013c]{taddy_rejoinder:_2013} 131 | Matt Taddy. 132 | \newblock 2013c. 133 | \newblock Rejoinder: {Efficiency} and structure in {MNIR}. 134 | \newblock {\em Journal of the American Statistical Association}, 108:772--774. 135 | 136 | \bibitem[\protect\citename{Taddy}2014]{taddy_one-step_2014} 137 | Matt Taddy. 138 | \newblock 2014. 139 | \newblock One-step estimator paths for concave regularization. 140 | \newblock arXiv:1308.5623. 141 | 142 | \bibitem[\protect\citename{Taddy}2015]{taddy_distributed_2015} 143 | Matt Taddy. 144 | \newblock 2015. 145 | \newblock Distributed {Multinomial} {Regression}. 146 | \newblock {\em Annals of Applied Statistics}, To appear. 147 | 148 | \bibitem[\protect\citename{Varin \bgroup et al.\egroup 149 | }2011]{varin2011overview} 150 | Cristiano Varin, Nancy Reid, and David Firth. 151 | \newblock 2011. 152 | \newblock An overview of composite likelihood methods. 153 | \newblock {\em Statistica Sinica}, 21(1):5--42. 154 | 155 | \end{thebibliography} 156 | -------------------------------------------------------------------------------- /tex/deepir.bib: -------------------------------------------------------------------------------- 1 | @book{molenberghs2006models, 2 | title={Models for discrete longitudinal data}, 3 | author={Molenberghs, Geert and Verbeke, Geert}, 4 | year={2006}, 5 | publisher={Springer Science \& Business Media} 6 | } 7 | 8 | @article{mikolov2013efficient, 9 | title={Efficient estimation of word representations in vector space}, 10 | author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 11 | journal={arXiv preprint arXiv:1301.3781}, 12 | year={2013} 13 | } 14 | 15 | @article{besag_spatial_1974, 16 | title = {Spatial Interaction and the Statistical Analysis of Lattice Systems}, 17 | journal = {Journal of the Royal Statistical Society, Series B}, 18 | author = {Besag, Julian}, 19 | year = {1974} 20 | } 21 | 22 | @article{besag1975statistical, 23 | title={Statistical analysis of non-lattice data}, 24 | author={Besag, Julian}, 25 | journal={The Statistician}, 26 | pages={179--195}, 27 | year={1975} 28 | } 29 | 30 | @inproceedings{jernite2015mrf, 31 | title = {A Fast Variational Approach for Learning {M}arkov Random Field Language Models}, 32 | author = {Yacine Jernite and Alexander Rush and David Sontag}, 33 | booktitle={Proceedings of the 32nd International Conference on Machine Learning (ICML 2015)}, 34 | year={2015} 35 | } 36 | 37 | @article{cox2004note, 38 | title={A note on pseudolikelihood constructed from marginal densities}, 39 | author={Cox, David R and Reid, Nancy}, 40 | journal={Biometrika}, 41 | volume={91}, 42 | number={3}, 43 | pages={729--737}, 44 | year={2004}, 45 | publisher={Biometrika Trust} 46 | } 47 | 48 | @book{verbeke2009linear, 49 | title={Linear mixed models for longitudinal data}, 50 | author={Verbeke, Geert and Molenberghs, Geert}, 51 | year={2009}, 52 | publisher={Springer Science \& Business Media} 53 | } 54 | 55 | @article{varin2011overview, 56 | title={An overview of composite likelihood methods}, 57 | author={Varin, Cristiano and Reid, Nancy and Firth, David}, 58 | journal={Statistica Sinica}, 59 | volume={21}, 60 | number={1}, 61 | pages={5--42}, 62 | year={2011} 63 | } 64 | 65 | 
@article{pang_opinion_2008, 66 | title = {Opinion {Mining} and {Sentiment} {Analysis}}, 67 | volume = {1-2}, 68 | journal = {Foundations and Trends in Information Retrieval}, 69 | author = {Pang, Bo and Lee, Lillian}, 70 | year = {2008}, 71 | pages = {1--135} 72 | } 73 | 74 | @article{efron_efficiency_1975, 75 | title = {The efficiency of logistic regression compared to normal discriminant analysis}, 76 | number = {70}, 77 | journal = {Journal of the American Statistical Association}, 78 | author = {Efron, Bradley}, 79 | year = {1975}, 80 | pages = {892--898} 81 | } 82 | 83 | @inproceedings{taddy_estimation_2012, 84 | title = {On {Estimation} and {Selection} for {Topic} {Models}}, 85 | booktitle = {Proceedings of the 15th {International} {Conference} on {Artificial} {Intelligence} and {Statistics} ({AISTATS} 2012)}, 86 | author = {Taddy, Matt}, 87 | year = {2012} 88 | } 89 | 90 | @inproceedings{mikolov_distributed_2013, 91 | title = {Distributed representations of words and phrases and their compositionality}, 92 | url = {http://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality}, 93 | urldate = {2014-10-28}, 94 | booktitle = {Advances in {Neural} {Information} {Processing} {Systems}}, 95 | author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeff}, 96 | year = {2013}, 97 | pages = {3111--3119} 98 | } 99 | 100 | @article{taddy_rejoinder:_2013, 101 | title = {Rejoinder: {Efficiency} and structure in {MNIR}}, 102 | volume = {108}, 103 | journal = {Journal of the American Statistical Association}, 104 | author = {Taddy, Matt}, 105 | year = {2013}, 106 | pages = {772--774} 107 | } 108 | 109 | @article{taddy_multinomial_2013, 110 | title = {Multinomial {Inverse} {Regression} for {Text} {Analysis}}, 111 | volume = {108}, 112 | journal = {Journal of the American Statistical Association}, 113 | author = {Taddy, Matt}, 114 | year = {2013}, 115 | pages = {755--770} 116 | } 117 | 118 | @inproceedings{socher_recursive_2013, 119 | title = {Recursive deep models for semantic compositionality over a sentiment treebank}, 120 | volume = {1631}, 121 | url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.383.1327&rep=rep1&type=pdf}, 122 | urldate = {2015-04-24}, 123 | booktitle = {Proceedings of the conference on empirical methods in natural language processing ({EMNLP})}, 124 | author = {Socher, Richard and Perelygin, Alex and Wu, Jean Y. and Chuang, Jason and Manning, Christopher D. and Ng, Andrew Y. 
and Potts, Christopher}, 125 | year = {2013}, 126 | pages = {1642} 127 | } 128 | 129 | @inproceedings{pang_thumbs_2002, 130 | title = {Thumbs up?: sentiment classification using machine learning techniques}, 131 | shorttitle = {Thumbs up?}, 132 | url = {http://dl.acm.org/citation.cfm?id=1118704}, 133 | urldate = {2014-10-28}, 134 | booktitle = {Proceedings of the {ACL}-02 conference on {Empirical} methods in natural language processing-{Volume} 10}, 135 | publisher = {Association for Computational Linguistics}, 136 | author = {Pang, Bo and Lee, Lillian and Vaithyanathan, Shivakumar}, 137 | year = {2002}, 138 | pages = {79--86} 139 | } 140 | 141 | @article{pennington_glove:_2014, 142 | title = {Glove: {Global} vectors for word representation}, 143 | volume = {12}, 144 | shorttitle = {Glove}, 145 | url = {http://nlp.stanford.edu/projects/glove/glove.pdf}, 146 | urldate = {2015-04-24}, 147 | journal = {Proceedings of the Empiricial Methods in Natural Language Processing (EMNLP 2014)}, 148 | author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.}, 149 | year = {2014} 150 | } 151 | 152 | @article{mosteller_inference_1963, 153 | title = {Inference in an {Authorship} {Problem}}, 154 | volume = {58}, 155 | journal = {Journal of the American Statistical Association}, 156 | author = {Mosteller, Frederick and Wallace, David L.}, 157 | year = {1963}, 158 | pages = {275--309} 159 | } 160 | 161 | @article{taddy_one-step_2014, 162 | title = {One-step estimator paths for concave regularization}, 163 | author = {Taddy, Matt}, 164 | year = {2014}, 165 | note = {arXiv:1308.5623} 166 | } 167 | 168 | @article{sebastiani_machine_2002, 169 | title = {Machine {Learning} in {Automated} {Test} {Categorization}}, 170 | volume = {34}, 171 | journal = {ACM Computing Surveys}, 172 | author = {Sebastiani, Fabrizio}, 173 | year = {2002}, 174 | pages = {1--47} 175 | } 176 | 177 | @article{taddy_distributed_2015, 178 | title = {Distributed {Multinomial} {Regression}}, 179 | volume = {To appear}, 180 | journal = {Annals of Applied Statistics}, 181 | author = {Taddy, Matt}, 182 | year = {2015} 183 | } 184 | 185 | @inproceedings{joshi_movie_2010, 186 | title = {Movie reviews and revenues: {An} experiment in text regression}, 187 | shorttitle = {Movie reviews and revenues}, 188 | url = {http://dl.acm.org/citation.cfm?id=1858037}, 189 | urldate = {2014-10-28}, 190 | booktitle = {Human {Language} {Technologies}: {The} 2010 {Annual} {Conference} of the {North} {American} {Chapter} of the {Association} for {Computational} {Linguistics}}, 191 | publisher = {Association for Computational Linguistics}, 192 | author = {Joshi, Mahesh and Das, Dipanjan and Gimpel, Kevin and Smith, Noah A.}, 193 | year = {2010}, 194 | pages = {293--296} 195 | } 196 | 197 | @inproceedings{rehurek_software_2010, 198 | title = {Software {Framework} for {Topic} {Modelling} with {Large} {Corpora}}, 199 | booktitle = {Proceedings of the {LREC} 2010 {Workshop} on {New} {Challenges} for {NLP} {Frameworks}}, 200 | author = {Rehurek, Radim and Sojka, Petr}, 201 | year = {2010}, 202 | pages = {45--50} 203 | } 204 | 205 | @inproceedings{ng_discriminative_2002, 206 | title = {On {Discriminative} vs {Generative} {Classifiers}: {A} {Comparison} of {Logistic} {Regression} and naive {Bayes}}, 207 | booktitle = {Advances in {Neural} {Information} {Processing} {Systems} ({NIPS})}, 208 | author = {Ng, Andrew Y. 
and Jordan, Michael I.}, 209 | year = {2002} 210 | } 211 | 212 | @inproceedings{socher_parsing_2011, 213 | title = {Parsing natural scenes and natural language with recursive neural networks}, 214 | url = {http://machinelearning.wustl.edu/mlpapers/paper_files/ICML2011Socher_125.pdf}, 215 | urldate = {2015-04-25}, 216 | booktitle = {Proceedings of the 28th international conference on machine learning ({ICML}-11)}, 217 | author = {Socher, Richard and Lin, Cliff C. and Manning, Chris and Ng, Andrew Y.}, 218 | year = {2011}, 219 | pages = {129--136} 220 | } 221 | 222 | @article{flynn_efficiency_2013, 223 | title = {Efficiency for {Regularization} {Parameter} {Selection} in {Penalized} {Likelihood} {Estimation} of {Misspecified} {Models}}, 224 | volume = {108}, 225 | journal = {Journal of the American Statistical Association}, 226 | author = {Flynn, Cheryl and Hurvich, Clifford and Simonoff, Jefferey}, 227 | year = {2013}, 228 | pages = {1031--1043} 229 | } 230 | 231 | @article{rumelhart_learning_1986, 232 | title = {Learning representations by back-propagating errors}, 233 | volume = {323}, 234 | journal = {Nature}, 235 | author = {Rumelhart, David and Hinton, Geoffrey and Williams, Ronald}, 236 | year = {1986}, 237 | pages = {533--536} 238 | } 239 | 240 | @article{blei_latent_2003, 241 | title = {Latent {Dirichlet} {Allocation}}, 242 | volume = {3}, 243 | url = {http://dl.acm.org/citation.cfm?id=944937}, 244 | urldate = {2013-11-01}, 245 | journal = {the Journal of machine Learning research}, 246 | author = {Blei, David M. and Ng, Andrew Y. and Jordan, Michael I.}, 247 | year = {2003}, 248 | pages = {993--1022} 249 | } 250 | 251 | @inproceedings{morin_hierarchical_2005, 252 | title = {Hierarchical probabilistic neural network language model}, 253 | url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.8829&rep=rep1&type=pdf#page=255}, 254 | urldate = {2015-04-24}, 255 | booktitle = {Proceedings of the {International} {Workshop} on {Artificial} {Intelligence} and {Statistics}}, 256 | author = {Morin, Frederic and Bengio, Yoshua}, 257 | year = {2005}, 258 | pages = {246--252} 259 | } 260 | 261 | @inproceedings{le_distributed_2014, 262 | title = {Distributed representations of sentences and documents}, 263 | url = {http://arxiv.org/abs/1405.4053}, 264 | urldate = {2015-04-24}, 265 | booktitle = {Proceedings of the 31 st {International} {Conference} on {Machine} {Learning}}, 266 | author = {Le, Quoc V. 
and Mikolov, Tomas}, 267 | year = {2014} 268 | } 269 | 270 | @article{taddy_measuring_2013, 271 | title = {Measuring {Political} {Sentiment} on {Twitter}: {Factor} {Optimal} {Design} for {Multinomial} {Inverse} {Regression}}, 272 | volume = {55}, 273 | issn = {0040-1706, 1537-2723}, 274 | shorttitle = {Measuring {Political} {Sentiment} on {Twitter}}, 275 | url = {http://www.tandfonline.com/doi/abs/10.1080/00401706.2013.778791}, 276 | doi = {10.1080/00401706.2013.778791}, 277 | language = {en}, 278 | number = {4}, 279 | urldate = {2014-10-28}, 280 | journal = {Technometrics}, 281 | author = {Taddy, Matt}, 282 | month = nov, 283 | year = {2013}, 284 | pages = {415--425} 285 | } -------------------------------------------------------------------------------- /tex/deepir.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/deepir.pdf -------------------------------------------------------------------------------- /tex/deepir.tex: -------------------------------------------------------------------------------- 1 | 2 | \documentclass[11pt]{article} 3 | \usepackage{acl2015} 4 | \usepackage{times} 5 | \usepackage{url} 6 | \usepackage{dsfont} 7 | \usepackage{latexsym} 8 | \usepackage{graphicx} 9 | 10 | 11 | \title{Document Classification by Inversion of \\Distributed Language Representations} 12 | 13 | \author{Matt Taddy \\ 14 | University of Chicago Booth School of Business \\ 15 | {\tt taddy@chicagobooth.edu} \\} 16 | 17 | \date{} 18 | 19 | \begin{document} 20 | \maketitle 21 | \begin{abstract} 22 | There have been many recent advances in the structure and measurement of {\it distributed} language models: those that map from words to a vector-space that is rich in information about word choice and composition. This vector-space is the distributed language representation. 23 | 24 | 25 | The goal of this note is to point out that any distributed representation can be turned into a classifier through inversion via Bayes rule. 26 | The approach is simple and modular, in that it will work with any language representation whose training can be formulated as optimizing a probability model. In our application to 2 million sentences from Yelp reviews, we also find that it performs as well as or better than complex purpose-built algorithms. \end{abstract} 27 | 28 | \section{Introduction} 29 | 30 | Distributed, or vector-space, language representations $\mathcal{V}$ consist 31 | of a location, or embedding, for every vocabulary {\it word} in $\mathds{R}^K$, where 32 | $K$ is the dimension of the latent representation space. These locations 33 | are learned to optimize, perhaps approximately, an objective function 34 | defined on the original text such as a likelihood for word occurrences. 35 | 36 | A popular example is the Word2Vec machinery of 37 | Mikolov et al.~\shortcite{mikolov_distributed_2013}. This trains the distributed 38 | representation to be useful as an input layer for prediction of words from 39 | their neighbors in a Skip-gram likelihood. 
That is, to maximize 40 | \begin{equation}\label{eq:skipgram} 41 | \sum_{j\neq t,~j=t-b}^{t+b} \log\mathrm{p}_{\mathcal{V}}(w_{sj}\mid w_{st}) 42 | \end{equation} 43 | summed across all words $w_{st}$ in all sentences $\mathbf{w}_s$, where $b$ is 44 | the skip-gram window (truncated by the ends of the 45 | sentence) and $\mathrm{p}_{\mathcal{V}}(w_{sj}| w_{st})$ is a neural network 46 | classifier that takes vector representations for $w_{st}$ and $w_{sj}$ 47 | as input (see Section \ref{sec:w2v}). 48 | 49 | Distributed language representations have been studied since the early work on 50 | neural networks \cite{rumelhart_learning_1986} and have long been applied in 51 | natural language processing \cite{morin_hierarchical_2005}. The models are 52 | generating much recent interest due to the large performance gains from the 53 | newer systems, including Word2Vec and the Glove model of Pennington et 54 | al.~\shortcite{pennington_glove:_2014}, observed in, e.g., word 55 | prediction, word analogy identification, and named entity recognition. 56 | 57 | Given the success of these new models, researchers have begun searching for 58 | ways to adapt the representations for use in document classification tasks 59 | such as sentiment prediction or author identification. One naive approach is 60 | to use aggregated word vectors across a document (e.g., a document's average 61 | word-vector location) as input to a standard classifier (e.g., 62 | logistic regression). However, a document is actually an {\it ordered} path 63 | of locations through $\mathds{R}^K$, and simple averaging destroys much of the available 64 | information. 65 | 66 | More sophisticated aggregation is proposed in Socher et al. 67 | \shortcite{socher_parsing_2011,socher_recursive_2013}, where recursive neural 68 | networks are used to combine the word vectors through the estimated parse tree 69 | for each sentence. Alternatively, Le and Mikolov's Doc2Vec 70 | \shortcite{le_distributed_2014} adds document labels to the conditioning set 71 | in (\ref{eq:skipgram}) and has them influence the skip-gram likelihood through 72 | a latent input vector location in $\mathcal{V}$. In each case, the end product 73 | is a distributed representation for every sentence (or document for Doc2Vec) 74 | that can be used as input to a generic classifier. 75 | 76 | \subsection{Bayesian Inversion} 77 | 78 | These approaches all add considerable model and estimation complexity to the 79 | original underlying distributed representation. We are proposing a 80 | simple alternative that turns fitted distributed language representations into 81 | document classifiers without any additional modeling or estimation. 82 | 83 | A typical language model is trained to maximize the likelihoods of single words and their neighbors. For example, the skip-gram 84 | in (\ref{eq:skipgram}) represents conditional probability for a 85 | word's context (surrounding words), while the alternative CBOW Word2Vec 86 | specification \cite{mikolov2013efficient} targets the conditional probability 87 | for each word given its context. Although these objectives do not correspond to a full document likelihood model, they can be interpreted as components in a \textit{composite likelihood}\footnote{Composite likelihoods are a common tool in analysis of spatial data and data on graphs. 
They were popularized in statistics by Besag's \shortcite{besag_spatial_1974,besag1975statistical} work on the pseudolikelihood -- $\mathrm{p}(\mathbf{w}) \approx \prod_j \mathrm{p}(w_j |\mathbf{w}_{-j})$ -- for analysis of Markov random fields. See Varin et al. \shortcite{varin2011overview} for a detailed review.} approximation. 88 | 89 | Use $\mathbf{w} = [w_1\dots w_T]'$ to denote a sentence: an ordered vector of words. 90 | The skip-gram in 91 | (\ref{eq:skipgram}) yields the pairwise composite log likelihood\footnote{See Molenberghs and Verbeke \shortcite{molenberghs2006models} for similar pairwise compositions in analysis of longitudinal data.} 92 | \begin{equation}\label{eq:sentencelhd} \log\mathrm{p}_{ \mathcal{V}}(\mathbf{w}) = 93 | \sum_{j=1}^T\sum_{k=1}^T \mathds{1}_{\left[1\leq |k-j| \leq b\right]} \log\mathrm{p}_{ \mathcal{V}}(w_{k}| 94 | w_{j} ). \end{equation} 95 | In another example, Jernite et al.~\shortcite{jernite2015mrf} show that CBOW Word2Vec corresponds to the pseudolikelihood for a Markov random field sentence model. 96 | 97 | Finally, given a sentence likelihood as in (\ref{eq:sentencelhd}), document $d = 98 | \{\mathbf{w}_1, ... \mathbf{w}_S\}$ has log likelihood 99 | \begin{equation}\label{eq:fulllhd} \log\mathrm{p}_{ \mathcal{V}}(d) = 100 | \sum_{s} \log\mathrm{p}_{ \mathcal{V}}(\mathbf{w}_s). \end{equation} 101 | 102 | 103 | Now suppose that your training documents are grouped by class label, $y \in 104 | \{1 \dots C\}$. We can train {\it separate} distributed language representations 105 | for each set of documents as partitioned by $y$; for example, fit Word2Vec independently on each sub-corpus $D_c = \{ d_i : y_i =c \}$ and obtain the labeled distributed representation map $\mathcal{V}_c$. A new document $d$ has probability 106 | $\mathrm{p}_{ \mathcal{V}_c}(d)$ if we treat it as a member of class $c$, and Bayes rule implies 107 | \begin{equation}\label{eq:bayesrule} 108 | \mathrm{p}( y | d) = \frac{\mathrm{p}_{ \mathcal{V}_y}(d)\pi_y } 109 | {\sum_c \mathrm{p}_{ \mathcal{V}_c}(d)\pi_c } 110 | \end{equation} 111 | where $\pi_c$ is our prior probability on class label $c$. 112 | 113 | Thus distributed language representations trained separately for each class label 114 | yield directly a document classification rule via (\ref{eq:bayesrule}). This 115 | approach has a number of attractive qualities. 116 | 117 | \vskip .1cm 118 | \noindent \textbf{Simplicity:} The inversion strategy works for any model of 119 | language that can (or its training can) be interpreted as a probabilistic 120 | model. This makes for easy implementation in systems that are already 121 | engineered to fit such language representations, leading to faster deployment and lower development costs. 122 | The strategy is also interpretable: whatever intuition one has about the 123 | distributed language model can be applied directly to the 124 | inversion-based classification rule. Inversion adds a 125 | plausible model for reader understanding on top of any given language 126 | representation. 127 | 128 | \vskip .1cm 129 | \noindent \textbf{Scalability:} when working with 130 | massive corpora it is often useful to split the data into blocks as part of 131 | distributed computing strategies. Our model of classification via inversion 132 | provides a convenient top-level partitioning of the data. 
An efficient system 133 | could fit separate by-class language representations, which 134 | will provide for document classification as in this article as well as 135 | class-specific answers for NLP tasks such as word prediction or analogy. When 136 | one wishes to treat a document as unlabeled, NLP tasks can be answered through 137 | ensemble aggregation of the class-specific answers. 138 | 139 | \vskip .1cm 140 | \noindent \textbf{Performance:} We find that, in our examples, inversion of 141 | Word2Vec yields lower misclassification rates than both Doc2Vec-based 142 | classification and the multinomial inverse regression (MNIR) of Taddy 143 | \shortcite{taddy_multinomial_2013}. We did not anticipate such an outright 144 | performance gain. Moreover, we expect that with calibration (i.e., through 145 | cross-validation) of the many tuning parameters available when 146 | fitting both Word and Doc 2Vec, the performance results will change. Indeed, 147 | we find that all methods are often outperformed by phrase-count logistic 148 | regression with rare-feature up-weighting and carefully chosen regularization. 149 | However, the out-of-the-box performance of Word2Vec inversion 150 | argues for its consideration as a simple default in document classification. 151 | 152 | \vskip .2cm 153 | In the remainder, we outline classification through inversion of a specific 154 | Word2Vec model and illustrate the ideas in classification of Yelp reviews. 155 | The implementation requires only a small extension of the popular 156 | \texttt{gensim} python library \cite{rehurek_software_2010}; the extended 157 | library as well as code to reproduce all of the results in this paper are 158 | available on \texttt{github}. In addition, the Yelp data are publicly available 159 | as part of the corresponding data mining contest at 160 | \texttt{kaggle.com}. 161 | See \texttt{github.com/taddylab/deepir} for details. 162 | 163 | 164 | \section{Implementation} 165 | \label{sec:w2v} 166 | 167 | Word2Vec trains $\mathcal{V}$ to maximize the skip-gram likelihood based on (\ref{eq:skipgram}). We work with the Huffman softmax specification \cite{mikolov_distributed_2013}, which includes a pre-processing step to encode each vocabulary word in its representation via a binary Huffman tree (see Figure \ref{bht}). 168 | 169 | \begin{figure}[b] 170 | ~\includegraphics[width=0.47\textwidth]{graphs/bht} 171 | \caption{\label{bht} Binary Huffman encoding of a 4-word vocabulary, based upon 18 total utterances. 172 | At each step, proceeding from left to right, the two nodes with the lowest counts are 173 | combined into a parent node. Binary encodings are then read back off the splits, 174 | moving from right to left. } 175 | \end{figure} 176 | 177 | Each individual probability is then 178 | \begin{equation} \label{eq:neuralnet} 179 | \mathrm{p}_{\mathcal{V}}(w | w_t) =\!\!\! 180 | \prod_{j=1}^{L(w)-1} \!\!\!\sigma\!\left( \mathrm{ch}\left[\eta(w,j+1)\right] \mathbf{u}_{\eta(w,j)}^\top \mathbf{v}_{w_t} \right) 181 | \end{equation} 182 | where $\eta(w,i)$ is the $i^{th}$ node in the Huffman tree path, of length $L(w)$, for word $w$; $\sigma(x) = 1/(1 + \exp[-x])$; and $\mathrm{ch}(\eta) 183 | \in \{-1,+1\}$ indicates whether $\eta$ is a left or right 184 | child. Every word thus has both input and output vector coordinates, 185 | $\mathbf{v}_w$ and $[\mathbf{u}_{\eta(w,1)} \cdots \mathbf{u}_{\eta(w,L(w))}]$.
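For intuition, the probability in (\ref{eq:neuralnet}) amounts to a short walk down the Huffman tree: at each node on the target word's path we take the sigmoid of a signed inner product. The sketch below is a minimal numerical illustration only (not the \texttt{gensim} implementation); the arrays \texttt{path\_vecs} and \texttt{code} are hypothetical stand-ins for the output vectors $\mathbf{u}_{\eta(w,j)}$ and the child signs $\mathrm{ch}[\eta(w,j+1)]$ along the path for word $w$.

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def word_prob(v_wt, path_vecs, code):
    # v_wt:      length-K input vector for the conditioning word w_t
    # path_vecs: (L(w)-1) x K output vectors u along w's Huffman path
    # code:      length-(L(w)-1) array of +/-1 child indicators
    # returns p_V(w | w_t): the product of sigmoids along the path
    return np.prod(sigmoid(code * path_vecs.dot(v_wt)))
\end{verbatim}

\noindent Summing the logs of these word probabilities over all within-window pairs of a sentence gives the composite log likelihood in (\ref{eq:sentencelhd}).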
186 | Typically, only the input space $\mathbf{V} = [\mathbf{v}_{w_1} \cdots \mathbf{v}_{w_p}]$, 187 | for a $p$-word vocabulary, is reported as the language 188 | representation -- these vectors are used as input for NLP tasks. However, 189 | the full representation $\mathcal{V}$ includes mapping from each word to both 190 | $\mathbf{V}$ and $\mathbf{U}$. 191 | 192 | We apply the 193 | \texttt{gensim} python implementation of Word2Vec, which fits the model via stochastic gradient descent (SGD), under default specification. This includes a vector space of dimension $K=100$ and a skip-gram window of size $b=5$. 194 | 195 | \subsection{Word2Vec Inversion} 196 | 197 | 198 | \begin{figure*} 199 | %\includegraphics[width=\textwidth]{graphs/coarseprob} 200 | 201 | % \vskip .5cm 202 | % \begin{center} 203 | \includegraphics[width=1\textwidth]{graphs/coarseprob_bystar} 204 | % \end{center} 205 | \vskip -.25cm 206 | \caption{\label{pic:coarseprob} Out-of-Sample fitted probabilities of a review being \emph{positive} (having greater than 2 stars) as a function of the true number of review stars. Box widths are proportional to number of observations in each class; roughly 10\% of reviews have each of 1-3 stars, while 30\% have 4 stars and 40\% have 5 stars. 207 | } 208 | \end{figure*} 209 | 210 | % \begin{figure*} 211 | % \begin{center} 212 | % \includegraphics[width=.98\textwidth]{graphs/nnpprob} 213 | 214 | % \vskip .25cm 215 | 216 | % \includegraphics[width=.98\textwidth]{graphs/fineprob} 217 | % \end{center} 218 | % \vskip -.25cm 219 | % \caption{\label{pic:fineprob} Out-of-Sample fitted probabilities for observed truth. In the top plot, we are predicting Negative ($\leq 2$), Neutral ($3$), or Positive ($\geq 4$). In the bottom, we are predicting each of the separate 5 star ratings.} 220 | % \end{figure*} 221 | 222 | Given Word2Vec trained on each of $C$ class-specific corpora $D_1 \ldots D_C$, 223 | leading to $C$ distinct language representations $\mathcal{V}_1 \dots 224 | \mathcal{V}_C$, classification for new documents is straightforward. Consider 225 | the $S$-sentence document $d$: each sentence $\mathbf{w}_s$ is given a 226 | probability under each representation $\mathcal{V}_c$ by applying the 227 | calculations in (\ref{eq:skipgram}) and (\ref{eq:neuralnet}). This leads to 228 | the $S \times C$ matrix of sentence probabilities, 229 | $\mathrm{p}_{\mathcal{V}_c}(\mathbf{w}_s)$, and document probabilities are 230 | obtained %as the column means 231 | \begin{equation} 232 | \mathrm{p}_{\mathcal{V}_c}(d) = \frac{1}{S}\sum_s \mathrm{p}_{\mathcal{V}_c}(\mathbf{w}_s). 233 | \end{equation} 234 | Finally, class probabilities are calculated via Bayes rule as in (\ref{eq:bayesrule}). We use priors $\pi_c = 1/C$, so that classification proceeds by assigning the class 235 | \begin{equation}\label{eq:class} 236 | \hat y = \mathrm{argmax}_c ~~\mathrm{p}_{\mathcal{V}_c}(d). 237 | \end{equation} 238 | 239 | 240 | 241 | \section{Illustration} 242 | 243 | We consider a corpus of reviews provided by Yelp for a contest on {\tt 244 | kaggle.com}. The text is tokenized simply by converting to lowercase before splitting on punctuation and white-space. The training data are 230,000 reviews containing more than 2 245 | million sentences. Each review is marked by a number of {\it stars}, from 1 246 | to 5, and we fit separate Word2Vec representations $\mathcal{V}_1 \ldots 247 | \mathcal{V}_5$ for the documents at each star rating. 
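To make the full procedure concrete, the following sketch fits one representation per star rating and scores a new tokenized document by Bayes rule with uniform priors, as in Section \ref{sec:w2v}. It is an illustrative simplification rather than the exact code in our repository: \texttt{sentences\_by\_star} is a hypothetical dictionary mapping each star rating to its list of tokenized training sentences, and we assume a \texttt{gensim} release whose \texttt{Word2Vec} exposes the \texttt{score} method for hierarchical-softmax skip-gram models (argument names such as \texttt{size} versus \texttt{vector\_size} differ across releases).

\begin{verbatim}
import numpy as np
from gensim.models import Word2Vec

# one skip-gram / hierarchical-softmax model per star rating
models = {}
for c, sents in sentences_by_star.items():
    models[c] = Word2Vec(sents, sg=1, hs=1, negative=0,
                         size=100, window=5, min_count=5)

def classify(doc_sentences, models, priors=None):
    """Return class probabilities p(c|d) for one tokenized document."""
    classes = sorted(models)
    if priors is None:
        priors = {c: 1.0 / len(classes) for c in classes}
    S = len(doc_sentences)
    # S x C matrix of sentence log likelihoods under each class model
    llhd = np.column_stack(
        [models[c].score(doc_sentences, total_sentences=S)
         for c in classes])
    # document log likelihood per class: log of the average
    # sentence probability, computed stably on the log scale
    m = llhd.max(axis=0)
    logpdoc = m + np.log(np.exp(llhd - m).sum(axis=0)) - np.log(S)
    # Bayes rule: weight by the priors and normalize across classes
    logpost = logpdoc + np.log([priors[c] for c in classes])
    post = np.exp(logpost - logpost.max())
    return dict(zip(classes, post / post.sum()))
\end{verbatim}

\noindent The class with the highest returned probability is the predicted label, as in (\ref{eq:class}).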
The validation data 248 | consist of 23,000 reviews, and we apply the inversion technique of Section 249 | \ref{sec:w2v} to score each validation document $d$ with class probabilities 250 | $\mathbf{q} = [q_1 \cdots q_5]$, where $q_c = \mathrm{p}(c|d)$. 251 | 252 | The probabilities will be used in three different classification tasks, labeling reviews as 253 | 254 | \vskip .1cm 255 | $a.$ negative at 1-2 stars, or positive at 3-5 stars; 256 | 257 | \vskip .1cm 258 | $b.$ negative 1-2, neutral 3, or positive 4-5 stars; 259 | 260 | \vskip .1cm 261 | $c.$ corresponding to each of 1 to 5 stars. 262 | 263 | \vskip .1cm 264 | In each case, classification proceeds by summing across the relevant 265 | sub-class probabilities. For example, in task $a$, 266 | $\mathrm{p}(\texttt{positive}) = q_3+q_4+q_5$. Note that the same five fitted 267 | Word2Vec representations are used for each task. 268 | 269 | We consider a set of related comparator techniques. In each case, some 270 | document representation (e.g., phrase counts or Doc2Vec vectors) is used as 271 | input to logistic regression prediction of the associated review rating. 272 | The logistic regressions are fit under $L_1$ regularization with the 273 | penalties weighted by feature standard deviation (which, e.g., up-weights rare 274 | phrases) and selected according to the corrected AIC (AICc) criterion 275 | \cite{flynn_efficiency_2013} via the \texttt{gamlr} R package of Taddy 276 | \shortcite{taddy_one-step_2014}. For multi-class tasks $b$-$c$, we use 277 | distributed multinomial regression (DMR; Taddy 278 | 2015)\nocite{taddy_distributed_2015} via the \texttt{distrom} R package. DMR 279 | fits multinomial logistic regression in a factorized representation wherein 280 | one estimates independent Poisson linear models for each response category. 281 | Document representations and logistic regressions are 282 | always trained using only the training corpus. 283 | 284 | 285 | \vskip .1cm 286 | \noindent \textit{Doc2Vec} is also fit via \texttt{gensim}, using the same 287 | latent space specification as for Word2Vec: $K=100$ and $b=5$. 288 | As recommended in the documentation, we apply repeated SGD over 20 re-orderings of each 289 | corpus (for comparability, this was also done when fitting Word2Vec). 290 | Le and Mikolov provide two alternative Doc2Vec specifications: distributed 291 | memory (DM) and distributed bag-of-words (DBOW). We fit both. Vector representations for validation documents are trained without 292 | updating the word-vector elements, leading to 100-dimensional vectors for 293 | each document under each of DM and DBOW. We input each, as well as the combined 200-dimensional 294 | DM+DBOW representation, to logistic regression. 295 | 296 | 297 | \vskip .1cm 298 | \noindent \textit{Phrase regression} applies logistic regression of 299 | response classes directly onto counts for short 1-2 word `phrases'. The phrases are 300 | obtained using \texttt{gensim}'s phrase builder, which simply combines highly 301 | probable pairings; e.g., \texttt{first\_date} and 302 | \texttt{chicken\_wing} are two pairings in this corpus. 303 | 304 | \vskip .1cm 305 | \noindent \textit{MNIR}, the multinomial inverse regression of Taddy 306 | \shortcite{taddy_measuring_2013,taddy_multinomial_2013,taddy_distributed_2015}, 307 | is applied as implemented in the \texttt{textir} package for R. MNIR maps 308 | from text to the class-space of interest through a multinomial logistic 309 | regression of phrase counts onto variables relevant to the class-space.
We 310 | apply MNIR to the same set of 1-2 word phrases used in phrase regression. 311 | Here, we regress phrase counts onto stars expressed numerically and as a 312 | 5-dimensional indicator vector, leading to a 6-feature multinomial logistic 313 | regression. The MNIR procedure then uses the $6\times p$ matrix of 314 | feature-phrase regression coefficients to map from phrase-count to feature space, 315 | resulting in 6-dimensional `sufficient reduction' statistics for each 316 | document. These are input to logistic 317 | regression. 318 | 319 | \vskip .1cm 320 | \noindent \textit{Word2Vec aggregation} averages the fitted word 321 | representations from a single Word2Vec trained on all sentences to obtain a 322 | fixed-length feature vector for each review ($K=100$, as for inversion). This 323 | vector is then input to logistic regression. 324 | 325 | % \vskip .1cm 326 | % \noindent \textit{Topic regression} fits the Latent Dirichlet 327 | % Allocation of Blei et al.~\shortcite{blei_latent_2003} using the posterior 328 | % maximization and Bayes factor selection strategy of Taddy 329 | % \shortcite{taddy_estimation_2012} as implemented in \texttt{maptpx} for R. 330 | % Estimated topic weights for each document are then used as inputs to logistic 331 | % regression. Due to high computational costs, we limit to words occurring in 332 | % at least 200 documents. 333 | 334 | \begin{table} 335 | \hspace{-.25cm} 336 | { 337 | \begin{tabular}{r|c c c} 338 | & $a$ (NP) & $b$ (NNP) & $c$ (1-5) 339 | \\ \cline{2-4}\rule{0pt}{3ex} 340 | W2V inversion & .099 & \textbf{.189} & .435 \\ 341 | Phrase regression & \textbf{.084} & .200 & \textbf{.410} \\ 342 | D2V DBOW & .144 & .282 & .496 \\ 343 | D2V DM & .179 & .306 & .549 \\ 344 | D2V combined & .148 & .284 & .500 \\ 345 | MNIR & .095 & .254 & .480 \\ 346 | W2V aggregation & .118 & .248 & .461 347 | \end{tabular}} 348 | \caption{Out-of-sample misclassification rates.} 349 | \end{table} 350 | 351 | \subsection{Results} 352 | 353 | Misclassification rates for each task on the validation set are reported in 354 | Table 1. Simple phrase-count regression is consistently the 355 | strongest performer, bested only by Word2Vec inversion on task $b$. This is 356 | partially due to the relative strengths of discriminative (e.g., logistic 357 | regression) vs.\ generative (e.g., all others here) classifiers: given a large amount of 358 | training text, the asymptotic efficiency of logistic regression will start to work 359 | in its favor over the finite-sample advantages of a generative classifier 360 | \cite{ng_discriminative_2002,taddy_rejoinder:_2013}. 361 | However, the comparison is also unfair to Word2Vec and Doc2Vec: both 362 | phrase regression and MNIR are optimized exactly under an 363 | AICc-selected penalty, while Word and Doc 2Vec have only been approximately 364 | optimized under a single specification. The 365 | distributed representations should improve with some careful engineering. 366 | 367 | Word2Vec inversion outperforms the other document representation-based 368 | alternatives (except, by a narrow margin, MNIR in task $a$). Doc2Vec under 369 | the DBOW specification and MNIR both do worse, but not by a large margin. In 370 | contrast to Le and Mikolov, we find here that the Doc2Vec DM model does much 371 | worse than DBOW. Regression onto simple within-document aggregations of 372 | Word2Vec performs slightly better than any Doc2Vec option (but not as well as 373 | the Word2Vec inversion).
This again contrasts with the results of Le and Mikolov, 374 | and we suspect that the more complex 375 | Doc2Vec model would benefit from careful tuning of the SGD optimization 376 | routine.\footnote{Note also that the unsupervised document representations -- Doc2Vec or the single Word2Vec used in Word2Vec aggregation -- could be trained on larger unlabeled corpora. A similar option is available for Word2Vec inversion: one could take a single Word2Vec model trained on a large unlabeled corpus as a shared baseline (prior) and update separate models with additional training on each labeled sub-corpus. The representations will all be shrunk towards a baseline language model, but will differ according to distinctions between the language in each labeled sub-corpus.} 377 | 378 | 379 | Looking at the fitted probabilities in detail, we see that Word2Vec inversion 380 | provides a more useful document {\it ranking} than any comparator (including 381 | phrase regression). For example, Figure \ref{pic:coarseprob} shows the 382 | probabilities of a review being `positive' in task $a$ as a function of the 383 | true star rating for each validation review. Although phrase regression does 384 | slightly better in terms of misclassification rate, it does so at the cost of 385 | classifying many terrible (1-star) reviews as positive. This occurs because 1-2 star reviews are rarer than 3-5 star reviews and because words of emphasis (e.g., \texttt{very}, \texttt{completely}, and \texttt{!!!}) are used both in very bad and in very good reviews. Word2Vec inversion is 386 | the {\it only} method that yields positive-document probabilities that are 387 | clearly increasing in distribution with the true star rating. It is not 388 | difficult to envision a misclassification cost structure that favors such 389 | nicely ordered probabilities. 390 | 391 | 392 | \section{Discussion} 393 | 394 | The goal of this note is to point out inversion as an option for turning distributed language representations into classification rules. We are not arguing for the supremacy of Word2Vec inversion in particular, and the approach should work well with alternative representations (e.g., Glove). Moreover, we are not even arguing that it will always outperform purpose-built classification tools. However, it is a simple, scalable, interpretable, and effective option for classification whenever you are working with such distributed representations.
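As a closing illustration of the warm-start idea raised in the footnote to the results discussion above, one might fit a single baseline representation to a large unlabeled corpus and then continue training a copy on each labeled sub-corpus. The sketch below is illustrative only: \texttt{all\_sentences} and \texttt{sentences\_by\_star} are hypothetical containers of tokenized sentences, and the exact \texttt{gensim} training arguments (e.g., whether \texttt{train} requires \texttt{total\_examples} and \texttt{epochs}) depend on the release.

\begin{verbatim}
import copy
from gensim.models import Word2Vec

# shared baseline fit to unlabeled text
base = Word2Vec(all_sentences, sg=1, hs=1, negative=0,
                size=100, window=5, min_count=5)

# warm-start one model per class, then keep training
# on that class's labeled sub-corpus
models = {}
for c, sents in sentences_by_star.items():
    m = copy.deepcopy(base)
    m.train(sents)  # newer gensim releases also need
                    # total_examples=len(sents) and epochs
    models[c] = m
\end{verbatim}

\noindent The resulting models are all shrunk towards the shared baseline but differ through the extra passes over their own sub-corpora, and they plug directly into the same inversion scoring rule used throughout this note.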
395 | 396 | \bibliographystyle{acl} 397 | \bibliography{deepir} 398 | 399 | 400 | \end{document} 401 | -------------------------------------------------------------------------------- /tex/graphs/bht.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | rankdir="RL"; 3 | edge [arrowhead=none]; 4 | node [shape=box]; 5 | 0 [label="18"] ; 6 | 0 -> 1 [label = "0"]; 7 | 0 -> 2 [label = "1"]; 8 | 1 [label="11"] ; 9 | 2 [label="7"] ; 10 | 1 -> 3 ; 11 | 2 -> 4 [label = "0"]; 12 | 2 -> 5 [label = "1"]; 13 | 3 [label="7"] ; 14 | 4 [label="6"] ; 15 | 5 [label="5"] ; 16 | 3 -> 8 [label = "0"]; 17 | 3 -> 9 [label = "1"]; 18 | 4 -> 6 ; 19 | 5 -> 7 ; 20 | 6 [label="10 Thanks 6"] ; 21 | 7 [label="11 Buddy 5"] ; 22 | 8 [label="00 Hello 4"] ; 23 | 9 [label="01 No 3"] ; 24 | } 25 | 26 | -------------------------------------------------------------------------------- /tex/graphs/bht.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bht.pdf -------------------------------------------------------------------------------- /tex/graphs/bht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bht.png -------------------------------------------------------------------------------- /tex/graphs/bystarshort.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/bystarshort.pdf -------------------------------------------------------------------------------- /tex/graphs/coarseprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/coarseprob.pdf -------------------------------------------------------------------------------- /tex/graphs/coarseprob_bystar.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/coarseprob_bystar.pdf -------------------------------------------------------------------------------- /tex/graphs/fineprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/fineprob.pdf -------------------------------------------------------------------------------- /tex/graphs/nnpprob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/nnpprob.pdf -------------------------------------------------------------------------------- /tex/graphs/posneg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/posneg.png -------------------------------------------------------------------------------- /tex/graphs/yelp_logistic.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TaddyLab/deepir/e483f4a8312f7b49234957a6f04a2be27f76d7e1/tex/graphs/yelp_logistic.png --------------------------------------------------------------------------------